run into clIREEInputConversionDLCCodegen run into clIREEInputConversionDLCCodegen run into clIREEInputConversionDLCCodegen // -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info module { hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info module { hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info module { hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info module { hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info module { hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info module { hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After LinalgFoldUnitExtentDimsPass (linalg-fold-unit-extent-dims) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info module { hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After LLVMDLCHostTensorPaddingPass (iree-llvmdlc-host-tensor-padding-pass) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info module { hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info module { hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info module { hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info module { hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After Canonicalizer (canonicalize) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } // -----// IR Dump After Inliner (inline) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info module { hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After SymbolDCE (symbol-dce) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info module { hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info module attributes {hal.device.targets = {__device_0 = #hal.device.alias<"cuda"> : !hal.device}} { hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #hal.device.alias<"cuda"> : !hal.device hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #hal.device.alias<"cuda"> : !hal.device hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After MaterializeHomogeneousEncodingsPass (iree-global-opt-materialize-homogeneous-encodings) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After CSE (cse) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After SimplifyPackUnpackPass (iree-global-opt-simplify-pack-unpack) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After IPOPass (iree-util-ipo) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After JitGlobalsPass (iree-consteval-jit-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After IPOPass (iree-util-ipo) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After DeduplicateExecutablesPass (iree-flow-deduplicate-executables) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After OutlineConstantsPass (iree-flow-outline-constants) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After IPOPass (iree-util-ipo) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After SymbolDCE (symbol-dce) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After VerifyInputPass (iree-stream-verify-input) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After IPOPass (iree-util-ipo) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After ConvertToStreamPass (iree-stream-conversion) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After Canonicalizer (canonicalize) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } // -----// IR Dump After Inliner (inline) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After IPOPass (iree-util-ipo) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After CombineInitializersPass (iree-util-combine-initializers) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After IPOPass (iree-util-ipo) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After RefineUsagePass (iree-stream-refine-usage) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After IPOPass (iree-util-ipo) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After MaterializeBuiltinsPass (iree-stream-materialize-builtins) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After IPOPass (iree-util-ipo) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After VerifyLoweringToAsyncPass (iree-stream-verify-lowering-to-async) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After ScheduleAllocationPass (iree-stream-schedule-allocation) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After PropagateSubrangesPass (iree-util-propagate-subranges) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After IPOPass (iree-util-ipo) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After VerifyLoweringToCmdPass (iree-stream-verify-lowering-to-cmd) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After IPOPass (iree-util-ipo) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After IPOPass (iree-util-ipo) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After ElideTimepointsPass (iree-stream-elide-timepoints) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FuseDispatchBindingsPass (iree-stream-fuse-dispatch-bindings) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After AnnotateDispatchArgumentsPass (iree-stream-annotate-dispatch-arguments) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After AnnotateDispatchAssumptionsPass (iree-stream-annotate-dispatch-assumptions) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After PackDispatchOperandsPass (iree-stream-pack-dispatch-operands) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After IPOPass (iree-util-ipo) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FoldUniformOperandsPass (iree-stream-fold-uniform-operands) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After IPOPass (iree-util-ipo) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After SymbolDCE (symbol-dce) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } // -----// IR Dump After GPUGeneralizeNamedOpsPass (iree-codegen-gpu-generalize-named-ops) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } // -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } // -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } // -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } // -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } // -----// IR Dump After MaterializeEncodingIntoNopPass (iree-codegen-materialize-encoding-into-nop) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } // -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } // -----// IR Dump After Canonicalizer (canonicalize) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } // -----// IR Dump After BlockDynamicDimensionsPass (iree-codegen-block-dynamic-dimensions) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } // -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } // -----// IR Dump After CSE (cse) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } // -----// IR Dump After MaterializeTuningSpecsPass (iree-codegen-materialize-tuning-specs) //----- // module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } // -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- // module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } funcops in LLVMGPUSelectLoweringStrategyPass: func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } funcOp after initGPULaunchConfig: func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } translationInfo in LLVMGPUSelectLoweringStrategyPass: #iree_codegen.translation_info // -----// IR Dump After LLVMGPUSelectLoweringStrategyPass (iree-llvmgpu-select-lowering-strategy) //----- // module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } // -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- // hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}>) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } // -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- // hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}>) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } // -----// IR Dump After DumpExecutableSourcesPass (iree-hal-dump-executable-sources) //----- // #config = #iree_codegen.lowering_config #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> #map = affine_map<(d0, d1) -> (d0, d1)> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> #translation = #iree_codegen.translation_info #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { util.global private @__device_0 = #device_target_cuda hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #translation} { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } } } run into createConvertToNVVMPass // -----// IR Dump After HoistExecutableObjectsPass (iree-hal-hoist-executable-objects) //----- // hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}>) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } } // -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- // module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%9 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } return } } // -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } // -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } // -----// IR Dump After CSE (cse) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } // -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } // -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } // -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } // -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } // -----// IR Dump After Canonicalizer (canonicalize) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } // -----// IR Dump After CSE (cse) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } // -----// IR Dump After Canonicalizer (canonicalize) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } // -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } // -----// IR Dump After Canonicalizer (canonicalize) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } // -----// IR Dump After CSE (cse) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } // -----// IR Dump After LLVMGPUTileAndDistributePass (iree-llvmgpu-tile-and-distribute) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %c1 = arith.constant 1 : index %c2048 = arith.constant 2048 : index %c64 = arith.constant 64 : index %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %thread_id_x = gpu.thread_id x %thread_id_y = gpu.thread_id y scf.for %arg0 = %thread_id_y to %c2048 step %c1 { scf.for %arg1 = %thread_id_x to %c2048 step %c64 { %subview_1 = memref.subview %subview_0[%arg0, %arg1] [1, 1] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %8[%arg0, %arg1] [1, 1] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %thread_id_x_3 = gpu.thread_id x scf.for %arg2 = %thread_id_x_3 to %c64 step %c64 { %subview_4 = memref.subview %subview_1[0, %arg2] [1, 1] [1, 1] : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_5 = memref.subview %subview_2[0, %arg2] [1, 1] [1, 1] : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_4 : memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_5 : memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } // -----// IR Dump After Canonicalizer (canonicalize) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %c1 = arith.constant 1 : index %c2048 = arith.constant 2048 : index %c64 = arith.constant 64 : index %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %thread_id_x = gpu.thread_id x %thread_id_y = gpu.thread_id y scf.for %arg0 = %thread_id_y to %c2048 step %c1 { scf.for %arg1 = %thread_id_x to %c2048 step %c64 { %subview_1 = memref.subview %subview_0[%arg0, %arg1] [1, 1] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %8[%arg0, %arg1] [1, 1] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %thread_id_x_3 = gpu.thread_id x scf.for %arg2 = %thread_id_x_3 to %c64 step %c64 { %subview_4 = memref.subview %subview_1[0, %arg2] [1, 1] [1, 1] : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_5 = memref.subview %subview_2[0, %arg2] [1, 1] [1, 1] : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_4 : memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_5 : memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } // -----// IR Dump After CSE (cse) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %c1 = arith.constant 1 : index %c2048 = arith.constant 2048 : index %c64 = arith.constant 64 : index %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %thread_id_x = gpu.thread_id x %thread_id_y = gpu.thread_id y scf.for %arg0 = %thread_id_y to %c2048 step %c1 { scf.for %arg1 = %thread_id_x to %c2048 step %c64 { %subview_1 = memref.subview %subview_0[%arg0, %arg1] [1, 1] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %8[%arg0, %arg1] [1, 1] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> scf.for %arg2 = %thread_id_x to %c64 step %c64 { %subview_3 = memref.subview %subview_1[0, %arg2] [1, 1] [1, 1] : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_4 = memref.subview %subview_2[0, %arg2] [1, 1] [1, 1] : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_3 : memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_4 : memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } // -----// IR Dump After RemoveSingleIterationLoopPass (iree-codegen-remove-single-iteration-loop) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %c1 = arith.constant 1 : index %c2048 = arith.constant 2048 : index %c64 = arith.constant 64 : index %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %thread_id_x = gpu.thread_id x %thread_id_y = gpu.thread_id y scf.for %arg0 = %thread_id_y to %c2048 step %c1 { scf.for %arg1 = %thread_id_x to %c2048 step %c64 { %subview_1 = memref.subview %subview_0[%arg0, %arg1] [1, 1] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %8[%arg0, %arg1] [1, 1] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview_3 = memref.subview %subview_1[0, %thread_id_x] [1, 1] [1, 1] : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_4 = memref.subview %subview_2[0, %thread_id_x] [1, 1] [1, 1] : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_3 : memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_4 : memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } // -----// IR Dump After LLVMGPULowerExecutableTargetPass (iree-llvmgpu-lower-executable-target) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %c1 = arith.constant 1 : index %c2048 = arith.constant 2048 : index %c64 = arith.constant 64 : index %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %thread_id_x = gpu.thread_id x %thread_id_y = gpu.thread_id y scf.for %arg0 = %thread_id_y to %c2048 step %c1 { scf.for %arg1 = %thread_id_x to %c2048 step %c64 { %subview_1 = memref.subview %subview_0[%arg0, %arg1] [1, 1] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %8[%arg0, %arg1] [1, 1] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview_3 = memref.subview %subview_1[0, %thread_id_x] [1, 1] [1, 1] : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_4 = memref.subview %subview_2[0, %thread_id_x] [1, 1] [1, 1] : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_3 : memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_4 : memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } /workspace/test/main-test//configured_module_main$async_dispatch_6.mlir:29:8: error: 'linalg.generic' op write affecting operations on global resources are restricted to workgroup distributed contexts. linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_0 : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%8 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {lowering_config = #iree_codegen.lowering_config} { ^ /workspace/test/main-test//configured_module_main$async_dispatch_6.mlir:29:8: note: see current operation: "linalg.generic"(%21, %22) <{indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = [#linalg.iterator_type, #linalg.iterator_type], operandSegmentSizes = array}> ({ ^bb0(%arg6: f32, %arg7: f32): "linalg.yield"(%arg6) : (f32) -> () }) {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} : (memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>, memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) -> () /workspace/test/main-test//configured_module_main$async_dispatch_6.mlir:9:6: error: 'func.func' op failed on workgroup distribution verification func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { ^ /workspace/test/main-test//configured_module_main$async_dispatch_6.mlir:9:6: note: see current operation: "func.func"() <{function_type = () -> (), sym_name = "main$async_dispatch_6_slow_memcpy"}> ({ %0 = "arith.constant"() <{value = 1 : index}> : () -> index %1 = "arith.constant"() <{value = 2048 : index}> : () -> index %2 = "arith.constant"() <{value = 64 : index}> : () -> index %3 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>, ordinal = 0 : index} : () -> i32 %4 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>, ordinal = 1 : index} : () -> i32 %5 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>, ordinal = 2 : index} : () -> i32 %6 = "arith.index_castui"(%3) : (i32) -> index %7 = "arith.index_castui"(%4) : (i32) -> index %8 = "arith.index_castui"(%5) : (i32) -> index %9:3 = "util.assume.int"(%6, %7, %8) <{assumptions = [[#util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption], [#util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption], [#util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption]]}> : (index, index, index) -> (index, index, index) %10 = "hal.interface.binding.subspan"(%9#0) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>, operandSegmentSizes = array} : (index) -> memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> "memref.assume_alignment"(%10) <{alignment = 1 : i32}> : (memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type>) -> () %11 = "hal.interface.binding.subspan"(%9#1) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>, operandSegmentSizes = array} : (index) -> memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> "memref.assume_alignment"(%11) <{alignment = 1 : i32}> : (memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) -> () %12 = "hal.interface.binding.subspan"(%9#2) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>, operandSegmentSizes = array} : (index) -> memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> "memref.assume_alignment"(%12) <{alignment = 1 : i32}> : (memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) -> () %13 = "memref.subview"(%10) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type>) -> memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %14 = "memref.subview"(%10) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type>) -> memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %15 = "gpu.thread_id"() <{dimension = #gpu}> : () -> index %16 = "gpu.thread_id"() <{dimension = #gpu}> : () -> index "scf.for"(%16, %1, %0) ({ ^bb0(%arg4: index): "scf.for"(%15, %1, %2) ({ ^bb0(%arg5: index): %21 = "memref.subview"(%14, %arg4, %arg5) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>, index, index) -> memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %22 = "memref.subview"(%11, %arg4, %arg5) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>, index, index) -> memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> "linalg.generic"(%21, %22) <{indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = [#linalg.iterator_type, #linalg.iterator_type], operandSegmentSizes = array}> ({ ^bb0(%arg6: f32, %arg7: f32): "linalg.yield"(%arg6) : (f32) -> () }) {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} : (memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>, memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) -> () "scf.yield"() : () -> () }) : (index, index, index) -> () "scf.yield"() : () -> () }) : (index, index, index) -> () "scf.forall"() <{mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping], operandSegmentSizes = array, staticLowerBound = array, staticStep = array, staticUpperBound = array}> ({ ^bb0(%arg0: index, %arg1: index): %17 = "memref.subview"(%13, %arg0, %arg1) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>, index, index) -> memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %18 = "memref.subview"(%12, %arg0, %arg1) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>, index, index) -> memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %19 = "memref.subview"(%17, %15) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>, index) -> memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %20 = "memref.subview"(%18, %15) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>, index) -> memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> "linalg.generic"(%19, %20) <{indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = [#linalg.iterator_type, #linalg.iterator_type], operandSegmentSizes = array}> ({ ^bb0(%arg2: f32, %arg3: f32): "linalg.yield"(%arg2) : (f32) -> () }) {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} : (memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>, memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) -> () "scf.forall.in_parallel"() ({ ^bb0: }) : () -> () }) : () -> () "func.return"() : () -> () }) {translation_info = #iree_codegen.translation_info} : () -> () // -----// IR Dump After VerifyWorkgroupDistributionPass Failed (iree-codegen-verify-workgroup-distribution) //----- // func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %c1 = arith.constant 1 : index %c2048 = arith.constant 2048 : index %c64 = arith.constant 64 : index %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %thread_id_x = gpu.thread_id x %thread_id_y = gpu.thread_id y scf.for %arg0 = %thread_id_y to %c2048 step %c1 { scf.for %arg1 = %thread_id_x to %c2048 step %c64 { %subview_1 = memref.subview %subview_0[%arg0, %arg1] [1, 1] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %8[%arg0, %arg1] [1, 1] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview_3 = memref.subview %subview_1[0, %thread_id_x] [1, 1] [1, 1] : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_4 = memref.subview %subview_2[0, %thread_id_x] [1, 1] [1, 1] : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_3 : memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_4 : memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } /workspace/test/main-test//configured_module_main$async_dispatch_6.mlir:2:2: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}> hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}>) { ^ /workspace/test/main-test//configured_module_main$async_dispatch_6.mlir:2:2: note: see current operation: "hal.executable.variant"() ({ "hal.executable.export"() ({ ^bb0(%arg8: !hal.device, %arg9: index): %23:3 = "flow.dispatch.workgroup_count_from_dag_root"(%arg9, %arg9) : (index, index) -> (index, index, index) "hal.return"(%23#0, %23#1, %23#2) : (index, index, index) -> () }) {layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>, ordinal = 0 : index, sym_name = "main$async_dispatch_6_slow_memcpy"} : () -> () "builtin.module"() ({ "func.func"() <{function_type = () -> (), sym_name = "main$async_dispatch_6_slow_memcpy"}> ({ %0 = "arith.constant"() <{value = 1 : index}> : () -> index %1 = "arith.constant"() <{value = 2048 : index}> : () -> index %2 = "arith.constant"() <{value = 64 : index}> : () -> index %3 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>, ordinal = 0 : index} : () -> i32 %4 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>, ordinal = 1 : index} : () -> i32 %5 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>, ordinal = 2 : index} : () -> i32 %6 = "arith.index_castui"(%3) : (i32) -> index %7 = "arith.index_castui"(%4) : (i32) -> index %8 = "arith.index_castui"(%5) : (i32) -> index %9:3 = "util.assume.int"(%6, %7, %8) <{assumptions = [[#util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption], [#util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption], [#util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption, #util.int.assumption]]}> : (index, index, index) -> (index, index, index) %10 = "hal.interface.binding.subspan"(%9#0) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>, operandSegmentSizes = array} : (index) -> memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> "memref.assume_alignment"(%10) <{alignment = 1 : i32}> : (memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type>) -> () %11 = "hal.interface.binding.subspan"(%9#1) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>, operandSegmentSizes = array} : (index) -> memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> "memref.assume_alignment"(%11) <{alignment = 1 : i32}> : (memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) -> () %12 = "hal.interface.binding.subspan"(%9#2) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>, operandSegmentSizes = array} : (index) -> memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> "memref.assume_alignment"(%12) <{alignment = 1 : i32}> : (memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) -> () %13 = "memref.subview"(%10) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type>) -> memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %14 = "memref.subview"(%10) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type>) -> memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %15 = "gpu.thread_id"() <{dimension = #gpu}> : () -> index %16 = "gpu.thread_id"() <{dimension = #gpu}> : () -> index "scf.for"(%16, %1, %0) ({ ^bb0(%arg4: index): "scf.for"(%15, %1, %2) ({ ^bb0(%arg5: index): %21 = "memref.subview"(%14, %arg4, %arg5) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>, index, index) -> memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %22 = "memref.subview"(%11, %arg4, %arg5) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>, index, index) -> memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> "linalg.generic"(%21, %22) <{indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = [#linalg.iterator_type, #linalg.iterator_type], operandSegmentSizes = array}> ({ ^bb0(%arg6: f32, %arg7: f32): "linalg.yield"(%arg6) : (f32) -> () }) {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} : (memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>, memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) -> () "scf.yield"() : () -> () }) : (index, index, index) -> () "scf.yield"() : () -> () }) : (index, index, index) -> () "scf.forall"() <{mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping], operandSegmentSizes = array, staticLowerBound = array, staticStep = array, staticUpperBound = array}> ({ ^bb0(%arg0: index, %arg1: index): %17 = "memref.subview"(%13, %arg0, %arg1) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>, index, index) -> memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %18 = "memref.subview"(%12, %arg0, %arg1) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>, index, index) -> memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %19 = "memref.subview"(%17, %15) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>, index) -> memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %20 = "memref.subview"(%18, %15) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>, index) -> memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> "linalg.generic"(%19, %20) <{indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = [#linalg.iterator_type, #linalg.iterator_type], operandSegmentSizes = array}> ({ ^bb0(%arg2: f32, %arg3: f32): "linalg.yield"(%arg2) : (f32) -> () }) {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} : (memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>, memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) -> () "scf.forall.in_parallel"() ({ ^bb0: }) : () -> () }) : () -> () "func.return"() : () -> () }) {translation_info = #iree_codegen.translation_info} : () -> () }) : () -> () "hal.executable.variant_end"() : () -> () }) {sym_name = "cuda_nvptx_fb", sym_visibility = "public", target = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}>} : () -> () // -----// IR Dump After TranslateTargetExecutableVariantsPass Failed (iree-hal-translate-target-executable-variants) //----- // hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}>) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %c1 = arith.constant 1 : index %c2048 = arith.constant 2048 : index %c64 = arith.constant 64 : index %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %thread_id_x = gpu.thread_id x %thread_id_y = gpu.thread_id y scf.for %arg0 = %thread_id_y to %c2048 step %c1 { scf.for %arg1 = %thread_id_x to %c2048 step %c64 { %subview_1 = memref.subview %subview_0[%arg0, %arg1] [1, 1] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %8[%arg0, %arg1] [1, 1] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview_3 = memref.subview %subview_1[0, %thread_id_x] [1, 1] [1, 1] : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_4 = memref.subview %subview_2[0, %thread_id_x] [1, 1] [1, 1] : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_3 : memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_4 : memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } } } failed to translate executables // -----// IR Dump After TranslateAllExecutablesPass Failed (iree-hal-translate-all-executables) //----- // hal.executable public @main$async_dispatch_6 { hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target>}>) { hal.executable.export public @main$async_dispatch_6_slow_memcpy ordinal(0) layout(#hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) { ^bb0(%arg0: !hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg1 hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_6_slow_memcpy() attributes {translation_info = #iree_codegen.translation_info} { %c1 = arith.constant 1 : index %c2048 = arith.constant 2048 : index %c64 = arith.constant 64 : index %0 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(1) : i32 %2 = hal.interface.constant.load layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index %6:3 = util.assume.int %3[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %4[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ], %5[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ] : index, index, index %7 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %7, 1 : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> %8 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%6#1) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%6#2) flags(Indirect) : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %7[0, 0] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_0 = memref.subview %7[0, 1] [2048, 2048] [1, 2] : memref<2048x4096xf32, strided<[4096, 1], offset: ?>, #hal.descriptor_type> to memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %thread_id_x = gpu.thread_id x %thread_id_y = gpu.thread_id y scf.for %arg0 = %thread_id_y to %c2048 step %c1 { scf.for %arg1 = %thread_id_x to %c2048 step %c64 { %subview_1 = memref.subview %subview_0[%arg0, %arg1] [1, 1] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %8[%arg0, %arg1] [1, 1] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_2 : memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } } scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (1, 64) { %subview_1 = memref.subview %subview[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_2 = memref.subview %9[%arg0, %arg1] [1, 64] [1, 1] : memref<2048x2048xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> %subview_3 = memref.subview %subview_1[0, %thread_id_x] [1, 1] [1, 1] : memref<1x64xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type> %subview_4 = memref.subview %subview_2[0, %thread_id_x] [1, 1] [1, 1] : memref<1x64xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> to memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type> linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_3 : memref<1x1xf32, strided<[4096, 2], offset: ?>, #hal.descriptor_type>) outs(%subview_4 : memref<1x1xf32, strided<[2048, 1], offset: ?>, #hal.descriptor_type>) attrs = {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config} { ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} return } } } }