From 53e960146727759735815cac516683abb9bf5f86 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Mon, 25 Nov 2024 16:02:51 -0500 Subject: [PATCH 01/54] Integrate llvm-project at fe3c23b439b9a2d00442d9bc6a4ca86f73066a3d (#19287) Still carrying a revert for 1004865f1ca41a9581da8747f34b29862d3ebc3d and a cherry pick for https://github.com/llvm/llvm-project/pull/116650. --- .../Common/GPU/VectorReductionToGPU.cpp | 19 +++++++------- .../TransformExtensions/LLVMGPUExtensions.cpp | 26 +++++++++---------- ...transform_dialect_vector_distribution.mlir | 2 +- third_party/llvm-project | 2 +- 4 files changed, 24 insertions(+), 25 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp index 314b5844d966..e458da23707c 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp @@ -39,7 +39,7 @@ static void debugPrint(Operation *op, const char *message) { /// Emit shared local memory allocation in case it is needed when lowering the /// warp operations. static Value allocateGlobalSharedMemory(Location loc, OpBuilder &builder, - vector::WarpExecuteOnLane0Op warpOp, + gpu::WarpExecuteOnLane0Op warpOp, Type type) { MemRefType memrefType; auto addressSpaceAttr = gpu::AddressSpaceAttr::get( @@ -83,8 +83,7 @@ static bool isUniformLoad(Operation *op) { /// Hoist uniform operations as well as special hal operations that have side /// effect but are safe to move out of the warp single lane region. -static void -moveScalarAndBindingUniformCode(vector::WarpExecuteOnLane0Op warpOp) { +static void moveScalarAndBindingUniformCode(gpu::WarpExecuteOnLane0Op warpOp) { /// Hoist ops without side effect as well as special binding ops. auto canBeHoisted = [](Operation *op, function_ref definedOutside) { @@ -155,12 +154,12 @@ struct InsertToBroadcast final : OpRewritePattern { }; /// Pattern to sink `gpu.barrier` ops out of a `warp_execute_on_lane_0` op. 
-struct WarpOpBarrier final : OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +struct WarpOpBarrier final : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp, + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { - auto yield = cast( + auto yield = cast( warpOp.getBodyRegion().getBlocks().begin()->getTerminator()); Operation *lastNode = yield->getPrevNode(); auto barrierOp = dyn_cast_or_null(lastNode); @@ -233,7 +232,7 @@ struct VectorReductionToGPUPass final auto threadX = builder.create(loc, builder.getIndexType(), gpu::Dimension::x); auto cstGroupSize = builder.create(loc, groupSize); - auto warpOp = builder.create( + auto warpOp = builder.create( loc, TypeRange(), threadX.getResult(), groupSize); warpOp.getWarpRegion().takeBody(funcOp.getFunctionBody()); Block &newBlock = funcOp.getFunctionBody().emplaceBlock(); @@ -243,7 +242,7 @@ struct VectorReductionToGPUPass final warpOp.getWarpRegion().getBlocks().back().back().moveBefore(&newBlock, newBlock.end()); builder.setInsertionPointToEnd(&warpOp.getWarpRegion().getBlocks().back()); - builder.create(loc); + builder.create(loc); debugPrint(funcOp, "after step #2: wrapping code with the warp execute op"); @@ -300,7 +299,7 @@ struct VectorReductionToGPUPass final vector::WarpExecuteOnLane0LoweringOptions options; options.warpAllocationFn = allocateGlobalSharedMemory; options.warpSyncronizationFn = [](Location loc, OpBuilder &builder, - vector::WarpExecuteOnLane0Op warpOp) { + gpu::WarpExecuteOnLane0Op warpOp) { builder.create(loc); }; vector::populateWarpExecuteOnLane0OpToScfForPattern(patterns, options); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp index 7dc2e4093d58..c52ae4bcc157 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp @@ -153,7 +153,7 @@ void transform_dialect::VectorToWarpExecuteOnLane0Op::build( // SCCP. static LogicalResult replaceAllUsesOfLaneWithin(RewriterBase &b, - vector::WarpExecuteOnLane0Op executeOp) { + gpu::WarpExecuteOnLane0Op executeOp) { OpBuilder::InsertionGuard g(b); b.setInsertionPoint(executeOp); Value zero = b.create(executeOp.getLoc(), 0); @@ -225,7 +225,7 @@ static FailureOr isThreadIdxxZeroPredicate(scf::IfOp ifOp) { } struct VectorDistributionResult { - vector::WarpExecuteOnLane0Op warpOp; + gpu::WarpExecuteOnLane0Op warpOp; }; static FailureOr @@ -257,7 +257,7 @@ rewriteScfIfAsWarpExecuteOnLane0(RewriterBase &rewriter, Location loc, rewriter.create(loc, predicate, /*withElseRegion=*/false); rewriter.setInsertionPointToStart(&newIfOp.getThenRegion().front()); } - auto warpOp = rewriter.create( + auto warpOp = rewriter.create( loc, TypeRange(), threadIdxx, warpSize); // Move the code from the previous ifOp to the @@ -270,7 +270,7 @@ rewriteScfIfAsWarpExecuteOnLane0(RewriterBase &rewriter, Location loc, sourceBlock.without_terminator().begin(), sourceBlock.without_terminator().end()); rewriter.setInsertionPointToEnd(&targetBlock); - rewriter.create(loc); + rewriter.create(loc); // Erase old op. 
rewriter.eraseOp(ifOp); @@ -358,7 +358,7 @@ void transform_dialect::VectorWarpDistributionOp::getEffects( /// Emit shared local memory allocation in case it is needed when lowering the /// warp operations. static Value allocateGlobalSharedMemory(Location loc, OpBuilder &builder, - vector::WarpExecuteOnLane0Op warpOp, + gpu::WarpExecuteOnLane0Op warpOp, Type type) { MemRefType memrefType; auto addressSpaceAttr = gpu::AddressSpaceAttr::get( @@ -374,11 +374,11 @@ static Value allocateGlobalSharedMemory(Location loc, OpBuilder &builder, return builder.create(loc, memrefType); } -/// Return a value yielded by `warpOp` which statifies the filter lamdba +/// Return a value yielded by `warpOp` which satisfies the filter lambda /// condition and is not dead. -static OpOperand *getWarpResult(vector::WarpExecuteOnLane0Op warpOp, +static OpOperand *getWarpResult(gpu::WarpExecuteOnLane0Op warpOp, function_ref fn) { - auto yield = cast( + auto yield = cast( warpOp.getBodyRegion().getBlocks().begin()->getTerminator()); for (OpOperand &yieldOperand : yield->getOpOperands()) { Value yieldValues = yieldOperand.get(); @@ -426,9 +426,9 @@ class InsertElementToBroadcast final /// } /// gpu.synchronize /// %0 = memref.load %src[%c0] : memref<1024xf32> -struct WarpOpLoad : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp, +struct WarpOpLoad : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred); if (!operand) @@ -476,7 +476,7 @@ struct HoistSharedMemoryAlloc : public OpRewritePattern { PatternRewriter &rewriter) const override { if (!iree_compiler::hasSharedMemoryAddressSpace(alloc.getType())) return failure(); - auto warpParent = alloc->getParentOfType(); + auto warpParent = alloc->getParentOfType(); if (!warpParent) return failure(); alloc->moveBefore(warpParent); @@ -561,7 +561,7 @@ static void populatePropagateVectorDistribution(Operation *target, } static void warpSyncronizationFn(Location loc, OpBuilder &builder, - vector::WarpExecuteOnLane0Op warpOp) { + gpu::WarpExecuteOnLane0Op warpOp) { builder.create(loc); }; diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir index 1cad1aa50614..6ee43c98fcf8 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir @@ -24,7 +24,7 @@ func.func @reduce_dispatch_0() attributes {translation_info = #translation_info} // WARP-EXECUTE: %[[COND32:.*]] = arith.cmpi ult, %[[TIDX]], %[[C32]] : index // Single-warp guard filters out threads 32-63. 
// WARP-EXECUTE: scf.if %[[COND32]] { - // WARP-EXECUTE: vector.warp_execute_on_lane_0(%[[TIDX]])[32] { + // WARP-EXECUTE: gpu.warp_execute_on_lane_0(%[[TIDX]])[32] { // WARP-EXECUTE: %[[V:.*]] = "some_def"() : () -> vector<128xf32> // WARP-EXECUTE: vector.transfer_write %[[V]], %{{.*}} {in_bounds = [true]} : vector<128xf32>, memref<128xf32> diff --git a/third_party/llvm-project b/third_party/llvm-project index 58f1b107d7a3..534730273092 160000 --- a/third_party/llvm-project +++ b/third_party/llvm-project @@ -1 +1 @@ -Subproject commit 58f1b107d7a377ff6d456f16f060606ea4430041 +Subproject commit 534730273092b8e7d4bedc1a3206d76e6848c6c4 From ef4ecf3b7111cead7bf29f630965c36395e515e9 Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Mon, 25 Nov 2024 16:37:57 -0800 Subject: [PATCH 02/54] [iree.build] Wire up out of process concurrency. (#19291) * Introduces an explicit thunk creation stage which gives a way to create a fully remotable object. * Reworks process concurrency to occupy a host thread in addition to a sub-process, which keeps the task concurrency accounting simple and makes errors propagate more easily. * Adds a test action for invoking a thunk out of process. * This is the boilerplate required while implementing a turbine AOT export action. Signed-off-by: Stella Laurenzo --- compiler/bindings/python/CMakeLists.txt | 1 + .../bindings/python/iree/build/executor.py | 59 ++++++++++++++---- .../python/iree/build/test_actions.py | 31 ++++++++++ .../python/test/build_api/CMakeLists.txt | 7 +++ .../python/test/build_api/concurrency_test.py | 61 +++++++++++++++++++ 5 files changed, 146 insertions(+), 13 deletions(-) create mode 100644 compiler/bindings/python/iree/build/test_actions.py create mode 100644 compiler/bindings/python/test/build_api/concurrency_test.py diff --git a/compiler/bindings/python/CMakeLists.txt b/compiler/bindings/python/CMakeLists.txt index e606e7309aef..5251319b7132 100644 --- a/compiler/bindings/python/CMakeLists.txt +++ b/compiler/bindings/python/CMakeLists.txt @@ -268,6 +268,7 @@ SOURCES net_actions.py onnx_actions.py target_machine.py + test_actions.py ) add_mlir_python_modules(IREECompilerBuildPythonModules diff --git a/compiler/bindings/python/iree/build/executor.py b/compiler/bindings/python/iree/build/executor.py index 532f235ba6ca..8554e207976a 100644 --- a/compiler/bindings/python/iree/build/executor.py +++ b/compiler/bindings/python/iree/build/executor.py @@ -276,8 +276,17 @@ def __str__(self) -> str: return self.value -class BuildAction(BuildDependency, abc.ABC): - """An action that must be carried out.""" +class BuildAction(BuildDependency): + """An action that must be carried out. + + This class is designed to be subclassed by concrete actions. In-process + only actions should override `_invoke`, whereas those that can be executed + out-of-process must override `_remotable_thunk`. + + Note that even actions that are marked for `PROCESS` concurrency will + run on a dedicated thread within the host process. Only the `_remotable_thunk` + result will be scheduled out of process. 
+ """ def __init__( self, @@ -289,7 +298,7 @@ def __init__( ): super().__init__(executor=executor, deps=deps) self.desc = desc - self.concurrnecy = concurrency + self.concurrency = concurrency def __str__(self): return self.desc @@ -297,12 +306,35 @@ def __str__(self): def __repr__(self): return f"Action[{type(self).__name__}]('{self.desc}')" - def invoke(self): - self._invoke() + def invoke(self, scheduler: "Scheduler"): + # Invoke is run within whatever in-process execution context was requested: + # - On the scheduler thread for NONE + # - On a worker thread for THREAD or PROCESS + # For PROCESS concurrency, we have to create a compatible invocation + # thunk, schedule that on the process pool and wait for it. + if self.concurrency == ActionConcurrency.PROCESS: + thunk = self._remotable_thunk() + fut = scheduler.process_pool_executor.submit(thunk) + fut.result() + else: + self._invoke() - @abc.abstractmethod def _invoke(self): - ... + self._remotable_thunk()() + + def _remotable_thunk(self) -> Callable[[], None]: + """Creates a remotable no-arg thunk that will execute this out of process. + + This must return a no arg/result callable that can be pickled. While there + are various ways to ensure this, here are a few guidelines: + + * Must be a type/function defined at a module level. + * Cannot be decorated. + * Must only contain attributes with the same constraints. + """ + raise NotImplementedError( + f"Action '{self}' does not implement remotable invocation" + ) class BuildContext(BuildDependency): @@ -513,19 +545,20 @@ def _schedule_action(self, dep: BuildDependency): if isinstance(dep, BuildAction): def invoke(): - dep.invoke() + dep.invoke(self) return dep print(f"Scheduling action: {dep}", file=self.stderr) - if dep.concurrnecy == ActionConcurrency.NONE: + if dep.concurrency == ActionConcurrency.NONE: invoke() - elif dep.concurrnecy == ActionConcurrency.THREAD: + elif ( + dep.concurrency == ActionConcurrency.THREAD + or dep.concurrency == ActionConcurrency.PROCESS + ): dep.start(self.thread_pool_executor.submit(invoke)) - elif dep.concurrnecy == ActionConcurrency.PROCESS: - dep.start(self.process_pool_executor.submit(invoke)) else: raise AssertionError( - f"Unhandled ActionConcurrency value: {dep.concurrnecy}" + f"Unhandled ActionConcurrency value: {dep.concurrency}" ) else: # Not schedulable. Just mark it as done. diff --git a/compiler/bindings/python/iree/build/test_actions.py b/compiler/bindings/python/iree/build/test_actions.py new file mode 100644 index 000000000000..e4b9c55e0eac --- /dev/null +++ b/compiler/bindings/python/iree/build/test_actions.py @@ -0,0 +1,31 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from typing import Callable +from iree.build.executor import ActionConcurrency, BuildAction + + +class _ThunkTrampoline: + def __init__(self, thunk, args): + self.thunk = thunk + self.args = args + + def __call__(self): + self.thunk(*self.args) + + +class ExecuteOutOfProcessThunkAction(BuildAction): + """Executes a callback thunk with arguments. + + Both the thunk and args must be pickleable. 
+ """ + + def __init__(self, thunk, args, concurrency=ActionConcurrency.PROCESS, **kwargs): + super().__init__(concurrency=concurrency, **kwargs) + self.trampoline = _ThunkTrampoline(thunk, args) + + def _remotable_thunk(self) -> Callable[[], None]: + return self.trampoline diff --git a/compiler/bindings/python/test/build_api/CMakeLists.txt b/compiler/bindings/python/test/build_api/CMakeLists.txt index b8bd81759ddc..5c8f97123d63 100644 --- a/compiler/bindings/python/test/build_api/CMakeLists.txt +++ b/compiler/bindings/python/test/build_api/CMakeLists.txt @@ -13,3 +13,10 @@ if(IREE_INPUT_TORCH) "mnist_builder_test.py" ) endif() + +iree_py_test( + NAME + concurrency_test + SRCS + "concurrency_test.py" +) diff --git a/compiler/bindings/python/test/build_api/concurrency_test.py b/compiler/bindings/python/test/build_api/concurrency_test.py new file mode 100644 index 000000000000..498179b73188 --- /dev/null +++ b/compiler/bindings/python/test/build_api/concurrency_test.py @@ -0,0 +1,61 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import os +from pathlib import Path +import tempfile +import unittest + +from iree.build import * +from iree.build.executor import BuildContext +from iree.build.test_actions import ExecuteOutOfProcessThunkAction + + +@entrypoint +def write_out_of_process_pid(): + context = BuildContext.current() + output_file = context.allocate_file("pid.txt") + action = ExecuteOutOfProcessThunkAction( + _write_pid_file, + args=[output_file.get_fs_path()], + desc="Writing pid file", + executor=context.executor, + ) + output_file.deps.add(action) + return output_file + + +def _write_pid_file(output_path: Path): + pid = os.getpid() + print(f"Running action out of process: pid={pid}") + output_path.write_text(str(pid)) + + +class ConcurrencyTest(unittest.TestCase): + def setUp(self): + self._temp_dir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) + self._temp_dir.__enter__() + self.output_path = Path(self._temp_dir.name) + + def tearDown(self) -> None: + self._temp_dir.__exit__(None, None, None) + + def testProcessConcurrency(self): + parent_pid = os.getpid() + print(f"Testing out of process concurrency: pid={parent_pid}") + iree_build_main( + args=["write_out_of_process_pid", "--output-dir", str(self.output_path)] + ) + pid_file = ( + self.output_path / "genfiles" / "write_out_of_process_pid" / "pid.txt" + ) + child_pid = int(pid_file.read_text()) + print(f"Got child pid={child_pid}") + self.assertNotEqual(parent_pid, child_pid) + + +if __name__ == "__main__": + unittest.main() From 4e3e898f83c00bd3e5d03bf009dff95a066fe996 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Mon, 25 Nov 2024 19:54:34 -0500 Subject: [PATCH 03/54] Use `llvm::filter_to_vector`. NFC. (#19297) I recently added this to SmallVectorExtras: https://github.com/llvm/llvm-project/pull/117460. 
--- .../Codegen/Common/LinkTuningSpecsPass.cpp | 14 ++++++-------- .../Codegen/Common/ReconcileTranslationInfo.cpp | 6 +++--- .../Interfaces/PartitionableLoopsInterface.cpp | 8 ++++---- .../compiler/Codegen/LLVMCPU/KernelDispatch.cpp | 5 +++-- .../SPIRV/SPIRVMaterializeExecutableConditions.cpp | 5 +++-- .../Stream/Transforms/ScheduleAllocation.cpp | 10 +++++----- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/LinkTuningSpecsPass.cpp b/compiler/src/iree/compiler/Codegen/Common/LinkTuningSpecsPass.cpp index a13f3802bec2..ab9ddce82dd0 100644 --- a/compiler/src/iree/compiler/Codegen/Common/LinkTuningSpecsPass.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/LinkTuningSpecsPass.cpp @@ -35,19 +35,17 @@ using mlir::transform::NamedSequenceOp; static SmallVector findNestedModulesWithNamedSequences(ModuleOp module) { Block *body = module.getBody(); - return llvm::to_vector( - llvm::make_filter_range(body->getOps(), [](ModuleOp op) { - return op.getSymName().has_value() && - op->hasAttr( - transform::TransformDialect::kWithNamedSequenceAttrName); - })); + return llvm::filter_to_vector(body->getOps(), [](ModuleOp op) { + return op.getSymName().has_value() && + op->hasAttr(transform::TransformDialect::kWithNamedSequenceAttrName); + }); } static SmallVector findTuningSpecs(ModuleOp module) { Block *body = module.getBody(); - return llvm::to_vector(llvm::make_filter_range( + return llvm::filter_to_vector( body->getOps(), - [](NamedSequenceOp op) { return op->hasAttr(kTuningSpecAttrName); })); + [](NamedSequenceOp op) { return op->hasAttr(kTuningSpecAttrName); }); } static LogicalResult validateTuningSpec(NamedSequenceOp op) { diff --git a/compiler/src/iree/compiler/Codegen/Common/ReconcileTranslationInfo.cpp b/compiler/src/iree/compiler/Codegen/Common/ReconcileTranslationInfo.cpp index 0fc04f80baff..714e50a5d5fd 100644 --- a/compiler/src/iree/compiler/Codegen/Common/ReconcileTranslationInfo.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/ReconcileTranslationInfo.cpp @@ -265,8 +265,8 @@ static LogicalResult resolveWorkgroupForAll(RewriterBase &rewriter, } auto forAllOps = body.getOps(); - SmallVector workgroupForAllOps = llvm::to_vector( - llvm::make_filter_range(forAllOps, [&](scf::ForallOp forAllOp) { + SmallVector workgroupForAllOps = + llvm::filter_to_vector(forAllOps, [&](scf::ForallOp forAllOp) { auto mapping = forAllOp.getMapping(); if (!mapping) { return false; @@ -277,7 +277,7 @@ static LogicalResult resolveWorkgroupForAll(RewriterBase &rewriter, return false; } return true; - })); + }); if (workgroupForAllOps.empty()) { // If there are no workgroup distribution loops, set the default diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.cpp b/compiler/src/iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.cpp index 94be437540e3..f669b5e0e17b 100644 --- a/compiler/src/iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.cpp +++ b/compiler/src/iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.cpp @@ -11,6 +11,7 @@ #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h" #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SmallVectorExtras.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/IR/BuiltinTypes.h" @@ -26,10 +27,9 @@ namespace mlir::iree_compiler { static llvm::SmallVector pruneUnitTripParallelLoops(llvm::ArrayRef parallelLoops, 
llvm::ArrayRef loopRanges) { - return llvm::to_vector( - llvm::make_filter_range(parallelLoops, [&loopRanges](unsigned loopDim) { - return loopRanges[loopDim] != 1; - })); + return llvm::filter_to_vector(parallelLoops, [&loopRanges](unsigned loopDim) { + return loopRanges[loopDim] != 1; + }); } /// Returns the partitionable loops for all Linalg ops. diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp index a5fb9bf313a7..bdb252745a2a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp @@ -17,6 +17,7 @@ #include "iree/compiler/Dialect/HAL/IR/HALTypes.h" #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h" #include "iree/compiler/Dialect/LinalgExt/Utils/IndexingUtils.h" +#include "llvm/ADT/SmallVectorExtras.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -2784,10 +2785,10 @@ adjustTileSizesForUnPackOp(mlir::FunctionOpInterface entryPointFn, // Remove the "enable_loop_peeling" attr from pipelineConfig auto enableLoopPeelingAttrName = getEnableLoopPeelingAttrName(rootOp->getContext()); - auto newPipelineConfigEntries = llvm::to_vector(llvm::make_filter_range( + auto newPipelineConfigEntries = llvm::filter_to_vector( pipelineConfig.getValue(), [&](NamedAttribute entry) { return entry.getName() != enableLoopPeelingAttrName; - })); + }); pipelineConfig = DictionaryAttr::get(rootOp->getContext(), newPipelineConfigEntries); diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVMaterializeExecutableConditions.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVMaterializeExecutableConditions.cpp index 0228656f57a0..57508cbdc2a2 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVMaterializeExecutableConditions.cpp +++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVMaterializeExecutableConditions.cpp @@ -8,6 +8,7 @@ #include "iree/compiler/Codegen/SPIRV/Utils.h" #include "iree/compiler/Dialect/HAL/IR/HALOps.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVectorExtras.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h" #include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h" @@ -322,10 +323,10 @@ struct SPIRVMaterializeExecutableConditionsPass final // Drop the fine-grained SPIR-V target and add the course-grained device // queries as a list. - auto dictKeyValues = llvm::to_vector(llvm::make_filter_range( + auto dictKeyValues = llvm::filter_to_vector( configuration.getValue(), [](NamedAttribute attr) { return attr.getName() != spirv::getTargetEnvAttrName(); - })); + }); dictKeyValues.emplace_back(builder.getStringAttr("iree.spirv.features"), builder.getStrArrayAttr(queries)); variantOp.setTargetAttr(IREE::HAL::ExecutableTargetAttr::get( diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp index 7741d26651b8..e0b3fbfe93e9 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp @@ -102,10 +102,10 @@ static void computeRegionValueAliases(Operation *regionOp, // Filter out to only resource results - some regions may return additional // things like stream.async.execute returning a timepoint. 
- auto resourceResults = llvm::to_vector_of( - llvm::make_filter_range(regionOp->getResults(), [](OpResult result) { + auto resourceResults = + llvm::filter_to_vector(regionOp->getResults(), [](OpResult result) { return llvm::isa(result.getType()); - })); + }); // Start with outputs so that we handle tied values that may lead all the way // back up the chain to the stream inputs. @@ -1145,12 +1145,12 @@ static std::optional extractConstantsWithLifetime(IREE::Stream::AsyncExecuteOp executeOp, IREE::Stream::Lifetime lifetime, OpBuilder &externalBuilder) { - auto constantOps = llvm::to_vector(llvm::make_filter_range( + auto constantOps = llvm::filter_to_vector( executeOp.getOps(), [&](IREE::Stream::AsyncConstantOp op) { return cast(op.getResult().getType()) .getLifetime() == lifetime; - })); + }); if (constantOps.empty()) return {}; From 77ff99ca38592770e7e8619a3c3cdcebe2c587bc Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Mon, 25 Nov 2024 22:14:26 -0500 Subject: [PATCH 04/54] [Python] Disable build_api concurrency test (#19298) This test currently fails with python3.10. Error message: https://gist.github.com/kuhar/7af56cf5a760bead6af9176b99e26cef. --- .../bindings/python/test/build_api/CMakeLists.txt | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/compiler/bindings/python/test/build_api/CMakeLists.txt b/compiler/bindings/python/test/build_api/CMakeLists.txt index 5c8f97123d63..9721bb06887b 100644 --- a/compiler/bindings/python/test/build_api/CMakeLists.txt +++ b/compiler/bindings/python/test/build_api/CMakeLists.txt @@ -14,9 +14,10 @@ if(IREE_INPUT_TORCH) ) endif() -iree_py_test( - NAME - concurrency_test - SRCS - "concurrency_test.py" -) +# FIXME: This test fails on python3.10. +# iree_py_test( +# NAME +# concurrency_test +# SRCS +# "concurrency_test.py" +#) From 8677a6152ce7353cfaf30e9950ee9f52e7842641 Mon Sep 17 00:00:00 2001 From: Han-Chung Wang Date: Mon, 25 Nov 2024 23:02:13 -0800 Subject: [PATCH 05/54] [Codegen][NFC] Move encoding structs to IREECodegenTypes.h (#19292) --- .../Codegen/Dialect/Codegen/IR/BUILD.bazel | 1 + .../Codegen/Dialect/Codegen/IR/CMakeLists.txt | 1 + .../Dialect/Codegen/IR/IREECodegenTypes.h | 93 +++++++++++++++++++ .../Codegen/Dialect/Codegen/Utils/Utils.cpp | 2 +- .../Codegen/Dialect/Codegen/Utils/Utils.h | 77 +-------------- 5 files changed, 98 insertions(+), 76 deletions(-) create mode 100644 compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenTypes.h diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/BUILD.bazel index 49db3762434b..6a4ba9a8f272 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/BUILD.bazel @@ -70,6 +70,7 @@ iree_compiler_cc_library( "IREECodegenInterfaces.h.inc", "IREECodegenOps.cpp.inc", "IREECodegenOps.h.inc", + "IREECodegenTypes.h", "LoweringConfigEnums.cpp.inc", "LoweringConfigEnums.h.inc", "UKernelOps.cpp.inc", diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/CMakeLists.txt index bc40433ca636..999819510652 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/CMakeLists.txt @@ -28,6 +28,7 @@ iree_cc_library( "IREECodegenInterfaces.h.inc" "IREECodegenOps.cpp.inc" "IREECodegenOps.h.inc" + "IREECodegenTypes.h" "LoweringConfigEnums.cpp.inc" 
"LoweringConfigEnums.h.inc" "UKernelOps.cpp.inc" diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenTypes.h b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenTypes.h new file mode 100644 index 000000000000..c41d581bf765 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenTypes.h @@ -0,0 +1,93 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_COMPILER_CODEGEN_DIALECT_CODEGEN_IR_IREECODEGENTYPES_H_ +#define IREE_COMPILER_CODEGEN_DIALECT_CODEGEN_IR_IREECODEGENTYPES_H_ + +#include + +#include "llvm/ADT/SmallVector.h" +#include "mlir/Support/LLVM.h" + +namespace mlir::iree_compiler::IREE::Codegen { +//===----------------------------------------------------------------------===// +// Layout Struct Types. +//===----------------------------------------------------------------------===// + +// Metadata for a swizzle, that is, an (expand_shape -> transposition) +// pair of ops performing a change of layout within the tiles. This is used +// on GPU, where the tiles themselves can have an arbitrary layout. +struct TileSwizzle { + struct Dim { + // Describes what varies across this dimension. + enum class Kind : int8_t { + // This dimension is internal to one intrinsic on one thread. This + // is only seen for intrinsic operands that are themselves vectors. + // For example, with AMD MFMA, for the MFMA_F32_16x16x4_F32 intrinsic, + // the C-matrix operand is a vector of 4 floats already at the level of + // one intrinsic on one thread. That dimension of size 4 is 'Internal'. + Internal, + // This dimension is internal to one intrinsic, but is across threads. + // For example, with AMD MFMA, for the MFMA_F32_16x16x4_F32 intrinsic, + // the A-matrix tile has shape 16x4, and these two dimensions of size 16 + // and 4 are 'CrossThread': neither is visible at the single-thread level + // (in the intrinsic itself, the A-matrix operand is a single scalar) but + // as we move along these dimensions, we are moving over the 64 threads + // of the subgroup. + // + // Another example of cross-thread dimensions is in kernels that are + // "unrolled" across subgroups. Such dimensions are cross-subgroup, so in + // particular they are cross-thread. + CrossThread, + // This dimensions is across intrinsics, as in, actual instructions in the + // generated code. In other words, it is an actual unrolling factor, + // resulting in this many more instructions being generated and executed + // on each thread/subgroup. + CrossIntrinsic + }; + + Kind kind = Kind::Internal; + + // The size of the dimension. + int16_t size = 0; + + // Support constructing from any size type. + template + Dim(Kind kind, T size) : kind(kind), size(size) {} + }; + + using ExpandShapeDimVectorType = llvm::SmallVector; + using ExpandShapeType = llvm::SmallVector; + + // This vector-of-vectors contains all the information needed to generate + // a `tensor.expand_shape` creating additional internal dimensions into the + // tile. For example, expandShape = [[16], [4, 2]] means that the original + // tile shape [16, 8] gets expanded such that the first dimension 16 is left + // unchanged, and the second dimension 8 gets split into two internal dims + // of size 4 and 2. 
+ ExpandShapeType expandShape; + // This permutation vector applies to the expanded dimensions and is used + // to generate a `linalg.transpose` changing the layout of the tile. For + // example, permutation[0] dictates which of the expanded dimensions becomes + // the leading dimension of the layout. + llvm::SmallVector permutation; +}; + +/// Container of information needed to materialize the layout transformations. +struct MaterializeEncodingInfo { + // The next 3 fields are used to create a `tensor.pack` or `tensor.unpack` op, + // changing the overall layout between row-major and tiled (where each tile is + // row-major). + SmallVector innerDimsPos; + SmallVector innerTileSizes; + SmallVector outerDimsPerm; + + // The optional swizzle, see the comment on TileSwizzle. Only used on GPU. + std::optional swizzle; +}; + +} // namespace mlir::iree_compiler::IREE::Codegen +#endif // IREE_COMPILER_CODEGEN_DIALECT_CODEGEN_IR_IREECODEGENTYPES_H_ diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.cpp index 266153b042af..f2fbe11b2565 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.cpp @@ -14,7 +14,7 @@ namespace mlir::iree_compiler::IREE::Codegen { //===----------------------------------------------------------------------===// -// Layout Structs. +// Relational operator and IOstream implementations for Layout Structs. //===----------------------------------------------------------------------===// bool operator==(TileSwizzle::Dim lhs, TileSwizzle::Dim rhs) { diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h index 98e4bf4ea56a..5ef27f7018d8 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h @@ -7,8 +7,7 @@ #ifndef IREE_COMPILER_CODEGEN_DIALECT_CODEGEN_UTILS_H_ #define IREE_COMPILER_CODEGEN_DIALECT_CODEGEN_UTILS_H_ -#include - +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenTypes.h" #include "llvm-c/TargetMachine.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/raw_ostream.h" @@ -19,68 +18,9 @@ namespace mlir::iree_compiler::IREE::Codegen { //===----------------------------------------------------------------------===// -// Layout Structs. +// Relational operator and IOstream implementations for Layout Structs. //===----------------------------------------------------------------------===// -// Metadata for a swizzle, that is, an (expand_shape -> transposition) -// pair of ops performing a change of layout within the tiles. This is used -// on GPU, where the tiles themselves can have an arbitrary layout. -struct TileSwizzle { - struct Dim { - // Describes what varies across this dimension. - enum class Kind : int8_t { - // This dimension is internal to one intrinsic on one thread. This - // is only seen for intrinsic operands that are themselves vectors. - // For example, with AMD MFMA, for the MFMA_F32_16x16x4_F32 intrinsic, - // the C-matrix operand is a vector of 4 floats already at the level of - // one intrinsic on one thread. That dimension of size 4 is 'Internal'. - Internal, - // This dimension is internal to one intrinsic, but is across threads. 
- // For example, with AMD MFMA, for the MFMA_F32_16x16x4_F32 intrinsic, - // the A-matrix tile has shape 16x4, and these two dimensions of size 16 - // and 4 are 'CrossThread': neither is visible at the single-thread level - // (in the intrinsic itself, the A-matrix operand is a single scalar) but - // as we move along these dimensions, we are moving over the 64 threads - // of the subgroup. - // - // Another example of cross-thread dimensions is in kernels that are - // "unrolled" across subgroups. Such dimensions are cross-subgroup, so in - // particular they are cross-thread. - CrossThread, - // This dimensions is across intrinsics, as in, actual instructions in the - // generated code. In other words, it is an actual unrolling factor, - // resulting in this many more instructions being generated and executed - // on each thread/subgroup. - CrossIntrinsic - }; - - Kind kind = Kind::Internal; - - // The size of the dimension. - int16_t size = 0; - - // Support constructing from any size type. - template - Dim(Kind kind, T size) : kind(kind), size(size) {} - }; - - using ExpandShapeDimVectorType = llvm::SmallVector; - using ExpandShapeType = llvm::SmallVector; - - // This vector-of-vectors contains all the information needed to generate - // a `tensor.expand_shape` creating additional internal dimensions into the - // tile. For example, expandShape = [[16], [4, 2]] means that the original - // tile shape [16, 8] gets expanded such that the first dimension 16 is left - // unchanged, and the second dimension 8 gets split into two internal dims - // of size 4 and 2. - ExpandShapeType expandShape; - // This permutation vector applies to the expanded dimensions and is used - // to generate a `linalg.transpose` changing the layout of the tile. For - // example, permutation[0] dictates which of the expanded dimensions becomes - // the leading dimension of the layout. - llvm::SmallVector permutation; -}; - bool operator==(TileSwizzle::Dim lhs, TileSwizzle::Dim rhs); bool operator!=(TileSwizzle::Dim lhs, TileSwizzle::Dim rhs); bool operator==(const TileSwizzle &lhs, const TileSwizzle &rhs); @@ -94,19 +34,6 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os, TileSwizzle::Dim dim); llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const TileSwizzle &swizzle); -/// Container of information needed to materialize the layout transformations. -struct MaterializeEncodingInfo { - // The next 3 fields are used to create a `tensor.pack` or `tensor.unpack` op, - // changing the overall layout between row-major and tiled (where each tile is - // row-major). - SmallVector innerDimsPos; - SmallVector innerTileSizes; - SmallVector outerDimsPerm; - - // The optional swizzle, see the comment on TileSwizzle. Only used on GPU. - std::optional swizzle; -}; - bool operator==(const MaterializeEncodingInfo &lhs, const MaterializeEncodingInfo &rhs); bool operator!=(const MaterializeEncodingInfo &lhs, From a4c6f35fbf732c6c7fc349f0815b13551a1b8a26 Mon Sep 17 00:00:00 2001 From: Prashant Kumar Date: Tue, 26 Nov 2024 19:34:34 +0530 Subject: [PATCH 06/54] [LLVMGPU] Disable scf.forall distribution for matmulSimt (#19302) We are moving towards tile&fuse pipeline. `matmulSimt` pipeline is to be deprecated. Meanwhile, all existing tests that depend on matmulsimt pipeline shouldn't break and hence disabling the forall distribution for the pipeline. 
--- compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 7107e131685e..26dced54768f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -530,7 +530,7 @@ void addGPUWinogradVectorizePassPipeline(OpPassManager &funcPassManager) { void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager, const GPUPipelineOptions &options) { - tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/true); + tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/false); funcPassManager.addPass(createConfigTrackingCanonicalizerPass()); funcPassManager.addPass(createConfigTrackingCanonicalizerPass()); From 2e2c109e8d58c8964ad7a3ba26d6addbfefbc1c4 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Tue, 26 Nov 2024 11:57:40 -0500 Subject: [PATCH 07/54] Integrate llvm-project at db6f627f3fd4072fe1814805653a352694527a91 (#19304) Still carrying a revert for 1004865f1ca41a9581da8747f34b29862d3ebc3d and a cherry pick for https://github.com/llvm/llvm-project/pull/116650. --- third_party/llvm-project | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/llvm-project b/third_party/llvm-project index 534730273092..04082f21dde6 160000 --- a/third_party/llvm-project +++ b/third_party/llvm-project @@ -1 +1 @@ -Subproject commit 534730273092b8e7d4bedc1a3206d76e6848c6c4 +Subproject commit 04082f21dde6f5722520d253d0d99f55b4834b7c From cef4178c43017be82209e2f11dd433ed3fbe88f5 Mon Sep 17 00:00:00 2001 From: Han-Chung Wang Date: Tue, 26 Nov 2024 09:16:13 -0800 Subject: [PATCH 08/54] [Codegen][NFC] Switch Dim::Kind output stream to use existing methods. (#19293) The `convertSwizzleKindToString` has the same implementation. The revision switches the implementation to use the method directly. Signed-off-by: hanhanW --- .../Codegen/Dialect/Codegen/Utils/Utils.cpp | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.cpp index f2fbe11b2565..7b1e57480a20 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.cpp @@ -36,18 +36,7 @@ bool operator!=(const TileSwizzle &lhs, const TileSwizzle &rhs) { llvm::raw_ostream &operator<<(llvm::raw_ostream &os, TileSwizzle::Dim::Kind kind) { - switch (kind) { - case TileSwizzle::Dim::Kind::Internal: - return os << "Internal"; - case TileSwizzle::Dim::Kind::CrossThread: - return os << "CrossThread"; - case TileSwizzle::Dim::Kind::CrossIntrinsic: - return os << "CrossIntrinsic"; - default: - // Required by GCC. - assert(false); - return os; - } + return os << convertSwizzleKindToString(kind); } llvm::raw_ostream &operator<<(llvm::raw_ostream &os, TileSwizzle::Dim dim) { From 746ad1efa3580ed53705e1246a29c749bf1e545b Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram <96096277+nirvedhmeshram@users.noreply.github.com> Date: Tue, 26 Nov 2024 11:44:00 -0600 Subject: [PATCH 09/54] [GPU] Add C promotion capability in promote matmul operands pass (#19256) This PR sets up the convention that when the operand index for promotion is beyond the dpsInputs, we promote the corresponding dpsInit's tied result. Result promotion is implemented in this PR.
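For illustration, here is a rough, self-contained sketch of that index convention; promoteInput and promoteInit below are hypothetical stand-ins for this pass's promoteOperand/promoteResult, and only the index arithmetic mirrors the patch. For example, on a linalg.matmul (two inputs, one init), promote_operands = [0, 1, 2] promotes the LHS, the RHS, and the result tied to the accumulator init, respectively.

#include "mlir/IR/Builders.h"
#include "mlir/Interfaces/DestinationStyleOpInterface.h"

// Hypothetical helpers standing in for the pass's own promotion logic.
void promoteInput(mlir::OpBuilder &b, mlir::Operation *op, unsigned idx);
void promoteInit(mlir::OpBuilder &b, mlir::Operation *op, unsigned idx);

// A promote_operands index i selects input i when i < numDpsInputs; otherwise
// it selects the result tied to init (i - numDpsInputs).
void promoteByIndex(mlir::OpBuilder &b,
                    mlir::DestinationStyleOpInterface dpsOp, unsigned i) {
  unsigned numInputs = dpsOp.getNumDpsInputs();
  if (i < numInputs)
    promoteInput(b, dpsOp.getOperation(), i);
  else
    promoteInit(b, dpsOp.getOperation(), i - numInputs);
}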
Co-authored-by : Quinn Dawkins --------- Signed-off-by: Nirvedh --- .../Common/GPU/GPUPromoteMatmulOperands.cpp | 105 ++++++++++++++++-- .../compiler/Codegen/Common/GPU/Passes.td | 2 + .../GPU/test/gpu_promote_matmul_operands.mlir | 51 +++++++++ .../test/ROCDL/pipeline_tile_and_fuse.mlir | 72 ++++++++++++ 4 files changed, 221 insertions(+), 9 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp index 72e30caf709f..f3bfdde0d2df 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp @@ -9,8 +9,12 @@ #include "iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h" +#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h" +#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUInterfaces.h" #include "iree/compiler/Codegen/Utils/LinalgOpInfo.h" #include "iree/compiler/Codegen/Utils/Utils.h" +#include "mlir/Dialect/Bufferization/IR/Bufferization.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" @@ -25,11 +29,83 @@ namespace mlir::iree_compiler { #include "iree/compiler/Codegen/Common/GPU/Passes.h.inc" namespace { +/// Helper to insert copy with derived thread config. +Value promoteValue(OpBuilder &builder, Location loc, Value v) { + auto tensorType = cast(v.getType()); + SmallVector mixedSizes = tensor::getMixedSizes(builder, loc, v); + Value empty = builder.create(loc, mixedSizes, + tensorType.getElementType()); + auto copy = builder.create(loc, v, empty); + setLoweringConfig( + copy, IREE::GPU::DerivedThreadConfigAttr::get(builder.getContext())); + return copy.getResult(0); +} + +/// Helper to promote results. If the target value is consumed only by a +/// `tensor.extract_slice`, this will promote the result of the slice instead. +void promoteResult(OpBuilder &builder, Operation *op, Value valToMakeShared) { + IRRewriter rewriter(builder); + Location loc = op->getLoc(); + OpBuilder::InsertionGuard g(rewriter); + rewriter.setInsertionPointAfterValue(valToMakeShared); + tensor::ExtractSliceOp extractSliceOp; + SetVector opsToReplaceUseIn; + Value valueToReplace = valToMakeShared; + for (auto user : valToMakeShared.getUsers()) { + extractSliceOp = dyn_cast(user); + if (extractSliceOp) { + // If the result is consumed by an extract_slice then we expect there to + // be exactly one extract slice that is then consumed. + // TODO (nirvedhmeshram) : This is fairly special case. Instead we should + // just promote results before doing padding which introduces the extract + // slice. 
+ if (!valToMakeShared.hasOneUse()) + return; + valueToReplace = extractSliceOp.getResult(); + for (auto user : extractSliceOp->getUsers()) { + opsToReplaceUseIn.insert(user); + } + break; + } + opsToReplaceUseIn.insert(user); + } + auto tensorType = cast(valToMakeShared.getType()); + if (!tensorType) { + return; + } + SmallVector dynamicSizes; + for (auto [idx, size] : llvm::enumerate(tensorType.getShape())) { + if (ShapedType::isDynamic(size)) { + dynamicSizes.push_back( + rewriter.create(loc, valToMakeShared, idx)); + } + } + Attribute addressSpace = gpu::AddressSpaceAttr::get( + rewriter.getContext(), gpu::GPUDialect::getWorkgroupAddressSpace()); + auto alloc = rewriter.create(loc, tensorType, + dynamicSizes); + alloc.setMemorySpaceAttr(addressSpace); + auto copy = + rewriter.create(loc, valToMakeShared, alloc.getResult()); + + Value replacement = copy.getResult(0); + // If in extract slice is present we make it consume the new copy. + if (extractSliceOp) { + extractSliceOp.getSourceMutable().assign(replacement); + replacement = valueToReplace; + } + + rewriter.setInsertionPointAfterValue(replacement); + replacement = promoteValue(rewriter, loc, replacement); + valueToReplace.replaceUsesWithIf(replacement, [&](OpOperand &use) { + return opsToReplaceUseIn.contains(use.getOwner()); + }); +} /// Inserts a `linalg.copy` directly before the given operation on the /// specified operand, for example with operand index = 1: /// -/// linalg.matmul ins(%0, %1) +/// %2 = linalg.matmul ins(%0, %1) /// /// becomes /// @@ -41,7 +117,24 @@ namespace { /// If the producer is already a tilable op, the producer is just annotated with /// #iree_gpu.derived_thread_config to indicate that it should be distributed /// to threads independently of the matmul. +/// Additionally we can also promote results so in above example we will +/// generate for index = 2 : +/// %out_buffer = bufferization.alloc_tensor +/// %copy1 = linalg.copy %2 to %out_buffer +/// %copy2 = linalg.copy %copy1 to %empty { +/// lowering_config = #iree_gpu.derived_thread_config} void promoteOperand(OpBuilder &builder, Operation *op, unsigned index) { + auto dpsOp = dyn_cast(op); + if (!dpsOp) + return; + // We use the convention that if we are passing an index beyond the inputs + // then we promote the result of the corresponding dps init. 
+ if (index >= dpsOp.getNumDpsInputs()) { + index -= dpsOp.getNumDpsInputs(); + assert(index < op->getNumResults() && + "trying to promote out of bound result index"); + return promoteResult(builder, op, op->getResult(index)); + } Value operand = op->getOperand(index); if (auto producer = operand.getDefiningOp()) { @@ -70,14 +163,8 @@ void promoteOperand(OpBuilder &builder, Operation *op, unsigned index) { return; } - SmallVector mixedSizes = - tensor::getMixedSizes(builder, op->getLoc(), operand); - Value empty = builder.create(op->getLoc(), mixedSizes, - tensorType.getElementType()); - auto copy = builder.create(op->getLoc(), operand, empty); - setLoweringConfig( - copy, IREE::GPU::DerivedThreadConfigAttr::get(builder.getContext())); - op->setOperand(index, copy.getResult(0)); + auto replacement = promoteValue(builder, op->getLoc(), operand); + op->setOperand(index, replacement); } struct GPUPromoteMatmulOperandsPass final diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td index e6bbb5da40ca..e2415fd4c6ee 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td @@ -161,6 +161,8 @@ def GPUPromoteMatmulOperandsPass : let summary = "Pass to insert copies with a different thread configuration " "on matmul operands"; let dependentDialects = [ + "::mlir::bufferization::BufferizationDialect", + "::mlir::gpu::GPUDialect", "::mlir::linalg::LinalgDialect", "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect" ]; diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir index 643b12c01e39..1d8be0dc1900 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir @@ -106,3 +106,54 @@ func.func @promote_pad(%a : tensor<4x127xf32>, %b: tensor<128x128xf32>) -> tenso // CHECK: linalg.copy // CHECK-SAME: derived_thread_config // CHECK: return + +// ----- + +#lowering_config = #iree_gpu.lowering_config<{promote_operands = [2]}> +func.func @promote_result(%a : tensor, %b : tensor, %mdim : index, %ndim : index) -> tensor { + %cst = arith.constant 0.000000e+00 : f32 + %empty = tensor.empty(%mdim, %ndim) : tensor + %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor) -> tensor + %mm = linalg.matmul {lowering_config = #lowering_config} + ins(%a, %b : tensor, tensor) outs(%fill : tensor) -> tensor + return %mm : tensor +} + +// CHECK-LABEL: func @promote_result( +// CHECK: %[[MATMUL:.+]] = linalg.matmul +// CHECK: %[[ALLOC:.+]] = bufferization.alloc_tensor +// CHECK: %[[COPY1:.+]] = linalg.copy +// CHECK-SAME: ins(%[[MATMUL]] : tensor) outs(%[[ALLOC]] : tensor) +// CHECK-SAME: -> tensor +// CHECK: %[[COPY2:.+]] = linalg.copy +// CHECK-SAME: {lowering_config = #iree_gpu.derived_thread_config} +// CHECK-SAME: ins(%[[COPY1]] : tensor) +// CHECK: return %[[COPY2]] : tensor + +// ----- + +#lowering_config = #iree_gpu.lowering_config<{promote_operands = [2]}> +func.func @promote_padded_result(%a : tensor, %b : tensor, %mdim : index, %ndim : index, %pad : index, %slice : index) -> tensor { + %cst = arith.constant 0.000000e+00 : f32 + %empty = tensor.empty(%mdim, %ndim) : tensor + %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor) -> tensor + %padded_fill = tensor.pad %fill low[0, 0] high[%pad, %pad] { + ^bb0(%arg3: 
index, %arg4: index): + tensor.yield %cst : f32 + } : tensor to tensor + %mm = linalg.matmul {lowering_config = #lowering_config} + ins(%a, %b : tensor, tensor) outs(%padded_fill : tensor) -> tensor + %mm_slice = tensor.extract_slice %mm [0, 0] [%slice, %slice] [1, 1] : tensor to tensor + return %mm_slice : tensor +} + +// CHECK-LABEL: func @promote_padded_result( +// CHECK: %[[MATMUL:.+]] = linalg.matmul +// CHECK: %[[ALLOC:.+]] = bufferization.alloc_tensor +// CHECK: %[[COPY1:.+]] = linalg.copy +// CHECK-SAME: ins(%[[MATMUL]] : tensor) outs(%[[ALLOC]] : tensor) +// CHECK: %[[EXTRACT:.+]] = tensor.extract_slice %[[COPY1]] +// CHECK: %[[COPY2:.+]] = linalg.copy +// CHECK-SAME: {lowering_config = #iree_gpu.derived_thread_config} +// CHECK-SAME: ins(%[[EXTRACT]] : tensor) +// CHECK: return %[[COPY2]] : tensor diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir index 772e146c5d54..4e9758f83c78 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir @@ -1019,3 +1019,75 @@ hal.executable public @main { // CHECK: scf.for // CHECK-COUNT-4: arith.addf {{.*}} : vector<9xf32> // CHECK: vector.transfer_write {{.*}} vector<9xi8>, memref<32x16x9x9xi8, #hal.descriptor_type> + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding +]> +#config = #iree_gpu.lowering_config<{ + workgroup = [64, 64, 0], + reduction = [0, 0, 2], + subgroup = [2, 2], + mma_kind = #iree_gpu.mma_layout, + promote_operands = [0, 1, 2] +}> +hal.executable public @main { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @matmul_transpose_b_promote_result ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matmul_transpose_b_promote_result() + attributes {translation_info = #iree_codegen.translation_info} { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x1280xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<10240x1280xf16> + %5 = tensor.empty() : tensor<2048x10240xf32> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2048x10240xf32>) -> tensor<2048x10240xf32> + %7 = linalg.matmul_transpose_b {lowering_config = #config} + ins(%3, %4 : tensor<2048x1280xf16>, tensor<10240x1280xf16>) + outs(%6 : tensor<2048x10240xf32>) -> tensor<2048x10240xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 10240], strides = [1, 1] : tensor<2048x10240xf32> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CHECK-LABEL: func @matmul_transpose_b_promote_result +// CHECK-DAG: 
%[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0) +// CHECK-DAG: %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1) +// CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2) +// CHECK-DAG: memref.alloc() : memref<64x36xf16, #gpu.address_space> +// CHECK-DAG: memref.alloc() : memref<64x36xf16, #gpu.address_space> +// CHECK-DAG: memref.alloc() : memref<4x16x4x16xf32, #gpu.address_space> +// CHECK: scf.forall ({{.*}}) in (32, 160) { +// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x4x1xf32>) +// CHECK: gpu.barrier +// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16> +// CHECK-DAG: vector.transfer_write %[[LHS_RD]] +// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<8xf16> +// CHECK-DAG: vector.transfer_write %[[RHS_RD]] +// CHECK: gpu.barrier +// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space>, vector<2x1x2x4xf16> +// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space>, vector<2x1x2x4xf16> +// CHECK-DAG: vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x4xf16> +// CHECK-DAG: vector.transpose %{{.*}}, [0, 2, 1, 3] : vector<2x1x2x4xf16> +// CHECK-COUNT-4: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32 +// CHECK: scf.yield +// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 2, 1, 3] : vector<2x2x4x1xf32> to vector<2x4x2x1xf32> +// CHECK: vector.transfer_write %[[LOOP_T]] +// CHECK: scf.for {{.*}} { +// CHECK: %[[SHARED_READ:.+]] = vector.transfer_read {{.*}} #gpu.address_space>, vector<4xf32> +// CHECK: vector.transfer_write %[[SHARED_READ]], %[[B2]] +// CHECK: } +// CHECK: } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} From 031accb09edf4b3ee42cf9c263e404223982857e Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Tue, 26 Nov 2024 14:38:07 -0600 Subject: [PATCH 10/54] [GPU] Use affine.linearize_index (and delinearize_index) where possible (#19122) There have been issues with the composition of affine maps being too general and loosing important information, like the fact that affine_map<(s0 + s1 * 32 + ... - (s0 floorDiv 16) * 16)> realy should be affine_map<(s0 mod 16 + s1 * 32 + ...)>, and other issues with the ultimate IR that block low-level arithmetic optimizations. The affine.delinearize_index operation represents the div/mod chains needed to break a flat index into its component parts. A recently added affine.linearize_index operation is its inverse - combining multiple indices into a flat 1D value. Another advantage to linearize/delinearize is simpler upstream canonicalizations and lead to more streamlined generated code. This PR updates the vector distribution code and other GPU-related code that I could find to 1. Use affine.linearize_index to construct flat thread IDs 2. Use affine.delinearize_index in places where there was a floorDiv/mod chain. 3. 
Plumb the subgroup size through the transfer_read and transfer_write distribution patterns to enable better reasoning about when you do/don't need to take a mod of the lane ID --- .../Common/GPU/GPUDistributeForall.cpp | 41 ++-- .../GPU/GPUDistributeSharedMemoryCopy.cpp | 32 ++- .../GPU/test/gpu_distribute_forall.mlir | 63 +++--- .../test/gpu_distribute_shared_memory.mlir | 31 ++- ...ransform_gpu_distribute_shared_memory.mlir | 17 +- .../TransformExtensions/CommonExtensions.cpp | 9 +- .../CommonExtensionsOps.td | 3 +- .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp | 17 +- .../Dialect/GPU/Transforms/Transforms.cpp | 5 +- .../test/distribute_mma_to_lanes.mlir | 43 ++-- .../LLVMGPU/LLVMGPUVectorDistribute.cpp | 26 +-- .../TransformExtensions/LLVMGPUExtensions.cpp | 5 +- .../LLVMGPUExtensionsOps.td | 3 +- .../test/ROCDL/pipeline_tile_and_fuse.mlir | 8 +- .../LLVMGPU/test/transpose_pipeline_test.mlir | 186 ++++++++---------- 15 files changed, 224 insertions(+), 265 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp index 64623462a526..334427cfffb9 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp @@ -17,6 +17,7 @@ #include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h" #include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Utils/StaticValueUtils.h" namespace mlir::iree_compiler { @@ -87,9 +88,16 @@ LogicalResult resolveGPUMappedForallOp(RewriterBase &rewriter, assert(!(hasThreadMapping && hasWarpMapping)); Value flatId = linearThreadId; if (hasWarpMapping) { - OpFoldResult subgroupSizeVal = rewriter.getIndexAttr(subgroupSize); - flatId = affine::makeComposedAffineApply(rewriter, loc, d0.floorDiv(d1), - {flatId, subgroupSizeVal}); + if (flatWorkgroupSize % subgroupSize != 0) { + return forallOp->emitOpError( + "found warp mapped forall with non-multiple workgroup size"); + } + flatId = rewriter + .create( + loc, flatId, + ArrayRef{flatWorkgroupSize / subgroupSize, + subgroupSize}) + .getResult(0); } SmallVector delinSizes; @@ -190,23 +198,18 @@ void GPUDistributeForallPass::runOnOperation() { return signalPassFailure(); } - AffineExpr x, y, z; - bindSymbols(funcOp.getContext(), x, y, z); - // Compute the linearized thread id. 
- AffineExpr linearId = - x + workgroupSize[0] * y + workgroupSize[1] * workgroupSize[0] * z; - rewriter.setInsertionPointToStart(&funcOp.getFunctionBody().front()); - SmallVector threadGrid = { - rewriter.createOrFold(funcOp.getLoc(), - gpu::Dimension::x), - rewriter.createOrFold(funcOp.getLoc(), - gpu::Dimension::y), - rewriter.createOrFold(funcOp.getLoc(), - gpu::Dimension::z)}; - - Value linearThreadIdVal = affine::makeComposedAffineApply( - rewriter, funcOp.getLoc(), linearId, threadGrid); + SmallVector threadGrid = {rewriter.createOrFold( + funcOp.getLoc(), gpu::Dimension::z), + rewriter.createOrFold( + funcOp.getLoc(), gpu::Dimension::y), + rewriter.createOrFold( + funcOp.getLoc(), gpu::Dimension::x)}; + SmallVector threadGridBasis = {workgroupSize[2], workgroupSize[1], + workgroupSize[0]}; + + Value linearThreadIdVal = rewriter.create( + funcOp.getLoc(), threadGrid, threadGridBasis, /*disjoint=*/true); for (auto forall : forallOps) { rewriter.setInsertionPoint(forall); if (failed(resolveGPUMappedForallOp(rewriter, forall, linearThreadIdVal, diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeSharedMemoryCopy.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeSharedMemoryCopy.cpp index 4610c545e553..47329c84f189 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeSharedMemoryCopy.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeSharedMemoryCopy.cpp @@ -189,10 +189,8 @@ SmallVector getIds(OpBuilder &b, Location loc, ArrayRef parallelLoopRanges, Value flatThreadId) { SmallVector infos; - Value id = flatThreadId; - AffineExpr d0 = b.getAffineDimExpr(0); - for (Range r : llvm::reverse(parallelLoopRanges)) { - linalg::ProcInfo info; + SmallVector delinSizes; + for (Range r : parallelLoopRanges) { auto offset = dyn_cast(r.offset); auto stride = dyn_cast(r.stride); auto size = dyn_cast(r.size); @@ -200,19 +198,20 @@ SmallVector getIds(OpBuilder &b, Location loc, int64_t numThreadsDim = (llvm::cast(size).getInt() - llvm::cast(offset).getInt()) / llvm::cast(stride).getInt(); - Value dimId = id; - if (infos.size() != parallelLoopRanges.size() - 1) - dimId = - affine::makeComposedAffineApply(b, loc, d0 % numThreadsDim, {dimId}); + delinSizes.push_back(numThreadsDim); + } + ValueRange dims = + b.create(loc, flatThreadId, delinSizes) + .getResults(); + + for (auto [dimId, numThreadsDim] : llvm::zip_equal(dims, delinSizes)) { + linalg::ProcInfo info; info.procId = dimId; info.nprocs = b.create(loc, numThreadsDim); info.distributionMethod = linalg::DistributionMethod::CyclicNumProcsEqNumIters; infos.push_back(info); - id = affine::makeComposedAffineApply(b, loc, d0.floorDiv(numThreadsDim), - {id}); } - std::reverse(infos.begin(), infos.end()); return infos; } @@ -288,19 +287,16 @@ static Value createFlatId(mlir::FunctionOpInterface funcOp, ArrayRef workgroupSize) { OpBuilder b(funcOp.getFunctionBody()); Type indexType = b.getIndexType(); - AffineExpr d0 = getAffineDimExpr(0, b.getContext()); - AffineExpr d1 = getAffineDimExpr(1, b.getContext()); - AffineExpr d2 = getAffineDimExpr(2, b.getContext()); Value threadX = b.create(funcOp.getLoc(), indexType, gpu::Dimension::x); Value threadY = b.create(funcOp.getLoc(), indexType, gpu::Dimension::y); Value threadZ = b.create(funcOp.getLoc(), indexType, gpu::Dimension::z); - Value flatThreadId = affine::makeComposedAffineApply( - b, funcOp.getLoc(), - d0 + workgroupSize[0] * d1 + (workgroupSize[0] * workgroupSize[1]) * d2, - {threadX, threadY, threadZ}); + Value flatThreadId = 
b.create( + funcOp.getLoc(), ValueRange{threadZ, threadY, threadX}, + ArrayRef{workgroupSize[2], workgroupSize[1], workgroupSize[0]}, + /*disjoint=*/true); return flatThreadId; } diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir index 214337437b76..32bda8c90f05 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_forall.mlir @@ -15,11 +15,9 @@ func.func @distribute_thread_forall(%out : memref) // CHECK-LABEL: func @distribute_thread_forall // CHECK-DAG: %[[TX:.+]] = gpu.thread_id x // CHECK-DAG: %[[TY:.+]] = gpu.thread_id y -// CHECK-DAG: %[[TZ:.+]] = gpu.thread_id z +// CHECK: %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64) // CHECK: scf.for %[[I:.+]] = %c0 to %c1024 step %c128 { -// CHECK: %[[LINID:.+]] = affine.apply -// CHECK-SAME: affine_map<(d0)[s0, s1, s2] -> (d0 + s0 + s1 * 64 + s2 * 128)>(%[[I]]) -// CHECK-SAME: [%[[TX]], %[[TY]], %[[TZ]]] +// CHECK: %[[LINID:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%[[I]])[%[[TFLAT]]] // CHECK: memref.store {{.*}}[%[[LINID]]] // ----- @@ -38,11 +36,10 @@ func.func @distribute_warp_forall(%out : memref) // CHECK-LABEL: func @distribute_warp_forall // CHECK-DAG: %[[TX:.+]] = gpu.thread_id x // CHECK-DAG: %[[TY:.+]] = gpu.thread_id y -// CHECK-DAG: %[[TZ:.+]] = gpu.thread_id z +// CHECK: %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64) +// CHECK: %[[WARPSPLIT:.+]]:2 = affine.delinearize_index %[[TFLAT]] into (4, 32) // CHECK: scf.for %[[I:.+]] = %c0 to %c32 step %c4 { -// CHECK: %[[LINID:.+]] = affine.apply -// CHECK-SAME: affine_map<(d0)[s0, s1, s2] -> (d0 + s1 * 2 + s2 * 4 + s0 floordiv 32)>(%[[I]]) -// CHECK-SAME: [%[[TX]], %[[TY]], %[[TZ]]] +// CHECK: %[[LINID:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%[[I]])[%[[WARPSPLIT]]#0] // CHECK: memref.store {{.*}}[%[[LINID]]] // ----- @@ -78,11 +75,7 @@ func.func @distribute_thread_forall_drop_for_loop(%out : memref) // CHECK-LABEL: func @distribute_thread_forall_drop_for_loop // CHECK-DAG: %[[TX:.+]] = gpu.thread_id x // CHECK-DAG: %[[TY:.+]] = gpu.thread_id y -// CHECK-DAG: %[[TZ:.+]] = gpu.thread_id z -// CHECK-NOT: scf.for -// CHECK: %[[LINID:.+]] = affine.apply -// CHECK-SAME: affine_map<()[s0, s1, s2] -> (s0 + s1 * 64 + s2 * 128)> -// CHECK-SAME: [%[[TX]], %[[TY]], %[[TZ]]] +// CHECK: %[[LINID:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64) // CHECK: memref.store {{.*}}[%[[LINID]]] // ----- @@ -99,13 +92,32 @@ func.func @distribute_thread_forall_single_thread(%out : memref) } // CHECK-LABEL: func @distribute_thread_forall_single_thread +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[TX:.+]] = gpu.thread_id x // CHECK-DAG: %[[TY:.+]] = gpu.thread_id y -// CHECK-DAG: %[[TZ:.+]] = gpu.thread_id z -// CHECK: %[[LINID:.+]] = affine.apply -// CHECK-SAME: affine_map<()[s0, s1, s2] -> (s0 + s1 * 64 + s2 * 128)> -// CHECK-SAME: [%[[TX]], %[[TY]], %[[TZ]]] -// CHECK: scf.for %[[I:.+]] = %[[LINID]] to %c1 step %c128 { +// CHECK: %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64) +// CHECK: scf.for %[[I:.+]] = %[[TFLAT]] to %c1 step %c128 { +// CHECK: memref.store {{.*}}[%[[I]]] + +// ----- + +#translation_info = #iree_codegen.translation_info + +func.func @distribute_thread_forall_overhang(%out : memref) + attributes 
{translation_info = #translation_info} { + %c0 = arith.constant 0 : i32 + scf.forall (%arg0) in (513) { + memref.store %c0, %out[%arg0] : memref + } {mapping = [#gpu.thread]} + return +} + +// CHECK-LABEL: func @distribute_thread_forall_overhang +// CHECK-DAG: %[[C513:.+]] = arith.constant 513 : index +// CHECK-DAG: %[[TX:.+]] = gpu.thread_id x +// CHECK-DAG: %[[TY:.+]] = gpu.thread_id y +// CHECK: %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64) +// CHECK: scf.for %[[I:.+]] = %[[TFLAT]] to %[[C513]] step %c128 { // CHECK: memref.store {{.*}}[%[[I]]] // ----- @@ -124,11 +136,9 @@ func.func @distribute_thread_forall_multi_dim(%out : memref) // CHECK-LABEL: func @distribute_thread_forall_multi_dim // CHECK-DAG: %[[TX:.+]] = gpu.thread_id x // CHECK-DAG: %[[TY:.+]] = gpu.thread_id y -// CHECK-DAG: %[[TZ:.+]] = gpu.thread_id z +// CHECK: %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64) // CHECK: scf.for %[[I:.+]] = %c0 to %c512 step %c128 { -// CHECK: %[[LINID:.+]] = affine.apply -// CHECK-SAME: affine_map<(d0)[s0, s1, s2] -> (d0 + s0 + s1 * 64 + s2 * 128)>(%[[I]]) -// CHECK-SAME: [%[[TX]], %[[TY]], %[[TZ]]] +// CHECK: %[[LINID:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%[[I]])[%[[TFLAT]]] // CHECK: %[[DELIN:.+]]:3 = affine.delinearize_index %[[LINID]] into (16, 8, 4) : index // CHECK: memref.store {{.*}}[%[[DELIN]]#0, %[[DELIN]]#1, %[[DELIN]]#2] @@ -147,10 +157,5 @@ func.func @distribute_thread_forall_small_workgroup(%out : memref) } // CHECK-LABEL: func @distribute_thread_forall_small_workgroup -// CHECK-DAG: %[[TX:.+]] = gpu.thread_id x -// CHECK-DAG: %[[TY:.+]] = gpu.thread_id y -// CHECK-DAG: %[[TZ:.+]] = gpu.thread_id z -// CHECK: %[[LINID:.+]] = affine.apply -// CHECK-SAME: affine_map<()[s0, s1, s2] -> (s0 + s1 * 7 + s2 * 7)> -// CHECK-SAME: [%[[TX]], %[[TY]], %[[TZ]]] -// CHECK: memref.store {{.*}}[%[[LINID]]] +// CHECK: %[[TX:.+]] = gpu.thread_id x +// CHECK: memref.store {{.*}}[%[[TX]]] diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_shared_memory.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_shared_memory.mlir index 636add66dd0d..8f526bd4dd91 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_shared_memory.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_shared_memory.mlir @@ -49,12 +49,9 @@ module { } } -// CHECK-DAG: #[[$MAP0:.*]] = affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 32 + s0 floordiv 4)> -// CHECK-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)> -// CHECK-DAG: #[[$MAP2:.*]] = affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 32 + s0 floordiv 4 + 32)> -// CHECK-DAG: #[[$MAP3:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 32 + s2 * 128)> -// CHECK-DAG: #[[$MAP4:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 32 + s2 * 128 + 128)> -// CHECK-DAG: #[[$MAP5:.*]] = affine_map<()[s0, s1, s2] -> (s0 * 4 + s1 * 128 + s2 * 512)> +// CHECK-DAG: #[[$MAP0:.*]] = affine_map<()[s0] -> (s0 * 4)> +// CHECK-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 32)> +// CHECK-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 128)> // CHECK-LABEL: @shared_mem_cpy( // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index @@ -62,24 +59,22 @@ module { // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[TX:.*]] = gpu.thread_id x // CHECK-DAG: %[[TY:.*]] = gpu.thread_id y -// CHECK-DAG: %[[TZ:.*]] = gpu.thread_id z - -// CHECK-DAG: %[[Y0:.*]] = affine.apply #[[$MAP0]]()[%[[TX]], %[[TY]], 
%[[TZ]]] -// CHECK-DAG: %[[X0:.*]] = affine.apply #[[$MAP1]]()[%[[TX]]] -// CHECK: %[[R0:.*]] = vector.transfer_read %{{.*}}[%[[Y0]], %[[X0]]], %{{.*}} {in_bounds = [true, true]} : memref<64x16xf32>, vector<1x4xf32> -// CHECK: vector.transfer_write %[[R0]], %{{.*}}[%[[Y0]], %[[X0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x16xf32, 3> -// CHECK-DAG: %[[Y1:.*]] = affine.apply #[[$MAP2]]()[%[[TX]], %[[TY]], %[[TZ]]] +// CHECK: %[[TFLAT:.*]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (4, 32) +// CHECK: %[[YX:.*]]:2 = affine.delinearize_index %[[TFLAT]] into (32, 4) +// CHECK: %[[X0:.*]] = affine.apply #[[$MAP0]]()[%[[YX]]#1] +// CHECK: %[[R0:.*]] = vector.transfer_read %{{.*}}[%[[YX]]#0, %[[X0]]], %{{.*}} {in_bounds = [true, true]} : memref<64x16xf32>, vector<1x4xf32> +// CHECK: vector.transfer_write %[[R0]], %{{.*}}[%[[YX]]#0, %[[X0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x16xf32, 3> +// CHECK-DAG: %[[Y1:.*]] = affine.apply #[[$MAP1]]()[%[[YX]]#0] // CHECK: %[[R1:.*]] = vector.transfer_read %{{.*}}[%[[Y1]], %[[X0]]], %{{.*}} {in_bounds = [true, true]} : memref<64x16xf32>, vector<1x4xf32> // CHECK: vector.transfer_write %[[R1]], %{{.*}}[%[[Y1]], %[[X0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x16xf32, 3> -// CHECK: %[[Y1:.*]] = affine.apply #[[$MAP3]]()[%[[TX]], %[[TY]], %[[TZ]]] -// CHECK: %[[R2:.*]] = vector.transfer_read %{{.*}}[%[[Y1]], %[[C0]]], %{{.*}} {in_bounds = [true, true]} : memref<256x4xf32>, vector<1x4xf32> -// CHECK: vector.transfer_write %[[R2]], %{{.*}}[%[[Y1]], %[[C0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<256x4xf32, 3> -// CHECK: %[[Y2:.*]] = affine.apply #[[$MAP4]]()[%[[TX]], %[[TY]], %[[TZ]]] +// CHECK: %[[R2:.*]] = vector.transfer_read %{{.*}}[%[[TFLAT]], %[[C0]]], %{{.*}} {in_bounds = [true, true]} : memref<256x4xf32>, vector<1x4xf32> +// CHECK: vector.transfer_write %[[R2]], %{{.*}}[%[[TFLAT]], %[[C0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<256x4xf32, 3> +// CHECK: %[[Y2:.*]] = affine.apply #[[$MAP2]]()[%[[TFLAT]]] // CHECK: %[[R3:.*]] = vector.transfer_read %{{.*}}[%[[Y2]], %[[C0]]], %{{.*}} {in_bounds = [true, true]} : memref<256x4xf32>, vector<1x4xf32> // CHECK: vector.transfer_write %[[R3]], %{{.*}}[%[[Y2]], %[[C0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<256x4xf32, 3> -// CHECK: %[[X1:.*]] = affine.apply #[[$MAP5]]()[%[[TX]], %[[TY]], %[[TZ]]] +// CHECK: %[[X1:.*]] = affine.apply #[[$MAP0]]()[%[[TFLAT]]] // CHECK: %[[R4:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[X1]]], %{{.*}} {in_bounds = [true, true]} : memref<3x512xf32>, vector<1x4xf32> // CHECK: vector.transfer_write %[[R4]], %{{.*}}[%[[C0]], %[[X1]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<3x512xf32, 3> // CHECK: %[[R5:.*]] = vector.transfer_read %{{.*}}[%[[C1]], %[[X1]]], %{{.*}} {in_bounds = [true, true]} : memref<3x512xf32>, vector<1x4xf32> diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_distribute_shared_memory.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_distribute_shared_memory.mlir index ec765a1d5aa6..907070a35c5c 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_distribute_shared_memory.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_distribute_shared_memory.mlir @@ -46,20 +46,19 @@ module attributes {transform.with_named_sequence} { transform.yield } } -// CHECK-DAG: #[[$MAP0:.*]] = affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 32 + s0 floordiv 4)> -// CHECK-DAG: 
#[[$MAP1:.*]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)> -// CHECK-DAG: #[[$MAP2:.*]] = affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 32 + s0 floordiv 4 + 32)> +// CHECK-DAG: #[[$MAP0:.*]] = affine_map<()[s0] -> (s0 * 4)> +// CHECK-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 32)> // CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0, d1) -> (d0, d1)> // CHECK-LABEL: @shared_mem_cpy( // CHECK-DAG: %[[TX:.*]] = gpu.thread_id x // CHECK-DAG: %[[TY:.*]] = gpu.thread_id y -// CHECK-DAG: %[[TZ:.*]] = gpu.thread_id z -// CHECK-DAG: %[[Y0:.*]] = affine.apply #[[$MAP0]]()[%[[TX]], %[[TY]], %[[TZ]]] -// CHECK-DAG: %[[X0:.*]] = affine.apply #[[$MAP1]]()[%[[TX]]] -// CHECK: %[[R0:.*]] = vector.transfer_read %{{.*}}[%[[Y0]], %[[X0]]], %{{.*}} {in_bounds = [true, true]} : memref<64x16xf32, #hal.descriptor_type>, vector<1x4xf32> -// CHECK: vector.transfer_write %[[R0]], %{{.*}}[%[[Y0]], %[[X0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x16xf32, #gpu.address_space> -// CHECK-DAG: %[[Y1:.*]] = affine.apply #[[$MAP2]]()[%[[TX]], %[[TY]], %[[TZ]]] +// CHECK-DAG: %[[TFLAT:.*]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (4, 32) +// CHECK-DAG: %[[YX:.*]]:2 = affine.delinearize_index %[[TFLAT]] into (32, 4) +// CHECK-DAG: %[[X0:.*]] = affine.apply #[[$MAP0]]()[%[[YX]]#1] +// CHECK: %[[R0:.*]] = vector.transfer_read %{{.*}}[%[[YX]]#0, %[[X0]]], %{{.*}} {in_bounds = [true, true]} : memref<64x16xf32, #hal.descriptor_type>, vector<1x4xf32> +// CHECK: vector.transfer_write %[[R0]], %{{.*}}[%[[YX]]#0, %[[X0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x16xf32, #gpu.address_space> +// CHECK-DAG: %[[Y1:.*]] = affine.apply #[[$MAP1]]()[%[[YX]]#0] // CHECK: %[[R1:.*]] = vector.transfer_read %{{.*}}[%[[Y1]], %[[X0]]], %{{.*}} {in_bounds = [true, true]} : memref<64x16xf32, #hal.descriptor_type>, vector<1x4xf32> // CHECK: vector.transfer_write %[[R1]], %{{.*}}[%[[Y1]], %[[X0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x16xf32, #gpu.address_space> // CHECK: linalg.generic diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp index cc2649823f4e..bb841bf10e72 100644 --- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp @@ -1113,16 +1113,15 @@ transform_dialect::TestGpuVectorDistribution::applyToOne( rewriter.setInsertionPointToStart(&target.getFunctionBody().front()); // This is a test op so we unsafely use thread_id x as the lane ID. In // general this should linearize the thread IDs based on the workgroup size - // and divide by the subgroup size. i.e. + // and take the modulo by the subgroup size. i.e. // - // lane_id = (tid_x + tid_y * dim_x + tid_z * dim_y * dim_x) / subgroup_size; + // lane_id = (tid_x + tid_y * dim_x + tid_z * dim_y * dim_x) % subgroup_size; Value laneId = rewriter.create(target.getLoc(), gpu::Dimension::x); + int64_t subgroupSize = getSubgroupSize(); populateGPUDistributionPatterns(patterns); - // For testing we use subgroup size = 64. 
- populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId, - /*subgroupSize=*/64); + populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId, subgroupSize); populateGPUDistributeNestedLayoutContractAMDGPUPatterns(patterns); if (failed(distributeVectorOps(target, patterns, options))) { return emitDefaultDefiniteFailure(target); diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td index 5219b4a2da9c..0c05178043c8 100644 --- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td +++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td @@ -631,7 +631,8 @@ def TestGpuVectorDistribution : }]; let arguments = (ins TransformHandleTypeInterface:$target, - DefaultValuedOptionalAttr:$experimental); + DefaultValuedOptionalAttr:$experimental, + DefaultValuedOptionalAttr:$subgroup_size); let results = (outs); let assemblyFormat = [{ $target attr-dict `:` type($target)}]; diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp index eaa3f7249c05..803040d0451a 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp @@ -649,21 +649,16 @@ LogicalResult DataTiledMMAAttr::populateOperandOffsetsSizesStrides( getSubgroupSize() / intrinsicLayoutThreadBound); } - // AffineDelinearizeIndexOp requires an in-bounds input index, so we bound it. - OpFoldResult threadIdBound = - builder.getIndexAttr(ShapedType::getNumElements(distributionThreadSizes)); - AffineExpr d0 = builder.getAffineDimExpr(0), d1 = builder.getAffineDimExpr(1); - OpFoldResult boundedThreadId = affine::makeComposedFoldedAffineApply( - builder, loc, {d0 % d1}, {threadId, threadIdBound}); - // Obtain the offsets from delinearization along the distributionThreadSizes. + // Use a delinearize without outer bound and throw away its initial result + // to get clamping behavior. SmallVector tileOffsets = builder .create( - loc, - getValueOrCreateConstantIndexOp(builder, loc, boundedThreadId), - getAsIndexOpFoldResult(ctx, distributionThreadSizes)) - ->getResults(); + loc, getValueOrCreateConstantIndexOp(builder, loc, threadId), + distributionThreadSizes, /*hasOuterBound=*/false) + ->getResults() + .drop_front(); if (hasDistributionOnlyDim) { // Erase the delinearized index that corresponds to the extra distribution diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp index ff4f17648aa8..75bf5e51d54c 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp @@ -209,11 +209,10 @@ LogicalResult fuseForallIntoConsumer(RewriterBase &rewriter, // Compute the total producer loop worker count (P0 * ... * Pn). 
Value linearConsumerIdVal = getValueOrCreateConstantIndexOp(rewriter, loc, linearId); - SmallVector producerRanges; + SmallVector producerRanges; OpFoldResult producerWorkerCount = rewriter.getIndexAttr(1); for (auto workerCount : producer.getMixedUpperBound()) { - producerRanges.push_back( - getValueOrCreateConstantIndexOp(rewriter, loc, workerCount)); + producerRanges.push_back(workerCount); producerWorkerCount = affine::makeComposedFoldedAffineApply( rewriter, loc, d0 * d1, {producerWorkerCount, workerCount}); } diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir index 31c5074972c6..07729a11e2b5 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir @@ -387,24 +387,21 @@ func.func @data_tiled_1x1x1_tensor_multi_mma(%lhs: tensor<1x1x4x16xf32>, %rhs: t return %0 : tensor<1x1x4x16x4xf32> } -// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0) -> (d0 mod 64)> - // CHECK-LABEL: func @data_tiled_1x1x1_tensor_multi_mma // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]] // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]] // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]] // CHECK: scf.forall (%[[THREAD_ID:.+]]) in (64) shared_outs(%[[ACC_ARG:.+]] = %[[ACC]]) -> (tensor<1x1x4x16x4xf32>) -// CHECK: %[[ID_CLAMPED:.+]] = affine.apply #[[$MAP]](%[[THREAD_ID]]) -// CHECK-DAG: %[[IN_IDS:.+]]:2 = affine.delinearize_index %[[ID_CLAMPED]] into (4, 16) -// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1] [1, 1, 1, 1] [1, 1, 1, 1] -// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1] [1, 1, 1, 1] [1, 1, 1, 1] +// CHECK-DAG: %[[IN_IDS:.+]]:3 = affine.delinearize_index %[[THREAD_ID]] into (4, 16) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[IN_IDS]]#1, %[[IN_IDS]]#2] [1, 1, 1, 1] [1, 1, 1, 1] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[IN_IDS]]#1, %[[IN_IDS]]#2] [1, 1, 1, 1] [1, 1, 1, 1] // CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC_ARG]] -// CHECK-SAME: [0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, 0] [1, 1, 1, 1, 4] [1, 1, 1, 1, 1] +// CHECK-SAME: [0, 0, %[[IN_IDS]]#1, %[[IN_IDS]]#2, 0] [1, 1, 1, 1, 4] [1, 1, 1, 1, 1] // CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] // CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout // CHECK-SAME: : tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32> into tensor<1x1x1x1x4xf32> // CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC_ARG]] -// CHECK-SAME: [0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, 0] [1, 1, 1, 1, 4] [1, 1, 1, 1, 1] +// CHECK-SAME: [0, 0, %[[IN_IDS]]#1, %[[IN_IDS]]#2, 0] [1, 1, 1, 1, 4] [1, 1, 1, 1, 1] // CHECK: mapping = [#gpu.thread] // ----- @@ -424,26 +421,23 @@ func.func @data_tiled_2x2x4_tensor_multi_mma_unrolled(%lhs: tensor<1x1x2x4x16x4x return %0 : tensor<1x1x2x2x4x16x4xf32> } -// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0) -> (d0 mod 64)> - // CHECK-LABEL: func @data_tiled_2x2x4_tensor_multi_mma_unrolled // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]] // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]] // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]] // CHECK: scf.forall (%[[THREAD_ID:.+]]) in (64) shared_outs(%[[ACC_ARG:.+]] = %[[ACC]]) -> (tensor<1x1x2x2x4x16x4xf32>) -// CHECK: %[[ID_CLAMPED:.+]] = affine.apply #[[$MAP]](%[[THREAD_ID]]) -// CHECK-DAG: 
%[[IN_IDS:.+]]:2 = affine.delinearize_index %[[ID_CLAMPED]] into (4, 16) +// CHECK-DAG: %[[IN_IDS:.+]]:3 = affine.delinearize_index %[[THREAD_ID]] into (4, 16) // CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]] -// CHECK-SAME: [0, 0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, 0] [1, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1] +// CHECK-SAME: [0, 0, 0, %[[IN_IDS]]#1, %[[IN_IDS]]#2, 0] [1, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1] // CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]] -// CHECK-SAME: [0, 0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, 0] [1, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1] +// CHECK-SAME: [0, 0, 0, %[[IN_IDS]]#1, %[[IN_IDS]]#2, 0] [1, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1] // CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC_ARG]] -// CHECK-SAME: [0, 0, 0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, 0] [1, 1, 2, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1] +// CHECK-SAME: [0, 0, 0, 0, %[[IN_IDS]]#1, %[[IN_IDS]]#2, 0] [1, 1, 2, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1] // CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] // CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout // CHECK-SAME: : tensor<1x1x2x1x1x4xf32>, tensor<1x1x2x1x1x4xf32> into tensor<1x1x2x2x1x1x4xf32> // CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC_ARG]] -// CHECK-SAME: [0, 0, 0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, 0] [1, 1, 2, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1] +// CHECK-SAME: [0, 0, 0, 0, %[[IN_IDS]]#1, %[[IN_IDS]]#2, 0] [1, 1, 2, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1] // CHECK: mapping = [#gpu.thread] // ----- @@ -463,27 +457,22 @@ func.func @data_tiled_2x2x4_tensor_multi_mma_unrolled_to_subgroups(%lhs: tensor< return %0 : tensor<1x1x2x2x4x16x4xf32> } -// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0) -> (d0 mod 128)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 mod 256)> - // CHECK-LABEL: func @data_tiled_2x2x4_tensor_multi_mma_unrolled_to_subgroups // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]] // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]] // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]] // CHECK: scf.forall (%[[THREAD_ID:.+]]) in (256) shared_outs(%[[ACC_ARG:.+]] = %[[ACC]]) -> (tensor<1x1x2x2x4x16x4xf32>) -// CHECK: %[[ID_CLAMPED_128:.+]] = affine.apply #[[$MAP]](%[[THREAD_ID]]) -// CHECK-DAG: %[[IN_IDS:.+]]:3 = affine.delinearize_index %[[ID_CLAMPED_128]] into (2, 4, 16) +// CHECK-DAG: %[[IN_IDS:.+]]:4 = affine.delinearize_index %[[THREAD_ID]] into (2, 4, 16) // CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]] -// CHECK-SAME: [0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, %[[IN_IDS]]#2, 0] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] +// CHECK-SAME: [0, 0, %[[IN_IDS]]#1, %[[IN_IDS]]#2, %[[IN_IDS]]#3, 0] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] // CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]] -// CHECK-SAME: [0, 0, %[[IN_IDS]]#0, %[[IN_IDS]]#1, %[[IN_IDS]]#2, 0] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] -// CHECK: %[[ID_CLAMPED_256:.+]] = affine.apply #[[$MAP1]](%[[THREAD_ID]]) -// CHECK-DAG: %[[ACC_IDS:.+]]:4 = affine.delinearize_index %[[ID_CLAMPED_256]] into (2, 2, 4, 16) +// CHECK-SAME: [0, 0, %[[IN_IDS]]#1, %[[IN_IDS]]#2, %[[IN_IDS]]#3, 0] [1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1] +// CHECK-DAG: %[[ACC_IDS:.+]]:5 = affine.delinearize_index %[[THREAD_ID]] into (2, 2, 4, 16) // CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC_ARG]] -// CHECK-SAME: [0, 0, %[[ACC_IDS]]#0, %[[ACC_IDS]]#1, %[[ACC_IDS]]#2, %[[ACC_IDS]]#3, 0] [1, 1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1] +// CHECK-SAME: [0, 0, %[[ACC_IDS]]#1, %[[ACC_IDS]]#2, %[[ACC_IDS]]#3, %[[ACC_IDS]]#4, 0] [1, 1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1] // CHECK: %[[MMA:.+]] = 
iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] // CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout} // CHECK-SAME: : tensor<1x1x1x1x1x4xf32>, tensor<1x1x1x1x1x4xf32> into tensor<1x1x1x1x1x1x4xf32> // CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC_ARG]] -// CHECK-SAME: [0, 0, %[[ACC_IDS]]#0, %[[ACC_IDS]]#1, %[[ACC_IDS]]#2, %[[ACC_IDS]]#3, 0] [1, 1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1] +// CHECK-SAME: [0, 0, %[[ACC_IDS]]#1, %[[ACC_IDS]]#2, %[[ACC_IDS]]#3, %[[ACC_IDS]]#4, 0] [1, 1, 1, 1, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1] // CHECK: mapping = [#gpu.thread] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp index 1640656b71a8..d5e0af1bd119 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp @@ -80,24 +80,18 @@ struct LLVMGPUVectorDistributePass final } } - AffineExpr x, y, z; - bindSymbols(func.getContext(), x, y, z); - // Construct the expression for linearizing the thread indices. - AffineExpr linearId = - x + workgroupSize[0] * y + workgroupSize[1] * workgroupSize[0] * z; - IRRewriter rewriter(func); rewriter.setInsertionPointToStart(&func.getFunctionBody().front()); - SmallVector threadGrid = { - rewriter.createOrFold(func.getLoc(), - gpu::Dimension::x), - rewriter.createOrFold(func.getLoc(), - gpu::Dimension::y), - rewriter.createOrFold(func.getLoc(), - gpu::Dimension::z)}; - - Value linearThreadIdVal = affine::makeComposedAffineApply( - rewriter, func.getLoc(), linearId, threadGrid); + SmallVector threadGrid = {rewriter.createOrFold( + func.getLoc(), gpu::Dimension::z), + rewriter.createOrFold( + func.getLoc(), gpu::Dimension::y), + rewriter.createOrFold( + func.getLoc(), gpu::Dimension::x)}; + std::reverse(workgroupSize.begin(), workgroupSize.end()); + + Value linearThreadIdVal = rewriter.create( + func.getLoc(), threadGrid, workgroupSize, /*disjoint=*/true); std::optional subgroupSize = getSubgroupSize(func); if (!subgroupSize) { diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp index c52ae4bcc157..5c4c3ff471dd 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp @@ -1476,11 +1476,10 @@ transform_dialect::AMDGPUDistributeVectorsOp::applyToOne( rewriter.setInsertionPointToStart(&target.getFunctionBody().front()); Value laneId = rewriter.create(target.getLoc(), gpu::Dimension::x); + int64_t subgroupSize = getSubgroupSize(); populateGPUDistributionPatterns(patterns); - // For testing we use subgroup size = 64. 
- populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId, - /*subgroupSize=*/64); + populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId, subgroupSize); if (failed(distributeVectorOps(target, patterns, options))) { return emitDefaultSilenceableFailure(target); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td index 69e766537c0b..28bd4eebdbbd 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td @@ -699,7 +699,8 @@ def AMDGPUDistributeVectorsOp : }]; let arguments = (ins TransformHandleTypeInterface:$target, - UnitAttr:$test_conversion); + UnitAttr:$test_conversion, + DefaultValuedOptionalAttr:$subgroup_size); let results = (outs TransformHandleTypeInterface:$result); let assemblyFormat = [{ diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir index 4e9758f83c78..3f5b280b6342 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir @@ -544,17 +544,13 @@ hal.executable public @main { } } -// CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 8 + s2 * 32)> -// CHECK: #[[$MAP1:.+]] = affine_map<()[s0, s1] -> (s0 * 8 + s1)> +// CHECK: #[[$MAP0:.+]] = affine_map<()[s0, s1] -> (s0 * 8 + s1)> // CHECK-LABEL: func @skinny_matmul_config // CHECK-DAG: %[[IDX:.+]] = gpu.thread_id x // CHECK-DAG: %[[IDY:.+]] = gpu.thread_id y -// CHECK-DAG: %[[IDZ:.+]] = gpu.thread_id z -// CHECK: %[[LINID0:.+]] = affine.apply #[[$MAP]]()[%[[IDX]], %[[IDY]], %[[IDZ]]] -// CHECK: %[[IDS:.+]]:2 = affine.delinearize_index %[[LINID0:.+]] into (4, 8) : index, index -// CHECK: %[[LINID1:.+]] = affine.apply #[[$MAP1]]()[%[[IDS]]#0, %[[IDS]]#1] +// CHECK: %[[LINID1:.+]] = affine.apply #[[$MAP0]]()[%[[IDY]], %[[IDX]]] // CHECK: scf.forall ({{.*}}) in (32, 98) { // CHECK: scf.for %{{.*}} = %c0 to %c256 step %c4 {{.*}} -> (vector<1x4xf32>) // CHECK: scf.for %{{.*}} = %[[LINID1]] to %c4 step %c32 diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir index 23c3977c8389..fd373f7ebadc 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir @@ -34,28 +34,25 @@ hal.executable @transpose_dispatch_0 { // CHECK-LABEL: hal.executable public @transpose_dispatch_0 // CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[D0:.*]] = gpu.thread_id x -// CHECK-DAG: %[[D1:.*]] = gpu.thread_id y -// CHECK-DAG: %[[D2:.*]] = gpu.thread_id z -// CHECK-DAG: %[[D3:.*]] = memref.alloc() : memref<32x33xf32, #gpu.address_space> -// CHECK: %[[D4:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(0) alignment(64) offset(%[[C0]]) : memref<4096x4096xf32, #hal.descriptor_type> -// CHECK: memref.assume_alignment %[[D4]], 64 : memref<4096x4096xf32, #hal.descriptor_type> -// CHECK: %[[D5:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(1) alignment(64) offset(%[[C0]]) : memref<4096x4096xf32, 
#hal.descriptor_type> -// CHECK: memref.assume_alignment %[[D5]], 64 : memref<4096x4096xf32, #hal.descriptor_type> +// CHECK-DAG: %[[TX:.*]] = gpu.thread_id x +// CHECK-DAG: %[[TY:.*]] = gpu.thread_id y +// CHECK-DAG: %[[ALLOC:.*]] = memref.alloc() : memref<32x33xf32, #gpu.address_space> +// CHECK: %[[D0:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(0) alignment(64) offset(%[[C0]]) : memref<4096x4096xf32, #hal.descriptor_type> +// CHECK: memref.assume_alignment %[[D0]], 64 : memref<4096x4096xf32, #hal.descriptor_type> +// CHECK: %[[D1:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(1) alignment(64) offset(%[[C0]]) : memref<4096x4096xf32, #hal.descriptor_type> +// CHECK: memref.assume_alignment %[[D1]], 64 : memref<4096x4096xf32, #hal.descriptor_type> // CHECK: gpu.barrier -// CHECK: %[[D6:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]], %[[D1]], %[[D2]]] -// CHECK: %[[D7:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]] -// CHECK: %[[D8:.*]] = vector.transfer_read %[[D4]][%[[D6]], %[[D7]]], %[[CST]] {in_bounds = [true, true]} : memref<4096x4096xf32, #hal.descriptor_type>, vector<1x4xf32> -// CHECK: %[[D9:.*]] = affine.apply #{{.*}}()[%[[D0]], %[[D1]], %[[D2]]] -// CHECK: %[[D10:.*]] = affine.apply #{{.*}}()[%[[D0]]] -// CHECK: vector.transfer_write %[[D8]], %[[D3]][%[[D9]], %[[D10]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<32x33xf32, #gpu.address_space> +// CHECK: %[[D2:.*]] = affine.apply #{{.*}}()[%{{.+}}, %[[TY]]] +// CHECK: %[[D3:.*]] = affine.apply #{{.*}}()[%{{.+}}, %[[TX]]] +// CHECK: %[[D4:.*]] = vector.transfer_read %[[D0]][%[[D2]], %[[D3]]], %[[CST]] {in_bounds = [true, true]} : memref<4096x4096xf32, #hal.descriptor_type>, vector<1x4xf32> +// CHECK: %[[D5:.*]] = affine.apply #{{.*}}()[%[[TX]]] +// CHECK: vector.transfer_write %[[D4]], %[[ALLOC]][%[[TY]], %[[D5]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<32x33xf32, #gpu.address_space> // CHECK: gpu.barrier -// CHECK: %[[D11:.*]] = affine.apply #{{.*}}()[%[[D0]]] -// CHECK: %[[D12:.*]] = vector.transfer_read %[[D3]][%[[D11]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<32x33xf32, #gpu.address_space>, vector<4x1xf32> -// CHECK: %[[D13:.*]] = vector.shape_cast %[[D12]] : vector<4x1xf32> to vector<4xf32> -// CHECK: %[[D15:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D1]]] -// CHECK: %[[D16:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]] -// CHECK: vector.transfer_write %[[D13]], %[[D5]][%[[D15]], %[[D16]]] {in_bounds = [true]} : vector<4xf32>, memref<4096x4096xf32, #hal.descriptor_type> +// CHECK: %[[D6:.*]] = vector.transfer_read %[[ALLOC]][%[[D5]], %[[TY]]], %[[CST]] {in_bounds = [true, true]} : memref<32x33xf32, #gpu.address_space>, vector<4x1xf32> +// CHECK: %[[D7:.*]] = vector.shape_cast %[[D6]] : vector<4x1xf32> to vector<4xf32> +// CHECK: %[[D8:.*]] = affine.apply #{{.*}}()[%{{.+}}, %[[TY]]] +// CHECK: %[[D9:.*]] = affine.apply #{{.*}}()[%{{.+}}, %[[TX]]] +// CHECK: vector.transfer_write %[[D7]], %[[D1]][%[[D8]], %[[D9]]] {in_bounds = [true]} : vector<4xf32>, memref<4096x4096xf32, #hal.descriptor_type> // ----- @@ -96,34 +93,31 @@ hal.executable @transpose_single_operand_dispatch_0_generic_768x2048 { // CHECK-LABEL: hal.executable public @transpose_single_operand_dispatch_0_generic_768x2048 // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[D0:.*]] = gpu.thread_id x -// CHECK: %[[D1:.*]] = gpu.thread_id y -// CHECK: %[[D2:.*]] = gpu.thread_id z -// CHECK: %[[D3:.*]] = memref.alloc() : memref<32x33xf32, 
#gpu.address_space> -// CHECK: %[[D4:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(0) alignment(64) offset(%[[C0]]) : memref<2048x768xf32, #hal.descriptor_type> -// CHECK: memref.assume_alignment %[[D4]], 64 : memref<2048x768xf32, #hal.descriptor_type> -// CHECK: %[[D5:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(1) alignment(64) offset(%[[C0]]) : memref<768x2048xf32, #hal.descriptor_type> -// CHECK: memref.assume_alignment %[[D5]], 64 : memref<768x2048xf32, #hal.descriptor_type> -// CHECK: %[[D6:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(2) alignment(64) offset(%[[C0]]) : memref<768x2048xf32, #hal.descriptor_type> -// CHECK: memref.assume_alignment %[[D6]], 64 : memref<768x2048xf32, #hal.descriptor_type> +// CHECK: %[[TX:.*]] = gpu.thread_id x +// CHECK: %[[TY:.*]] = gpu.thread_id y +// CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<32x33xf32, #gpu.address_space> +// CHECK: %[[D0:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(0) alignment(64) offset(%[[C0]]) : memref<2048x768xf32, #hal.descriptor_type> +// CHECK: memref.assume_alignment %[[D0]], 64 : memref<2048x768xf32, #hal.descriptor_type> +// CHECK: %[[D1:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(1) alignment(64) offset(%[[C0]]) : memref<768x2048xf32, #hal.descriptor_type> +// CHECK: memref.assume_alignment %[[D1]], 64 : memref<768x2048xf32, #hal.descriptor_type> +// CHECK: %[[D2:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(2) alignment(64) offset(%[[C0]]) : memref<768x2048xf32, #hal.descriptor_type> +// CHECK: memref.assume_alignment %[[D2]], 64 : memref<768x2048xf32, #hal.descriptor_type> // CHECK: gpu.barrier -// CHECK: %[[D7:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]], %[[D1]], %[[D2]]] -// CHECK: %[[D8:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]] -// CHECK: %[[D9:.*]] = vector.transfer_read %[[D4]][%[[D7]], %[[D8]]], %[[CST]] {in_bounds = [true, true]} : memref<2048x768xf32, #hal.descriptor_type>, vector<1x4xf32> -// CHECK: %[[D10:.*]] = affine.apply #{{.*}}()[%[[D0]], %[[D1]], %[[D2]]] -// CHECK: %[[D11:.*]] = affine.apply #{{.*}}()[%[[D0]]] -// CHECK: vector.transfer_write %[[D9]], %[[D3]][%[[D10]], %[[D11]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<32x33xf32, #gpu.address_space> +// CHECK: %[[D3:.*]] = affine.apply #{{.*}}()[%[[TX]]] +// CHECK: %[[D4:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[TY]]] +// CHECK: %[[D5:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[TX]]] +// CHECK: %[[D6:.*]] = vector.transfer_read %[[D0]][%[[D4]], %[[D5]]], %[[CST]] {in_bounds = [true, true]} : memref<2048x768xf32, #hal.descriptor_type>, vector<1x4xf32> +// CHECK: vector.transfer_write %[[D6]], %[[ALLOC]][%[[TY]], %[[D3]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<32x33xf32, #gpu.address_space> // CHECK: gpu.barrier -// CHECK: %[[D12:.*]] = affine.apply #{{.*}}()[%[[D0]]] -// CHECK: %[[D13:.*]] = vector.transfer_read %[[D3]][%[[D12]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<32x33xf32, #gpu.address_space>, vector<4x1xf32> -// CHECK: %[[DUP_D15:.*]] = arith.addi %[[D1]], %{{.*}} : index -// CHECK: %[[DUP_D16:.*]] = arith.addi %[[D12]], %{{.*}} : index -// CHECK: %[[D17:.*]] = vector.transfer_read %[[D5]][%[[DUP_D15]], %[[DUP_D16]]], %[[CST]] {in_bounds = [true]} : memref<768x2048xf32, #hal.descriptor_type>, vector<4xf32> -// CHECK: %[[D14:.*]] = vector.shape_cast %[[D13]] : vector<4x1xf32> to vector<4xf32> -// CHECK: %[[D19:.*]] = arith.addf %[[D14]], %[[D17]] : vector<4xf32> -// CHECK: %[[D15:.*]] = affine.apply 
#{{.*}}()[%{{.*}}, %[[D1]]] -// CHECK: %[[D16:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]] -// CHECK: vector.transfer_write %[[D19]], %[[D6]][%[[D15]], %[[D16]]] {in_bounds = [true]} : vector<4xf32>, memref<768x2048xf32, #hal.descriptor_type> +// CHECK: %[[D7:.*]] = vector.transfer_read %[[ALLOC]][%[[D3]], %[[TY]]], %[[CST]] {in_bounds = [true, true]} : memref<32x33xf32, #gpu.address_space>, vector<4x1xf32> +// CHECK: %[[D8:.*]] = arith.addi %[[TY]], %{{.*}} +// CHECK: %[[D9:.*]] = arith.addi %[[D3]], %{{.*}} +// CHECK: %[[D10:.*]] = vector.transfer_read %[[D1]][%[[D8]], %[[D9]]], %[[CST]] {in_bounds = [true]} : memref<768x2048xf32, #hal.descriptor_type>, vector<4xf32> +// CHECK: %[[D11:.*]] = vector.shape_cast %[[D7]] : vector<4x1xf32> to vector<4xf32> +// CHECK: %[[D12:.*]] = arith.addf %[[D11]], %[[D10]] : vector<4xf32> +// CHECK: %[[D13:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[TY]]] +// CHECK: %[[D14:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[TX]]] +// CHECK: vector.transfer_write %[[D12]], %[[D2]][%[[D13]], %[[D14]]] {in_bounds = [true]} : vector<4xf32>, memref<768x2048xf32, #hal.descriptor_type> // ----- @@ -205,34 +199,31 @@ hal.executable @transpose_3d_yes_dispatch_0_generic_10x768x2048 { // CHECK-LABEL: hal.executable public @transpose_3d_yes_dispatch_0_generic_10x768x2048 { // CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[D0:.*]] = gpu.thread_id x -// CHECK: %[[D1:.*]] = gpu.thread_id y -// CHECK: %[[D2:.*]] = gpu.thread_id z -// CHECK: %[[D3:.*]] = memref.alloc() : memref<1x32x33xf32, #gpu.address_space> -// CHECK: %[[D4:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(0) alignment(64) offset(%[[C0]]) : memref<10x2048x768xf32, #hal.descriptor_type> -// CHECK: memref.assume_alignment %[[D4]], 64 : memref<10x2048x768xf32, #hal.descriptor_type> -// CHECK: %[[D5:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(1) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32, #hal.descriptor_type> -// CHECK: memref.assume_alignment %[[D5]], 64 : memref<10x768x2048xf32, #hal.descriptor_type> -// CHECK: %[[D6:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(2) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32, #hal.descriptor_type> -// CHECK: memref.assume_alignment %[[D6]], 64 : memref<10x768x2048xf32, #hal.descriptor_type> +// CHECK: %[[TX:.*]] = gpu.thread_id x +// CHECK: %[[TY:.*]] = gpu.thread_id y +// CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<1x32x33xf32, #gpu.address_space> +// CHECK: %[[D0:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(0) alignment(64) offset(%[[C0]]) : memref<10x2048x768xf32, #hal.descriptor_type> +// CHECK: memref.assume_alignment %[[D0]], 64 : memref<10x2048x768xf32, #hal.descriptor_type> +// CHECK: %[[D1:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(1) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32, #hal.descriptor_type> +// CHECK: memref.assume_alignment %[[D1]], 64 : memref<10x768x2048xf32, #hal.descriptor_type> +// CHECK: %[[D2:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(2) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32, #hal.descriptor_type> +// CHECK: memref.assume_alignment %[[D2]], 64 : memref<10x768x2048xf32, #hal.descriptor_type> // CHECK: gpu.barrier -// CHECK: %[[D7:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]], %[[D1]], %[[D2]]] -// CHECK: %[[D8:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]] -// CHECK: %[[D9:.*]] = vector.transfer_read %[[D4]][%{{.*}}, %[[D7]], 
%[[D8]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x2048x768xf32, #hal.descriptor_type>, vector<1x1x4xf32> -// CHECK: %[[D10:.*]] = affine.apply #{{.*}}()[%[[D0]], %[[D1]], %[[D2]]] -// CHECK: %[[D11:.*]] = affine.apply #{{.*}}()[%[[D0]]] -// CHECK: vector.transfer_write %[[D9]], %[[D3]][%[[C0]], %[[D10]], %[[D11]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space> +// CHECK: %[[D3:.*]] = affine.apply #{{.*}}()[%[[TX]]] +// CHECK: %[[D4:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[TY]]] +// CHECK: %[[D5:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[TX]]] +// CHECK: %[[D6:.*]] = vector.transfer_read %[[D0]][%{{.*}}, %[[D4]], %[[D5]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x2048x768xf32, #hal.descriptor_type>, vector<1x1x4xf32> +// CHECK: vector.transfer_write %[[D6]], %[[ALLOC]][%[[C0]], %[[TY]], %[[D3]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space> // CHECK: gpu.barrier -// CHECK: %[[D12:.*]] = affine.apply #{{.*}}()[%[[D0]]] -// CHECK: %[[D13:.*]] = vector.transfer_read %[[D3]][%[[C0]], %[[D12]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space>, vector<4x1xf32> -// CHECK: %[[DUP_D16:.*]] = arith.addi %[[D1]], %{{.*}} : index -// CHECK: %[[DUP_D17:.*]] = arith.addi %[[D12]], %{{.*}} : index -// CHECK: %[[D18:.*]] = vector.transfer_read %[[D5]][%{{.*}}, %[[DUP_D16]], %[[DUP_D17]]], %[[CST]] {in_bounds = [true]} : memref<10x768x2048xf32, #hal.descriptor_type>, vector<4xf32> -// CHECK: %[[D15:.*]] = vector.shape_cast %[[D13]] : vector<4x1xf32> to vector<4xf32> -// CHECK: %[[D20:.*]] = arith.addf %[[D15]], %[[D18]] : vector<4xf32> -// CHECK: %[[D16:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D1]]] -// CHECK: %[[D17:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]] -// CHECK: vector.transfer_write %[[D20]], %[[D6]][%{{.*}}, %[[D16]], %[[D17]]] {in_bounds = [true]} : vector<4xf32>, memref<10x768x2048xf32, #hal.descriptor_type> +// CHECK: %[[D7:.*]] = vector.transfer_read %[[ALLOC]][%[[C0]], %[[D3]], %[[TY]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space>, vector<4x1xf32> +// CHECK: %[[D8:.*]] = arith.addi %[[TY]], %{{.*}} +// CHECK: %[[D9:.*]] = arith.addi %[[D3]], %{{.*}} +// CHECK: %[[D10:.*]] = vector.transfer_read %[[D1]][%{{.*}}, %[[D8]], %[[D9]]], %[[CST]] {in_bounds = [true]} : memref<10x768x2048xf32, #hal.descriptor_type>, vector<4xf32> +// CHECK: %[[D11:.*]] = vector.shape_cast %[[D7]] : vector<4x1xf32> to vector<4xf32> +// CHECK: %[[D12:.*]] = arith.addf %[[D11]], %[[D10]] : vector<4xf32> +// CHECK: %[[D13:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[TY]]] +// CHECK: %[[D14:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[TX]]] +// CHECK: vector.transfer_write %[[D12]], %[[D2]][%{{.*}}, %[[D13]], %[[D14]]] {in_bounds = [true]} : vector<4xf32>, memref<10x768x2048xf32, #hal.descriptor_type> // ----- @@ -273,35 +264,32 @@ hal.executable @transpose_3d_trans_out_dispatch_0_generic_10x2048x768 { // CHECK-LABEL: hal.executable public @transpose_3d_trans_out_dispatch_0_generic_10x2048x768 { // CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[D0:.*]] = gpu.thread_id x -// CHECK: %[[D1:.*]] = gpu.thread_id y -// CHECK: %[[D2:.*]] = gpu.thread_id z -// CHECK: %[[D3:.*]] = memref.alloc() : memref<1x32x33xf32, #gpu.address_space> -// CHECK: %[[D4:.*]] = memref.alloc() : memref<1x32x33xf32, #gpu.address_space> -// CHECK: %[[D5:.*]] = 
hal.interface.binding.subspan layout({{.+}}) binding(0) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32, #hal.descriptor_type> -// CHECK: memref.assume_alignment %[[D5]], 64 : memref<10x768x2048xf32, #hal.descriptor_type> -// CHECK: %[[D6:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(1) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32, #hal.descriptor_type> -// CHECK: memref.assume_alignment %[[D6]], 64 : memref<10x768x2048xf32, #hal.descriptor_type> -// CHECK: %[[D7:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(2) alignment(64) offset(%[[C0]]) : memref<10x2048x768xf32, #hal.descriptor_type> -// CHECK: memref.assume_alignment %[[D7]], 64 : memref<10x2048x768xf32, #hal.descriptor_type> +// CHECK: %[[TX:.*]] = gpu.thread_id x +// CHECK: %[[TY:.*]] = gpu.thread_id y +// CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<1x32x33xf32, #gpu.address_space> +// CHECK: %[[ALLOC1:.*]] = memref.alloc() : memref<1x32x33xf32, #gpu.address_space> +// CHECK: %[[D0:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(0) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32, #hal.descriptor_type> +// CHECK: memref.assume_alignment %[[D0]], 64 : memref<10x768x2048xf32, #hal.descriptor_type> +// CHECK: %[[D1:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(1) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32, #hal.descriptor_type> +// CHECK: memref.assume_alignment %[[D1]], 64 : memref<10x768x2048xf32, #hal.descriptor_type> +// CHECK: %[[D2:.*]] = hal.interface.binding.subspan layout({{.+}}) binding(2) alignment(64) offset(%[[C0]]) : memref<10x2048x768xf32, #hal.descriptor_type> +// CHECK: memref.assume_alignment %[[D2]], 64 : memref<10x2048x768xf32, #hal.descriptor_type> // CHECK: gpu.barrier -// CHECK: %[[D8:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]], %[[D1]], %[[D2]]] -// CHECK: %[[D9:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]] -// CHECK: %[[D10:.*]] = vector.transfer_read %[[D5]][%{{.*}}, %[[D8]], %[[D9]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x768x2048xf32, #hal.descriptor_type>, vector<1x1x4xf32> -// CHECK: %[[D11:.*]] = affine.apply #{{.*}}()[%[[D0]], %[[D1]], %[[D2]]] -// CHECK: %[[D12:.*]] = affine.apply #{{.*}}()[%[[D0]]] -// CHECK: vector.transfer_write %[[D10]], %[[D4]][%[[C0]], %[[D11]], %[[D12]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space> -// CHECK: %[[D13:.*]] = vector.transfer_read %[[D6]][%{{.*}}, %[[D8]], %[[D9]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x768x2048xf32, #hal.descriptor_type>, vector<1x1x4xf32> -// CHECK: vector.transfer_write %[[D13]], %[[D3]][%[[C0]], %[[D11]], %[[D12]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space> +// CHECK: %[[D3:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[TY]]] +// CHECK: %[[D4:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[TX]]] +// CHECK: %[[D5:.*]] = vector.transfer_read %[[D0]][%{{.*}}, %[[D3]], %[[D4]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x768x2048xf32, #hal.descriptor_type>, vector<1x1x4xf32> +// CHECK: %[[D6:.*]] = affine.apply #{{.*}}()[%[[TX]]] +// CHECK: vector.transfer_write %[[D5]], %[[ALLOC1]][%[[C0]], %[[TY]], %[[D6]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space> +// CHECK: %[[D7:.*]] = vector.transfer_read %[[D1]][%{{.*}}, %[[D3]], %[[D4]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x768x2048xf32, #hal.descriptor_type>, vector<1x1x4xf32> +// CHECK: vector.transfer_write 
%[[D7]], %[[ALLOC]][%[[C0]], %[[TY]], %[[D6]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space> // CHECK: gpu.barrier -// CHECK: %[[D14:.*]] = affine.apply #{{.*}}()[%[[D0]]] -// CHECK: %[[D15:.*]] = vector.transfer_read %[[D4]][%[[C0]], %[[D14]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space>, vector<4x1xf32> -// CHECK: %[[D16:.*]] = vector.transfer_read %[[D3]][%[[C0]], %[[D14]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space>, vector<4x1xf32> -// CHECK: %[[D17:.*]] = arith.addf %[[D15]], %[[D16]] : vector<4x1xf32> -// CHECK: %[[D19:.*]] = vector.shape_cast %[[D17]] : vector<4x1xf32> to vector<4xf32> -// CHECK: %[[D21:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D1]]] -// CHECK: %[[D22:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]] -// CHECK: vector.transfer_write %[[D19]], %[[D7]][%{{.*}}, %[[D21]], %[[D22]]] {in_bounds = [true]} : vector<4xf32>, memref<10x2048x768xf32, #hal.descriptor_type> +// CHECK: %[[D8:.*]] = vector.transfer_read %[[ALLOC1]][%[[C0]], %[[D6]], %[[TY]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space>, vector<4x1xf32> +// CHECK: %[[D9:.*]] = vector.transfer_read %[[ALLOC]][%[[C0]], %[[D6]], %[[TY]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space>, vector<4x1xf32> +// CHECK: %[[D10:.*]] = arith.addf %[[D8]], %[[D9]] : vector<4x1xf32> +// CHECK: %[[D11:.*]] = vector.shape_cast %[[D10]] : vector<4x1xf32> to vector<4xf32> +// CHECK: %[[D12:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[TY]]] +// CHECK: %[[D13:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[TX]]] +// CHECK: vector.transfer_write %[[D11]], %[[D2]][%{{.*}}, %[[D12]], %[[D13]]] {in_bounds = [true]} : vector<4xf32>, memref<10x2048x768xf32, #hal.descriptor_type> // ----- From 5708d428b029504498cb00e19710d7ecc4a89670 Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Tue, 26 Nov 2024 14:08:46 -0800 Subject: [PATCH 11/54] [python] Make detection of TimeoutError compatible with 3.10. (#19308) * In python < 3.11, future methods throw concurrent.futures.TimeoutError. * In python 3.11, concurrent.futures.TimeoutError is a subclass of TimeoutError and the former is deprecated. * In python 3.10, TimeoutError exists so can have an except clause, but it is not thrown by futures. --------- Signed-off-by: Stella Laurenzo --- compiler/bindings/python/iree/build/executor.py | 6 ++++++ .../bindings/python/test/build_api/CMakeLists.txt | 13 ++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/compiler/bindings/python/iree/build/executor.py b/compiler/bindings/python/iree/build/executor.py index 8554e207976a..6e5afe8939ec 100644 --- a/compiler/bindings/python/iree/build/executor.py +++ b/compiler/bindings/python/iree/build/executor.py @@ -519,6 +519,12 @@ def _service_graph(self): completed_deps.add(completed_dep) except TimeoutError: pass + except concurrent.futures.TimeoutError: + # In Python 3.10, future access throws concurrent.futures.TimeoutError. + # In 3.11, that was made a subclass of TimeoutError, which is advertised + # as thrown (and the original is marked as deprecated). + # TODO: Remove this clause once 3.10 support is dropped. + pass # Purge done from in-flight list. 
self.in_flight_deps.difference_update(completed_deps) diff --git a/compiler/bindings/python/test/build_api/CMakeLists.txt b/compiler/bindings/python/test/build_api/CMakeLists.txt index 9721bb06887b..fceac40c531c 100644 --- a/compiler/bindings/python/test/build_api/CMakeLists.txt +++ b/compiler/bindings/python/test/build_api/CMakeLists.txt @@ -14,10 +14,9 @@ if(IREE_INPUT_TORCH) ) endif() -# FIXME: This test fails on python3.10. -# iree_py_test( -# NAME -# concurrency_test -# SRCS -# "concurrency_test.py" -#) +iree_py_test( + NAME + concurrency_test + SRCS + "concurrency_test.py" +) From 41115bba05960e563791ce6ed1af26093f4fab1e Mon Sep 17 00:00:00 2001 From: Stanley Winata <68087699+raikonenfnu@users.noreply.github.com> Date: Tue, 26 Nov 2024 15:25:35 -0800 Subject: [PATCH 12/54] [Codegen] Bubble up Transpose attention V and try fuse with others before attention (#19250) The Flash Attention transpose_V variant is significantly faster than the non-transpose_V variant. This is due to many matmul intrinsics being mmtb by default. Hence, doing FA transpose_V will allow for better/more contiguous reads from shared memory to registers, improving the attention performance quite a bit. This PR exposes the attention_transposeV form by generating a linalg.transpose on the V while bubbling up transposes, so that we can give the graph some opportunities to fuse the transpose-V to its producer. I have also confirmed that if we do not find any producer, the transpose will indeed fuse back with the attentionOp. Hence, in the worst case, we will get the same perf as before this PR. Additionally, we modify elementwise op fusion to try to fuse the transpose with other ops before letting it get fused back into attention. --------- Signed-off-by: Stanley Winata --- .github/workflows/pkgci_regression_test.yml | 16 +-- .../attention_and_matmul_spec_punet.mlir | 71 ++++++++++++ .../Dialect/LinalgExt/Transforms/Transforms.h | 6 + .../LinalgExt/Transforms/TransposeFusion.cpp | 105 ++++++++++++++++++ .../DispatchCreation/ElementwiseOpFusion.cpp | 28 +++-- .../test/elementwise_op_fusion.mlir | 51 +++++++++ .../PropagateLinalgTranspose.cpp | 10 ++ .../test/propagate_linalg_transpose.mlir | 44 ++++++++ 8 files changed, 314 insertions(+), 17 deletions(-) diff --git a/.github/workflows/pkgci_regression_test.yml b/.github/workflows/pkgci_regression_test.yml index 8b0b4e0189d2..7a67778c0585 100644 --- a/.github/workflows/pkgci_regression_test.yml +++ b/.github/workflows/pkgci_regression_test.yml @@ -220,7 +220,7 @@ jobs: --goldentime-rocm-unet-ms 419.0 \ --goldentime-rocm-clip-ms 18.5 \ --goldentime-rocm-vae-ms 337.0 \ - --goldendispatch-rocm-unet 1531 \ + --goldendispatch-rocm-unet 1602 \ --goldendispatch-rocm-clip 1139 \ --goldendispatch-rocm-vae 246 \ --goldensize-rocm-unet-bytes 2280000 \ @@ -238,21 +238,21 @@ jobs: run: | source ${VENV_DIR}/bin/activate pytest ./experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py \ - --goldentime-rocm-e2e-ms 372.0 \ - --goldentime-rocm-unet-ms 95.0 \ + --goldentime-rocm-e2e-ms 330.0 \ + --goldentime-rocm-unet-ms 80.0 \ --goldentime-rocm-clip-ms 15.5 \ --goldentime-rocm-vae-ms 80.0 \ - --goldendispatch-rocm-unet 1531 \ + --goldendispatch-rocm-unet 1602 \ --goldendispatch-rocm-clip 1139 \ --goldendispatch-rocm-vae 246 \ --goldensize-rocm-unet-bytes 2270000 \ --goldensize-rocm-clip-bytes 860000 \ --goldensize-rocm-vae-bytes 840000 \ - --goldentime-rocm-punet-int8-fp16-ms 55 \ - --goldendispatch-rocm-punet-int8-fp16 1284 \ + --goldentime-rocm-punet-int8-fp16-ms 53 \ + --goldendispatch-rocm-punet-int8-fp16 1424 \ 
--goldensize-rocm-punet-int8-fp16-bytes 2560000 \ - --goldentime-rocm-punet-int8-fp8-ms 59 \ - --goldendispatch-rocm-punet-int8-fp8 1564 \ + --goldentime-rocm-punet-int8-fp8-ms 53 \ + --goldendispatch-rocm-punet-int8-fp8 1704 \ --goldensize-rocm-punet-int8-fp8-bytes 2800000 \ --rocm-chip gfx942 \ --log-cli-level=info \ diff --git a/build_tools/pkgci/external_test_suite/attention_and_matmul_spec_punet.mlir b/build_tools/pkgci/external_test_suite/attention_and_matmul_spec_punet.mlir index a566203907e4..7b0944471990 100644 --- a/build_tools/pkgci/external_test_suite/attention_and_matmul_spec_punet.mlir +++ b/build_tools/pkgci/external_test_suite/attention_and_matmul_spec_punet.mlir @@ -208,6 +208,41 @@ transform.named_sequence @match_attention_f8(%attention: !transform.any_op {tran transform.yield %cont, %config : !transform.any_op, !transform.any_param } + + // Variant of matmul_like_Bx20x1024x64x1280_i8xi8xi32 from Transposed-V. + transform.named_sequence @match_matmul_like_Bx20x64x1024x1280_i8xi8xi32(%cont: !transform.any_op {transform.readonly}) + -> (!transform.any_op, !transform.any_param) { + %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %cont { + ^bb0(%lhs: tensor, %rhs: tensor<20x64x1280xi8>, %out: tensor): + %16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, + affine_map<(d0, d1, d2, d3, d4) -> (d1, d2, d4)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], + iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} + ins(%lhs, %rhs : tensor, tensor<20x64x1280xi8>) + outs(%out : tensor) { + ^bb0(%in: i8, %in_0: i8, %acc: i32): + %18 = arith.extsi %in : i8 to i32 + %19 = arith.extsi %in_0 : i8 to i32 + %20 = arith.muli %18, %19 : i32 + %21 = arith.addi %acc, %20 : i32 + linalg.yield %21 : i32 + } -> tensor + } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) + %config = transform.param.constant #iree_codegen.compilation_info< + lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], + mma_kind = #iree_gpu.mma_layout, + subgroup_m_count = 2, subgroup_n_count = 2, + reduction = [0, 0, 0, 0, 128], + workgroup = [1, 1, 160, 64, 0]}>, + translation_info = #iree_codegen.translation_info> + }> + > -> !transform.any_param + transform.yield %cont, %config : !transform.any_op, !transform.any_param + } + transform.named_sequence @match_matmul_like_Bx20x64x64x2048_i8xi8xi32(%cont: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) { %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %cont { @@ -239,6 +274,38 @@ transform.named_sequence @match_attention_f8(%attention: !transform.any_op {tran transform.yield %cont, %config : !transform.any_op, !transform.any_param } + // Variant of matmul_like_Bx20x64x64x2048_i8xi8xi32 from Transposed-V. 
+transform.named_sequence @match_matmul_like_Bx20x64x64x2048_transposev_i8xi8xi32(%cont: !transform.any_op {transform.readonly}) + -> (!transform.any_op, !transform.any_param) { + %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %cont { + ^bb0(%lhs: tensor, %rhs: tensor<20x64x2048xi8>, %out: tensor): + %16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, + affine_map<(d0, d1, d2, d3, d4) -> (d1, d2, d4)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], + iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} + ins(%lhs, %rhs : tensor, tensor<20x64x2048xi8>) + outs(%out : tensor) { + ^bb0(%in: i8, %in_0: i8, %acc: i32): + %18 = arith.extsi %in : i8 to i32 + %19 = arith.extsi %in_0 : i8 to i32 + %20 = arith.muli %18, %19 : i32 + %21 = arith.addi %acc, %20 : i32 + linalg.yield %21 : i32 + } -> tensor + } : (!transform.any_op) -> (!transform.any_value, !transform.any_value) + %config = transform.param.constant #iree_codegen.compilation_info< + lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], + mma_kind = #iree_gpu.mma_layout, + subgroup_m_count = 2, subgroup_n_count = 1, + reduction = [0, 0, 0, 0, 128], + workgroup = [1, 1, 320, 32, 0]}>, + translation_info = #iree_codegen.translation_info}> + > -> !transform.any_param + transform.yield %cont, %config : !transform.any_op, !transform.any_param + } + transform.named_sequence @match_matmul_like_Bx10x4096x64x640_i8xi8xi32(%cont: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) { %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %cont { @@ -302,6 +369,10 @@ transform.named_sequence @match_attention_f8(%attention: !transform.any_op {tran , @match_matmul_like_Bx10x4096x64x640_i8xi8xi32 -> @apply_op_config , @match_matmul_like_Bx20x64x64x2048_i8xi8xi32 -> @apply_op_config + // Transpose-V generated contraction. + , @match_matmul_like_Bx20x64x1024x1280_i8xi8xi32 -> @apply_op_config + , @match_matmul_like_Bx20x64x64x2048_transposev_i8xi8xi32 -> @apply_op_config + // TUNING_MATCH_END DO NOT REMOVE : (!transform.any_op) -> (!transform.any_op) transform.yield diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/Transforms.h b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/Transforms.h index aec6bd704e5d..8bf84cab2574 100644 --- a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/Transforms.h +++ b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/Transforms.h @@ -19,6 +19,12 @@ void populateFuseLinalgExtOpsWithTransposes( RewritePatternSet &patterns, const linalg::ControlFusionFn &controlFusionFn); +/// Bubble up transpose-like ops from LinalgExt ops (only `AttentionOp` +/// supported). +void populateBubbleTransposeFromLinalgExtOps( + RewritePatternSet &patterns, + const linalg::ControlFusionFn &controlFusionFn); + /// Helper struct to hold the results of collapsing an operation. 
struct CollapseResult { SmallVector results; diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/TransposeFusion.cpp b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/TransposeFusion.cpp index 2d158d54014a..bcc94ec951c0 100644 --- a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/TransposeFusion.cpp +++ b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/TransposeFusion.cpp @@ -7,6 +7,7 @@ #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtInterfaces.h" #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h" #include "iree/compiler/Dialect/LinalgExt/Transforms/Transforms.h" +#include "iree/compiler/Dialect/LinalgExt/Utils/IndexingUtils.h" #include "llvm/ADT/STLExtras.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" @@ -101,6 +102,103 @@ struct FuseTransposeWithAttentionOp final private: linalg::ControlFusionFn controlFn; }; + +// Bubbles transpose-V out of attention to expose the more performant +// attention-transposeV. +struct BubbleTransposeVFromAttentionOp + : public OpRewritePattern { + BubbleTransposeVFromAttentionOp(MLIRContext *context, + linalg::ControlFusionFn controlFn, + PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit), + controlFn(controlFn) {} + + LogicalResult matchAndRewrite(LinalgExt::AttentionOp attentionOp, + PatternRewriter &rewriter) const override { + // Only checking for V because we are only bubbling transpose-V. + OpOperand *valueOpOperand = &attentionOp.getValueMutable(); + if (controlFn && !controlFn(valueOpOperand)) { + return rewriter.notifyMatchFailure( + attentionOp, "Expected attentionOp and producer of V to be non-null " + "and outside dispatch."); + } + // Extract Attention indexing information. + AffineMap qMap = attentionOp.getQueryMap(); + AffineMap kMap = attentionOp.getKeyMap(); + AffineMap vMap = attentionOp.getValueMap(); + AffineMap oMap = attentionOp.getOutputMap(); + FailureOr maybeOpInfo = + AttentionOpDetail::get(qMap, kMap, vMap, oMap); + if (failed(maybeOpInfo)) { + return failure(); + } + + // Only handle single dim for K2 and N for now. + if (maybeOpInfo->getK2Dims().size() != 1 || + maybeOpInfo->getNDims().size() != 1) { + return failure(); + } + // Check that V has standard map/non transposed V. + AffineExpr k2Dim = + rewriter.getAffineDimExpr(maybeOpInfo->getK2Dims().back()); + AffineExpr nDim = rewriter.getAffineDimExpr(maybeOpInfo->getNDims().back()); + int64_t vRank = vMap.getNumResults(); + // TODO: This check is quite conservative, in the future we should simply + // do vMap.getResultPosition(k2Dim) > vMap.getResultPosition(nDim). + if (vMap.getResult(vRank - 1) != nDim || + vMap.getResult(vRank - 2) != k2Dim) { + return failure(); + } + + // Get dimension positions to prepare for transpose. + std::optional maybeK2Pos = vMap.getResultPosition(k2Dim); + std::optional maybeNPos = vMap.getResultPosition(nDim); + assert(maybeK2Pos.has_value() && maybeNPos.has_value() && + "Expected K2 dim and N dim to be in V-map."); + int64_t k2Pos = maybeK2Pos.value(); + int64_t nPos = maybeNPos.value(); + SmallVector perm = llvm::to_vector(llvm::seq(0, vRank)); + std::swap(perm[k2Pos], perm[nPos]); + + // Expose transposeOp for V. 
+ Location loc = attentionOp.getLoc(); + Value value = attentionOp.getValue(); + auto valueType = dyn_cast(value.getType()); + auto valueElType = valueType.getElementType(); + SmallVector transVShape = + tensor::getMixedSizes(rewriter, loc, value); + applyPermutationToVector(transVShape, perm); + Value initTransV = + rewriter.create(loc, transVShape, valueElType) + .getResult(); + Value transposeV = + rewriter.create(loc, value, initTransV, perm) + ->getResult(0); + + // Generate transpose V map. + SmallVector newExprs = + applyPermutation(vMap.getResults(), perm); + AffineMap transposedVMap = + AffineMap::get(vMap.getNumDims(), vMap.getNumSymbols(), newExprs, + rewriter.getContext()); + + // Modify attention to have transposed V inputs and mapping. + int64_t valueIndex = valueOpOperand->getOperandNumber(); + rewriter.modifyOpInPlace(attentionOp, [&]() { + SmallVector newIndexingMaps = + attentionOp.getIndexingMapsArray(); + newIndexingMaps[valueIndex] = transposedVMap; + attentionOp.setIndexingMapsAttr( + rewriter.getAffineMapArrayAttr(newIndexingMaps)); + attentionOp.setOperand(valueIndex, transposeV); + }); + return success(); + } + +private: + linalg::ControlFusionFn controlFn; +}; + } // namespace void populateFuseLinalgExtOpsWithTransposes( @@ -110,4 +208,11 @@ void populateFuseLinalgExtOpsWithTransposes( controlFusionFn); } +void populateBubbleTransposeFromLinalgExtOps( + RewritePatternSet &patterns, + const linalg::ControlFusionFn &controlFusionFn) { + patterns.add(patterns.getContext(), + controlFusionFn); +} + } // namespace mlir::iree_compiler::IREE::LinalgExt diff --git a/compiler/src/iree/compiler/DispatchCreation/ElementwiseOpFusion.cpp b/compiler/src/iree/compiler/DispatchCreation/ElementwiseOpFusion.cpp index 41db09f07a16..3c1a783ecba3 100644 --- a/compiler/src/iree/compiler/DispatchCreation/ElementwiseOpFusion.cpp +++ b/compiler/src/iree/compiler/DispatchCreation/ElementwiseOpFusion.cpp @@ -104,7 +104,6 @@ struct GatherFusionPattern final : public OpRewritePattern { void ElementwiseOpFusionPass::runOnOperation() { MLIRContext *context = &getContext(); - RewritePatternSet fusionPatterns(context); // Only fuse operations where all uses of the producer are generic // operations. If an operation is used in a named op, it will be computed // anyway, so the consumers can just use that value. @@ -135,24 +134,35 @@ void ElementwiseOpFusionPass::runOnOperation() { return areFusableAsElementwiseOps(context, fusedOperand, fuseMultiReduction); }; - linalg::populateElementwiseOpsFusionPatterns(fusionPatterns, + + RewritePatternSet linalgFusionPatterns(context); + linalg::populateElementwiseOpsFusionPatterns(linalgFusionPatterns, fuseElementwiseOpsControlFn); + GreedyRewriteConfig rewriteConfig; + rewriteConfig.maxIterations = GreedyRewriteConfig::kNoLimit; + if (failed(applyPatternsAndFoldGreedily( + getOperation(), std::move(linalgFusionPatterns), rewriteConfig))) { + getOperation()->emitOpError( + "Failed to fuse elementwise ops with upstream patterns."); + return signalPassFailure(); + } + + // Try fuse with linalgExt patterns. 
linalg::ControlFusionFn foldTransposeControlFn = [](OpOperand *fusedOperand) { Operation *producer = fusedOperand->get().getDefiningOp(); Operation *consumer = fusedOperand->getOwner(); return IREE::Flow::isNonNullAndOutsideDispatch({producer, consumer}); }; + RewritePatternSet linalgExtFusionPatterns(context); IREE::LinalgExt::populateFuseLinalgExtOpsWithTransposes( - fusionPatterns, foldTransposeControlFn); - fusionPatterns.insert(context); - - GreedyRewriteConfig rewriteConfig; - rewriteConfig.maxIterations = GreedyRewriteConfig::kNoLimit; + linalgExtFusionPatterns, foldTransposeControlFn); + linalgExtFusionPatterns.insert(context); if (failed(applyPatternsAndFoldGreedily( - getOperation(), std::move(fusionPatterns), rewriteConfig))) { - getOperation()->emitOpError("Failed to perform elementwise operations"); + getOperation(), std::move(linalgExtFusionPatterns), rewriteConfig))) { + getOperation()->emitOpError( + "Failed to fuse elementwise ops with linalgExt patterns."); return signalPassFailure(); } } diff --git a/compiler/src/iree/compiler/DispatchCreation/test/elementwise_op_fusion.mlir b/compiler/src/iree/compiler/DispatchCreation/test/elementwise_op_fusion.mlir index 8b556a03835d..096c882ab219 100644 --- a/compiler/src/iree/compiler/DispatchCreation/test/elementwise_op_fusion.mlir +++ b/compiler/src/iree/compiler/DispatchCreation/test/elementwise_op_fusion.mlir @@ -207,3 +207,54 @@ util.func public @fuse_generic_gather2( // CHECK-NEXT: %[[RES3:[a-zA-Z0-9]+]] = arith.mulf %[[RES]], %[[RES]] : f32 // CHECK-NEXT: %[[RES4:[a-zA-Z0-9]+]] = arith.addf %[[RES2]], %[[RES3]] : f32 // CHECK-NEXT: linalg.yield %[[RES4]] : f32 + +util.func public @fuse_transpose_attention_to_producer(%q: tensor<2x10x4096x64xf16>, %k: tensor<2x10x4096x64xf16>, %quantized_v: tensor<2x10x4096x64xi32>, %quant_offset: tensor<10x64xi32>, %quant_scale: tensor<10x64xf32>, %scale: f16) -> tensor<2x10x4096x64xf16> { + // Dequantize int-quantization of V + %init_dequant = tensor.empty() : tensor<2x10x4096x64xf16> + %v = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%quantized_v, %quant_offset, %quant_scale : tensor<2x10x4096x64xi32>, tensor<10x64xi32>, tensor<10x64xf32>) outs(%init_dequant : tensor<2x10x4096x64xf16>) { + ^bb0(%in: i32, %in_0: i32, %in_1: f32, %out: f16): + %19 = arith.addi %in, %in_0 : i32 + %20 = arith.sitofp %19 : i32 to f32 + %21 = arith.mulf %20, %in_1 : f32 + %22 = arith.truncf %21 : f32 to f16 + linalg.yield %22 : f16 + } -> tensor<2x10x4096x64xf16> + + // Transpose-V + %init_transpose = tensor.empty() : tensor<2x10x64x4096xf16> + %transpose_v = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%v : tensor<2x10x4096x64xf16>) outs(%init_transpose : tensor<2x10x64x4096xf16>) { + ^bb0(%in: f16, %out: f16): + linalg.yield %in : f16 + } -> tensor<2x10x64x4096xf16> + + // Attention-Transpose-V + %init_attention = tensor.empty() : tensor<2x10x4096x64xf16> + %attention = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, 
d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d5)>]} ins(%q, %k, %transpose_v, %scale : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) outs(%init_attention : tensor<2x10x4096x64xf16>) { + ^bb0(%score: f16): + iree_linalg_ext.yield %score: f16 + } -> tensor<2x10x4096x64xf16> + util.return %attention : tensor<2x10x4096x64xf16> +} + +// CHECK-LABEL: util.func public @fuse_transpose_attention_to_producer +// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor +// CHECK-SAME: %[[ARG1:[A-Za-z0-9]+]]: tensor +// CHECK-SAME: %[[ARG2:[A-Za-z0-9]+]]: tensor +// CHECK-SAME: %[[ARG3:[A-Za-z0-9]+]]: tensor +// CHECK-SAME: %[[ARG4:[A-Za-z0-9]+]]: tensor +// CHECK-SAME: %[[ARG5:[A-Za-z0-9]+]]: f16 +// CHECK: %[[DEQUANT_V:.+]] = linalg.generic +// CHECK-SAME: indexing_maps = +// CHECK-SAME: affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// CHECK-SAME: affine_map<(d0, d1, d2, d3) -> (d1, d3)> +// CHECK-SAME: affine_map<(d0, d1, d2, d3) -> (d1, d3)> +// CHECK-SAME: affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>] +// CHECK-SAME: ins(%[[ARG2]], %[[ARG3]], %[[ARG4]] +// CHECK: %[[RESULT:.+]] = iree_linalg_ext.attention +// CHECK-SAME: indexing_maps = +// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)> +// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d3)> +// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)> +// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5) -> ()> +// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d5)> +// CHECK-SAME: ins(%[[ARG0]], %[[ARG1]], %[[DEQUANT_V]], %[[ARG5]] diff --git a/compiler/src/iree/compiler/GlobalOptimization/PropagateLinalgTranspose.cpp b/compiler/src/iree/compiler/GlobalOptimization/PropagateLinalgTranspose.cpp index 846233841732..265ddbbc5890 100644 --- a/compiler/src/iree/compiler/GlobalOptimization/PropagateLinalgTranspose.cpp +++ b/compiler/src/iree/compiler/GlobalOptimization/PropagateLinalgTranspose.cpp @@ -13,6 +13,7 @@ #include "iree/compiler/Dialect/Flow/Conversion/TensorToFlow/Utils.h" #include "iree/compiler/Dialect/Flow/Transforms/RegionOpUtils.h" +#include "iree/compiler/Dialect/LinalgExt/Transforms/Transforms.h" #include "iree/compiler/Dialect/Util/IR/UtilOps.h" #include "iree/compiler/GlobalOptimization/Passes.h" #include "llvm/Support/Debug.h" @@ -1087,6 +1088,15 @@ void PropagateLinalgTransposePass::runOnOperation() { linalg::populateFoldReshapeOpsByExpansionPatterns(bubblingPatterns, reshapePropagationFn); linalg::FillOp::getCanonicalizationPatterns(bubblingPatterns, context); + linalg::ControlFusionFn bubbleTransposeControlFn = + [](OpOperand *fusedOperand) { + Operation *producer = fusedOperand->get().getDefiningOp(); + Operation *consumer = fusedOperand->getOwner(); + + return IREE::Flow::isNonNullAndOutsideDispatch({producer, consumer}); + }; + IREE::LinalgExt::populateBubbleTransposeFromLinalgExtOps( + bubblingPatterns, bubbleTransposeControlFn); bubblingPatterns.insert( context, enableAggressivePropagation); bubblingPatterns.insert(context); diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/propagate_linalg_transpose.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/propagate_linalg_transpose.mlir index 6b9571666808..16f37473eb47 100644 --- a/compiler/src/iree/compiler/GlobalOptimization/test/propagate_linalg_transpose.mlir +++ b/compiler/src/iree/compiler/GlobalOptimization/test/propagate_linalg_transpose.mlir @@ -665,3 +665,47 @@ util.func public @bubble_transpose_to_broadcast_elementwise(%arg0: 
tensor<2x3x4x // BUBBLE-SAME: ins(%[[ARG0]], %[[ARG1]] : tensor<2x3x4xf32>, tensor<2x4xf32> // BUBBLE: arith.addf // BUBBLE: util.return %[[ELEM]] : tensor<3x4x2xf32> + +// ----- + +util.func public @bubble_transpose_v_from_attention(%q: tensor<2x10x4096x64xf16>, %k: tensor<2x10x4096x64xf16>, %quantized_v: tensor<2x10x4096x64xi32>, %quant_offset: tensor<10x64xi32>, %quant_scale: tensor<10x64xf32>, %scale: f16) -> tensor<2x10x4096x64xf16> { + // Dequantize int-quantization of V + %init_dequant = tensor.empty() : tensor<2x10x4096x64xf16> + %v = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%quantized_v, %quant_offset, %quant_scale : tensor<2x10x4096x64xi32>, tensor<10x64xi32>, tensor<10x64xf32>) outs(%init_dequant : tensor<2x10x4096x64xf16>) { + ^bb0(%in: i32, %in_0: i32, %in_1: f32, %out: f16): + %19 = arith.addi %in, %in_0 : i32 + %20 = arith.sitofp %19 : i32 to f32 + %21 = arith.mulf %20, %in_1 : f32 + %22 = arith.truncf %21 : f32 to f16 + linalg.yield %22 : f16 + } -> tensor<2x10x4096x64xf16> + + // Attention with transposed V + %init_attention = tensor.empty() : tensor<2x10x4096x64xf16> + %attention = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> ()>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d5)>]} ins(%q, %k, %v, %scale : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, f16) outs(%init_attention : tensor<2x10x4096x64xf16>) { + ^bb0(%score: f16): + iree_linalg_ext.yield %score: f16 + } -> tensor<2x10x4096x64xf16> + util.return %attention : tensor<2x10x4096x64xf16> +} + + +// CHECK-DAG: #[[$MAP_Q:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)> +// CHECK-DAG: #[[$MAP_K:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d3)> +// CHECK-DAG: #[[$MAP_V:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)> +// CHECK-DAG: #[[$MAP_S:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> ()> +// CHECK-DAG: #[[$MAP_O:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d5)> + +// CHECK-LABEL: util.func public @bubble_transpose_v_from_attention( +// CHECK-SAME: %[[ARG0:.*]]: tensor<2x10x4096x64xf16>, %[[ARG1:.*]]: tensor<2x10x4096x64xf16>, %[[ARG2:.*]]: tensor<2x10x4096x64xi32>, +// CHECK-SAME: %[[ARG3:.*]]: tensor<10x64xi32>, %[[ARG4:.*]]: tensor<10x64xf32>, %[[ARG5:.*]]: f16) -> tensor<2x10x4096x64xf16> { +// CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<2x10x4096x64xf16> +// CHECK: %[[DEQUANT_V:.+]] = linalg.generic +// CHECK-SAME: ins(%[[ARG2]], %[[ARG3]], %[[ARG4]] : tensor<2x10x4096x64xi32>, tensor<10x64xi32>, tensor<10x64xf32>) +// CHECK-SAME: outs(%{{.*}} : tensor<2x10x4096x64xf16>) +// CHECK: %[[TRANS_V:.*]] = linalg.transpose ins(%[[DEQUANT_V]] : tensor<2x10x4096x64xf16>) outs({{.*}} : tensor<2x10x64x4096xf16>) permutation = [0, 1, 3, 2] +// CHECK: %[[ATTN:.*]] = iree_linalg_ext.attention +// CHECK-SAME: {indexing_maps = [#[[$MAP_Q]], #[[$MAP_K]], #[[$MAP_V]], #[[$MAP_S]], #[[$MAP_O]]]} +// CHECK-SAME: ins(%[[ARG0]], %[[ARG1]], %[[TRANS_V]], %[[ARG5]] : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x64x4096xf16>, f16) +// CHECK-SAME: outs(%[[EMPTY]] : 
tensor<2x10x4096x64xf16>) +// CHECK: util.return %[[ATTN]] : tensor<2x10x4096x64xf16> From 615e7ff5ad0b97a214802f6b1b8a965dd05c1722 Mon Sep 17 00:00:00 2001 From: Han-Chung Wang Date: Tue, 26 Nov 2024 20:36:28 -0800 Subject: [PATCH 13/54] [Codegen][NFC] Remove the HAL dependency from the Codegen dialect. (#19311) We used to set the translation_info attribute on HAL ops. Now they are set on the function, so the Codegen dialect no longer needs to depend on the HAL dialect. The revision also updates the comments for those functions. --------- Signed-off-by: hanhanW --- .../compiler/Codegen/Dialect/Codegen/IR/BUILD.bazel | 1 - .../Codegen/Dialect/Codegen/IR/CMakeLists.txt | 1 - .../Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp | 4 ++-- .../Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h | 11 ++++++----- compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h | 1 + compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h | 1 + 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/BUILD.bazel index 6a4ba9a8f272..ed860aeb5134 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/BUILD.bazel @@ -83,7 +83,6 @@ iree_compiler_cc_library( ":LoweringConfigInterfaceGen", ":UKernelOpsGen", "//compiler/src/iree/compiler/Codegen/Interfaces:UKernelOpInterface", - "//compiler/src/iree/compiler/Dialect/HAL/IR", "@llvm-project//llvm:Support", "@llvm-project//mlir:ArithDialect", "@llvm-project//mlir:ArithUtils", diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/CMakeLists.txt index 999819510652..0e02fea71220 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/CMakeLists.txt @@ -63,7 +63,6 @@ iree_cc_library( MLIRTransformDialectTransforms MLIRViewLikeInterface iree::compiler::Codegen::Interfaces::UKernelOpInterface - iree::compiler::Dialect::HAL::IR PUBLIC ) diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp index 6c80f5015458..b5403a2b1f88 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp @@ -476,8 +476,8 @@ void IREECodegenDialect::initializeCodegenAttrs() { namespace mlir::iree_compiler { //===----------------------------------------------------------------------===// -// Helpers for getting/setting iree_codegen.translation_info attribute on the -// `hal.executable.export` +// Helpers for getting/setting iree_codegen.translation_info attribute on a +// FunctionOpInterface op. 
// ===----------------------------------------------------------------------===// IREE::Codegen::TranslationInfoAttr diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h index 6dc56a673f8e..bb2d747f0a5f 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h @@ -11,12 +11,13 @@ #define IREE_COMPILER_CODEGEN_DIALECT_LOWERINGCONFIG_H_ #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.h" -#include "iree/compiler/Dialect/HAL/IR/HALOps.h" +#include "llvm/ADT/TypeSwitch.h" #include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/Interfaces/FunctionInterfaces.h" namespace mlir::iree_compiler { /// Typedef for tile sizes to use at different levels of tiling. @@ -43,8 +44,8 @@ constexpr StringLiteral kTuningSpecAttrName = constexpr StringLiteral kKernelConfigSpecName = "__kernel_config"; //===----------------------------------------------------------------------===// -// Helpers for getting/setting iree_codegen.translation_info attribute on the -// `hal.executable.export` +// Helpers for getting/setting iree_codegen.translation_info attribute on a +// FunctionOpInterface op. //===----------------------------------------------------------------------===// /// Returns the translation info for the `funcOp`. Returns `nullptr` on failure. @@ -59,8 +60,8 @@ getWorkgroupSize(mlir::FunctionOpInterface funcOp); std::optional getSubgroupSize(mlir::FunctionOpInterface funcOp); /// Sets and overwites the translate executable info for the given entry point. -/// Returns failure if the given entry point is not exported via -/// hal.executable.export. +/// Returns success() at the end. It is convenient when a caller need to +/// propagate the state. 
LogicalResult setTranslationInfo(mlir::FunctionOpInterface entryPoint, IREE::Codegen::TranslationInfoAttr translationInfo); diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h index 16e1fafcf805..d96dc0a1ad50 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h @@ -15,6 +15,7 @@ #include #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" +#include "iree/compiler/Dialect/HAL/IR/HALOps.h" #include "mlir/Pass/Pass.h" namespace mlir::iree_compiler { diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h index e7132c7bbd08..caacfb2656e3 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h @@ -17,6 +17,7 @@ #include "iree/compiler/Codegen/Common/GPU/Passes.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" #include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h" +#include "iree/compiler/Dialect/HAL/IR/HALOps.h" #include "mlir/Pass/Pass.h" namespace mlir::iree_compiler { From 7adf8c1b3213ec0573ccd7e3b3d05dc3c6c07a80 Mon Sep 17 00:00:00 2001 From: Stanley Winata <68087699+raikonenfnu@users.noreply.github.com> Date: Tue, 26 Nov 2024 23:46:24 -0800 Subject: [PATCH 14/54] [mlperf][pkgci] Update punet-fp8 with reduction dim as last dim (#19316) We have changes in sharktank that convert the reduction dim of the custom attention to be the fastest dimension. This makes it more uniform with the FP16 and canonical attention form, and hopefully lets optimizations get called more easily down the line. Additionally, this lands ahead of time so that we do not break the coming sharktank/mlperf bots and runs. Signed-off-by: Stanley Winata --- .../external_test_suite/attention_and_matmul_spec_punet.mlir | 2 +- .../regression_suite/shark-test-suite-models/sdxl/test_unet.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build_tools/pkgci/external_test_suite/attention_and_matmul_spec_punet.mlir b/build_tools/pkgci/external_test_suite/attention_and_matmul_spec_punet.mlir index 7b0944471990..16049fad2543 100644 --- a/build_tools/pkgci/external_test_suite/attention_and_matmul_spec_punet.mlir +++ b/build_tools/pkgci/external_test_suite/attention_and_matmul_spec_punet.mlir @@ -76,7 +76,7 @@ transform.named_sequence @match_attention_f8(%attention: !transform.any_op {tran transform.iree.match.cast_compatible_type %in0 = tensor : !transform.any_value %config = transform.param.constant #iree_codegen.compilation_info< - lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 1, 64, 0, 0, 0], reduction=[0, 0, 0, 0, 64, 0], promote_operands = [1, 2]}>, + lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 1, 64, 0, 0, 0], reduction=[0, 0, 0, 0, 0, 64], promote_operands = [1, 2]}>, translation_info = #iree_codegen.translation_info Date: Wed, 27 Nov 2024 00:52:47 -0800 Subject: [PATCH 15/54] Add SDXL EulerDiscreteScheduler compilation test (#19315) This commit adds a compilation test for the scheduler used in SDXL. We can iterate on this and add some tests that run the different functions as well (`run_initialize`, `run_scale`, `run_step`) and check for accuracy; a rough sketch of what such a check could look like is included below.
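As a rough, illustrative sketch only (not part of this patch): it assumes golden outputs and inputs are available as .npy files, assumes the compiled module exposes a `run_step` entry point with a sample and timestep input, and drives the stock `iree-run-module` CLI directly instead of the existing ireers_tools helpers.

    # Illustrative sketch, not part of this patch. The .npy fixtures, the run_step
    # signature, and the tolerances below are assumptions.
    import subprocess
    import numpy as np

    def check_run_step(vmfb_path, sample_npy, timestep_npy, golden_npy, device="hip"):
        out_npy = "run_step_output.npy"
        # Execute the compiled run_step entry point once and dump its result to a .npy file.
        subprocess.run(
            [
                "iree-run-module",
                f"--module={vmfb_path}",
                f"--device={device}",
                "--function=run_step",
                f"--input=@{sample_npy}",
                f"--input=@{timestep_npy}",
                f"--output=@{out_npy}",
            ],
            check=True,
        )
        # Compare against the golden reference with a tolerance loose enough for fp16.
        np.testing.assert_allclose(np.load(out_npy), np.load(golden_npy), rtol=1e-2, atol=1e-2)

A real test would wire this through the existing ireers_tools fixtures and whatever function signatures sharktank actually exports.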
ci-exactly: build_packages, regression_test --------- Signed-off-by: saienduri --- .../shark-test-suite-models/conftest.py | 4 +- .../shark-test-suite-models/sd3/test_clip.py | 2 +- .../shark-test-suite-models/sd3/test_mmdit.py | 2 +- .../shark-test-suite-models/sd3/test_vae.py | 2 +- .../shark-test-suite-models/sdxl/test_clip.py | 2 +- .../sdxl/test_scheduler.py | 76 +++++++++++++++++++ .../shark-test-suite-models/sdxl/test_vae.py | 2 +- 7 files changed, 84 insertions(+), 6 deletions(-) create mode 100644 experimental/regression_suite/shark-test-suite-models/sdxl/test_scheduler.py diff --git a/experimental/regression_suite/shark-test-suite-models/conftest.py b/experimental/regression_suite/shark-test-suite-models/conftest.py index 03f925804334..8e62bcb97274 100644 --- a/experimental/regression_suite/shark-test-suite-models/conftest.py +++ b/experimental/regression_suite/shark-test-suite-models/conftest.py @@ -9,13 +9,15 @@ class VmfbManager: sdxl_clip_cpu_vmfb = None sdxl_vae_cpu_vmfb = None sdxl_unet_fp16_cpu_vmfb = None + sdxl_unet_fp16_cpu_pipeline_vmfb = None + sdxl_scheduler_cpu_vmfb = None sdxl_clip_rocm_vmfb = None sdxl_vae_rocm_vmfb = None sdxl_unet_fp16_rocm_vmfb = None sdxl_punet_int8_fp16_rocm_vmfb = None sdxl_punet_int8_fp8_rocm_vmfb = None - sdxl_unet_fp16_cpu_pipeline_vmfb = None sdxl_unet_fp16_rocm_pipeline_vmfb = None + sdxl_scheduler_rocm_vmfb = None sd3_clip_cpu_vmfb = None sd3_vae_cpu_vmfb = None sd3_mmdit_cpu_vmfb = None diff --git a/experimental/regression_suite/shark-test-suite-models/sd3/test_clip.py b/experimental/regression_suite/shark-test-suite-models/sd3/test_clip.py index 61368765bf6a..d5e2778481c6 100644 --- a/experimental/regression_suite/shark-test-suite-models/sd3/test_clip.py +++ b/experimental/regression_suite/shark-test-suite-models/sd3/test_clip.py @@ -10,7 +10,7 @@ from conftest import VmfbManager from pathlib import Path -rocm_chip = os.getenv("ROCM_CHIP", default="gfx90a") +rocm_chip = os.getenv("ROCM_CHIP", default="gfx942") vmfb_dir = os.getenv("TEST_OUTPUT_ARTIFACTS", default=Path.cwd()) ############################################################################### diff --git a/experimental/regression_suite/shark-test-suite-models/sd3/test_mmdit.py b/experimental/regression_suite/shark-test-suite-models/sd3/test_mmdit.py index 12c25b53966d..0cbde07ba31a 100644 --- a/experimental/regression_suite/shark-test-suite-models/sd3/test_mmdit.py +++ b/experimental/regression_suite/shark-test-suite-models/sd3/test_mmdit.py @@ -10,7 +10,7 @@ from pathlib import Path from conftest import VmfbManager -rocm_chip = os.getenv("ROCM_CHIP", default="gfx90a") +rocm_chip = os.getenv("ROCM_CHIP", default="gfx942") vmfb_dir = os.getenv("TEST_OUTPUT_ARTIFACTS", default=Path.cwd()) ############################################################################### diff --git a/experimental/regression_suite/shark-test-suite-models/sd3/test_vae.py b/experimental/regression_suite/shark-test-suite-models/sd3/test_vae.py index b41566c6ae0f..72ae9e28167e 100644 --- a/experimental/regression_suite/shark-test-suite-models/sd3/test_vae.py +++ b/experimental/regression_suite/shark-test-suite-models/sd3/test_vae.py @@ -10,7 +10,7 @@ from conftest import VmfbManager from pathlib import Path -rocm_chip = os.getenv("ROCM_CHIP", default="gfx90a") +rocm_chip = os.getenv("ROCM_CHIP", default="gfx942") vmfb_dir = os.getenv("TEST_OUTPUT_ARTIFACTS", default=Path.cwd()) ############################################################################### diff --git 
a/experimental/regression_suite/shark-test-suite-models/sdxl/test_clip.py b/experimental/regression_suite/shark-test-suite-models/sdxl/test_clip.py index 416f57bc3955..978595cd4c7d 100644 --- a/experimental/regression_suite/shark-test-suite-models/sdxl/test_clip.py +++ b/experimental/regression_suite/shark-test-suite-models/sdxl/test_clip.py @@ -10,7 +10,7 @@ from conftest import VmfbManager from pathlib import Path -rocm_chip = os.getenv("ROCM_CHIP", default="gfx90a") +rocm_chip = os.getenv("ROCM_CHIP", default="gfx942") vmfb_dir = os.getenv("TEST_OUTPUT_ARTIFACTS", default=Path.cwd()) ############################################################################### diff --git a/experimental/regression_suite/shark-test-suite-models/sdxl/test_scheduler.py b/experimental/regression_suite/shark-test-suite-models/sdxl/test_scheduler.py new file mode 100644 index 000000000000..2003d88db07c --- /dev/null +++ b/experimental/regression_suite/shark-test-suite-models/sdxl/test_scheduler.py @@ -0,0 +1,76 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import pytest +from ireers_tools import * +import os +from conftest import VmfbManager +from pathlib import Path + +rocm_chip = os.getenv("ROCM_CHIP", default="gfx942") +vmfb_dir = os.getenv("TEST_OUTPUT_ARTIFACTS", default=Path.cwd()) + +############################################################################### +# Fixtures +############################################################################### + +sdxl_scheduler_mlir = fetch_source_fixture( + "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-scheduler/11-26-2024/model.mlir", + group="sdxl_scheduler", +) + +CPU_COMPILE_FLAGS = [ + "--iree-hal-target-backends=llvm-cpu", + "--iree-llvmcpu-target-cpu-features=host", + "--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false", + "--iree-llvmcpu-distribution-size=32", + "--iree-opt-const-eval=false", + "--iree-opt-strip-assertions=true", + "--iree-llvmcpu-enable-ukernels=all", + "--iree-global-opt-enable-quantized-matmul-reassociation", +] + + +ROCM_COMPILE_FLAGS = [ + "--iree-hal-target-backends=rocm", + f"--iree-hip-target={rocm_chip}", + "--iree-opt-const-eval=false", + "--iree-global-opt-propagate-transposes=true", + "--iree-llvmgpu-enable-prefetch=true", + "--iree-execution-model=async-external", + "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline,iree-preprocessing-pad-to-intrinsics)", + "--iree-scheduling-dump-statistics-format=json", + "--iree-scheduling-dump-statistics-file=compilation_info.json", +] + +############################################################################### +# CPU +############################################################################### + + +def test_compile_scheduler_cpu(sdxl_scheduler_mlir): + VmfbManager.sdxl_scheduler_cpu_vmfb = iree_compile( + sdxl_scheduler_mlir, + CPU_COMPILE_FLAGS, + Path(vmfb_dir) + / Path("sdxl_scheduler_vmfbs") + / Path(sdxl_scheduler_mlir.path.name).with_suffix(f".cpu.vmfb"), + ) + + +############################################################################### +# ROCM +############################################################################### + + +def test_compile_scheduler_rocm(sdxl_scheduler_mlir): + VmfbManager.sdxl_scheduler_rocm_vmfb = iree_compile( + sdxl_scheduler_mlir, + ROCM_COMPILE_FLAGS, + Path(vmfb_dir) + / 
Path("sdxl_scheduler_vmfbs") + / Path(sdxl_scheduler_mlir.path.name).with_suffix(f".rocm_{rocm_chip}.vmfb"), + ) diff --git a/experimental/regression_suite/shark-test-suite-models/sdxl/test_vae.py b/experimental/regression_suite/shark-test-suite-models/sdxl/test_vae.py index 42eb58f5d934..6eb8d903759b 100644 --- a/experimental/regression_suite/shark-test-suite-models/sdxl/test_vae.py +++ b/experimental/regression_suite/shark-test-suite-models/sdxl/test_vae.py @@ -10,7 +10,7 @@ from conftest import VmfbManager from pathlib import Path -rocm_chip = os.getenv("ROCM_CHIP", default="gfx90a") +rocm_chip = os.getenv("ROCM_CHIP", default="gfx942") vmfb_dir = os.getenv("TEST_OUTPUT_ARTIFACTS", default=Path.cwd()) ############################################################################### From 8cf207726908c0c2d7d9893ff6555ac702776d72 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Wed, 27 Nov 2024 11:48:15 -0500 Subject: [PATCH 16/54] Integrate with llvm-project at a807bbea6f48b368388cd796782724e3a53f58a0 (#19321) Still carrying a revert for 1004865f1ca41a9581da8747f34b29862d3ebc3d and a cherry pick for https://github.com/llvm/llvm-project/pull/116650. This time, we have some changes related to tablegen renaming in the vector dialect and op syntax changes in the bufferization dialect. --- .../Common/test/bufferize_copy_only_dispatches.mlir | 2 +- .../Codegen/Common/test/fold_tensor_extract_op.mlir | 2 +- .../Common/test/iree_comprehensive_bufferize.mlir | 4 ++-- .../test/tile_and_distribute_to_workgroups.mlir | 4 ++-- .../Codegen/Dialect/GPU/IR/IREEGPUDialect.td | 2 +- .../test/select_x86_64_lowering_strategy.mlir | 4 ++-- .../Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir | 4 ++-- .../SPIRV/test/config_default_linalg_ext_ops.mlir | 4 ++-- .../test/tile_and_promote_cooperative_matrix.mlir | 2 +- .../src/iree/compiler/Dialect/Flow/IR/FlowOps.td | 4 ++-- compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td | 2 +- compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td | 4 ++-- .../Dialect/LinalgExt/IR/TilingInterfaceImpl.cpp | 3 ++- .../src/iree/compiler/Dialect/Stream/IR/StreamOps.td | 12 ++++++------ .../include/iree-dialects/Dialect/Input/InputOps.td | 4 ++-- third_party/llvm-project | 2 +- 16 files changed, 30 insertions(+), 29 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/test/bufferize_copy_only_dispatches.mlir b/compiler/src/iree/compiler/Codegen/Common/test/bufferize_copy_only_dispatches.mlir index ab2a5bd354a7..9e346814804e 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/bufferize_copy_only_dispatches.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/bufferize_copy_only_dispatches.mlir @@ -88,7 +88,7 @@ func.func @concatenate_cst() { // CHECK-LABEL: func.func @concatenate_cst() // CHECK-DAG: %[[CST:.+]] = arith.constant dense<0> : tensor<2x3xi32> -// CHECK-DAG: %[[ZERO:.+]] = bufferization.to_memref %[[CST]] : memref<2x3xi32 +// CHECK-DAG: %[[ZERO:.+]] = bufferization.to_memref %[[CST]] : tensor<2x3xi32> to memref<2x3xi32 // CHECK-DAG: %[[DEST_BINDING:.+]] = hal.interface.binding.subspan // CHECK-DAG: %[[SUBVIEW:.+]] = memref.subview %[[DEST_BINDING]][0, 2] [2, 3] // CHECK: linalg.generic diff --git a/compiler/src/iree/compiler/Codegen/Common/test/fold_tensor_extract_op.mlir b/compiler/src/iree/compiler/Codegen/Common/test/fold_tensor_extract_op.mlir index 9938b0283aa9..967ee23fce15 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/fold_tensor_extract_op.mlir +++ 
b/compiler/src/iree/compiler/Codegen/Common/test/fold_tensor_extract_op.mlir @@ -4,7 +4,7 @@ func.func @fold_tensor_extract(%arg0 : memref<2x3xi32>) -> i32 { %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index - %0 = bufferization.to_tensor %arg0 : memref<2x3xi32> + %0 = bufferization.to_tensor %arg0 : memref<2x3xi32> to tensor<2x3xi32> %1 = tensor.extract %0[%c1, %c2] : tensor<2x3xi32> return %1 : i32 } diff --git a/compiler/src/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir b/compiler/src/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir index 42dd373e1c34..f0fe5eea375c 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir @@ -272,7 +272,7 @@ func.func @early_bufferized_copy_cst_ops() { %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index %cst = arith.constant dense<0> : tensor<2x3xi32> - %0 = bufferization.to_memref %cst : memref<2x3xi32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>> + %0 = bufferization.to_memref %cst : tensor<2x3xi32> to memref<2x3xi32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>> %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<2x5xi32> memref.assume_alignment %1, 64 : memref<2x5xi32> %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> @@ -1402,7 +1402,7 @@ func.func @bufferize_cst_output_tensor() { // CHECK-DAG: %[[CST1:.+]] = arith.constant -2147483648 : i32 // CHECK-DAG: %[[CST5:.+]] = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32> -// CHECK: %[[CAST5:.+]] = bufferization.to_memref %[[CST5]] : memref<5xi32> +// CHECK: %[[CAST5:.+]] = bufferization.to_memref %[[CST5]] : tensor<5xi32> to memref<5xi32> // CHECK: %[[INPUT:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0) : memref<5xf32, #hal.descriptor_type> // CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1) : memref> // CHECK: linalg.fill ins(%[[CST1]] : i32) outs(%[[OUTPUT]] : memref) diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir index 1a42e4dd2d5f..025a4bf19236 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir @@ -743,8 +743,8 @@ hal.executable private @static_3d_fft_stage3 { %c3 = arith.constant 3 : index %cst = arith.constant dense<[1.000000e+00, 0.707106769, 6.12323426E-17, -0.707106769]> : tensor<4xf32> %cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32> - %0 = bufferization.to_memref %cst_0 : memref<4xf32> - %1 = bufferization.to_memref %cst : memref<4xf32> + %0 = bufferization.to_memref %cst_0 : tensor<4xf32> to memref<4xf32> + %1 = bufferization.to_memref %cst : tensor<4xf32> to memref<4xf32> %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) : memref<64x128x32xf32> %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) : memref<64x128x32xf32> iree_linalg_ext.fft {lowering_config = #config} diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.td index 
d6655172cddf..dcac6c87ba3e 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.td @@ -41,7 +41,7 @@ def IREEGPU_Dialect : Dialect { class RankedTensorOrVectorOf allowedTypes> : ShapedContainerType]>, + Or<[IsVectorOfNonZeroRankTypePred, And<[IsTensorTypePred, HasRankPred]>]>, "ranked tensor or vector", "::mlir::ShapedType">; def AnyRankedTensorOrVector : RankedTensorOrVectorOf<[AnyType]>; diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir index 22a288062bc2..12e6058402ea 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir @@ -366,8 +366,8 @@ func.func @static_3d_fft_stage3() attributes {hal.executable.target = #executabl %c3 = arith.constant 3 : index %cst = arith.constant dense<[1.000000e+00, 0.707106769, 6.12323426E-17, -0.707106769]> : tensor<4xf32> %cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32> - %0 = bufferization.to_memref %cst_0 : memref<4xf32> - %1 = bufferization.to_memref %cst : memref<4xf32> + %0 = bufferization.to_memref %cst_0 : tensor<4xf32> to memref<4xf32> + %1 = bufferization.to_memref %cst : tensor<4xf32> to memref<4xf32> %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) : memref<64x128x32xf32> %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) : memref<64x128x32xf32> iree_linalg_ext.fft ins(%c3, %1, %0 : index, memref<4xf32>, memref<4xf32>) outs(%2, %3 : memref<64x128x32xf32>, memref<64x128x32xf32>) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir index 623e840d8108..6145ebd9688f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir @@ -231,8 +231,8 @@ func.func @static_3d_fft_stage3() { %c32 = arith.constant 32 : index %cst = arith.constant dense<[1.000000e+00, 0.707106769, 6.12323426E-17, -0.707106769]> : tensor<4xf32> %cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32> - %0 = bufferization.to_memref %cst_0 : memref<4xf32> - %1 = bufferization.to_memref %cst : memref<4xf32> + %0 = bufferization.to_memref %cst_0 : tensor<4xf32> to memref<4xf32> + %1 = bufferization.to_memref %cst : tensor<4xf32> to memref<4xf32> %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) : memref<64x128x32xf32> %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) : memref<64x128x32xf32> iree_linalg_ext.fft {__internal_linalg_transform__ = "workgroup"} ins(%c3, %1, %0 : index, memref<4xf32>, memref<4xf32>) outs(%2, %3 : memref<64x128x32xf32>, memref<64x128x32xf32>) diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ext_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ext_ops.mlir index 657b94155e40..c201d6011ba1 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ext_ops.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ext_ops.mlir @@ -99,8 +99,8 @@ func.func @static_3d_fft_stage3() { %c32 = 
arith.constant 32 : index %cst = arith.constant dense<[1.000000e+00, 0.707106769, 6.12323426E-17, -0.707106769]> : tensor<4xf32> %cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32> - %0 = bufferization.to_memref %cst_0 : memref<4xf32> - %1 = bufferization.to_memref %cst : memref<4xf32> + %0 = bufferization.to_memref %cst_0 : tensor<4xf32> to memref<4xf32> + %1 = bufferization.to_memref %cst : tensor<4xf32> to memref<4xf32> %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) : memref<64x128x32xf32> %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) : memref<64x128x32xf32> iree_linalg_ext.fft ins(%c3, %1, %0 : index, memref<4xf32>, memref<4xf32>) outs(%2, %3 : memref<64x128x32xf32>, memref<64x128x32xf32>) diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_cooperative_matrix.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_cooperative_matrix.mlir index cdec7efa7351..00ca9efdd21c 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_cooperative_matrix.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_cooperative_matrix.mlir @@ -616,7 +616,7 @@ func.func @matmul_f16_128x262144x2304() attributes {translation_info = #translat %c134217728 = arith.constant 134217728 : index %cst = arith.constant 0.000000e+00 : f16 %cst_0 = arith.constant dense<"0x69222B2E40A3002A45AC1AAB2E2E202DA21C212680264C2A102314A041A7D029CB28352E5BAAD3B02F299D9A142B8AA1D1285C28412B25AF9A24EE2BA22C242D53AD9E2948A9289FCF301D28012F08AD68A6DD20ECAC912465290B2E9420C5AA50A222A912AB9526B62ADA2039AD4D912C9FDD287B20B224D329BA2A4D2C41A76DAB7E30B027F62ED1A0F1273A2BAE9D0FA48029812992A65AA92A2C9C2EE9A744A4632C5FA8A9A4CF2D70A482A0F5A2DBA7B6304B9D22A52B1B9DA8E424722AB5ACD0248A2B8B29C82D782E402D1A99F0A60CA4DE2DD32815266F2A6B247FA6FE214E2853AA402390AB6925F1A339307F2664A23CACBE28BA2B3D286DB0BA2E"> : tensor<128xf16> - %0 = bufferization.to_memref %cst_0 : memref<128xf16> + %0 = bufferization.to_memref %cst_0 : tensor<128xf16> to memref<128xf16> %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c96565312) : memref<128x2304xf16> %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c806357120) : memref<2304x262144xf16> %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c134217728) : memref<128x262144xf16> diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td index 69d8cc419382..2992de96fc34 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td +++ b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td @@ -1269,7 +1269,7 @@ def FLOW_TensorLoadOp : FLOW_PureOp<"tensor.load", [ Variadic:$indices ); let results = (outs - AnyTypeOf<[FLOW_PrimitiveType, AnyVector]>:$result + AnyTypeOf<[FLOW_PrimitiveType, AnyVectorOfNonZeroRank]>:$result ); let assemblyFormat = [{ @@ -1315,7 +1315,7 @@ def FLOW_TensorStoreOp : FLOW_PureOp<"tensor.store", [ }]; let arguments = (ins - AnyTypeOf<[FLOW_PrimitiveType, AnyVector]>:$value, + AnyTypeOf<[FLOW_PrimitiveType, AnyVectorOfNonZeroRank]>:$value, FLOW_Tensor:$target, FLOW_ShapeDynamicDims:$target_dims, Variadic:$indices diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td b/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td index 3f1e8110b83e..8b60ef5a1083 100644 --- 
a/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td +++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td @@ -171,7 +171,7 @@ def HAL_PrimitiveType : AnyTypeOf<[Index, AnySignlessInteger, AnyFloat, AnyCompl def HAL_FillPatternType : AnyTypeOf<[I8, I16, I32]>; def HAL_GlobalRefAttr : Util_AliasedSymbolRefAttr; -def HAL_GlobalType : AnyTypeOf<[HAL_PrimitiveType, AnyVector, HAL_ObjectType]>; +def HAL_GlobalType : AnyTypeOf<[HAL_PrimitiveType, AnyVectorOfNonZeroRank, HAL_ObjectType]>; def HAL_GlobalPtr : Util_PtrOf; def HAL_IndexAttr : Util_IndexAttrBase<"index">; diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td index 25b9753070fd..16f1eadfdffd 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td +++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td @@ -746,7 +746,7 @@ def HAL_BufferLoadOp : HAL_PureOp<"buffer.load"> { HAL_DeviceSize:$source_offset ); let results = (outs - AnyTypeOf<[HAL_PrimitiveType, AnyVector]>:$result + AnyTypeOf<[HAL_PrimitiveType, AnyVectorOfNonZeroRank]>:$result ); let assemblyFormat = [{ @@ -764,7 +764,7 @@ def HAL_BufferStoreOp : HAL_Op<"buffer.store"> { }]; let arguments = (ins - AnyTypeOf<[HAL_PrimitiveType, AnyVector]>:$value, + AnyTypeOf<[HAL_PrimitiveType, AnyVectorOfNonZeroRank]>:$value, HAL_BufferType:$target_buffer, HAL_DeviceSize:$target_offset ); diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/IR/TilingInterfaceImpl.cpp b/compiler/src/iree/compiler/Dialect/LinalgExt/IR/TilingInterfaceImpl.cpp index f52db1ae53fb..5661020253dd 100644 --- a/compiler/src/iree/compiler/Dialect/LinalgExt/IR/TilingInterfaceImpl.cpp +++ b/compiler/src/iree/compiler/Dialect/LinalgExt/IR/TilingInterfaceImpl.cpp @@ -2186,7 +2186,8 @@ SmallVector CustomOp::getIterationDomainForDimensions( convertDimsToSymbols(context, maps, numDims, numSymbols); // 2b. Concat the affine maps. - AffineMap concatMap = inversePermutation(concatAffineMaps(modifiedMaps)); + AffineMap concatMap = + inversePermutation(concatAffineMaps(modifiedMaps, context)); // TODO: Ideally we should bail if the map is invalid, i.e. we abort from // applying the transformation. We could add this to the verifier as well, but // it is unclear if this makes the op invalid. 
Revisit after more experience diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td index 9fd5dcc007e9..d499dd00a7ce 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td +++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td @@ -340,7 +340,7 @@ def Stream_ResourceLoadOp : Stream_Op<"resource.load", [ Stream_Offset:$source_offset ); let results = (outs - AnyTypeOf<[Stream_PrimitiveType, AnyVector]>:$result + AnyTypeOf<[Stream_PrimitiveType, AnyVectorOfNonZeroRank]>:$result ); let assemblyFormat = [{ @@ -375,7 +375,7 @@ def Stream_ResourceStoreOp : Stream_Op<"resource.store", [ Stream_StagingResource:$target, Stream_Size:$target_size, Stream_Offset:$target_offset, - AnyTypeOf<[Stream_PrimitiveType, AnyVector]>:$value + AnyTypeOf<[Stream_PrimitiveType, AnyVectorOfNonZeroRank]>:$value ); let assemblyFormat = [{ @@ -1626,7 +1626,7 @@ def Stream_TensorLoadOp : Stream_PureOp<"tensor.load", [ Variadic:$indices ); let results = (outs - AnyTypeOf<[Stream_PrimitiveType, AnyVector]>:$result + AnyTypeOf<[Stream_PrimitiveType, AnyVectorOfNonZeroRank]>:$result ); let assemblyFormat = [{ @@ -1674,7 +1674,7 @@ def Stream_TensorStoreOp : Stream_PureOp<"tensor.store", [ Stream_ShapeDynamicDims:$target_encoding_dims, Stream_Size:$target_size, Variadic:$indices, - AnyTypeOf<[Stream_PrimitiveType, AnyVector]>:$value + AnyTypeOf<[Stream_PrimitiveType, AnyVectorOfNonZeroRank]>:$value ); let results = (outs Stream_StagingResource:$result @@ -2289,7 +2289,7 @@ def Stream_AsyncLoadOp : Stream_PureOp<"async.load", [ Stream_Offset:$source_offset ); let results = (outs - AnyTypeOf<[Stream_PrimitiveType, AnyVector]>:$result + AnyTypeOf<[Stream_PrimitiveType, AnyVectorOfNonZeroRank]>:$result ); let assemblyFormat = [{ @@ -2330,7 +2330,7 @@ def Stream_AsyncStoreOp : Stream_PureOp<"async.store", [ Stream_StagingResource:$target, Stream_Size:$target_size, Stream_Offset:$target_offset, - AnyTypeOf<[Stream_PrimitiveType, AnyVector]>:$value + AnyTypeOf<[Stream_PrimitiveType, AnyVectorOfNonZeroRank]>:$value ); let results = (outs Stream_StagingResource:$result diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputOps.td b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputOps.td index 3fb48c3c24a5..bae30a29b175 100644 --- a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputOps.td +++ b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputOps.td @@ -549,7 +549,7 @@ def IREEInput_TensorLoadOp : IREEInput_PureOp<"tensor.load", [ Variadic:$indices ); let results = (outs - AnyTypeOf<[IREEInput_PrimitiveType, AnyVector]>:$result + AnyTypeOf<[IREEInput_PrimitiveType, AnyVectorOfNonZeroRank]>:$result ); let assemblyFormat = [{ @@ -573,7 +573,7 @@ def IREEInput_TensorStoreOp : IREEInput_PureOp<"tensor.store", [ }]; let arguments = (ins - AnyTypeOf<[IREEInput_PrimitiveType, AnyVector]>:$value, + AnyTypeOf<[IREEInput_PrimitiveType, AnyVectorOfNonZeroRank]>:$value, IREEInput_Tensor:$target, IREEInput_ShapeDynamicDims:$target_dims, Variadic:$indices diff --git a/third_party/llvm-project b/third_party/llvm-project index 04082f21dde6..3833fdcdb01b 160000 --- a/third_party/llvm-project +++ b/third_party/llvm-project @@ -1 +1 @@ -Subproject commit 04082f21dde6f5722520d253d0d99f55b4834b7c +Subproject commit 3833fdcdb01b69c2815db08388e0e092a79cbc58 From 9db347310371da4857ce9d73f3e3cf6cf97cb32d Mon Sep 17 00:00:00 2001 From: Jakub 
Kuderski Date: Wed, 27 Nov 2024 12:00:26 -0500 Subject: [PATCH 17/54] [Codegen] Load transform library only once in MaterializeUserConfigs (#19313) Hoist the library loading logic out of the loop that configures functions. This is in preparation for adding tuning spec loading from a new module attr. Issue: https://github.com/iree-org/iree/issues/19214 --- .../Codegen/Common/MaterializeUserConfigs.cpp | 112 ++++++++++-------- 1 file changed, 61 insertions(+), 51 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp index 3c243d991fed..c4c97925eefe 100644 --- a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp @@ -8,7 +8,11 @@ #include "iree/compiler/Codegen/Common/UserConfig.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/LogicalResult.h" #include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/MLIRContext.h" #define DEBUG_TYPE "iree-codegen-materialize-user-configs" #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") @@ -61,6 +65,51 @@ runTransformConfigurationStrategy(Operation *payloadRoot, return StrategyRunResult::Success; } +struct TransformLibraryWithEntrypoint { + ModuleOp transformLibrary; + std::string entrypointName; +}; + +static FailureOr +getTransformLibraryFromPath(ModuleOp compiledModule, StringRef path) { + SmallVector parts; + llvm::SplitString(path, parts, "@"); + if (parts.empty()) { + return failure(); + } + if (parts.size() > 2) { + return compiledModule.emitError() + << "Invalid transform library path and sequence name " << path; + } + StringRef libraryFileName = parts[0]; + StringRef entrySequenceName = kKernelConfigSpecName; + if (parts.size() == 2) { + entrySequenceName = parts[1]; + } + + // Validate both the file name and the spec name. + if (libraryFileName.empty()) { + return compiledModule.emitError() << "Cannot specify an empty library path"; + } + if (entrySequenceName.empty()) { + return compiledModule.emitError() + << "Cannot specify an empty sequence name"; + } + + MLIRContext *ctx = compiledModule->getContext(); + auto dialect = ctx->getOrLoadDialect(); + auto maybeTransformLibrary = + dialect->getOrLoadTransformLibraryModule(libraryFileName.str()); + if (failed(maybeTransformLibrary)) { + return compiledModule.emitError() + << "Failed to load transform library module: " << libraryFileName; + } + LDBG("--found transform library " << libraryFileName << "@" + << entrySequenceName); + return TransformLibraryWithEntrypoint{*maybeTransformLibrary, + entrySequenceName.str()}; +} + struct MaterializeUserConfigsPass final : impl::MaterializeUserConfigsPassBase { void getDependentDialects(DialectRegistry ®istry) const override { @@ -68,8 +117,12 @@ struct MaterializeUserConfigsPass final } void runOnOperation() override { - auto moduleOp = getOperation(); - MLIRContext *context = &getContext(); + ModuleOp moduleOp = getOperation(); + + FailureOr userTransformLibrary = + getTransformLibraryFromPath(moduleOp, + clCodegenTransformDialectLibraryFileName); + for (auto funcOp : moduleOp.getOps()) { // Parse the file path and kernel config strategy from flags. 
There are @@ -84,54 +137,11 @@ struct MaterializeUserConfigsPass final // "translation_info" = // #iree_codegen.translation_info // ``` - SmallVector parts; - llvm::SplitString( - llvm::StringRef(clCodegenTransformDialectLibraryFileName), parts, - "@"); - if (parts.size() > 2) { - funcOp.emitError() - << "Invalid transform library path and sequence name " - << clCodegenTransformDialectLibraryFileName; - return signalPassFailure(); - } - bool hasTransformLibrary = !parts.empty(); - - std::string libraryFileName; - if (hasTransformLibrary) { - if (parts[0].empty()) { - funcOp.emitError() << "Cannot specify an empty library path"; - return signalPassFailure(); - } - libraryFileName = parts[0]; - } - - StringRef entrySequenceName = kKernelConfigSpecName; - // Check if the user specified a custom entry point name. - if (parts.size() == 2) { - if (parts[1].empty()) { - funcOp.emitError() << "Cannot specify an empty sequence name"; - return signalPassFailure(); - } - entrySequenceName = parts[1]; - } - LDBG("MaterializeUserConfigsPass on function: " << funcOp); - std::optional transformLibrary = std::nullopt; - if (hasTransformLibrary) { - auto dialect = - context->getOrLoadDialect(); - auto maybeTransformLibrary = - dialect->getOrLoadTransformLibraryModule(libraryFileName); - if (failed(maybeTransformLibrary)) { - funcOp.emitError() - << "failed to load transform library module: " << libraryFileName; - return signalPassFailure(); - } - transformLibrary = *maybeTransformLibrary; - LDBG("--found transform library @" << libraryFileName); - + if (succeeded(userTransformLibrary)) { + StringRef entrySequenceName = userTransformLibrary->entrypointName; auto runResult = runTransformConfigurationStrategy( - funcOp, entrySequenceName, *transformLibrary); + funcOp, entrySequenceName, userTransformLibrary->transformLibrary); if (runResult == StrategyRunResult::NotFound) { funcOp.emitError() << "transform kernel config strategy `" << entrySequenceName << " not found"; @@ -186,9 +196,9 @@ struct MaterializeUserConfigsPass final /// If we have a symbol, verify the existence of the symbol within the /// transform library. StringRef entryPoint = strategyName->getLeafReference(); - if (!transformLibrary || !(*transformLibrary) || - !transform::detail::findTransformEntryPoint(funcOp, *transformLibrary, - entryPoint)) { + if (failed(userTransformLibrary) || + !transform::detail::findTransformEntryPoint( + funcOp, userTransformLibrary->transformLibrary, entryPoint)) { funcOp.emitOpError("failed to find transform strategy symbol"); } } From ad4cf1a588dc5e05122e533260072612ef516a77 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Wed, 27 Nov 2024 12:27:26 -0600 Subject: [PATCH 18/54] [Codegen][LLVM] Annotate pointers an noundef + nonnull, consants as noundef (#19309) We know, as a matter of the API, that pointers passed into GPU functions and the HAL VM entry point won't be null and will have a defined value. Annotating these facts may unblock LLVM optimizations. In particuar, when future commits will annotate ranges on the push constants, those annotations will need a `noundef` to be used. The existing `noalias` may also sometimes be turned off by the possibility of undefined arguments. 
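As an illustration (not part of the change itself), the annotation boils down to attaching unit attributes to each pointer argument of the lowered `llvm.func`. The helper below is a hypothetical sketch that only mirrors the attribute names used in this patch; the real passes do this inside their respective conversion patterns, and the GPU path uses an i32 alignment attribute instead of i64:

```c++
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Builders.h"

// Hypothetical helper, for illustration only: mark one pointer-typed
// entry-point argument as noalias + nonnull + noundef and give it a
// 16-byte alignment hint, matching what the conversion patterns emit.
static void annotateEntryPointerArg(mlir::OpBuilder &builder,
                                    mlir::LLVM::LLVMFuncOp funcOp,
                                    unsigned argIndex) {
  mlir::Attribute unit = builder.getUnitAttr();
  funcOp.setArgAttr(argIndex, mlir::LLVM::LLVMDialect::getNoAliasAttrName(), unit);
  funcOp.setArgAttr(argIndex, mlir::LLVM::LLVMDialect::getNonNullAttrName(), unit);
  funcOp.setArgAttr(argIndex, mlir::LLVM::LLVMDialect::getNoUndefAttrName(), unit);
  funcOp.setArgAttr(argIndex, mlir::LLVM::LLVMDialect::getAlignAttrName(),
                    builder.getI64IntegerAttr(16));
}
```

After translation to LLVM IR these surface as `noalias nonnull noundef align 16` parameter attributes, which is what the updated FileCheck expectations below verify.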
--- .../Codegen/LLVMCPU/ConvertToLLVM.cpp | 6 ++-- .../Codegen/LLVMCPU/test/convert_to_llvm.mlir | 6 ++-- .../Codegen/LLVMGPU/ConvertToLLVM.cpp | 18 ++++++++--- .../Codegen/LLVMGPU/test/convert_to_nvvm.mlir | 30 +++++++++---------- .../LLVMGPU/test/convert_to_rocdl.mlir | 6 ++-- 5 files changed, 39 insertions(+), 27 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp index e006e1d5f110..d369a4f5e517 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp @@ -161,8 +161,10 @@ struct ConvertHALEntryPointFuncOp // can use the attributes. // (%arg0: environment, %arg1: dispatch_state, %arg2: workgroup_state) for (unsigned i = 0; i <= 2; ++i) { - llvmFuncOp.setArgAttr(i, LLVM::LLVMDialect::getNoAliasAttrName(), - rewriter.getUnitAttr()); + Attribute unit = rewriter.getUnitAttr(); + llvmFuncOp.setArgAttr(i, LLVM::LLVMDialect::getNoAliasAttrName(), unit); + llvmFuncOp.setArgAttr(i, LLVM::LLVMDialect::getNonNullAttrName(), unit); + llvmFuncOp.setArgAttr(i, LLVM::LLVMDialect::getNoUndefAttrName(), unit); llvmFuncOp.setArgAttr(i, LLVM::LLVMDialect::getAlignAttrName(), rewriter.getI64IntegerAttr(16)); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir index f9189dbb409c..3e524ca70201 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir @@ -8,9 +8,9 @@ builtin.module { } // CHECK: llvm.func @extern_public() // CHECK: llvm.func @entry_point( -// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, -// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, -// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}, +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias, llvm.nonnull, llvm.noundef}) -> i32 // CHECK: llvm.return %{{.+}} : i32 // ----- diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp index e5f1149f43df..c056d44538bb 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp @@ -354,15 +354,18 @@ class ConvertIREEBindingSubspanOp : public ConvertToLLVMPattern { rewriter.getI32IntegerAttr(16)); // It is safe to set the noalias attribute as it is guaranteed that the // ranges within bindings won't alias. + Attribute unit = rewriter.getUnitAttr(); llvmFuncOp.setArgAttr(llvmBufferArg.getArgNumber(), - LLVM::LLVMDialect::getNoAliasAttrName(), - rewriter.getUnitAttr()); + LLVM::LLVMDialect::getNoAliasAttrName(), unit); + llvmFuncOp.setArgAttr(llvmBufferArg.getArgNumber(), + LLVM::LLVMDialect::getNonNullAttrName(), unit); + llvmFuncOp.setArgAttr(llvmBufferArg.getArgNumber(), + LLVM::LLVMDialect::getNoUndefAttrName(), unit); if (checkAllSubspansReadonly(llvmFuncOp, subspanOp.getBinding())) { // Setting the readonly attribute here will generate non-coherent cache // loads. 
llvmFuncOp.setArgAttr(llvmBufferArg.getArgNumber(), - LLVM::LLVMDialect::getReadonlyAttrName(), - rewriter.getUnitAttr()); + LLVM::LLVMDialect::getReadonlyAttrName(), unit); } // Add the byte offset. Value llvmBufferBasePtr = llvmBufferArg; @@ -471,6 +474,13 @@ class ConvertIREEConstantOp : public ConvertToLLVMPattern { mlir::BlockArgument llvmBufferArg = llvmFuncOp.getArgument( argMapping.size() + ireeConstantOp.getOrdinal().getZExtValue()); assert(llvmBufferArg.getType().isInteger(32)); + + // Push constants are never `undef`, annotate that here, just as with + // bindings. + llvmFuncOp.setArgAttr(llvmBufferArg.getArgNumber(), + LLVM::LLVMDialect::getNoUndefAttrName(), + rewriter.getUnitAttr()); + Type dstType = getTypeConverter()->convertType(ireeConstantOp.getType()); // llvm.zext requires that the result type has a larger bitwidth. if (dstType == llvmBufferArg.getType()) { diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_nvvm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_nvvm.mlir index f20c2c3a8d30..bd876a377857 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_nvvm.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_nvvm.mlir @@ -32,9 +32,9 @@ hal.executable @abs_ex_dispatch_0 { } } // CHECK-LABEL: llvm.func @abs_ex_dispatch_0 -// CHECK-SAME: (%[[ARG0:.+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias}, -// CHECK-SAME: %[[ARG1:.+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias, llvm.readonly}, -// CHECK-SAME: %[[ARG2:.+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias}) +// CHECK-SAME: (%[[ARG0:.+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias, llvm.nonnull, llvm.noundef}, +// CHECK-SAME: %[[ARG1:.+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias, llvm.nonnull, llvm.noundef, llvm.readonly}, +// CHECK-SAME: %[[ARG2:.+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias, llvm.nonnull, llvm.noundef}) // CHECK: %[[FADD:.+]] = llvm.fadd %{{.*}}, %{{.*}} : f32 // CHECK: %[[ADDR:.+]] = llvm.getelementptr %[[ARG2]][%{{.*}}] : (!llvm.ptr, i64) -> !llvm.ptr, f32 // CHECK: llvm.store %[[FADD]], %[[ADDR]] : f32, !llvm.ptr @@ -72,13 +72,13 @@ hal.executable @abs_dynamic { } } // CHECK-LABEL: llvm.func @abs_dynamic -// CHECK-SAME: (%[[ARG0:[a-zA-Z0-9]+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias}, -// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias}, -// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias}, -// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]]: i32, -// CHECK-SAME: %[[ARG4:[a-zA-Z0-9]+]]: i32, -// CHECK-SAME: %[[ARG5:[a-zA-Z0-9]+]]: i32, -// CHECK-SAME: %[[ARG6:[a-zA-Z0-9]+]]: i32) +// CHECK-SAME: (%[[ARG0:[a-zA-Z0-9]+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias, llvm.nonnull, llvm.noundef}, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias, llvm.nonnull, llvm.noundef}, +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias, llvm.nonnull, llvm.noundef}, +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]]: i32 {llvm.noundef}, +// CHECK-SAME: %[[ARG4:[a-zA-Z0-9]+]]: i32 {llvm.noundef}, +// CHECK-SAME: %[[ARG5:[a-zA-Z0-9]+]]: i32 {llvm.noundef}, +// CHECK-SAME: %[[ARG6:[a-zA-Z0-9]+]]: i32 {llvm.noundef}) // CHECK-DAG: %[[OFFSET:.+]] = llvm.zext %[[ARG3]] : i32 to i64 // CHECK-DAG: %[[D1:.+]] = llvm.zext %[[ARG5]] : i32 to i64 // CHECK-DAG: %[[D2:.+]] = llvm.zext %[[ARG6]] : i32 to i64 @@ -126,8 +126,8 @@ hal.executable @dead_symbol { } } // CHECK-LABEL: llvm.func @dead_symbol 
-// CHECK-SAME: (%[[ARG0:.+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias}, -// CHECK-SAME: %[[ARG1:.+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias}) +// CHECK-SAME: (%[[ARG0:.+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias, llvm.nonnull, llvm.noundef}, +// CHECK-SAME: %[[ARG1:.+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias, llvm.nonnull, llvm.noundef}) // CHECK: llvm.fadd // ----- @@ -165,8 +165,8 @@ hal.executable @mixed_type { } // CHECK-LABEL: llvm.func @mixed_type -// CHECK-SAME: (%[[ARG0:.+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias}, -// CHECK-SAME: %{{.*}}: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias}) +// CHECK-SAME: (%[[ARG0:.+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias, llvm.nonnull, llvm.noundef}, +// CHECK-SAME: %{{.*}}: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias, llvm.nonnull, llvm.noundef}) // CHECK: nvvm.read.ptx.sreg.tid.x // CHECK: llvm.getelementptr %[[ARG0]][4] : (!llvm.ptr) -> !llvm.ptr, f32 // CHECK: llvm.fadd @@ -304,7 +304,7 @@ hal.executable @check_not_readonly { } } // CHECK-LABEL: llvm.func @check_not_readonly -// CHECK-NOT: (%[[ARG0:.+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias, llvm.readonly}, +// CHECK-NOT: (%[[ARG0:.+]]: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias, llvm.nonnull, llvm.noundef, llvm.readonly}, // ----- diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir index 3e158e13daf3..896b6f2294a4 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir @@ -33,9 +33,9 @@ hal.executable @abs_ex_dispatch_0 { } // CHECK-LABEL: llvm.func @abs_ex_dispatch_0 // INDEX32-LABEL: llvm.func @abs_ex_dispatch_0 -// CHECK-SAME: (%{{[a-zA-Z0-9]*}}: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias, llvm.readonly}, -// CHECK-SAME: %{{[a-zA-Z0-9]*}}: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias}, -// CHECK-SAME: %{{[a-zA-Z0-9]*}}: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias}) +// CHECK-SAME: (%{{[a-zA-Z0-9]*}}: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias, llvm.nonnull, llvm.noundef, llvm.readonly}, +// CHECK-SAME: %{{[a-zA-Z0-9]*}}: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias, llvm.nonnull, llvm.noundef}, +// CHECK-SAME: %{{[a-zA-Z0-9]*}}: !llvm.ptr {llvm.align = 16 : i32, llvm.noalias, llvm.nonnull, llvm.noundef}) // CHECK: rocdl.workgroup.dim.x // CHECK: llvm.getelementptr %{{.*}} : (!llvm.ptr, i64) -> !llvm.ptr, f32 // INDEX32: llvm.getelementptr %{{.*}} : (!llvm.ptr, i32) -> !llvm.ptr, f32 From 677ae420b7f7fda05599b22267395d85d0db0521 Mon Sep 17 00:00:00 2001 From: Ian Wood Date: Wed, 27 Nov 2024 19:24:09 +0000 Subject: [PATCH 19/54] [Global Opt] Fix transpose propagation failure (#19322) When applying the "bubbling" patterns in the transpose propagation pass, the greedy rewriter was failing because it reached 10 iterations before converging. This PR sets the iteration limit to `kNoLimit` which is the same config used for the "sinking" patterns that are applied afterwards. This is needed because these patterns take a bit to converge. 
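For reference, a minimal sketch of the new driver configuration (the wrapper function and its name are illustrative, not the pass's actual code; the real change is in the diff below):

```c++
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

// Apply a pattern set with no cap on the number of fixed-point iterations,
// matching the configuration already used for the sinking patterns.
static mlir::LogicalResult
applyUncappedGreedily(mlir::Operation *root, mlir::RewritePatternSet &&patterns) {
  mlir::GreedyRewriteConfig config;
  config.maxIterations = mlir::GreedyRewriteConfig::kNoLimit;
  return mlir::applyPatternsAndFoldGreedily(root, std::move(patterns), config);
}
```

The only behavioral difference from the default invocation is that the rewriter keeps iterating until the patterns converge instead of giving up after the default limit of 10 iterations.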
Fixes https://github.com/iree-org/iree/issues/19320 Signed-off-by: Ian Wood --- .../GlobalOptimization/PropagateLinalgTranspose.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/compiler/src/iree/compiler/GlobalOptimization/PropagateLinalgTranspose.cpp b/compiler/src/iree/compiler/GlobalOptimization/PropagateLinalgTranspose.cpp index 265ddbbc5890..af54017adb93 100644 --- a/compiler/src/iree/compiler/GlobalOptimization/PropagateLinalgTranspose.cpp +++ b/compiler/src/iree/compiler/GlobalOptimization/PropagateLinalgTranspose.cpp @@ -1104,8 +1104,11 @@ void PropagateLinalgTransposePass::runOnOperation() { context, /*benefit=*/2); bubblingPatterns.insert(context); populateCommonCanonicalizationPatterns(context, bubblingPatterns); - if (failed(applyPatternsAndFoldGreedily(funcOp, - std::move(bubblingPatterns)))) { + + GreedyRewriteConfig config; + config.maxIterations = GreedyRewriteConfig::kNoLimit; + if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(bubblingPatterns), + config))) { funcOp.emitError("Transpose bubbling patterns failed"); return signalPassFailure(); } From 82724905d64eebb2f62bcc0e41626a7b5156fd8f Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 27 Nov 2024 14:36:55 -0500 Subject: [PATCH 20/54] AMDGPU ukernels: Bazel build, separate bitcode files, c-embed archives. (#19274) 1. Implement Bazel, generate CMake from Bazel. 2. Split .bc bitcode files, one .bc file <-> one ukernel function. 3. Generate embedded-data archives. 4. Update the compiler code to use the embedded-data archives. 5. Simplify setAlwaysInline now that we are no longer dealing with HIP symbols. --------- Signed-off-by: Benoit Jacob --- .gitignore | 2 + build_tools/bazel/iree_bitcode_library.bzl | 74 ++++ .../bazel_to_cmake_converter.py | 19 + build_tools/cmake/iree_bitcode_library.cmake | 92 ++++ compiler/plugins/target/ROCM/BUILD.bazel | 4 + compiler/plugins/target/ROCM/CMakeLists.txt | 4 + .../plugins/target/ROCM/ROCMTargetUtils.cpp | 99 ++--- .../plugins/target/ROCM/ROCMTargetUtils.h | 3 - .../target/ROCM/builtins/ukernel/BUILD.bazel | 67 +++ .../ROCM/builtins/ukernel/CMakeLists.txt | 412 +++++++++++------- .../ROCM/builtins/ukernel/compile_flags.txt | 1 - .../Codegen/Dialect/Codegen/Utils/Utils.h | 2 - 12 files changed, 540 insertions(+), 239 deletions(-) create mode 100644 compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel diff --git a/.gitignore b/.gitignore index 700eccf7081e..a7326c77408c 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,8 @@ Testing/ # Bazel artifacts **/bazel-* +MODULE.bazel +MODULE.bazel.lock # Executables *.exe diff --git a/build_tools/bazel/iree_bitcode_library.bzl b/build_tools/bazel/iree_bitcode_library.bzl index 27b1ed64ca06..3382b04ee977 100644 --- a/build_tools/bazel/iree_bitcode_library.bzl +++ b/build_tools/bazel/iree_bitcode_library.bzl @@ -255,6 +255,80 @@ def iree_cuda_bitcode_library( **kwargs ) +def iree_amdgpu_bitcode_library( + name, + gpu_arch, + srcs, + copts = [], + out = None, + **kwargs): + """Builds an AMDGPU LLVM bitcode library from an input file using clang. + + Args: + name: Name of the target. + gpu_arch: Target AMDGPU architecture, e.g. gfx942. + srcs: Source files to pass to clang. Headers (*.h) are for dependency + tracking only. Current limitation: only one non-header source is + supported. + copts: Additional flags to pass to clang. + out: Output file name. Defaults to {source.c}.{gpu_arch}.bc. + **kwargs: any additional attributes to pass to the underlying rules. 
+ """ + + clang_tool = "@llvm-project//clang:clang" + + base_copts = [ + # Language: C23. + "-std=c23", + + # Avoid dependencies. + "-nogpulib", + + # Avoid ABI issues. + "-fno-short-wchar", # Shouldn't matter to us, but doesn't hurt. + + # Target architecture/machine. + "-target", + "amdgcn-amd-amdhsa", + "-march=%s" % gpu_arch, + "-fgpu-rdc", # NOTE: may not be required for all targets. + + # Optimized. + "-O3", + "-fno-ident", + "-fvisibility=hidden", + + # Object file only in bitcode format. + "-c", + "-emit-llvm", + ] + + non_header_srcs = [src for src in srcs if not src.endswith(".h")] + if len(non_header_srcs) != 1: + fail("Expected exactly one non-header file in srcs, got srcs=[" + ", ".join(srcs) + "]") + src = non_header_srcs[0] + + if not out: + out = "%s.%s.bc" % (src, gpu_arch) + + native.genrule( + name = "gen_%s" % (out), + srcs = srcs, + outs = [out], + cmd = " ".join([ + "$(location %s)" % (clang_tool), + "$(location %s)" % (src), + "-o $(location %s)" % (out), + "-I .", + ] + base_copts + copts), + tools = [ + clang_tool, + ], + message = "Compiling %s to %s..." % (src, out), + output_to_bindir = 1, + **kwargs + ) + def iree_link_bitcode( name, bitcode_files, diff --git a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py index 4f8084f14ff8..0fb0fd85492f 100644 --- a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py +++ b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py @@ -610,6 +610,25 @@ def iree_cuda_bitcode_library( f")\n\n" ) + def iree_amdgpu_bitcode_library(self, name, gpu_arch, srcs, copts=None, out=None): + name_block = self._convert_string_arg_block("NAME", name, quote=False) + gpu_arch_block = self._convert_string_arg_block( + "GPU_ARCH", gpu_arch, quote=False + ) + srcs_block = self._convert_srcs_block(srcs) + out_block = self._convert_string_arg_block("OUT", out, quote=False) + copts_block = self._convert_string_list_block("COPTS", copts, sort=False) + + self._converter.body += ( + f"iree_amdgpu_bitcode_library(\n" + f"{name_block}" + f"{gpu_arch_block}" + f"{srcs_block}" + f"{out_block}" + f"{copts_block}" + f")\n\n" + ) + def iree_link_bitcode(self, name, bitcode_files): name_block = self._convert_string_arg_block("NAME", name, quote=False) bitcode_files_block = self._convert_srcs_block( diff --git a/build_tools/cmake/iree_bitcode_library.cmake b/build_tools/cmake/iree_bitcode_library.cmake index 0c685626ac31..4ae800d1fd65 100644 --- a/build_tools/cmake/iree_bitcode_library.cmake +++ b/build_tools/cmake/iree_bitcode_library.cmake @@ -227,6 +227,98 @@ function(iree_cuda_bitcode_library) ) endfunction() +# iree_amdgpu_bitcode_library() +# +# Builds an AMDGPU LLVM bitcode library from an input file via clang. +# +# Parameters: +# NAME: Name of the target. +# GPU_ARCH: Target AMDGPU architecture, e.g. gfx942. +# SRCS: Source files to pass to clang. Headers (*.h) are for dependency +# tracking only. Current limitation: only one non-header source is +# supported. +# COPTS: Additional flags to pass to clang. +# OUT: Output file name. Defaults to {source.c}.{gpu_arch}.bc. 
+# +function(iree_amdgpu_bitcode_library) + cmake_parse_arguments( + _RULE + "" + "NAME;OUT;GPU_ARCH" + "SRCS;COPTS" + ${ARGN} + ) + + set(_SRC "") + foreach(_SRCS_ENTRY IN LISTS _RULE_SRCS) + if(_SRCS_ENTRY MATCHES "\.h$") + continue() + endif() + if (_SRC) + message(SEND_ERROR "Currently limitation: only one non-header file allowed in SRCS.") + endif() + set(_SRC "${_SRCS_ENTRY}") + endforeach() + if(NOT _SRC) + message(SEND_ERROR "Error: no non-header file found in SRCS=${_RULE_SRCS}.") + endif() + + if(DEFINED _RULE_OUT) + set(_OUT "${_RULE_OUT}") + else() + set(_OUT "${_SRC}.${_RULE_GPU_ARCH}.bc") + endif() + + set(_COPTS + # Language: C23 + "-std=c23" + + # Avoid dependencies. + "-nogpulib" + + # Avoid ABI issues. + "-fno-short-wchar" # Shouldn't matter to us, but doesn't hurt. + + # Target architecture/machine. + "-target" + "amdgcn-amd-amdhsa" + "-march=${_RULE_GPU_ARCH}" + "-fgpu-rdc" # NOTE: may not be required for all targets. + + # Optimized. + "-O3" + "-fno-ident" + "-fvisibility=hidden" + + # Object file only in bitcode format. + "-c" + "-emit-llvm" + ) + + add_custom_command( + OUTPUT + "${_OUT}" + COMMAND + "${IREE_CLANG_BINARY}" + ${_COPTS} + "-I" "${IREE_SOURCE_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}/${_SRC}" + "-o" "${_OUT}" + DEPENDS + "${IREE_CLANG_BINARY}" + "${_RULE_SRCS}" + COMMENT + "Compiling ${_SRC} to ${_OUT}" + VERBATIM + ) + + # Only add iree_${NAME} as custom target doesn't support aliasing to + # iree::${NAME}. + iree_package_name(_PACKAGE_NAME) + add_custom_target("${_PACKAGE_NAME}_${_RULE_NAME}" + DEPENDS "${_OUT}" + ) +endfunction() # iree_link_bitcode() # diff --git a/compiler/plugins/target/ROCM/BUILD.bazel b/compiler/plugins/target/ROCM/BUILD.bazel index 9692d1aafd26..6ae9b95c4714 100644 --- a/compiler/plugins/target/ROCM/BUILD.bazel +++ b/compiler/plugins/target/ROCM/BUILD.bazel @@ -27,6 +27,10 @@ iree_compiler_cc_library( "ROCMTargetUtils.h", ], deps = [ + "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx1030", + "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx1100", + "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx90a", + "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx942", "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect", "//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect", "//compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils:KnownTargets", diff --git a/compiler/plugins/target/ROCM/CMakeLists.txt b/compiler/plugins/target/ROCM/CMakeLists.txt index 938261acd14e..0efc3df479e6 100644 --- a/compiler/plugins/target/ROCM/CMakeLists.txt +++ b/compiler/plugins/target/ROCM/CMakeLists.txt @@ -64,6 +64,10 @@ iree_cc_library( iree::compiler::Dialect::HAL::Utils::LLVMLinkerUtils iree::compiler::PluginAPI iree::compiler::Utils + iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx1030 + iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx1100 + iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx90a + iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx942 iree::schemas::amdgpu_executable_def_c_fbs iree::schemas::executable_debug_info_c_fbs iree::schemas::hip_executable_def_c_fbs diff --git a/compiler/plugins/target/ROCM/ROCMTargetUtils.cpp b/compiler/plugins/target/ROCM/ROCMTargetUtils.cpp index a1757afd75f1..792de8e4a4b0 100644 --- a/compiler/plugins/target/ROCM/ROCMTargetUtils.cpp +++ b/compiler/plugins/target/ROCM/ROCMTargetUtils.cpp @@ 
-6,9 +6,14 @@ #include "compiler/plugins/target/ROCM/ROCMTargetUtils.h" +#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx1030.h" +#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx1100.h" +#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx90a.h" +#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx942.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "iree/compiler/Dialect/HAL/Utils/LLVMLinkerUtils.h" #include "iree/compiler/Utils/ToolUtils.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Module.h" #include "llvm/IRReader/IRReader.h" @@ -79,76 +84,28 @@ static LogicalResult linkWithBitcodeFiles(Location loc, llvm::Module *module, } static LogicalResult linkBitcodeFile(Location loc, llvm::Linker &linker, - unsigned linkerFlags, StringRef path, + unsigned linkerFlags, StringRef filename, + StringRef contents, llvm::TargetMachine &targetMachine, llvm::LLVMContext &context) { - auto bitcodeBufferRef = llvm::MemoryBuffer::getFile(path); - if (auto ec = bitcodeBufferRef.getError()) { - return mlir::emitError(loc) << "failed reading user bitcode file `" << path - << "`: " << ec.message(); - } + llvm::MemoryBufferRef bitcodeBufferRef(contents, filename); auto setAlwaysInline = [&](llvm::Module &module) { - if (targetMachine.getTargetCPU().contains("gfx10") || - targetMachine.getTargetCPU().contains("gfx11")) { - // Some ROCM/HIP functions for gfx10 or gfx11 has accuracy issue if - // inlined. - return; - } for (auto &func : module.getFunctionList()) { - // Some ROCM/HIP builtin functions have Optnone and NoInline for default. - if (targetMachine.getTargetTriple().isAMDGCN()) { - if (func.hasFnAttribute(llvm::Attribute::OptimizeNone)) { - func.removeFnAttr(llvm::Attribute::OptimizeNone); - } - if (targetMachine.getTargetTriple().isAMDGCN() && - func.hasFnAttribute(llvm::Attribute::NoInline)) { - func.removeFnAttr(llvm::Attribute::NoInline); - } - } func.addFnAttr(llvm::Attribute::AlwaysInline); } }; - if (failed(linkBitcodeModule( - loc, linker, linkerFlags, targetMachine, path, - llvm::parseBitcodeFile(*bitcodeBufferRef->get(), context), - setAlwaysInline))) { + if (failed( + linkBitcodeModule(loc, linker, linkerFlags, targetMachine, filename, + llvm::parseBitcodeFile(bitcodeBufferRef, context), + setAlwaysInline))) { return mlir::emitError(loc) << "failed linking in user bitcode file `" - << path << "` for target triple '" + << filename << "` for target triple '" << targetMachine.getTargetTriple().str() << "'"; } return success(); } -static std::vector getUkernelPaths(StringRef enabledUkernelsStr, - StringRef targetChip, - StringRef bitcodePath) { - std::vector selectedUkernelNames; - if (enabledUkernelsStr == "all") { - const char *allUkernelNames[] = {"argmax"}; - size_t numUkernels = sizeof(allUkernelNames) / sizeof(allUkernelNames[0]); - for (int i = 0; i < numUkernels; i++) { - selectedUkernelNames.push_back(allUkernelNames[i]); - } - } else { - while (!enabledUkernelsStr.empty()) { - auto split = enabledUkernelsStr.split(','); - selectedUkernelNames.push_back(split.first.str()); - enabledUkernelsStr = split.second; - } - } - - // Construct full path to ROCDL bitcode libraries. 
- std::vector result; - std::string app = "/"; - for (auto &kernelName : selectedUkernelNames) { - std::string filename = - "rocm_" + kernelName + "_ukernel_" + targetChip.str(); - result.push_back(bitcodePath.str() + app + filename + ".bc"); - } - return result; -} - static void overridePlatformGlobal(llvm::Module *module, StringRef globalName, uint32_t newValue, llvm::Type *globalTy) { // NOTE: the global will not be defined if it is not used in the module. @@ -228,6 +185,20 @@ LogicalResult linkHIPBitcodeIfNeeded(Location loc, llvm::Module *module, return linkWithBitcodeFiles(loc, module, bitcodePaths); } +static std::tuple +getUkernelBitcodeTOC(StringRef gpuArch) { + return llvm::StringSwitch>(gpuArch) + .Case("gfx90a", + {iree_uk_amdgpu_gfx90a_create(), iree_uk_amdgpu_gfx90a_size()}) + .Case("gfx942", + {iree_uk_amdgpu_gfx942_create(), iree_uk_amdgpu_gfx942_size()}) + .Case("gfx1030", + {iree_uk_amdgpu_gfx1030_create(), iree_uk_amdgpu_gfx1030_size()}) + .Case("gfx1100", + {iree_uk_amdgpu_gfx1100_create(), iree_uk_amdgpu_gfx1100_size()}) + .Default({nullptr, 0}); +} + // Links optimized Ukernel bitcode into the given module if the module needs it. LogicalResult linkUkernelBitcodeFiles(Location loc, llvm::Module *module, StringRef enabledUkernelsStr, @@ -235,17 +206,15 @@ LogicalResult linkUkernelBitcodeFiles(Location loc, llvm::Module *module, StringRef bitcodePath, unsigned linkerFlags, llvm::TargetMachine &targetMachine) { - // Early exit if Ukernel not supported on target chip. - if (!iree_compiler::hasUkernelSupportedRocmArch(targetChip)) { - return mlir::emitError(loc) - << "ukernel '" << enabledUkernelsStr - << "' not supported on target chip: " << targetChip; + auto [toc, toc_size] = getUkernelBitcodeTOC(targetChip); + if (!toc) { + return failure(); } - std::vector ukernelPaths = - getUkernelPaths(enabledUkernelsStr, targetChip, bitcodePath); + llvm::Linker linker(*module); - for (auto &path : ukernelPaths) { - if (failed(linkBitcodeFile(loc, linker, linkerFlags, StringRef(path), + for (int i = 0; i < toc_size; ++i) { + if (failed(linkBitcodeFile(loc, linker, linkerFlags, toc[i].name, + llvm::StringRef(toc[i].data, toc[i].size), targetMachine, module->getContext()))) return failure(); } diff --git a/compiler/plugins/target/ROCM/ROCMTargetUtils.h b/compiler/plugins/target/ROCM/ROCMTargetUtils.h index ab18e242e920..5002eb08b832 100644 --- a/compiler/plugins/target/ROCM/ROCMTargetUtils.h +++ b/compiler/plugins/target/ROCM/ROCMTargetUtils.h @@ -34,9 +34,6 @@ LogicalResult linkUkernelBitcodeFiles(Location loc, llvm::Module *module, // a blob. std::string createHsaco(Location loc, StringRef isa, StringRef name); -// Returns true if the rocm archtecture target is supported for ukernels. -bool hasUkernelSupportedRocmArch(IREE::HAL::ExecutableTargetAttr targetAttr); - } // namespace mlir::iree_compiler::IREE::HAL #endif // IREE_COMPILER_PLUGINS_TARGET_ROCM_ROCMTARGETUTILS_H_ diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel b/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel new file mode 100644 index 000000000000..93e6c86bd4a3 --- /dev/null +++ b/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel @@ -0,0 +1,67 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content", "iree_runtime_cc_library") +load("//build_tools/bazel:iree_bitcode_library.bzl", "iree_amdgpu_bitcode_library") +load("//build_tools/embed_data:build_defs.bzl", "iree_c_embed_data") + +package( + default_visibility = ["//visibility:public"], + features = ["layering_check"], + licenses = ["notice"], # Apache 2.0 +) + +iree_cmake_extra_content( + content = """ +if(NOT IREE_TARGET_BACKEND_ROCM) + return() +endif() +""", + inline = True, +) + +# Target archs for ukernels. https://llvm.org/docs/AMDGPUUsage.html#processors +# In general, we won't support all ukernels on all of these archs. It's fine to +# support a ukernel on just one of these archs, and that will be the generic +# case with "multi_mma" ukernels which will be entirely specific to the matrix +# intrinsics found on each arch. +gpu_archs = [ + "gfx90a", + "gfx942", + "gfx1030", + "gfx1100", +] + +# Element type combinations for the argmax ukernel. +argmax_types = [ + "f16i32", + "f16i64", + "f32i32", + "f32i64", +] + +[iree_amdgpu_bitcode_library( + name = "iree_uk_amdgpu_argmax_%s_%s" % (type, gpu_arch), + srcs = [ + "iree_uk_amdgpu_argmax_%s.c" % type, + "common.h", + ], + gpu_arch = gpu_arch, +) for type in argmax_types for gpu_arch in gpu_archs] + +argmax_bc_files = {gpu_arch: [ + ":iree_uk_amdgpu_argmax_%s.c.%s.bc" % (type, gpu_arch) + for type in argmax_types +] for gpu_arch in gpu_archs} + +[iree_c_embed_data( + name = "iree_uk_amdgpu_%s" % gpu_arch, + srcs = argmax_bc_files[gpu_arch], + c_file_output = "iree_uk_amdgpu_%s.c" % gpu_arch, + flatten = True, + h_file_output = "iree_uk_amdgpu_%s.h" % gpu_arch, + identifier = "iree_uk_amdgpu_%s" % gpu_arch, +) for gpu_arch in gpu_archs] diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt b/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt index d94ba4b52001..6b3014f3bd53 100644 --- a/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt +++ b/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt @@ -1,173 +1,249 @@ -# Copyright 2023 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +################################################################################ +# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # +# compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel # +# # +# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # +# CMake-only content. # +# # +# To disable autogeneration for this file entirely, delete this header. 
# +################################################################################ + +iree_add_all_subdirs() + if(NOT IREE_TARGET_BACKEND_ROCM) return() endif() -iree_add_all_subdirs() +iree_amdgpu_bitcode_library( + NAME + iree_uk_amdgpu_argmax_f16i32_gfx90a + GPU_ARCH + gfx90a + SRCS + "common.h" + "iree_uk_amdgpu_argmax_f16i32.c" +) + +iree_amdgpu_bitcode_library( + NAME + iree_uk_amdgpu_argmax_f16i32_gfx942 + GPU_ARCH + gfx942 + SRCS + "common.h" + "iree_uk_amdgpu_argmax_f16i32.c" +) + +iree_amdgpu_bitcode_library( + NAME + iree_uk_amdgpu_argmax_f16i32_gfx1030 + GPU_ARCH + gfx1030 + SRCS + "common.h" + "iree_uk_amdgpu_argmax_f16i32.c" +) + +iree_amdgpu_bitcode_library( + NAME + iree_uk_amdgpu_argmax_f16i32_gfx1100 + GPU_ARCH + gfx1100 + SRCS + "common.h" + "iree_uk_amdgpu_argmax_f16i32.c" +) + +iree_amdgpu_bitcode_library( + NAME + iree_uk_amdgpu_argmax_f16i64_gfx90a + GPU_ARCH + gfx90a + SRCS + "common.h" + "iree_uk_amdgpu_argmax_f16i64.c" +) + +iree_amdgpu_bitcode_library( + NAME + iree_uk_amdgpu_argmax_f16i64_gfx942 + GPU_ARCH + gfx942 + SRCS + "common.h" + "iree_uk_amdgpu_argmax_f16i64.c" +) + +iree_amdgpu_bitcode_library( + NAME + iree_uk_amdgpu_argmax_f16i64_gfx1030 + GPU_ARCH + gfx1030 + SRCS + "common.h" + "iree_uk_amdgpu_argmax_f16i64.c" +) + +iree_amdgpu_bitcode_library( + NAME + iree_uk_amdgpu_argmax_f16i64_gfx1100 + GPU_ARCH + gfx1100 + SRCS + "common.h" + "iree_uk_amdgpu_argmax_f16i64.c" +) + +iree_amdgpu_bitcode_library( + NAME + iree_uk_amdgpu_argmax_f32i32_gfx90a + GPU_ARCH + gfx90a + SRCS + "common.h" + "iree_uk_amdgpu_argmax_f32i32.c" +) + +iree_amdgpu_bitcode_library( + NAME + iree_uk_amdgpu_argmax_f32i32_gfx942 + GPU_ARCH + gfx942 + SRCS + "common.h" + "iree_uk_amdgpu_argmax_f32i32.c" +) + +iree_amdgpu_bitcode_library( + NAME + iree_uk_amdgpu_argmax_f32i32_gfx1030 + GPU_ARCH + gfx1030 + SRCS + "common.h" + "iree_uk_amdgpu_argmax_f32i32.c" +) + +iree_amdgpu_bitcode_library( + NAME + iree_uk_amdgpu_argmax_f32i32_gfx1100 + GPU_ARCH + gfx1100 + SRCS + "common.h" + "iree_uk_amdgpu_argmax_f32i32.c" +) + +iree_amdgpu_bitcode_library( + NAME + iree_uk_amdgpu_argmax_f32i64_gfx90a + GPU_ARCH + gfx90a + SRCS + "common.h" + "iree_uk_amdgpu_argmax_f32i64.c" +) + +iree_amdgpu_bitcode_library( + NAME + iree_uk_amdgpu_argmax_f32i64_gfx942 + GPU_ARCH + gfx942 + SRCS + "common.h" + "iree_uk_amdgpu_argmax_f32i64.c" +) + +iree_amdgpu_bitcode_library( + NAME + iree_uk_amdgpu_argmax_f32i64_gfx1030 + GPU_ARCH + gfx1030 + SRCS + "common.h" + "iree_uk_amdgpu_argmax_f32i64.c" +) + +iree_amdgpu_bitcode_library( + NAME + iree_uk_amdgpu_argmax_f32i64_gfx1100 + GPU_ARCH + gfx1100 + SRCS + "common.h" + "iree_uk_amdgpu_argmax_f32i64.c" +) + +iree_c_embed_data( + NAME + iree_uk_amdgpu_gfx90a + SRCS + "iree_uk_amdgpu_argmax_f16i32.c.gfx90a.bc" + "iree_uk_amdgpu_argmax_f16i64.c.gfx90a.bc" + "iree_uk_amdgpu_argmax_f32i32.c.gfx90a.bc" + "iree_uk_amdgpu_argmax_f32i64.c.gfx90a.bc" + C_FILE_OUTPUT + "iree_uk_amdgpu_gfx90a.c" + H_FILE_OUTPUT + "iree_uk_amdgpu_gfx90a.h" + IDENTIFIER + "iree_uk_amdgpu_gfx90a" + FLATTEN + PUBLIC +) + +iree_c_embed_data( + NAME + iree_uk_amdgpu_gfx942 + SRCS + "iree_uk_amdgpu_argmax_f16i32.c.gfx942.bc" + "iree_uk_amdgpu_argmax_f16i64.c.gfx942.bc" + "iree_uk_amdgpu_argmax_f32i32.c.gfx942.bc" + "iree_uk_amdgpu_argmax_f32i64.c.gfx942.bc" + C_FILE_OUTPUT + "iree_uk_amdgpu_gfx942.c" + H_FILE_OUTPUT + "iree_uk_amdgpu_gfx942.h" + IDENTIFIER + "iree_uk_amdgpu_gfx942" + FLATTEN + PUBLIC +) + +iree_c_embed_data( + NAME + iree_uk_amdgpu_gfx1030 + SRCS + 
"iree_uk_amdgpu_argmax_f16i32.c.gfx1030.bc" + "iree_uk_amdgpu_argmax_f16i64.c.gfx1030.bc" + "iree_uk_amdgpu_argmax_f32i32.c.gfx1030.bc" + "iree_uk_amdgpu_argmax_f32i64.c.gfx1030.bc" + C_FILE_OUTPUT + "iree_uk_amdgpu_gfx1030.c" + H_FILE_OUTPUT + "iree_uk_amdgpu_gfx1030.h" + IDENTIFIER + "iree_uk_amdgpu_gfx1030" + FLATTEN + PUBLIC +) + +iree_c_embed_data( + NAME + iree_uk_amdgpu_gfx1100 + SRCS + "iree_uk_amdgpu_argmax_f16i32.c.gfx1100.bc" + "iree_uk_amdgpu_argmax_f16i64.c.gfx1100.bc" + "iree_uk_amdgpu_argmax_f32i32.c.gfx1100.bc" + "iree_uk_amdgpu_argmax_f32i64.c.gfx1100.bc" + C_FILE_OUTPUT + "iree_uk_amdgpu_gfx1100.c" + H_FILE_OUTPUT + "iree_uk_amdgpu_gfx1100.h" + IDENTIFIER + "iree_uk_amdgpu_gfx1100" + FLATTEN + PUBLIC +) -set(_platform_lib_reldir "iree_platform_libs/rocm") -set(_device_bc_path "${IREE_COMPILER_DYLIB_DIR}/iree_platform_libs/rocm") -set(_amd_ukernel_libs) -set(_amd_ukernel_targets) -function(iree_amdgpu_bitcode_library) - cmake_parse_arguments( - _RULE - "" - "NAME;OUT;ROCM_ARCH" - "SRCS;COPTS" - ${ARGN} - ) - - if(DEFINED _RULE_OUT) - set(_OUT "${_RULE_OUT}") - else() - set(_OUT "${_RULE_NAME}_${_RULE_ROCM_ARCH}.bc") - endif() - - set(_ROCM_ARCH "${_RULE_ROCM_ARCH}") - set(_COPTS - # Language: C23 - "-x" "c" - "-std=c23" - - # Local headers. - "-I${IREE_SOURCE_DIR}" - - # Avoid dependencies. - "-nogpulib" - - # Avoid ABI issues. - "-fno-short-wchar" # Shouldn't matter to us, but doesn't hurt. - - # Target architecture/machine. - "-target" "amdgcn-amd-amdhsa" - "-march=${_ROCM_ARCH}" - "-fgpu-rdc" # NOTE: may not be required for all targets. - - # Optimized. - "-O3" - "-fno-ident" - "-fvisibility=hidden" - - # Object file only in bitcode format: - "-c" - "-emit-llvm" - ) - - set(_BITCODE_FILES) - foreach(_SRC ${_RULE_SRCS}) - get_filename_component(_SRC_PATH "${_SRC}" REALPATH) - get_filename_component(_COMMON_H_PATH "common.h" REALPATH) - set(_BITCODE_FILE "${_RULE_NAME}_${_SRC}_${_ROCM_ARCH}.bc") - list(APPEND _BITCODE_FILES ${_BITCODE_FILE}) - add_custom_command( - OUTPUT - "${_BITCODE_FILE}" - COMMAND - "${IREE_CLANG_BINARY}" - ${_COPTS} - "${_SRC_PATH}" - "-o" - "${_BITCODE_FILE}" - DEPENDS - "${IREE_CLANG_BINARY}" - "${_SRC_PATH}" - "${_COMMON_H_PATH}" - COMMENT - "Compiling ${_SRC} to ${_BITCODE_FILE}" - VERBATIM - ) - endforeach() - - add_custom_command( - OUTPUT - "${_OUT}" - COMMAND - ${IREE_LLVM_LINK_BINARY} - ${_BITCODE_FILES} - "-o" - "${_OUT}" - DEPENDS - ${IREE_LLVM_LINK_BINARY} - ${_BITCODE_FILES} - COMMENT - "Linking bitcode to ${_OUT}" - VERBATIM - ) - # Only add iree_${NAME} as custom target doesn't support aliasing to - # iree::${NAME}. - iree_package_name(_PACKAGE_NAME) - add_custom_target("${_PACKAGE_NAME}_${_RULE_NAME}_${_ROCM_ARCH}" - DEPENDS "${_OUT}" - ) - set(_amd_ukernel_libs ${_amd_ukernel_libs} ${_OUT} PARENT_SCOPE) - set(_amd_ukernel_targets ${_amd_ukernel_targets} "${_PACKAGE_NAME}_${_RULE_NAME}_${_ROCM_ARCH}" PARENT_SCOPE) -endfunction() - -# TODO: Decide what to build by default. No real constaints here -# except compile-time cost, so just picked out the popular ones. -set(_ukernel_supported_chips "gfx90a" "gfx942" "gfx1030" "gfx1100") -foreach(_amd_chip ${_ukernel_supported_chips}) - iree_amdgpu_bitcode_library( - NAME - rocm_argmax_ukernel - ROCM_ARCH - ${_amd_chip} - SRCS - "iree_uk_amdgpu_argmax_f16i32.c" - "iree_uk_amdgpu_argmax_f16i64.c" - "iree_uk_amdgpu_argmax_f32i32.c" - "iree_uk_amdgpu_argmax_f32i64.c" - ) -endforeach() - -# Copy UKernel into platform dir. 
-set(_all_ukernel_bc_copy_commands) -set(_all_ukernel_bc_files) -set(_ukernel_lib_srcdir ${CMAKE_CURRENT_BINARY_DIR}) -foreach(_amd_ukernel_name ${_amd_ukernel_libs}) - # Copy to lib/ tree. - set(_ukernel_bc_srcpath "${_ukernel_lib_srcdir}/${_amd_ukernel_name}") - set(_ukernel_bc_relpath "${_platform_lib_reldir}/${_amd_ukernel_name}") - list(APPEND _all_ukernel_bc_files "${IREE_COMPILER_DYLIB_DIR}/${_ukernel_bc_relpath}") - list(APPEND _all_ukernel_bc_deps "${_ukernel_bc_path}") - list(APPEND _all_ukernel_bc_copy_commands - COMMAND ${CMAKE_COMMAND} -E copy - "${_ukernel_bc_srcpath}" - "${IREE_COMPILER_DYLIB_DIR}/${_ukernel_bc_relpath}" - ) - # Note this bc file as being part of the bundle that must be included with - # the compiler dylib. - set_property(GLOBAL APPEND PROPERTY IREE_COMPILER_DYLIB_RELPATHS "${_ukernel_bc_relpath}") -endforeach() - -# Generate a custom target with all file level dependencies and commands to -# copy to our build tree locations. -# Our GenDeviceLibs target depends on all of the defined device lib targets. -message(STATUS "_all_ukernel_bc_files=${_all_ukernel_bc_files}") -message(STATUS "_amd_ukernel_targets=${_amd_ukernel_targets}") -message(STATUS "_all_ukernel_bc_copy_commands=${_all_ukernel_bc_copy_commands}") - -add_custom_command( - OUTPUT ${_all_ukernel_bc_files} - DEPENDS ${_amd_ukernel_targets} - POST_BUILD - ${_all_ukernel_bc_copy_commands} -) - -add_custom_target(iree_builtin_ROCM_UkernelDeviceLibs - DEPENDS - ${_all_ukernel_bc_files} -) - -# Ensure that the device libs are built when the compiler dylib is built. -set_property(GLOBAL APPEND PROPERTY IREE_COMPILER_DEPENDS - iree_builtin_ROCM_UkernelDeviceLibs) - -# Install. -install(FILES ${_all_ukernel_bc_files} - DESTINATION "${IREE_COMPILER_DYLIB_INSTALL_PREFIX}/${_platform_lib_reldir}") +### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/compile_flags.txt b/compiler/plugins/target/ROCM/builtins/ukernel/compile_flags.txt index a02c9f9e49a6..79fcaa723c80 100644 --- a/compiler/plugins/target/ROCM/builtins/ukernel/compile_flags.txt +++ b/compiler/plugins/target/ROCM/builtins/ukernel/compile_flags.txt @@ -2,7 +2,6 @@ # The flags here mirror CMakeLists.txt. Flags that should not matter to clangd # have been omitted. --x c -std=c23 -I ../../../../../.. diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h index 5ef27f7018d8..d19096ec41f7 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h @@ -8,8 +8,6 @@ #define IREE_COMPILER_CODEGEN_DIALECT_CODEGEN_UTILS_H_ #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenTypes.h" -#include "llvm-c/TargetMachine.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/Support/raw_ostream.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/MLIRContext.h" From 516ff10aae2abd37529f673b27ce20648ce59fc9 Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Wed, 27 Nov 2024 13:22:22 -0800 Subject: [PATCH 21/54] [python] Overhaul iree.build console output and error handling. 
(#19314) * Status updates look more like what would be expected from something like ninja or bazel: * Short update one liners wink past * If some actions are taking a long time, a multi-line display is shown with the worst lagging ones * Per action errors and dependence errors are now tracked in the build graph and reported correctly (vs triggering the last chance exception handler). * The last chance exception handler for true program errors now prints its exception verbosely prior to blocking waiting for the executor to shutdown (this was causing exception swallowing). Signed-off-by: Stella Laurenzo --- compiler/bindings/python/CMakeLists.txt | 1 + .../bindings/python/iree/build/console.py | 152 +++++++++++++++++ .../bindings/python/iree/build/executor.py | 140 ++++++++++++---- compiler/bindings/python/iree/build/main.py | 46 +++++- .../python/test/build_api/CMakeLists.txt | 4 +- .../python/test/build_api/basic_test.py | 154 ++++++++++++++++++ .../python/test/build_api/concurrency_test.py | 61 ------- 7 files changed, 464 insertions(+), 94 deletions(-) create mode 100644 compiler/bindings/python/iree/build/console.py create mode 100644 compiler/bindings/python/test/build_api/basic_test.py delete mode 100644 compiler/bindings/python/test/build_api/concurrency_test.py diff --git a/compiler/bindings/python/CMakeLists.txt b/compiler/bindings/python/CMakeLists.txt index 5251319b7132..9164982fa0db 100644 --- a/compiler/bindings/python/CMakeLists.txt +++ b/compiler/bindings/python/CMakeLists.txt @@ -261,6 +261,7 @@ SOURCES __main__.py args.py compile_actions.py + console.py executor.py lang.py main.py diff --git a/compiler/bindings/python/iree/build/console.py b/compiler/bindings/python/iree/build/console.py new file mode 100644 index 000000000000..bfd4b2b859d6 --- /dev/null +++ b/compiler/bindings/python/iree/build/console.py @@ -0,0 +1,152 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from typing import IO + +import shutil +import textwrap +import threading +import traceback +from iree.build.executor import BuildDependency, ProgressReporter + + +class ConsoleProgressReporter(ProgressReporter): + def __init__( + self, + out: IO, + *, + rich_console: bool = True, + long_display_time_threshold: int = 5, + ): + self.out = out + self.rich_console = rich_console + self.long_display_time_threshold = long_display_time_threshold + self.display_lines: list[str] = [] + self.inflight_deps: set[BuildDependency] = set() + self.finished_deps: set[BuildDependency] = set() + self.most_recent_dep: BuildDependency | None = None + self.all_deps: set[BuildDependency] = set() + self.poller_thread: threading.Thread | None = None + self.lock = threading.RLock() + self.exit_poller_event = threading.Event() + + @property + def started_count(self) -> int: + return len(self.finished_deps) + len(self.inflight_deps) + + def reset_display(self): + # Clean all known displayed lines. 
+ if not self.rich_console: + return + for line in reversed(self.display_lines): + print(f"\033[A{' ' * len(line)}", file=self.out, end="\r") + + def draw_display(self): + for line in self.display_lines: + print(line, file=self.out) + + def refresh(self): + current_deps = list(self.inflight_deps) + if not current_deps: + return + new_display_lines = [] + if not self.rich_console: + if not self.most_recent_dep: + return + progress_prefix = f"[{self.started_count + 1}/{len(self.all_deps)}]" + new_display_lines.append(f"{progress_prefix} {self.most_recent_dep}") + else: + current_deps.sort(key=lambda dep: dep.execution_time) + active_deps = [d for d in current_deps if d.invoke_time is not None] + if not active_deps: + active_deps = current_deps + focus_dep = active_deps[0] + longest_time = active_deps[-1].execution_time + + progress_prefix = f"[{self.started_count + 1}/{len(self.all_deps)}]" + if longest_time > self.long_display_time_threshold: + # Do a long display. + long_count = 15 + report_count = min(long_count, len(active_deps)) + report_deps = active_deps[-report_count:] + new_display_lines.append( + f"{progress_prefix} Waiting for long running actions:" + ) + for dep in report_deps: + new_display_lines.append( + f" {dep} ({round(dep.execution_time)}s)" + ) + remaining_count = len(active_deps) - report_count + if remaining_count > 0: + new_display_lines.append(f" ... and {remaining_count} more") + else: + # Summary display + new_display_lines.append(f"{progress_prefix} {focus_dep}") + + # Reduce flicker by only refreshing if changed. + if new_display_lines != self.display_lines: + self.reset_display() + self.display_lines.clear() + self.display_lines.extend(new_display_lines) + self.draw_display() + + def start_graph(self, all_deps: set[BuildDependency]): + with self.lock: + self.all_deps.update(all_deps) + self.inflight_deps.clear() + self.finished_deps.clear() + if self.rich_console: + self.poller_thread = threading.Thread( + target=self._poll, name="ConsolePoller", daemon=True + ) + self.poller_thread.start() + + def start_dep(self, dep: BuildDependency): + with self.lock: + self.inflight_deps.add(dep) + self.most_recent_dep = dep + self.refresh() + + def finish_dep(self, dep: BuildDependency): + with self.lock: + self.finished_deps.add(dep) + if dep in self.inflight_deps: + self.inflight_deps.remove(dep) + self.refresh() + + def report_failure(self, dep: "BuildDependency"): + if dep.is_dependence_failure: + return + with self.lock: + self.reset_display() + self.display_lines.clear() + print(f"ERROR: Building '{dep}' failed:", file=self.out) + if dep.failure: + failure_formatted = "".join(traceback.format_exception(dep.failure)) + print(f"{textwrap.indent(failure_formatted, ' ')}\n", file=self.out) + + def end_graph(self): + if self.rich_console: + self.exit_poller_event.set() + self.poller_thread.join() + with self.lock: + self.reset_display() + self.display_lines.clear() + + success_count = 0 + failed_count = 0 + for dep in self.finished_deps: + if dep.failure: + failed_count += 1 + else: + success_count += 1 + if failed_count == 0: + print(f"Successfully built {success_count} actions", file=self.out) + + def _poll(self): + while not self.exit_poller_event.wait(timeout=1): + with self.lock: + self.refresh() diff --git a/compiler/bindings/python/iree/build/executor.py b/compiler/bindings/python/iree/build/executor.py index 6e5afe8939ec..c0fefe804ced 100644 --- a/compiler/bindings/python/iree/build/executor.py +++ b/compiler/bindings/python/iree/build/executor.py @@ -6,10 +6,11 @@ 
from typing import Callable, Collection, IO, Type, TypeVar -import abc import concurrent.futures import enum +import math import multiprocessing +import os import time import traceback from pathlib import Path @@ -88,16 +89,45 @@ def __call__(self, *args, **kwargs): return files +class ProgressReporter: + def reset_display(self): + ... + + def start_graph(self, all_deps: set["BuildDependency"]): + ... + + def start_dep(self, dep: "BuildDependency"): + ... + + def finish_dep(self, dep: "BuildDependency"): + ... + + def end_graph(self): + ... + + def report_failure(self, dep: "BuildDependency"): + ... + + +class DependenceException(Exception): + """Noted on a BuildDependency.failure when the dep could not be satisfied because + of failed dependencies.""" + + ... + + class Executor: """Executor that all build contexts share.""" - def __init__(self, output_dir: Path, stderr: IO): + def __init__(self, output_dir: Path, stderr: IO, reporter: ProgressReporter): self.output_dir = output_dir self.verbose_level = 0 # Keyed by path self.all: dict[str, "BuildContext" | "BuildFile"] = {} self.entrypoints: list["BuildEntrypoint"] = [] + self.failed_deps: set["BuildDependency"] = set() self.stderr = stderr + self.reporter = reporter BuildContext("", self) def check_path_not_exists(self, path: str, for_entity): @@ -143,19 +173,34 @@ def analyze(self, *entrypoints: Entrypoint): with self.get_context("") as context: entrypoint() - def build(self, *initial_deps: "BuildDependency"): + def build(self, *initial_deps: "BuildDependency") -> bool: """Transitively builds the given deps.""" - scheduler = Scheduler(stderr=self.stderr) + scheduler = Scheduler(reporter=self.reporter) success = False + started_reporter = False try: for d in initial_deps: scheduler.add_initial_dep(d) + self.reporter.start_graph(set(scheduler.producer_graph.keys())) + started_reporter = True scheduler.build() - success = True + except KeyboardInterrupt: + raise + except: + # This catches truly unhandled exceptions (not just build action failures, + # which are noted in the graph). Eagerly print the exception so that it + # doesn't get swallowed waiting for shutdown. + self.reporter.reset_display() + print( + "Unhandled exception during build. Waiting for background tasks to complete...", + file=self.stderr, + ) + traceback.print_exc(file=self.stderr) finally: - if not success: - print("Waiting for background tasks to complete...", file=self.stderr) scheduler.shutdown() + if started_reporter: + self.reporter.end_graph() + self.failed_deps.update(scheduler.failed_deps) BuildMetaType = TypeVar("BuildMetaType", bound="BuildMeta") @@ -206,8 +251,12 @@ def __init__( # Scheduling state. self.future: concurrent.futures.Future | None = None - self.start_time: float | None = None - self.finish_time: float | None = None + self.start_time: float | None = None # Time the action was scheduled. + self.invoke_time: float | None = None # Time that invocation began. + self.finish_time: float | None = None # Time that finished. + + # If the dep ended in failure, there will be an exception here. + self.failure: Exception | None = None # Metadata. self._metadata: dict[str, BuildMeta] = {} @@ -216,13 +265,23 @@ def __init__( def is_scheduled(self) -> bool: return self.future is not None + @property + def is_dependence_failure(self) -> bool: + return isinstance(self.failure, DependenceException) + @property def execution_time(self) -> float: - if self.start_time is None: + """Time from begin of invocation to present or action finish. 
+ + This will be zero if the dependency has no invoke time. This does not + track queued time prior to receiving a thread. + """ + start_time = self.invoke_time + if start_time is None: return 0.0 if self.finish_time is None: - return time.time() - self.start_time - return self.finish_time - self.start_time + return time.time() - start_time + return self.finish_time - start_time def start(self, future: concurrent.futures.Future): assert not self.is_scheduled, f"Cannot start an already scheduled dep: {self}" @@ -312,12 +371,16 @@ def invoke(self, scheduler: "Scheduler"): # - On a worker thread for THREAD or PROCESS # For PROCESS concurrency, we have to create a compatible invocation # thunk, schedule that on the process pool and wait for it. - if self.concurrency == ActionConcurrency.PROCESS: - thunk = self._remotable_thunk() - fut = scheduler.process_pool_executor.submit(thunk) - fut.result() - else: - self._invoke() + self.invoke_time = time.time() + try: + if self.concurrency == ActionConcurrency.PROCESS: + thunk = self._remotable_thunk() + fut = scheduler.process_pool_executor.submit(thunk) + fut.result() + else: + self._invoke() + except Exception as e: + self.failure = e def _invoke(self): self._remotable_thunk()() @@ -431,8 +494,8 @@ def __init__(self, path: str, executor: Executor, entrypoint: Entrypoint): class Scheduler: """Holds resources related to scheduling.""" - def __init__(self, stderr: IO): - self.stderr = stderr + def __init__(self, reporter: ProgressReporter): + self.reporter = reporter # Inverted producer-consumer graph nodes mapping a producer dep to # all deps which directly depend on it and will be unblocked by it @@ -443,11 +506,23 @@ def __init__(self, stderr: IO): # have a future set on them prior to adding to the set. self.in_flight_deps: set[BuildDependency] = set() + # Any deps that have failed are added here. + self.failed_deps: set[BuildDependency] = set() + + # TODO: This needs a flag or additional heuristics. Empirically, at the + # time of writing, it was found best to limit the scheduler concurrency + # to a bit less than half of the hardware concurrency and then letting + # the compiler's thread pool fan out to full hardware concurrency via + # `export IREE_COMPILER_TASK_COUNT=0`. This wasn't tested super + # scientifically but was shown to get the best throughput on a mixed + # torch import -> compile build of 1000 models (about 1m9s for all of it + # on the tested configuration). 
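+        # (For example, os.cpu_count() == 16 yields max(1, ceil(16 * 0.40)) == 7
+        # scheduler workers.)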
+ concurrency = int(max(1, math.ceil((os.cpu_count() or 1) * 0.40))) self.thread_pool_executor = concurrent.futures.ThreadPoolExecutor( - max_workers=10, thread_name_prefix="iree.build" + max_workers=concurrency, thread_name_prefix="iree.build" ) self.process_pool_executor = concurrent.futures.ProcessPoolExecutor( - max_workers=10, mp_context=multiprocessing.get_context("spawn") + max_workers=concurrency, mp_context=multiprocessing.get_context("spawn") ) def shutdown(self): @@ -501,10 +576,6 @@ def build(self): self.in_flight_deps.add(eligible_dep) while self.producer_graph: - print( - f"Servicing {len(self.producer_graph)} outstanding tasks", - file=self.stderr, - ) self._service_graph() def _service_graph(self): @@ -515,8 +586,12 @@ def _service_graph(self): ): completed_dep = completed_fut.result() assert isinstance(completed_dep, BuildDependency) - print(f"Completed {completed_dep}", file=self.stderr) + if completed_dep.failure: + self.failed_deps.add(completed_dep) + self.reporter.report_failure(completed_dep) completed_deps.add(completed_dep) + self.reporter.finish_dep(completed_dep) + except TimeoutError: pass except concurrent.futures.TimeoutError: @@ -548,13 +623,22 @@ def _service_graph(self): def _schedule_action(self, dep: BuildDependency): if dep.is_scheduled: return + + # If any deps depended on failed, then cascade the failure. + for dep_dep in dep.deps: + if dep_dep.failure: + dep.failure = DependenceException() + dep.start(concurrent.futures.Future()) + dep.finish() + return + if isinstance(dep, BuildAction): def invoke(): dep.invoke(self) return dep - print(f"Scheduling action: {dep}", file=self.stderr) + self.reporter.start_dep(dep) if dep.concurrency == ActionConcurrency.NONE: invoke() elif ( diff --git a/compiler/bindings/python/iree/build/main.py b/compiler/bindings/python/iree/build/main.py index bc1e9eef52d5..04e1c27952ec 100644 --- a/compiler/bindings/python/iree/build/main.py +++ b/compiler/bindings/python/iree/build/main.py @@ -17,7 +17,14 @@ configure_arg_parser, run_global_arg_handlers, ) -from iree.build.executor import BuildEntrypoint, BuildFile, Entrypoint, Executor +from iree.build.executor import ( + BuildEntrypoint, + BuildFile, + DependenceException, + Entrypoint, + Executor, +) +from iree.build.console import ConsoleProgressReporter __all__ = [ "iree_build_main", @@ -123,6 +130,17 @@ def __init__( help="Paths of actions to build (default to top-level actions)", ) + test_group = p.add_argument_group(title="Testing flags") + test_group.add_argument( + "--test-long-display-time-threshold", + default=5, + type=int, + help=argparse.SUPPRESS, + ) + test_group.add_argument( + "--test-force-console", action="store_true", help=argparse.SUPPRESS + ) + configure_arg_parser(p) self._define_action_arguments(p) self.args = self.arg_parser.parse_args(args) @@ -176,7 +194,19 @@ def _resolve_module_arguments( return rem_args, top_module def _create_executor(self) -> Executor: - executor = Executor(self.args.output_dir, stderr=self.stderr) + force_console_mode = self.args.test_force_console + is_tty = False + if hasattr(self.stderr, "isatty"): + is_tty = self.stderr.isatty() + executor = Executor( + self.args.output_dir, + stderr=self.stderr, + reporter=ConsoleProgressReporter( + self.stderr, + long_display_time_threshold=self.args.test_long_display_time_threshold, + rich_console=force_console_mode or is_tty, + ), + ) executor.analyze(*self.top_module.entrypoints.values()) return executor @@ -208,7 +238,17 @@ def build_command(self): ) self.abort() 
executor.build(*build_actions) - + if executor.failed_deps: + print( + f"ERROR: Failed to build {len(executor.failed_deps)} actions. Root causes:", + file=self.stderr, + ) + for failed_dep in executor.failed_deps: + if not failed_dep.is_dependence_failure: + print(f" * {failed_dep}", file=self.stderr) + self.abort() + + # Success report. for build_action in build_actions: if isinstance(build_action, BuildEntrypoint): for output in build_action.outputs: diff --git a/compiler/bindings/python/test/build_api/CMakeLists.txt b/compiler/bindings/python/test/build_api/CMakeLists.txt index fceac40c531c..6dfcc38f3f9a 100644 --- a/compiler/bindings/python/test/build_api/CMakeLists.txt +++ b/compiler/bindings/python/test/build_api/CMakeLists.txt @@ -16,7 +16,7 @@ endif() iree_py_test( NAME - concurrency_test + basic_test SRCS - "concurrency_test.py" + "basic_test.py" ) diff --git a/compiler/bindings/python/test/build_api/basic_test.py b/compiler/bindings/python/test/build_api/basic_test.py new file mode 100644 index 000000000000..e70f841c835b --- /dev/null +++ b/compiler/bindings/python/test/build_api/basic_test.py @@ -0,0 +1,154 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import io +import os +from pathlib import Path +import tempfile +import unittest + +from iree.build import * +from iree.build.executor import BuildContext +from iree.build.test_actions import ExecuteOutOfProcessThunkAction + + +@entrypoint +def write_out_of_process_pid(): + context = BuildContext.current() + output_file = context.allocate_file("pid.txt") + action = ExecuteOutOfProcessThunkAction( + _write_pid_file, + args=[output_file.get_fs_path()], + desc="Writing pid file", + executor=context.executor, + ) + output_file.deps.add(action) + return output_file + + +@entrypoint +def exception_in_action(): + context = BuildContext.current() + output_file = context.allocate_file("pid.txt") + action = ExecuteOutOfProcessThunkAction( + _raise_error, + args=[], + desc="Writing pid file", + executor=context.executor, + ) + output_file.deps.add(action) + return output_file + + +def _write_pid_file(output_path: Path): + pid = os.getpid() + print(f"Running action out of process: pid={pid}") + output_path.write_text(str(pid)) + + +def _raise_error(): + raise RuntimeError("Failure in action") + + +class BasicTest(unittest.TestCase): + def setUp(self): + self._temp_dir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) + self._temp_dir.__enter__() + self.output_path = Path(self._temp_dir.name) + + def tearDown(self) -> None: + self._temp_dir.__exit__(None, None, None) + + def testProcessConcurrency(self): + parent_pid = os.getpid() + print(f"Testing out of process concurrency: pid={parent_pid}") + iree_build_main( + args=["write_out_of_process_pid", "--output-dir", str(self.output_path)] + ) + pid_file = ( + self.output_path / "genfiles" / "write_out_of_process_pid" / "pid.txt" + ) + child_pid = int(pid_file.read_text()) + print(f"Got child pid={child_pid}") + self.assertNotEqual(parent_pid, child_pid) + + def test_rich_console(self): + # This just does a sanity check that rich console mode does not crash. Actual + # behavior can really only be completely verified visually. 
+ out_io = io.StringIO() + err_io = io.StringIO() + iree_build_main( + args=[ + "write_out_of_process_pid", + "--output-dir", + str(self.output_path), + "--test-force-console", + ], + stderr=err_io, + stdout=out_io, + ) + err = err_io.getvalue() + print(f"test_rich_console output: {err!r}") + self.assertIn("\x1b[A", err) + + def test_exception_in_action(self): + # Tests that an exception in an action causes an abort and proper error + # reporting. + out_io = io.StringIO() + err_io = io.StringIO() + with self.assertRaises(SystemExit): + iree_build_main( + args=[ + "exception_in_action", + "--output-dir", + str(self.output_path), + "--test-force-console", + ], + stderr=err_io, + stdout=out_io, + ) + + err = err_io.getvalue() + print(f"test_exception_in_action output: {err!r}") + self.assertIn("\x1b[A", err) + self.assertIn("ERROR: Building", err) + self.assertIn("Root causes:\n * Writing pid file\n", err) + + def test_non_tty(self): + # Verifies that the non-tty path reports. + out_io = io.StringIO() + err_io = io.StringIO() + iree_build_main( + args=["write_out_of_process_pid", "--output-dir", str(self.output_path)], + stderr=err_io, + stdout=out_io, + ) + err = err_io.getvalue() + print(f"test_non_tty output: {err!r}") + self.assertNotIn("\x1b[A", err) + + def test_long_summary(self): + # Verifies that the rich console long summary path reports. + out_io = io.StringIO() + err_io = io.StringIO() + iree_build_main( + args=[ + "write_out_of_process_pid", + "--output-dir", + str(self.output_path), + "--test-force-console", + "--test-long-display-time-threshold=-1", + ], + stderr=err_io, + stdout=out_io, + ) + err = err_io.getvalue() + print(f"test_long_summary output: {err!r}") + self.assertIn("Waiting for long running actions:", err) + + +if __name__ == "__main__": + unittest.main() diff --git a/compiler/bindings/python/test/build_api/concurrency_test.py b/compiler/bindings/python/test/build_api/concurrency_test.py deleted file mode 100644 index 498179b73188..000000000000 --- a/compiler/bindings/python/test/build_api/concurrency_test.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2024 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -import os -from pathlib import Path -import tempfile -import unittest - -from iree.build import * -from iree.build.executor import BuildContext -from iree.build.test_actions import ExecuteOutOfProcessThunkAction - - -@entrypoint -def write_out_of_process_pid(): - context = BuildContext.current() - output_file = context.allocate_file("pid.txt") - action = ExecuteOutOfProcessThunkAction( - _write_pid_file, - args=[output_file.get_fs_path()], - desc="Writing pid file", - executor=context.executor, - ) - output_file.deps.add(action) - return output_file - - -def _write_pid_file(output_path: Path): - pid = os.getpid() - print(f"Running action out of process: pid={pid}") - output_path.write_text(str(pid)) - - -class ConcurrencyTest(unittest.TestCase): - def setUp(self): - self._temp_dir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) - self._temp_dir.__enter__() - self.output_path = Path(self._temp_dir.name) - - def tearDown(self) -> None: - self._temp_dir.__exit__(None, None, None) - - def testProcessConcurrency(self): - parent_pid = os.getpid() - print(f"Testing out of process concurrency: pid={parent_pid}") - iree_build_main( - args=["write_out_of_process_pid", "--output-dir", str(self.output_path)] - ) - pid_file = ( - self.output_path / "genfiles" / "write_out_of_process_pid" / "pid.txt" - ) - child_pid = int(pid_file.read_text()) - print(f"Got child pid={child_pid}") - self.assertNotEqual(parent_pid, child_pid) - - -if __name__ == "__main__": - unittest.main() From 991594e7f7995d10d46cd4e62d6fb112fe805fb9 Mon Sep 17 00:00:00 2001 From: Han-Chung Wang Date: Wed, 27 Nov 2024 14:43:54 -0800 Subject: [PATCH 22/54] [DT][NFC] Refactor encoding utilities. (1/n) (#19310) The revision shuffles the utilities to the Encoding dialect and the Codegen dialect: 1. Move TileMxNxK struct and getEncodingInfoForMatmul method to the Codegen dialect (i.e., `Dialect/Codegen/*`) 2. Move isNarrowNResult to the Encoding dialect because it does not depend on any other dialects other than the Encoding dialect. 3. Move lowerContractionOpWithEncoding to Codegen dialect utils for the preparation. All the materialization logic will be moved to Codegen dialect; they share the utilities during the transition period. To accomplish (3), the revision introduces ResolveEncodingInfoFn function type, which decouple the dependency from MaterializeEncodingTypeConvert. It is a requirement because the type converter uses HAL while we don't want the Codegen dialect depending on HAL. We do not need the dependency once we move all the logic to attribute implementation. Minor cleanups: - Remove the `rank` argument from getEncodingInfoForMatmul. It is not used at all. - Add the `static` keyword to the local `getExpandedType` function. Note that the `lowerSetEncodingOpToPackOp` and `lowerUnsetEncodingToUnpackOp` functions are not moved because it requires more changes. They will be moved in a separate patch. 
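
To make that seam concrete, here is a condensed, illustrative sketch of the
hand-off (the authoritative definitions are in IREECodegenTypes.h, Utils.h/.cpp,
and MaterializeEncodingIntoPackUnPack.cpp in this patch; includes and namespaces
are abbreviated):

  // The Codegen dialect utility only sees a dialect-neutral resolver:
  using ResolveEncodingInfoFn =
      std::function<FailureOr<MaterializeEncodingInfo>(RankedTensorType type)>;

  // The conversion pattern, which still owns the HAL-aware type converter,
  // wraps it into that callback before calling the shared lowering:
  auto getEncodingInfoWrapper =
      [&](RankedTensorType type) -> FailureOr<MaterializeEncodingInfo> {
    return converter->getEncodingInfo(type);
  };
  FailureOr<Operation *> convertedOp =
      IREE::Codegen::lowerContractionOpWithEncoding(
          rewriter, op, operands, converter->getTransposeNarrowN(),
          getEncodingInfoWrapper);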
--------- Signed-off-by: hanhanW --- .../Common/CPU/CPUMaterializeEncodings.cpp | 4 +- .../compiler/Codegen/Common/EncodingUtils.cpp | 48 ---- .../compiler/Codegen/Common/EncodingUtils.h | 14 -- .../Common/GPU/GPUMaterializeEncoding.cpp | 5 +- .../MaterializeEncodingIntoPackUnPack.cpp | 178 +-------------- .../Dialect/Codegen/IR/IREECodegenTypes.h | 4 + .../Codegen/Dialect/Codegen/Utils/BUILD.bazel | 4 + .../Dialect/Codegen/Utils/CMakeLists.txt | 4 + .../Codegen/Dialect/Codegen/Utils/Utils.cpp | 213 ++++++++++++++++++ .../Codegen/Dialect/Codegen/Utils/Utils.h | 19 ++ .../Dialect/Encoding/IR/EncodingAttrs.cpp | 8 + .../Dialect/Encoding/IR/EncodingTypes.h | 4 + 12 files changed, 271 insertions(+), 234 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp b/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp index 18fabd49a4a5..c7517d8bca1a 100644 --- a/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp @@ -31,6 +31,7 @@ namespace mlir::iree_compiler { using IREE::Codegen::MaterializeEncodingInfo; +using IREE::Codegen::TileMxNxK; #define GEN_PASS_DEF_CPUMATERIALIZEDEVICEENCODINGPASS #define GEN_PASS_DEF_CPUMATERIALIZEHOSTENCODINGPASS @@ -445,8 +446,7 @@ materializeEncodingForTarget(RankedTensorType tensorType, // Map the matmul TileMxNxK to an actual tile shape for the tensor at hand, // based on its operand index in the matmul. - auto rank = tensorType.getRank(); - return getEncodingInfoForMatmul(encoding, rank, chosenTileMxNxK); + return IREE::Codegen::getEncodingInfoForMatmul(encoding, chosenTileMxNxK); } static FailureOr diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp index 7d041d09a738..fd75e74a987e 100644 --- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp @@ -153,52 +153,4 @@ RankedTensorType dropEncoding(RankedTensorType type) { return RankedTensorType::get(type.getShape(), type.getElementType()); } -MaterializeEncodingInfo getEncodingInfoForMatmul(EncodingAttr encoding, - int64_t rank, - TileMxNxK tileMxNxK) { - MaterializeEncodingInfo encodingInfo; - auto cDims = getEncodingContractionDims(encoding); - // The following expects M, N, K, and Batch sizes of at most 1 for now - assert(cDims->m.size() <= 1 && cDims->n.size() <= 1 && cDims->k.size() == 1 && - cDims->batch.size() <= 1 && - "Expected at most one M, N, K, and Batch dimension"); - std::optional batchDim = - cDims->batch.empty() ? std::nullopt - : encoding.mapDimToOperandIndex(cDims->batch[0]); - std::optional mDim = - cDims->m.empty() ? std::nullopt - : encoding.mapDimToOperandIndex(cDims->m[0]); - std::optional nDim = - cDims->n.empty() ? 
std::nullopt - : encoding.mapDimToOperandIndex(cDims->n[0]); - std::optional kDim = encoding.mapDimToOperandIndex(cDims->k[0]); - if (batchDim.has_value()) { - encodingInfo.outerDimsPerm.push_back(batchDim.value()); - } - if (mDim.has_value()) { - encodingInfo.outerDimsPerm.push_back(mDim.value()); - encodingInfo.innerDimsPos.push_back(mDim.value()); - encodingInfo.innerTileSizes.push_back(tileMxNxK.M); - } - if (nDim.has_value()) { - encodingInfo.outerDimsPerm.push_back(nDim.value()); - encodingInfo.innerDimsPos.push_back(nDim.value()); - encodingInfo.innerTileSizes.push_back(tileMxNxK.N); - } - if (kDim.has_value()) { - encodingInfo.outerDimsPerm.push_back(kDim.value()); - encodingInfo.innerDimsPos.push_back(kDim.value()); - encodingInfo.innerTileSizes.push_back(tileMxNxK.K); - } - return encodingInfo; -} - -bool isNarrowNResult(EncodingAttr encoding) { - if (encoding.getOperandIndex().getValue() != IREE::Encoding::MATMUL_RESULT) { - return false; - } - - return IREE::Encoding::getMatmulNarrowDim(encoding).isN(); -} - } // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h index 1c9d0860c5d8..7077fb6a05f1 100644 --- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h +++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h @@ -85,16 +85,6 @@ class OpMaterializeEncodingPattern : public OpConversionPattern { /// Returns the RankedTensorType without encodings. RankedTensorType dropEncoding(RankedTensorType type); -struct TileMxNxK { - int64_t M = 1; - int64_t N = 1; - int64_t K = 1; -}; - -IREE::Codegen::MaterializeEncodingInfo -getEncodingInfoForMatmul(IREE::Encoding::EncodingAttr encoding, int64_t rank, - TileMxNxK tileMxNxK); - /// Utility method to convert from `set_encoding` op to `pack` operation. /// For now this takes a `paddingValue` as input. The source is also taken /// as input so that these could be used with `OpConversionPatterns`. @@ -126,10 +116,6 @@ void populateShapeIndependentMaterializeEncodingPatterns( MaterializeEncodingTypeConverter &typeConverter, MaterializeEncodingValueFn materializeEncodingValueFn); -// Returns true if `encoding` represents a narrow-N matmul RESULT, e.g. the -// result of a matvec. -bool isNarrowNResult(IREE::Encoding::EncodingAttr encoding); - } // namespace mlir::iree_compiler #endif // IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_ENCODINGUTILS_H_ diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp index e32760b44215..6debc2a8ffbc 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp @@ -43,6 +43,7 @@ namespace mlir::iree_compiler { #include "iree/compiler/Codegen/Common/GPU/Passes.h.inc" using IREE::Codegen::MaterializeEncodingInfo; +using IREE::Codegen::TileMxNxK; using IREE::Codegen::TileSwizzle; static IREE::GPU::MMAAttr chooseIntrinsicMMAAttr(TypeRange eTypes, @@ -245,10 +246,10 @@ materializeEncodingForTarget(RankedTensorType tensorType, // Map the matmul TileMxNxK to an actual tile shape for the tensor at hand, // based on its operand index in the matmul. 
- auto rank = tensorType.getRank(); TileMxNxK innerTile; std::tie(innerTile.M, innerTile.N, innerTile.K) = mma.getMNKShape(); - auto encodingInfo = getEncodingInfoForMatmul(encoding, rank, innerTile); + auto encodingInfo = + IREE::Codegen::getEncodingInfoForMatmul(encoding, innerTile); auto fragment = static_cast(encoding.getOperandIndex().getInt()); encodingInfo.swizzle = getSwizzle(mma, fragment); diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp index 57a990b78bfc..fc3bb45c8be6 100644 --- a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp @@ -61,19 +61,6 @@ getSwizzledShape(ArrayRef packedShape, return newShape; } -static Operation *dropEncodingAndCloneOp(OpBuilder &builder, Operation *op, - ValueRange convertedInputOperands, - ValueRange convertedOutputOperands) { - SmallVector operands; - operands.append(convertedInputOperands.begin(), convertedInputOperands.end()); - operands.append(convertedOutputOperands.begin(), - convertedOutputOperands.end()); - return mlir::clone(builder, op, - {dropEncoding(cast( - convertedOutputOperands[0].getType()))}, - operands); -} - static FailureOr> getInnerTileSizesOfr(OpBuilder &rewriter, Location loc, RankedTensorType tensorType, @@ -111,91 +98,6 @@ getInnerTileSizesOfr(OpBuilder &rewriter, Location loc, return result; } -RankedTensorType getExpandedType(RankedTensorType type, bool isBatched, - bool isTransposed, - SmallVectorImpl &ri) { - if (!isBatched) { - ri.assign({{0, 1}, {2, 3}}); - if (!isTransposed) { - return RankedTensorType::get( - {1, type.getDimSize(0), 1, type.getDimSize(1)}, - type.getElementType()); - } - return RankedTensorType::get({type.getDimSize(0), 1, type.getDimSize(1), 1}, - type.getElementType()); - } - - ri.assign({{0}, {1, 2}, {3, 4}}); - if (!isTransposed) { - return RankedTensorType::get( - {type.getDimSize(0), 1, type.getDimSize(1), 1, type.getDimSize(2)}, - type.getElementType()); - } - return RankedTensorType::get( - {type.getDimSize(0), type.getDimSize(1), 1, type.getDimSize(2), 1}, - type.getElementType()); -} - -/// Given an input Value and a desired output element type, create and return -/// an element-wise linalg::GenericOp that extends the input Value to the -/// output element type. -static Value createElementWiseExtUIOp(RewriterBase &rewriter, Value input, - Location loc, Type outElemType) { - auto inputType = cast(input.getType()); - SmallVector maps( - 2, rewriter.getMultiDimIdentityMap(inputType.getRank())); - SmallVector iteratorTypes(inputType.getRank(), - utils::IteratorType::parallel); - auto castedType = inputType.clone(outElemType); - SmallVector inputMixedSizes = - tensor::getMixedSizes(rewriter, loc, input); - Value init = - rewriter.create(loc, inputMixedSizes, outElemType); - return rewriter - .create( - loc, castedType, input, init, maps, iteratorTypes, - [&](OpBuilder &b, Location nestedLoc, ValueRange args) { - Value castRes = - b.create(nestedLoc, outElemType, args[0]) - ->getResult(0); - b.create(nestedLoc, castRes); - }) - .getResult(0); -} - -/// If needed, expand and the input Value, and return the resulting input with -/// the canonical mmt4d input shape. If the input element type is unsigned, -/// create a producer Linalg::GenericOp on the input that unsigned extends the -/// input to the output element type. 
This extension is required to keep the -/// unsignedness information on the input for ukernels. If `transpose` is true, -/// the `linalgOp`'s indexing maps are transposed. -static Value getMmt4dOperand(Value value, linalg::LinalgOp linalgOp, - bool transpose, RewriterBase &rewriter, - SmallVectorImpl &ri, - ArrayRef elemTypes, int operandIdx) { - assert(linalgOp.getNumDpsInputs() == 2); - assert(linalgOp.getNumDpsInits() == 1); - auto cDims = linalg::inferContractionDims(linalgOp); - Location loc = linalgOp->getLoc(); - Value expandedValue = value; - // If vecmat with non-rhs operandIdx or matvec with non-lhs operandIdx, the - // operand is a vector and must be extended - if ((cDims->m.empty() && operandIdx != 1) || - (cDims->n.empty() && operandIdx != 0)) { - auto type = cast(value.getType()); - RankedTensorType newType = getExpandedType( - type, /*isBatched=*/!cDims->batch.empty(), - /*isTransposed=*/operandIdx == 2 && (transpose ^ cDims->n.empty()), ri); - expandedValue = - rewriter.create(loc, newType, value, ri); - } - if (elemTypes[operandIdx].isUnsignedInteger()) { - return createElementWiseExtUIOp(rewriter, expandedValue, loc, - elemTypes.back()); - } - return expandedValue; -} - static void transposeInPlace(MaterializeEncodingInfo &info) { // Vector cases: nothing to do. if (info.innerTileSizes.size() < 2) { @@ -297,75 +199,6 @@ FailureOr lowerUnsetEncodingToUnpackOp( encodingInfo->outerDimsPerm); } -static FailureOr lowerContractionOpWithEncoding( - RewriterBase &rewriter, linalg::LinalgOp linalgOp, ValueRange operands, - const MaterializeEncodingTypeConverter &typeConverter) { - if (!linalgOp.hasPureTensorSemantics()) - return failure(); - - auto inputs = linalgOp.getDpsInputOperands(); - auto outputs = linalgOp.getDpsInits(); - - auto lhsType = cast(inputs[0]->get().getType()); - auto rhsType = cast(inputs[1]->get().getType()); - auto resultType = cast(outputs[0].getType()); - auto lhsEncoding = IREE::Encoding::getEncodingAttr(lhsType); - auto rhsEncoding = IREE::Encoding::getEncodingAttr(rhsType); - auto resultEncoding = IREE::Encoding::getEncodingAttr(resultType); - if (!lhsEncoding || !rhsEncoding || !resultEncoding) { - return failure(); - } - - if (lhsEncoding.getOperandIndex().getValue() != IREE::Encoding::MATMUL_LHS || - rhsEncoding.getOperandIndex().getValue() != IREE::Encoding::MATMUL_RHS || - resultEncoding.getOperandIndex().getValue() != - IREE::Encoding::MATMUL_RESULT) { - return failure(); - } - - FailureOr encodingInfo = - typeConverter.getEncodingInfo( - cast(linalgOp->getResultTypes()[0])); - - Operation *result; - if (failed(encodingInfo)) { - result = dropEncodingAndCloneOp(rewriter, linalgOp, - operands.take_front(inputs.size()), - operands.drop_front(inputs.size())); - } else { - bool transpose = - typeConverter.getTransposeNarrowN() && isNarrowNResult(resultEncoding); - SmallVector elemTypes = lhsEncoding.getElementTypesArray(); - SmallVector ri; - Value newLhs = getMmt4dOperand(operands[0], linalgOp, transpose, rewriter, - ri, elemTypes, /*operandIdx=*/0); - Value newRhs = getMmt4dOperand(operands[1], linalgOp, transpose, rewriter, - ri, elemTypes, /*operandIdx=*/1); - Value newResult = - getMmt4dOperand(operands[2], linalgOp, transpose, rewriter, ri, - elemTypes, /*operandIdx=*/2); - if (transpose) { - std::swap(newLhs, newRhs); - } - Type newResultType = newResult.getType(); - auto cDims = IREE::Encoding::getEncodingContractionDims(lhsEncoding); - if (cDims->batch.empty()) { - result = rewriter.create( - linalgOp.getLoc(), newResultType, 
ValueRange{newLhs, newRhs}, - ValueRange{newResult}); - } else { - result = rewriter.create( - linalgOp.getLoc(), newResultType, ValueRange{newLhs, newRhs}, - ValueRange{newResult}); - } - if (!ri.empty()) { - result = rewriter.create( - linalgOp->getLoc(), operands[2].getType(), result->getResult(0), ri); - } - } - return result; -} - /// Utility method to convert `tensor.empty` with encoding to a `tensor.empty` /// of the materialized type. static FailureOr @@ -901,8 +734,17 @@ class MaterializeContractionOp auto converter = static_cast( this->getTypeConverter()); + // TODO(hanchung): This is a transition state for moving the implementation + // details to backend attributes. We won't need the function type argument + // after all the backends that support encodings implement the attribute. + auto getEncodingInfoWrapper = + [&](RankedTensorType type) -> FailureOr { + return converter->getEncodingInfo(type); + }; FailureOr convertedOp = - lowerContractionOpWithEncoding(rewriter, op, operands, *converter); + IREE::Codegen::lowerContractionOpWithEncoding( + rewriter, op, operands, converter->getTransposeNarrowN(), + getEncodingInfoWrapper); if (failed(convertedOp)) { return failure(); } diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenTypes.h b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenTypes.h index c41d581bf765..c4ad11e3da73 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenTypes.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenTypes.h @@ -10,6 +10,7 @@ #include #include "llvm/ADT/SmallVector.h" +#include "mlir/IR/BuiltinTypes.h" #include "mlir/Support/LLVM.h" namespace mlir::iree_compiler::IREE::Codegen { @@ -89,5 +90,8 @@ struct MaterializeEncodingInfo { std::optional swizzle; }; +using ResolveEncodingInfoFn = + std::function(RankedTensorType type)>; + } // namespace mlir::iree_compiler::IREE::Codegen #endif // IREE_COMPILER_CODEGEN_DIALECT_CODEGEN_IR_IREECODEGENTYPES_H_ diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/BUILD.bazel index 2cca489b7102..a155423dcf95 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/BUILD.bazel @@ -22,8 +22,12 @@ iree_compiler_cc_library( ], deps = [ "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect", + "//compiler/src/iree/compiler/Dialect/Encoding/IR", "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", "@llvm-project//mlir:DialectUtils", "@llvm-project//mlir:IR", + "@llvm-project//mlir:LinalgDialect", + "@llvm-project//mlir:TensorDialect", ], ) diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/CMakeLists.txt index cbfa9245cc88..bf4a0ed7f073 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/CMakeLists.txt @@ -19,8 +19,12 @@ iree_cc_library( "Utils.cpp" DEPS LLVMSupport + MLIRArithDialect MLIRIR + MLIRLinalgDialect + MLIRTensorDialect iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect + iree::compiler::Dialect::Encoding::IR PUBLIC ) diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.cpp index 7b1e57480a20..4a12f7013417 100644 --- 
a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.cpp @@ -6,6 +6,10 @@ #include "iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h" #include "llvm/ADT/STLExtras.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" @@ -251,4 +255,213 @@ getExpandedTileShape(const TileSwizzle::ExpandShapeType &expandShape) { return result; } +MaterializeEncodingInfo +getEncodingInfoForMatmul(Encoding::EncodingAttr encoding, TileMxNxK tileMxNxK) { + MaterializeEncodingInfo encodingInfo; + auto cDims = getEncodingContractionDims(encoding); + // The following expects M, N, K, and Batch sizes of at most 1 for now + assert(cDims->m.size() <= 1 && cDims->n.size() <= 1 && cDims->k.size() == 1 && + cDims->batch.size() <= 1 && + "Expected at most one M, N, K, and Batch dimension"); + std::optional batchDim = + cDims->batch.empty() ? std::nullopt + : encoding.mapDimToOperandIndex(cDims->batch[0]); + std::optional mDim = + cDims->m.empty() ? std::nullopt + : encoding.mapDimToOperandIndex(cDims->m[0]); + std::optional nDim = + cDims->n.empty() ? std::nullopt + : encoding.mapDimToOperandIndex(cDims->n[0]); + std::optional kDim = encoding.mapDimToOperandIndex(cDims->k[0]); + if (batchDim.has_value()) { + encodingInfo.outerDimsPerm.push_back(batchDim.value()); + } + if (mDim.has_value()) { + encodingInfo.outerDimsPerm.push_back(mDim.value()); + encodingInfo.innerDimsPos.push_back(mDim.value()); + encodingInfo.innerTileSizes.push_back(tileMxNxK.M); + } + if (nDim.has_value()) { + encodingInfo.outerDimsPerm.push_back(nDim.value()); + encodingInfo.innerDimsPos.push_back(nDim.value()); + encodingInfo.innerTileSizes.push_back(tileMxNxK.N); + } + if (kDim.has_value()) { + encodingInfo.outerDimsPerm.push_back(kDim.value()); + encodingInfo.innerDimsPos.push_back(kDim.value()); + encodingInfo.innerTileSizes.push_back(tileMxNxK.K); + } + return encodingInfo; +} + +static RankedTensorType dropEncoding(RankedTensorType type) { + return RankedTensorType::get(type.getShape(), type.getElementType()); +} + +static Operation *dropEncodingAndCloneOp(OpBuilder &builder, Operation *op, + ValueRange convertedInputOperands, + ValueRange convertedOutputOperands) { + SmallVector operands; + operands.append(convertedInputOperands.begin(), convertedInputOperands.end()); + operands.append(convertedOutputOperands.begin(), + convertedOutputOperands.end()); + return mlir::clone(builder, op, + {dropEncoding(cast( + convertedOutputOperands[0].getType()))}, + operands); +} + +static RankedTensorType +getExpandedType(RankedTensorType type, bool isBatched, bool isTransposed, + SmallVectorImpl &ri) { + if (!isBatched) { + ri.assign({{0, 1}, {2, 3}}); + if (!isTransposed) { + return RankedTensorType::get( + {1, type.getDimSize(0), 1, type.getDimSize(1)}, + type.getElementType()); + } + return RankedTensorType::get({type.getDimSize(0), 1, type.getDimSize(1), 1}, + type.getElementType()); + } + + ri.assign({{0}, {1, 2}, {3, 4}}); + if (!isTransposed) { + return RankedTensorType::get( + {type.getDimSize(0), 1, type.getDimSize(1), 1, type.getDimSize(2)}, + type.getElementType()); + } + return RankedTensorType::get( + {type.getDimSize(0), type.getDimSize(1), 1, type.getDimSize(2), 1}, + type.getElementType()); +} + +/// Given an input 
Value and a desired output element type, create and return +/// an element-wise linalg::GenericOp that extends the input Value to the +/// output element type. +static Value createElementWiseExtUIOp(OpBuilder &builder, Value input, + Location loc, Type outElemType) { + auto inputType = cast(input.getType()); + SmallVector maps( + 2, builder.getMultiDimIdentityMap(inputType.getRank())); + SmallVector iteratorTypes(inputType.getRank(), + utils::IteratorType::parallel); + auto castedType = inputType.clone(outElemType); + SmallVector inputMixedSizes = + tensor::getMixedSizes(builder, loc, input); + Value init = + builder.create(loc, inputMixedSizes, outElemType); + return builder + .create( + loc, castedType, input, init, maps, iteratorTypes, + [&](OpBuilder &b, Location nestedLoc, ValueRange args) { + Value castRes = + b.create(nestedLoc, outElemType, args[0]) + ->getResult(0); + b.create(nestedLoc, castRes); + }) + .getResult(0); +} + +/// If needed, expand and the input Value, and return the resulting input with +/// the canonical mmt4d input shape. If the input element type is unsigned, +/// create a producer Linalg::GenericOp on the input that unsigned extends the +/// input to the output element type. This extension is required to keep the +/// unsignedness information on the input for ukernels. If `transpose` is true, +/// the `linalgOp`'s indexing maps are transposed. +static Value getMmt4dOperand(Value value, linalg::LinalgOp linalgOp, + bool transpose, OpBuilder &builder, + SmallVectorImpl &ri, + ArrayRef elemTypes, int operandIdx) { + assert(linalgOp.getNumDpsInputs() == 2); + assert(linalgOp.getNumDpsInits() == 1); + auto cDims = linalg::inferContractionDims(linalgOp); + Location loc = linalgOp->getLoc(); + Value expandedValue = value; + // If vecmat with non-rhs operandIdx or matvec with non-lhs operandIdx, the + // operand is a vector and must be extended + if ((cDims->m.empty() && operandIdx != 1) || + (cDims->n.empty() && operandIdx != 0)) { + auto type = cast(value.getType()); + RankedTensorType newType = getExpandedType( + type, /*isBatched=*/!cDims->batch.empty(), + /*isTransposed=*/operandIdx == 2 && (transpose ^ cDims->n.empty()), ri); + expandedValue = + builder.create(loc, newType, value, ri); + } + if (elemTypes[operandIdx].isUnsignedInteger()) { + return createElementWiseExtUIOp(builder, expandedValue, loc, + elemTypes.back()); + } + return expandedValue; +} + +FailureOr +lowerContractionOpWithEncoding(OpBuilder &builder, linalg::LinalgOp linalgOp, + ValueRange operands, bool transposeNarrowN, + ResolveEncodingInfoFn getEncodingInfo) { + if (!linalgOp.hasPureTensorSemantics()) { + return failure(); + } + + auto inputs = linalgOp.getDpsInputOperands(); + auto outputs = linalgOp.getDpsInits(); + + auto lhsType = cast(inputs[0]->get().getType()); + auto rhsType = cast(inputs[1]->get().getType()); + auto resultType = cast(outputs[0].getType()); + auto lhsEncoding = IREE::Encoding::getEncodingAttr(lhsType); + auto rhsEncoding = IREE::Encoding::getEncodingAttr(rhsType); + auto resultEncoding = IREE::Encoding::getEncodingAttr(resultType); + if (!lhsEncoding || !rhsEncoding || !resultEncoding) { + return failure(); + } + + if (lhsEncoding.getOperandIndex().getValue() != IREE::Encoding::MATMUL_LHS || + rhsEncoding.getOperandIndex().getValue() != IREE::Encoding::MATMUL_RHS || + resultEncoding.getOperandIndex().getValue() != + IREE::Encoding::MATMUL_RESULT) { + return failure(); + } + + FailureOr encodingInfo = + getEncodingInfo(cast(linalgOp->getResultTypes()[0])); + + Operation 
*result; + if (failed(encodingInfo)) { + result = dropEncodingAndCloneOp(builder, linalgOp, + operands.take_front(inputs.size()), + operands.drop_front(inputs.size())); + } else { + bool transpose = transposeNarrowN && isNarrowNResult(resultEncoding); + SmallVector elemTypes = lhsEncoding.getElementTypesArray(); + SmallVector ri; + Value newLhs = getMmt4dOperand(operands[0], linalgOp, transpose, builder, + ri, elemTypes, /*operandIdx=*/0); + Value newRhs = getMmt4dOperand(operands[1], linalgOp, transpose, builder, + ri, elemTypes, /*operandIdx=*/1); + Value newResult = getMmt4dOperand(operands[2], linalgOp, transpose, builder, + ri, elemTypes, /*operandIdx=*/2); + if (transpose) { + std::swap(newLhs, newRhs); + } + Type newResultType = newResult.getType(); + auto cDims = IREE::Encoding::getEncodingContractionDims(lhsEncoding); + if (cDims->batch.empty()) { + result = builder.create(linalgOp.getLoc(), newResultType, + ValueRange{newLhs, newRhs}, + ValueRange{newResult}); + } else { + result = builder.create( + linalgOp.getLoc(), newResultType, ValueRange{newLhs, newRhs}, + ValueRange{newResult}); + } + if (!ri.empty()) { + result = builder.create( + linalgOp->getLoc(), operands[2].getType(), result->getResult(0), ri); + } + } + return result; +} + } // namespace mlir::iree_compiler::IREE::Codegen diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h index d19096ec41f7..b1997c1b91fd 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h @@ -8,6 +8,7 @@ #define IREE_COMPILER_CODEGEN_DIALECT_CODEGEN_UTILS_H_ #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenTypes.h" +#include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h" #include "llvm/Support/raw_ostream.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/MLIRContext.h" @@ -60,6 +61,24 @@ deserializeEncodingInfo(DictionaryAttr attr); SmallVector getExpandedTileShape(const TileSwizzle::ExpandShapeType &expandShape); +struct TileMxNxK { + int64_t M = 1; + int64_t N = 1; + int64_t K = 1; +}; + +MaterializeEncodingInfo +getEncodingInfoForMatmul(Encoding::EncodingAttr encoding, TileMxNxK tileMxNxK); + +//===----------------------------------------------------------------------===// +// Operation Lowering Utilities. 
+//===----------------------------------------------------------------------===// + +FailureOr +lowerContractionOpWithEncoding(OpBuilder &builder, linalg::LinalgOp linalgOp, + ValueRange operands, bool transposeNarrowN, + ResolveEncodingInfoFn getEncodingInfo); + } // namespace mlir::iree_compiler::IREE::Codegen #endif // IREE_COMPILER_CODEGEN_DIALECT_CODEGEN_UTILS_H_ diff --git a/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingAttrs.cpp b/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingAttrs.cpp index 333145d0e8c3..cd023d0ec92b 100644 --- a/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingAttrs.cpp +++ b/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingAttrs.cpp @@ -130,6 +130,14 @@ MatmulNarrowDim getMatmulNarrowDim(EncodingAttr encoding) { return {}; } +bool isNarrowNResult(EncodingAttr encoding) { + if (encoding.getOperandIndex().getValue() != IREE::Encoding::MATMUL_RESULT) { + return false; + } + + return IREE::Encoding::getMatmulNarrowDim(encoding).isN(); +} + EncodingAttr getEncodingAttr(RankedTensorType type) { return dyn_cast_or_null(type.getEncoding()); } diff --git a/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingTypes.h b/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingTypes.h index 62a3efdcb98a..e4354c51a298 100644 --- a/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingTypes.h +++ b/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingTypes.h @@ -86,6 +86,10 @@ MatmulNarrowDim getMatmulNarrowDim(linalg::LinalgOp linalgOp, /// value. MatmulNarrowDim getMatmulNarrowDim(EncodingAttr encoding); +// Returns true if `encoding` represents a narrow-N matmul RESULT, e.g. the +// result of a matvec. +bool isNarrowNResult(EncodingAttr encoding); + } // namespace mlir::iree_compiler::IREE::Encoding #endif // IREE_COMPILER_DIALECT_ENCODING_IR_ENCODINGTYPES_H_ From d182e57d49310a313346ebbdf7abae823caabcad Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Wed, 27 Nov 2024 15:09:36 -0800 Subject: [PATCH 23/54] Set MLIR_DISABLE_CONFIGURE_PYTHON_DEV_PACKAGES to disable MLIR python config. (#19328) This is in preparation for https://github.com/llvm/llvm-project/pull/117934 and a followon patch which changes the default python setup in a way that will conflict with what we do. Landing pre-emptively to avoid disruption. Signed-off-by: Stella Laurenzo --- build_tools/cmake/iree_llvm.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/build_tools/cmake/iree_llvm.cmake b/build_tools/cmake/iree_llvm.cmake index aab44e6ceda8..6fed3183e140 100644 --- a/build_tools/cmake/iree_llvm.cmake +++ b/build_tools/cmake/iree_llvm.cmake @@ -161,6 +161,10 @@ macro(iree_llvm_set_bundled_cmake_options) set(MLIR_ENABLE_BINDINGS_PYTHON OFF CACHE BOOL "") set(MHLO_ENABLE_BINDINGS_PYTHON OFF CACHE BOOL "") + # Disable MLIR attempting to configure Python dev packages. We take care of + # that in IREE as a super-project. + set(MLIR_DISABLE_CONFIGURE_PYTHON_DEV_PACKAGES ON CACHE BOOL "" FORCE) + # If we are building clang/lld/etc, these will be the targets. # Otherwise, empty so scripts can detect unavailability. set(IREE_CLANG_TARGET) From 978943859418d160ac3c7b32f6917e13fc240a97 Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Wed, 27 Nov 2024 16:45:57 -0800 Subject: [PATCH 24/54] [iree.build] Make the fetch_http action more robust. (#19330) * Downloads to a staging file and then atomically renames into place, avoiding potential for partial downloads. * Reports completion percent as part of the console updates. 
* Persists metadata for the source URL and will refetch if changed. * Fixes an error handling test for the onnx mnist_builder that missed the prior update. More sophistication is possible but this brings it up to min-viable from a usability perspective. Signed-off-by: Stella Laurenzo --- .../bindings/python/iree/build/executor.py | 52 +++++++++ .../bindings/python/iree/build/net_actions.py | 42 +++++++- .../python/test/build_api/CMakeLists.txt | 7 ++ .../test/build_api/mnist_builder_test.py | 6 +- .../python/test/build_api/net_test.py | 100 ++++++++++++++++++ 5 files changed, 201 insertions(+), 6 deletions(-) create mode 100644 compiler/bindings/python/test/build_api/net_test.py diff --git a/compiler/bindings/python/iree/build/executor.py b/compiler/bindings/python/iree/build/executor.py index c0fefe804ced..c0463b1dacfe 100644 --- a/compiler/bindings/python/iree/build/executor.py +++ b/compiler/bindings/python/iree/build/executor.py @@ -8,6 +8,7 @@ import concurrent.futures import enum +import json import math import multiprocessing import os @@ -128,6 +129,7 @@ def __init__(self, output_dir: Path, stderr: IO, reporter: ProgressReporter): self.failed_deps: set["BuildDependency"] = set() self.stderr = stderr self.reporter = reporter + self.metadata_lock = threading.RLock() BuildContext("", self) def check_path_not_exists(self, path: str, for_entity): @@ -160,6 +162,7 @@ def get_file(self, path: str) -> "BuildFile": return existing def write_status(self, message: str): + self.reporter.reset_display() print(message, file=self.stderr) def get_root(self, namespace: FileNamespace) -> Path: @@ -294,6 +297,9 @@ def finish(self): self.future.set_result(self) +BuildFileMetadata = dict[str, str | int | bool | float] + + class BuildFile(BuildDependency): """Generated file in the build tree.""" @@ -322,6 +328,35 @@ def get_fs_path(self) -> Path: path.parent.mkdir(parents=True, exist_ok=True) return path + def access_metadata( + self, + mutation_callback: Callable[[BuildFileMetadata], bool] | None = None, + ) -> BuildFileMetadata: + """Accesses persistent metadata about the build file. + + This is intended for the storage of small amounts of metadata relevant to the + build system for performing up-to-date checks and the like. + + If a `mutation_callback=` is provided, then any modifications it makes will be + persisted prior to returning. Using a callback in this fashion holds a lock + and avoids data races. If the callback returns True, it is persisted. + """ + with self.executor.metadata_lock: + metadata = _load_metadata(self.executor) + path_metadata = metadata.get("paths") + if path_metadata is None: + path_metadata = {} + metadata["paths"] = path_metadata + file_key = f"{self.namespace}/{self.path}" + file_metadata = path_metadata.get(file_key) + if file_metadata is None: + file_metadata = {} + path_metadata[file_key] = file_metadata + if mutation_callback: + if mutation_callback(file_metadata): + _save_metadata(self.executor, metadata) + return file_metadata + def __repr__(self): return f"BuildFile[{self.namespace}]({self.path})" @@ -658,3 +693,20 @@ def invoke(): # Type aliases. BuildFileLike = BuildFile | str + +# Private utilities. 
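+# Illustrative .metadata.json shape (keys are created lazily as files record
+# metadata, e.g. fetch_http stores the source URL it last downloaded):
+#   {"paths": {"<namespace>/<path>": {"fetch_http.url": "https://..."}}}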
+_METADATA_FILENAME = ".metadata.json" + + +def _load_metadata(executor: Executor) -> dict: + path = executor.output_dir / _METADATA_FILENAME + if not path.exists(): + return {} + with open(path, "rb") as f: + return json.load(f) + + +def _save_metadata(executor: Executor, metadata: dict): + path = executor.output_dir / _METADATA_FILENAME + with open(path, "wt") as f: + json.dump(metadata, f, sort_keys=True, indent=2) diff --git a/compiler/bindings/python/iree/build/net_actions.py b/compiler/bindings/python/iree/build/net_actions.py index da74d9ac51ff..7a262bffe81a 100644 --- a/compiler/bindings/python/iree/build/net_actions.py +++ b/compiler/bindings/python/iree/build/net_actions.py @@ -7,7 +7,7 @@ import urllib.error import urllib.request -from iree.build.executor import BuildAction, BuildContext, BuildFile +from iree.build.executor import BuildAction, BuildContext, BuildFile, BuildFileMetadata __all__ = [ "fetch_http", @@ -29,11 +29,49 @@ def __init__(self, url: str, output_file: BuildFile, **kwargs): super().__init__(**kwargs) self.url = url self.output_file = output_file + self.original_desc = self.desc def _invoke(self): + # Determine whether metadata indicates that fetch is needed. path = self.output_file.get_fs_path() + needs_fetch = False + existing_metadata = self.output_file.access_metadata() + existing_url = existing_metadata.get("fetch_http.url") + if existing_url != self.url: + needs_fetch = True + + # Always fetch if empty or absent. + if not path.exists() or path.stat().st_size == 0: + needs_fetch = True + + # Bail if already obtained. + if not needs_fetch: + return + + # Download to a staging file. + stage_path = path.with_name(f".{path.name}.download") self.executor.write_status(f"Fetching URL: {self.url} -> {path}") + + def reporthook(received_blocks: int, block_size: int, total_size: int): + received_size = received_blocks * block_size + if total_size == 0: + self.desc = f"{self.original_desc} ({received_size} bytes received)" + else: + complete_percent = round(100 * received_size / total_size) + self.desc = f"{self.original_desc} ({complete_percent}% complete)" + try: - urllib.request.urlretrieve(self.url, str(path)) + urllib.request.urlretrieve(self.url, str(stage_path), reporthook=reporthook) except urllib.error.HTTPError as e: raise IOError(f"Failed to fetch URL '{self.url}': {e}") from None + finally: + self.desc = self.original_desc + + # Commit the download. 
+ def commit(metadata: BuildFileMetadata) -> bool: + metadata["fetch_http.url"] = self.url + path.unlink(missing_ok=True) + stage_path.rename(path) + return True + + self.output_file.access_metadata(commit) diff --git a/compiler/bindings/python/test/build_api/CMakeLists.txt b/compiler/bindings/python/test/build_api/CMakeLists.txt index 6dfcc38f3f9a..6b9916ccee62 100644 --- a/compiler/bindings/python/test/build_api/CMakeLists.txt +++ b/compiler/bindings/python/test/build_api/CMakeLists.txt @@ -20,3 +20,10 @@ iree_py_test( SRCS "basic_test.py" ) + +iree_py_test( + NAME + net_test + SRCS + "net_test.py" +) diff --git a/compiler/bindings/python/test/build_api/mnist_builder_test.py b/compiler/bindings/python/test/build_api/mnist_builder_test.py index 7b1f641110b9..60f750c6bd30 100644 --- a/compiler/bindings/python/test/build_api/mnist_builder_test.py +++ b/compiler/bindings/python/test/build_api/mnist_builder_test.py @@ -90,10 +90,7 @@ def testActionCLArg(self): mod = load_build_module(THIS_DIR / "mnist_builder.py") out_file = io.StringIO() err_file = io.StringIO() - with self.assertRaisesRegex( - IOError, - re.escape("Failed to fetch URL 'https://github.com/iree-org/doesnotexist'"), - ): + with self.assertRaises(SystemExit): iree_build_main( mod, args=[ @@ -104,6 +101,7 @@ def testActionCLArg(self): stdout=out_file, stderr=err_file, ) + self.assertIn("ERROR:", err_file.getvalue()) def testBuildNonDefaultSubTarget(self): mod = load_build_module(THIS_DIR / "mnist_builder.py") diff --git a/compiler/bindings/python/test/build_api/net_test.py b/compiler/bindings/python/test/build_api/net_test.py new file mode 100644 index 000000000000..6e10c7b4c231 --- /dev/null +++ b/compiler/bindings/python/test/build_api/net_test.py @@ -0,0 +1,100 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import io +import os +from pathlib import Path +import tempfile +import unittest + +from iree.build import * +from iree.build.executor import BuildContext +from iree.build.test_actions import ExecuteOutOfProcessThunkAction + + +TEST_URL = None +TEST_URL_1 = "https://huggingface.co/google-bert/bert-base-cased/resolve/cd5ef92a9fb2f889e972770a36d4ed042daf221e/tokenizer.json" +TEST_URL_2 = "https://huggingface.co/google-bert/bert-base-cased/resolve/cd5ef92a9fb2f889e972770a36d4ed042daf221e/tokenizer_config.json" + + +@entrypoint +def tokenizer_via_http(): + return fetch_http( + name="tokenizer.json", + url=TEST_URL, + ) + + +class BasicTest(unittest.TestCase): + def setUp(self): + self._temp_dir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) + self._temp_dir.__enter__() + self.output_path = Path(self._temp_dir.name) + + def tearDown(self) -> None: + self._temp_dir.__exit__(None, None, None) + + def test_fetch_http(self): + # This just does a sanity check that rich console mode does not crash. Actual + # behavior can really only be completely verified visually. 
+ out = None + err = None + global TEST_URL + path = self.output_path / "genfiles" / "tokenizer_via_http" / "tokenizer.json" + + def run(): + nonlocal out + nonlocal err + try: + out_io = io.StringIO() + err_io = io.StringIO() + iree_build_main( + args=[ + "tokenizer_via_http", + "--output-dir", + str(self.output_path), + "--test-force-console", + ], + stderr=err_io, + stdout=out_io, + ) + finally: + out = out_io.getvalue() + err = err_io.getvalue() + print(f"::test_fetch_http err: {err!r}") + print(f"::test_fetch_http out: {out!r}") + + def assertExists(): + self.assertTrue(path.exists(), msg=f"Path {path} exists") + + # First run should fetch. + TEST_URL = TEST_URL_1 + run() + self.assertIn("Fetching URL: https://", err) + assertExists() + + # Second run should not fetch. + TEST_URL = TEST_URL_1 + run() + self.assertNotIn("Fetching URL: https://", err) + assertExists() + + # Fetching a different URL should download again. + TEST_URL = TEST_URL_2 + run() + self.assertIn("Fetching URL: https://", err) + assertExists() + + # Removing the file should fetch again. + TEST_URL = TEST_URL_2 + path.unlink() + run() + self.assertIn("Fetching URL: https://", err) + assertExists() + + +if __name__ == "__main__": + unittest.main() From 7f9958760578bb84ecc61b1443690ce101b4867f Mon Sep 17 00:00:00 2001 From: Rob Suderman Date: Wed, 27 Nov 2024 17:42:32 -0800 Subject: [PATCH 25/54] Add sharktank model test to presubmits (#19329) Signed-off-by: Rob Suderman --- .github/workflows/pkgci.yml | 6 ++ .github/workflows/pkgci_test_sharktank.yml | 73 ++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 .github/workflows/pkgci_test_sharktank.yml diff --git a/.github/workflows/pkgci.yml b/.github/workflows/pkgci.yml index e4017cedeb06..78e03a9644cf 100644 --- a/.github/workflows/pkgci.yml +++ b/.github/workflows/pkgci.yml @@ -104,6 +104,12 @@ jobs: if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'test_onnx') uses: ./.github/workflows/pkgci_test_onnx.yml + test_sharktank: + name: Test Sharktank + needs: [setup, build_packages] + if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'test_sharktank') + uses: ./.github/workflows/pkgci_test_sharktank.yml + test_tensorflow: name: Test TensorFlow needs: [setup, build_packages] diff --git a/.github/workflows/pkgci_test_sharktank.yml b/.github/workflows/pkgci_test_sharktank.yml new file mode 100644 index 000000000000..42140f7c2db8 --- /dev/null +++ b/.github/workflows/pkgci_test_sharktank.yml @@ -0,0 +1,73 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +name: PkgCI Test Sharktank +on: + workflow_call: + inputs: + artifact_run_id: + type: string + default: "" + workflow_dispatch: + inputs: + artifact_run_id: + type: string + default: "" + +jobs: + test_sharktank_models: + name: "test_sharktank_models :: ${{ matrix.name }}" + runs-on: ${{ matrix.runs-on }} + strategy: + fail-fast: false + matrix: + include: + # CPU + - name: cpu_llvm_task + runs-on: ubuntu-20.04 + + env: + VENV_DIR: ${{ github.workspace }}/venv + steps: + - name: Checking out IREE repository + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + submodules: false + - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.1.0 + with: + # Must match the subset of versions built in pkgci_build_packages. 
+ python-version: "3.11" + - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: linux_x86_64_release_packages + path: ${{ env.PACKAGE_DOWNLOAD_DIR }} + - name: Setup venv + run: | + ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \ + --artifact-path=${PACKAGE_DOWNLOAD_DIR} \ + --fetch-gh-workflow=${{ inputs.artifact_run_id }} + + - name: Checkout test suites repository + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + repository: iree-org/iree-test-suites + ref: a0c84d59c4332463dd46a3c4877d8e0ab2e0a80d + path: iree-test-suites + lfs: true + - name: Install Sharktank models test suite requirements + run: | + source ${VENV_DIR}/bin/activate + python -m pip install -r iree-test-suites/sharktank_models/requirements.txt + - name: Run Sharktank models test suite + run: | + source ${VENV_DIR}/bin/activate + pytest iree-test-suites/sharktank_models/ \ + -rA \ + -m "target_cpu" \ + --log-cli-level=info \ + --override-ini=xfail_strict=false \ + --timeout=120 \ + --durations=0 From d3eef09090d8ad5fba335d7921895e1fdf600e45 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Thu, 28 Nov 2024 09:57:07 -0500 Subject: [PATCH 26/54] Integrate llvm-project at 07a8ebed56cfa223d1587903e4de0d5788b5f777 (#19334) Still carrying a revert for 1004865f1ca41a9581da8747f34b29862d3ebc3d and a cherry pick for https://github.com/llvm/llvm-project/pull/116650. Removed `FieldParser`s for optional enums that get autogenerated as of https://github.com/llvm/llvm-project/pull/117719. --- .../iree/compiler/Dialect/HAL/IR/HALTypes.h | 85 ------------------- .../compiler/Dialect/Stream/IR/StreamTypes.h | 31 ------- .../Dialect/Input/InputDialect.h | 68 --------------- third_party/llvm-project | 2 +- 4 files changed, 1 insertion(+), 185 deletions(-) diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALTypes.h b/compiler/src/iree/compiler/Dialect/HAL/IR/HALTypes.h index fefe68fd38f9..d7168069fcd9 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALTypes.h +++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALTypes.h @@ -190,91 +190,6 @@ struct StaticRange { } // namespace mlir::iree_compiler::IREE::HAL -// It's unfortunate this is required. -namespace mlir { - -template <> -struct FieldParser< - std::optional> { - static FailureOr - parse(AsmParser &parser) { - std::string value; - if (parser.parseKeywordOrString(&value)) - return failure(); - auto result = mlir::iree_compiler::IREE::HAL::symbolizeEnum< - mlir::iree_compiler::IREE::HAL::CollectiveReductionOp>(value); - if (!result.has_value()) - return failure(); - return result.value(); - } -}; -static inline AsmPrinter & -operator<<(AsmPrinter &printer, - std::optional - param) { - printer << (param.has_value() - ? mlir::iree_compiler::IREE::HAL::stringifyEnum(param.value()) - : StringRef{""}); - return printer; -} - -template <> -struct FieldParser< - std::optional> { - static FailureOr - parse(AsmParser &parser) { - std::string value; - if (parser.parseKeywordOrString(&value)) - return failure(); - auto result = mlir::iree_compiler::IREE::HAL::symbolizeEnum< - mlir::iree_compiler::IREE::HAL::PipelineLayoutFlags>(value); - if (!result.has_value()) - return failure(); - return result.value(); - } -}; -static inline AsmPrinter &operator<<( - AsmPrinter &printer, - std::optional param) { - printer << (param.has_value() - ? 
mlir::iree_compiler::IREE::HAL::stringifyEnum(param.value()) - : StringRef{""}); - return printer; -} - -template <> -struct FieldParser< - std::optional> { - static FailureOr - parse(AsmParser &parser) { - std::string value; - if (parser.parseKeywordOrString(&value)) - return failure(); - auto result = mlir::iree_compiler::IREE::HAL::symbolizeEnum< - mlir::iree_compiler::IREE::HAL::DescriptorFlags>(value); - if (!result.has_value()) - return failure(); - return result.value(); - } -}; -static inline AsmPrinter &operator<<( - AsmPrinter &printer, - std::optional param) { - printer << (param.has_value() - ? mlir::iree_compiler::IREE::HAL::stringifyEnum(param.value()) - : StringRef{""}); - return printer; -} - -static inline AsmPrinter & -operator<<(AsmPrinter &printer, - mlir::iree_compiler::IREE::HAL::DescriptorType param) { - printer << mlir::iree_compiler::IREE::HAL::stringifyEnum(param); - return printer; -} - -} // namespace mlir - // clang-format off: must be included after all LLVM/MLIR headers. #define GET_ATTRDEF_CLASSES #include "iree/compiler/Dialect/HAL/IR/HALAttrs.h.inc" // IWYU pragma: keep diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamTypes.h b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamTypes.h index d69e226fb868..75cc00ef2b84 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamTypes.h +++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamTypes.h @@ -27,37 +27,6 @@ #include "iree/compiler/Dialect/Stream/IR/StreamEnums.h.inc" // IWYU pragma: export // clang-format on -// It's unfortunate this is required. -namespace mlir { - -template <> -struct FieldParser< - std::optional> { - static FailureOr - parse(AsmParser &parser) { - std::string value; - if (parser.parseKeywordOrString(&value)) - return failure(); - auto result = mlir::iree_compiler::IREE::Stream::symbolizeEnum< - mlir::iree_compiler::IREE::Stream::CollectiveReductionOp>(value); - if (!result.has_value()) - return failure(); - return result.value(); - } -}; -static inline AsmPrinter &operator<<( - AsmPrinter &printer, - std::optional - param) { - printer << (param.has_value() - ? 
mlir::iree_compiler::IREE::Stream::stringifyEnum( - param.value()) - : StringRef{""}); - return printer; -} - -} // namespace mlir - namespace mlir::iree_compiler::IREE::Stream { class AffinityAttr; } // namespace mlir::iree_compiler::IREE::Stream diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputDialect.h b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputDialect.h index 0ae1d30b986b..032dbd000c1f 100644 --- a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputDialect.h +++ b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputDialect.h @@ -40,72 +40,4 @@ std::optional getEncodingTypeValue(Attribute attr); } // namespace mlir::iree_compiler::IREE::Input -//===----------------------------------------------------------------------===// -// Specialize templates in mlir namespace to support enum attributes -//===----------------------------------------------------------------------===// - -namespace mlir { - -template <> -struct FieldParser< - std::optional> { - static FailureOr - parse(AsmParser &parser) { - std::string value; - if (parser.parseKeywordOrString(&value)) - return failure(); - auto result = mlir::iree_compiler::IREE::Input::symbolizeEnum< - mlir::iree_compiler::IREE::Input::PipelineLayoutFlags>(value); - if (!result.has_value()) - return failure(); - return result.value(); - } -}; - -static inline AsmPrinter & -operator<<(AsmPrinter &printer, - std::optional - param) { - printer << (param.has_value() - ? mlir::iree_compiler::IREE::Input::stringifyEnum( - param.value()) - : StringRef{""}); - return printer; -} - -template <> -struct FieldParser< - std::optional> { - static FailureOr - parse(AsmParser &parser) { - std::string value; - if (parser.parseKeywordOrString(&value)) - return failure(); - auto result = mlir::iree_compiler::IREE::Input::symbolizeEnum< - mlir::iree_compiler::IREE::Input::DescriptorFlags>(value); - if (!result.has_value()) - return failure(); - return result.value(); - } -}; - -static inline AsmPrinter &operator<<( - AsmPrinter &printer, - std::optional param) { - printer << (param.has_value() - ? mlir::iree_compiler::IREE::Input::stringifyEnum( - param.value()) - : StringRef{""}); - return printer; -} - -static inline AsmPrinter & -operator<<(AsmPrinter &printer, - mlir::iree_compiler::IREE::Input::DescriptorType param) { - printer << mlir::iree_compiler::IREE::Input::stringifyEnum(param); - return printer; -} - -} // namespace mlir - #endif // IREE_DIALECTS_DIALECT_INPUT_DIALECT_H diff --git a/third_party/llvm-project b/third_party/llvm-project index 3833fdcdb01b..dfbfc0594887 160000 --- a/third_party/llvm-project +++ b/third_party/llvm-project @@ -1 +1 @@ -Subproject commit 3833fdcdb01b69c2815db08388e0e092a79cbc58 +Subproject commit dfbfc059488770ac1b96a8074739c605475166f9 From b124695f8139cd8695780a590487684e40cd1d0a Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Fri, 29 Nov 2024 13:58:14 +0000 Subject: [PATCH 27/54] [VectorDistribution] Remove signatures after distribution (#19319) This patch removes discardable attributes set for vector distribution, after the pass. These attributes otherwise keep propagating and make output IR harder to read. 
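The cleanup itself amounts to a post-distribution walk over the rewritten root op (a simplified sketch of the idea, not the exact pass code; `stripDistributionSignatures` is an illustrative name, and the attribute name passed in practice is the layout-fetcher storage key used by this pass):

```c++
#include "llvm/ADT/StringRef.h"
#include "mlir/IR/Operation.h"

// After distribution, drop the temporary layout-signature attribute from all
// ops under `root` so it does not keep propagating into later passes' IR.
static void stripDistributionSignatures(mlir::Operation *root,
                                        llvm::StringRef attrName) {
  root->walk(
      [&](mlir::Operation *op) { op->removeDiscardableAttr(attrName); });
}
```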
--- .../Codegen/Common/GPU/GPUVectorDistribution.cpp | 5 +++++ .../GPU/test/gpu_nested_layout_contract_amdgpu.mlir | 10 +++++----- .../test/gpu_nested_layout_vector_distribution.mlir | 10 ---------- .../Common/GPU/test/gpu_vector_distribution.mlir | 8 ++++---- .../test/ROCDL/pipeline_vector_distribute_gfx942.mlir | 4 ++-- 5 files changed, 16 insertions(+), 21 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorDistribution.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorDistribution.cpp index a8831809e25b..53b8676e7d8c 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorDistribution.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorDistribution.cpp @@ -314,6 +314,11 @@ LogicalResult distributeVectorOps(Operation *root, return failure(); } + // Remove signature after distribution. + root->walk([](Operation *op) { + op->removeDiscardableAttr(kVectorLayoutFetcherStorageAttrName); + }); + if (options.verifyConversion()) { WalkResult hasConversionOp = root->walk([](Operation *op) { if (isa(op)) { diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir index eecd2f0653ce..7872b517dfa1 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir @@ -83,7 +83,7 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA]] : vector<16xf32> to vector<4x1x4x1xf32> // CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<4x1x4x1xf32> to vector<1x1x4x1x4x1xf32> // CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x4x1x4x1xf32> -> vector<32x32xf32> -// CHECK: return {{.*}} %[[R_SIMD]] +// CHECK: return %[[R_SIMD]] // ----- @@ -161,7 +161,7 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA]] : vector<4xf32> to vector<1x1x4x1xf32> // CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<1x1x4x1xf32> to vector<1x1x1x1x4x1xf32> // CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x1x1x4x1xf32> -> vector<16x16xf32> -// CHECK: return {{.*}} %[[R_SIMD]] +// CHECK: return %[[R_SIMD]] // ----- @@ -250,7 +250,7 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK: %[[R1_CAST:.+]] = vector.shape_cast %[[MFMA1]] : vector<16xf32> to vector<4x1x4x1xf32> // CHECK: %[[C1_INS:.+]] = vector.insert %[[R1_CAST]], %[[C0_INS]] [1, 0] : vector<4x1x4x1xf32> into vector<2x1x4x1x4x1xf32> // CHECK: %[[R:.+]] = iree_vector_ext.to_simd %[[C1_INS]] : vector<2x1x4x1x4x1xf32> -> vector<64x32xf32> -// CHECK: return {{.*}}} %[[R]] +// CHECK: return %[[R]] // ----- @@ -589,7 +589,7 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<8x1x1x1xf32> // CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<8x1x1x1xf32> to vector<1x1x8x1x1x1xf32> // CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x8x1x1x1xf32> -> vector<16x16xf32> -// CHECK: return {{.*}} %[[R_SIMD]] +// CHECK: return %[[R_SIMD]] // ----- @@ -682,4 +682,4 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA_1]] : vector<16xf32> to 
vector<4x1x4x1xf32> // CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<4x1x4x1xf32> to vector<1x1x4x1x4x1xf32> // CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x4x1x4x1xf32> -> vector<32x32xf32> -// CHECK: return {{.*}} %[[R_SIMD]] +// CHECK: return %[[R_SIMD]] diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir index f33c91fe3754..b982f634d690 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir @@ -955,21 +955,11 @@ builtin.module attributes { transform.with_named_sequence } { } } -// CHECK: #[[$LAYOUT:.+]] = #iree_vector_ext.nested_layout -// CHECK-SAME: subgroup_tile = [2, 2], -// CHECK-SAME: batch_tile = [4, 2] -// CHECK-SAME: outer_tile = [1, 2] -// CHECK-SAME: thread_tile = [16, 4] -// CHECK-SAME: element_tile = [2, 2] -// CHECK-SAME: subgroup_strides = [1, 2], -// CHECK-SAME: thread_strides = [1, 16] - // CHECK-LABEL: func @transpose // CHECK: iree_vector_ext.to_simt %{{.*}} : vector<64x256xf16> -> vector<2x4x2x1x2x2xf16> // CHECK: vector.transpose %{{.*}}, [1, 0, 3, 2, 5, 4] : vector<2x4x2x1x2x2xf16> to vector<4x2x1x2x2x2xf16> // CHECK: math.sqrt %{{.*}} : vector<4x2x1x2x2x2xf16> // CHECK: iree_vector_ext.to_simd %{{.*}} : vector<4x2x1x2x2x2xf16> -> vector<256x64xf16> -// CHECK: return {{.*}}#[[$LAYOUT]] // ----- diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir index a503664ecef4..343366e1d7c1 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir @@ -19,10 +19,10 @@ func.func @distribute_elementwise_nested_layout_f16(%a: vector<128x128x128xf16>, %root = arith.constant dense<0.0> : vector<128x128x128xf16> %rootl = iree_vector_ext.to_layout %root to layout(#nested) : vector<128x128x128xf16> // CHECK-DAG: %[[B:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<128x128x128xf16> -> vector<8x2x4x1x4x4x1x8x2xf16> - // CHECK-DAG: %[[C:.*]] = arith.mulf %[[B]], %[[ROOT]] {{.*}} : vector<8x2x4x1x4x4x1x8x2xf16> + // CHECK-DAG: %[[C:.*]] = arith.mulf %[[B]], %[[ROOT]] : vector<8x2x4x1x4x4x1x8x2xf16> %c = arith.mulf %rootl, %b : vector<128x128x128xf16> // CHECK-DAG: %[[A:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<128x128x128xf16> -> vector<8x2x4x1x4x4x1x8x2xf16> - // CHECK-DAG: %[[D:.*]] = arith.addf %[[C]], %[[A]] fastmath {{.*}} : vector<8x2x4x1x4x4x1x8x2xf16> + // CHECK-DAG: %[[D:.*]] = arith.addf %[[C]], %[[A]] fastmath : vector<8x2x4x1x4x4x1x8x2xf16> %d = arith.addf %c, %a fastmath : vector<128x128x128xf16> // CHECK: iree_vector_ext.to_simd %[[D]] : vector<8x2x4x1x4x4x1x8x2xf16> -> vector<128x128x128xf16> return %d : vector<128x128x128xf16> @@ -51,10 +51,10 @@ func.func @distribute_scf_for(%a: vector<16x16xi32>, %b: vector<16x16xi32>) -> v // CHECK: iter_args(%[[ARG0:.*]] = %[[ROOT]]) -> (vector<1x1x1x1x16x16xi32>) %out = scf.for %i = %c0 to %c128 step %c1 iter_args(%arg0 = %rootl) -> (vector<16x16xi32>) { // CHECK-DAG: %[[B:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xi32> -> vector<1x1x1x1x16x16xi32> - // CHECK-DAG: %[[C:.*]] = arith.muli %[[ARG0]], %[[B]] {{.*}} : vector<1x1x1x1x16x16xi32> + 
// CHECK-DAG: %[[C:.*]] = arith.muli %[[ARG0]], %[[B]] : vector<1x1x1x1x16x16xi32> %c = arith.muli %arg0, %b : vector<16x16xi32> // CHECK-DAG: %[[A:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xi32> -> vector<1x1x1x1x16x16xi32> - // CHECK-DAG: %[[D:.*]] = arith.addi %[[C]], %[[A]] {{.*}} : vector<1x1x1x1x16x16xi32> + // CHECK-DAG: %[[D:.*]] = arith.addi %[[C]], %[[A]] : vector<1x1x1x1x16x16xi32> %d = arith.addi %c, %a : vector<16x16xi32> // CHECK: scf.yield %[[D]] : vector<1x1x1x1x16x16xi32> scf.yield %d : vector<16x16xi32> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir index 389339c1d771..184d49799faf 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir @@ -90,7 +90,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // CHECK-LABEL: func.func @matmul_256x256x256_f16_f16() // CHECK: scf.for {{.*}} = %c0 to %c256 step %c128 iter_args(%[[ARG:.+]] = {{.*}}) -> (vector<2x2x1x1x4x1xf16>) -// CHECK: arith.extf %[[ARG]] {{.*}} : vector<2x2x1x1x4x1xf16> to vector<2x2x1x1x4x1xf32> +// CHECK: arith.extf %[[ARG]] : vector<2x2x1x1x4x1xf16> to vector<2x2x1x1x4x1xf32> // CHECK-COUNT-32: amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> // CHECK: %[[TRUNC:.+]] = arith.truncf %{{.*}} : vector<2x2x1x1x4x1xf32> to vector<2x2x1x1x4x1xf16> // CHECK: scf.yield %[[TRUNC]] : vector<2x2x1x1x4x1xf16> @@ -157,7 +157,7 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // This has more than 2 iteartions. So we have prefetching enabled for this case. Due to // prefetching, we have one iteration peeled of so upper bound is 2048 - 128 = 1920. 
// CHECK: scf.for {{.*}} = %c0 to %c1920 step %c128 iter_args(%[[ARG:.+]] = {{.*}}) -> (vector<4x1x1x1x4x1xf16>) -// CHECK: arith.extf %[[ARG]] {{.*}} : vector<4x1x1x1x4x1xf16> to vector<4x1x1x1x4x1xf32> +// CHECK: arith.extf %[[ARG]] : vector<4x1x1x1x4x1xf16> to vector<4x1x1x1x4x1xf32> // CHECK-COUNT-32: amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> // CHECK: %[[TRUNC:.+]] = arith.truncf %{{.*}} : vector<4x1x1x1x4x1xf32> to vector<4x1x1x1x4x1xf16> // CHECK: scf.yield %[[TRUNC]] : vector<4x1x1x1x4x1xf16> From fc1d40298fe46948b6f413761f3ff2cbe481ec95 Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Fri, 29 Nov 2024 14:08:26 +0000 Subject: [PATCH 28/54] [VectorDistribution] Allow 0-d vectors in scf.for distribution (#19317) --- .../Common/GPU/GPUDistributionPatterns.cpp | 22 ++++-------- .../GPU/test/gpu_vector_distribution.mlir | 34 +++++++++++++++++++ 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp index d72ac17b0e9e..276b7fe11d4b 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp @@ -145,10 +145,8 @@ struct DistributeScfFor final : OpDistributionPattern { SmallVector newInitArgs; for (Value initArg : forOp.getInitArgs()) { if (auto vectorInitArg = dyn_cast(initArg)) { - if (isNonZeroRank(vectorInitArg)) { - initArg = - getDistributed(rewriter, vectorInitArg, signature[vectorInitArg]); - } + initArg = + getDistributed(rewriter, vectorInitArg, signature[vectorInitArg]); } newInitArgs.push_back(initArg); } @@ -193,14 +191,8 @@ struct DistributeScfFor final : OpDistributionPattern { SmallVector operands; for (Value operand : yieldOp->getOperands()) { if (auto vectorOperand = dyn_cast(operand)) { - // Distributing the operand requires it to have a non-zero rank, meaning - // it must have at least one dimension. If the vector has a non-zero - // rank, the operand is distributed according to the provided layout - // signature. 
- if (isNonZeroRank(vectorOperand)) { - operand = DistributionPattern::getDistributed( - rewriter, vectorOperand, signature[vectorOperand]); - } + operand = DistributionPattern::getDistributed(rewriter, vectorOperand, + signature[vectorOperand]); } operands.push_back(operand); } @@ -223,10 +215,8 @@ struct DistributeScfFor final : OpDistributionPattern { for (auto [bbArg, oldInit] : llvm::zip_equal(bbArgs, oldInits)) { Value val = bbArg; if (auto oldVectorInit = dyn_cast(oldInit)) { - if (isNonZeroRank(oldVectorInit)) { - val = rewriter.create( - oldVectorInit.getLoc(), oldVectorInit.getType(), val); - } + val = rewriter.create( + oldVectorInit.getLoc(), oldVectorInit.getType(), val); } replacements.push_back(val); } diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir index 343366e1d7c1..972c8c3b860f 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir @@ -62,6 +62,40 @@ func.func @distribute_scf_for(%a: vector<16x16xi32>, %b: vector<16x16xi32>) -> v return %out : vector<16x16xi32> } +#layout_0d = #iree_vector_ext.nested_layout< + subgroup_tile = [], + batch_tile = [], + outer_tile = [], + thread_tile = [], + element_tile = [], + + subgroup_strides = [], + thread_strides = [] +> + +// CHECK-LABEL: @distribute_scf_for_0d +func.func @distribute_scf_for_0d(%a: vector, %b: vector) -> vector { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %cst_0 = arith.constant 0 : i32 + // CHECK: %[[ROOT:.*]] = arith.constant dense<0> : vector + %root = arith.constant dense<0> : vector + %rootl = iree_vector_ext.to_layout %root to layout(#layout_0d) : vector + // CHECK: iter_args(%[[ARG0:.*]] = %[[ROOT]]) -> (vector) + %out = scf.for %i = %c0 to %c128 step %c1 iter_args(%arg0 = %rootl) -> (vector) { + // CHECK-DAG: %[[B:.*]] = iree_vector_ext.to_simt %{{.*}} : vector -> vector + // CHECK-DAG: %[[C:.*]] = arith.muli %[[ARG0]], %[[B]] {{.*}} : vector + %c = arith.muli %arg0, %b : vector + // CHECK-DAG: %[[A:.*]] = iree_vector_ext.to_simt %{{.*}} : vector -> vector + // CHECK-DAG: %[[D:.*]] = arith.addi %[[C]], %[[A]] {{.*}} : vector + %d = arith.addi %c, %a : vector + // CHECK: scf.yield %[[D]] : vector + scf.yield %d : vector + } + return %out : vector +} + builtin.module attributes { transform.with_named_sequence } { transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op From 1684c5647dca23944989755bc5f6c6fb294a400c Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Fri, 29 Nov 2024 14:02:00 -0500 Subject: [PATCH 29/54] Integrate llvm-project at be81df25178f4e301df9e4de75c5cbbd6f773891 (#19338) Still carrying a revert for 1004865f1ca41a9581da8747f34b29862d3ebc3d and a cherry pick for https://github.com/llvm/llvm-project/pull/116650. 
Signed-off-by: Jakub Kuderski --- .../Common/GPU/test/gpu_vector_distribution.mlir | 4 ++-- .../Conversion/MeshToFlow/test/channel_creation.mlir | 12 +++--------- third_party/llvm-project | 2 +- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir index 972c8c3b860f..1f0833fa8768 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir @@ -85,10 +85,10 @@ func.func @distribute_scf_for_0d(%a: vector, %b: vector) -> vector (vector) %out = scf.for %i = %c0 to %c128 step %c1 iter_args(%arg0 = %rootl) -> (vector) { // CHECK-DAG: %[[B:.*]] = iree_vector_ext.to_simt %{{.*}} : vector -> vector - // CHECK-DAG: %[[C:.*]] = arith.muli %[[ARG0]], %[[B]] {{.*}} : vector + // CHECK-DAG: %[[C:.*]] = arith.muli %[[ARG0]], %[[B]] : vector %c = arith.muli %arg0, %b : vector // CHECK-DAG: %[[A:.*]] = iree_vector_ext.to_simt %{{.*}} : vector -> vector - // CHECK-DAG: %[[D:.*]] = arith.addi %[[C]], %[[A]] {{.*}} : vector + // CHECK-DAG: %[[D:.*]] = arith.addi %[[C]], %[[A]] : vector %d = arith.addi %c, %a : vector // CHECK: scf.yield %[[D]] : vector scf.yield %d : vector diff --git a/compiler/src/iree/compiler/Dialect/Flow/Conversion/MeshToFlow/test/channel_creation.mlir b/compiler/src/iree/compiler/Dialect/Flow/Conversion/MeshToFlow/test/channel_creation.mlir index a0f52b299724..1680f3e4a071 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Conversion/MeshToFlow/test/channel_creation.mlir +++ b/compiler/src/iree/compiler/Dialect/Flow/Conversion/MeshToFlow/test/channel_creation.mlir @@ -17,12 +17,9 @@ module @static_1d_mesh_grouping_along_axis_0 { module @static_2d_mesh_grouping_along_axis_1 { // CHECK: util.global private @_mesh_mesh_2d_axes_1 {inlining_policy = #util.inline.never} : !flow.channel // CHECK: util.initializer { - // CHECK-DAG: %[[AXIS_1_SIZE:.+]] = arith.constant 4 : index - // CHECK-DAG: %[[AXIS_0_SIZE:.+]] = arith.constant 3 : index // CHECK-DAG: %[[DEFAULT_CHANNEL:.+]] = flow.channel.default : !flow.channel // CHECK: %[[CHANNEL_RANK:.+]] = flow.channel.rank %[[DEFAULT_CHANNEL]] : index - // CHECK: %[[COLOR_AND_KEY:.+]]:2 = affine.delinearize_index %[[CHANNEL_RANK]] into - // CHECK-SAME: (%[[AXIS_0_SIZE]], %[[AXIS_1_SIZE]]) : index, index + // CHECK: %[[COLOR_AND_KEY:.+]]:2 = affine.delinearize_index %[[CHANNEL_RANK]] into (3, 4) : index, index // CHECK: %[[CHANNEL:.+]] = flow.channel.split // CHECK-SAME: %[[DEFAULT_CHANNEL]], %[[COLOR_AND_KEY]]#0, %[[COLOR_AND_KEY]]#1 : !flow.channel -> !flow.channel // CHECK: util.global.store %[[CHANNEL]], @_mesh_mesh_2d_axes_1 : !flow.channel @@ -42,13 +39,10 @@ module @static_4d_mesh_grouping_along_axes_2_1 { // CHECK: util.global private @_mesh_mesh_4d_axes_2_1 {inlining_policy = #util.inline.never} : !flow.channel // CHECK: util.initializer { // CHECK-DAG: %[[AXIS_3_SIZE:.+]] = arith.constant 6 : index - // CHECK-DAG: %[[AXIS_2_SIZE:.+]] = arith.constant 5 : index // CHECK-DAG: %[[AXIS_1_SIZE:.+]] = arith.constant 4 : index - // CHECK-DAG: %[[AXIS_0_SIZE:.+]] = arith.constant 3 : index // CHECK-DAG: %[[DEFAULT_CHANNEL:.+]] = flow.channel.default : !flow.channel - // CHECK: %[[CHANNEL_RANK:.+]] = flow.channel.rank %[[DEFAULT_CHANNEL]] : index - // CHECK: %[[DEVICE_MULTI_IDX:.+]]:4 = affine.delinearize_index %[[CHANNEL_RANK]] into - // CHECK-SAME: 
(%[[AXIS_0_SIZE]], %[[AXIS_1_SIZE]], %[[AXIS_2_SIZE]], %[[AXIS_3_SIZE]]) : index, index, index, index + // CHECK: %[[CHANNEL_RANK:.+]] = flow.channel.rank %[[DEFAULT_CHANNEL]] + // CHECK: %[[DEVICE_MULTI_IDX:.+]]:4 = affine.delinearize_index %[[CHANNEL_RANK]] into (3, 4, 5, 6) : index // CHECK: %[[IN_GROUP_IDX:.+]] = affine.apply // CHECK-SAME: #map()[%[[DEVICE_MULTI_IDX]]#2, %[[AXIS_1_SIZE]], %[[DEVICE_MULTI_IDX]]#1] // CHECK: %[[GROUP_IDX:.+]] = affine.apply diff --git a/third_party/llvm-project b/third_party/llvm-project index dfbfc0594887..be81df25178f 160000 --- a/third_party/llvm-project +++ b/third_party/llvm-project @@ -1 +1 @@ -Subproject commit dfbfc059488770ac1b96a8074739c605475166f9 +Subproject commit be81df25178f4e301df9e4de75c5cbbd6f773891 From ecd87d8680c1e79819d4b5914c092c064af32787 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Sun, 1 Dec 2024 17:31:12 -0500 Subject: [PATCH 30/54] [Codegen] Add pass to materialize tuning specs (#19337) ... and update 'Materialize User Configs' to pick up those tuning specs. The overall flow is as follows: * We pick up any user-specified tuning specs in `materialize tuning specs` and link them into a single transform dialect library module. * We serialize that linked tuning spec as MLIR bytecode. * We embed this MLIR bytecode as a module attribute. This is so that none of the subsequent passes will accidentally `walk` or otherwise modify it. * In `materialize user configs`, we first check if there are any transform libraries provided. If not, then we check if the tuning spec is present. * We deserialize the tuning spec attribute into a transform dialect library module and execute it. * We remove the serialized tuning spec from the module, as it's no longer needed. I also modified `getOrLoadTransformLibraryModule` so that it doesn't use the `transform::detail::assembleTransformLibraryFromPaths` function, because it has some logic to perform library merging that would overwrite module symbol names. There's no need to call it anyway, since we are loading a single library at a time. This is not added to any codegen pipeline yet -- I will do that in a future PR. 
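As a rough illustration of the serialize-and-embed step above (a simplified sketch, not the exact pass code; the helper name `embedTuningSpec` is made up here, while the attribute key is the `iree_codegen.tuning_spec_mlirbc` name introduced in this PR):

```c++
#include "llvm/Support/raw_ostream.h"
#include "mlir/Bytecode/BytecodeWriter.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"

// Serialize a linked tuning spec module to MLIR bytecode and attach it to the
// compiled module as a dense i8 attribute, so later passes see it as opaque.
static mlir::LogicalResult embedTuningSpec(mlir::ModuleOp compiledModule,
                                           mlir::ModuleOp tuningSpec) {
  std::string buffer;
  llvm::raw_string_ostream os(buffer);
  if (failed(mlir::writeBytecodeToFile(tuningSpec, os)))
    return mlir::failure();
  auto shape = mlir::VectorType::get(
      static_cast<int64_t>(buffer.size()),
      mlir::IntegerType::get(tuningSpec->getContext(), 8));
  auto attr = mlir::DenseElementsAttr::getFromRawBuffer(
      shape, llvm::ArrayRef<char>(buffer.data(), buffer.size()));
  compiledModule->setAttr("iree_codegen.tuning_spec_mlirbc", attr);
  return mlir::success();
}
```

A spec is then supplied at compile time via `--iree-codegen-tuning-spec-path=<spec>.mlir`, and the linked result can be inspected with `--iree-codegen-dump-tuning-specs-to=-`.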
Issue: https://github.com/iree-org/iree/issues/19214 --- .../iree/compiler/Codegen/Common/BUILD.bazel | 4 + .../compiler/Codegen/Common/CMakeLists.txt | 4 + .../Codegen/Common/LinkTuningSpecsPass.cpp | 47 ++--- .../Common/MaterializeTuningSpecsPass.cpp | 166 ++++++++++++++++++ .../Codegen/Common/MaterializeUserConfigs.cpp | 57 +++++- .../src/iree/compiler/Codegen/Common/Passes.h | 7 + .../iree/compiler/Codegen/Common/Passes.td | 19 ++ .../compiler/Codegen/Common/test/BUILD.bazel | 5 + .../Codegen/Common/test/CMakeLists.txt | 4 + .../Common/test/materialize_tuning_specs.mlir | 23 +++ ...materialize_tuning_specs_invalid_spec.mlir | 12 ++ ...erialize_user_config_from_tuning_spec.mlir | 42 +++++ .../Codegen/Common/test/tuning_spec.mlir | 9 + .../Dialect/Codegen/IR/IREECodegenAttrs.h | 4 +- .../Dialect/Codegen/IR/IREECodegenDialect.td | 2 +- .../Codegen/IR/IREECodegenLibraryManager.cpp | 42 +++-- 16 files changed, 411 insertions(+), 36 deletions(-) create mode 100644 compiler/src/iree/compiler/Codegen/Common/MaterializeTuningSpecsPass.cpp create mode 100644 compiler/src/iree/compiler/Codegen/Common/test/materialize_tuning_specs.mlir create mode 100644 compiler/src/iree/compiler/Codegen/Common/test/materialize_tuning_specs_invalid_spec.mlir create mode 100644 compiler/src/iree/compiler/Codegen/Common/test/materialize_user_config_from_tuning_spec.mlir create mode 100644 compiler/src/iree/compiler/Codegen/Common/test/tuning_spec.mlir diff --git a/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel index 776b03d91a99..e3513ba69d29 100644 --- a/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel @@ -127,6 +127,7 @@ iree_compiler_cc_library( "LowerUKernelsToCalls.cpp", "MaterializeEncodingIntoNop.cpp", "MaterializeEncodingIntoPackUnPack.cpp", + "MaterializeTuningSpecsPass.cpp", "MemrefCopyToLinalg.cpp", "NormalizeLoopBounds.cpp", "OptimizeTensorInsertExtractSlices.cpp", @@ -201,6 +202,7 @@ iree_compiler_cc_library( "@llvm-project//mlir:BufferizationDialect", "@llvm-project//mlir:BufferizationInterfaces", "@llvm-project//mlir:BufferizationTransforms", + "@llvm-project//mlir:BytecodeWriter", "@llvm-project//mlir:DestinationStyleOpInterface", "@llvm-project//mlir:DialectUtils", "@llvm-project//mlir:FuncDialect", @@ -219,6 +221,7 @@ iree_compiler_cc_library( "@llvm-project//mlir:MemRefDialect", "@llvm-project//mlir:MemRefTransforms", "@llvm-project//mlir:MemRefUtils", + "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:SCFToControlFlow", @@ -284,6 +287,7 @@ iree_compiler_cc_library( "@llvm-project//mlir:GPUDialect", "@llvm-project//mlir:LinalgDialect", "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:Parser", "@llvm-project//mlir:PDLDialect", "@llvm-project//mlir:PDLInterpDialect", "@llvm-project//mlir:SCFDialect", diff --git a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt index f7ed254c31f3..adec8aad7583 100644 --- a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt @@ -119,6 +119,7 @@ iree_cc_library( "LowerUKernelsToCalls.cpp" "MaterializeEncodingIntoNop.cpp" "MaterializeEncodingIntoPackUnPack.cpp" + "MaterializeTuningSpecsPass.cpp" "MemrefCopyToLinalg.cpp" "NormalizeLoopBounds.cpp" "OptimizeTensorInsertExtractSlices.cpp" @@ -163,6 +164,7 @@ iree_cc_library( MLIRArithUtils 
MLIRBufferizationDialect MLIRBufferizationTransforms + MLIRBytecodeWriter MLIRDestinationStyleOpInterface MLIRFuncDialect MLIRFuncTransforms @@ -180,6 +182,7 @@ iree_cc_library( MLIRMemRefDialect MLIRMemRefTransforms MLIRMemRefUtils + MLIRParser MLIRPass MLIRSCFDialect MLIRSCFToControlFlow @@ -257,6 +260,7 @@ iree_cc_library( MLIRMemRefTransformOps MLIRPDLDialect MLIRPDLInterpDialect + MLIRParser MLIRPass MLIRRewrite MLIRSCFDialect diff --git a/compiler/src/iree/compiler/Codegen/Common/LinkTuningSpecsPass.cpp b/compiler/src/iree/compiler/Codegen/Common/LinkTuningSpecsPass.cpp index ab9ddce82dd0..8f57104d6f07 100644 --- a/compiler/src/iree/compiler/Codegen/Common/LinkTuningSpecsPass.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/LinkTuningSpecsPass.cpp @@ -44,8 +44,9 @@ findNestedModulesWithNamedSequences(ModuleOp module) { static SmallVector findTuningSpecs(ModuleOp module) { Block *body = module.getBody(); return llvm::filter_to_vector( - body->getOps(), - [](NamedSequenceOp op) { return op->hasAttr(kTuningSpecAttrName); }); + body->getOps(), [](NamedSequenceOp op) { + return op->hasAttr(kTuningSpecEntrypointAttrName); + }); } static LogicalResult validateTuningSpec(NamedSequenceOp op) { @@ -85,7 +86,7 @@ emitLinkedTuningSpec(ModuleOp module, ArrayRef specsToLink) { /*res_attrs*/ ArrayAttr{}); newSpec.setArgAttr(0, transform::TransformDialect::kArgReadOnlyAttrName, builder.getUnitAttr()); - newSpec->setAttr(kTuningSpecAttrName, builder.getUnitAttr()); + newSpec->setAttr(kTuningSpecEntrypointAttrName, builder.getUnitAttr()); Region ®ion = newSpec.getRegion(); Block *body = builder.createBlock(®ion, region.begin(), @@ -122,28 +123,34 @@ struct LinkTuningSpecsPass final } void runOnOperation() override { - ModuleOp module = getOperation(); - SmallVector tuningSpecs; - - for (ModuleOp nested : findNestedModulesWithNamedSequences(module)) { - llvm::append_range(tuningSpecs, findTuningSpecs(nested)); + if (failed(linkTuningSpecs(getOperation()))) { + signalPassFailure(); } + } +}; - for (NamedSequenceOp spec : tuningSpecs) { - LDBG("Found tuning spec: " << spec.getSymName()); - if (failed(validateTuningSpec(spec))) { - return signalPassFailure(); - } - } +} // namespace + +FailureOr linkTuningSpecs(ModuleOp module) { + SmallVector tuningSpecs; - if (tuningSpecs.empty()) { - LDBG("No tuning specs found, exiting without linking"); - return; + for (ModuleOp nested : findNestedModulesWithNamedSequences(module)) { + llvm::append_range(tuningSpecs, findTuningSpecs(nested)); + } + + for (NamedSequenceOp spec : tuningSpecs) { + LDBG("Found tuning spec: " << spec.getSymName()); + if (failed(validateTuningSpec(spec))) { + return failure(); } + } - emitLinkedTuningSpec(module, tuningSpecs); + if (tuningSpecs.empty()) { + LDBG("No tuning specs found, exiting without linking"); + return NamedSequenceOp{}; } -}; -} // namespace + return emitLinkedTuningSpec(module, tuningSpecs); +} + } // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeTuningSpecsPass.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeTuningSpecsPass.cpp new file mode 100644 index 000000000000..f14aa92d5a66 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeTuningSpecsPass.cpp @@ -0,0 +1,166 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include "iree/compiler/Codegen/Common/Passes.h" +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Bytecode/BytecodeWriter.h" +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/TransformOps.h" +#include "mlir/Dialect/Transform/IR/TransformTypes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/OwningOpRef.h" +#include "mlir/Support/FileUtilities.h" + +#define DEBUG_TYPE "iree-codegen-materialize-tuning-specs" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") +#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") + +namespace mlir::iree_compiler { + +#define GEN_PASS_DEF_MATERIALIZETUNINGSPECSPASS +#include "iree/compiler/Codegen/Common/Passes.h.inc" + +namespace { + +llvm::cl::opt clCodegenTuningSpecPath( + "iree-codegen-tuning-spec-path", + llvm::cl::desc("File path to a module containing a tuning spec (transform " + "dialect library)."), + llvm::cl::init("")); + +llvm::cl::opt clCodegenTuningSpecDumpDir( + "iree-codegen-dump-tuning-specs-to", + llvm::cl::desc( + "Dump the final tuning spec modules to the specified directory. When " + "set to '-', prints the tuning spec to stdout."), + llvm::cl::init("")); + +using mlir::transform::NamedSequenceOp; + +static LogicalResult dumpFinalTuningSpecToDir(ModuleOp tuningSpec, + StringRef dir) { + if (dir == "-") { + tuningSpec->print(llvm::outs()); + return success(); + } + + llvm::sys::fs::create_directories(dir); + llvm::SmallString<64> dumpPath; + auto dumpFileEC = llvm::sys::fs::createUniqueFile( + Twine(dir) + "/iree_tuning_spec_%%.mlir", dumpPath); + if (dumpFileEC) { + return tuningSpec->emitError() + << "Failed to create a unique file in " << dir << "\n"; + } + LDBG("Linked tuning spec file path: " << dumpPath); + + std::string error; + auto file = mlir::openOutputFile(dumpPath, &error); + if (!file) { + return tuningSpec->emitError() + << "Failed to open a tuning spec dump file " << dumpPath << "\n"; + } + + tuningSpec->print(file->os()); + file->keep(); + return success(); +} + +static FailureOr +serializeTuningSpecToAttr(ModuleOp tuningSpec) { + std::string buffer; + llvm::raw_string_ostream os(buffer); + if (failed(writeBytecodeToFile(tuningSpec, os))) { + return failure(); + } + + auto bufferSize = static_cast(buffer.size()); + auto bufferShape = VectorType::get( + bufferSize, IntegerType::get(tuningSpec->getContext(), 8)); + return DenseElementsAttr::getFromRawBuffer( + bufferShape, ArrayRef(buffer.data(), buffer.data() + bufferSize)); +} + +struct MaterializeTuningSpecsPass final + : impl::MaterializeTuningSpecsPassBase { + void getDependentDialects(DialectRegistry ®istry) const override { + registerTransformDialectTranslationDependentDialects(registry); + } + + void runOnOperation() override { + if (clCodegenTuningSpecPath.empty()) { + return; + } + + ModuleOp module = getOperation(); + MLIRContext *ctx = &getContext(); + auto dialect = ctx->getOrLoadDialect(); + auto maybeTransformLibrary = + dialect->getOrLoadTransformLibraryModule(clCodegenTuningSpecPath); + if (failed(maybeTransformLibrary)) { + 
module->emitError() + << "Failed to load tuning spec transform dialect library from " + << clCodegenTuningSpecPath; + return signalPassFailure(); + } + + ModuleOp userTuningSpec = *maybeTransformLibrary; + if (!userTuningSpec.getSymName()) { + // Set a module name so that we can refer to its nested symbols. + userTuningSpec.setSymName("iree_user_tuning_spec"); + } + + Location loc = userTuningSpec.getLoc(); + + // This module will always be released at the end of the pass. + OwningOpRef linkedTuningSpec( + ModuleOp::create(loc, "iree_linked_tuning_spec")); + linkedTuningSpec.get()->setAttr( + transform::TransformDialect::kWithNamedSequenceAttrName, + UnitAttr::get(ctx)); + linkedTuningSpec->insert(linkedTuningSpec->begin(), userTuningSpec.clone()); + + // TODO(https://github.com/iree-org/iree/issues/19214): Add linked tuning + // spec memoization to IREECodegenDialect. We should be able to provide a + // list of input libraries that may have already been linked and ask the + // dialect to return it to us, or invoke a callback that will insert it if + // not found. + FailureOr newEntrypoint = + linkTuningSpecs(linkedTuningSpec.get()); + if (failed(newEntrypoint)) { + module->emitError("Failed to link tuning specs"); + return signalPassFailure(); + } + + if (!clCodegenTuningSpecDumpDir.empty()) { + if (failed(dumpFinalTuningSpecToDir(linkedTuningSpec.get(), + clCodegenTuningSpecDumpDir))) { + return signalPassFailure(); + } + } + + FailureOr serializedSpec = + serializeTuningSpecToAttr(linkedTuningSpec.get()); + if (failed(serializedSpec)) { + module->emitError("Failed to serialize linked tuning specs"); + return signalPassFailure(); + } + module->setAttr(kSerializedTuningSpecAttrName, *serializedSpec); + } +}; + +} // namespace +} // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp index c4c97925eefe..21fee4a3f065 100644 --- a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp @@ -4,15 +4,17 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include #include "iree/compiler/Codegen/Common/Passes.h" #include "iree/compiler/Codegen/Common/UserConfig.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/LogicalResult.h" #include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OwningOpRef.h" +#include "mlir/Parser/Parser.h" #define DEBUG_TYPE "iree-codegen-materialize-user-configs" #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") @@ -110,6 +112,40 @@ getTransformLibraryFromPath(ModuleOp compiledModule, StringRef path) { entrySequenceName.str()}; } +/// Look up the tuning spec in the given module or any of its parents. 
+static LogicalResult getModuleTuningSpec(ModuleOp compiledModule, + OwningOpRef &tuningSpec) { + IREE::Util::SerializableAttrInterface serializedTuningSpec; + Operation *op = compiledModule; + while (!serializedTuningSpec && op) { + serializedTuningSpec = + op->getAttrOfType( + kSerializedTuningSpecAttrName); + op = op->getParentOp(); + } + + if (!serializedTuningSpec) { + return failure(); + } + + SmallVector bytecode; + if (failed(serializedTuningSpec.serializeToVector( + compiledModule->getLoc(), llvm::endianness::native, bytecode))) { + return compiledModule.emitError() + << "Failed to read attribute " << kSerializedTuningSpecAttrName; + } + + ParserConfig config(compiledModule.getContext()); + tuningSpec = parseSourceString( + StringRef(bytecode.data(), bytecode.size()), config); + if (!tuningSpec) { + return compiledModule.emitError() << "Failed to parse tuning spec in " + << kSerializedTuningSpecAttrName; + } + LDBG("--loaded tuning spec"); + return success(); +} + struct MaterializeUserConfigsPass final : impl::MaterializeUserConfigsPassBase { void getDependentDialects(DialectRegistry ®istry) const override { @@ -119,9 +155,28 @@ struct MaterializeUserConfigsPass final void runOnOperation() override { ModuleOp moduleOp = getOperation(); + // Try to load the transform library from the user flag first. If none is + // specified, fall back to using the module tuning spec. FailureOr userTransformLibrary = getTransformLibraryFromPath(moduleOp, clCodegenTransformDialectLibraryFileName); + OwningOpRef tuningSpec; + if (failed(userTransformLibrary)) { + if (succeeded(getModuleTuningSpec(moduleOp, tuningSpec))) { + assert(tuningSpec); + userTransformLibrary = TransformLibraryWithEntrypoint{ + tuningSpec.get(), kKernelConfigSpecName.str()}; + } + } + + // Remove the tuning spec, if any, from the current module. If the tuning + // spec is attached to some other parent op, we conservatively keep it + // as-is, as we are not sure who the producer is and if they want it + // removed. + if (moduleOp->hasAttr(kSerializedTuningSpecAttrName)) { + moduleOp->removeAttr(kSerializedTuningSpecAttrName); + LDBG("--dropped the serialized tuning spec from the module"); + } for (auto funcOp : moduleOp.getOps()) { diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.h b/compiler/src/iree/compiler/Codegen/Common/Passes.h index eac457dc6280..2938bdd87da5 100644 --- a/compiler/src/iree/compiler/Codegen/Common/Passes.h +++ b/compiler/src/iree/compiler/Codegen/Common/Passes.h @@ -21,6 +21,7 @@ #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/TransformOps.h" #include "mlir/Pass/Pass.h" namespace mlir::iree_compiler { @@ -52,6 +53,12 @@ void addConstantBufferizePasses(OpPassManager &funcPassManager); /// Populate Encoding to Nop pass and canonicalizer pass to the pipeline void addEncodingToNopPasses(FunctionLikeNest &passManager); +/// Links nested transform dialect tuning specs named sequences into a single +/// entry point. Returns the new named sequence op (inserted into the `module`) +/// that includes the nested tuning specs, or a null op when no nested named +/// sequences were found. +FailureOr linkTuningSpecs(ModuleOp module); + //------------------------------------------------------------------------------ // Wrappers that not use tablegen options. See Passes.td for details. 
//------------------------------------------------------------------------------ diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.td b/compiler/src/iree/compiler/Codegen/Common/Passes.td index 9852cc693e5c..5471c95b0cad 100644 --- a/compiler/src/iree/compiler/Codegen/Common/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Common/Passes.td @@ -436,6 +436,25 @@ def MaterializeEncodingIntoNopPass : let summary = "Drop the encodings from tensor types with encodings."; } +def MaterializeTuningSpecsPass : Pass<"iree-codegen-materialize-tuning-specs", "ModuleOp"> { + let summary = + "Load tuning spec transform dialect libraries and encode them in the module"; + let description = [{ + Links all available tuning spec transform dialect modules into a single + tuning spec. Next, serializes this tuning spec to bytecode and attaches it + as a module attribute. We do this so that the full tuning spec is always + encoded in the program IR and can be checked with `--mlir-print-ir-after-all` + (or equivalent). The alternative would be to add the tuning spec as a + submodule in the compiled program, but this may result in the tuning spec + being inadvertently visited by other passes that attempt to `walk` the outer + module. Serialization makes the tuning specs opaque and prevents it from + happening. + + This attribute is expected to be short-lived and removed by + `iree-codegen-materialize-user-configs`. + }]; +} + def MaterializeUserConfigsPass : Pass<"iree-codegen-materialize-user-configs", "ModuleOp"> { let summary = "Sets the lowering configs and translation info from user configs"; let dependentDialects = [ diff --git a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel index 4a89365196c6..5644f4855ab7 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel @@ -55,6 +55,9 @@ iree_lit_test_suite( "link_tuning_specs.mlir", "lower_ukernel_to_calls.mlir", "materialize_encoding_into_nop.mlir", + "materialize_tuning_specs.mlir", + "materialize_tuning_specs_invalid_spec.mlir", + "materialize_user_config_from_tuning_spec.mlir", "materialize_user_configs.mlir", "normalize_loop_bounds.mlir", "optimize_tensor_insert_extract_slices.mlir", @@ -95,6 +98,7 @@ iree_lit_test_suite( "convolution_match_spec.mlir", "reductions_codegen_spec.mlir", "reductions_match_spec.mlir", + "tuning_spec.mlir", ], ), cfg = "//compiler:lit.cfg.py", @@ -105,6 +109,7 @@ iree_lit_test_suite( "convolution_match_spec.mlir", "reductions_codegen_spec.mlir", "reductions_match_spec.mlir", + "tuning_spec.mlir", ], tools = [ "//tools:iree-opt", diff --git a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt index ae563af014fd..325e72828c60 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt @@ -51,6 +51,9 @@ iree_lit_test_suite( "link_tuning_specs.mlir" "lower_ukernel_to_calls.mlir" "materialize_encoding_into_nop.mlir" + "materialize_tuning_specs.mlir" + "materialize_tuning_specs_invalid_spec.mlir" + "materialize_user_config_from_tuning_spec.mlir" "materialize_user_configs.mlir" "normalize_loop_bounds.mlir" "optimize_tensor_insert_extract_slices.mlir" @@ -92,6 +95,7 @@ iree_lit_test_suite( convolution_match_spec.mlir reductions_codegen_spec.mlir reductions_match_spec.mlir + tuning_spec.mlir ) ### 
BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### diff --git a/compiler/src/iree/compiler/Codegen/Common/test/materialize_tuning_specs.mlir b/compiler/src/iree/compiler/Codegen/Common/test/materialize_tuning_specs.mlir new file mode 100644 index 000000000000..d28cd6874f3b --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/Common/test/materialize_tuning_specs.mlir @@ -0,0 +1,23 @@ +// RUN: iree-opt --pass-pipeline='builtin.module(iree-codegen-materialize-tuning-specs)' \ +// RUN: --iree-codegen-tuning-spec-path=%p/tuning_spec.mlir \ +// RUN: --iree-codegen-dump-tuning-specs-to=- \ +// RUN: --mlir-disable-threading --no-implicit-module %s | FileCheck %s + +// Check that the final tuning spec is as expected. +// CHECK-LABEL: module @iree_linked_tuning_spec attributes {transform.with_named_sequence} +// CHECK-LABEL: module @user_spec attributes {transform.with_named_sequence} +// CHECK-LABEL: transform.named_sequence @hello +// CHECK-SAME: attributes {iree_codegen.tuning_spec_entrypoint} +// CHECK-LABEL: transform.named_sequence @__kernel_config +// CHECK: @user_spec::@hello + +// Check that the transform spec gets materialized as a module attribute. +// CHECK: module attributes +// CHECK-SAME: iree_codegen.tuning_spec_mlirbc = dense<{{.+}}> : vector<{{[0-9]+}}xi8> +// CHECK-LABEL: func.func @main_0 + +module { + func.func @main_0() { + return + } +} diff --git a/compiler/src/iree/compiler/Codegen/Common/test/materialize_tuning_specs_invalid_spec.mlir b/compiler/src/iree/compiler/Codegen/Common/test/materialize_tuning_specs_invalid_spec.mlir new file mode 100644 index 000000000000..f5f80e1b50ba --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/Common/test/materialize_tuning_specs_invalid_spec.mlir @@ -0,0 +1,12 @@ +// RUN: iree-opt --pass-pipeline='builtin.module(iree-codegen-materialize-tuning-specs)' \ +// RUN: --iree-codegen-tuning-spec-path=%s --no-implicit-module --verify-diagnostics %s + +// Check that we error out on mlir inputs that are not tuning specs (e.g., the input itself). + +// expected-error@+2 {{Module without the 'transform.with_named_sequence' attribute is not a transform dialect library}} +// expected-error@+1 {{Failed to load tuning spec transform dialect library from}} +module { + func.func @main_0() { + return + } +} diff --git a/compiler/src/iree/compiler/Codegen/Common/test/materialize_user_config_from_tuning_spec.mlir b/compiler/src/iree/compiler/Codegen/Common/test/materialize_user_config_from_tuning_spec.mlir new file mode 100644 index 000000000000..08f52791de3f --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/Common/test/materialize_user_config_from_tuning_spec.mlir @@ -0,0 +1,42 @@ +// RUN: iree-opt --pass-pipeline='builtin.module(builtin.module(iree-codegen-materialize-tuning-specs,iree-codegen-materialize-user-configs))' \ +// RUN: --iree-codegen-tuning-spec-path=%p/tuning_spec.mlir \ +// RUN: --mlir-disable-threading --no-implicit-module %s | FileCheck %s + +// RUN: iree-opt --pass-pipeline='builtin.module(iree-codegen-materialize-tuning-specs,builtin.module(iree-codegen-materialize-user-configs))' \ +// RUN: --iree-codegen-tuning-spec-path=%p/tuning_spec.mlir \ +// RUN: --mlir-disable-threading --no-implicit-module %s | FileCheck %s --check-prefix=PARENT + +// (1) We start by running the `Materialize Tuning Specs` pass to embed the +// transform dialect library into the module. Doing it by hand hand is not +// possible, because we serialize it as MLIR bytecode. 
+// +// Check that the transform spec gets executed and that it does not remain as +// a module attribute after `Materialize User Configs`. + +// CHECK-LABEL: [ IR printer: Hello Tuning Spec top-level ] +// CHECK-NEXT: func.func @main_0 +// +// CHECK-LABEL: module @parent { +// CHECK-LABEL: module @child { +// CHECK: func.func @main_0 + +// (2) Check that the transform spec gets picked up from the **parent** module. +// The tuning spec attribute should remain on the parent module as we +// (conservatively) only remove tuning spec from the module passed +// to the `materialize-user-configs` pass. + +// PARENT-LABEL: [ IR printer: Hello Tuning Spec top-level ] +// PARENT-NEXT: func.func @main_0 +// +// PARENT-LABEL: module @parent attributes { +// PARENT-SAME: iree_codegen.tuning_spec_mlirbc = dense< +// PARENT-LABEL: module @child { +// PARENT: func.func @main_0 + +module @parent { + module @child { + func.func @main_0() { + return + } + } +} diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tuning_spec.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tuning_spec.mlir new file mode 100644 index 000000000000..24af07386c2e --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/Common/test/tuning_spec.mlir @@ -0,0 +1,9 @@ +// RUN: iree-opt %s + +module @user_spec attributes { transform.with_named_sequence } { + transform.named_sequence @hello(%arg0: !transform.any_op {transform.readonly}) -> () + attributes { iree_codegen.tuning_spec_entrypoint } { + transform.print {name = "Hello Tuning Spec", skip_regions} + transform.yield + } +} diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h index bb2d747f0a5f..989d7bda3441 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h @@ -39,8 +39,10 @@ namespace mlir::iree_compiler { // Constant names. //===----------------------------------------------------------------------===// constexpr StringLiteral kConfigAttrName = "lowering_config"; -constexpr StringLiteral kTuningSpecAttrName = +constexpr StringLiteral kTuningSpecEntrypointAttrName = "iree_codegen.tuning_spec_entrypoint"; +constexpr StringLiteral kSerializedTuningSpecAttrName = + "iree_codegen.tuning_spec_mlirbc"; constexpr StringLiteral kKernelConfigSpecName = "__kernel_config"; //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.td b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.td index 7f47fbe10be4..9e5a408f5957 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.td @@ -23,7 +23,7 @@ def IREECodegen_Dialect : Dialect { let description = [{ This dialect is primarily meant to hold attributes that carry the state of the compilation when lowered to scalar code for an - architecture. Typically, a backend starts by analysing the entry + architecture. Typically, a backend starts by analyzing the entry point functions within the `hal.executable.variant` and deciding which compilation pipeline to chose. During this, even the values for parameters such as tile sizes, etc. are also decided. 
The rest diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenLibraryManager.cpp b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenLibraryManager.cpp index 93176084fd82..437a80b04cb7 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenLibraryManager.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenLibraryManager.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" +#include "mlir/Dialect/Transform/IR/TransformDialect.h" #include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h" namespace mlir::iree_compiler::IREE::Codegen { @@ -17,25 +18,40 @@ IREECodegenDialect::getOrLoadTransformLibraryModule(std::string libraryPath) { auto loadedLibrary = libraryModules.find(libraryPath); if (loadedLibrary != libraryModules.end()) { // Check whether the library already failed to load. - if (!(loadedLibrary->second) || !(*(loadedLibrary->second))) { - return failure(); + if (ModuleOp module = loadedLibrary->second.get()) { + return module; } - return *(loadedLibrary->second); + return failure(); + } + + // We update the storage for the library regardless of whether parsing + // succeeds so that other threads don't have to retry. + OwningOpRef &parsedLibrary = libraryModules[libraryPath]; + + MLIRContext *ctx = getContext(); + if (failed(transform::detail::parseTransformModuleFromFile(ctx, libraryPath, + parsedLibrary))) { + return failure(); } - OwningOpRef mergedParsedLibraries; - if (failed(transform::detail::assembleTransformLibraryFromPaths( - getContext(), SmallVector{libraryPath}, - mergedParsedLibraries))) { - // We update the storage for the library regardless of whether parsing - // succeeds so that other threads don't have to retry. - OwningOpRef emptyLibrary; - libraryModules[libraryPath] = std::move(emptyLibrary); + if (!parsedLibrary.get()->hasAttr( + transform::TransformDialect::kWithNamedSequenceAttrName)) { + parsedLibrary->emitError() + << "Module without the '" + << transform::TransformDialect::kWithNamedSequenceAttrName + << "' attribute is not a transform dialect library"; + + // Invalidate the module stored in the library so that this does not + // succeed on a retry. + parsedLibrary = nullptr; return failure(); } - libraryModules[libraryPath] = std::move(mergedParsedLibraries); - return *libraryModules[libraryPath]; + if (!parsedLibrary->getSymName()) { + parsedLibrary->setSymName("__transform"); + } + + return parsedLibrary.get(); } } // namespace mlir::iree_compiler::IREE::Codegen From 886f801d614916afbe40bb5497dc2cde585cf1b0 Mon Sep 17 00:00:00 2001 From: Zhuoran Yin Date: Mon, 2 Dec 2024 10:45:51 -0500 Subject: [PATCH 31/54] [Codegen][llvmgpu] Refactor op cloning in prefetch shared memory pass (#19196) I refactored the prefetch shared memory pass by using `rewriter.clone()` with a IRMapping. With this, the pass can now handle ops with a region (like `scf.if`) which would otherwise create invalid IRs when there's scf.if in k loop. 
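The core of the new cloning scheme is roughly the following (simplified sketch; `cloneStage` and `stageOps` are illustrative names, while the actual pass keeps separate read/write/compute stage helpers):

```c++
#include "llvm/ADT/ArrayRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/PatternMatch.h"

// Clone one pipeline stage for a given iteration value `iv`. rewriter.clone()
// records result mappings and remaps operands through the shared IRMapping,
// including inside nested regions, so region-holding ops such as scf.if no
// longer need per-operand patching.
static void cloneStage(mlir::RewriterBase &rewriter, mlir::scf::ForOp forOp,
                       mlir::Value iv,
                       llvm::ArrayRef<mlir::Operation *> stageOps,
                       mlir::IRMapping &mapping) {
  // Clones of this stage use the new induction variable.
  mapping.map(forOp.getInductionVar(), iv);
  for (mlir::Operation *op : stageOps)
    rewriter.clone(*op, mapping);
}
```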
--------- Signed-off-by: jerryyin --- .../Utils/PrefetchSharedMemoryCopy.cpp | 72 ++----------------- .../LLVMGPU/test/prefetch_shared_memory.mlir | 53 ++++++++++++++ 2 files changed, 58 insertions(+), 67 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/PrefetchSharedMemoryCopy.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/PrefetchSharedMemoryCopy.cpp index da5f5430be5e..ed96a1329db3 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/PrefetchSharedMemoryCopy.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/PrefetchSharedMemoryCopy.cpp @@ -250,46 +250,15 @@ class LoopPrefetcher { return success(); } - /// Clones |op| and call |callback| on the cloned op's operands as well as any - /// operands of nested ops that 1) aren't defined within the new op or 2) are - /// block arguments. - static Operation * - cloneAndUpdateOperands(RewriterBase &rewriter, Operation *op, - function_ref callback) { - Operation *clone = rewriter.clone(*op); - for (OpOperand &operand : clone->getOpOperands()) - callback(&operand); - return clone; - } - /// Creates all read stage ops for a loop iteration with |rewriter| and maps /// the original loop induction variable to |iv| in |mapping|. - SmallVector emitRead(IRMapping &mapping, RewriterBase &rewriter, - Value iv) { + void emitRead(IRMapping &mapping, RewriterBase &rewriter, Value iv) { // Map the original loop induction variable to |iv| for later op rewrites. mapping.map(forOp.getInductionVar(), iv); - SmallVector results; for (Operation *op : readStage) { - // Clone the current read stage op and updates all its operands to - // reference newly created ops. - Operation *newOp = - cloneAndUpdateOperands(rewriter, op, [&](OpOperand *newOperand) { - if (mapping.contains(newOperand->get())) { - newOperand->set(mapping.lookup(newOperand->get())); - } - }); - - if (isa(newOp)) { - llvm::append_range(results, newOp->getResults()); - } - - // Update read stage op results mapping. - for (unsigned i = 0, e = op->getNumResults(); i != e; ++i) { - mapping.map(op->getResult(i), newOp->getResult(i)); - } + rewriter.clone(*op, mapping); } - return results; } /// Creates all write stage ops for a loop iteration with |rewriter| and maps @@ -299,22 +268,7 @@ class LoopPrefetcher { mapping.map(forOp.getInductionVar(), iv); for (Operation *op : writeStage) { - // Clone the current read stage op and updates all its operands to - // reference newly created ops. - Operation *newOp = - cloneAndUpdateOperands(rewriter, op, [&](OpOperand *newOperand) { - if (mapping.contains(newOperand->get())) { - newOperand->set(mapping.lookup(newOperand->get())); - } - }); - - // If a mapping for any results already exists, move on, otherwise, - // add a new mapping. - for (unsigned i = 0, e = op->getNumResults(); i != e; ++i) { - if (!mapping.contains(op->getResult(i))) { - mapping.map(op->getResult(i), newOp->getResult(i)); - } - } + rewriter.clone(*op, mapping); } } @@ -341,18 +295,7 @@ class LoopPrefetcher { break; } - Operation *newOp = - cloneAndUpdateOperands(rewriter, op, [&](OpOperand *newOperand) { - if (mapping.contains(newOperand->get())) { - newOperand->set(mapping.lookup(newOperand->get())); - } - }); - results = newOp->getResults(); - - // Map compute operations to new compute operations. 
- for (unsigned i = 0, e = op->getNumResults(); i != e; ++i) { - mapping.map(op->getResult(i), newOp->getResult(i)); - } + rewriter.clone(*op, mapping); } return results; @@ -361,12 +304,7 @@ class LoopPrefetcher { void updateYield(IRMapping &mapping, RewriterBase &rewriter) { for (Operation *op : computeStage) { if (auto yield = dyn_cast(op)) { - cloneAndUpdateOperands(rewriter, yield, [&](OpOperand *newOperand) { - if (mapping.contains(newOperand->get())) { - newOperand->set(mapping.lookup(newOperand->get())); - } - }); - + rewriter.clone(*op, mapping); break; } } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/prefetch_shared_memory.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/prefetch_shared_memory.mlir index 68003274de0b..87cfeb5004c4 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/prefetch_shared_memory.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/prefetch_shared_memory.mlir @@ -81,3 +81,56 @@ func.func @prefetch_multi_scf_return(%arg0: memref<128xf32>) -> (vector<1xf32>, // CHECK: return %[[EPI_COMPUTE]], %[[EPI_COMPUTE2]] return %0#0, %0#1 : vector<1xf32>, vector<1xf32> } + +// CHECK-LABEL: @prefetch_add_with_if +// CHECK-SAME: (%[[GLOBAL:.*]]: memref<128xf32>) +func.func @prefetch_add_with_if(%arg0: memref<128xf32>) { + // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32> + %cst = arith.constant dense<0.000000e+00> : vector<1xf32> + %cst_0 = arith.constant 0.000000e+00 : f32 + // CHECK-DAG: %[[C127:.*]] = arith.constant 127 : index + %c128 = arith.constant 128 : index + // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index + %c1 = arith.constant 1 : index + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index + %c0 = arith.constant 0 : index + %true = arith.constant true + // CHECK-DAG: %[[SHARED:.*]] = memref.alloc() : memref<1xf32, #gpu.address_space> + %alloc = memref.alloc() : memref<1xf32, #gpu.address_space> + // CHECK-DAG: %[[PRO_READ:.*]] = vector.transfer_read %[[GLOBAL]] + // CHECK: vector.transfer_write %[[PRO_READ]], %[[SHARED]] + // CHECK: %[[OUT:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C127]] step %[[C1]] iter_args(%[[ARG:.*]] = %[[CST]]) + %0 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %cst) -> (vector<1xf32>) { + %dummy = memref.load %arg0[%arg1] : memref<128xf32> + %5 = arith.cmpf "oeq", %cst_0, %dummy : f32 + // CHECK: %[[BRANCH:.*]] = scf.if %[[COND:.*]] -> (index) + // CHECK: } else { + // CHECK: } + %updated = scf.if %5 -> (index) { + %override = arith.constant 5 : index + %add = arith.addi %arg1, %override : index + scf.yield %add : index + } else { + scf.yield %arg1 : index + } + // CHECK: %[[KER_READ:.*]] = vector.transfer_read %[[GLOBAL]][%[[UPDATED:.*]]] + //%1 = vector.transfer_read %arg0[%arg1], %cst_0 : memref<128xf32>, vector<1xf32> + %1 = vector.transfer_read %arg0[%updated], %cst_0 : memref<128xf32>, vector<1xf32> + vector.transfer_write %1, %alloc[%c0] {in_bounds = [true]} : vector<1xf32>, memref<1xf32, #gpu.address_space> + // CHECK: gpu.barrier + // CHECK: %[[COMPUTE_READ:.*]] = vector.transfer_read %[[SHARED]][%[[C0]]] + %2 = vector.transfer_read %alloc[%c0], %cst_0 : memref<1xf32, #gpu.address_space>, vector<1xf32> + // CHECK: %[[COMPUTE:.*]] = arith.addf %[[COMPUTE_READ]], %[[ARG]] + %3 = arith.addf %2, %arg2 : vector<1xf32> + // CHECK: gpu.barrier + // CHECK: vector.transfer_write %[[KER_READ]], %[[SHARED]] + // CHECK: scf.yield %[[COMPUTE]] + scf.yield %3 : vector<1xf32> + } + // CHECK: gpu.barrier + // CHECK: %[[EPI_READ:.*]] = vector.transfer_read 
%[[SHARED]][%[[C0]]] + // CHECK: %[[EPI_COMPUTE:.*]] = arith.addf %[[EPI_READ]], %[[OUT]] + // CHECK: vector.transfer_write %[[EPI_COMPUTE]], %[[GLOBAL]][%[[C0]]] + vector.transfer_write %0, %arg0[%c0] {in_bounds = [true]} : vector<1xf32>, memref<128xf32> + return +} From 03443413eebed64b6feae469cf89e21cff2a6744 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Mon, 2 Dec 2024 10:25:21 -0600 Subject: [PATCH 32/54] [NFC] Move the util passes to new-style boilerplate (#19327) I noticed the util dialect hadn't been upgraded to the PASS_DECL/PASS_DEF system like other parts of IREE had, so I figured I should go ahead and clean it up. --- .../Util/Transforms/AnnotateOpOrdinals.cpp | 11 +- .../Dialect/Util/Transforms/ApplyPatterns.cpp | 13 +- .../Dialect/Util/Transforms/BUILD.bazel | 1 - .../Dialect/Util/Transforms/CMakeLists.txt | 1 - .../Util/Transforms/CombineInitializers.cpp | 11 +- .../Util/Transforms/DropCompilerHints.cpp | 12 +- .../Dialect/Util/Transforms/DumpModule.cpp | 22 +-- .../Util/Transforms/FixedPointIterator.cpp | 10 +- .../Dialect/Util/Transforms/FoldGlobals.cpp | 26 +--- .../Dialect/Util/Transforms/FuseGlobals.cpp | 15 +- .../Util/Transforms/HoistIntoGlobals.cpp | 20 +-- .../compiler/Dialect/Util/Transforms/IPO.cpp | 18 +-- .../Util/Transforms/ImportResources.cpp | 11 +- .../Util/Transforms/OptimizeIntArithmetic.cpp | 15 +- .../Dialect/Util/Transforms/PassDetail.h | 21 --- .../compiler/Dialect/Util/Transforms/Passes.h | 49 +++--- .../Dialect/Util/Transforms/Passes.td | 141 +++++++++--------- .../Util/Transforms/PropagateSubranges.cpp | 17 +-- .../Transforms/SimplifyGlobalAccesses.cpp | 10 +- .../Transforms/StripAndSplatConstants.cpp | 19 +-- .../Dialect/Util/Transforms/StripDebugOps.cpp | 11 +- .../Util/Transforms/TestConversion.cpp | 23 +-- .../Transforms/TestFloatRangeAnalysis.cpp | 10 +- 23 files changed, 207 insertions(+), 280 deletions(-) delete mode 100644 compiler/src/iree/compiler/Dialect/Util/Transforms/PassDetail.h diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/AnnotateOpOrdinals.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/AnnotateOpOrdinals.cpp index 2982407b49cb..1e29f64530bc 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/AnnotateOpOrdinals.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/AnnotateOpOrdinals.cpp @@ -6,15 +6,17 @@ #include "iree/compiler/Dialect/Util/IR/UtilDialect.h" #include "iree/compiler/Dialect/Util/IR/UtilOps.h" -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" namespace mlir::iree_compiler::IREE::Util { +#define GEN_PASS_DEF_ANNOTATEOPORDINALSPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + namespace { class AnnotateOpOrdinalsPass - : public AnnotateOpOrdinalsBase { + : public impl::AnnotateOpOrdinalsPassBase { public: void runOnOperation() override { auto *context = &getContext(); @@ -28,9 +30,4 @@ class AnnotateOpOrdinalsPass }; } // namespace - -std::unique_ptr> createAnnotateOpOrdinalsPass() { - return std::make_unique(); -} - } // namespace mlir::iree_compiler::IREE::Util diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/ApplyPatterns.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/ApplyPatterns.cpp index 2d63d5219097..6e68b5bf1197 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/ApplyPatterns.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/ApplyPatterns.cpp @@ -8,7 +8,6 @@ #include "iree/compiler/Dialect/Util/IR/UtilDialect.h"
#include "iree/compiler/Dialect/Util/IR/UtilOps.h" -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "iree/compiler/Dialect/Util/Transforms/Patterns.h" #include "mlir/Dialect/Func/IR/FuncOps.h" @@ -18,7 +17,13 @@ namespace mlir::iree_compiler::IREE::Util { -class ApplyPatternsPass : public ApplyPatternsBase { +#define GEN_PASS_DEF_APPLYPATTERNSPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + +namespace { + +class ApplyPatternsPass + : public impl::ApplyPatternsPassBase { public: void getDependentDialects(DialectRegistry ®istry) const override { registry @@ -47,8 +52,6 @@ class ApplyPatternsPass : public ApplyPatternsBase { } }; -std::unique_ptr> createApplyPatternsPass() { - return std::make_unique(); -} +} // namespace } // namespace mlir::iree_compiler::IREE::Util diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Util/Transforms/BUILD.bazel index da3c44f08cb1..fe55bf49af90 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/BUILD.bazel +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/BUILD.bazel @@ -27,7 +27,6 @@ iree_compiler_cc_library( "IPO.cpp", "ImportResources.cpp", "OptimizeIntArithmetic.cpp", - "PassDetail.h", "Passes.cpp", "Patterns.cpp", "PropagateSubranges.cpp", diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Util/Transforms/CMakeLists.txt index a8542c40ae64..df2be22d4d6b 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/CMakeLists.txt +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/CMakeLists.txt @@ -30,7 +30,6 @@ iree_cc_library( "IPO.cpp" "ImportResources.cpp" "OptimizeIntArithmetic.cpp" - "PassDetail.h" "Passes.cpp" "Patterns.cpp" "PropagateSubranges.cpp" diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/CombineInitializers.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/CombineInitializers.cpp index cf2d370d7ffc..2af05c3b6fc8 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/CombineInitializers.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/CombineInitializers.cpp @@ -10,7 +10,6 @@ #include "iree/compiler/Dialect/Util/IR/UtilDialect.h" #include "iree/compiler/Dialect/Util/IR/UtilOps.h" #include "iree/compiler/Dialect/Util/IR/UtilTraits.h" -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Debug.h" @@ -24,10 +23,14 @@ #define DEBUG_TYPE "iree-util-combine-initializers" namespace mlir::iree_compiler::IREE::Util { + +#define GEN_PASS_DEF_COMBINEINITIALIZERSPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + namespace { class CombineInitializersPass - : public CombineInitializersBase { + : public impl::CombineInitializersPassBase { public: void getDependentDialects(DialectRegistry ®istry) const override { registry.insert(); @@ -75,8 +78,4 @@ class CombineInitializersPass } // namespace -std::unique_ptr> createCombineInitializersPass() { - return std::make_unique(); -} - } // namespace mlir::iree_compiler::IREE::Util diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/DropCompilerHints.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/DropCompilerHints.cpp index 15a877576652..2d38d54a9748 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/DropCompilerHints.cpp +++ 
b/compiler/src/iree/compiler/Dialect/Util/Transforms/DropCompilerHints.cpp @@ -7,14 +7,18 @@ #include #include "iree/compiler/Dialect/Util/IR/UtilOps.h" -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "mlir/Pass/Pass.h" namespace mlir::iree_compiler::IREE::Util { +#define GEN_PASS_DEF_DROPCOMPILERHINTSPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + +namespace { + class DropCompilerHintsPass - : public DropCompilerHintsBase { + : public impl::DropCompilerHintsPassBase { public: void runOnOperation() override { // We can't use patterns and applyPatternsAndFoldGreedily because that @@ -31,8 +35,6 @@ class DropCompilerHintsPass } }; -std::unique_ptr> createDropCompilerHintsPass() { - return std::make_unique(); -} +} // namespace } // namespace mlir::iree_compiler::IREE::Util diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/DumpModule.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/DumpModule.cpp index 69c5064b1b9c..86d0576b985c 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/DumpModule.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/DumpModule.cpp @@ -6,7 +6,6 @@ #include -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -17,9 +16,13 @@ namespace mlir::iree_compiler::IREE::Util { -struct DumpModulePass : public DumpModuleBase { - DumpModulePass(std::string path) { this->path = path; } - DumpModulePass(const DumpModulePass &pass) {} +#define GEN_PASS_DEF_DUMPMODULEPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + +namespace { + +struct DumpModulePass : public impl::DumpModulePassBase { + using Base::Base; void runOnOperation() override { // Ensure the parent paths exist. @@ -48,15 +51,12 @@ struct DumpModulePass : public DumpModuleBase { // Keep the temporary file after the write succeeds. file->keep(); } - - Option path{ - *this, "path", - llvm::cl::desc("File path to write the module text or binary into.")}; }; -std::unique_ptr> -createDumpModulePass(std::string path) { - return std::make_unique(path); +} // namespace + +std::unique_ptr createDumpModulePass(std::string path) { + return createDumpModulePass(DumpModulePassOptions{std::move(path)}); } } // namespace mlir::iree_compiler::IREE::Util diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/FixedPointIterator.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/FixedPointIterator.cpp index ece11e96496d..9690b148edd7 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/FixedPointIterator.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/FixedPointIterator.cpp @@ -4,13 +4,16 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" namespace mlir::iree_compiler::IREE::Util { + +#define GEN_PASS_DEF_FIXEDPOINTITERATORPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + namespace { // Dynamic pass which runs a sub-pipeline to a fixed point or a maximum @@ -23,11 +26,12 @@ namespace { // iteration terminates. If a sub-pass removes it, then iteration will // continue. 
class FixedPointIteratorPass - : public FixedPointIteratorBase { + : public impl::FixedPointIteratorPassBase { public: + using Base::Base; FixedPointIteratorPass() = default; FixedPointIteratorPass(const FixedPointIteratorPass &other) - : FixedPointIteratorBase(other) {} + : impl::FixedPointIteratorPassBase(other) {} FixedPointIteratorPass(OpPassManager pipeline); private: diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/FoldGlobals.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/FoldGlobals.cpp index 467d45f85b75..b4a611e58bf4 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/FoldGlobals.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/FoldGlobals.cpp @@ -12,7 +12,6 @@ #include "iree/compiler/Dialect/Util/IR/UtilOps.h" #include "iree/compiler/Dialect/Util/IR/UtilTraits.h" #include "iree/compiler/Dialect/Util/IR/UtilTypes.h" -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/EquivalenceClasses.h" @@ -29,6 +28,10 @@ #define DEBUG_TYPE "iree-util-fold-globals" namespace mlir::iree_compiler::IREE::Util { + +#define GEN_PASS_DEF_FOLDGLOBALSPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + namespace { template @@ -351,17 +354,8 @@ static bool deduplicateConstantGlobals(GlobalTable &globalTable) { return true; // did change } -class FoldGlobalsPass : public FoldGlobalsBase { +class FoldGlobalsPass : public impl::FoldGlobalsPassBase { public: - explicit FoldGlobalsPass() = default; - FoldGlobalsPass(const FoldGlobalsPass &pass) {} - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - registry.insert(); - registry.insert(); - } - void runOnOperation() override { auto *context = &getContext(); RewritePatternSet patterns(context); @@ -433,18 +427,8 @@ class FoldGlobalsPass : public FoldGlobalsBase { afterFoldingGlobals = count(moduleOp.getOps()); } - -private: - Statistic beforeFoldingGlobals{this, "global ops before folding", - "Number of util.global ops before folding"}; - Statistic afterFoldingGlobals{this, "global ops after folding", - "Number of util.global ops after folding"}; }; } // namespace -std::unique_ptr> createFoldGlobalsPass() { - return std::make_unique(); -} - } // namespace mlir::iree_compiler::IREE::Util diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/FuseGlobals.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/FuseGlobals.cpp index ea45521e8615..2a076d982f5e 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/FuseGlobals.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/FuseGlobals.cpp @@ -11,7 +11,6 @@ #include "iree/compiler/Dialect/Util/IR/UtilDialect.h" #include "iree/compiler/Dialect/Util/IR/UtilOps.h" #include "iree/compiler/Dialect/Util/IR/UtilTraits.h" -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/EquivalenceClasses.h" @@ -27,6 +26,10 @@ #define DEBUG_TYPE "iree-util-fuse-globals" namespace mlir::iree_compiler::IREE::Util { + +#define GEN_PASS_DEF_FUSEGLOBALSPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + namespace { static llvm::raw_ostream &operator<<(llvm::raw_ostream &os, @@ -49,12 +52,8 @@ static llvm::raw_ostream &operator<<(llvm::raw_ostream &os, // util.global mutable @fused : i32 // builtin.func @foo(%arg0: i32) { // util.global.store %arg0, 
@fused : i32 -class FuseGlobalsPass : public FuseGlobalsBase { +class FuseGlobalsPass : public impl::FuseGlobalsPassBase { public: - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - } - void runOnOperation() override { auto moduleOp = getOperation(); @@ -240,8 +239,4 @@ class FuseGlobalsPass : public FuseGlobalsBase { } // namespace -std::unique_ptr> createFuseGlobalsPass() { - return std::make_unique(); -} - } // namespace mlir::iree_compiler::IREE::Util diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/HoistIntoGlobals.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/HoistIntoGlobals.cpp index aa360cb0944e..af8ae92fa8b3 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/HoistIntoGlobals.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/HoistIntoGlobals.cpp @@ -8,7 +8,6 @@ #include "iree/compiler/Dialect/Util/Analysis/Constant/OpOracle.h" #include "iree/compiler/Dialect/Util/IR/UtilOps.h" #include "iree/compiler/Dialect/Util/IR/UtilTypes.h" -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "iree/compiler/Utils/StringUtils.h" #include "llvm/Support/Debug.h" @@ -20,6 +19,10 @@ #define DEBUG_TYPE "iree-constexpr" namespace mlir::iree_compiler::IREE::Util { + +#define GEN_PASS_DEF_HOISTINTOGLOBALSPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + namespace { static llvm::cl::opt clPrintDotGraphToFile( @@ -49,8 +52,11 @@ static std::string getHoistedName(Type type) { // necessary. Either this algorithm can be made smarter or a follow-on pass // can sink globals into the program where it is profitable to reduce // working set size. -class HoistIntoGlobalsPass : public HoistIntoGlobalsBase { +class HoistIntoGlobalsPass + : public impl::HoistIntoGlobalsPassBase { public: + using Base::Base; + void getDependentDialects(DialectRegistry ®istry) const override { registerConstExprDependentDialects(registry); if (this->registerDependentDialectsFn) { @@ -59,7 +65,8 @@ class HoistIntoGlobalsPass : public HoistIntoGlobalsBase { } HoistIntoGlobalsPass(const ExprHoistingOptions &options) - : registerDependentDialectsFn(options.registerDependentDialectsFn) { + : Base(), + registerDependentDialectsFn(options.registerDependentDialectsFn) { this->maxSizeIncreaseThreshold.setValue(options.maxSizeIncreaseThreshold); } @@ -347,14 +354,9 @@ class HoistIntoGlobalsPass : public HoistIntoGlobalsBase { } // namespace -std::unique_ptr> +std::unique_ptr createHoistIntoGlobalsPass(const ExprHoistingOptions &options) { return std::make_unique(options); } -std::unique_ptr> createHoistIntoGlobalsPass() { - IREE::Util::ExprHoistingOptions options; - return std::make_unique(options); -} - } // namespace mlir::iree_compiler::IREE::Util diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/IPO.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/IPO.cpp index 91bef34017c1..8c284781e4fe 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/IPO.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/IPO.cpp @@ -11,7 +11,6 @@ #include "iree/compiler/Dialect/Util/IR/UtilDialect.h" #include "iree/compiler/Dialect/Util/IR/UtilOps.h" #include "iree/compiler/Dialect/Util/IR/UtilTraits.h" -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "iree/compiler/Utils/PassUtils.h" #include "llvm/ADT/SmallVector.h" @@ -27,6 +26,10 @@ #define DEBUG_TYPE 
"iree-util-ipo" namespace mlir::iree_compiler::IREE::Util { + +#define GEN_PASS_DEF_IPOPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + namespace { struct LocAttr { @@ -641,13 +644,8 @@ static bool isFuncEmpty(FunctionOpInterface funcOp) { } } -class IPOPass : public IPOBase { +class IPOPass : public impl::IPOPassBase { public: - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - registry.insert(); - } - void runOnOperation() override { auto moduleOp = getOperation(); @@ -705,10 +703,4 @@ class IPOPass : public IPOBase { } // namespace -std::unique_ptr> createIPOPass() { - return std::make_unique(); -} - -static PassRegistration pass; - } // namespace mlir::iree_compiler::IREE::Util diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/ImportResources.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/ImportResources.cpp index 01a3ed74572d..59fe69e82149 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/ImportResources.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/ImportResources.cpp @@ -6,7 +6,6 @@ #include -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/Debug.h" @@ -19,6 +18,9 @@ namespace mlir::iree_compiler::IREE::Util { +#define GEN_PASS_DEF_IMPORTRESOURCESPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + namespace { template @@ -60,7 +62,8 @@ static void copyFPAttrIntoBlob(AsmResourceBlob &blob, } } -class ImportResourcesPass : public ImportResourcesBase { +class ImportResourcesPass + : public impl::ImportResourcesPassBase { public: void getDependentDialects(DialectRegistry ®istry) const override { registry.insert(); @@ -193,8 +196,4 @@ class ImportResourcesPass : public ImportResourcesBase { } // namespace -std::unique_ptr> createImportResourcesPass() { - return std::make_unique(); -} - } // namespace mlir::iree_compiler::IREE::Util diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/OptimizeIntArithmetic.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/OptimizeIntArithmetic.cpp index 94b2ee237398..0c617c457931 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/OptimizeIntArithmetic.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/OptimizeIntArithmetic.cpp @@ -6,7 +6,6 @@ #include "iree/compiler/Dialect/Util/Analysis/IntegerDivisibilityAnalysis.h" #include "iree/compiler/Dialect/Util/IR/UtilDialect.h" -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "llvm/Support/Debug.h" #include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" @@ -30,6 +29,9 @@ using namespace mlir::dataflow; namespace mlir::iree_compiler::IREE::Util { +#define GEN_PASS_DEF_OPTIMIZEINTARITHMETICPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + namespace { // An index_cast from i64 to index is a no-op on targets where index is @@ -303,12 +305,7 @@ class DataFlowListener : public RewriterBase::Listener { }; class OptimizeIntArithmeticPass - : public OptimizeIntArithmeticBase { - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - registry.insert(); - } - + : public impl::OptimizeIntArithmeticPassBase { void runOnOperation() override { Operation *op = getOperation(); MLIRContext *ctx = op->getContext(); @@ -382,8 +379,4 @@ class OptimizeIntArithmeticPass } // namespace -std::unique_ptr> 
createOptimizeIntArithmeticPass() { - return std::make_unique(); -} - } // namespace mlir::iree_compiler::IREE::Util diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/PassDetail.h b/compiler/src/iree/compiler/Dialect/Util/Transforms/PassDetail.h deleted file mode 100644 index 115d20e60c4f..000000000000 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/PassDetail.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#ifndef IREE_COMPILER_DIALECT_UTIL_TRANSFORMS_PASS_DETAIL_H_ -#define IREE_COMPILER_DIALECT_UTIL_TRANSFORMS_PASS_DETAIL_H_ - -#include "mlir/IR/BuiltinOps.h" -#include "mlir/Interfaces/CallInterfaces.h" -#include "mlir/Pass/Pass.h" - -namespace mlir::iree_compiler::IREE::Util { - -#define GEN_PASS_CLASSES -#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" // IWYU pragma: keep - -} // namespace mlir::iree_compiler::IREE::Util - -#endif // IREE_COMPILER_DIALECT_UTIL_TRANSFORMS_PASS_DETAIL_H_ diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/Passes.h b/compiler/src/iree/compiler/Dialect/Util/Transforms/Passes.h index 16350b07b7e4..8f80207594ab 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/Passes.h +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/Passes.h @@ -20,22 +20,32 @@ class Value; namespace mlir::iree_compiler::IREE::Util { -std::unique_ptr> createApplyPatternsPass(); -std::unique_ptr> createCombineInitializersPass(); -std::unique_ptr> createDropCompilerHintsPass(); -std::unique_ptr> -createDumpModulePass(std::string path = ""); +#define GEN_PASS_DECL_ANNOTATEOPORDINALSPASS +#define GEN_PASS_DECL_APPLYPATTERNSPASS +#define GEN_PASS_DECL_COMBINEINITIALIZERSPASS +#define GEN_PASS_DECL_DROPCOMPILERHINTSPASS +#define GEN_PASS_DECL_DUMPMODULEPASS +// Has un-tablegen-able options (a pass pipeline) +// #define GEN_PASS_DECL_FIXEDPOINTITERATORPASS +#define GEN_PASS_DECL_FOLDGLOBALSPASS +#define GEN_PASS_DECL_FUSEGLOBALSPASS +#define GEN_PASS_DECL_HOISTINTOGLOBALSPASS +#define GEN_PASS_DECL_IPOPASS +#define GEN_PASS_DECL_IMPORTRESOURCESPASS +#define GEN_PASS_DECL_OPTIMIZEINTARITHMETICPASS +#define GEN_PASS_DECL_PROPAGATESUBRANGESPASS +#define GEN_PASS_DECL_SIMPLIFYGLOBALACCESSESPASS +#define GEN_PASS_DECL_STRIPANDSPLATCONSTANTSPASS +#define GEN_PASS_DECL_STRIPDEBUGOPSPASS +#define GEN_PASS_DECL_TESTCONVERSIONPASS +#define GEN_PASS_DECL_TESTFLOATRANGEANALYSISPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + +// Kept for compatibility +std::unique_ptr createDumpModulePass(std::string path); + std::unique_ptr> createFixedPointIteratorPass(OpPassManager pipeline); -std::unique_ptr> createFoldGlobalsPass(); -std::unique_ptr> createFuseGlobalsPass(); -std::unique_ptr> createIPOPass(); -std::unique_ptr> createOptimizeIntArithmeticPass(); -std::unique_ptr> createPropagateSubrangesPass(); -std::unique_ptr> createSimplifyGlobalAccessesPass(); -std::unique_ptr> -createStripAndSplatConstantsPass(); -std::unique_ptr> createStripDebugOpsPass(); // Expression hoisting. struct ExprHoistingOptions { @@ -49,17 +59,8 @@ struct ExprHoistingOptions { // of a single global as a result of hoisting. 
int64_t maxSizeIncreaseThreshold = 2147483647; }; -std::unique_ptr> +std::unique_ptr createHoistIntoGlobalsPass(const ExprHoistingOptions &options); -std::unique_ptr> createHoistIntoGlobalsPass(); - -// Resource Management. -std::unique_ptr> createImportResourcesPass(); - -// Debug/test passes. -std::unique_ptr> createAnnotateOpOrdinalsPass(); -std::unique_ptr> createTestConversionPass(); -std::unique_ptr> createTestFloatRangeAnalysisPass(); // Register all Passes void registerTransformPasses(); diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/Passes.td b/compiler/src/iree/compiler/Dialect/Util/Transforms/Passes.td index 8827f16039a0..fb0cf7d028a0 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/Passes.td +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/Passes.td @@ -13,44 +13,38 @@ include "mlir/Pass/PassBase.td" // Optimization and cleanup //===----------------------------------------------------------------------===// -def ApplyPatterns : Pass<"iree-util-apply-patterns", ""> { +def ApplyPatternsPass : Pass<"iree-util-apply-patterns", ""> { let summary = "Applies some risky/IREE-specific canonicalization patterns."; - let constructor = [{ - mlir::iree_compiler::IREE::Util::createApplyPatternsPass() - }]; } -def CombineInitializers : Pass<"iree-util-combine-initializers", "mlir::ModuleOp"> { +def CombineInitializersPass : Pass<"iree-util-combine-initializers", "mlir::ModuleOp"> { let summary = "Combines global initializers into one."; - let constructor = [{ - mlir::iree_compiler::IREE::Util::createCombineInitializersPass() - }]; } -def DropCompilerHints : Pass<"iree-util-drop-compiler-hints", ""> { +def DropCompilerHintsPass : Pass<"iree-util-drop-compiler-hints", ""> { let summary = "Deletes operations that have no runtime equivalent."; let description = [{ Deletes operations that have no runtime equivalent and are only used in the compiler. This should be performed after all other compiler passes. }]; - let constructor = [{ - mlir::iree_compiler::IREE::Util::createDropCompilerHintsPass() - }]; } -def DumpModule : Pass<"iree-util-dump-module", "mlir::ModuleOp"> { +def DumpModulePass : Pass<"iree-util-dump-module", "mlir::ModuleOp"> { let summary = "Dumps the module IR to the given file path."; let description = [{ Dumps the module IR to the given file path in either textual (.mlir) or binary (.mlirbc) format. Source locations remain unchanged. }]; - let constructor = [{ - mlir::iree_compiler::IREE::Util::createDumpModulePass() - }]; + let options = [ + Option<"path", "path", + "std::string", /*default=*/"", + "File path to write the module text or binary into." 
+ > + ]; } -def FixedPointIterator : Pass<"iree-util-fixed-point-iterator", ""> { +def FixedPointIteratorPass : Pass<"iree-util-fixed-point-iterator", ""> { let summary = "Iterates a sub-pipeline to a fixed point."; let constructor = [{ mlir::iree_compiler::IREE::Util::createFixedPointIteratorPass( @@ -58,63 +52,75 @@ def FixedPointIterator : Pass<"iree-util-fixed-point-iterator", ""> { }]; } -def IPO : Pass<"iree-util-ipo", "mlir::ModuleOp"> { +def IPOPass : Pass<"iree-util-ipo", "mlir::ModuleOp"> { let summary = "Performs basic inter-procedural optimization."; - let constructor = [{ - mlir::iree_compiler::IREE::Util::createIPOPass() - }]; + let dependentDialects = [ + "::mlir::arith::ArithDialect", + "::mlir::iree_compiler::IREE::Util::UtilDialect" + ]; } -def OptimizeIntArithmetic : Pass<"iree-util-optimize-int-arithmetic", ""> { +def OptimizeIntArithmeticPass : Pass<"iree-util-optimize-int-arithmetic", ""> { let summary = "Optimizes integer arithmetic using a variety of dataflow analysis and patterns."; - let constructor = [{ - mlir::iree_compiler::IREE::Util::createOptimizeIntArithmeticPass() - }]; + let dependentDialects = [ + "::mlir::arith::ArithDialect", + "::mlir::iree_compiler::IREE::Util::UtilDialect" + ]; } -def PropagateSubranges : Pass<"iree-util-propagate-subranges", "mlir::ModuleOp"> { +def PropagateSubrangesPass : Pass<"iree-util-propagate-subranges", "mlir::ModuleOp"> { let summary = "Propagates resource subranges across the program."; - let constructor = [{ - mlir::iree_compiler::IREE::Util::createPropagateSubrangesPass() - }]; + let dependentDialects = [ + "::mlir::arith::ArithDialect", + "::mlir::scf::SCFDialect", + "::mlir::iree_compiler::IREE::Util::UtilDialect" + ]; + } -def StripAndSplatConstants : +def StripAndSplatConstantsPass : Pass<"iree-util-strip-and-splat-constants", "mlir::ModuleOp"> { let summary = "Strips constant util.global ops and replaces them with splats."; - let constructor = "mlir::iree_compiler::IREE::Util::createStripAndSplatConstantsPass()"; + let dependentDialects = [ + "::mlir::iree_compiler::IREE::Util::UtilDialect" + ]; } -def StripDebugOps : Pass<"iree-util-strip-debug-ops", ""> { +def StripDebugOpsPass : Pass<"iree-util-strip-debug-ops", ""> { let summary = "Strips debug ops, like assertions."; - let constructor = [{ - mlir::iree_compiler::IREE::Util::createStripDebugOpsPass() - }]; } //===----------------------------------------------------------------------===// // Globals //===----------------------------------------------------------------------===// -def FoldGlobals : Pass<"iree-util-fold-globals", "mlir::ModuleOp"> { +def FoldGlobalsPass : Pass<"iree-util-fold-globals", "mlir::ModuleOp"> { let summary = "Folds duplicate globals and propagates constants."; - let constructor = [{ - mlir::iree_compiler::IREE::Util::createFoldGlobalsPass() - }]; + + let dependentDialects = [ + "::mlir::func::FuncDialect", + "::mlir::arith::ArithDialect", + "::mlir::iree_compiler::IREE::Util::UtilDialect" + ]; + + let statistics = [ + Statistic<"beforeFoldingGlobals", "global ops before folding", + "Number of util.global ops before folding">, + Statistic<"afterFoldingGlobals", "global ops after folding", + "Number of util.global ops after folding"> + ]; } -def FuseGlobals : Pass<"iree-util-fuse-globals", "mlir::ModuleOp"> { +def FuseGlobalsPass : Pass<"iree-util-fuse-globals", "mlir::ModuleOp"> { let summary = "Fuses correlated globals together."; - let constructor = [{ - mlir::iree_compiler::IREE::Util::createFuseGlobalsPass() - }]; + let 
dependentDialects = [ + "::mlir::iree_compiler::IREE::Util::UtilDialect" + ]; } -def HoistIntoGlobals : Pass<"iree-util-hoist-into-globals", "mlir::ModuleOp"> { +def HoistIntoGlobalsPass : Pass<"iree-util-hoist-into-globals", "mlir::ModuleOp"> { let summary = "Greedily hoists eligible constant expressions into globals."; - let constructor = [{ - mlir::iree_compiler::IREE::Util::createHoistIntoGlobalsPass() - }]; + // Note: has a custom options struct that lets you register dependent dialects let options = [ Option<"maxSizeIncreaseThreshold", "max-size-increase-threshold", "int64_t", /*default=*/"1048576", @@ -123,19 +129,16 @@ def HoistIntoGlobals : Pass<"iree-util-hoist-into-globals", "mlir::ModuleOp"> { ]; } -def SimplifyGlobalAccesses : +def SimplifyGlobalAccessesPass : InterfacePass<"iree-util-simplify-global-accesses", "mlir::CallableOpInterface"> { let summary = "Hoists loads and sinks stores to variables to decrease data dependency regions."; - let constructor = [{ - mlir::iree_compiler::IREE::Util::createSimplifyGlobalAccessesPass() - }]; } //===----------------------------------------------------------------------===// // Resource Management //===----------------------------------------------------------------------===// -def ImportResources : Pass<"iree-util-import-resources", ""> { +def ImportResourcesPass : Pass<"iree-util-import-resources", ""> { let summary = "Imports IR with arbitrary large-data into resources that IREE can manage efficiently"; let description = [{ MLIR has many interesting ways to store large constants, most of which @@ -147,38 +150,42 @@ def ImportResources : Pass<"iree-util-import-resources", ""> { is done at the source (frontend), but this pass is provided to aid transition and testing by doing a manual conversion with iree-opt. }]; - let constructor = [{ - mlir::iree_compiler::IREE::Util::createImportResourcesPass() - }]; + + let dependentDialects = [ + "::mlir::BuiltinDialect" + ]; } //===----------------------------------------------------------------------===// // Debug/test passes //===----------------------------------------------------------------------===// -def AnnotateOpOrdinals : Pass<"iree-util-annotate-op-ordinals", "mlir::ModuleOp"> { +def AnnotateOpOrdinalsPass : Pass<"iree-util-annotate-op-ordinals", "mlir::ModuleOp"> { let summary = "Annotates ops with globally unique IDs for debugging."; - let constructor = [{ - mlir::iree_compiler::IREE::Util::createAnnotateOpOrdinalsPass() - }]; } -def TestConversion : Pass<"iree-util-test-conversion", "mlir::ModuleOp"> { +def TestConversionPass : Pass<"iree-util-test-conversion", "mlir::ModuleOp"> { let summary = "Tests util dialect conversion patterns."; - let constructor = [{ - mlir::iree_compiler::IREE::Util::createTestConversionPass() - }]; + let dependentDialects = [ + "::mlir::iree_compiler::IREE::Util::UtilDialect", + "::mlir::arith::ArithDialect", + "::mlir::math::MathDialect", + "::mlir::affine::AffineDialect", + "::mlir::memref::MemRefDialect" + ]; + let options = [ + Option<"widenIntegers", "widen-integers", + "bool", /*default=*/"false", + "Tests type conversion by widening integers to i32"> + ]; } -def TestFloatRangeAnalysis : Pass<"iree-util-test-float-range-analysis", ""> { +def TestFloatRangeAnalysisPass : Pass<"iree-util-test-float-range-analysis", ""> { let summary = "Tests floating point range analysis."; let description = [{ Tests floating point range analysis by evaluating any 'iree_unregistered.test_fprange' op and setting the results on an attribute. 
}]; - let constructor = [{ - mlir::iree_compiler::IREE::Util::createTestFloatRangeAnalysisPass() - }]; } #endif // IREE_DIALECT_UTIL_PASSES diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/PropagateSubranges.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/PropagateSubranges.cpp index c5f5344037f0..ae8c1f02834c 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/PropagateSubranges.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/PropagateSubranges.cpp @@ -8,7 +8,6 @@ #include "iree/compiler/Dialect/Util/IR/UtilDialect.h" #include "iree/compiler/Dialect/Util/IR/UtilOps.h" -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "iree/compiler/Dialect/Util/Transforms/Patterns.h" #include "iree/compiler/Utils/IntegerSet.h" @@ -29,6 +28,10 @@ #define DEBUG_TYPE "iree-util-propagate-subranges" namespace mlir::iree_compiler::IREE::Util { + +#define GEN_PASS_DEF_PROPAGATESUBRANGESPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + namespace { // This pass is paired with the subrange type. Any type implementing the @@ -634,14 +637,8 @@ static void expandSubranges(Operation *op, SymbolTable &symbolTable, // are always wrapped in a subrange op, with the elision/deduplication/etc left // until cleanup. class PropagateSubrangesPass - : public PropagateSubrangesBase { + : public impl::PropagateSubrangesPassBase { public: - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - registry.insert(); - registry.insert(); - } - void runOnOperation() override { auto rootOp = getOperation(); SymbolTable symbolTable(rootOp); @@ -670,8 +667,4 @@ class PropagateSubrangesPass } // namespace -std::unique_ptr> createPropagateSubrangesPass() { - return std::make_unique(); -} - } // namespace mlir::iree_compiler::IREE::Util diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/SimplifyGlobalAccesses.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/SimplifyGlobalAccesses.cpp index 318160fe0727..c00b66f2c68d 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/SimplifyGlobalAccesses.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/SimplifyGlobalAccesses.cpp @@ -10,7 +10,6 @@ #include "iree/compiler/Dialect/Util/IR/UtilDialect.h" #include "iree/compiler/Dialect/Util/IR/UtilOps.h" #include "iree/compiler/Dialect/Util/IR/UtilTraits.h" -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Debug.h" @@ -21,6 +20,9 @@ namespace mlir::iree_compiler::IREE::Util { +#define GEN_PASS_DEF_SIMPLIFYGLOBALACCESSESPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + // Builds symbol ref set for all immutable globals in |moduleOp|. 
static DenseSet gatherImmutableGlobals(mlir::ModuleOp moduleOp) { DenseSet set; @@ -255,7 +257,7 @@ rearrangeBlockGlobalAccesses(Block &block, namespace { class SimplifyGlobalAccessesPass - : public SimplifyGlobalAccessesBase { + : public impl::SimplifyGlobalAccessesPassBase { public: void runOnOperation() override { auto callableOp = getOperation(); @@ -311,8 +313,4 @@ class SimplifyGlobalAccessesPass } // namespace -std::unique_ptr> createSimplifyGlobalAccessesPass() { - return std::make_unique(); -} - } // namespace mlir::iree_compiler::IREE::Util diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/StripAndSplatConstants.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/StripAndSplatConstants.cpp index 92cc4786f5a8..a3d73d001aa0 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/StripAndSplatConstants.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/StripAndSplatConstants.cpp @@ -8,7 +8,6 @@ #include "iree/compiler/Dialect/Util/IR/UtilDialect.h" #include "iree/compiler/Dialect/Util/IR/UtilOps.h" -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/IR/Attributes.h" @@ -18,15 +17,14 @@ namespace mlir::iree_compiler::IREE::Util { -class StripAndSplatConstantsPass - : public StripAndSplatConstantsBase { -public: - StripAndSplatConstantsPass() = default; +#define GEN_PASS_DEF_STRIPANDSPLATCONSTANTSPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - } +namespace { +class StripAndSplatConstantsPass + : public impl::StripAndSplatConstantsPassBase { +public: void runOnOperation() override { auto moduleOp = getOperation(); @@ -50,9 +48,6 @@ class StripAndSplatConstantsPass } }; -std::unique_ptr> -createStripAndSplatConstantsPass() { - return std::make_unique(); -} +} // namespace } // namespace mlir::iree_compiler::IREE::Util diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/StripDebugOps.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/StripDebugOps.cpp index e9d8fa801586..533aa8839ef3 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/StripDebugOps.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/StripDebugOps.cpp @@ -7,7 +7,6 @@ #include "iree/compiler/Dialect/Util/IR/UtilDialect.h" #include "iree/compiler/Dialect/Util/IR/UtilOps.h" #include "iree/compiler/Dialect/Util/IR/UtilTraits.h" -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Pass/Pass.h" @@ -15,9 +14,13 @@ namespace mlir::iree_compiler::IREE::Util { +#define GEN_PASS_DEF_STRIPDEBUGOPSPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + namespace { -class StripDebugOpsPass : public StripDebugOpsBase { +class StripDebugOpsPass + : public impl::StripDebugOpsPassBase { public: void runOnOperation() override { getOperation()->walk([](Operation *op) { @@ -31,8 +34,4 @@ class StripDebugOpsPass : public StripDebugOpsBase { } // namespace -std::unique_ptr> createStripDebugOpsPass() { - return std::make_unique(); -} - } // namespace mlir::iree_compiler::IREE::Util diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/TestConversion.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/TestConversion.cpp index c57cca1855ab..06a203343881 100644 --- 
a/compiler/src/iree/compiler/Dialect/Util/Transforms/TestConversion.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/TestConversion.cpp @@ -8,7 +8,6 @@ #include "iree/compiler/Dialect/Util/Conversion/MemRefToUtil/Patterns.h" #include "iree/compiler/Dialect/Util/IR/UtilDialect.h" #include "iree/compiler/Dialect/Util/IR/UtilOps.h" -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/Transforms/Passes.h" @@ -18,6 +17,9 @@ namespace mlir::iree_compiler::IREE::Util { +#define GEN_PASS_DEF_TESTCONVERSIONPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + namespace { static Value buildUnrealizedConversionCastOp(OpBuilder &builder, Type toType, @@ -26,15 +28,10 @@ static Value buildUnrealizedConversionCastOp(OpBuilder &builder, Type toType, .getResult(0); } -class TestConversionPass : public TestConversionBase { +class TestConversionPass + : public impl::TestConversionPassBase { public: - TestConversionPass() = default; - TestConversionPass(const TestConversionPass &) {} - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - } + using Base::Base; void runOnOperation() override { auto *context = &getContext(); @@ -72,16 +69,8 @@ class TestConversionPass : public TestConversionBase { return signalPassFailure(); } } - - Option widenIntegers{ - *this, "widen-integers", - llvm::cl::desc("Tests type conversion by widening integers to i32")}; }; } // namespace -std::unique_ptr> createTestConversionPass() { - return std::make_unique(); -} - } // namespace mlir::iree_compiler::IREE::Util diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/TestFloatRangeAnalysis.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/TestFloatRangeAnalysis.cpp index 13ffa8acdc37..04a2b7140312 100644 --- a/compiler/src/iree/compiler/Dialect/Util/Transforms/TestFloatRangeAnalysis.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/TestFloatRangeAnalysis.cpp @@ -8,15 +8,17 @@ #include "iree/compiler/Dialect/Util/Analysis/DFX/Solver.h" #include "iree/compiler/Dialect/Util/Analysis/DFX/State.h" #include "iree/compiler/Dialect/Util/Analysis/Explorer.h" -#include "iree/compiler/Dialect/Util/Transforms/PassDetail.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" namespace mlir::iree_compiler::IREE::Util { +#define GEN_PASS_DEF_TESTFLOATRANGEANALYSISPASS +#include "iree/compiler/Dialect/Util/Transforms/Passes.h.inc" + namespace { class TestFloatRangeAnalysisPass - : public TestFloatRangeAnalysisBase { + : public impl::TestFloatRangeAnalysisPassBase { public: void runOnOperation() override { Explorer explorer(getOperation(), TraversalAction::SHALLOW); @@ -53,8 +55,4 @@ class TestFloatRangeAnalysisPass } // namespace -std::unique_ptr> createTestFloatRangeAnalysisPass() { - return std::make_unique(); -} - } // namespace mlir::iree_compiler::IREE::Util From f2abfa8b5bdf17ad363cad0af198278c2e700113 Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Mon, 2 Dec 2024 11:45:41 -0500 Subject: [PATCH 33/54] [HAL] Add option to disable executable linking (#19028) This is useful because without disabling linking, `--iree-hal-dump-executable-binaries-to` and `--iree-hal-dump-executable-intermediates-to` will dump a single large linked file rather than one file per executable. This is expected to only ever be used for development/debugging purposes. 
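For reference, a minimal sketch of how a boolean llvm::cl::opt can gate the linking pass in a pass pipeline; the flag name example-link-executables and the pass factory below are placeholders rather than IREE's actual definitions (the real flag added by this patch is --iree-hal-link-executables, which defaults to linking enabled):

  #include <memory>
  #include "llvm/Support/CommandLine.h"
  #include "mlir/Pass/Pass.h"
  #include "mlir/Pass/PassManager.h"

  // Placeholder factory standing in for the real linking pass.
  std::unique_ptr<mlir::Pass> createExampleLinkAllExecutablesPass();

  static llvm::cl::opt<bool> clLinkExecutablesExample(
      "example-link-executables",
      llvm::cl::desc("Link executables together; disable to keep one binary "
                     "per executable when dumping."),
      llvm::cl::init(true));

  static void buildExamplePipeline(mlir::OpPassManager &passManager) {
    // Skipping the pass keeps each executable separate, so per-executable
    // dumps stay one file per executable.
    if (clLinkExecutablesExample)
      passManager.addPass(createExampleLinkAllExecutablesPass());
  }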
--- .../Dialect/HAL/Target/TargetBackend.h | 6 +++--- .../Dialect/HAL/Target/TargetOptions.cpp | 2 +- .../HAL/Transforms/LinkExecutables.cpp | 13 ++++++------ .../Dialect/HAL/Transforms/Passes.cpp | 20 +++++++++++++++---- .../compiler/Dialect/HAL/Transforms/Passes.td | 12 +++++------ .../HAL/Transforms/SerializeExecutables.cpp | 14 ++++++------- .../HAL/Transforms/TranslateExecutables.cpp | 14 ++++++------- .../Modules/HAL/Inline/Transforms/Passes.cpp | 2 +- .../Modules/HAL/Loader/Transforms/Passes.cpp | 7 ++++--- .../debugging/compile-time-regressions.md | 2 +- tests/compiler_driver/hal_executable.mlir | 2 +- tools/test/executable_configurations.mlir | 6 +++--- tools/test/executable_sources.mlir | 6 +++--- 13 files changed, 60 insertions(+), 46 deletions(-) diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/TargetBackend.h b/compiler/src/iree/compiler/Dialect/HAL/Target/TargetBackend.h index eb01b55bf2a6..b2cd634bdf78 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Target/TargetBackend.h +++ b/compiler/src/iree/compiler/Dialect/HAL/Target/TargetBackend.h @@ -51,7 +51,7 @@ namespace mlir::iree_compiler::IREE::HAL { // filter="spirv-v1.2-desktop*" // hal.executable.export @my_entry // module { ... } -// [[-iree-hal-translate-executables]] +// [[-iree-hal-translate-all-executables]] // -> hal.executable @my_exe // + hal.executable.variant @spirv-v1.1-mobile filter="spirv-v1.1-mobile*" // hal.executable.export @my_entry_1 @@ -66,9 +66,9 @@ namespace mlir::iree_compiler::IREE::HAL { // filter="spirv-v1.2-desktop*" // hal.executable.export @my_entry // module { spirv.module { ... } } -// [[-iree-hal-link-executables]] +// [[-iree-hal-link-all-executables]] // -> TODO(benvanik): linkage rules. -// [[-iree-hal-serialize-executables]] +// [[-iree-hal-serialize-all-executables]] // -> hal.executable @my_exe // + hal.executable.binary attributes { ... } // data blob... diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/TargetOptions.cpp b/compiler/src/iree/compiler/Dialect/HAL/Target/TargetOptions.cpp index 26fcae5402a6..e341232dc131 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Target/TargetOptions.cpp +++ b/compiler/src/iree/compiler/Dialect/HAL/Target/TargetOptions.cpp @@ -18,7 +18,7 @@ void TargetOptions::bindOptions(OptionsBinder &binder) { "IREE HAL executable target options"); // This function is called as part of registering the pass - // TranslateExecutablesPass. Pass registry is also staticly + // TranslateAllExecutablesPass. Pass registry is also staticly // initialized, so targetBackendsFlags needs to be here to be initialized // first. 
binder.list( diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/LinkExecutables.cpp b/compiler/src/iree/compiler/Dialect/HAL/Transforms/LinkExecutables.cpp index 89519458354f..4309e8ed9728 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/LinkExecutables.cpp +++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/LinkExecutables.cpp @@ -22,7 +22,7 @@ namespace mlir::iree_compiler::IREE::HAL { -#define GEN_PASS_DEF_LINKEXECUTABLESPASS +#define GEN_PASS_DEF_LINKALLEXECUTABLESPASS #define GEN_PASS_DEF_LINKTARGETEXECUTABLESPASS #include "iree/compiler/Dialect/HAL/Transforms/Passes.h.inc" @@ -66,13 +66,14 @@ struct LinkTargetExecutablesPass }; //===----------------------------------------------------------------------===// -// --iree-hal-link-executables +// --iree-hal-link-all-executables //===----------------------------------------------------------------------===// -struct LinkExecutablesPass - : public IREE::HAL::impl::LinkExecutablesPassBase { - using IREE::HAL::impl::LinkExecutablesPassBase< - LinkExecutablesPass>::LinkExecutablesPassBase; +struct LinkAllExecutablesPass + : public IREE::HAL::impl::LinkAllExecutablesPassBase< + LinkAllExecutablesPass> { + using IREE::HAL::impl::LinkAllExecutablesPassBase< + LinkAllExecutablesPass>::LinkAllExecutablesPassBase; void runOnOperation() override { auto moduleOp = getOperation(); diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/Passes.cpp b/compiler/src/iree/compiler/Dialect/HAL/Transforms/Passes.cpp index 46048f104920..66d9e2c7c3f6 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/Passes.cpp +++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/Passes.cpp @@ -139,6 +139,17 @@ static llvm::cl::list clPreprocessExecutablesWith{ "will fail compilation."), }; +static llvm::cl::opt clLinkExecutables{ + "iree-hal-link-executables", + llvm::cl::desc( + "Controls linking of executables. The default is to always link, " + "however disabling linking allows inspecting serialization " + "of each executable in isolation and will dump a single binary per " + "executable when used in conjunction with " + "`--iree-hal-dump-executable-binaries-to`."), + llvm::cl::init(true), +}; + } // namespace using FunctionLikeNest = @@ -410,7 +421,7 @@ void buildHALTransformPassPipeline(OpPassManager &passManager, if (compileFrom < PipelinePhase::ExecutableTargets) { passManager.addNestedPass( - IREE::HAL::createTranslateExecutablesPass({targetRegistry})); + IREE::HAL::createTranslateAllExecutablesPass({targetRegistry})); } // If debug information is requested capture the translated MLIR source text @@ -475,8 +486,9 @@ void buildHALTransformPassPipeline(OpPassManager &passManager, // example, the LLVM AOT backend may combine all executable targets for the // same architecture into a single executable and link it as a shared // library. - if (transformOptions.linkExecutables) { - passManager.addPass(IREE::HAL::createLinkExecutablesPass({targetRegistry})); + if (transformOptions.linkExecutables && clLinkExecutables) { + passManager.addPass( + IREE::HAL::createLinkAllExecutablesPass({targetRegistry})); } // If any executable variants have external objects referenced within them @@ -545,7 +557,7 @@ void buildHALTransformPassPipeline(OpPassManager &passManager, // contents not turned into a big base64 string. 
if (transformOptions.serializeExecutables) { passManager.addNestedPass( - IREE::HAL::createSerializeExecutablesPass( + IREE::HAL::createSerializeAllExecutablesPass( {&targetRegistry, targetOptions.debugLevel, targetOptions.executableIntermediatesPath, targetOptions.executableBinariesPath})); diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/Passes.td b/compiler/src/iree/compiler/Dialect/HAL/Transforms/Passes.td index 6bec7a6771bd..20d172d19ff3 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/Passes.td +++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/Passes.td @@ -376,8 +376,8 @@ def ConfigureTargetExecutableVariantsPass : ]; } -def TranslateExecutablesPass : - Pass<"iree-hal-translate-executables", "IREE::HAL::ExecutableOp"> { +def TranslateAllExecutablesPass : + Pass<"iree-hal-translate-all-executables", "IREE::HAL::ExecutableOp"> { let summary = "Translates hal.executable ops via a nested translation pipeline."; let description = [{ Runs a nested pipeline on each executable to translate its variants from @@ -435,8 +435,8 @@ def PruneExecutablesPass : }]; } -def LinkExecutablesPass : - Pass<"iree-hal-link-executables", "mlir::ModuleOp"> { +def LinkAllExecutablesPass : + Pass<"iree-hal-link-all-executables", "mlir::ModuleOp"> { let summary = "Links hal.executable ops into one or more hal.executable ops."; let description = [{ Runs a nested pipeline to link multiple `hal.executable` ops together if the @@ -488,8 +488,8 @@ def ResolveExportOrdinalsPass : ]; } -def SerializeExecutablesPass : - Pass<"iree-hal-serialize-executables", "IREE::HAL::ExecutableOp"> { +def SerializeAllExecutablesPass : + Pass<"iree-hal-serialize-all-executables", "IREE::HAL::ExecutableOp"> { let summary = "Converts hal.executable.variants to one or more hal.executable.binary ops."; let description = [{ Runs a nested pipeline on each executable to serialize its variants from diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/SerializeExecutables.cpp b/compiler/src/iree/compiler/Dialect/HAL/Transforms/SerializeExecutables.cpp index 99a7df08f83d..dffddfdc842b 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/SerializeExecutables.cpp +++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/SerializeExecutables.cpp @@ -23,7 +23,7 @@ namespace mlir::iree_compiler::IREE::HAL { -#define GEN_PASS_DEF_SERIALIZEEXECUTABLESPASS +#define GEN_PASS_DEF_SERIALIZEALLEXECUTABLESPASS #define GEN_PASS_DEF_SERIALIZETARGETEXECUTABLESPASS #include "iree/compiler/Dialect/HAL/Transforms/Passes.h.inc" @@ -96,14 +96,14 @@ struct SerializeTargetExecutablesPass }; //===----------------------------------------------------------------------===// -// --iree-hal-serialize-executables +// --iree-hal-serialize-all-executables //===----------------------------------------------------------------------===// -struct SerializeExecutablesPass - : public IREE::HAL::impl::SerializeExecutablesPassBase< - SerializeExecutablesPass> { - using IREE::HAL::impl::SerializeExecutablesPassBase< - SerializeExecutablesPass>::SerializeExecutablesPassBase; +struct SerializeAllExecutablesPass + : public IREE::HAL::impl::SerializeAllExecutablesPassBase< + SerializeAllExecutablesPass> { + using IREE::HAL::impl::SerializeAllExecutablesPassBase< + SerializeAllExecutablesPass>::SerializeAllExecutablesPassBase; void runOnOperation() override { auto executableOp = getOperation(); OpPassManager passManager(executableOp.getOperationName()); diff --git 
a/compiler/src/iree/compiler/Dialect/HAL/Transforms/TranslateExecutables.cpp b/compiler/src/iree/compiler/Dialect/HAL/Transforms/TranslateExecutables.cpp index b9b1a3187279..72ca23d9e5f8 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/TranslateExecutables.cpp +++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/TranslateExecutables.cpp @@ -23,7 +23,7 @@ namespace mlir::iree_compiler::IREE::HAL { -#define GEN_PASS_DEF_TRANSLATEEXECUTABLESPASS +#define GEN_PASS_DEF_TRANSLATEALLEXECUTABLESPASS #define GEN_PASS_DEF_TRANSLATETARGETEXECUTABLEVARIANTSPASS #include "iree/compiler/Dialect/HAL/Transforms/Passes.h.inc" @@ -75,14 +75,14 @@ struct TranslateTargetExecutableVariantsPass }; //===----------------------------------------------------------------------===// -// --iree-hal-translate-executables +// --iree-hal-translate-all-executables //===----------------------------------------------------------------------===// -struct TranslateExecutablesPass - : public IREE::HAL::impl::TranslateExecutablesPassBase< - TranslateExecutablesPass> { - using IREE::HAL::impl::TranslateExecutablesPassBase< - TranslateExecutablesPass>::TranslateExecutablesPassBase; +struct TranslateAllExecutablesPass + : public IREE::HAL::impl::TranslateAllExecutablesPassBase< + TranslateAllExecutablesPass> { + using IREE::HAL::impl::TranslateAllExecutablesPassBase< + TranslateAllExecutablesPass>::TranslateAllExecutablesPassBase; void getDependentDialects(DialectRegistry ®istry) const override { registry.insert(); diff --git a/compiler/src/iree/compiler/Modules/HAL/Inline/Transforms/Passes.cpp b/compiler/src/iree/compiler/Modules/HAL/Inline/Transforms/Passes.cpp index 68b479933178..c6fce971d993 100644 --- a/compiler/src/iree/compiler/Modules/HAL/Inline/Transforms/Passes.cpp +++ b/compiler/src/iree/compiler/Modules/HAL/Inline/Transforms/Passes.cpp @@ -72,7 +72,7 @@ void buildHALInlineStaticTransformPassPipeline( passManager.addNestedPass( IREE::HAL::createConfigureExecutablesPass({targetRegistry})); passManager.addNestedPass( - IREE::HAL::createTranslateExecutablesPass({targetRegistry})); + IREE::HAL::createTranslateAllExecutablesPass({targetRegistry})); // Inline the translated executable functions. // We preserve the executables for their metadata used during conversion. diff --git a/compiler/src/iree/compiler/Modules/HAL/Loader/Transforms/Passes.cpp b/compiler/src/iree/compiler/Modules/HAL/Loader/Transforms/Passes.cpp index 7adccce1514e..9c9acbc83d15 100644 --- a/compiler/src/iree/compiler/Modules/HAL/Loader/Transforms/Passes.cpp +++ b/compiler/src/iree/compiler/Modules/HAL/Loader/Transforms/Passes.cpp @@ -77,7 +77,7 @@ void buildHALInlineDynamicTransformPassPipeline( passManager.addNestedPass( IREE::HAL::createConfigureExecutablesPass({targetRegistry})); passManager.addNestedPass( - IREE::HAL::createTranslateExecutablesPass({targetRegistry})); + IREE::HAL::createTranslateAllExecutablesPass({targetRegistry})); //---------------------------------------------------------------------------- // Conversion @@ -91,7 +91,8 @@ void buildHALInlineDynamicTransformPassPipeline( //---------------------------------------------------------------------------- // Link executables together. - passManager.addPass(IREE::HAL::createLinkExecutablesPass({targetRegistry})); + passManager.addPass( + IREE::HAL::createLinkAllExecutablesPass({targetRegistry})); // Resolve export ordinals from nested symbol references prior to // serialization. 
@@ -99,7 +100,7 @@ void buildHALInlineDynamicTransformPassPipeline( // Serialize executables to their binary forms. passManager.addNestedPass( - IREE::HAL::createSerializeExecutablesPass( + IREE::HAL::createSerializeAllExecutablesPass( {&targetRegistry, targetOptions.debugLevel, targetOptions.executableIntermediatesPath, targetOptions.executableBinariesPath})); diff --git a/docs/website/docs/developers/debugging/compile-time-regressions.md b/docs/website/docs/developers/debugging/compile-time-regressions.md index 94e1833dcd38..f8d7a04900e6 100644 --- a/docs/website/docs/developers/debugging/compile-time-regressions.md +++ b/docs/website/docs/developers/debugging/compile-time-regressions.md @@ -193,7 +193,7 @@ See our documentation on the section on [tracing `iree-compile`](../performance/profiling-with-tracy.md#tracing-iree-compile). For compile time regressions, pay particular attention to the compilation -phases (Flow/Stream/HAL), how many times `TranslateExecutablesPass` runs, and +phases (Flow/Stream/HAL), how many times `TranslateAllExecutablesPass` runs, and if there are outlier passes that take significantly longer to run than others. Here are some previous analyses for inspiration: diff --git a/tests/compiler_driver/hal_executable.mlir b/tests/compiler_driver/hal_executable.mlir index 2e7dc2c69cd2..629061ff071e 100644 --- a/tests/compiler_driver/hal_executable.mlir +++ b/tests/compiler_driver/hal_executable.mlir @@ -1,5 +1,5 @@ // RUN: iree-compile --compile-mode=hal-executable \ -// RUN: --mlir-print-ir-after=iree-hal-serialize-executables \ +// RUN: --mlir-print-ir-after=iree-hal-serialize-all-executables \ // RUN: --iree-hal-target-backends=vmvx %s \ // RUN: --o=/dev/null 2>&1 | FileCheck %s diff --git a/tools/test/executable_configurations.mlir b/tools/test/executable_configurations.mlir index 35235c950819..444160a4e1ab 100644 --- a/tools/test/executable_configurations.mlir +++ b/tools/test/executable_configurations.mlir @@ -3,7 +3,7 @@ // RUN: --iree-hal-dump-executable-configurations-to=- | \ // RUN: iree-compile - -o /dev/null \ // RUN: --compile-mode=hal-executable \ -// RUN: --mlir-print-ir-before=iree-hal-serialize-executables 2>&1 | \ +// RUN: --mlir-print-ir-before=iree-hal-serialize-all-executables 2>&1 | \ // RUN: FileCheck %s // This test relies on piping stdout and that there is only a single @@ -21,7 +21,7 @@ // --iree-hal-dump-executable-configurations-to=configs/ | \ // ls -1 sources/ | xargs -i sh -c "iree-compile configs/{} // --compile-mode=hal-executable -// --mlir-print-ir-before=iree-hal-serialize-executables" +// --mlir-print-ir-before=iree-hal-serialize-all-executables" // // NOTE: executable configurations are not runnable: they only exist to allow // for iteration on executable translation. 
If you want to run them you need @@ -38,7 +38,7 @@ func.func @abs(%input : tensor) -> tensor { return %result : tensor } -// CHECK: IR Dump Before SerializeExecutablesPass +// CHECK: IR Dump Before SerializeAllExecutablesPass // CHECK: hal.executable public @abs_dispatch_0 // CHECK: hal.executable.variant public @vmvx_bytecode_fb // CHECK: vm.func private @abs_dispatch_0_elementwise diff --git a/tools/test/executable_sources.mlir b/tools/test/executable_sources.mlir index df5fa9468075..c24e16b63160 100644 --- a/tools/test/executable_sources.mlir +++ b/tools/test/executable_sources.mlir @@ -3,7 +3,7 @@ // RUN: --iree-hal-dump-executable-sources-to=- | \ // RUN: iree-compile - -o /dev/null \ // RUN: --compile-mode=hal-executable \ -// RUN: --mlir-print-ir-before=iree-hal-serialize-executables 2>&1 | \ +// RUN: --mlir-print-ir-before=iree-hal-serialize-all-executables 2>&1 | \ // RUN: FileCheck %s // This test relies on us piping stdout and that there's only a single @@ -19,7 +19,7 @@ // iree-compile some_input.mlir -o ignored.mlir \ // --iree-hal-target-backends=vmvx \ // --iree-hal-dump-executable-sources-to=sources/ | \ -// ls -1 sources/ | xargs -i sh -c "iree-compile sources/{} --compile-mode=hal-executable --mlir-print-ir-before=iree-hal-serialize-executables" +// ls -1 sources/ | xargs -i sh -c "iree-compile sources/{} --compile-mode=hal-executable --mlir-print-ir-before=iree-hal-serialize-all-executables" // // NOTE: executable sources are not runnable: they only exist to allow for // iteration on executable translation. If you want to run them you need @@ -36,7 +36,7 @@ func.func @abs(%input : tensor) -> (tensor) { return %result : tensor } -// CHECK: IR Dump Before SerializeExecutablesPass +// CHECK: IR Dump Before SerializeAllExecutablesPass // CHECK: hal.executable public @abs_dispatch_0 // CHECK: hal.executable.variant public @vmvx_bytecode_fb // CHECK: vm.func private @abs_dispatch_0_elementwise From 8205885e4922c401714032f591d4cf36fbec3178 Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Mon, 2 Dec 2024 09:45:21 -0800 Subject: [PATCH 34/54] Delete no longer used docker_run.sh scripts, cleanup ci.yml. (#19243) All workflows now either don't use Docker (directly) or they use `container:`, so we can delete this `docker_run.sh` script. Having `build.sh` and `test.sh` scripts runnable under Docker was nice for reproducing what the CI was doing... when the CI builds were complicated. Now, they are mostly just lists of options passed to CMake, and we can simplify that even further with CMake toolchains or presets in the future. The `ci.yml` file is in a weird spot right now. It used to be the central location for all CI jobs, but many jobs got split out as part of reducing pressure on self-hosted runners. If we add jobs back into this file we can add back the structure that was there before. The `build_tools/github_actions/` folder is pretty empty now. It would be nice to rework the https://github.com/iree-org/iree/blob/main/build_tools/github_actions/cmake_ci.py and https://github.com/iree-org/iree/blob/main/build_tools/github_actions/build_dist.py scripts at some point, but those are currently load bearing for the `main-dist` archives that we publish with releases. 
--- .github/workflows/ci.yml | 98 ------------------ build_tools/docker/docker_run.sh | 121 ----------------------- build_tools/github_actions/README.md | 2 - build_tools/github_actions/docker_run.sh | 16 --- 4 files changed, 237 deletions(-) delete mode 100755 build_tools/docker/docker_run.sh delete mode 100755 build_tools/github_actions/docker_run.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b94a4fc58698..29fa53710d07 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -41,21 +41,11 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.number || github.sha }} cancel-in-progress: true -env: - # This needs to be in env instead of the outputs of setup because it contains - # the run attempt and we want that to be the current attempt, not whatever - # attempt the setup step last ran in. - GCS_URL: https://storage.googleapis.com/iree-github-actions-${{ github.event_name == 'pull_request' && 'presubmit' || 'postsubmit' }}-artifacts/${{ github.run_id }}/${{ github.run_attempt }} - # Jobs are organized into groups and topologically sorted by dependencies jobs: setup: uses: ./.github/workflows/setup.yml - ############################### Configurations ############################### - # Jobs that build IREE in some non-default configuration - ############################################################################## - runtime: needs: setup name: "runtime :: ${{ matrix.name }}" @@ -206,86 +196,6 @@ jobs: - name: CMake - build run: cmake --build ${BUILD_DIR} -- -k 0 - ############################## Crosscompilation ############################## - # Jobs that cross-compile IREE for other platforms - ############################################################################## - - # Disabled to reduce self-hosted runners needed. 
See #17957 - # TODO(#17957): migrate to pkgci - # cross_compile_and_test: - # needs: [setup, build_all] - # if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'cross_compile_and_test') - # runs-on: - # - self-hosted # must come first - # - runner-group=${{ needs.setup.outputs.runner-group }} - # - environment=${{ needs.setup.outputs.runner-env }} - # - cpu - # - os-family=Linux - # strategy: - # matrix: - # target: - # - platform: linux - # arch: riscv_32 - # abi: ilp32d - # docker_image: "gcr.io/iree-oss/riscv@sha256:62e87bad3405d691ddba6f9be0ef44eeb60461a467c8d86f0842c81a1f97da79" - # build_script: "./build_tools/cmake/build_riscv.sh" - # test_script: "./build_tools/cmake/test_riscv.sh" - # - platform: generic - # arch: riscv_32 - # abi: ilp32 - # docker_image: "gcr.io/iree-oss/riscv@sha256:62e87bad3405d691ddba6f9be0ef44eeb60461a467c8d86f0842c81a1f97da79" - # build_script: "./build_tools/cmake/build_riscv.sh" - # test_script: "./tests/riscv32/smoke.sh" - # - platform: emscripten - # arch: wasm32 - # abi: wasm32 - # docker_image: "gcr.io/iree-oss/emscripten@sha256:2dd4c52f1bb499ab365aad0111fe5538b685d88af38636b409b0cf6a576ab214" - # build_script: "./build_tools/cmake/build_runtime_emscripten.sh" - # # No test script - # env: - # PLATFORM: ${{ matrix.target.platform }} - # ARCH: ${{ matrix.target.arch }} - # ABI: ${{ matrix.target.abi }} - # DOCKER_IMAGE: ${{ matrix.target.docker_image }} - # BUILD_SCRIPT: ${{ matrix.target.build_script }} - # TEST_SCRIPT: ${{ matrix.target.test_script }} - # INSTALL_DIR: ${{ needs.build_all.outputs.install-dir }} - # INSTALL_DIR_ARCHIVE: ${{ needs.build_all.outputs.install-dir-archive }} - # INSTALL_DIR_GCS_ARTIFACT: ${{ needs.build_all.outputs.install-dir-gcs-artifact }} - # TARGET_BUILD_DIR: build-${{ matrix.target.platform }}-${{ matrix.target.arch }} - # IREE_WRITE_REMOTE_CCACHE: ${{ needs.setup.outputs.write-caches }} - # steps: - # - name: "Checking out repository" - # uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - # - name: "Checking out runtime submodules" - # run: ./build_tools/scripts/git/update_runtime_submodules.sh - # - name: "Downloading install dir archive" - # run: gcloud storage cp "${INSTALL_DIR_GCS_ARTIFACT}" "${INSTALL_DIR_ARCHIVE}" - # - name: "Extracting install directory" - # run: tar -xf "${INSTALL_DIR_ARCHIVE}" - # - name: "Build cross-compiling target" - # run: | - # ./build_tools/github_actions/docker_run.sh \ - # --env "IREE_CCACHE_GCP_TOKEN=$(gcloud auth application-default print-access-token)" \ - # --env "IREE_WRITE_REMOTE_CCACHE=${IREE_WRITE_REMOTE_CCACHE}" \ - # --env "CCACHE_NAMESPACE=${DOCKER_IMAGE}" \ - # --env "IREE_TARGET_PLATFORM=${PLATFORM}" \ - # --env "IREE_TARGET_ARCH=${ARCH}" \ - # --env "IREE_TARGET_ABI=${ABI}" \ - # --env "IREE_TARGET_BUILD_DIR=${TARGET_BUILD_DIR}" \ - # --env "IREE_HOST_BIN_DIR=${INSTALL_DIR}/bin" \ - # "${DOCKER_IMAGE}" \ - # "${BUILD_SCRIPT}" - # - name: "Test cross-compiling target" - # if: matrix.target.test_script - # run: | - # ./build_tools/github_actions/docker_run.sh \ - # --env "IREE_TARGET_PLATFORM=${PLATFORM}" \ - # --env "IREE_TARGET_ARCH=${ARCH}" \ - # --env "IREE_TARGET_BUILD_DIR=${TARGET_BUILD_DIR}" \ - # "${DOCKER_IMAGE}" \ - # "${TEST_SCRIPT}" - ############################################################################## # Depends on all the other jobs to provide a single anchor that indicates the @@ -298,17 +208,9 @@ jobs: runs-on: ubuntu-20.04 needs: - setup - - # Accelerators - # - test_nvidia_a100 - - # Runtime build variants - runtime 
- runtime_small - runtime_tracing - - # Crosscompilation - # - cross_compile_and_test steps: - name: "Checking out repository" uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 diff --git a/build_tools/docker/docker_run.sh b/build_tools/docker/docker_run.sh deleted file mode 100755 index 7d23c56d9a27..000000000000 --- a/build_tools/docker/docker_run.sh +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright 2020 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -set -euo pipefail - -# It's convenient to have the paths inside the container match the paths -# outside. This creates an issue, however, because we pass around CMake build -# directories, which use absolute paths, so it's important that the paths match -# between runners. Doing things this way allows runners to change their working -# directory and enables local reproduction of issues. -DOCKER_CONTAINER_WORKDIR="${DOCKER_CONTAINER_WORKDIR:-/work}" - -# Sets up files and environment to enable running scripts in docker. -# In particular, does some shenanigans to enable running with the current user. -# Some of this setup is only strictly necessary for Bazel, but it doesn't hurt -# for anything else. -# Requires that DOCKER_HOST_WORKDIR and DOCKER_HOST_TMPDIR have been set -function docker_run() { - # Make the source repository available and launch containers in that - # directory. - DOCKER_RUN_ARGS=( - --mount="type=bind,source=${DOCKER_HOST_WORKDIR},dst=${DOCKER_CONTAINER_WORKDIR}" - --workdir="${DOCKER_CONTAINER_WORKDIR}" - --env "CCACHE_BASE_DIR=${DOCKER_CONTAINER_WORKDIR}" - ) - - # Delete the container after the run is complete. - DOCKER_RUN_ARGS+=(--rm) - - - # Run as the current user and group. If only it were this simple... - DOCKER_RUN_ARGS+=(--user="$(id -u):$(id -g)") - - # Use the host network stack. Improves network performance and makes it - # possible for the container to talk to localhost. - DOCKER_RUN_ARGS+=(--network="host") - - # The Docker container doesn't know about the users and groups of the host - # system. We have to tell it. This is just a mapping of IDs to names though. - # The thing that really matters is the IDs, so the key thing is that Docker - # writes files as the same ID as the current user, which we set above, but - # without the group and passwd file, lots of things get upset because they - # don't recognize the current user ID (e.g. `whoami` fails). Bazel in - # particular looks for a home directory and is not happy when it can't find - # one. - # So we make the container share the host mapping, which guarantees that the - # current user is mapped. If there was any user or group in the container - # that we cared about, this wouldn't necessarily work because the host and - # container don't necessarily map the ID to the same user. Luckily though, - # we don't. - # We don't just mount the real /etc/passwd and /etc/group because Google - # Linux workstations do some interesting stuff with user/group permissions - # such that they don't contain the information about normal users and we - # want these scripts to be runnable locally for debugging. - # Instead we dump the results of `getent` to some fake files. 
- local fake_etc_dir="${DOCKER_HOST_TMPDIR}/fake_etc" - mkdir -p "${fake_etc_dir}" - - local fake_group="${fake_etc_dir}/group" - local fake_passwd="${fake_etc_dir}/passwd" - - getent group > "${fake_group}" - getent passwd > "${fake_passwd}" - - DOCKER_RUN_ARGS+=( - --mount="type=bind,src=${fake_group},dst=/etc/group,readonly" - --mount="type=bind,src=${fake_passwd},dst=/etc/passwd,readonly" - ) - - - # Bazel stores its cache in the user home directory by default. It's - # possible to override this, but that would require changing our Bazel - # startup options, which means polluting all our scripts and making them not - # runnable locally. Instead, we give it a special home directory to write - # into. We don't just mount the user home directory (or some subset thereof) - # for two reasons: - # 1. We probably don't want Docker to just write into the user's home - # directory when running locally. - # 2. This allows us to control the device the home directory is in. Bazel - # tends to be IO bound at even moderate levels of CPU parallelism and - # the difference between a persistent SSD and a local scratch SSD can - # be huge. - local fake_home_dir="${DOCKER_HOST_TMPDIR}/fake_home" - mkdir -p "${fake_home_dir}" - - DOCKER_RUN_ARGS+=( - --mount="type=bind,src=${fake_home_dir},dst=${HOME}" - ) - - # Make gcloud credentials available if they are present. This isn't - # necessary when running in GCE but enables using this script locally with - # remote caching. - if [[ -d "${HOME}/.config/gcloud" ]]; then - DOCKER_RUN_ARGS+=( - --mount="type=bind,src=${HOME}/.config/gcloud,dst=${HOME}/.config/gcloud,readonly" - ) - fi - - # Give the container a ramdisk and set the Bazel sandbox base to point to - # it. This helps a lot with Bazel getting IO bound. Note that SANDBOX_BASE - # is a custom environment variable we translate into the corresponding Bazel - # option. - DOCKER_RUN_ARGS+=( - --mount="type=tmpfs,dst=/dev/shm" - --env SANDBOX_BASE=/dev/shm - ) - - # Some scripts need elevated permissions to control system-level scheduling. - # Since we're not using Docker for sandboxing, it is fine to run in - # privileged mode. - DOCKER_RUN_ARGS+=( - --privileged - ) - - docker run "${DOCKER_RUN_ARGS[@]}" "$@" -} - -docker_run "$@" diff --git a/build_tools/github_actions/README.md b/build_tools/github_actions/README.md index ad53fc5135aa..8ec1f83d7256 100644 --- a/build_tools/github_actions/README.md +++ b/build_tools/github_actions/README.md @@ -1,3 +1 @@ # GitHub Actions Based CI - -## Debugging releases cookbook diff --git a/build_tools/github_actions/docker_run.sh b/build_tools/github_actions/docker_run.sh deleted file mode 100755 index 68543e5a88f5..000000000000 --- a/build_tools/github_actions/docker_run.sh +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright 2020 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -# Runs docker configured for usage with GitHub Actions, translating GitHub -# Actions environment variables into generic ones and then invoking the generic -# docker_run script. 
- -set -euo pipefail - -export DOCKER_HOST_WORKDIR="${GITHUB_WORKSPACE}" -export DOCKER_HOST_TMPDIR="${RUNNER_TEMP}" - -"${GITHUB_WORKSPACE}/build_tools/docker/docker_run.sh" "$@" From 8230f41d53eeb30bf48238d47282a9ca124c3117 Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Mon, 2 Dec 2024 10:34:38 -0800 Subject: [PATCH 35/54] Add new package bisect tool using `git bisect`. (#19289) This fixes https://github.com/iree-org/iree/issues/16556. This automates the process of searching through commit history for the commit at which an `iree-compile ...` command started producing errors by leveraging https://git-scm.com/docs/git-bisect together with the uploaded artifacts in post-submit PkgCI runs. See the included README file for detailed documentation. I tested that the script works by reproducing the culprit finding from https://github.com/iree-org/iree/issues/18879. Full logs from that run (after caching installed packages): https://gist.github.com/ScottTodd/cff468a50df63b65e5c5f449fabab6af. Also tested with this other regression: https://github.com/iree-org/iree/issues/19290. ## Background on 'bisect' Let's say we have 100 git commits over the last month and we know that a program started failing to compile _at some point_ in that period: ``` o - main (broken) o o o - nightly release o o o - nightly release o ... o - 1 month ago (working) ``` We could build from source at every commit and run our test command, but this is slow, even with ccache and other tricks to speed up the build. We could also install the nightly release builds and test those, but there are gaps between them. This scripting uses the packages built by [`.github/workflows/pkgci_build_packages.yml`](https://github.com/iree-org/iree/blob/main/.github/workflows/pkgci_build_packages.yml) and uploaded to [GitHub Artifacts](https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/storing-and-sharing-data-from-a-workflow), which are retained for 90 days and can be downloaded from workflow runs triggered on `push` events, after commits are merged to main, like [on this run](https://github.com/iree-org/iree/actions/runs/12018722157): ![image](https://github.com/user-attachments/assets/2a8bbe91-6ab7-4d4a-9a21-8f4899af2616) Since we have packages for every commit*, we can do a smarter search than a linear scan. We can do a [bisect](https://en.wikipedia.org/wiki/Bisection_(software_engineering)). Since we're looking through git commits, we can have [`git bisect`](https://git-scm.com/docs/git-bisect) handle the core bisect loop as long we provide a suitable test script. \* that didn't fail the build, which is unlikely With all those pieces together, our culprit finding bisect now looks something like this, only testing O(log N) commits: ``` o - main (broken) o o <--- TEST 2: broken o - nightly release o <--- TEST 3: broken (culprit) o <--- TEST 1: working o - nightly release o ... o - 1 month ago (working) ``` ## Future work Possible areas for improvement (after we use this a few times and see how well it works): * Improve the logging - particularly redirecting stdout/stderr for compile failures so they don't clutter the main logs * Add an interactive mode * Add a mode that uses nightly releases instead of / in addition to post-submit CI artifacts, since CI artifacts need authentication to download * Speed up artifact downloads somehow. Maybe using https://github.com/elfshaker/elfshaker or some hosted mirror of the github artifacts with a pip-compatible index. 
On a good network, downloading the 60MB release files still takes 10+ seconds. * Handle missing artifacts and other infrastructure issues by using `git bisect skip` / `exit 125` * Integrate with `git bisect log` and/or `git bisect replay` (https://git-scm.com/docs/git-bisect#_bisect_log_and_bisect_replay) * Run the bisect in the cloud (GitHub Actions?) * Run the bisect in parallel or distributed across multiple machines, using a project like https://github.com/talshorer/git-dissect, https://github.com/bjackman/git-brisect, or https://github.com/hoelzro/git-pisect (these would be particularly helpful when the test step takes multiple minutes, so long as all the test machines have the required test artifacts like `.mlir` files already cached) * Collect metrics / statistics during the run (time spent per commit on downloading and testing, data downloaded) --- build_tools/pkgci/README.md | 20 +- build_tools/pkgci/bisect/README.md | 329 ++++++++++++++++++ .../pkgci/bisect/bisect_example_timestamp.sh | 43 +++ build_tools/pkgci/bisect/bisect_packages.py | 266 ++++++++++++++ build_tools/pkgci/setup_venv.py | 148 ++++++-- .../debugging/compile-time-regressions.md | 11 +- 6 files changed, 782 insertions(+), 35 deletions(-) create mode 100644 build_tools/pkgci/bisect/README.md create mode 100755 build_tools/pkgci/bisect/bisect_example_timestamp.sh create mode 100755 build_tools/pkgci/bisect/bisect_packages.py diff --git a/build_tools/pkgci/README.md b/build_tools/pkgci/README.md index 8121ba2a67a7..898da91d1826 100644 --- a/build_tools/pkgci/README.md +++ b/build_tools/pkgci/README.md @@ -1,12 +1,28 @@ # PkgCI Scripts -This directory contains scripts and configuration for the "new" CI, which +This directory contains scripts and configuration for "PkgCI", which is based on building packages and then flowing those to followon jobs. -The traditional CI attempted to do all steps as various kinds of source +The prior/traditional CI attempted to do all steps as various kinds of source builds at head vs a split package/test style of workflow. It can mostly be found in the `cmake` directory but is also scattered around. This directory generally corresponds to "pkgci_" prefixed workflows. Over time, as this CI flow takes over more of the CI pipeline, the traditional CI will be reduced to outlier jobs and policy checks. + +### Development notes + +Testing venv setup using packages: + +```bash +python3.11 ./setup_venv.py /tmp/.venv --fetch-git-ref=5b0740c97a33ed + +# Activate the venvs and test it +source /tmp/.venv/bin/activate +iree-compile --version +# IREE (https://iree.dev): +# IREE compiler version 3.1.0.dev+5b0740c97a33edce29e753b14b9ff04789afcc53 @ 5b0740c97a33edce29e753b14b9ff04789afcc53 +# LLVM version 20.0.0git +# Optimized build +``` diff --git a/build_tools/pkgci/bisect/README.md b/build_tools/pkgci/bisect/README.md new file mode 100644 index 000000000000..ba7e9153b2b1 --- /dev/null +++ b/build_tools/pkgci/bisect/README.md @@ -0,0 +1,329 @@ +# Package bisect scripting + +This scripting connects the `git bisect` tool +(https://git-scm.com/docs/git-bisect) with IREE's package builds, allowing +developers to run tests through commit history efficiently. For example, this +can be used to spot at which commit an `iree-compile` command started failing. + +At each stage of the bisect process, this bisect tool will download the IREE +packages (i.e. `iree-base-compiler` and `iree-base-runtime`) and prepend their +installed location to the `PATH` environment variable. 
If you want to run a +bisect that _does not_ need to run `iree-compile`, just use `git bisect` +directly. However, if you _do_ need to run `iree-compile`, this can save +substantial time by avoiding the need to build it from source at each test +commit. + +## Prerequisites + +### System requirements + +Requirement | Details +----------- | ------- +Linux | (at least until IREE builds packages for other systems at each commit) +`git` | https://git-scm.com/ +`gh` CLI | https://cli.github.com/ +iree-org/iree repository read access | Needed to [download workflow artifacts](https://docs.github.com/en/actions/managing-workflow-runs-and-deployments/managing-workflow-runs/downloading-workflow-artifacts). See also [obtaining commit access](https://iree.dev/developers/general/contributing/#obtaining-commit-access). +`python3.11` with `venv` support | (Version must match what PkgCI builds) `sudo apt install python3.11 python3.11-dev python3.11-venv` + +### Data requirements + +* A command to run, such as a `.mlir` file and a `iree-compile` command +* A known-working commit +* A known-broken commit + +The commit range between known-working and known-broken should be as small as +possible to limit the number of test steps. The bisect algorithm is `O(log N)` +where `N` is the number of commits in the range, but wider windows have larger +risk of something in the test environment breaking (e.g. breaking API changes, +serialized `.mlir` files not being stable, etc.). + +## Usage + +### Example + +Let's try to find the culprit commit for issue +https://github.com/iree-org/iree/issues/18879. Thanks to the detailed issue +description, we have all the data we need to run a bisect already. + +To run the bisect tool: + +1. Setup the test case by saving the input `.mlir` file and constructing a test + `.sh` script: + + ```mlir + // /tmp/issue_18879.mlir + + // This is the input program to the test script. + module { + func.func @main_graph( + // ... + ``` + + ```bash + # /tmp/issue_18879.sh + + # This is the script that will be tested at each commit. + # This script should succeed (return 0) at and prior to the `--good-ref` + # commit and should fail (return non-0) at the `--bad-ref` commit. + + # Try to keep these test scripts minimal. If the failure is in an earlier + # phase of the compiler (e.g. 'Flow' or 'Stream'), consider using + # a flag like `--compile-to=hal` to exit early on successful run instead + # of spending all the time to serialize an output `.vmfb` file. + # https://iree.dev/developers/general/developer-tips/#compiling-phase-by-phase + + iree-compile --iree-hal-target-backends=llvm-cpu -o /dev/null /tmp/issue_18879.mlir + ``` + + ```bash + # Set the script as executable. + chmod +x /tmp/issue_18879.sh + ``` + +2. Run the bisect tool, under Python 3.11: + + ```bash + # Ensure 'python' is Python 3.11, e.g. using venv + # (https://docs.python.org/3/library/venv.html): + python3.11 -m venv .venv && source .venv/bin/activate + # OR using pyenv (https://github.com/pyenv/pyenv): + # pyenv shell 3.11 + python --version + # Python 3.11.10 + + ./bisect_packages.py \ + --good-ref=f9fa934c649749b30fc4be05d9cef78eb043f0e9 \ + --bad-ref=05bbcf1385146d075829cd940a52bf06961614d0 \ + --test-script=/tmp/issue_18879.sh + + # 206b60ca59c9dbbca5769694df4714c38cecaced is the first bad commit + ``` + + As expected, the bisect agrees with the culprit mentioned on the issue: + https://github.com/iree-org/iree/issues/18879#issuecomment-2435531655. 
+ + Note that any git ref can be used, so we can use tags too: + + ```bash + ./bisect_packages.py \ + --good-ref=candidate-20241016.1048 \ + --bad-ref=candidate-20241017.1049 \ + --test-script=/tmp/issue_18879.sh + ``` + +## How the tool works + +1. The [`bisect_packages.py`](./bisect_packages.py) script is the main entry + point which sets things up and runs `git bisect` commands. +2. The bisect operation starts with + `git bisect start --no-checkout --first-parent` (flags to avoid modifying + the working tree and traversing merge commits) and then calls to + specify the commit range with `git bisect good` and `git bisect bad`. +3. The script injects wrapper code around the provided `--test-script`. First, + the wrapper script calls + [`install_packages_for_commit.py`](./install_packages_for_commit.py) to + download IREE packages built at the test commit (marked by `BISECT_HEAD`) and + install those packages into an isolated virtual environment. The wrapper + script then puts that environment at the front of `PATH`, runs the original + script, and finally forwards the original script's exit code to `git bisect`. +4. The script kicks off a `git bisect run` using the generated wrapper script, + which then proceeds to test commits between `--good-ref` and `--bad-ref`, + looking for when the test script switched from succeeding to failing. +5. After the script, the logs can be analyzed and `git bisect log` can be run + from the repository root. + +### Working directory cache + +Downloaded files and virtual environments are cached at `~/.iree/bisect` +(this path can be changed using the `--work-dir` option). Each commit tested +gets its own subfolder that contains the downloaded release artifacts and a +Python venv with those packages installed in it: + +```bash +$ tree -a ~/.iree/bisect -L 2 +/home/nod/.iree/bisect +├── 099ffd556bc5d35efcca32af51cccc061a273a91 +│   ├── iree_base_compiler-3.1.0.dev0+099ffd556bc5d35efcca32af51cccc061a273a91-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +│   ├── iree_base_runtime-3.1.0.dev0+099ffd556bc5d35efcca32af51cccc061a273a91-cp311-cp311-manylinux_2_28_x86_64.whl +│   └── .venv +├── 15006418ceb03023e8887cba87e93b499f669ad7 +│   ├── iree_compiler-0.dev1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +│   ├── iree_runtime-0.dev1-cp311-cp311-manylinux_2_28_x86_64.whl +│   └── .venv +├── 206b60ca59c9dbbca5769694df4714c38cecaced +│   ├── iree_compiler-0.dev1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +│   ├── iree_runtime-0.dev1-cp311-cp311-manylinux_2_28_x86_64.whl +│   └── .venv +├── 23c32c633c01e0237cf5f3815b6647cf01827832 +│   ├── iree_base_compiler-3.1.0.dev0+23c32c633c01e0237cf5f3815b6647cf01827832-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +│   ├── iree_base_runtime-3.1.0.dev0+23c32c633c01e0237cf5f3815b6647cf01827832-cp311-cp311-manylinux_2_28_x86_64.whl +│   └── .venv +``` + +### Wrapper script + +Here is an example of a script that wraps the original `--test-script`. 
This is +what gets passed to `git bisect run`: + +```bash +#!/bin/bash + +######################################### +###### BISECT RELEASE SCRIPT SETUP ###### +######################################### + +set -xeuo pipefail + +REF_HASH=$(git rev-parse BISECT_HEAD) +"/home/nod/.pyenv/shims/python3.11" /home/nod/dev/projects/iree/build_tools/pkgci/bisect/../setup_venv.py /home/nod/.iree/bisect/${REF_HASH}/.venv --artifact-path=/home/nod/.iree/bisect/${REF_HASH} --fetch-git-ref=${REF_HASH} +PATH="/home/nod/.iree/bisect/$REF_HASH/.venv/bin:$PATH" + +set +e + +######################################### +############ ORIGINAL SCRIPT ############ +######################################### + +iree-compile --iree-hal-target-backends=llvm-cpu -o /dev/null /home/nod/.iree/bisect/issue_18879.mlir + +######################################### +##### BISECT RELEASE SCRIPT CLEANUP ##### +######################################### + +RET_VALUE=$? +if [ $RET_VALUE -ne 0 ]; then + exit 1 +fi +``` + +### Example annotated logs + +Raw logs here: https://gist.github.com/ScottTodd/cff468a50df63b65e5c5f449fabab6af + +```bash +$ ./bisect_packages.py \ + --good-ref=candidate-20241016.1048 \ + --bad-ref=candidate-20241017.1049 \ + --test-script=/home/nod/.iree/bisect/issue_18879.sh + +Welcome to bisect_packages.py! + +------------------------------------------------------------------ +--------- Configuration ------------------------------------------ +------------------------------------------------------------------ + + Searching range : 'candidate-20241016.1048' - 'candidate-20241017.1049' + Using working directory : '/home/nod/.iree/bisect' + Using test script : '/home/nod/.iree/bisect/issue_18879.sh' + +------------------------------------------------------------------ + +------------------------------------------------------------------ +--------- Running git bisect ------------------------------------- +------------------------------------------------------------------ + +# -------------------------------------- +# Here we start to test the first commit +# -------------------------------------- + +Bisecting: 5 revisions left to test after this (roughly 3 steps) +[c7213deeb5c7abb0843088815580793b282fdc34] Produce releases for Python 3.13. 
(#18799) +running '/home/nod/.iree/bisect/bisect_run_script.sh' +++ git rev-parse BISECT_HEAD ++ REF_HASH=c7213deeb5c7abb0843088815580793b282fdc34 ++ /home/nod/dev/projects/iree/build_tools/pkgci/setup_venv_for_ref.py c7213deeb5c7abb0843088815580793b282fdc34 --work-dir /home/nod/.iree/bisect +------------------------------------------------------------------ +Installing packages for ref: c7213deeb5c7abb0843088815580793b282fdc34 + Using base working directory : '/home/nod/.iree/bisect' + +# ----------------------------------------------------- +# Here we download and install packages for that commit +# ----------------------------------------------------- +Running command to list workflow runs: + gh api -H Accept: application/vnd.github+json -H X-GitHub-Api-Version: 2022-11-28 /repos/iree-org/iree/actions/workflows/pkgci.yml/runs?head_sha=c7213deeb5c7abb0843088815580793b282fdc34 +Found workflow run: https://github.com/iree-org/iree/actions/runs/11375010806 +Found cached .whl files in artifacts dir, skipping download +Creating venv at '/home/nod/.iree/bisect/c7213deeb5c7abb0843088815580793b282fdc34/.venv' + +Running command to install dependencies: + /home/nod/.iree/bisect/c7213deeb5c7abb0843088815580793b282fdc34/.venv/bin/python -m pip install --quiet numpy sympy +Running command to install package: + /home/nod/.iree/bisect/c7213deeb5c7abb0843088815580793b282fdc34/.venv/bin/python -m pip install --quiet /home/nod/.iree/bisect/c7213deeb5c7abb0843088815580793b282fdc34/iree_compiler-0.dev1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +Running command to install package: + /home/nod/.iree/bisect/c7213deeb5c7abb0843088815580793b282fdc34/.venv/bin/python -m pip install --quiet /home/nod/.iree/bisect/c7213deeb5c7abb0843088815580793b282fdc34/iree_runtime-0.dev1-cp311-cp311-manylinux_2_28_x86_64.whl + +Checking packages with 'pip freeze': +iree-compiler @ file:///home/nod/.iree/bisect/c7213deeb5c7abb0843088815580793b282fdc34/iree_compiler-0.dev1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=4078073daae1b706361091389753a4887bfa7d4797ea66dce1d0daaa5bffc58c +iree-runtime @ file:///home/nod/.iree/bisect/c7213deeb5c7abb0843088815580793b282fdc34/iree_runtime-0.dev1-cp311-cp311-manylinux_2_28_x86_64.whl#sha256=564779699f560ba1da406c3d7d08fc75ba8b8eb2f6fc6e074e691a34bbb29bdf +mpmath==1.3.0 +numpy==2.1.3 +sympy==1.13.3 +------------------------------------------------------------------ + ++ PATH=/home/nod/.iree/bisect/c7213deeb5c7abb0843088815580793b282fdc34/.venv/bin:/usr/lib/git-core:/usr/lib/git-core:/home/nod/.pyenv/libexec:/home/nod/.pyenv/plugins/python-build/bin:/home/nod/.pyenv/plugins/pyenv-virtualenv/bin:/home/nod/.pyenv/plugins/pyenv-update/bin:/home/nod/.pyenv/plugins/pyenv-doctor/bin:/home/nod/.pyenv/shims:/home/nod/.pyenv/bin:/home/nod/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/snap/bin +# ----------------------------------------------------- +# Here we run the test script +# ----------------------------------------------------- ++ set +e ++ iree-compile --iree-hal-target-backends=llvm-cpu -o /dev/null /home/nod/.iree/bisect/issue_18879.mlir +/home/nod/.iree/bisect/issue_18879.mlir:17:11: error: operand #0 does not dominate this use + %21 = torch.operator "onnx.Resize"(%20, %none, %1) {torch.onnx.coordinate_transformation_mode = "asymmetric", torch.onnx.cubic_coeff_a = -7.500000e-01 : f32, torch.onnx.mode = "nearest", torch.onnx.nearest_mode = "floor"} : (!torch.vtensor<[1,18,14,14],f32>, 
!torch.none, !torch.vtensor<[4],f32>) -> !torch.vtensor<[1,18,56,56],f32> + ^ +/home/nod/.iree/bisect/issue_18879.mlir:17:11: note: see current operation: %144 = "tensor.extract"(%32, %1, %129, %137, %143) : (tensor<1x18x14x14xf32>, index, index, index, index) -> f32 +/home/nod/.iree/bisect/issue_18879.mlir:16:11: note: operand defined here (op in a parent region) + %20 = torch.operator "onnx.Conv"(%arg2, %2, %3) {torch.onnx.dilations = [1 : si64, 1 : si64], torch.onnx.group = 1 : si64, torch.onnx.kernel_shape = [1 : si64, 1 : si64], torch.onnx.pads = [0 : si64, 0 : si64, 0 : si64, 0 : si64], torch.onnx.strides = [1 : si64, 1 : si64]} : (!torch.vtensor<[1,72,14,14],f32>, !torch.vtensor<[18,72,1,1],f32>, !torch.vtensor<[18],f32>) -> !torch.vtensor<[1,18,14,14],f32> + ^ ++ RET_VALUE=1 ++ '[' 1 -ne 0 ']' ++ exit 1 +# -------------------------------------------------------------------- +# The test script completed, so now we proceed to test the next commit +# -------------------------------------------------------------------- +Bisecting: 2 revisions left to test after this (roughly 2 steps) +[8568efa3cceb6dbbd69e8b681436a17efcce1a74] [GPU] Adding support for opt pass plugins during AMDGPU executable serialization (#18347) + +# -------------------------------------------------------------------- +# (repeat the download packages --> run test script step for other commits) +# ... skipping ahead ... +# -------------------------------------------------------------------- + +# ----------------------------------------- +# Bisecting finished. Here are the findings +# ----------------------------------------- +206b60ca59c9dbbca5769694df4714c38cecaced is the first bad commit +commit 206b60ca59c9dbbca5769694df4714c38cecaced +Author: Ian Wood <75152913+IanWood1@users.noreply.github.com> +Date: Wed Oct 16 10:52:47 2024 -0700 + + [DispatchCreation] Extend multi-use producer fusion (#18551) + + Fuse even in cases where the most dominant op isn't fusable, but other operations would be legal to fuse. Do this by moving the fusable consumer and all transitive defs before all other consumers (if legal). + + --------- + + Signed-off-by: Ian Wood + + .github/workflows/pkgci_regression_test.yml | 4 +- + .../FuseHorizontalContractions.cpp | 61 ++--------------- + .../FuseMultiUseElementwiseProducer.cpp | 76 +++++++++++++++++----- + .../iree/compiler/DispatchCreation/FusionUtils.cpp | 33 ++++++++++ + .../iree/compiler/DispatchCreation/FusionUtils.h | 44 +++++++++++++ + .../test/fuse_multiuse_elementwise_producer.mlir | 25 +++++++ + 6 files changed, 169 insertions(+), 74 deletions(-) +bisect found first bad commit +``` + +### Development notes + +Testing bisect: + +```bash +pyenv shell 3.11 + +./bisect_packages.py \ + --good-ref=iree-3.0.0 \ + --bad-ref=iree-3.1.0rc20241122 \ + --test-script=./bisect_example_timestamp.sh + +# 5b0740c97a33edce29e753b14b9ff04789afcc53 is the first bad commit +``` diff --git a/build_tools/pkgci/bisect/bisect_example_timestamp.sh b/build_tools/pkgci/bisect/bisect_example_timestamp.sh new file mode 100755 index 000000000000..d1b61a05f45e --- /dev/null +++ b/build_tools/pkgci/bisect/bisect_example_timestamp.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# Simple example script to test bisect with. +# Fails for commits after Nov 19, 2024. 
+ +# Example usage with https://git-scm.com/docs/git-bisect: +# +# git bisect start --no-checkout --first-parent +# git bisect good iree-3.0.0 +# git bisect bad iree-3.1.0rc20241122 +# git bisect run bisect_example_timestamp.sh +# +# running 'bisect_example_timestamp.sh' +# Commit 26ef79aa7c has timestamp: 1732059549 +# Timestamp >= 1732000000, exit 1 +# Bisecting: 10 revisions left to test after this (roughly 4 steps) +# ... +# 5b0740c97a33edce29e753b14b9ff04789afcc53 is the first bad commit +# +# Example usage with ./bisect_packages.py (even though this doesn't use any +# release artifacts like `iree-compile`): +# +# ./bisect_packages.py \ +# --good-ref=iree-3.0.0 \ +# --bad-ref=iree-3.1.0rc20241122 \ +# --test-script=./bisect_example_timestamp.sh + +SHORT_HASH=$(git rev-parse --short BISECT_HEAD) +COMMIT_TIMESTAMP=$(git show --no-patch --format=%ct BISECT_HEAD) +echo "Commit ${SHORT_HASH} has timestamp: ${COMMIT_TIMESTAMP}" + +if [ "$COMMIT_TIMESTAMP" -gt "1732000000" ]; then + echo " Timestamp >= 1732000000, exit 1" + exit 1 +else + echo " Timestamp < 1732000000, exit 0" + exit 0 +fi diff --git a/build_tools/pkgci/bisect/bisect_packages.py b/build_tools/pkgci/bisect/bisect_packages.py new file mode 100755 index 000000000000..33416dbb5f54 --- /dev/null +++ b/build_tools/pkgci/bisect/bisect_packages.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python3 +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +"""Dev package bisect script. + +This connects the `git bisect` tool (https://git-scm.com/docs/git-bisect) +with IREE's package builds, allowing developers to run tests through commit +history efficiently. For example, this can be used to spot at which commit +an `iree-compile` command started failing. + +Requirements: + git (https://git-scm.com/) + gh (https://cli.github.com/) + Linux (at least until IREE builds packages for other systems at each commit) + Python 3.11 + +Example usage: + bisect_packages.py \ + --good-ref=iree-3.0.0 \ + --bad-ref=iree-3.1.0rc20241122 \ + --test-script=bisect_example_timestamp.sh +""" + + +import argparse +import os +import platform +import shutil +import subprocess +import sys +from pathlib import Path + +THIS_DIR = Path(__file__).parent.resolve() +REPO_ROOT = THIS_DIR.parent.parent.parent + + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Git release bisect tool") + # TODO(scotttodd): add --interactive mode that prompts like git bisect does + parser.add_argument( + "--good-ref", + help="The git ref (commit hash, branch name, tag name, etc.) at the lower end of the range", + required=True, + ) + parser.add_argument( + "--bad-ref", + help="The git ref (commit hash, branch name, tag name, etc.) at the upper end of the range", + required=True, + ) + parser.add_argument( + "--work-dir", + help="The working directory to use. 
Defaults to ~/.iree/bisect/", + default=Path.home() / ".iree" / "bisect", + type=Path, + ) + # TODO(scotttodd): choice between manual or script (`git bisect run`) to use + # note that a "manual" mode would need developers to run + # ```bash + # REF_HASH=$(git rev-parse BISECT_HEAD) + # python3.11 setup_venv.py \ + # $WORKDIR/$REF_HASH/.venv \ + # --artifact-path=$WORKDIR/$REF_HASH \ + # --fetch-git-ref=$REF_HASH + # source $WORKDIR/$REF_HASH/.venv/bin/activate + # ``` + parser.add_argument( + "--test-script", + help="The script to run at each commit", + required=True, + ) + parser.add_argument( + "--ignore-system-requirements", + help="Ignores system requirements like Python 3.11 and tries to run even if they are not met.", + action="store_true", + default=False, + ) + # TODO(scotttodd): --clean arg to `rm -rf` the workdir + # TODO(scotttodd): control over logging + # redirect stdout/stderr from test script separate files in the workdir? + + return parser.parse_args() + + +def check_system_requirements(ignore_system_requirements): + print("") + system_check_okay = True + + # Check for Linux. + print( + f" Current platform is '{platform.platform()}', platform.system is '{platform.system()}'." + ) + if "Linux" not in platform.system(): + print(" ERROR! platform.system must be 'Linux'.", file=sys.stderr) + system_check_okay = False + + # Check for Python 3.11. + print("") + print(f" Current Python version is '{sys.version}'. This script requires 3.11.") + if sys.version_info[:2] == (3, 11): + python311_path = "python" + else: + python311_path = shutil.which("python3.11") + if python311_path: + print(f" Found python3.11 at '{python311_path}', using that instead.") + else: + print( + " ERROR! Could not find Python version 3.11. Python version must be 3.11 to match package builds.", + file=sys.stderr, + ) + print( + " See `.github/workflows/pkgci_build_packages.yml` and `build_tools/pkgci/build_linux_packages.sh`.", + file=sys.stderr, + ) + system_check_okay = False + + # Check for 'gh'. + print("") + gh_path = shutil.which("gh") + if not gh_path: + print( + " ERROR! Could not find 'gh'. Install by following https://github.com/cli/cli#installation.", + file=sys.stderr, + ) + system_check_okay = False + else: + print(f" Found gh at '{gh_path}'.") + + if not system_check_okay: + print("") + if ignore_system_requirements: + print( + "One or more configuration issues detected, but --ignore-system-requirements is set. Continuing.", + file=sys.stderr, + ) + return + print( + "One or more configuration issues detected. Fix the reported issues or pass --ignore-system-requirements to try running anyways. 
Exiting.", + file=sys.stderr, + ) + print("") + print("------------------------------------------------------------------") + sys.exit(1) + + return python311_path + + +def main(args): + print("Welcome to bisect_packages.py!") + + print("") + print("------------------------------------------------------------------") + print("--------- Configuration ------------------------------------------") + print("------------------------------------------------------------------") + print("") + print(f" Searching range : '{args.good_ref}' - '{args.bad_ref}'") + + print(f" Using working directory : '{args.work_dir}'") + Path.mkdir(args.work_dir, parents=True, exist_ok=True) + + print(f" Using test script : '{args.test_script}'") + + python311_path = check_system_requirements(args.ignore_system_requirements) + + print("") + print("------------------------------------------------------------------") + + # Create new script in working directory that: + # * downloads the packages from the release and installs them + # * runs the original test script + bisect_run_script = args.work_dir / "bisect_run_script.sh" + with open(bisect_run_script, "w") as bisect_run_script_file: + contents = "" + contents += "#!/bin/bash\n" + + contents += "\n" + contents += "#########################################\n" + contents += "###### BISECT RELEASE SCRIPT SETUP ######\n" + contents += "#########################################\n" + contents += "\n" + contents += "set -xeuo pipefail\n" + contents += "\n" + + # Download packages for REF_HASH and install them into REF_HASH/.venv/. + contents += "REF_HASH=$(git rev-parse BISECT_HEAD)\n" + contents += f'"{python311_path}" ' + contents += str((THIS_DIR / ".." / "setup_venv.py").as_posix()) + contents += f" {args.work_dir}/" + contents += "${REF_HASH}/.venv" + contents += f" --artifact-path={args.work_dir}/" + contents += "${REF_HASH} " + contents += " --fetch-git-ref=${REF_HASH}\n" + # Prepend the venv bin dir to $PATH. This is similar to running + # `source .venv/bin/activate` + # while scoped to this process. Note that this does not modify + # $PYTHONHOME or support the `deactivate` command. + contents += f'PATH="{args.work_dir}/$REF_HASH/.venv/bin:$PATH"\n' + + contents += "\n" + # Controlled failure - don't immediately exit. See below. + contents += "set +e\n" + contents += "\n" + contents += "#########################################\n" + contents += "############ ORIGINAL SCRIPT ############\n" + contents += "#########################################\n" + contents += "\n" + + with open(args.test_script, "r") as original_script: + contents += original_script.read() + + contents += "\n" + contents += "#########################################\n" + contents += "##### BISECT RELEASE SCRIPT CLEANUP #####\n" + contents += "#########################################\n" + contents += "\n" + # Controlled failure, See `set +e` above. + # `git bisect` is looking for exit values in the 1-127 range, while + # iree-compile can exit with value 245 sometimes: + # https://git-scm.com/docs/git-bisect#_bisect_run. Here we just check + # for non-zero and normalize back to 1. + contents += "RET_VALUE=$?\n" + contents += "if [ $RET_VALUE -ne 0 ]; then\n" + contents += " exit 1\n" + contents += "fi\n" + + bisect_run_script_file.write(contents) + + os.chmod(str(bisect_run_script), 0o744) # Set as executable. 
+ + print("") + print("------------------------------------------------------------------") + print("--------- Running git bisect -------------------------------------") + print("------------------------------------------------------------------") + print("") + subprocess.check_call(["git", "bisect", "reset"], cwd=REPO_ROOT) + subprocess.check_call( + [ + "git", + "bisect", + "start", + # Just update the BISECT_HEAD reference instead of checking out the + # ref for each iteration of the bisect process. We won't be building + # from source and this script lives in the source tree, so keep the + # repository in a stable state. + # Note: scripts can access the hash via `git rev-parse BISECT_HEAD`. + "--no-checkout", + # We only care about the merge/aggregate commit when branches were + # merged. Ignore ancestors of merge commits. + "--first-parent", + ], + cwd=REPO_ROOT, + ) + subprocess.check_call(["git", "bisect", "good", args.good_ref], cwd=REPO_ROOT) + subprocess.check_call(["git", "bisect", "bad", args.bad_ref], cwd=REPO_ROOT) + subprocess.check_call( + ["git", "bisect", "run", str(bisect_run_script)], cwd=REPO_ROOT + ) + + print("") + + +if __name__ == "__main__": + main(parse_arguments()) diff --git a/build_tools/pkgci/setup_venv.py b/build_tools/pkgci/setup_venv.py index d0d51638981f..8034edbb6374 100755 --- a/build_tools/pkgci/setup_venv.py +++ b/build_tools/pkgci/setup_venv.py @@ -7,14 +7,58 @@ """Sets up a Python venv with compiler/runtime from a workflow run. -There are two modes in which to use this script: +There are several modes in which to use this script: -* Within a workflow, an artifact action will typically be used to fetch - relevant package artifacts. Specify the fetch location with - `--artifact-path=`. +* Within a workflow triggered by `workflow_call`, an artifact action will + typically be used to fetch relevant package artifacts. Specify the fetched + location with `--artifact-path=`: -* Locally, the `--fetch-gh-workflow=WORKFLOW_ID` can be used instead in order - to download and setup the venv in one step. + ```yml + - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: linux_x86_64_release_packages + path: ${{ env.PACKAGE_DOWNLOAD_DIR }} + - name: Setup venv + run: | + ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \ + --artifact-path=${PACKAGE_DOWNLOAD_DIR} + ``` + +* Within a workflow triggered by `workflow_dispatch`, pass `artifact_run_id` as + an input that developers must specify when running the workflow: + + ```yml + on: + workflow_dispatch: + inputs: + artifact_run_id: + type: string + default: "" + + ... + steps: + - name: Setup venv + run: | + ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \ + --fetch-gh-workflow=${{ inputs.artifact_run_id }} + ``` + + (Note that these two modes are often combined to allow for workflow testing) + +* Locally, the `--fetch-gh-workflow=WORKFLOW_ID` can be used to download and + setup the venv from a specific workflow run in one step: + + + ```bash + python3.11 ./build_tools/pkgci/setup_venv.py /tmp/.venv --fetch-gh-workflow=11977414405 + ``` + +* Locally, the `--fetch-git-ref=GIT_REF` can be used to download and setup the + venv from the latest workflow run for a given ref (commit) in one step: + + ```bash + python3.11 ./build_tools/pkgci/setup_venv.py /tmp/.venv --fetch-git-ref=main + ``` You must have the `gh` command line tool installed and authenticated if you will be fetching artifacts. 
@@ -34,10 +78,70 @@ import tempfile import zipfile +THIS_DIR = Path(__file__).parent.resolve() +REPO_ROOT = THIS_DIR.parent.parent + + +def parse_arguments(argv=None): + parser = argparse.ArgumentParser(description="Setup venv") + parser.add_argument( + "venv_dir", type=Path, help="Directory in which to create the venv" + ) + parser.add_argument("--artifact-path", help="Path in which to find/fetch artifacts") + + fetch_group = parser.add_mutually_exclusive_group() + fetch_group.add_argument( + "--fetch-gh-workflow", help="Fetch artifacts from a GitHub workflow" + ) + fetch_group.add_argument("--fetch-git-ref", help="Fetch artifacts for a git ref") + + parser.add_argument( + "--compiler-variant", + default="", + help="Package variant to install for the compiler ('', 'asserts')", + ) + parser.add_argument( + "--runtime-variant", + default="", + help="Package variant to install for the runtime ('', 'asserts')", + ) + args = parser.parse_args(argv) + return args + + +def get_latest_workflow_run_id_for_ref(ref: str) -> int: + print(f"Normalizing ref: {ref}") + normalized_ref = ( + subprocess.check_output(["git", "rev-parse", ref], cwd=REPO_ROOT) + .decode() + .strip() + ) + + print(f"Fetching artifacts for normalized ref: {normalized_ref}") + base_path = f"/repos/iree-org/iree" + workflow_run_args = [ + "gh", + "api", + "-H", + "Accept: application/vnd.github+json", + "-H", + "X-GitHub-Api-Version: 2022-11-28", + f"{base_path}/actions/workflows/pkgci.yml/runs?head_sha={normalized_ref}", + ] + print(f"Running command to list workflow runs:\n {' '.join(workflow_run_args)}") + workflow_run_output = subprocess.check_output(workflow_run_args) + workflow_run_json_output = json.loads(workflow_run_output) + if workflow_run_json_output["total_count"] == 0: + raise RuntimeError("Workflow did not run at this commit") + + latest_run = workflow_run_json_output["workflow_runs"][-1] + print(f"Found workflow run: {latest_run['html_url']}") + return latest_run["id"] + @functools.lru_cache def list_gh_artifacts(run_id: str) -> Dict[str, str]: - print(f"Fetching artifacts for workflow run {run_id}") + print(f"Fetching artifacts for workflow run: {run_id}") base_path = f"/repos/iree-org/iree" output = subprocess.check_output( [ @@ -87,30 +191,14 @@ def find_venv_python(venv_path: Path) -> Optional[Path]: return None -def parse_arguments(argv=None): - parser = argparse.ArgumentParser(description="Setup venv") - parser.add_argument("--artifact-path", help="Path in which to find/fetch artifacts") - parser.add_argument( - "--fetch-gh-workflow", help="Fetch artifacts from a GitHub workflow" - ) - parser.add_argument( - "--compiler-variant", - default="", - help="Package variant to install for the compiler ('', 'asserts')", - ) - parser.add_argument( - "--runtime-variant", - default="", - help="Package variant to install for the runtime ('', 'asserts')", - ) - parser.add_argument( - "venv_dir", type=Path, help="Directory in which to create the venv" - ) - args = parser.parse_args(argv) - return args - - def main(args): + # Look up the workflow run for a ref. + if args.fetch_git_ref: + latest_gh_workflow = get_latest_workflow_run_id_for_ref(args.fetch_git_ref) + args.fetch_git_ref = "" + args.fetch_gh_workflow = str(latest_gh_workflow) + return main(args) + # Make sure we have an artifact path if fetching. 
if not args.artifact_path and args.fetch_gh_workflow: with tempfile.TemporaryDirectory() as td: diff --git a/docs/website/docs/developers/debugging/compile-time-regressions.md b/docs/website/docs/developers/debugging/compile-time-regressions.md index f8d7a04900e6..76c7f40f5359 100644 --- a/docs/website/docs/developers/debugging/compile-time-regressions.md +++ b/docs/website/docs/developers/debugging/compile-time-regressions.md @@ -48,6 +48,12 @@ Building the compiler from source and using specific commits in IREE, though it typically won't let you step through changes in submodules (e.g. MLIR updates in `third_party/llvm-project/`). +#### Scripted bisecting with package artifacts + +See . + +#### Manually bisecting with source builds + **Tip**: [Configure ccache](../building/cmake-with-ccache.md) if you'll be rebuilding the compiler while bisecting @@ -71,8 +77,7 @@ git bisect bad [] An automated workflow can use `git bisect run` and a script: -```shell -# run_bisect.sh +```shell title="run_bisect.sh" git submodule update cmake --build build/ --target iree-compile # Other logic here @@ -87,7 +92,7 @@ git bisect run run_bisect.sh #### Sample: compile executable sources individually with a timeout -```bash +```bash title="run_bisect.sh" #!/bin/bash set -xeuo pipefail From b5007fa1069fe2a35061060cc95166bc94f17e81 Mon Sep 17 00:00:00 2001 From: Han-Chung Wang Date: Mon, 2 Dec 2024 13:24:02 -0800 Subject: [PATCH 36/54] [DT] Switch MaterializeEncodingIntoNop to use EncodingNopLayoutAttr. (#19294) The revision introduces the LayoutAttrInterface attribute interface and implements the codegen attribute that discards the encodings for lowering. It is used in the MaterializeEncodingIntoNop pass that turns encodings into a nop. The new interface has two methods in the patch: - `getEncodingInfo`: It takes a ranked tensor type and returns the layout. - `lowerOp`: It takes the operation, converted operands, and result types to lower the op. The layout attributes for other backends are not implemented, so the GPU/CPU materialization passes do not take the interface into account. The revision also implements an isIdentityLayout method for the MaterializeEncodingInfo struct and adds a unit test for it. To support identity layout materialization, it adds a shortcut for lowering set/unset encoding ops to packing ops. If no relayout is needed, the original source tensor value is returned. It is possible to add a folder for pack ops, but that is hard for unpack ops because they have slicing semantics that are not easy to identify. Also, creating an operation is not cheap, so we have the shortcut in the two methods.
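As a rough illustration of that shortcut, here is a small standalone C++ sketch; the struct below only mirrors the field names of MaterializeEncodingInfo and is not the actual IREE type:

```cpp
// Hypothetical stand-ins for MaterializeEncodingInfo / TileSwizzle, used only
// to illustrate what "identity layout" means for the shortcut above.
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

struct TileSwizzleModel {};

struct EncodingInfoModel {
  std::vector<int64_t> innerDimsPos;
  std::vector<int64_t> innerTileSizes;
  std::vector<int64_t> outerDimsPerm;
  std::optional<TileSwizzleModel> swizzle;
};

// Identity layout: nothing to pack, permute, or swizzle, so a lowering helper
// can return the original tensor value instead of creating pack/unpack ops.
bool isIdentityLayout(const EncodingInfoModel &info) {
  return info.innerDimsPos.empty() && info.innerTileSizes.empty() &&
         info.outerDimsPerm.empty() && !info.swizzle.has_value();
}

int main() {
  EncodingInfoModel nop;  // what a "discard the encoding" layout reports
  EncodingInfoModel packed;
  packed.innerDimsPos = {0, 1};
  packed.innerTileSizes = {16, 16};
  std::cout << isIdentityLayout(nop) << " " << isIdentityLayout(packed)
            << "\n";  // prints: 1 0
  return 0;
}
```

Since EncodingNopLayoutAttr reports an empty layout for every tensor type, the set/unset encoding lowerings hit this shortcut and simply forward their source values.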
There are no new lit tests because the tests are covered by [materialize_encoding_into_nop.mlir](https://github.com/iree-org/iree/blob/main/compiler/src/iree/compiler/Codegen/Common/test/materialize_encoding_into_nop.mlir) --------- Signed-off-by: hanhanW --- .../Common/CPU/CPUMaterializeEncodings.cpp | 4 +- .../compiler/Codegen/Common/EncodingUtils.cpp | 5 +- .../compiler/Codegen/Common/EncodingUtils.h | 33 ++++++---- .../Common/GPU/GPUMaterializeEncoding.cpp | 25 ++++---- .../Common/MaterializeEncodingIntoNop.cpp | 8 ++- .../MaterializeEncodingIntoPackUnPack.cpp | 60 +++++++++++++------ .../Dialect/Codegen/IR/IREECodegenAttrs.cpp | 17 ++++++ .../Dialect/Codegen/IR/IREECodegenAttrs.td | 20 +++++++ .../Codegen/IR/IREECodegenInterfaces.h | 1 + .../Codegen/IR/IREECodegenInterfaces.td | 40 +++++++++++++ .../Codegen/Dialect/Codegen/Utils/Utils.cpp | 8 +++ .../Codegen/Dialect/Codegen/Utils/Utils.h | 4 ++ .../Codegen/Utils/unittests/BUILD.bazel | 1 + .../Codegen/Utils/unittests/CMakeLists.txt | 1 + .../Codegen/Utils/unittests/UtilsTest.cpp | 8 +++ .../compiler/GlobalOptimization/BUILD.bazel | 1 + .../GlobalOptimization/CMakeLists.txt | 1 + .../MaterializeHomogeneousEncodings.cpp | 4 +- 18 files changed, 195 insertions(+), 46 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp b/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp index c7517d8bca1a..a7c1f00fe1d2 100644 --- a/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp @@ -8,6 +8,7 @@ #include "iree/compiler/Codegen/Common/EncodingUtils.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h" +#include "iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h" #include "iree/compiler/Codegen/Utils/Utils.h" #include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h" #include "iree/compiler/Dialect/HAL/Analysis/DeviceAnalysis.h" @@ -481,7 +482,8 @@ materializeFuncOpEncodings(FunctionOpInterface funcOp, // 3. Heuristics for cache-friendly dispatch tiling can get complex on CPU, // so it is nice that they have fewer narrow cases to consider. 
MaterializeEncodingTypeConverter typeConverter( - materializeEncodingForTarget, targetAttr, /*transposeNarrowN=*/true); + materializeEncodingForTarget, targetAttr, /*transposeNarrowN=*/true, + /*layoutAttr=*/{}); MaterializeEncodingConversionTarget target(*ctx); auto materializeEncodingValueFn = getMaterializeEncodingValueFn(targetAttr); populateMaterializeEncodingIntoPackUnPackPatterns( diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp index fd75e74a987e..0464fab4e4ad 100644 --- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp @@ -89,9 +89,10 @@ static RankedTensorType transposeIfNarrowNResult(RankedTensorType tensorType) { MaterializeEncodingTypeConverter::MaterializeEncodingTypeConverter( MaterializeEncodingFn materializeEncodingFn, - IREE::HAL::ExecutableTargetAttr targetAttr, bool transposeNarrowN) + IREE::HAL::ExecutableTargetAttr targetAttr, bool transposeNarrowN, + IREE::Codegen::LayoutAttrInterface layoutAttr) : materializeEncodingFn(materializeEncodingFn), targetAttr(targetAttr), - transposeNarrowN(transposeNarrowN) { + transposeNarrowN(transposeNarrowN), layoutAttr(layoutAttr) { addConversion([](IntegerType intType) { return intType; }); addConversion([](IndexType indexType) { return indexType; }); addConversion([](FloatType floatType) { return floatType; }); diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h index 7077fb6a05f1..a88c36803b39 100644 --- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h +++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h @@ -7,7 +7,8 @@ #ifndef IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_ENCODINGUTILS_H_ #define IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_ENCODINGUTILS_H_ -#include "iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h" +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.h" +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenTypes.h" #include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h" #include "iree/compiler/Dialect/HAL/IR/HALTypes.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" @@ -34,9 +35,13 @@ using MaterializeEncodingValueFn = /// TypeConverter to use for materializing the encoding. class MaterializeEncodingTypeConverter : public TypeConverter { public: - MaterializeEncodingTypeConverter(MaterializeEncodingFn fn, - IREE::HAL::ExecutableTargetAttr targetAttr, - bool transposeNarrowN); + MaterializeEncodingTypeConverter( + MaterializeEncodingFn fn, IREE::HAL::ExecutableTargetAttr targetAttr, + bool transposeNarrowN, IREE::Codegen::LayoutAttrInterface layoutAttr); + + const IREE::Codegen::LayoutAttrInterface &getLayoutAttr() const { + return layoutAttr; + } const MaterializeEncodingFn &getMaterializeEncodingFn() const { return materializeEncodingFn; @@ -46,6 +51,9 @@ class MaterializeEncodingTypeConverter : public TypeConverter { FailureOr getEncodingInfo(RankedTensorType type) const { + if (layoutAttr) { + return layoutAttr.getEncodingInfo(type); + } return materializeEncodingFn(type, targetAttr); } @@ -55,6 +63,13 @@ class MaterializeEncodingTypeConverter : public TypeConverter { const MaterializeEncodingFn materializeEncodingFn; const IREE::HAL::ExecutableTargetAttr targetAttr; bool transposeNarrowN = false; + // The `layoutAttr` implements the logic of encoding materialization. 
It has + // a higher priority when it is present. + // TODO(hanchung): Move the logic that takes `targetAttr` and + // `transposeNarrowN` into account to their own attribute implementation. It + // is in a transition state, so we have two paths atm. We're incrementally + // moving the logic to attributes. + const IREE::Codegen::LayoutAttrInterface layoutAttr; }; /// Conversion target to use for for materializing the encoding. @@ -86,17 +101,15 @@ class OpMaterializeEncodingPattern : public OpConversionPattern { RankedTensorType dropEncoding(RankedTensorType type); /// Utility method to convert from `set_encoding` op to `pack` operation. -/// For now this takes a `paddingValue` as input. The source is also taken -/// as input so that these could be used with `OpConversionPatterns`. -FailureOr lowerSetEncodingOpToPackOp( +/// NOTE: `source` could be returned when packing is not needed. +FailureOr lowerSetEncodingOpToPackOp( RewriterBase &rewriter, IREE::Encoding::SetEncodingOp encodingOp, Value source, const MaterializeEncodingTypeConverter &typeConverter, MaterializeEncodingValueFn materializeEncodingValueFn); /// Utility method to convert from `unset_encoding` op to `unpack` operation. -/// The source is taken as input so that these could be used with -/// `OpConversionPatterns`. -FailureOr lowerUnsetEncodingToUnpackOp( +/// NOTE: `packedValue` could be returned when unpacking is not needed. +FailureOr lowerUnsetEncodingToUnpackOp( RewriterBase &rewriter, IREE::Encoding::UnsetEncodingOp encodingOp, Value packedValue, const MaterializeEncodingTypeConverter &typeConverter, MaterializeEncodingValueFn materializeEncodingValueFn); diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp index 6debc2a8ffbc..d85686c35a51 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp @@ -315,10 +315,10 @@ struct GPUSetEncodingOpLoweringConversion ConversionPatternRewriter &rewriter) const override { auto converter = static_cast( getTypeConverter()); - auto packOp = lowerSetEncodingOpToPackOp(rewriter, encodingOp, - adaptor.getSource(), *converter, - this->materializeEncodingValueFn); - if (failed(packOp)) { + auto packedValue = lowerSetEncodingOpToPackOp( + rewriter, encodingOp, adaptor.getSource(), *converter, + this->materializeEncodingValueFn); + if (failed(packedValue)) { Type targetType = getTypeConverter()->convertType(encodingOp.getResultType()); Value result = rewriter.createOrFold( @@ -334,7 +334,7 @@ struct GPUSetEncodingOpLoweringConversion "unhandled result encoding"); } if (!maybeEncodingInfo->swizzle) { - rewriter.replaceOp(encodingOp, packOp->getResult()); + rewriter.replaceOp(encodingOp, packedValue.value()); return success(); } @@ -343,7 +343,9 @@ struct GPUSetEncodingOpLoweringConversion // Create expand_shape op to tile the innermost two dimensions. 
int origRank = encodingOp.getSourceType().getRank(); SmallVector expandShapeShape( - packOp->getDestType().getShape().take_front(origRank)); + cast(packedValue->getType()) + .getShape() + .take_front(origRank)); expandShapeShape.append( getExpandedTileShape(maybeEncodingInfo->swizzle->expandShape)); RankedTensorType expandShapeType = @@ -352,7 +354,7 @@ struct GPUSetEncodingOpLoweringConversion SmallVector reassociation = getReassociationIndices( origRank, maybeEncodingInfo->swizzle->expandShape); auto expandShapeOp = rewriter.create( - loc, expandShapeType, packOp->getResult(), reassociation); + loc, expandShapeType, packedValue.value(), reassociation); SmallVector transposePerm = llvm::to_vector(llvm::seq(0, origRank)); @@ -433,10 +435,10 @@ struct GPUUnsetEncodingOpLoweringConversion loc, unpackSrcType, transposeOp->getResult(0), reassociation); } - auto unPackOp = lowerUnsetEncodingToUnpackOp( + auto unpackedValue = lowerUnsetEncodingToUnpackOp( rewriter, unsetEncodingOp, unpackSrc, *converter, this->materializeEncodingValueFn); - if (failed(unPackOp)) { + if (failed(unpackedValue)) { Type targetType = getTypeConverter()->convertType(unsetEncodingOp.getResultType()); Value result = rewriter.createOrFold(loc, targetType, @@ -444,7 +446,7 @@ struct GPUUnsetEncodingOpLoweringConversion rewriter.replaceOp(unsetEncodingOp, result); return success(); } - rewriter.replaceOp(unsetEncodingOp, unPackOp->getResult()); + rewriter.replaceOp(unsetEncodingOp, unpackedValue.value()); return success(); } }; @@ -559,7 +561,8 @@ materializeFuncOpEncodings(FunctionOpInterface funcOp, // 3. Heuristics for cache-friendly dispatch tiling are internal to the GPU // runtime, so we don't need a simplification at that level either. MaterializeEncodingTypeConverter typeConverter( - materializeEncodingForTarget, targetAttr, /*transposeNarrowN=*/false); + materializeEncodingForTarget, targetAttr, /*transposeNarrowN=*/false, + /*layoutAttr=*/{}); MaterializeEncodingConversionTarget target(*ctx); MaterializeEncodingValueFn materializeEncodingValueFn = [](RankedTensorType, OpBuilder, diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoNop.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoNop.cpp index 32eb822c189d..f9e1fc53bc94 100644 --- a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoNop.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoNop.cpp @@ -7,6 +7,8 @@ #include "iree/compiler/Codegen/Common/EncodingUtils.h" #include "iree/compiler/Codegen/Common/PassUtils.h" #include "iree/compiler/Codegen/Common/Passes.h" +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" #include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h" #include "iree/compiler/Dialect/HAL/IR/HALTypes.h" #include "mlir/Dialect/MemRef/Transforms/Transforms.h" @@ -28,7 +30,8 @@ namespace { struct MaterializeEncodingIntoNopPass final : impl::MaterializeEncodingIntoNopPassBase { void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); + registry.insert(); } void runOnOperation() override { @@ -47,7 +50,8 @@ struct MaterializeEncodingIntoNopPass final RewritePatternSet materializeEncodingPattern(context); MaterializeEncodingTypeConverter typeConverter( materializeEncodingFn, IREE::HAL::ExecutableTargetAttr(), - /*transposeNarrowN=*/false); + /*transposeNarrowN=*/false, + IREE::Codegen::EncodingNopLayoutAttr::get(context)); 
MaterializeEncodingConversionTarget target(*context); populateMaterializeEncodingIntoPackUnPackPatterns( materializeEncodingPattern, typeConverter, materializeEncodingValueFn); diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp index fc3bb45c8be6..4d36b53af00c 100644 --- a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp @@ -10,12 +10,14 @@ #include "iree/compiler/Codegen/Common/EncodingUtils.h" #include "iree/compiler/Codegen/Common/Passes.h" +#include "iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h" #include "iree/compiler/Codegen/Utils/Utils.h" #include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h" #include "iree/compiler/Dialect/Flow/IR/FlowOps.h" #include "iree/compiler/Dialect/HAL/IR/HALTypes.h" #include "iree/compiler/Dialect/Util/IR/UtilOps.h" #include "llvm/ADT/SmallVectorExtras.h" +#include "llvm/Support/LogicalResult.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" @@ -119,12 +121,7 @@ static void transposeInPlace(MaterializeEncodingInfo &info) { // to `pack` and `unpack` operations respectively. //===---------------------------------------------------------------------===// -/// TODO(hanchung): Move the implementation to EncodingUtils.cpp. It is not -/// moved because it needs some cleanup for this file. E.g., `getPaddingValue` -/// is no longer needed. Ideally we should move CPU specific patterns (e.g., -/// lowerContractionOpWithEncoding, etc) to the CPUMaterializeEncoding file; -/// move general patterns to EncodingUtils, and retire this file. -FailureOr lowerSetEncodingOpToPackOp( +FailureOr lowerSetEncodingOpToPackOp( RewriterBase &rewriter, IREE::Encoding::SetEncodingOp encodingOp, Value source, const MaterializeEncodingTypeConverter &typeConverter, MaterializeEncodingValueFn materializeEncodingValueFn) { @@ -135,6 +132,11 @@ FailureOr lowerSetEncodingOpToPackOp( return rewriter.notifyMatchFailure(encodingOp, "unhandled result encoding"); } + // Shortcut to avoid creating new operations. + if (IREE::Codegen::isIdentityLayout(encodingInfo.value())) { + return source; + } + auto encoding = IREE::Encoding::getEncodingAttr(resultType); if (!encoding) { return failure(); @@ -160,14 +162,14 @@ FailureOr lowerSetEncodingOpToPackOp( encodingInfo->outerDimsPerm); auto emptyOp = rewriter.create(loc, resultDims, resultType.getElementType()); - return rewriter.create( - loc, source, emptyOp, encodingInfo->innerDimsPos, *innerTileSizesOfr, - paddingValue, encodingInfo->outerDimsPerm); + return rewriter + .create(loc, source, emptyOp, encodingInfo->innerDimsPos, + *innerTileSizesOfr, paddingValue, + encodingInfo->outerDimsPerm) + .getResult(); } -/// TODO(hanchung): Move the implementation to EncodingUtils.cpp. See the reason -/// in the implementation comment of lowerSetEncodingToPackOp method. 
-FailureOr lowerUnsetEncodingToUnpackOp( +FailureOr lowerUnsetEncodingToUnpackOp( RewriterBase &rewriter, IREE::Encoding::UnsetEncodingOp encodingOp, Value packedValue, const MaterializeEncodingTypeConverter &typeConverter, MaterializeEncodingValueFn materializeEncodingValueFn) { @@ -177,6 +179,12 @@ FailureOr lowerUnsetEncodingToUnpackOp( if (failed(encodingInfo)) { return rewriter.notifyMatchFailure(encodingOp, "unhandled source encoding"); } + + // Shortcut to avoid creating new operations. + if (IREE::Codegen::isIdentityLayout(encodingInfo.value())) { + return packedValue; + } + auto encoding = IREE::Encoding::getEncodingAttr(sourceType); if (typeConverter.getTransposeNarrowN() && isNarrowNResult(encoding)) { transposeInPlace(*encodingInfo); @@ -194,9 +202,11 @@ FailureOr lowerUnsetEncodingToUnpackOp( return rewriter.notifyMatchFailure( encodingOp, "failed to generate runtime tile size query"); } - return rewriter.create( - loc, packedValue, emptyOp, encodingInfo->innerDimsPos, *innerTileSizesOfr, - encodingInfo->outerDimsPerm); + return rewriter + .create(loc, packedValue, emptyOp, + encodingInfo->innerDimsPos, *innerTileSizesOfr, + encodingInfo->outerDimsPerm) + .getResult(); } /// Utility method to convert `tensor.empty` with encoding to a `tensor.empty` @@ -609,7 +619,7 @@ struct SetEncodingOpToPackOpConversion rewriter.replaceOp(encodingOp, result); return success(); } - rewriter.replaceOp(encodingOp, packOp->getResult()); + rewriter.replaceOp(encodingOp, packOp.value()); return success(); } }; @@ -625,10 +635,10 @@ struct UnsetEncodingOpToUnPackOpConversion ConversionPatternRewriter &rewriter) const override { auto converter = static_cast( this->getTypeConverter()); - auto unpackOp = lowerUnsetEncodingToUnpackOp( + auto unpackedValue = lowerUnsetEncodingToUnpackOp( rewriter, encodingOp, adaptor.getSource(), *converter, this->materializeEncodingValueFn); - if (failed(unpackOp)) { + if (failed(unpackedValue)) { Type targetType = getTypeConverter()->convertType(encodingOp.getResultType()); Value result = rewriter.createOrFold( @@ -636,7 +646,7 @@ struct UnsetEncodingOpToUnPackOpConversion rewriter.replaceOp(encodingOp, result); return success(); } - rewriter.replaceOp(encodingOp, unpackOp->getResult()); + rewriter.replaceOp(encodingOp, unpackedValue.value()); return success(); } }; @@ -734,6 +744,18 @@ class MaterializeContractionOp auto converter = static_cast( this->getTypeConverter()); + + if (auto layoutAttr = converter->getLayoutAttr()) { + SmallVector convertedResTypes; + for (auto init : op.getDpsInits()) { + convertedResTypes.push_back(converter->convertType(init.getType())); + } + Operation *newOp = + layoutAttr.lowerOp(rewriter, op, convertedResTypes, operands); + rewriter.replaceOp(op, newOp->getResults()); + return success(); + } + // TODO(hanchung): This is a transition state for moving the implementation // details to backend attributes. We won't need the function type argument // after all the backends that support encodings implement the attribute. 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp index b5403a2b1f88..3def02aee6df 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp @@ -8,11 +8,13 @@ #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.h" +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenTypes.h" #include "llvm/ADT/TypeSwitch.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Transform/IR/TransformOps.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" +#include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/IR/DialectImplementation.h" #include "mlir/IR/StorageUniquerSupport.h" @@ -460,6 +462,21 @@ int64_t WorkgroupMappingAttr::getRelativeIndex() const { return getMappingId(); } +//===---------------------------------------------------------------------===// +// iree_codegen.encoding_nop_layout +//===---------------------------------------------------------------------===// + +MaterializeEncodingInfo +EncodingNopLayoutAttr::getEncodingInfo(RankedTensorType type) const { + return MaterializeEncodingInfo{}; +} + +Operation *EncodingNopLayoutAttr::lowerOp(OpBuilder &b, Operation *op, + TypeRange convertedResTypes, + ValueRange convertedOperands) const { + return clone(b, op, convertedResTypes, convertedOperands); +} + //===----------------------------------------------------------------------===// // Initialize attributes //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td index 3086c09b2069..26b37dd07e24 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td @@ -418,4 +418,24 @@ def IREECodegen_ExportConfig : AttrDef let genVerifyDecl = 1; } +//===---------------------------------------------------------------------===// +// iree_codegen.encoding_layout +//===---------------------------------------------------------------------===// + +def IREECodegen_EncodingNopLayoutAttr : + AttrDef + ]> { + let mnemonic = "encoding_nop_layout"; + let summary = "An attribute with implementation that treats encoding as nop."; + let description = [{ + An attribute that implements the interface methods that discards the + encodings. It can be a default attribute when a backend does not implement + encoding details. 
+ }]; +} + #endif // IREE_COMPILER_CODEGEN_DIALECT_IREECODEGENATTRS diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.h b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.h index b8a026db441c..c35058fd46ba 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.h @@ -7,6 +7,7 @@ #ifndef IREE_COMPILER_CODEGEN_DIALECT_CODEGEN_IREECODEGENINTERFACES_H_ #define IREE_COMPILER_CODEGEN_DIALECT_CODEGEN_IREECODEGENINTERFACES_H_ +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenTypes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.td b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.td index 2f8dffb39792..36d5d3db31ec 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.td @@ -116,4 +116,44 @@ def IREECodegen_LoweringConfigAttrInterface : ]; } +def IREECodegen_LayoutAttrInterface : + AttrInterface<"LayoutAttrInterface"> { + let cppNamespace = "::mlir::iree_compiler::IREE::Codegen"; + let description = [{ + An interface that collects a set of methods for encoding materialization. + }]; + + let methods = [ + InterfaceMethod< + /*desc=*/[{ + Returns the layout of materialized encoding for a tensor type. + }], + /*retTy=*/"::mlir::iree_compiler::IREE::Codegen::MaterializeEncodingInfo", + /*methodName=*/"getEncodingInfo", + /*args=*/(ins "::mlir::RankedTensorType":$type), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + assert(false && "unimplemented interface method"); + return MaterializeEncodingInfo{}; + }] + >, + InterfaceMethod< + /*desc=*/[{ + Returns the layout of materialized encoding for a tensor type. + }], + /*retTy=*/"::mlir::Operation *", + /*methodName=*/"lowerOp", + /*args=*/(ins "::mlir::OpBuilder &":$b, + "::mlir::Operation *":$op, + "::mlir::TypeRange":$convertedResTypes, + "::mlir::ValueRange":$convertedOperands), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + assert(false && "unimplemented interface method"); + return nullptr; + }] + > + ]; +} + #endif // IREE_COMPILER_CODEGEN_DIALECT_CODEGEN_IREECODEGENINTERFACES diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.cpp index 4a12f7013417..7976b7ed8ba6 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.cpp @@ -244,6 +244,14 @@ deserializeEncodingInfo(DictionaryAttr attr) { return info; } +bool isIdentityLayout(const MaterializeEncodingInfo &info) { + // It is not an identity layout if swizzle is present. The swizzle is an + // optional variable. User should not set the field when they do not need + // swizzle. 
+ return info.innerDimsPos.empty() && info.innerTileSizes.empty() && + info.outerDimsPerm.empty() && !info.swizzle; +} + SmallVector getExpandedTileShape(const TileSwizzle::ExpandShapeType &expandShape) { SmallVector result; diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h index b1997c1b91fd..8498a95e11a3 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h @@ -57,6 +57,10 @@ DictionaryAttr serializeEncodingInfo(MLIRContext *ctx, std::optional deserializeEncodingInfo(DictionaryAttr attr); +/// Returns true if the `info` denotes an identity layout, i.e., there is no +/// relayout requirement. +bool isIdentityLayout(const MaterializeEncodingInfo &info); + /// Concatenates the vectors. SmallVector getExpandedTileShape(const TileSwizzle::ExpandShapeType &expandShape); diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/unittests/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/unittests/BUILD.bazel index abe11581e12c..8159dea713f1 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/unittests/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/unittests/BUILD.bazel @@ -16,6 +16,7 @@ iree_compiler_cc_test( testonly = True, srcs = ["UtilsTest.cpp"], deps = [ + "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect", "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils", "//compiler/src/iree/testing:gtest_main", "@com_google_googletest//:gtest", diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/unittests/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/unittests/CMakeLists.txt index bf20bd2bad9d..6624ee4fcee2 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/unittests/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/unittests/CMakeLists.txt @@ -19,6 +19,7 @@ iree_cc_test( MLIRIR gmock gtest + iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect iree::compiler::Codegen::Dialect::Codegen::Utils iree::testing::gtest_main ) diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/unittests/UtilsTest.cpp b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/unittests/UtilsTest.cpp index 82f482761da7..418626900ec9 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/unittests/UtilsTest.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils/unittests/UtilsTest.cpp @@ -7,6 +7,7 @@ #include #include +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenTypes.h" #include "iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/Attributes.h" @@ -186,5 +187,12 @@ TEST(MaterializeEncodingInfo, Deserialization) { EXPECT_TRUE(deserializeEncodingInfo(b.getDictionaryAttr(items)).has_value()); } +TEST(MaterializeEncodingInfo, IdentityLayout) { + MaterializeEncodingInfo info; + EXPECT_TRUE(isIdentityLayout(info)); + info.swizzle = TileSwizzle(); + EXPECT_FALSE(isIdentityLayout(info)); +} + } // namespace } // namespace mlir::iree_compiler::IREE::Codegen diff --git a/compiler/src/iree/compiler/GlobalOptimization/BUILD.bazel b/compiler/src/iree/compiler/GlobalOptimization/BUILD.bazel index 3797824fa122..d85310e8dfe4 100644 --- a/compiler/src/iree/compiler/GlobalOptimization/BUILD.bazel +++ 
b/compiler/src/iree/compiler/GlobalOptimization/BUILD.bazel @@ -78,6 +78,7 @@ iree_compiler_cc_library( "//compiler/src/iree/compiler/Codegen/Common", "//compiler/src/iree/compiler/Codegen/Common/CPU:CommonCPUPasses", "//compiler/src/iree/compiler/Codegen/Common/GPU:CommonGPUPasses", + "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect", "//compiler/src/iree/compiler/Dialect/Encoding/IR", "//compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow", "//compiler/src/iree/compiler/Dialect/Flow/IR", diff --git a/compiler/src/iree/compiler/GlobalOptimization/CMakeLists.txt b/compiler/src/iree/compiler/GlobalOptimization/CMakeLists.txt index 70bd927bfc7e..9ca16eed433d 100644 --- a/compiler/src/iree/compiler/GlobalOptimization/CMakeLists.txt +++ b/compiler/src/iree/compiler/GlobalOptimization/CMakeLists.txt @@ -93,6 +93,7 @@ iree_cc_library( iree::compiler::Codegen::Common iree::compiler::Codegen::Common::CPU::CommonCPUPasses iree::compiler::Codegen::Common::GPU::CommonGPUPasses + iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect iree::compiler::Dialect::Encoding::IR iree::compiler::Dialect::Flow::Conversion::TensorToFlow iree::compiler::Dialect::Flow::IR diff --git a/compiler/src/iree/compiler/GlobalOptimization/MaterializeHomogeneousEncodings.cpp b/compiler/src/iree/compiler/GlobalOptimization/MaterializeHomogeneousEncodings.cpp index 4ce2d92d5748..adcc12977bad 100644 --- a/compiler/src/iree/compiler/GlobalOptimization/MaterializeHomogeneousEncodings.cpp +++ b/compiler/src/iree/compiler/GlobalOptimization/MaterializeHomogeneousEncodings.cpp @@ -7,6 +7,7 @@ #include "iree/compiler/Codegen/Common/CPU/Passes.h" #include "iree/compiler/Codegen/Common/GPU/Passes.h" #include "iree/compiler/Codegen/Common/Passes.h" +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" #include "iree/compiler/Dialect/HAL/Analysis/DeviceAnalysis.h" #include "iree/compiler/Dialect/HAL/IR/HALDialect.h" #include "iree/compiler/Dialect/HAL/IR/HALOps.h" @@ -44,7 +45,8 @@ class MaterializeHomogeneousEncodingsPass MaterializeHomogeneousEncodingsPass> { public: void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); + registry.insert(); } void addNopPipeline(OpPassManager &passManager) { From 4ee7d600c9ef024f85c7f6860abdebc1a964f7bc Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Mon, 2 Dec 2024 13:48:14 -0800 Subject: [PATCH 37/54] Swap huggingface URLs for GitHub URLs in net_test.py. (#19341) This should help the test pass on Windows runners (https://github.com/iree-org/iree/actions/runs/12116330595/job/33776475671#step:10:1818) and other environments that may have trouble accessing huggingface. 
Test Windows run: https://github.com/iree-org/iree/actions/runs/12126633267/job/33809139469 --- .../python/test/build_api/net_test.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/compiler/bindings/python/test/build_api/net_test.py b/compiler/bindings/python/test/build_api/net_test.py index 6e10c7b4c231..8899065be1ff 100644 --- a/compiler/bindings/python/test/build_api/net_test.py +++ b/compiler/bindings/python/test/build_api/net_test.py @@ -5,25 +5,26 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import io -import os from pathlib import Path import tempfile import unittest from iree.build import * -from iree.build.executor import BuildContext -from iree.build.test_actions import ExecuteOutOfProcessThunkAction TEST_URL = None -TEST_URL_1 = "https://huggingface.co/google-bert/bert-base-cased/resolve/cd5ef92a9fb2f889e972770a36d4ed042daf221e/tokenizer.json" -TEST_URL_2 = "https://huggingface.co/google-bert/bert-base-cased/resolve/cd5ef92a9fb2f889e972770a36d4ed042daf221e/tokenizer_config.json" +# Arbitrary URLs to download from via HTTP requests. These should require no +# authentication to access and should ideally sit behind a CDN that can handle +# random CI and developer traffic. We could also mock the fetching to make the +# tests hermetic. +TEST_URL_1 = "https://raw.githubusercontent.com/iree-org/iree/82724905d64eebb2f62bcc0e41626a7b5156fd8f/.gitignore" +TEST_URL_2 = "https://raw.githubusercontent.com/iree-org/iree/82724905d64eebb2f62bcc0e41626a7b5156fd8f/.gitmodules" @entrypoint -def tokenizer_via_http(): +def file_via_http(): return fetch_http( - name="tokenizer.json", + name="file.txt", url=TEST_URL, ) @@ -43,7 +44,7 @@ def test_fetch_http(self): out = None err = None global TEST_URL - path = self.output_path / "genfiles" / "tokenizer_via_http" / "tokenizer.json" + path = self.output_path / "genfiles" / "file_via_http" / "file.txt" def run(): nonlocal out @@ -53,7 +54,7 @@ def run(): err_io = io.StringIO() iree_build_main( args=[ - "tokenizer_via_http", + "file_via_http", "--output-dir", str(self.output_path), "--test-force-console", From 0f21989438e5c422287882cb67a315329b7b744e Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Mon, 2 Dec 2024 18:15:37 -0600 Subject: [PATCH 38/54] [Codegen][GPU] Add range information to GPU dispatch IDs (#17707) First, this patch implements InferIntRangeInterface for hal.interface.workgroup.{size,id,count} using a local upper_bound attribute. Then, it adds a -iree-codegen-gpu-propagate-dispatch-size-bounds pass that adds these upper_bounds identifiers to the interface.workgroup operations and to gpu.thread_id based on static information available late in the codegen pipeline. Then, it uses -optimize-int-arithmetic to optimize indexing after -lower-affine, getting rid of a bunch of "if the input's negative" logic that isn't actually needed in many of our kernels. It also ensures that these upper_bound values propagate to LLVM. 
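To sketch why those bounds help, here is a standalone C++ toy of the interval reasoning (plain structs, not MLIR's ConstantIntRanges API); the concrete bounds below are borrowed from the static test case added in this patch:

```cpp
// Toy interval arithmetic: once workgroup IDs/sizes and thread IDs carry known
// non-negative ranges, an index expression built from them can be proven
// non-negative, so the signed div/rem lowerings need no negative-operand fixups.
#include <cstdint>
#include <iostream>

struct Range {
  int64_t min;
  int64_t max;
};

// Valid for non-negative ranges only, which is all we need here.
Range mul(Range a, Range b) { return {a.min * b.min, a.max * b.max}; }
Range add(Range a, Range b) { return {a.min + b.min, a.max + b.max}; }

int main() {
  Range workgroupId{0, 31};   // hal.interface.workgroup.id[0] upper_bound 32
  Range workgroupSize{64, 64};
  Range threadId{0, 63};      // gpu.thread_id x upper_bound 64

  // Global index = workgroup_id * workgroup_size + thread_id.
  Range globalIndex = add(mul(workgroupId, workgroupSize), threadId);
  std::cout << "[" << globalIndex.min << ", " << globalIndex.max << "]\n";
  // Prints [0, 2047]; a lower bound of 0 is what lets later passes drop the
  // "if the input's negative" branches mentioned above.
  return 0;
}
```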
--------- Signed-off-by: Krzysztof Drewniak Co-authored-by: Jakub Kuderski --- .../compiler/Codegen/Common/GPU/BUILD.bazel | 1 + .../Codegen/Common/GPU/CMakeLists.txt | 1 + .../GPU/GPUPropagateDispatchSizeBounds.cpp | 103 +++++++++++++++ .../compiler/Codegen/Common/GPU/Passes.td | 5 + .../Codegen/Common/GPU/test/BUILD.bazel | 1 + .../Codegen/Common/GPU/test/CMakeLists.txt | 1 + .../gpu_propagate_dispatch_size_bounds.mlir | 122 ++++++++++++++++++ .../Codegen/LLVMGPU/ConvertToLLVM.cpp | 5 +- .../iree/compiler/Codegen/LLVMGPU/Passes.cpp | 10 +- .../nvvm_extract_address_computation.mlir | 2 +- .../iree/compiler/Codegen/SPIRV/Passes.cpp | 2 + .../iree/compiler/Dialect/HAL/IR/BUILD.bazel | 2 + .../compiler/Dialect/HAL/IR/CMakeLists.txt | 1 + .../iree/compiler/Dialect/HAL/IR/HALOps.cpp | 36 ++++++ .../iree/compiler/Dialect/HAL/IR/HALOps.td | 74 ++++------- .../HAL/Transforms/MaterializeInterfaces.cpp | 3 +- 16 files changed, 314 insertions(+), 55 deletions(-) create mode 100644 compiler/src/iree/compiler/Codegen/Common/GPU/GPUPropagateDispatchSizeBounds.cpp create mode 100644 compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_propagate_dispatch_size_bounds.mlir diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel index 93cc6520e47d..40cd0864a0d6 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel @@ -72,6 +72,7 @@ iree_compiler_cc_library( "GPUPatterns.cpp", "GPUPipelining.cpp", "GPUPromoteMatmulOperands.cpp", + "GPUPropagateDispatchSizeBounds.cpp", "GPUReduceBankConflicts.cpp", "GPUReuseSharedMemoryAllocs.cpp", "GPUTensorAlloc.cpp", diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt index 2112a013bb4e..354a1f1b1e21 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt @@ -70,6 +70,7 @@ iree_cc_library( "GPUPatterns.cpp" "GPUPipelining.cpp" "GPUPromoteMatmulOperands.cpp" + "GPUPropagateDispatchSizeBounds.cpp" "GPUReduceBankConflicts.cpp" "GPUReuseSharedMemoryAllocs.cpp" "GPUTensorAlloc.cpp" diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPropagateDispatchSizeBounds.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPropagateDispatchSizeBounds.cpp new file mode 100644 index 000000000000..43aa70be6919 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPropagateDispatchSizeBounds.cpp @@ -0,0 +1,103 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree/compiler/Codegen/Common/GPU/Passes.h" +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" +#include "iree/compiler/Codegen/Utils/GPUUtils.h" +#include "iree/compiler/Codegen/Utils/Utils.h" +#include "iree/compiler/Dialect/HAL/IR/HALOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Interfaces/FunctionInterfaces.h" +#include "mlir/Transforms/Passes.h" + +namespace mlir::iree_compiler { + +#define GEN_PASS_DEF_GPUPROPAGATEDISPATCHSIZEBOUNDSPASS +#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc" + +namespace { + +static void applyBounds(FunctionOpInterface funcOp, + ArrayRef workgroupSizes, + ArrayRef workgroupCounts) { + Builder b(funcOp->getContext()); + funcOp->walk([&](Operation *op) { + TypeSwitch(op) + .Case([&](gpu::ThreadIdOp tidOp) { + tidOp.setUpperBoundAttr(b.getIndexAttr( + workgroupSizes[static_cast(tidOp.getDimension())])); + }) + .Case([&](IREE::HAL::InterfaceWorkgroupSizeOp wgSizeOp) { + wgSizeOp.setUpperBoundAttr(b.getIndexAttr( + workgroupSizes[wgSizeOp.getDimension().getZExtValue()])); + }) + .Case([&](IREE::HAL::InterfaceWorkgroupIDOp wgIdOp) { + wgIdOp.setUpperBoundAttr(b.getIndexAttr( + workgroupCounts[wgIdOp.getDimension().getZExtValue()])); + }) + .Case([&](IREE::HAL::InterfaceWorkgroupCountOp wgCountOp) { + wgCountOp.setUpperBoundAttr(b.getIndexAttr( + workgroupCounts[wgCountOp.getDimension().getZExtValue()])); + }) + .Default([](Operation *) {}); + }); +} + +struct GPUPropagateDispatchSizeBoundsPass final + : impl::GPUPropagateDispatchSizeBoundsPassBase< + GPUPropagateDispatchSizeBoundsPass> { + using Base::Base; + + void runOnOperation() override { + FunctionOpInterface funcOp = getOperation(); + IREE::GPU::TargetAttr target = getGPUTargetAttr(funcOp); + if (!target) { + funcOp.emitWarning("no known target attribute late in GPU codegen"); + return; + } + SmallVector workgroupSizes( + target.getWgp().getMaxWorkgroupSizes().asArrayRef()); + SmallVector workgroupCounts( + target.getWgp().getMaxWorkgroupCounts().asArrayRef()); + + std::optional> staticWorkgroupSize = + getWorkgroupSize(funcOp); + + // Late in codegen, we've reconciled the workgroup size onto the export op. + if (std::optional exportOp = + getEntryPoint(funcOp)) { + if (std::optional exportWorkgroupSize = + exportOp->getWorkgroupSize()) { + staticWorkgroupSize = + llvm::map_to_vector(exportWorkgroupSize->getAsRange(), + [](IntegerAttr a) { return a.getInt(); }); + } + } + + if (staticWorkgroupSize) { + // Target info with no workgroup sizes gives a 0-length array, hence no + // zip_equal. 
+ for (auto [size, staticSize] : + llvm::zip(workgroupSizes, *staticWorkgroupSize)) { + size = staticSize; + } + } + SmallVector staticWorkgroupCounts = getStaticNumWorkgroups(funcOp); + assert(staticWorkgroupCounts.size() <= 3 && + "workgroup counts are 3D at most"); + for (auto [count, staticCount] : + llvm::zip(workgroupCounts, staticWorkgroupCounts)) { + if (staticCount != ShapedType::kDynamic) { + count = staticCount; + } + } + + applyBounds(funcOp, workgroupSizes, workgroupCounts); + } +}; +} // namespace + +} // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td index e2415fd4c6ee..e8f1551c477a 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td @@ -168,6 +168,11 @@ def GPUPromoteMatmulOperandsPass : ]; } +def GPUPropagateDispatchSizeBoundsPass : + InterfacePass<"iree-codegen-gpu-propagate-dispatch-size-bounds", "mlir::FunctionOpInterface"> { + let summary = "Pass to annotate workitem and workgroup IDs with known bounds"; +} + def GPUReduceBankConflictsPass : InterfacePass<"iree-codegen-gpu-reduce-bank-conflicts", "mlir::FunctionOpInterface"> { let summary = "Pass to try to reduce the number of bank conflicts by padding memref.alloc ops."; diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel index dc80356c6bf1..7d0e6887d717 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel @@ -41,6 +41,7 @@ iree_lit_test_suite( "gpu_nested_layout_vector_distribution_step.mlir", "gpu_pipeline.mlir", "gpu_promote_matmul_operands.mlir", + "gpu_propagate_dispatch_size_bounds.mlir", "gpu_reorder_workgroups_static.mlir", "gpu_reorder_workgroups.mlir", "gpu_reuse_shared_memory_allocs.mlir", diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt index d9f7d2aa124e..a9c584acd96d 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt @@ -37,6 +37,7 @@ iree_lit_test_suite( "gpu_pack_to_instrinsics.mlir" "gpu_pipeline.mlir" "gpu_promote_matmul_operands.mlir" + "gpu_propagate_dispatch_size_bounds.mlir" "gpu_reorder_workgroups.mlir" "gpu_reorder_workgroups_static.mlir" "gpu_reuse_shared_memory_allocs.mlir" diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_propagate_dispatch_size_bounds.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_propagate_dispatch_size_bounds.mlir new file mode 100644 index 000000000000..f26f2c5dfe52 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_propagate_dispatch_size_bounds.mlir @@ -0,0 +1,122 @@ +// RUN: iree-opt %s --split-input-file \ +// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-gpu-propagate-dispatch-size-bounds)))))" \ +// RUN: | FileCheck %s + +// Note: not the real target definition, missing types +#executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target>}> +#pipeline_layout = #hal.pipeline.layout]> + +hal.executable private @static { + hal.executable.variant public @rocm_hsaco_fb target(#executable_target) { + hal.executable.export public @static ordinal(0) 
layout(#pipeline_layout) attributes {workgroup_size = [64 : index, 2 : index, 1 : index]} { + ^bb0(%arg0: !hal.device): + %c32 = arith.constant 32 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + hal.return %c32, %c8, %c1 : index, index, index + } + builtin.module { +// CHECK-LABEL: func.func @static + func.func @static() { +// CHECK: gpu.thread_id x upper_bound 64 +// CHECK: gpu.thread_id y upper_bound 2 +// CHECK: gpu.thread_id z upper_bound 1 + %thread_id_x = gpu.thread_id x + %thread_id_y = gpu.thread_id y + %thread_id_z = gpu.thread_id z + +// CHECK: hal.interface.workgroup.size[0] upper_bound 64 +// CHECK: hal.interface.workgroup.size[1] upper_bound 2 +// CHECK: hal.interface.workgroup.size[2] upper_bound 1 + %workgroup_size_x = hal.interface.workgroup.size[0] : index + %workgroup_size_y = hal.interface.workgroup.size[1] : index + %workgroup_size_z = hal.interface.workgroup.size[2] : index + +// CHECK: hal.interface.workgroup.id[0] upper_bound 32 +// CHECK: hal.interface.workgroup.id[1] upper_bound 8 +// CHECK: hal.interface.workgroup.id[2] upper_bound 1 + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_id_z = hal.interface.workgroup.id[2] : index + +// CHECK: hal.interface.workgroup.count[0] upper_bound 32 +// CHECK: hal.interface.workgroup.count[1] upper_bound 8 +// CHECK: hal.interface.workgroup.count[2] upper_bound 1 + %workgroup_conut_x = hal.interface.workgroup.count[0] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %workgroup_count_z = hal.interface.workgroup.count[2] : index + + return + } + } + } +} + +// ----- + +#executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", + {iree.gpu.target = #iree_gpu.target>}> +#pipeline_layout = #hal.pipeline.layout]> + +hal.executable private @dynamic { + hal.executable.variant public @rocm_hsaco_fb target(#executable_target) { + hal.executable.export public @dynamic ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): + %count_x = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%arg1] + %count_y = affine.apply affine_map<()[s0] -> (s0 ceildiv 8)>()[%arg2] + %count_z = arith.constant 1 : index + hal.return %count_x, %count_y, %count_z : index, index, index + } + builtin.module { + func.func @dynamic() { +// CHECK: gpu.thread_id x upper_bound 1024 +// CHECK: gpu.thread_id y upper_bound 1024 +// CHECK: gpu.thread_id z upper_bound 1024 + %thread_id_x = gpu.thread_id x + %thread_id_y = gpu.thread_id y + %thread_id_z = gpu.thread_id z + +// CHECK: hal.interface.workgroup.size[0] upper_bound 1024 +// CHECK: hal.interface.workgroup.size[1] upper_bound 1024 +// CHECK: hal.interface.workgroup.size[2] upper_bound 1024 + %workgroup_size_x = hal.interface.workgroup.size[0] : index + %workgroup_size_y = hal.interface.workgroup.size[1] : index + %workgroup_size_z = hal.interface.workgroup.size[2] : index + +// CHECK: hal.interface.workgroup.id[0] upper_bound 2147483647 +// CHECK: hal.interface.workgroup.id[1] upper_bound 2147483647 +// CHECK: hal.interface.workgroup.id[2] upper_bound 1 + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_id_z = hal.interface.workgroup.id[2] : index + +// CHECK: hal.interface.workgroup.count[0] upper_bound 2147483647 +// CHECK: hal.interface.workgroup.count[1] upper_bound 2147483647 +// CHECK: hal.interface.workgroup.count[2] upper_bound 1 + 
%workgroup_conut_x = hal.interface.workgroup.count[0] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %workgroup_count_z = hal.interface.workgroup.count[2] : index + + return + } + } + } +} diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp index c056d44538bb..1441f959b0bb 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp @@ -505,7 +505,10 @@ struct HALInterfaceWorkgroupOpsConverter final int32_t index = static_cast(op.getDimension().getSExtValue()); std::array dimAttr{gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}; - rewriter.replaceOpWithNewOp(op, op.getType(), dimAttr[index]); + NewOpTy newOp = + rewriter.replaceOpWithNewOp(op, op.getType(), dimAttr[index]); + if (IntegerAttr bound = op.getUpperBoundAttr()) + newOp.setUpperBoundAttr(bound); return success(); } }; diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 26dced54768f..64744890dbbd 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -1066,7 +1066,8 @@ addLowerAndOptimizeAddressComputationPasses(FunctionLikeNest &funcPassManager) { .addPass(createCSEPass) // Hoist the resulting decompositions. .addPass(createIREELoopInvariantCodeMotionPass) - .addPass(createLowerAffinePass); + .addPass(createLowerAffinePass) + .addPass(IREE::Util::createOptimizeIntArithmeticPass); } static void addLowerToLLVMGPUPasses(OpPassManager &modulePassManager, @@ -1102,7 +1103,9 @@ static void addLowerToLLVMGPUPasses(OpPassManager &modulePassManager, FunctionLikeNest funcPassManager(modulePassManager); funcPassManager.addPass(createFoldTensorExtractOpPass) .addPass(createLLVMGPUVectorLoweringPass) - .addPass(createExpandGPUOpsPass); + .addPass(createExpandGPUOpsPass) + // Expose workitem and workgroup counts to range inference later. + .addPass(createGPUPropagateDispatchSizeBoundsPass); // This pass needs to run before SCF -> CF. addLowerAndOptimizeAddressComputationPasses(funcPassManager); @@ -1130,6 +1133,9 @@ static void addLowerToLLVMGPUPasses(OpPassManager &modulePassManager, .addPass(createEmulateNarrowTypePass) .addPass(affine::createAffineExpandIndexOpsPass) .addPass(createLowerAffinePass) + // Re-run index optimizations to take care of this ronud of indexing + // even though now we can't reason about loop bounds + .addPass(IREE::Util::createOptimizeIntArithmeticPass) .addPass(createCanonicalizerPass) .addPass(createCSEPass); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_extract_address_computation.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_extract_address_computation.mlir index 6c1c5e117016..ba6b5da7f1fa 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_extract_address_computation.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_extract_address_computation.mlir @@ -40,7 +40,7 @@ // CHECK-DAG: %[[C8192:.*]] = llvm.mlir.constant(8192 : index) : i64 // // Match the interesting special registers. 
-// CHECK-DAG: %[[TID_Y:.*]] = nvvm.read.ptx.sreg.tid.y : i32 +// CHECK-DAG: %[[TID_Y:.*]] = nvvm.read.ptx.sreg.tid.y range : i32 // CHECK-DAG: %[[TID_Y_EXT:.*]] = llvm.sext %[[TID_Y]] : i32 to i64 // CHECK-DAG: %[[LANEID:.*]] = nvvm.read.ptx.sreg.laneid range : i32 // CHECK-DAG: %[[LANEID_EXT:.*]] = llvm.sext %[[LANEID]] : i32 to i64 diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp index ea0aa9f45116..511dbe785300 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp @@ -227,9 +227,11 @@ static void addMemRefLoweringPasses(OpPassManager &modulePassManager) { /// Adds passes to perform the final SPIR-V conversion. static void addSPIRVLoweringPasses(OpPassManager &modulePassManager) { FunctionLikeNest(modulePassManager) + .addPass(createGPUPropagateDispatchSizeBoundsPass) .addPass(createCanonicalizerPass) .addPass(createCSEPass) .addPass(createLowerAffinePass) + .addPass(IREE::Util::createOptimizeIntArithmeticPass) // Lower ApplyScale before the i64 Emulation Pass so that new 64-bit ops // are also emulated if not supported by the target. diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/BUILD.bazel b/compiler/src/iree/compiler/Dialect/HAL/IR/BUILD.bazel index d9d6a92ef71c..3f80245bfc8c 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/IR/BUILD.bazel +++ b/compiler/src/iree/compiler/Dialect/HAL/IR/BUILD.bazel @@ -35,6 +35,7 @@ iree_td_library( "//compiler/src/iree/compiler/Dialect/Util/IR:td_files", "@llvm-project//mlir:BuiltinDialectTdFiles", "@llvm-project//mlir:FuncTdFiles", + "@llvm-project//mlir:InferIntRangeInterfaceTdFiles", "@llvm-project//mlir:InferTypeOpInterfaceTdFiles", "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:ViewLikeInterfaceTdFiles", @@ -81,6 +82,7 @@ iree_compiler_cc_library( "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:FunctionInterfaces", "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferIntRangeInterface", "@llvm-project//mlir:InferTypeOpInterface", "@llvm-project//mlir:MemRefDialect", "@llvm-project//mlir:Parser", diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/HAL/IR/CMakeLists.txt index 837855157e90..846bcf0d38a2 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/IR/CMakeLists.txt +++ b/compiler/src/iree/compiler/Dialect/HAL/IR/CMakeLists.txt @@ -45,6 +45,7 @@ iree_cc_library( MLIRFuncDialect MLIRFunctionInterfaces MLIRIR + MLIRInferIntRangeInterface MLIRInferTypeOpInterface MLIRMemRefDialect MLIRParser diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp index 7210d402598d..cb5bb411810a 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp +++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp @@ -19,6 +19,7 @@ #include "mlir/IR/SymbolTable.h" #include "mlir/IR/TypeUtilities.h" #include "mlir/Interfaces/FunctionImplementation.h" +#include "mlir/Interfaces/InferIntRangeInterface.h" #include "mlir/Interfaces/InferTypeOpInterface.h" namespace mlir::iree_compiler::IREE::HAL { @@ -2084,24 +2085,59 @@ static void getAsmResultNamesForInterfaceWorkgroupOp( } } +// Minimum is the smallest possible result we could get. It's 0 for ID-like +// operations and 1 for count-like ones. 
+static void setResultRangesForInterfaceWorkgroupOp( + Value result, const std::optional &upperBound, + SetIntRangeFn setResultRanges, int64_t minimum) { + unsigned width = ConstantIntRanges::getStorageBitwidth(result.getType()); + if (!upperBound.has_value()) { + setResultRanges( + result, ConstantIntRanges::fromSigned(APInt(width, minimum), + APInt::getSignedMaxValue(width))); + return; + } + setResultRanges(result, + ConstantIntRanges::fromUnsigned(APInt(width, minimum), + *upperBound + minimum - 1)); +} + void InterfaceWorkgroupIDOp::getAsmResultNames( function_ref setNameFn) { getAsmResultNamesForInterfaceWorkgroupOp("workgroup_id_", getDimension(), getResult(), setNameFn); } +void InterfaceWorkgroupIDOp::inferResultRanges( + ArrayRef argRanges, SetIntRangeFn setResultRanges) { + setResultRangesForInterfaceWorkgroupOp(getResult(), getUpperBound(), + setResultRanges, /*minimum=*/0); +} + void InterfaceWorkgroupCountOp::getAsmResultNames( function_ref setNameFn) { getAsmResultNamesForInterfaceWorkgroupOp("workgroup_count_", getDimension(), getResult(), setNameFn); } +void InterfaceWorkgroupCountOp::inferResultRanges( + ArrayRef argRanges, SetIntRangeFn setResultRanges) { + setResultRangesForInterfaceWorkgroupOp(getResult(), getUpperBound(), + setResultRanges, /*minimum=*/1); +} + void InterfaceWorkgroupSizeOp::getAsmResultNames( function_ref setNameFn) { getAsmResultNamesForInterfaceWorkgroupOp("workgroup_size_", getDimension(), getResult(), setNameFn); } +void InterfaceWorkgroupSizeOp::inferResultRanges( + ArrayRef argRanges, SetIntRangeFn setResultRanges) { + setResultRangesForInterfaceWorkgroupOp(getResult(), getUpperBound(), + setResultRanges, /*minimum=*/1); +} + //===----------------------------------------------------------------------===// // hal.fence.* //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td index 16f1eadfdffd..d51e430b57c7 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td +++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td @@ -3029,9 +3029,28 @@ def OpGroupInterfaceOps : OpDocGroup { let opDocGroup = OpGroupInterfaceOps in { -def HAL_InterfaceWorkgroupIDOp : HAL_PureOp<"interface.workgroup.id", [ - DeclareOpInterfaceMethods, -]> { +class HAL_InterfaceWorkgroupOp traits = []> + : HAL_PureOp, + DeclareOpInterfaceMethods])> { + let arguments = (ins + IndexAttr:$dimension, + OptionalAttr:$upper_bound); + let results = (outs HAL_Dim:$result); + + let builders = [ + OpBuilder<(ins "unsigned":$dim), + [{ + build($_builder, $_state, $_builder.getIndexType(), $_builder.getIndexAttr(dim), ::mlir::IntegerAttr{}); + }]>, + ]; + + let assemblyFormat = [{ + `[` $dimension `]` (`upper_bound` $upper_bound^)? 
attr-dict `:` type($result) + }]; +} + +def HAL_InterfaceWorkgroupIDOp : HAL_InterfaceWorkgroupOp<"interface.workgroup.id"> { let summary = [{returns the index of the current workgroup in the grid}]; let description = [{ The global workgroup ID of the current tile in the range of @@ -3046,25 +3065,9 @@ def HAL_InterfaceWorkgroupIDOp : HAL_PureOp<"interface.workgroup.id", [ %z = hal.interface.workgroup.id[2] : index ``` }]; - - let arguments = (ins IndexAttr:$dimension); - let results = (outs HAL_Dim:$result); - - let builders = [ - OpBuilder<(ins "unsigned":$dim), - [{ - build($_builder, $_state, $_builder.getIndexType(), $_builder.getIndexAttr(dim)); - }]>, - ]; - - let assemblyFormat = [{ - `[` $dimension `]` attr-dict `:` type($result) - }]; } -def HAL_InterfaceWorkgroupCountOp : HAL_PureOp<"interface.workgroup.count", [ - DeclareOpInterfaceMethods, -]> { +def HAL_InterfaceWorkgroupCountOp : HAL_InterfaceWorkgroupOp<"interface.workgroup.count"> { let summary = [{returns the total workgroup count of the grid}]; let description = [{ The total number of workgroups along each dimension in the dispatch grid. @@ -3081,24 +3084,9 @@ def HAL_InterfaceWorkgroupCountOp : HAL_PureOp<"interface.workgroup.count", [ ``` }]; - let arguments = (ins IndexAttr:$dimension); - let results = (outs HAL_Dim:$result); - - let builders = [ - OpBuilder<(ins "unsigned":$dim), - [{ - build($_builder, $_state, $_builder.getIndexType(), $_builder.getIndexAttr(dim)); - }]>, - ]; - - let assemblyFormat = [{ - `[` $dimension `]` attr-dict `:` type($result) - }]; } -def HAL_InterfaceWorkgroupSizeOp : HAL_PureOp<"interface.workgroup.size", [ - DeclareOpInterfaceMethods, -]> { +def HAL_InterfaceWorkgroupSizeOp : HAL_InterfaceWorkgroupOp<"interface.workgroup.size"> { let summary = [{returns the size of each workgroup in invocations}]; let description = [{ The number of local invocations within the current workgroup along each @@ -3114,20 +3102,6 @@ def HAL_InterfaceWorkgroupSizeOp : HAL_PureOp<"interface.workgroup.size", [ %z = hal.interface.workgroup.size[2] : index ``` }]; - - let arguments = (ins IndexAttr:$dimension); - let results = (outs HAL_Dim:$result); - - let builders = [ - OpBuilder<(ins "unsigned":$dim), - [{ - build($_builder, $_state, $_builder.getIndexType(), $_builder.getIndexAttr(dim)); - }]>, - ]; - - let assemblyFormat = [{ - `[` $dimension `]` attr-dict `:` type($result) - }]; } def HAL_InterfaceConstantLoadOp : HAL_PureOp<"interface.constant.load"> { diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp b/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp index 9f3bee7d529a..d830c078b4bb 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp +++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp @@ -514,7 +514,8 @@ struct ConvertDispatchWorkgroupInfoPattern final LogicalResult matchAndRewrite(SrcOp op, PatternRewriter &rewriter) const override { rewriter.replaceOpWithNewOp(op, op.getResult().getType(), - op.getDimensionAttr()); + op.getDimensionAttr(), + /*upper_bound=*/nullptr); return success(); } }; From 006c5d8dcb638b0abfc704fb8c89cb6dc43377f5 Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Mon, 2 Dec 2024 17:01:09 -0800 Subject: [PATCH 39/54] Avoid compound literal struct init for better C/C++ compat. (#19346) Progress on this downstream build issue: https://github.com/nod-ai/shark-ai/issues/534. 
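For reference, a minimal sketch of the init pattern being replaced, using a hypothetical `example_range_t` stand-in (the real changes below touch `iree_page_range_t` and `iree_hal_buffer_ref_t`):

```c
#include <stddef.h>

// Simplified stand-in for the structs returned by the changed helpers.
typedef struct example_range_t {
  size_t offset;
  size_t length;
} example_range_t;

// Before: returns a C99 compound literal. Fine in C, but compound literals
// are not standard C++, so headers built in both modes can trip on this.
static inline example_range_t make_range_compound(size_t offset,
                                                   size_t length) {
  return (example_range_t){
      /*.offset=*/offset,
      /*.length=*/length,
  };
}

// After: zero-initialize a named local and assign members; valid in both
// C and C++.
static inline example_range_t make_range_memberwise(size_t offset,
                                                      size_t length) {
  example_range_t range = {0};
  range.offset = offset;
  range.length = length;
  return range;
}
```
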
I haven't figured out specifically why the downstream build hits errors here while upstream is fine, but these changes aren't that intrusive. --------- Co-authored-by: Ben Vanik --- runtime/src/iree/base/alignment.h | 16 ++++++++-------- runtime/src/iree/hal/command_buffer.h | 16 ++++++++++++++-- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/runtime/src/iree/base/alignment.h b/runtime/src/iree/base/alignment.h index 81110d84878a..7e422e6206b0 100644 --- a/runtime/src/iree/base/alignment.h +++ b/runtime/src/iree/base/alignment.h @@ -238,19 +238,19 @@ static inline iree_page_range_t iree_page_range_union( const iree_page_range_t a, const iree_page_range_t b) { iree_host_size_t start = iree_min(a.offset, b.offset); iree_host_size_t end = iree_max(a.offset + a.length, b.offset + b.length); - return (iree_page_range_t){ - /*.offset=*/start, - /*.length=*/end - start, - }; + iree_page_range_t page_range = {0}; + page_range.offset = start; + page_range.length = end - start; + return page_range; } // Aligns a byte range to page boundaries defined by |page_alignment|. static inline iree_page_range_t iree_align_byte_range_to_pages( const iree_byte_range_t byte_range, iree_host_size_t page_alignment) { - return (iree_page_range_t){ - /*.offset=*/iree_host_align(byte_range.offset, page_alignment), - /*.length=*/iree_host_align(byte_range.length, page_alignment), - }; + iree_page_range_t page_range = {0}; + page_range.offset = iree_host_align(byte_range.offset, page_alignment); + page_range.length = iree_host_align(byte_range.length, page_alignment); + return page_range; } // Computes a page-aligned range base and total length from a range. diff --git a/runtime/src/iree/hal/command_buffer.h b/runtime/src/iree/hal/command_buffer.h index e433a9b6ef28..e6523f294538 100644 --- a/runtime/src/iree/hal/command_buffer.h +++ b/runtime/src/iree/hal/command_buffer.h @@ -115,13 +115,25 @@ typedef struct iree_hal_buffer_ref_t { static inline iree_hal_buffer_ref_t iree_hal_make_buffer_ref( iree_hal_buffer_t* buffer, iree_device_size_t offset, iree_device_size_t length) { - return (iree_hal_buffer_ref_t){0, 0, buffer, offset, length}; + iree_hal_buffer_ref_t buffer_ref = {0}; + buffer_ref.reserved = 0; + buffer_ref.buffer_slot = 0; + buffer_ref.buffer = buffer; + buffer_ref.offset = offset; + buffer_ref.length = length; + return buffer_ref; } static inline iree_hal_buffer_ref_t iree_hal_make_indirect_buffer_ref( uint32_t buffer_slot, iree_device_size_t offset, iree_device_size_t length) { - return (iree_hal_buffer_ref_t){0, buffer_slot, NULL, offset, length}; + iree_hal_buffer_ref_t buffer_ref = {0}; + buffer_ref.reserved = 0; + buffer_ref.buffer_slot = buffer_slot; + buffer_ref.buffer = NULL; + buffer_ref.offset = offset; + buffer_ref.length = length; + return buffer_ref; } // A list of buffer references. From a30a4193bd58e7562e6b8a730e6f1c1962a71366 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 3 Dec 2024 10:22:21 -0500 Subject: [PATCH 40/54] Make `OpaqueMMALayout` local to `IREEGPUAttrs.cpp` (#19342) This implements a TODO from #19161, which is now unblocked by #19233. 
Signed-off-by: Benoit Jacob --- .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp | 15 ++++++++++----- .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.h | 16 ---------------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp index 803040d0451a..306e43ed5d27 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp @@ -276,6 +276,16 @@ MMASingleSubgroupLayout getSingleSubgroupLayout(MMAIntrinsic intrinsic, return {}; } +// Struct describing the shape of a MMA operation, but not the detailed layout. +struct OpaqueMmaLayout { + int64_t mSize = 0; + int64_t nSize = 0; + int64_t kSize = 0; + Type aType; + Type bType; + Type cType; +}; + template static OpaqueMmaLayout getOpaqueMMALayout(MLIRContext *context, MMAIntrinsicType intrinsic) { @@ -289,11 +299,6 @@ static OpaqueMmaLayout getOpaqueMMALayout(MLIRContext *context, return o; } -OpaqueMmaLayout getOpaqueMMALayout(MLIRContext *context, - IREE::GPU::MMAIntrinsic intrinsic) { - return getOpaqueMMALayout(context, intrinsic); -} - MMASingleSubgroupLayout getSingleSubgroupLayout(MmaInterfaceAttr mmaKind, MMAFragment fragment) { if (auto mmaAttr = dyn_cast(mmaKind)) { diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h index cce4b682385e..c5b8d58cc153 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h @@ -68,22 +68,6 @@ MMASingleSubgroupLayout getSingleSubgroupLayout(VirtualMMAIntrinsic intrinsic, MMASingleSubgroupLayout getSingleSubgroupLayout(MmaInterfaceAttr mmaKind, MMAFragment fragment); -// Struct describing the shape of a MMA operation, but not the detailed layout. -// TODO(bjacob): the only user outside of IREEGPUAttrs.cpp is -// LLVMGPU/TransformExtensions, so maybe make that internal again if/when that -// goes away. -struct OpaqueMmaLayout { - int64_t mSize = 0; - int64_t nSize = 0; - int64_t kSize = 0; - Type aType; - Type bType; - Type cType; -}; - -OpaqueMmaLayout getOpaqueMMALayout(MLIRContext *context, - IREE::GPU::MMAIntrinsic intrinsic); - } // namespace mlir::iree_compiler::IREE::GPU // clang-format off From 529cd89502aa59b73b5f67c18d22e405a42ca2aa Mon Sep 17 00:00:00 2001 From: Ian Wood Date: Tue, 3 Dec 2024 16:36:21 +0000 Subject: [PATCH 41/54] [Dispatch] Add pattern to bubble expand through extract 1/2 (#19325) This is the 1/2 changes needed to reland https://github.com/iree-org/iree/pull/18857 (with an open PR https://github.com/iree-org/iree/pull/19113). Adds pattern to bubble up expand shape through extract slice. i.e `expand(extract)` to `extract(expand)`. This only supports the case where the expanded dimensions are not modified by the extract slice and there are no dynamic dimensions. This is important because `tensor.expand_shape` ops _cannot be cloned_ while `tensor.extract_slice` ops _can be cloned_. So, if the `expand_shape` gets stuck on the bottom of the `extract_slice` it will block it from being cloned and the `extract_slice` will have to be put into its own dispatch. 
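For example (IR fragments mirroring the new lit test below; `%src` is assumed to be a `tensor<2x4096x5120xf16>` value):

```mlir
// Before: the expand_shape is pinned below the extract_slice.
%slice = tensor.extract_slice %src[0, 0, 0] [2, 4096, 2560] [1, 1, 1]
    : tensor<2x4096x5120xf16> to tensor<2x4096x2560xf16>
%expanded = tensor.expand_shape %slice [[0], [1, 2], [3]] output_shape [2, 64, 64, 2560]
    : tensor<2x4096x2560xf16> into tensor<2x64x64x2560xf16>

// After: the expand_shape moves onto the source, and the (clonable)
// extract_slice is taken on the expanded tensor.
%expanded2 = tensor.expand_shape %src [[0], [1, 2], [3]] output_shape [2, 64, 64, 5120]
    : tensor<2x4096x5120xf16> into tensor<2x64x64x5120xf16>
%slice2 = tensor.extract_slice %expanded2[0, 0, 0, 0] [2, 64, 64, 2560] [1, 1, 1, 1]
    : tensor<2x64x64x5120xf16> to tensor<2x64x64x2560xf16>
```

In the rewritten form the expanded dimensions get offset 0, their full expanded size, and stride 1, while the dimension that was actually sliced keeps its original offset/size/stride.
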
--------- Signed-off-by: Ian Wood --- .../DispatchCreation/BubbleUpExpandShapes.cpp | 76 +++++++++++++++++++ .../DispatchCreation/test/BUILD.bazel | 1 + .../DispatchCreation/test/CMakeLists.txt | 1 + .../test/bubble_up_expand_shapes.mlir | 23 ++++++ .../test/bubble_up_extract_slice.mlir | 2 + 5 files changed, 103 insertions(+) create mode 100644 compiler/src/iree/compiler/DispatchCreation/test/bubble_up_expand_shapes.mlir diff --git a/compiler/src/iree/compiler/DispatchCreation/BubbleUpExpandShapes.cpp b/compiler/src/iree/compiler/DispatchCreation/BubbleUpExpandShapes.cpp index 79ae8d3b2ba8..7ce4bddd5731 100644 --- a/compiler/src/iree/compiler/DispatchCreation/BubbleUpExpandShapes.cpp +++ b/compiler/src/iree/compiler/DispatchCreation/BubbleUpExpandShapes.cpp @@ -38,6 +38,81 @@ struct BubbleUpExpandShapesPass final void runOnOperation() override; }; +/// Bubbles a `tensor.expand_shape` op through a `tensor.extract_slice` op. This +/// pattern only gets applied when the `extract_slice` doesn't modify dimensions +/// that are expanded by the `expand_shape` and when the `extract_slice` is +/// completely static. +/// TODO: move this upstream with other tensor bubbling patterns. +struct BubbleExpandThroughExtract final + : public OpRewritePattern { + + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tensor::ExpandShapeOp expandOp, + PatternRewriter &rewriter) const override { + auto extractOp = expandOp.getSrc().getDefiningOp(); + if (!extractOp) { + return failure(); + } + + auto srcType = extractOp.getSourceType(); + auto extractedType = extractOp.getType(); + auto expandedType = expandOp.getType(); + + if (srcType.getRank() != extractedType.getRank()) { + return rewriter.notifyMatchFailure( + extractOp, "Rank reducing extract_slice not supported"); + } + + if (!srcType.hasStaticShape() || !extractedType.hasStaticShape() || + !expandedType.hasStaticShape()) { + return failure(); + } + + auto reassoc = expandOp.getReassociationIndices(); + for (auto i : llvm::seq(0, extractedType.getRank())) { + if (reassoc[i].size() == 1) { + continue; + } + + if (srcType.getShape()[i] != extractedType.getShape()[i]) { + return rewriter.notifyMatchFailure( + extractOp, "Extract modifies the expanded dimension"); + } + } + + SmallVector newExpandShape; + SmallVector offsets; + SmallVector sizes; + SmallVector strides; + for (auto [inDim, outDims] : llvm::enumerate(reassoc)) { + if (outDims.size() == 1) { + newExpandShape.push_back(srcType.getShape()[inDim]); + offsets.push_back(extractOp.getStaticOffsets()[inDim]); + sizes.push_back(extractOp.getStaticSizes()[inDim]); + strides.push_back(extractOp.getStaticStrides()[inDim]); + } else { + for (auto outDim : outDims) { + newExpandShape.push_back(expandedType.getShape()[outDim]); + offsets.push_back(0); + sizes.push_back(expandedType.getShape()[outDim]); + strides.push_back(1); + } + } + } + + Type newExpandType = + RankedTensorType::get(newExpandShape, expandedType.getElementType()); + auto newExpand = rewriter.create( + expandOp.getLoc(), newExpandType, extractOp.getSource(), reassoc); + + rewriter.replaceOpWithNewOp( + expandOp, expandedType, newExpand, ValueRange{}, ValueRange{}, + ValueRange{}, offsets, sizes, strides); + return success(); + } +}; + } // namespace void BubbleUpExpandShapesPass::runOnOperation() { @@ -87,6 +162,7 @@ void BubbleUpExpandShapesPass::runOnOperation() { // Add patterns to do some additional cleanup (on top of canonicalizations // that can be done later) of reshape ops. 
tensor::populateFoldTensorEmptyPatterns(bubbleExpandShapePatterns); + bubbleExpandShapePatterns.insert(context); GreedyRewriteConfig rewriteConfig; rewriteConfig.maxIterations = GreedyRewriteConfig::kNoLimit; diff --git a/compiler/src/iree/compiler/DispatchCreation/test/BUILD.bazel b/compiler/src/iree/compiler/DispatchCreation/test/BUILD.bazel index c132debab4f1..f45b4b75c30d 100644 --- a/compiler/src/iree/compiler/DispatchCreation/test/BUILD.bazel +++ b/compiler/src/iree/compiler/DispatchCreation/test/BUILD.bazel @@ -27,6 +27,7 @@ iree_lit_test_suite( "form_dispatch_regions.mlir", "dispatch_linalg_on_tensors.mlir", "convert_region_to_workgroups.mlir", + "bubble_up_expand_shapes.mlir", "bubble_up_extract_slice.mlir", "form_dispatch_workgroups.mlir", "dispatch_linalg_ext_fusion.mlir", diff --git a/compiler/src/iree/compiler/DispatchCreation/test/CMakeLists.txt b/compiler/src/iree/compiler/DispatchCreation/test/CMakeLists.txt index 582e9ae937cc..d13f858de549 100644 --- a/compiler/src/iree/compiler/DispatchCreation/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/DispatchCreation/test/CMakeLists.txt @@ -15,6 +15,7 @@ iree_lit_test_suite( lit SRCS "attention_fuse_by_expansion.mlir" + "bubble_up_expand_shapes.mlir" "bubble_up_extract_slice.mlir" "clone_producers_into_dispatch_regions.mlir" "collapse_dimensions.mlir" diff --git a/compiler/src/iree/compiler/DispatchCreation/test/bubble_up_expand_shapes.mlir b/compiler/src/iree/compiler/DispatchCreation/test/bubble_up_expand_shapes.mlir new file mode 100644 index 000000000000..b014d59f881c --- /dev/null +++ b/compiler/src/iree/compiler/DispatchCreation/test/bubble_up_expand_shapes.mlir @@ -0,0 +1,23 @@ +// RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(util.func(iree-dispatch-creation-bubble-up-expand-shapes))" %s | FileCheck %s + +util.func public @bubbble_expand_through_extract(%arg0 : tensor<2x4096x5120xf16>) -> (tensor<2x64x64x2560xf16>) { + %extracted_slice_237 = tensor.extract_slice %arg0[0, 0, 0] [2, 4096, 2560] [1, 1, 1] : tensor<2x4096x5120xf16> to tensor<2x4096x2560xf16> + %expanded_239 = tensor.expand_shape %extracted_slice_237 [[0], [1, 2], [3]] output_shape [2, 64, 64, 2560] : tensor<2x4096x2560xf16> into tensor<2x64x64x2560xf16> + util.return %expanded_239 : tensor<2x64x64x2560xf16> +} + +// CHECK-LABEL: @bubbble_expand_through_extract +// CHECK: %[[EXPAND:.+]] = tensor.expand_shape +// CHECK: %[[EXTRACT:.+]] = tensor.extract_slice %[[EXPAND]] + +// ----- + +util.func public @unsupported_bubbble_expand_through_extract(%arg0 : tensor<2x4096x5120xf16>) -> (tensor<2x32x64x2560xf16>) { + %extracted_slice_237 = tensor.extract_slice %arg0[0, 0, 0] [2, 2048, 2560] [1, 1, 1] : tensor<2x4096x5120xf16> to tensor<2x2048x2560xf16> + %expanded_239 = tensor.expand_shape %extracted_slice_237 [[0], [1, 2], [3]] output_shape [2, 32, 64, 2560] : tensor<2x2048x2560xf16> into tensor<2x32x64x2560xf16> + util.return %expanded_239 : tensor<2x32x64x2560xf16> +} + +// CHECK-LABEL: @unsupported_bubbble_expand_through_extract +// CHECK: %[[EXTRACT:.+]] = tensor.extract_slice +// CHECK: %[[EXPAND:.+]] = tensor.expand_shape %[[EXTRACT]] diff --git a/compiler/src/iree/compiler/DispatchCreation/test/bubble_up_extract_slice.mlir b/compiler/src/iree/compiler/DispatchCreation/test/bubble_up_extract_slice.mlir index 691cbfab0a19..b582b5628fa3 100644 --- a/compiler/src/iree/compiler/DispatchCreation/test/bubble_up_extract_slice.mlir +++ b/compiler/src/iree/compiler/DispatchCreation/test/bubble_up_extract_slice.mlir @@ 
-95,6 +95,8 @@ util.func public @bubble_up_extract_with_use(%arg0 : tensor<1024x7x7x2xi8>) -> ( // CHECK-SAME: ins(%[[EXTRACT0]] : tensor<1024x7x7xi8>) // CHECK: util.return %[[GENERIC1]], %[[GENERIC0]] +// ----- + util.func public @bubble_up_extract_fill_multi_use() -> tensor<2x320x130x130xf8E4M3FNUZ> { %cst_1 = arith.constant 1.000000e+00 : f8E4M3FNUZ %cst_2 = arith.constant 2.000000e+00 : f8E4M3FNUZ From 263dcf0b99083080052263d61063dfe4be078a77 Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Tue, 3 Dec 2024 09:24:54 -0800 Subject: [PATCH 42/54] Fix `-Werror=dangling-pointer` errors reported linux-aarch64 gcc build. (#19351) Another tentative fix for https://github.com/iree-org/iree/issues/19264. This was not fixed by https://github.com/iree-org/iree/pull/19265. (Untested beyond what presubmit covers) --- .../compiler/Codegen/Common/ConcretizePadResultShape.cpp | 5 +++-- .../compiler/Codegen/Common/ConfigTrackingCanonicalizer.cpp | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/ConcretizePadResultShape.cpp b/compiler/src/iree/compiler/Codegen/Common/ConcretizePadResultShape.cpp index a18577058fe6..13fcc9518173 100644 --- a/compiler/src/iree/compiler/Codegen/Common/ConcretizePadResultShape.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/ConcretizePadResultShape.cpp @@ -137,11 +137,12 @@ class ConcretizePadResultShapePass final auto funcOp = getOperation(); ConfigTrackingListener listener; + GreedyRewriteConfig config; + config.listener = &listener; + { RewritePatternSet patterns(context); populateConcretizePadResultShapePatterns(patterns); - GreedyRewriteConfig config; - config.listener = &listener; if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns), config))) { return signalPassFailure(); diff --git a/compiler/src/iree/compiler/Codegen/Common/ConfigTrackingCanonicalizer.cpp b/compiler/src/iree/compiler/Codegen/Common/ConfigTrackingCanonicalizer.cpp index 54d3cf6d11e1..d2a299db0e59 100644 --- a/compiler/src/iree/compiler/Codegen/Common/ConfigTrackingCanonicalizer.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/ConfigTrackingCanonicalizer.cpp @@ -96,6 +96,7 @@ struct ConfigTrackingCanonicalizerPass final config.listener = &listener; LogicalResult didConverge = applyPatternsAndFoldGreedily(getOperation(), *patterns, config); + config.listener = nullptr; if (this->testConvergence && failed(didConverge)) { getOperation()->emitError("Canonicalizer failed to converge"); return signalPassFailure(); From cbb11f220c69e0106dbfd1533a00237c3a74e7e3 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 3 Dec 2024 15:50:06 -0500 Subject: [PATCH 43/54] Load ukernel bitcode as `executable_object` at the time of lowering to ukernels. (#19323) 1. Moves the time of loading ukernel bitcode from `serializeExecutable` to the `GPULowerToUKernels` pass. 2. The determination of whether an op can lower to a ukernel, is now based on whether the expected bitcode file is found. This allows removing several utility functions that implemented similar logic in different places. 3. The `GPULowerToUKernels` pass searches for existing bitcode in a `hal.executable.objects` attribute, and only loads the embedded ukernel bitcode if that wasn't found, and in either case ensures that that resulting ukernel op has a `hal.executable.objects` attribute containing the necessary IR. This has several nice implications: - The IR becomes completely self-contained: a ukernel op is no longer an opaque interface to some bitcode at-a-distance. 
- This solves the problem of allowing contributing one's own bitcode from the outside. Users can write their own `hal.executable.objects`. - De-duplication of bitcode is handled by the HoistExecutableObjects pass. - Linking bitcode is handled by generic linker code linking executable objects. - The only useful custom handling of ukernel symbols, was adding `AlwaysInline` function attributes. This PR moves these attributes to the ukernel source code: `[[clang::always_inline]]`. I verified that these result in the expected `alwaysinline` in the bitcode. 4. The ukernel bitcode is part of the ROCM plugin. The `serializeExecutable` implementation, which was the consumer of that data, is also in the ROCM plugin. But the `GPULowerToUKernels` pass, which is the new consumer, is outside of that plugin. So this required creating a mechanism to export such embedded data files from the ROCM plugin to the outside. That is solved by the new `EmbeddedDataDirectory` utility. --------- Signed-off-by: Benoit Jacob --- .../bazel_to_cmake_converter.py | 2 +- compiler/plugins/target/ROCM/BUILD.bazel | 5 +- compiler/plugins/target/ROCM/CMakeLists.txt | 5 +- compiler/plugins/target/ROCM/ROCMTarget.cpp | 36 ++-- .../plugins/target/ROCM/ROCMTargetUtils.cpp | 41 ----- .../target/ROCM/builtins/ukernel/BUILD.bazel | 21 +-- .../ROCM/builtins/ukernel/CMakeLists.txt | 116 ++++++------- .../ukernel/iree_uk_amdgpu_argmax_f16i32.c | 8 +- .../ukernel/iree_uk_amdgpu_argmax_f16i64.c | 8 +- .../ukernel/iree_uk_amdgpu_argmax_f32i32.c | 8 +- .../ukernel/iree_uk_amdgpu_argmax_f32i64.c | 8 +- compiler/plugins/target/ROCM/test/BUILD.bazel | 26 +++ .../plugins/target/ROCM/test/CMakeLists.txt | 31 +++- .../ROCM}/test/gpu_lower_to_ukernels.mlir | 53 +++++- .../test/ukernel_pipeline_transform.mlir | 24 +-- .../Codegen/Common/GPU/GPULowerToUKernels.cpp | 164 ++++++++++++------ .../compiler/Codegen/Common/GPU/Passes.td | 2 +- .../Codegen/Common/GPU/test/BUILD.bazel | 3 - .../Codegen/Common/GPU/test/CMakeLists.txt | 1 - .../Codegen/Dialect/Codegen/IR/UKernelOps.cpp | 7 +- .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 15 +- .../compiler/Codegen/LLVMGPU/test/BUILD.bazel | 1 - .../Codegen/LLVMGPU/test/CMakeLists.txt | 1 - .../iree/compiler/Codegen/Utils/GPUUtils.cpp | 36 ---- .../iree/compiler/Codegen/Utils/GPUUtils.h | 10 -- compiler/src/iree/compiler/Utils/BUILD.bazel | 1 + .../src/iree/compiler/Utils/CMakeLists.txt | 1 + .../compiler/Utils/EmbeddedDataDirectory.h | 59 +++++++ .../iree/compiler/Utils/unittests/BUILD.bazel | 1 + .../compiler/Utils/unittests/CMakeLists.txt | 1 + .../compiler/Utils/unittests/UtilsTest.cpp | 47 +++++ 31 files changed, 461 insertions(+), 281 deletions(-) create mode 100644 compiler/plugins/target/ROCM/test/BUILD.bazel rename compiler/{src/iree/compiler/Codegen/Common/GPU => plugins/target/ROCM}/test/gpu_lower_to_ukernels.mlir (84%) rename compiler/{src/iree/compiler/Codegen/LLVMGPU => plugins/target/ROCM}/test/ukernel_pipeline_transform.mlir (90%) create mode 100644 compiler/src/iree/compiler/Utils/EmbeddedDataDirectory.h diff --git a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py index 0fb0fd85492f..8d2db1050da3 100644 --- a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py +++ b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py @@ -616,7 +616,7 @@ def iree_amdgpu_bitcode_library(self, name, gpu_arch, srcs, copts=None, out=None "GPU_ARCH", gpu_arch, quote=False ) srcs_block = self._convert_srcs_block(srcs) - out_block = 
self._convert_string_arg_block("OUT", out, quote=False) + out_block = self._convert_string_arg_block("OUT", out, quote=True) copts_block = self._convert_string_list_block("COPTS", copts, sort=False) self._converter.body += ( diff --git a/compiler/plugins/target/ROCM/BUILD.bazel b/compiler/plugins/target/ROCM/BUILD.bazel index 6ae9b95c4714..48dfeb3ff401 100644 --- a/compiler/plugins/target/ROCM/BUILD.bazel +++ b/compiler/plugins/target/ROCM/BUILD.bazel @@ -27,10 +27,7 @@ iree_compiler_cc_library( "ROCMTargetUtils.h", ], deps = [ - "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx1030", - "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx1100", - "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx90a", - "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx942", + "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_bitcode", "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect", "//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect", "//compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils:KnownTargets", diff --git a/compiler/plugins/target/ROCM/CMakeLists.txt b/compiler/plugins/target/ROCM/CMakeLists.txt index 0efc3df479e6..96c3305d936d 100644 --- a/compiler/plugins/target/ROCM/CMakeLists.txt +++ b/compiler/plugins/target/ROCM/CMakeLists.txt @@ -64,10 +64,7 @@ iree_cc_library( iree::compiler::Dialect::HAL::Utils::LLVMLinkerUtils iree::compiler::PluginAPI iree::compiler::Utils - iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx1030 - iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx1100 - iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx90a - iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx942 + iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_bitcode iree::schemas::amdgpu_executable_def_c_fbs iree::schemas::executable_debug_info_c_fbs iree::schemas::hip_executable_def_c_fbs diff --git a/compiler/plugins/target/ROCM/ROCMTarget.cpp b/compiler/plugins/target/ROCM/ROCMTarget.cpp index a49780fbbcf4..48ef62e07220 100644 --- a/compiler/plugins/target/ROCM/ROCMTarget.cpp +++ b/compiler/plugins/target/ROCM/ROCMTarget.cpp @@ -8,6 +8,7 @@ #include +#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_bitcode.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h" @@ -21,6 +22,7 @@ #include "iree/compiler/Dialect/HAL/Utils/ExecutableDebugInfoUtils.h" #include "iree/compiler/Dialect/HAL/Utils/LLVMLinkerUtils.h" #include "iree/compiler/PluginAPI/Client.h" +#include "iree/compiler/Utils/EmbeddedDataDirectory.h" #include "iree/compiler/Utils/FlatbufferUtils.h" #include "iree/compiler/Utils/ToolUtils.h" #include "iree/schemas/amdgpu_executable_def_builder.h" @@ -206,6 +208,7 @@ static std::string translateModuleToISA(llvm::Module &module, } return targetISA; } + } // namespace class ROCMTargetBackend final : public TargetBackend { @@ -513,20 +516,6 @@ class ROCMTargetBackend final : public TargetBackend { return failure(); } - // Link module to any enabled ukernels. 
- StringRef bitcodeDirectory = options.bitcodeDirectory; - StringRef enabledUkernels; - if (auto attr = getConfigStringAttr(targetAttr, "ukernels")) - enabledUkernels = attr->getValue(); - if (!enabledUkernels.empty() && enabledUkernels != "none") { - if (failed(linkUkernelBitcodeFiles( - variantOp.getLoc(), llvmModule.get(), enabledUkernels, - targetArch, bitcodeDirectory, llvm::Linker::OverrideFromSrc, - *targetMachine))) { - return failure(); - } - } - // Link bitcode (*.bc) object attrs specified by the input program. // Note that this happens after the command-line files so that the command // line ones override the symbols coming from the embedded files. @@ -548,14 +537,15 @@ class ROCMTargetBackend final : public TargetBackend { } // Link module to HIP device library. - if (bitcodeDirectory.empty()) { + if (options.bitcodeDirectory.empty()) { return variantOp.emitError() << "cannot find ROCM bitcode files. Check your installation " "consistency and in the worst case, set " "--iree-hip-bc-dir= to a path on your system."; } if (failed(linkHIPBitcodeIfNeeded(variantOp.getLoc(), llvmModule.get(), - targetArch, bitcodeDirectory))) { + targetArch, + options.bitcodeDirectory))) { return failure(); } @@ -881,6 +871,7 @@ class HIPTargetDevice final : public TargetDevice { }; namespace { + struct ROCMSession final : PluginSession { @@ -910,10 +901,23 @@ struct ROCMSession final } // namespace mlir::iree_compiler::IREE::HAL +// Iterate over ukernel bitcode embedded-data files, and insert them into the +// EmbeddedDataDirectory singleton. +static void addAMDGPUUkernelBitcodeToGlobalEmbeddedDataDirectory() { + using mlir::iree_compiler::EmbeddedDataDirectory; + EmbeddedDataDirectory::withGlobal([](EmbeddedDataDirectory &dir) { + const iree_file_toc_t *toc = iree_uk_amdgpu_bitcode_create(); + for (size_t i = 0; i < iree_uk_amdgpu_bitcode_size(); ++i) { + dir.addFile(toc[i].name, llvm::StringRef{toc[i].data, toc[i].size}); + } + }); +} + extern "C" bool iree_register_compiler_plugin_hal_target_rocm( mlir::iree_compiler::PluginRegistrar *registrar) { registrar->registerPlugin( "hal_target_rocm"); + addAMDGPUUkernelBitcodeToGlobalEmbeddedDataDirectory(); return true; } diff --git a/compiler/plugins/target/ROCM/ROCMTargetUtils.cpp b/compiler/plugins/target/ROCM/ROCMTargetUtils.cpp index 792de8e4a4b0..2cf9f20c0de5 100644 --- a/compiler/plugins/target/ROCM/ROCMTargetUtils.cpp +++ b/compiler/plugins/target/ROCM/ROCMTargetUtils.cpp @@ -6,10 +6,6 @@ #include "compiler/plugins/target/ROCM/ROCMTargetUtils.h" -#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx1030.h" -#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx1100.h" -#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx90a.h" -#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx942.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "iree/compiler/Dialect/HAL/Utils/LLVMLinkerUtils.h" #include "iree/compiler/Utils/ToolUtils.h" @@ -185,43 +181,6 @@ LogicalResult linkHIPBitcodeIfNeeded(Location loc, llvm::Module *module, return linkWithBitcodeFiles(loc, module, bitcodePaths); } -static std::tuple -getUkernelBitcodeTOC(StringRef gpuArch) { - return llvm::StringSwitch>(gpuArch) - .Case("gfx90a", - {iree_uk_amdgpu_gfx90a_create(), iree_uk_amdgpu_gfx90a_size()}) - .Case("gfx942", - {iree_uk_amdgpu_gfx942_create(), iree_uk_amdgpu_gfx942_size()}) - .Case("gfx1030", - {iree_uk_amdgpu_gfx1030_create(), iree_uk_amdgpu_gfx1030_size()}) - .Case("gfx1100", - 
{iree_uk_amdgpu_gfx1100_create(), iree_uk_amdgpu_gfx1100_size()}) - .Default({nullptr, 0}); -} - -// Links optimized Ukernel bitcode into the given module if the module needs it. -LogicalResult linkUkernelBitcodeFiles(Location loc, llvm::Module *module, - StringRef enabledUkernelsStr, - StringRef targetChip, - StringRef bitcodePath, - unsigned linkerFlags, - llvm::TargetMachine &targetMachine) { - auto [toc, toc_size] = getUkernelBitcodeTOC(targetChip); - if (!toc) { - return failure(); - } - - llvm::Linker linker(*module); - for (int i = 0; i < toc_size; ++i) { - if (failed(linkBitcodeFile(loc, linker, linkerFlags, toc[i].name, - llvm::StringRef(toc[i].data, toc[i].size), - targetMachine, module->getContext()))) - return failure(); - } - - return success(); -} - // Link object file using lld lnker to generate code object // Inspiration from this section comes from LLVM-PROJECT-MLIR by // ROCmSoftwarePlatform diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel b/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel index 93e6c86bd4a3..aff7b8965b32 100644 --- a/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel +++ b/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel @@ -49,19 +49,20 @@ argmax_types = [ "iree_uk_amdgpu_argmax_%s.c" % type, "common.h", ], + out = "iree_uk_amdgpu_argmax_%s.%s.bc" % (type, gpu_arch), gpu_arch = gpu_arch, ) for type in argmax_types for gpu_arch in gpu_archs] -argmax_bc_files = {gpu_arch: [ - ":iree_uk_amdgpu_argmax_%s.c.%s.bc" % (type, gpu_arch) +argmax_bc_files = [ + ":iree_uk_amdgpu_argmax_%s.%s.bc" % (type, gpu_arch) for type in argmax_types -] for gpu_arch in gpu_archs} + for gpu_arch in gpu_archs +] -[iree_c_embed_data( - name = "iree_uk_amdgpu_%s" % gpu_arch, - srcs = argmax_bc_files[gpu_arch], - c_file_output = "iree_uk_amdgpu_%s.c" % gpu_arch, +iree_c_embed_data( + name = "iree_uk_amdgpu_bitcode", + srcs = argmax_bc_files, + c_file_output = "iree_uk_amdgpu_bitcode.c", flatten = True, - h_file_output = "iree_uk_amdgpu_%s.h" % gpu_arch, - identifier = "iree_uk_amdgpu_%s" % gpu_arch, -) for gpu_arch in gpu_archs] + h_file_output = "iree_uk_amdgpu_bitcode.h", +) diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt b/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt index 6b3014f3bd53..71d4705eed1a 100644 --- a/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt +++ b/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt @@ -22,6 +22,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f16i32.c" + OUT + "iree_uk_amdgpu_argmax_f16i32.gfx90a.bc" ) iree_amdgpu_bitcode_library( @@ -32,6 +34,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f16i32.c" + OUT + "iree_uk_amdgpu_argmax_f16i32.gfx942.bc" ) iree_amdgpu_bitcode_library( @@ -42,6 +46,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f16i32.c" + OUT + "iree_uk_amdgpu_argmax_f16i32.gfx1030.bc" ) iree_amdgpu_bitcode_library( @@ -52,6 +58,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f16i32.c" + OUT + "iree_uk_amdgpu_argmax_f16i32.gfx1100.bc" ) iree_amdgpu_bitcode_library( @@ -62,6 +70,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f16i64.c" + OUT + "iree_uk_amdgpu_argmax_f16i64.gfx90a.bc" ) iree_amdgpu_bitcode_library( @@ -72,6 +82,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f16i64.c" + OUT + "iree_uk_amdgpu_argmax_f16i64.gfx942.bc" ) iree_amdgpu_bitcode_library( @@ -82,6 +94,8 
@@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f16i64.c" + OUT + "iree_uk_amdgpu_argmax_f16i64.gfx1030.bc" ) iree_amdgpu_bitcode_library( @@ -92,6 +106,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f16i64.c" + OUT + "iree_uk_amdgpu_argmax_f16i64.gfx1100.bc" ) iree_amdgpu_bitcode_library( @@ -102,6 +118,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f32i32.c" + OUT + "iree_uk_amdgpu_argmax_f32i32.gfx90a.bc" ) iree_amdgpu_bitcode_library( @@ -112,6 +130,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f32i32.c" + OUT + "iree_uk_amdgpu_argmax_f32i32.gfx942.bc" ) iree_amdgpu_bitcode_library( @@ -122,6 +142,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f32i32.c" + OUT + "iree_uk_amdgpu_argmax_f32i32.gfx1030.bc" ) iree_amdgpu_bitcode_library( @@ -132,6 +154,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f32i32.c" + OUT + "iree_uk_amdgpu_argmax_f32i32.gfx1100.bc" ) iree_amdgpu_bitcode_library( @@ -142,6 +166,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f32i64.c" + OUT + "iree_uk_amdgpu_argmax_f32i64.gfx90a.bc" ) iree_amdgpu_bitcode_library( @@ -152,6 +178,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f32i64.c" + OUT + "iree_uk_amdgpu_argmax_f32i64.gfx942.bc" ) iree_amdgpu_bitcode_library( @@ -162,6 +190,8 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f32i64.c" + OUT + "iree_uk_amdgpu_argmax_f32i64.gfx1030.bc" ) iree_amdgpu_bitcode_library( @@ -172,76 +202,34 @@ iree_amdgpu_bitcode_library( SRCS "common.h" "iree_uk_amdgpu_argmax_f32i64.c" + OUT + "iree_uk_amdgpu_argmax_f32i64.gfx1100.bc" ) iree_c_embed_data( NAME - iree_uk_amdgpu_gfx90a - SRCS - "iree_uk_amdgpu_argmax_f16i32.c.gfx90a.bc" - "iree_uk_amdgpu_argmax_f16i64.c.gfx90a.bc" - "iree_uk_amdgpu_argmax_f32i32.c.gfx90a.bc" - "iree_uk_amdgpu_argmax_f32i64.c.gfx90a.bc" - C_FILE_OUTPUT - "iree_uk_amdgpu_gfx90a.c" - H_FILE_OUTPUT - "iree_uk_amdgpu_gfx90a.h" - IDENTIFIER - "iree_uk_amdgpu_gfx90a" - FLATTEN - PUBLIC -) - -iree_c_embed_data( - NAME - iree_uk_amdgpu_gfx942 - SRCS - "iree_uk_amdgpu_argmax_f16i32.c.gfx942.bc" - "iree_uk_amdgpu_argmax_f16i64.c.gfx942.bc" - "iree_uk_amdgpu_argmax_f32i32.c.gfx942.bc" - "iree_uk_amdgpu_argmax_f32i64.c.gfx942.bc" - C_FILE_OUTPUT - "iree_uk_amdgpu_gfx942.c" - H_FILE_OUTPUT - "iree_uk_amdgpu_gfx942.h" - IDENTIFIER - "iree_uk_amdgpu_gfx942" - FLATTEN - PUBLIC -) - -iree_c_embed_data( - NAME - iree_uk_amdgpu_gfx1030 - SRCS - "iree_uk_amdgpu_argmax_f16i32.c.gfx1030.bc" - "iree_uk_amdgpu_argmax_f16i64.c.gfx1030.bc" - "iree_uk_amdgpu_argmax_f32i32.c.gfx1030.bc" - "iree_uk_amdgpu_argmax_f32i64.c.gfx1030.bc" - C_FILE_OUTPUT - "iree_uk_amdgpu_gfx1030.c" - H_FILE_OUTPUT - "iree_uk_amdgpu_gfx1030.h" - IDENTIFIER - "iree_uk_amdgpu_gfx1030" - FLATTEN - PUBLIC -) - -iree_c_embed_data( - NAME - iree_uk_amdgpu_gfx1100 - SRCS - "iree_uk_amdgpu_argmax_f16i32.c.gfx1100.bc" - "iree_uk_amdgpu_argmax_f16i64.c.gfx1100.bc" - "iree_uk_amdgpu_argmax_f32i32.c.gfx1100.bc" - "iree_uk_amdgpu_argmax_f32i64.c.gfx1100.bc" + iree_uk_amdgpu_bitcode + SRCS + "iree_uk_amdgpu_argmax_f16i32.gfx1030.bc" + "iree_uk_amdgpu_argmax_f16i32.gfx1100.bc" + "iree_uk_amdgpu_argmax_f16i32.gfx90a.bc" + "iree_uk_amdgpu_argmax_f16i32.gfx942.bc" + "iree_uk_amdgpu_argmax_f16i64.gfx1030.bc" + "iree_uk_amdgpu_argmax_f16i64.gfx1100.bc" + "iree_uk_amdgpu_argmax_f16i64.gfx90a.bc" + "iree_uk_amdgpu_argmax_f16i64.gfx942.bc" + 
"iree_uk_amdgpu_argmax_f32i32.gfx1030.bc" + "iree_uk_amdgpu_argmax_f32i32.gfx1100.bc" + "iree_uk_amdgpu_argmax_f32i32.gfx90a.bc" + "iree_uk_amdgpu_argmax_f32i32.gfx942.bc" + "iree_uk_amdgpu_argmax_f32i64.gfx1030.bc" + "iree_uk_amdgpu_argmax_f32i64.gfx1100.bc" + "iree_uk_amdgpu_argmax_f32i64.gfx90a.bc" + "iree_uk_amdgpu_argmax_f32i64.gfx942.bc" C_FILE_OUTPUT - "iree_uk_amdgpu_gfx1100.c" + "iree_uk_amdgpu_bitcode.c" H_FILE_OUTPUT - "iree_uk_amdgpu_gfx1100.h" - IDENTIFIER - "iree_uk_amdgpu_gfx1100" + "iree_uk_amdgpu_bitcode.h" FLATTEN PUBLIC ) diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c index 41fe50a6528d..4a6beefa9198 100644 --- a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c +++ b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c @@ -6,10 +6,10 @@ #include "compiler/plugins/target/ROCM/builtins/ukernel/common.h" -void iree_uk_amdgpu_argmax_f16i32(const _Float16 *inputBuffer, - int64_t input_offset, int32_t *outputBuffer, - int64_t output_offset, - int64_t reductionSize) { +[[clang::always_inline]] void +iree_uk_amdgpu_argmax_f16i32(const _Float16 *inputBuffer, int64_t input_offset, + int32_t *outputBuffer, int64_t output_offset, + int64_t reductionSize) { const int warpSize = __builtin_amdgcn_wavefrontsize(); _Float16 NEG_F16_MAX = (_Float16)(-65504.0f); int32_t laneID = __builtin_amdgcn_workitem_id_x(); diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c index 823fc3a4f296..33c1522d143d 100644 --- a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c +++ b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c @@ -6,10 +6,10 @@ #include "compiler/plugins/target/ROCM/builtins/ukernel/common.h" -void iree_uk_amdgpu_argmax_f16i64(const _Float16 *inputBuffer, - int64_t input_offset, int64_t *outputBuffer, - int64_t output_offset, - int64_t reductionSize) { +[[clang::always_inline]] void +iree_uk_amdgpu_argmax_f16i64(const _Float16 *inputBuffer, int64_t input_offset, + int64_t *outputBuffer, int64_t output_offset, + int64_t reductionSize) { const int warpSize = __builtin_amdgcn_wavefrontsize(); _Float16 NEG_F16_MAX = (_Float16)(-65504.0f); int32_t laneID = __builtin_amdgcn_workitem_id_x(); diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c index 41aad8ba05c5..f39d62372799 100644 --- a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c +++ b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c @@ -6,10 +6,10 @@ #include "compiler/plugins/target/ROCM/builtins/ukernel/common.h" -void iree_uk_amdgpu_argmax_f32i32(const float *inputBuffer, - int64_t input_offset, int32_t *outputBuffer, - int64_t output_offset, - int64_t reductionSize) { +[[clang::always_inline]] void +iree_uk_amdgpu_argmax_f32i32(const float *inputBuffer, int64_t input_offset, + int32_t *outputBuffer, int64_t output_offset, + int64_t reductionSize) { const int warpSize = __builtin_amdgcn_wavefrontsize(); int32_t laneID = __builtin_amdgcn_workitem_id_x(); // Set identity value to handle problem non divisible by subgroupSize. 
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i64.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i64.c index 5899322d7407..d6a9afbcf2d6 100644 --- a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i64.c +++ b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i64.c @@ -6,10 +6,10 @@ #include "compiler/plugins/target/ROCM/builtins/ukernel/common.h" -void iree_uk_amdgpu_argmax_f32i64(const float *inputBuffer, - int64_t input_offset, int64_t *outputBuffer, - int64_t output_offset, - int64_t reductionSize) { +[[clang::always_inline]] void +iree_uk_amdgpu_argmax_f32i64(const float *inputBuffer, int64_t input_offset, + int64_t *outputBuffer, int64_t output_offset, + int64_t reductionSize) { const int warpSize = __builtin_amdgcn_wavefrontsize(); int32_t laneID = __builtin_amdgcn_workitem_id_x(); // Set identity value to handle problem non divisible by subgroupSize. diff --git a/compiler/plugins/target/ROCM/test/BUILD.bazel b/compiler/plugins/target/ROCM/test/BUILD.bazel new file mode 100644 index 000000000000..bf9a18d582bd --- /dev/null +++ b/compiler/plugins/target/ROCM/test/BUILD.bazel @@ -0,0 +1,26 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content") +load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite") + +package( + features = ["layering_check"], + licenses = ["notice"], # Apache 2.0 +) + +iree_lit_test_suite( + name = "lit", + srcs = [ + "gpu_lower_to_ukernels.mlir", + "ukernel_pipeline_transform.mlir", + ], + cfg = "//compiler:lit.cfg.py", + tools = [ + "//tools:iree-opt", + "@llvm-project//llvm:FileCheck", + ], +) diff --git a/compiler/plugins/target/ROCM/test/CMakeLists.txt b/compiler/plugins/target/ROCM/test/CMakeLists.txt index df185a05e72b..6d2199d8c4bb 100644 --- a/compiler/plugins/target/ROCM/test/CMakeLists.txt +++ b/compiler/plugins/target/ROCM/test/CMakeLists.txt @@ -1,10 +1,33 @@ -# NOTE: Bazel testing of this backend is impossible because there is no way -# for Bazel to bundle the AMD bitcode files that the backend depends on. Users -# of the compiler can pass explicit flags, but we prefer that default tests -# exercise default flags, which cannot be supported properly on Bazel builds. +################################################################################ +# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # +# compiler/plugins/target/ROCM/test/BUILD.bazel # +# # +# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # +# CMake-only content. # +# # +# To disable autogeneration for this file entirely, delete this header. # +################################################################################ iree_add_all_subdirs() +iree_lit_test_suite( + NAME + lit + SRCS + "gpu_lower_to_ukernels.mlir" + "ukernel_pipeline_transform.mlir" + TOOLS + FileCheck + iree-opt +) + +### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### + +# NOTE: The following tests are CMake-only because they depend on AMD device +# bitcode libraries that are provided by custom CMake code in target/ROCM. +# By contrast, the above tests that only require ukernel bitcode are part of the +# Bazel build because ukernel bitcode is something that we generate ourselves. 
+ iree_lit_test_suite( NAME lit diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_lower_to_ukernels.mlir b/compiler/plugins/target/ROCM/test/gpu_lower_to_ukernels.mlir similarity index 84% rename from compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_lower_to_ukernels.mlir rename to compiler/plugins/target/ROCM/test/gpu_lower_to_ukernels.mlir index cc71c379959f..177bd0b36f7c 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_lower_to_ukernels.mlir +++ b/compiler/plugins/target/ROCM/test/gpu_lower_to_ukernels.mlir @@ -1,5 +1,4 @@ -// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s -// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx90a --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s --check-prefix=CDNA2 +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx908 --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s --check-prefix=CDNA1 func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes { @@ -28,7 +27,7 @@ func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes // CHECK-DAG: %[[C1_index:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C0_i64:.+]] = arith.constant 0 // CHECK-DAG: %[[FILL:.+]] = linalg.fill ins(%[[C0_i64]] -// CHECK: %[[MICRO_KERNEL:.+]] = iree_codegen.ukernel.generic "iree_uk_amdgpu_argmax_f32i64" +// CHECK: %[[MICRO_KERNEL:.+]] = iree_codegen.ukernel.generic {hal.executable.objects = [{{.*}}]} "iree_uk_amdgpu_argmax_f32i64" // CHECK-SAME: ins(%[[ARG0]] : // CHECK-SAME: outs(%[[FILL]] : // CHECK: return %[[MICRO_KERNEL]] @@ -284,3 +283,51 @@ func.func @argmax_ukernel_unsupported_arch(%arg0 : tensor<1x?xf32>) -> tensor<1x // CDNA1-LABEL: func @argmax_ukernel_unsupported_arch( // CDNA1-NOT: iree_codegen.ukernel.generic // CDNA1: linalg.generic + +// ----- + +// Test user-provided bitcode in the source IR. + +func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes { + hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>, + // Dummy bitcode with an unusual length of 12. The first 4 bytes are the .bc file format signature. 
+ hal.executable.objects = [ + #hal.executable.object<{ + path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc", + data = dense<[66, 67, -64, -34, 1, 35, 69, 103, -119, -85, -51, -17]> : tensor<12xi8> + }> + ] +} { + %c0_i64 = arith.constant 0 : i64 + %cst = arith.constant 0xFF800000 : f32 + %0 = tensor.empty() : tensor<1xi64> + %1 = linalg.fill ins(%c0_i64 : i64) outs(%0 : tensor<1xi64>) -> tensor<1xi64> + %2 = tensor.empty() : tensor<1xf32> + %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<1xf32>) -> tensor<1xf32> + %4:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%arg0 : tensor<1x?xf32>) outs(%3, %1 : tensor<1xf32>, tensor<1xi64>) { + ^bb0(%in: f32, %out: f32, %out_0: i64): + %5 = linalg.index 1 : index + %6 = arith.index_cast %5 : index to i64 + %7 = arith.maximumf %in, %out : f32 + %8 = arith.cmpf ogt, %in, %out : f32 + %9 = arith.select %8, %6, %out_0 : i64 + linalg.yield %7, %9 : f32, i64 + } -> (tensor<1xf32>, tensor<1xi64>) + return %4#1 : tensor<1xi64> +} + +//CHECK-LABEL: func @argmax_2d_f32i64( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<1x?xf32> +// CHECK-DAG: %[[C1_index:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C0_i64:.+]] = arith.constant 0 +// CHECK-DAG: %[[FILL:.+]] = linalg.fill ins(%[[C0_i64]] +// CHECK: %[[MICRO_KERNEL:.+]] = iree_codegen.ukernel.generic { +// CHECK-SAME: hal.executable.objects = [ +// CHECK-SAME: #hal.executable.object<{ +// CHECK-SAME: path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc", +// CHECK-SAME: data = dense<[66, 67, -64, -34, 1, 35, 69, 103, -119, -85, -51, -17]> : tensor<12xi8> +// CHECK-SAME: }> +// CHECK-SAME: ]} "iree_uk_amdgpu_argmax_f32i64" +// CHECK-SAME: ins(%[[ARG0]] : +// CHECK-SAME: outs(%[[FILL]] : +// CHECK: return %[[MICRO_KERNEL]] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir b/compiler/plugins/target/ROCM/test/ukernel_pipeline_transform.mlir similarity index 90% rename from compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir rename to compiler/plugins/target/ROCM/test/ukernel_pipeline_transform.mlir index af8ecf196115..26ce4c8959f4 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir +++ b/compiler/plugins/target/ROCM/test/ukernel_pipeline_transform.mlir @@ -4,10 +4,11 @@ #hal.pipeline.binding, #hal.pipeline.binding ]> -#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}> #map = affine_map<(d0) -> (d0)> #map1 = affine_map<(d0) -> ()> -func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} { +func.func @argmax_1d_f16i64() attributes { + hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}> +} { %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0xFC00 : f16 %c0_i64 = arith.constant 0 : i64 @@ -43,7 +44,7 @@ func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_ta // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info // CHECK: func.func @argmax_1d_f16i64() // CHECK-SAME: translation_info = #[[$TRANSLATION]] -// CHECK: iree_codegen.ukernel.generic "iree_uk_amdgpu_argmax_f16i64" +// CHECK: iree_codegen.ukernel.generic {hal.executable.objects = [{{.*}}]} "iree_uk_amdgpu_argmax_f16i64" // ----- @@ -51,10 +52,11 @@ func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_ta 
#hal.pipeline.binding, #hal.pipeline.binding ]> -#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}> #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0)> -func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} { +func.func @argmax_2d_f32i64() attributes { + hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}> +} { %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0xFF800000 : f32 %c0_i64 = arith.constant 0 : i64 @@ -92,7 +94,7 @@ func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_ta // CHECK-SAME: translation_info = #[[$TRANSLATION]] // CHECK: %[[SUBVIEW:.*]] = memref.subview{{.*}} memref<16x?xf32 // CHECK-SAME: to memref<1x?xf32 -// CHECK: iree_codegen.ukernel.generic "iree_uk_amdgpu_argmax_f32i64" ins(%[[SUBVIEW]] +// CHECK: iree_codegen.ukernel.generic {hal.executable.objects = [{{.*}}]} "iree_uk_amdgpu_argmax_f32i64" ins(%[[SUBVIEW]] // ----- @@ -100,10 +102,11 @@ func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_ta #hal.pipeline.binding, #hal.pipeline.binding ]> -#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb"> #map = affine_map<(d0) -> (d0)> #map1 = affine_map<(d0) -> ()> -func.func @no_ukernel_argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} { +func.func @no_ukernel_argmax_1d_f16i64() attributes { + hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb"> +} { %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0xFC00 : f16 %c0_i64 = arith.constant 0 : i64 @@ -147,10 +150,11 @@ func.func @no_ukernel_argmax_1d_f16i64() attributes {hal.executable.target = #ex #hal.pipeline.binding, #hal.pipeline.binding ]> -#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}> #map = affine_map<(d0) -> (d0)> #map1 = affine_map<(d0) -> ()> -func.func @not_neg_inf_init_argmax_1d() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} { +func.func @not_neg_inf_init_argmax_1d() attributes { + hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}> +} { %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f16 %c0_i64 = arith.constant 0 : i64 diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp index f76cdd1c6ccf..a72b4ff8e180 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPULowerToUKernels.cpp @@ -9,9 +9,12 @@ #include "iree/compiler/Codegen/Dialect/Codegen/IR/UKernelOps.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "iree/compiler/Codegen/Utils/Utils.h" +#include "iree/compiler/Utils/EmbeddedDataDirectory.h" +#include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/AsmState.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/MLIRContext.h" @@ -24,22 +27,101 @@ namespace mlir::iree_compiler { namespace { +// Returns a ExecutableObjectAttr carrying the bitcode for the given ukernel. 
+// +// First tries finding the bitcode in the input `sourceExecutableObjects`, which +// must be an array of ExecutableObjectAttr's and is typically coming from a +// hal.executable.objects array attribute in the source IR, which is the +// mechanism by which source programs may provide their own ukernel bitcode. +// +// If no matching bitcode was found in `sourceExecutableObjects`, this function +// will then search in bitcode files that we have embedded as static data. +static IREE::HAL::ExecutableObjectAttr +getUKernelBitcode(OpBuilder &builder, + IREE::HAL::ExecutableTargetAttr execTarget, + ArrayAttr sourceExecutableObjects, StringRef ukernelName) { + IREE::GPU::TargetAttr gpuTarget = getGPUTargetAttr(execTarget); + if (!gpuTarget) { + return {}; + } + StringRef gpuArch = gpuTarget.getArch(); + std::string bitcodeFilename = + llvm::formatv("{0}.{1}.bc", ukernelName, gpuArch); + + // Early-return if the source executable.objects already contain an object + // with the expected file name. This happens with user-provided bitcode in the + // source IR. + if (sourceExecutableObjects) { + for (Attribute a : sourceExecutableObjects) { + if (auto object = dyn_cast(a)) { + if (object.getPath() == bitcodeFilename) { + return object; + } + } + } + } + + // No user-provided bitcode, so we search our embedded bitcode files in the + // EmbeddedDataDirectory singleton. + std::optional bitcode; + EmbeddedDataDirectory::withGlobal([&](EmbeddedDataDirectory &dir) { + bitcode = dir.getFile(bitcodeFilename); + }); + if (!bitcode) { + return {}; + } + MLIRContext *context = builder.getContext(); + auto blob = HeapAsmResourceBlob::allocateAndCopyInferAlign( + ArrayRef(bitcode->data(), bitcode->size())); + auto bitcodeDenseAttr = DenseI8ResourceElementsAttr::get( + VectorType::get({static_cast(bitcode->size())}, + builder.getI8Type()), + bitcodeFilename, std::move(blob)); + return IREE::HAL::ExecutableObjectAttr::get( + context, StringAttr::get(context, bitcodeFilename), + cast(bitcodeDenseAttr)); +} + +// Walks parents ops from `op` to return the nearest hal.executable.objects +// array attribute. If the parent hal.executable.variant is reached, its objects +// attribute is returned. +// Adapted from ExecutableTargetAttr::lookup. +static ArrayAttr lookUpExecutableObjects(Operation *op) { + MLIRContext *context = op->getContext(); + auto attrId = StringAttr::get(context, "hal.executable.objects"); + while (op) { + // Take directly from the enclosing variant. + if (auto variantOp = dyn_cast(op)) { + if (std::optional objects = variantOp.getObjects()) { + return *objects; + } + } + // Take from op attributes. + if (auto attr = op->getAttrOfType(attrId)) { + return attr; + } + // Continue walk. + op = op->getParentOp(); + } + return {}; +} + /// Holds a function name and attributes. struct FnNameAndDefAttrs { std::string name; SmallVector defAttrs; + explicit operator bool() const { return !name.empty(); } }; /// Returns the function name and attributes to use for a ukernel with given -/// `ukernelName` on the target described by `targetAttr`. +/// `name` and `suffix` on the target described by `targetAttr`. 
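/// For example, on the ROCm backend (the only backend populated below), name
/// == "argmax" and suffix == "f32i64" compose to fn.name ==
/// "iree_uk_amdgpu_argmax_f32i64" with a {vm.import.module = "rocm"} def
/// attribute; on any other backend the name is left empty, so the result
/// converts to false via the operator bool above and callers can bail out
/// with a match failure.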
static FnNameAndDefAttrs -getFnNameAndDefAttrs(const char *ukernelName, std::string &typeSuffixID, +getFnNameAndDefAttrs(const char *name, std::string &suffix, RewriterBase &rewriter, IREE::HAL::ExecutableTargetAttr targetAttr) { FnNameAndDefAttrs result; if (isROCMBackend(targetAttr)) { - result.name = - std::string("iree_uk_amdgpu_") + ukernelName + "_" + typeSuffixID; + result.name = llvm::formatv("iree_uk_amdgpu_{0}_{1}", name, suffix); result.defAttrs.emplace_back(rewriter.getStringAttr("vm.import.module"), rewriter.getStringAttr("rocm")); } @@ -54,9 +136,21 @@ static FailureOr matchArgmaxDAGForUKernel(RewriterBase &rewriter, linalg::GenericOp op) { auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(op); const char ukernelName[] = "argmax"; - if (!hasUkernel(targetAttr, ukernelName) || - !hasUkernelSupportedGpuArch(targetAttr)) { - return failure(); + Value input = op.getDpsInputOperand(0)->get(); + auto inputType = cast(input.getType()); + Value index = op.getDpsInitOperand(1)->get(); + auto indexType = cast(index.getType()); + std::string suffix; + llvm::raw_string_ostream(suffix) + << inputType.getElementType() << indexType.getElementType(); + FnNameAndDefAttrs fn = + getFnNameAndDefAttrs(ukernelName, suffix, rewriter, targetAttr); + if (!fn) { + return rewriter.notifyMatchFailure(op, "no ukernels on this backend"); + } + + if (!hasUkernel(targetAttr, ukernelName)) { + return rewriter.notifyMatchFailure(op, "ukernel not enabled"); } // Currently only support argmax where parallel dims are 1. @@ -74,68 +168,40 @@ matchArgmaxDAGForUKernel(RewriterBase &rewriter, linalg::GenericOp op) { } parallelSize *= bounds[dim]; } - if (parallelSize != 1) - return failure(); - - // Get value/input type. - Value input = op.getDpsInputOperand(0)->get(); - auto inputType = llvm::cast(input.getType()); - Type inputElemType = inputType.getElementType(); - // Only support f16 and f32 values. - if (!inputElemType.isF16() && !inputElemType.isF32()) { - return failure(); - } - - // Get index type. - Value index = op.getDpsInitOperand(1)->get(); - auto indexType = llvm::cast(index.getType()); - Type indexElemType = indexType.getElementType(); - // Only support i32 and i64 index. - if (!indexElemType.isInteger(32) && !indexElemType.isInteger(64)) { + if (parallelSize != 1) { return failure(); } - - std::string typeSuffixID; - llvm::raw_string_ostream(typeSuffixID) << inputElemType << indexElemType; - // TODO(bjacob): this check won't be needed one this code will be updated to - // look up the table of contents of embedded bitcode files, one per symbol. - if (!(typeSuffixID == "f16i32" || typeSuffixID == "f16i64" || - typeSuffixID == "f32i32" || typeSuffixID == "f32i64")) { - return rewriter.notifyMatchFailure( - op, "unsupported combination of element types"); + auto execTarget = IREE::HAL::ExecutableTargetAttr::lookup(op); + ArrayAttr sourceExecutableObjects = lookUpExecutableObjects(op); + IREE::HAL::ExecutableObjectAttr bitcodeObject = + getUKernelBitcode(rewriter, execTarget, sourceExecutableObjects, fn.name); + if (!bitcodeObject) { + return rewriter.notifyMatchFailure(op, "no ukernel bitcode for this op"); } - Location loc = op.getLoc(); // Currently only support 1D reduction, where reduc is on fastest dim. // Tiling argmax ukernel is also set to enforce this structure. 
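  // For instance, for the 2-D tensor<1x?xf32> argmax in the lit test above,
  // kReductionDim works out to 1 and the ukernel's only extra operand is the
  // dynamic size of that innermost dimension.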
const int kReductionDim = op.getNumLoops() - 1; Value reductionDimSize = rewriter.create(loc, input, kReductionDim); - auto fn = - getFnNameAndDefAttrs(ukernelName, typeSuffixID, rewriter, targetAttr); auto genericMicroKernelOp = rewriter.create( loc, indexType, fn.name, ValueRange{input}, index, ValueRange{reductionDimSize}, /*fn_def_attrs=*/rewriter.getDictionaryAttr(fn.defAttrs), /*strided_outer_dims=*/rewriter.getIndexAttr(0)); + genericMicroKernelOp->setAttr( + "hal.executable.objects", + ArrayAttr::get(rewriter.getContext(), bitcodeObject)); return cast( genericMicroKernelOp.getOperation()); } -using TargetPredicate = std::function; - struct LowerArgmaxToUKernelPattern : OpRewritePattern { - LowerArgmaxToUKernelPattern(MLIRContext *context, - TargetPredicate targetPredicate) - : OpRewritePattern(context), - targetPredicate(targetPredicate) {} + LowerArgmaxToUKernelPattern(MLIRContext *context) + : OpRewritePattern(context) {} LogicalResult matchAndRewrite(linalg::GenericOp op, PatternRewriter &rewriter) const override { - if (targetPredicate && - !targetPredicate(IREE::HAL::ExecutableTargetAttr::lookup(op))) { - return failure(); - } if (failed(isArgmaxOp(op))) { return failure(); } @@ -149,8 +215,6 @@ struct LowerArgmaxToUKernelPattern : OpRewritePattern { ukernelOp.value()->getResults()); return success(); } - - TargetPredicate targetPredicate; }; struct GPULowerToUKernelsPass final @@ -170,7 +234,7 @@ struct GPULowerToUKernelsPass final // evidence that it is difficult for codegen to consistently approach // microkernels performance, and that consideration overrides the benefit of // fusions for these ops. - patterns.insert(context, isROCMBackend); + patterns.insert(context); if (failed(applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) { return signalPassFailure(); diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td index e8f1551c477a..15c6e9c23c93 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td @@ -107,7 +107,7 @@ def GPUInferMemorySpacePass : def GPULowerToUKernelsPass : Pass<"iree-codegen-gpu-lower-to-ukernels", ""> { - let summary = "Separate out parts of the IR that lower to a micro-kernel"; + let summary = "Lower suitable ops to microkernels."; let dependentDialects = [ "::mlir::iree_compiler::IREE::Codegen::IREECodegenDialect", "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect", diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel index 7d0e6887d717..7fe33161bca1 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel @@ -30,7 +30,6 @@ iree_lit_test_suite( "gpu_fuse_and_hoist_forall.mlir", "gpu_greedily_distribute_to_threads.mlir", "gpu_infer_memory_space.mlir", - "gpu_lower_to_ukernels.mlir", "gpu_combine_value_barriers.mlir", "gpu_materialize_encoding_gfx908.mlir", "gpu_materialize_encoding_gfx90a.mlir", @@ -58,8 +57,6 @@ iree_lit_test_suite( "vector_reduction_to_gpu.mlir", ], include = ["*.mlir"], - exclude = [ - ], ), cfg = "//compiler:lit.cfg.py", tools = [ diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt index a9c584acd96d..4b9853df8213 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt +++ 
b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt @@ -26,7 +26,6 @@ iree_lit_test_suite( "gpu_generalize_named_ops.mlir" "gpu_greedily_distribute_to_threads.mlir" "gpu_infer_memory_space.mlir" - "gpu_lower_to_ukernels.mlir" "gpu_materialize_encoding_gfx1100.mlir" "gpu_materialize_encoding_gfx908.mlir" "gpu_materialize_encoding_gfx90a.mlir" diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/UKernelOps.cpp b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/UKernelOps.cpp index b044244dbc0a..dbf082b5be03 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/UKernelOps.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/UKernelOps.cpp @@ -61,7 +61,12 @@ createFunctionCall(RewriterBase &rewriter, Operation *op, StringRef fnName, } // Insert the function call. - return rewriter.create(loc, fnDecl, callOperands); + auto callOp = rewriter.create(loc, fnDecl, callOperands); + if (op->hasAttr("hal.executable.objects")) { + callOp->setAttr("hal.executable.objects", + op->getAttr("hal.executable.objects")); + } + return callOp; } //===---------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index d63fd2d5d258..23e5cbb13e27 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -1824,18 +1824,25 @@ static LogicalResult setTransposeConfig(mlir::FunctionOpInterface entryPoint, // UKernel Pipeline Configuration //====---------------------------------------------------------------------===// -/// Set the configuration for argmax that can be mapped to argmax uKernel. +/// Set the configuration for argmax when ukernels are enabled. /// Distribute all parallel dim across different workgroups, and only use single /// subgroup per workgroup. +/// +/// TODO(bjacob): This is fragile, as we can't know yet if this argmax will be +/// lowered to a ukernel. We need instead a config that works regardless of +/// ukernels. For now, we use the looser condition that the argmax ukernel is +/// enabled, a necessary but not sufficient condition for this particular op to +/// lower to the ukernel. This is good enough for now for a couple of reasons: +/// 1. Even if a argmax does not actually lower to a ukernel, this config should +/// still work. +/// 2. Ukernels are not enabled by default. static LogicalResult setArgmaxUkernelConfig(IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint, linalg::GenericOp op) { // Checks if UKernels are enabled. 
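  // Note that with hasUkernelSupportedGpuArch removed (see the GPUUtils diff
  // below), the only gate left here is the target's `ukernels = "argmax"`
  // flag; whether bitcode actually exists for the architecture is now
  // discovered later, when GPULowerToUKernels resolves the bitcode object.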
if (auto target = IREE::HAL::ExecutableTargetAttr::lookup(entryPoint)) { - const char ukernelName[] = "argmax"; - if (!hasUkernel(target, ukernelName) || - !hasUkernelSupportedGpuArch(target)) { + if (!hasUkernel(target, "argmax")) { return failure(); } } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel index 756327ca4475..5d4042975f29 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel @@ -65,7 +65,6 @@ iree_lit_test_suite( "transform_gpu_pipelining.mlir", "transform_vector_to_mma.mlir", "transpose_pipeline_test.mlir", - "ukernel_pipeline_transform.mlir", "configure_tensor_layout.mlir", "vector_lowering.mlir", "vector_to_gpu.mlir", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt index 1d0dcc979a56..fb9e495d9535 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt @@ -62,7 +62,6 @@ iree_lit_test_suite( "transform_gpu_pipelining.mlir" "transform_vector_to_mma.mlir" "transpose_pipeline_test.mlir" - "ukernel_pipeline_transform.mlir" "vector_lowering.mlir" "vector_to_gpu.mlir" "winograd_pipeline_test.mlir" diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp index 612183d94eda..8f09f6f932f8 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp @@ -936,42 +936,6 @@ bool sharedMemTransposeFilter(AffineMap indexMap) { return false; } -//===----------------------------------------------------------------------===// -// GPU UKernel Utils -//===----------------------------------------------------------------------===// - -// TODO: Add more popular kernels into this list and the ukernel cmake. -// No real technical reason to only allow these aside from compile -// time and diskspace. -bool hasUkernelSupportedRocmArch(StringRef targetChip) { - const char *kSupportedTargetChip[] = {"gfx90a", "gfx942", "gfx1030", - "gfx1100"}; - size_t arraySize = - sizeof(kSupportedTargetChip) / sizeof(kSupportedTargetChip[0]); - for (int i = 0; i < arraySize; i++) { - // return true if targetChip is found inside kSupportedTargetChip. - if (targetChip.compare(kSupportedTargetChip[i]) == 0) - return true; - } - return false; -} - -bool hasUkernelSupportedRocmArch(IREE::HAL::ExecutableTargetAttr targetAttr) { - auto targetArch = getGPUTargetAttr(targetAttr).getArch(); - if (targetArch.empty()) - return false; - return hasUkernelSupportedRocmArch(targetArch); -} - -/// Checks if target GPU has UKernel support. -bool hasUkernelSupportedGpuArch(IREE::HAL::ExecutableTargetAttr targetAttr) { - if (isROCMBackend(targetAttr) && hasUkernelSupportedRocmArch(targetAttr)) { - return true; - } - // TODO: Once plumbed, add a CUDA backend and supported cuda arch check. 
- return false; -} - //===----------------------------------------------------------------------===// // GPU Target Information //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h index 133d7246a5fa..1bd088588af8 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h @@ -174,16 +174,6 @@ combiningKindToAllReduce(vector::CombiningKind kind); /// using shared memory when CodeGen towards the GPU. bool sharedMemTransposeFilter(AffineMap indexMap); -//===----------------------------------------------------------------------===// -// GPU UKernel Utils -//===----------------------------------------------------------------------===// - -/// Checks if target Chip(StringRef) has UKernel support. -bool hasUkernelSupportedRocmArch(StringRef targetChip); - -/// Checks if targetAttr's GPU target has UKernel support. -bool hasUkernelSupportedGpuArch(IREE::HAL::ExecutableTargetAttr targetAttr); - //===----------------------------------------------------------------------===// // GPU Target Information //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Utils/BUILD.bazel b/compiler/src/iree/compiler/Utils/BUILD.bazel index dbcdc0156dae..c7c2acc2a8fd 100644 --- a/compiler/src/iree/compiler/Utils/BUILD.bazel +++ b/compiler/src/iree/compiler/Utils/BUILD.bazel @@ -31,6 +31,7 @@ iree_compiler_cc_library( hdrs = [ "ConversionUtils.h", "ElementPackingUtils.h", + "EmbeddedDataDirectory.h", "EquivalenceUtils.h", "FlatbufferUtils.h", "Folding.h", diff --git a/compiler/src/iree/compiler/Utils/CMakeLists.txt b/compiler/src/iree/compiler/Utils/CMakeLists.txt index c4f20b2ac74f..84be0745bbf6 100644 --- a/compiler/src/iree/compiler/Utils/CMakeLists.txt +++ b/compiler/src/iree/compiler/Utils/CMakeLists.txt @@ -16,6 +16,7 @@ iree_cc_library( HDRS "ConversionUtils.h" "ElementPackingUtils.h" + "EmbeddedDataDirectory.h" "EquivalenceUtils.h" "FlatbufferUtils.h" "Folding.h" diff --git a/compiler/src/iree/compiler/Utils/EmbeddedDataDirectory.h b/compiler/src/iree/compiler/Utils/EmbeddedDataDirectory.h new file mode 100644 index 000000000000..130db1d68bf1 --- /dev/null +++ b/compiler/src/iree/compiler/Utils/EmbeddedDataDirectory.h @@ -0,0 +1,59 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_COMPILER_UTILS_EMBEDDEDDATADIRECTORY_H_ +#define IREE_COMPILER_UTILS_EMBEDDEDDATADIRECTORY_H_ + +#include +#include "llvm/ADT/StringMap.h" + +namespace mlir::iree_compiler { + +// A string-to-StringRef map that acts as a virtual filesystem: the keys are +// "filenames" and the values are file contents. +class EmbeddedDataDirectory { +public: + // Calls the given `callback` on a global singleton object, guarded by a + // global mutex. + // + // Only use this for use cases that require a global object, such as when + // exporting data between parts of the compiler that can't directly link to + // each other (e.g. from a plugin to outside of the plugin). 
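  //
  // A minimal producer/consumer sketch (the file name is illustrative and
  // `bitcode`/`fileName` are hypothetical StringRef values, not APIs defined
  // here):
  //
  //   // Publish an embedded blob, e.g. from a target plugin at init time:
  //   EmbeddedDataDirectory::withGlobal([&](EmbeddedDataDirectory &dir) {
  //     dir.addFile("iree_uk_amdgpu_argmax_f32i64.gfx942.bc", bitcode);
  //   });
  //
  //   // Consume it elsewhere in the compiler by file name:
  //   std::optional<llvm::StringRef> found;
  //   EmbeddedDataDirectory::withGlobal(
  //       [&](EmbeddedDataDirectory &dir) { found = dir.getFile(fileName); });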
+ static void + withGlobal(llvm::function_ref callback) { + static EmbeddedDataDirectory dir; + static std::mutex mutex; + std::lock_guard lock(mutex); + callback(dir); + } + + // Add a new entry if it didn't already exist. Return `true` if it was added. + bool addFile(llvm::StringRef fileName, llvm::StringRef contents) { + auto [_iter, success] = map.insert({fileName, contents}); + return success; + } + + // Get an existing entry if it exists, otherwise return nullopt. + std::optional getFile(llvm::StringRef fileName) const { + auto iter = map.find(fileName); + if (iter == map.end()) { + return std::nullopt; + } + return iter->getValue(); + } + + // Direct access to the underlying StringMap, for use cases that are not well + // served by convenience methods like addFile and getFile. For example, + // iterating over all entries. + llvm::StringMap &getMap() { return map; } + +private: + llvm::StringMap map; +}; + +} // namespace mlir::iree_compiler + +#endif // IREE_COMPILER_UTILS_EMBEDDEDDATADIRECTORY_H_ diff --git a/compiler/src/iree/compiler/Utils/unittests/BUILD.bazel b/compiler/src/iree/compiler/Utils/unittests/BUILD.bazel index 197037a2e5ba..3581fddea250 100644 --- a/compiler/src/iree/compiler/Utils/unittests/BUILD.bazel +++ b/compiler/src/iree/compiler/Utils/unittests/BUILD.bazel @@ -19,5 +19,6 @@ iree_compiler_cc_test( "//compiler/src/iree/compiler/Utils", "//compiler/src/iree/testing:gtest_main", "@com_google_googletest//:gtest", + "@llvm-project//llvm:Support", ], ) diff --git a/compiler/src/iree/compiler/Utils/unittests/CMakeLists.txt b/compiler/src/iree/compiler/Utils/unittests/CMakeLists.txt index 421262fd5d71..a850b2d7c83c 100644 --- a/compiler/src/iree/compiler/Utils/unittests/CMakeLists.txt +++ b/compiler/src/iree/compiler/Utils/unittests/CMakeLists.txt @@ -16,6 +16,7 @@ iree_cc_test( SRCS "UtilsTest.cpp" DEPS + LLVMSupport gmock gtest iree::compiler::Utils diff --git a/compiler/src/iree/compiler/Utils/unittests/UtilsTest.cpp b/compiler/src/iree/compiler/Utils/unittests/UtilsTest.cpp index d3ad37ca7512..39dc6cd1e712 100644 --- a/compiler/src/iree/compiler/Utils/unittests/UtilsTest.cpp +++ b/compiler/src/iree/compiler/Utils/unittests/UtilsTest.cpp @@ -6,8 +6,11 @@ #include #include +#include +#include "iree/compiler/Utils/EmbeddedDataDirectory.h" #include "iree/compiler/Utils/Permutation.h" +#include "llvm/Support/FormatVariadic.h" using namespace mlir::iree_compiler; using namespace testing; @@ -19,3 +22,47 @@ TEST(Permutation, MakeMovePermutation) { EXPECT_THAT(makeMovePermutation(3, 1, 2), ElementsAre(0, 2, 1)); EXPECT_THAT(makeMovePermutation(3, 2, 0), ElementsAre(2, 0, 1)); } + +TEST(EmbeddedDataDirectory, AddFileGetFile) { + EmbeddedDataDirectory dir; + EXPECT_TRUE(dir.addFile("filename1", "file contents 1")); + EXPECT_TRUE(dir.addFile("filename2", "file contents 2")); + EXPECT_FALSE(dir.addFile("filename1", "file contents 3")); + EXPECT_EQ(dir.getFile("filename1"), "file contents 1"); + EXPECT_EQ(dir.getFile("filename2"), "file contents 2"); + EXPECT_EQ(dir.getFile("filename3"), std::nullopt); +} + +TEST(EmbeddedDataDirectory, WithGlobal) { + std::vector threads; + for (int i = 0; i < 3; ++i) { + threads.emplace_back([i] { + EmbeddedDataDirectory::withGlobal([i](EmbeddedDataDirectory &globalDir) { + EXPECT_TRUE(globalDir.addFile(llvm::formatv("filename{0}", i).str(), + "file contents xxx")); + }); + }); + } + for (std::thread &thread : threads) { + thread.join(); + } + EmbeddedDataDirectory::withGlobal([](EmbeddedDataDirectory &globalDir) { + std::vector keys; + for 
(auto iter : globalDir.getMap().keys()) { + keys.push_back(iter.str()); + } + EXPECT_THAT(keys, + UnorderedElementsAre("filename0", "filename1", "filename2")); + }); +} + +TEST(EmbeddedDataDirectory, GetMap) { + EmbeddedDataDirectory dir; + EXPECT_TRUE(dir.addFile("filename1", "file contents 1")); + EXPECT_TRUE(dir.addFile("filename2", "file contents 2")); + std::vector keys; + for (auto iter : dir.getMap().keys()) { + keys.push_back(iter.str()); + } + EXPECT_THAT(keys, UnorderedElementsAre("filename1", "filename2")); +} From cb5be1dbd3560f692578c137eadbb413b41e44c7 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Tue, 3 Dec 2024 15:58:02 -0600 Subject: [PATCH 44/54] Revert "[Codegen][GPU] Add range information to GPU dispatch IDs" (#19361) Reverts iree-org/iree#17707 Potential regression and I'm not in today to debug it, reverting --- .../compiler/Codegen/Common/GPU/BUILD.bazel | 1 - .../Codegen/Common/GPU/CMakeLists.txt | 1 - .../GPU/GPUPropagateDispatchSizeBounds.cpp | 103 --------------- .../compiler/Codegen/Common/GPU/Passes.td | 5 - .../Codegen/Common/GPU/test/BUILD.bazel | 1 - .../Codegen/Common/GPU/test/CMakeLists.txt | 1 - .../gpu_propagate_dispatch_size_bounds.mlir | 122 ------------------ .../Codegen/LLVMGPU/ConvertToLLVM.cpp | 5 +- .../iree/compiler/Codegen/LLVMGPU/Passes.cpp | 10 +- .../nvvm_extract_address_computation.mlir | 2 +- .../iree/compiler/Codegen/SPIRV/Passes.cpp | 2 - .../iree/compiler/Dialect/HAL/IR/BUILD.bazel | 2 - .../compiler/Dialect/HAL/IR/CMakeLists.txt | 1 - .../iree/compiler/Dialect/HAL/IR/HALOps.cpp | 36 ------ .../iree/compiler/Dialect/HAL/IR/HALOps.td | 74 +++++++---- .../HAL/Transforms/MaterializeInterfaces.cpp | 3 +- 16 files changed, 55 insertions(+), 314 deletions(-) delete mode 100644 compiler/src/iree/compiler/Codegen/Common/GPU/GPUPropagateDispatchSizeBounds.cpp delete mode 100644 compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_propagate_dispatch_size_bounds.mlir diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel index 40cd0864a0d6..93cc6520e47d 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel @@ -72,7 +72,6 @@ iree_compiler_cc_library( "GPUPatterns.cpp", "GPUPipelining.cpp", "GPUPromoteMatmulOperands.cpp", - "GPUPropagateDispatchSizeBounds.cpp", "GPUReduceBankConflicts.cpp", "GPUReuseSharedMemoryAllocs.cpp", "GPUTensorAlloc.cpp", diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt index 354a1f1b1e21..2112a013bb4e 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt @@ -70,7 +70,6 @@ iree_cc_library( "GPUPatterns.cpp" "GPUPipelining.cpp" "GPUPromoteMatmulOperands.cpp" - "GPUPropagateDispatchSizeBounds.cpp" "GPUReduceBankConflicts.cpp" "GPUReuseSharedMemoryAllocs.cpp" "GPUTensorAlloc.cpp" diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPropagateDispatchSizeBounds.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPropagateDispatchSizeBounds.cpp deleted file mode 100644 index 43aa70be6919..000000000000 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPropagateDispatchSizeBounds.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/Common/GPU/Passes.h" -#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" -#include "iree/compiler/Codegen/Utils/GPUUtils.h" -#include "iree/compiler/Codegen/Utils/Utils.h" -#include "iree/compiler/Dialect/HAL/IR/HALOps.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Interfaces/FunctionInterfaces.h" -#include "mlir/Transforms/Passes.h" - -namespace mlir::iree_compiler { - -#define GEN_PASS_DEF_GPUPROPAGATEDISPATCHSIZEBOUNDSPASS -#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc" - -namespace { - -static void applyBounds(FunctionOpInterface funcOp, - ArrayRef workgroupSizes, - ArrayRef workgroupCounts) { - Builder b(funcOp->getContext()); - funcOp->walk([&](Operation *op) { - TypeSwitch(op) - .Case([&](gpu::ThreadIdOp tidOp) { - tidOp.setUpperBoundAttr(b.getIndexAttr( - workgroupSizes[static_cast(tidOp.getDimension())])); - }) - .Case([&](IREE::HAL::InterfaceWorkgroupSizeOp wgSizeOp) { - wgSizeOp.setUpperBoundAttr(b.getIndexAttr( - workgroupSizes[wgSizeOp.getDimension().getZExtValue()])); - }) - .Case([&](IREE::HAL::InterfaceWorkgroupIDOp wgIdOp) { - wgIdOp.setUpperBoundAttr(b.getIndexAttr( - workgroupCounts[wgIdOp.getDimension().getZExtValue()])); - }) - .Case([&](IREE::HAL::InterfaceWorkgroupCountOp wgCountOp) { - wgCountOp.setUpperBoundAttr(b.getIndexAttr( - workgroupCounts[wgCountOp.getDimension().getZExtValue()])); - }) - .Default([](Operation *) {}); - }); -} - -struct GPUPropagateDispatchSizeBoundsPass final - : impl::GPUPropagateDispatchSizeBoundsPassBase< - GPUPropagateDispatchSizeBoundsPass> { - using Base::Base; - - void runOnOperation() override { - FunctionOpInterface funcOp = getOperation(); - IREE::GPU::TargetAttr target = getGPUTargetAttr(funcOp); - if (!target) { - funcOp.emitWarning("no known target attribute late in GPU codegen"); - return; - } - SmallVector workgroupSizes( - target.getWgp().getMaxWorkgroupSizes().asArrayRef()); - SmallVector workgroupCounts( - target.getWgp().getMaxWorkgroupCounts().asArrayRef()); - - std::optional> staticWorkgroupSize = - getWorkgroupSize(funcOp); - - // Late in codegen, we've reconciled the workgroup size onto the export op. - if (std::optional exportOp = - getEntryPoint(funcOp)) { - if (std::optional exportWorkgroupSize = - exportOp->getWorkgroupSize()) { - staticWorkgroupSize = - llvm::map_to_vector(exportWorkgroupSize->getAsRange(), - [](IntegerAttr a) { return a.getInt(); }); - } - } - - if (staticWorkgroupSize) { - // Target info with no workgroup sizes gives a 0-length array, hence no - // zip_equal. 
- for (auto [size, staticSize] : - llvm::zip(workgroupSizes, *staticWorkgroupSize)) { - size = staticSize; - } - } - SmallVector staticWorkgroupCounts = getStaticNumWorkgroups(funcOp); - assert(staticWorkgroupCounts.size() <= 3 && - "workgroup counts are 3D at most"); - for (auto [count, staticCount] : - llvm::zip(workgroupCounts, staticWorkgroupCounts)) { - if (staticCount != ShapedType::kDynamic) { - count = staticCount; - } - } - - applyBounds(funcOp, workgroupSizes, workgroupCounts); - } -}; -} // namespace - -} // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td index 15c6e9c23c93..323dac5afbce 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td @@ -168,11 +168,6 @@ def GPUPromoteMatmulOperandsPass : ]; } -def GPUPropagateDispatchSizeBoundsPass : - InterfacePass<"iree-codegen-gpu-propagate-dispatch-size-bounds", "mlir::FunctionOpInterface"> { - let summary = "Pass to annotate workitem and workgroup IDs with known bounds"; -} - def GPUReduceBankConflictsPass : InterfacePass<"iree-codegen-gpu-reduce-bank-conflicts", "mlir::FunctionOpInterface"> { let summary = "Pass to try to reduce the number of bank conflicts by padding memref.alloc ops."; diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel index 7fe33161bca1..3c16ba190be2 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel @@ -40,7 +40,6 @@ iree_lit_test_suite( "gpu_nested_layout_vector_distribution_step.mlir", "gpu_pipeline.mlir", "gpu_promote_matmul_operands.mlir", - "gpu_propagate_dispatch_size_bounds.mlir", "gpu_reorder_workgroups_static.mlir", "gpu_reorder_workgroups.mlir", "gpu_reuse_shared_memory_allocs.mlir", diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt index 4b9853df8213..ebb4f77898bf 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt @@ -36,7 +36,6 @@ iree_lit_test_suite( "gpu_pack_to_instrinsics.mlir" "gpu_pipeline.mlir" "gpu_promote_matmul_operands.mlir" - "gpu_propagate_dispatch_size_bounds.mlir" "gpu_reorder_workgroups.mlir" "gpu_reorder_workgroups_static.mlir" "gpu_reuse_shared_memory_allocs.mlir" diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_propagate_dispatch_size_bounds.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_propagate_dispatch_size_bounds.mlir deleted file mode 100644 index f26f2c5dfe52..000000000000 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_propagate_dispatch_size_bounds.mlir +++ /dev/null @@ -1,122 +0,0 @@ -// RUN: iree-opt %s --split-input-file \ -// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-gpu-propagate-dispatch-size-bounds)))))" \ -// RUN: | FileCheck %s - -// Note: not the real target definition, missing types -#executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target>}> -#pipeline_layout = #hal.pipeline.layout]> - -hal.executable private @static { - hal.executable.variant public @rocm_hsaco_fb target(#executable_target) { - hal.executable.export public @static 
ordinal(0) layout(#pipeline_layout) attributes {workgroup_size = [64 : index, 2 : index, 1 : index]} { - ^bb0(%arg0: !hal.device): - %c32 = arith.constant 32 : index - %c8 = arith.constant 8 : index - %c1 = arith.constant 1 : index - hal.return %c32, %c8, %c1 : index, index, index - } - builtin.module { -// CHECK-LABEL: func.func @static - func.func @static() { -// CHECK: gpu.thread_id x upper_bound 64 -// CHECK: gpu.thread_id y upper_bound 2 -// CHECK: gpu.thread_id z upper_bound 1 - %thread_id_x = gpu.thread_id x - %thread_id_y = gpu.thread_id y - %thread_id_z = gpu.thread_id z - -// CHECK: hal.interface.workgroup.size[0] upper_bound 64 -// CHECK: hal.interface.workgroup.size[1] upper_bound 2 -// CHECK: hal.interface.workgroup.size[2] upper_bound 1 - %workgroup_size_x = hal.interface.workgroup.size[0] : index - %workgroup_size_y = hal.interface.workgroup.size[1] : index - %workgroup_size_z = hal.interface.workgroup.size[2] : index - -// CHECK: hal.interface.workgroup.id[0] upper_bound 32 -// CHECK: hal.interface.workgroup.id[1] upper_bound 8 -// CHECK: hal.interface.workgroup.id[2] upper_bound 1 - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_id_z = hal.interface.workgroup.id[2] : index - -// CHECK: hal.interface.workgroup.count[0] upper_bound 32 -// CHECK: hal.interface.workgroup.count[1] upper_bound 8 -// CHECK: hal.interface.workgroup.count[2] upper_bound 1 - %workgroup_conut_x = hal.interface.workgroup.count[0] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %workgroup_count_z = hal.interface.workgroup.count[2] : index - - return - } - } - } -} - -// ----- - -#executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", - {iree.gpu.target = #iree_gpu.target>}> -#pipeline_layout = #hal.pipeline.layout]> - -hal.executable private @dynamic { - hal.executable.variant public @rocm_hsaco_fb target(#executable_target) { - hal.executable.export public @dynamic ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): - %count_x = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%arg1] - %count_y = affine.apply affine_map<()[s0] -> (s0 ceildiv 8)>()[%arg2] - %count_z = arith.constant 1 : index - hal.return %count_x, %count_y, %count_z : index, index, index - } - builtin.module { - func.func @dynamic() { -// CHECK: gpu.thread_id x upper_bound 1024 -// CHECK: gpu.thread_id y upper_bound 1024 -// CHECK: gpu.thread_id z upper_bound 1024 - %thread_id_x = gpu.thread_id x - %thread_id_y = gpu.thread_id y - %thread_id_z = gpu.thread_id z - -// CHECK: hal.interface.workgroup.size[0] upper_bound 1024 -// CHECK: hal.interface.workgroup.size[1] upper_bound 1024 -// CHECK: hal.interface.workgroup.size[2] upper_bound 1024 - %workgroup_size_x = hal.interface.workgroup.size[0] : index - %workgroup_size_y = hal.interface.workgroup.size[1] : index - %workgroup_size_z = hal.interface.workgroup.size[2] : index - -// CHECK: hal.interface.workgroup.id[0] upper_bound 2147483647 -// CHECK: hal.interface.workgroup.id[1] upper_bound 2147483647 -// CHECK: hal.interface.workgroup.id[2] upper_bound 1 - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_id_z = hal.interface.workgroup.id[2] : index - -// CHECK: hal.interface.workgroup.count[0] upper_bound 2147483647 -// CHECK: hal.interface.workgroup.count[1] upper_bound 2147483647 -// CHECK: hal.interface.workgroup.count[2] upper_bound 1 - 
%workgroup_conut_x = hal.interface.workgroup.count[0] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %workgroup_count_z = hal.interface.workgroup.count[2] : index - - return - } - } - } -} diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp index 1441f959b0bb..c056d44538bb 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp @@ -505,10 +505,7 @@ struct HALInterfaceWorkgroupOpsConverter final int32_t index = static_cast(op.getDimension().getSExtValue()); std::array dimAttr{gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}; - NewOpTy newOp = - rewriter.replaceOpWithNewOp(op, op.getType(), dimAttr[index]); - if (IntegerAttr bound = op.getUpperBoundAttr()) - newOp.setUpperBoundAttr(bound); + rewriter.replaceOpWithNewOp(op, op.getType(), dimAttr[index]); return success(); } }; diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 64744890dbbd..26dced54768f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -1066,8 +1066,7 @@ addLowerAndOptimizeAddressComputationPasses(FunctionLikeNest &funcPassManager) { .addPass(createCSEPass) // Hoist the resulting decompositions. .addPass(createIREELoopInvariantCodeMotionPass) - .addPass(createLowerAffinePass) - .addPass(IREE::Util::createOptimizeIntArithmeticPass); + .addPass(createLowerAffinePass); } static void addLowerToLLVMGPUPasses(OpPassManager &modulePassManager, @@ -1103,9 +1102,7 @@ static void addLowerToLLVMGPUPasses(OpPassManager &modulePassManager, FunctionLikeNest funcPassManager(modulePassManager); funcPassManager.addPass(createFoldTensorExtractOpPass) .addPass(createLLVMGPUVectorLoweringPass) - .addPass(createExpandGPUOpsPass) - // Expose workitem and workgroup counts to range inference later. - .addPass(createGPUPropagateDispatchSizeBoundsPass); + .addPass(createExpandGPUOpsPass); // This pass needs to run before SCF -> CF. addLowerAndOptimizeAddressComputationPasses(funcPassManager); @@ -1133,9 +1130,6 @@ static void addLowerToLLVMGPUPasses(OpPassManager &modulePassManager, .addPass(createEmulateNarrowTypePass) .addPass(affine::createAffineExpandIndexOpsPass) .addPass(createLowerAffinePass) - // Re-run index optimizations to take care of this ronud of indexing - // even though now we can't reason about loop bounds - .addPass(IREE::Util::createOptimizeIntArithmeticPass) .addPass(createCanonicalizerPass) .addPass(createCSEPass); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_extract_address_computation.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_extract_address_computation.mlir index ba6b5da7f1fa..6c1c5e117016 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_extract_address_computation.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_extract_address_computation.mlir @@ -40,7 +40,7 @@ // CHECK-DAG: %[[C8192:.*]] = llvm.mlir.constant(8192 : index) : i64 // // Match the interesting special registers. 
-// CHECK-DAG: %[[TID_Y:.*]] = nvvm.read.ptx.sreg.tid.y range : i32 +// CHECK-DAG: %[[TID_Y:.*]] = nvvm.read.ptx.sreg.tid.y : i32 // CHECK-DAG: %[[TID_Y_EXT:.*]] = llvm.sext %[[TID_Y]] : i32 to i64 // CHECK-DAG: %[[LANEID:.*]] = nvvm.read.ptx.sreg.laneid range : i32 // CHECK-DAG: %[[LANEID_EXT:.*]] = llvm.sext %[[LANEID]] : i32 to i64 diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp index 511dbe785300..ea0aa9f45116 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp @@ -227,11 +227,9 @@ static void addMemRefLoweringPasses(OpPassManager &modulePassManager) { /// Adds passes to perform the final SPIR-V conversion. static void addSPIRVLoweringPasses(OpPassManager &modulePassManager) { FunctionLikeNest(modulePassManager) - .addPass(createGPUPropagateDispatchSizeBoundsPass) .addPass(createCanonicalizerPass) .addPass(createCSEPass) .addPass(createLowerAffinePass) - .addPass(IREE::Util::createOptimizeIntArithmeticPass) // Lower ApplyScale before the i64 Emulation Pass so that new 64-bit ops // are also emulated if not supported by the target. diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/BUILD.bazel b/compiler/src/iree/compiler/Dialect/HAL/IR/BUILD.bazel index 3f80245bfc8c..d9d6a92ef71c 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/IR/BUILD.bazel +++ b/compiler/src/iree/compiler/Dialect/HAL/IR/BUILD.bazel @@ -35,7 +35,6 @@ iree_td_library( "//compiler/src/iree/compiler/Dialect/Util/IR:td_files", "@llvm-project//mlir:BuiltinDialectTdFiles", "@llvm-project//mlir:FuncTdFiles", - "@llvm-project//mlir:InferIntRangeInterfaceTdFiles", "@llvm-project//mlir:InferTypeOpInterfaceTdFiles", "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:ViewLikeInterfaceTdFiles", @@ -82,7 +81,6 @@ iree_compiler_cc_library( "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:FunctionInterfaces", "@llvm-project//mlir:IR", - "@llvm-project//mlir:InferIntRangeInterface", "@llvm-project//mlir:InferTypeOpInterface", "@llvm-project//mlir:MemRefDialect", "@llvm-project//mlir:Parser", diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/HAL/IR/CMakeLists.txt index 846bcf0d38a2..837855157e90 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/IR/CMakeLists.txt +++ b/compiler/src/iree/compiler/Dialect/HAL/IR/CMakeLists.txt @@ -45,7 +45,6 @@ iree_cc_library( MLIRFuncDialect MLIRFunctionInterfaces MLIRIR - MLIRInferIntRangeInterface MLIRInferTypeOpInterface MLIRMemRefDialect MLIRParser diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp index cb5bb411810a..7210d402598d 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp +++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp @@ -19,7 +19,6 @@ #include "mlir/IR/SymbolTable.h" #include "mlir/IR/TypeUtilities.h" #include "mlir/Interfaces/FunctionImplementation.h" -#include "mlir/Interfaces/InferIntRangeInterface.h" #include "mlir/Interfaces/InferTypeOpInterface.h" namespace mlir::iree_compiler::IREE::HAL { @@ -2085,59 +2084,24 @@ static void getAsmResultNamesForInterfaceWorkgroupOp( } } -// Minimum is the smallest possible result we could get. It's 0 for ID-like -// operations and 1 for count-like ones. 
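// (Worked example: with upper_bound 32, a workgroup ID op inferred the range
// [0, 31] while a workgroup count op inferred [1, 32]; without a bound, only
// the 0-or-1 lower end was known.)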
-static void setResultRangesForInterfaceWorkgroupOp( - Value result, const std::optional &upperBound, - SetIntRangeFn setResultRanges, int64_t minimum) { - unsigned width = ConstantIntRanges::getStorageBitwidth(result.getType()); - if (!upperBound.has_value()) { - setResultRanges( - result, ConstantIntRanges::fromSigned(APInt(width, minimum), - APInt::getSignedMaxValue(width))); - return; - } - setResultRanges(result, - ConstantIntRanges::fromUnsigned(APInt(width, minimum), - *upperBound + minimum - 1)); -} - void InterfaceWorkgroupIDOp::getAsmResultNames( function_ref setNameFn) { getAsmResultNamesForInterfaceWorkgroupOp("workgroup_id_", getDimension(), getResult(), setNameFn); } -void InterfaceWorkgroupIDOp::inferResultRanges( - ArrayRef argRanges, SetIntRangeFn setResultRanges) { - setResultRangesForInterfaceWorkgroupOp(getResult(), getUpperBound(), - setResultRanges, /*minimum=*/0); -} - void InterfaceWorkgroupCountOp::getAsmResultNames( function_ref setNameFn) { getAsmResultNamesForInterfaceWorkgroupOp("workgroup_count_", getDimension(), getResult(), setNameFn); } -void InterfaceWorkgroupCountOp::inferResultRanges( - ArrayRef argRanges, SetIntRangeFn setResultRanges) { - setResultRangesForInterfaceWorkgroupOp(getResult(), getUpperBound(), - setResultRanges, /*minimum=*/1); -} - void InterfaceWorkgroupSizeOp::getAsmResultNames( function_ref setNameFn) { getAsmResultNamesForInterfaceWorkgroupOp("workgroup_size_", getDimension(), getResult(), setNameFn); } -void InterfaceWorkgroupSizeOp::inferResultRanges( - ArrayRef argRanges, SetIntRangeFn setResultRanges) { - setResultRangesForInterfaceWorkgroupOp(getResult(), getUpperBound(), - setResultRanges, /*minimum=*/1); -} - //===----------------------------------------------------------------------===// // hal.fence.* //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td index d51e430b57c7..16f1eadfdffd 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td +++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td @@ -3029,28 +3029,9 @@ def OpGroupInterfaceOps : OpDocGroup { let opDocGroup = OpGroupInterfaceOps in { -class HAL_InterfaceWorkgroupOp traits = []> - : HAL_PureOp, - DeclareOpInterfaceMethods])> { - let arguments = (ins - IndexAttr:$dimension, - OptionalAttr:$upper_bound); - let results = (outs HAL_Dim:$result); - - let builders = [ - OpBuilder<(ins "unsigned":$dim), - [{ - build($_builder, $_state, $_builder.getIndexType(), $_builder.getIndexAttr(dim), ::mlir::IntegerAttr{}); - }]>, - ]; - - let assemblyFormat = [{ - `[` $dimension `]` (`upper_bound` $upper_bound^)? 
attr-dict `:` type($result) - }]; -} - -def HAL_InterfaceWorkgroupIDOp : HAL_InterfaceWorkgroupOp<"interface.workgroup.id"> { +def HAL_InterfaceWorkgroupIDOp : HAL_PureOp<"interface.workgroup.id", [ + DeclareOpInterfaceMethods, +]> { let summary = [{returns the index of the current workgroup in the grid}]; let description = [{ The global workgroup ID of the current tile in the range of @@ -3065,9 +3046,25 @@ def HAL_InterfaceWorkgroupIDOp : HAL_InterfaceWorkgroupOp<"interface.workgroup.i %z = hal.interface.workgroup.id[2] : index ``` }]; + + let arguments = (ins IndexAttr:$dimension); + let results = (outs HAL_Dim:$result); + + let builders = [ + OpBuilder<(ins "unsigned":$dim), + [{ + build($_builder, $_state, $_builder.getIndexType(), $_builder.getIndexAttr(dim)); + }]>, + ]; + + let assemblyFormat = [{ + `[` $dimension `]` attr-dict `:` type($result) + }]; } -def HAL_InterfaceWorkgroupCountOp : HAL_InterfaceWorkgroupOp<"interface.workgroup.count"> { +def HAL_InterfaceWorkgroupCountOp : HAL_PureOp<"interface.workgroup.count", [ + DeclareOpInterfaceMethods, +]> { let summary = [{returns the total workgroup count of the grid}]; let description = [{ The total number of workgroups along each dimension in the dispatch grid. @@ -3084,9 +3081,24 @@ def HAL_InterfaceWorkgroupCountOp : HAL_InterfaceWorkgroupOp<"interface.workgrou ``` }]; + let arguments = (ins IndexAttr:$dimension); + let results = (outs HAL_Dim:$result); + + let builders = [ + OpBuilder<(ins "unsigned":$dim), + [{ + build($_builder, $_state, $_builder.getIndexType(), $_builder.getIndexAttr(dim)); + }]>, + ]; + + let assemblyFormat = [{ + `[` $dimension `]` attr-dict `:` type($result) + }]; } -def HAL_InterfaceWorkgroupSizeOp : HAL_InterfaceWorkgroupOp<"interface.workgroup.size"> { +def HAL_InterfaceWorkgroupSizeOp : HAL_PureOp<"interface.workgroup.size", [ + DeclareOpInterfaceMethods, +]> { let summary = [{returns the size of each workgroup in invocations}]; let description = [{ The number of local invocations within the current workgroup along each @@ -3102,6 +3114,20 @@ def HAL_InterfaceWorkgroupSizeOp : HAL_InterfaceWorkgroupOp<"interface.workgroup %z = hal.interface.workgroup.size[2] : index ``` }]; + + let arguments = (ins IndexAttr:$dimension); + let results = (outs HAL_Dim:$result); + + let builders = [ + OpBuilder<(ins "unsigned":$dim), + [{ + build($_builder, $_state, $_builder.getIndexType(), $_builder.getIndexAttr(dim)); + }]>, + ]; + + let assemblyFormat = [{ + `[` $dimension `]` attr-dict `:` type($result) + }]; } def HAL_InterfaceConstantLoadOp : HAL_PureOp<"interface.constant.load"> { diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp b/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp index d830c078b4bb..9f3bee7d529a 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp +++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp @@ -514,8 +514,7 @@ struct ConvertDispatchWorkgroupInfoPattern final LogicalResult matchAndRewrite(SrcOp op, PatternRewriter &rewriter) const override { rewriter.replaceOpWithNewOp(op, op.getResult().getType(), - op.getDimensionAttr(), - /*upper_bound=*/nullptr); + op.getDimensionAttr()); return success(); } }; From a767061dc5c803b09f821118a543b265f2eb910e Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram <96096277+nirvedhmeshram@users.noreply.github.com> Date: Tue, 3 Dec 2024 16:03:42 -0600 Subject: [PATCH 45/54] [GPU] Add config and pass to do padding in TileAndFuse 
pipeline (#19271) We add a padding config which is set for unaligned to intrinsic shapes in TileAndFuse pipeline when an unaligned schedule is found. With this PR we can now check-in an example of an unaligned to intrinsic batch matmul using intrinsics as a pipeline lowering test. --------- Signed-off-by: Nirvedh --- .../compiler/Codegen/Common/GPU/BUILD.bazel | 1 + .../Codegen/Common/GPU/CMakeLists.txt | 1 + .../Codegen/Common/GPU/GPUPadOperands.cpp | 82 +++++++++++++++++++ .../compiler/Codegen/Common/GPU/Passes.td | 10 +++ .../Codegen/Common/GPU/test/BUILD.bazel | 1 + .../Codegen/Common/GPU/test/CMakeLists.txt | 1 + .../Common/GPU/test/gpu_pad_operands.mlir | 26 ++++++ .../Dialect/GPU/IR/GPULoweringConfigUtils.cpp | 10 +++ .../Dialect/GPU/IR/GPULoweringConfigUtils.h | 3 + .../Dialect/GPU/TargetUtils/ConfigUtils.cpp | 65 ++++++++++----- .../iree/compiler/Codegen/LLVMGPU/Passes.cpp | 1 + .../test/ROCDL/config_tile_and_fuse.mlir | 22 +++++ .../test/ROCDL/pipeline_tile_and_fuse.mlir | 79 ++++++++++++++++++ 13 files changed, 282 insertions(+), 20 deletions(-) create mode 100644 compiler/src/iree/compiler/Codegen/Common/GPU/GPUPadOperands.cpp create mode 100644 compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_pad_operands.mlir diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel index 93cc6520e47d..ec5ccfcb813b 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel @@ -69,6 +69,7 @@ iree_compiler_cc_library( "GPUMultiBuffering.cpp", "GPUNestedLayoutDistributionPatterns.cpp", "GPUPackToIntrinsics.cpp", + "GPUPadOperands.cpp", "GPUPatterns.cpp", "GPUPipelining.cpp", "GPUPromoteMatmulOperands.cpp", diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt index 2112a013bb4e..7fe3df8e4cf1 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt @@ -67,6 +67,7 @@ iree_cc_library( "GPUMultiBuffering.cpp" "GPUNestedLayoutDistributionPatterns.cpp" "GPUPackToIntrinsics.cpp" + "GPUPadOperands.cpp" "GPUPatterns.cpp" "GPUPipelining.cpp" "GPUPromoteMatmulOperands.cpp" diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPadOperands.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPadOperands.cpp new file mode 100644 index 000000000000..538af2f92dca --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPadOperands.cpp @@ -0,0 +1,82 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" +#include "iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h" +#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Transforms/Passes.h" + +namespace mlir::iree_compiler { + +#define GEN_PASS_DEF_GPUPADOPERANDSPASS +#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc" + +namespace { + +static LogicalResult padLinalgOpToStaticSizes(RewriterBase &rewriter, + linalg::LinalgOp linalgOp, + ArrayRef padding) { + SmallVector paddingDims = + llvm::to_vector(llvm::seq(0, linalgOp.getNumLoops())); + SmallVector nofoldFlags(linalgOp.getNumDpsInputs(), /*nofold=*/false); + SmallVector paddingValueAttributes; + for (auto &operand : linalgOp->getOpOperands()) { + Type elemType = getElementTypeOrSelf(operand.get().getType()); + paddingValueAttributes.push_back(rewriter.getZeroAttr(elemType)); + } + + auto options = + linalg::LinalgPaddingOptions() + .setPaddingDimensions(paddingDims) + .setPaddingValues(paddingValueAttributes) + .setPadToMultipleOf(padding) + .setNofoldFlags(nofoldFlags) + .setCopyBackOp(linalg::LinalgPaddingOptions::CopyBackOp::None); + + linalg::LinalgOp paddedOp; + SmallVector newResults; + SmallVector padOps; + if (failed(rewriteAsPaddedOp(rewriter, linalgOp, options, paddedOp, + newResults, padOps))) { + return rewriter.notifyMatchFailure(linalgOp, + "failed to pad contraction op"); + } + rewriter.replaceOp(linalgOp, newResults.front()); + return success(); +} + +struct GPUPadOperandsPass final + : impl::GPUPadOperandsPassBase { + void runOnOperation() override { + FunctionOpInterface funcOp = getOperation(); + + IRRewriter rewriter(funcOp); + funcOp.walk([&](linalg::LinalgOp op) { + auto loweringConfig = + getLoweringConfig(op); + if (!loweringConfig) { + return; + } + + std::optional> paddingTileSizes = + getPaddingList(loweringConfig); + if (!paddingTileSizes) { + return; + } + + rewriter.setInsertionPoint(op); + if (failed(padLinalgOpToStaticSizes(rewriter, op, + paddingTileSizes.value()))) { + return signalPassFailure(); + } + }); + } +}; + +} // namespace +} // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td index 323dac5afbce..789130940477 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td @@ -134,6 +134,16 @@ def GPUPackToIntrinsicsPass : ]; } +def GPUPadOperandsPass : + InterfacePass<"iree-codegen-gpu-pad-operands", + "mlir::FunctionOpInterface"> { + let summary = "Pass to pad operands of ops with padding configuration provided. 
"; + let dependentDialects = [ + "::mlir::linalg::LinalgDialect", + "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect" + ]; +} + def GPUPipeliningPass : InterfacePass<"iree-codegen-gpu-pipelining", "mlir::FunctionOpInterface"> { let summary = "Pass to do software pipelining."; diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel index 3c16ba190be2..41afbb6559f3 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel @@ -38,6 +38,7 @@ iree_lit_test_suite( "gpu_nested_layout_contract_amdgpu.mlir", "gpu_nested_layout_vector_distribution.mlir", "gpu_nested_layout_vector_distribution_step.mlir", + "gpu_pad_operands.mlir", "gpu_pipeline.mlir", "gpu_promote_matmul_operands.mlir", "gpu_reorder_workgroups_static.mlir", diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt index ebb4f77898bf..ad86649ada78 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt @@ -34,6 +34,7 @@ iree_lit_test_suite( "gpu_nested_layout_vector_distribution.mlir" "gpu_nested_layout_vector_distribution_step.mlir" "gpu_pack_to_instrinsics.mlir" + "gpu_pad_operands.mlir" "gpu_pipeline.mlir" "gpu_promote_matmul_operands.mlir" "gpu_reorder_workgroups.mlir" diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_pad_operands.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_pad_operands.mlir new file mode 100644 index 000000000000..162186549815 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_pad_operands.mlir @@ -0,0 +1,26 @@ +// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-pad-operands))" | FileCheck %s + +#lowering_config = #iree_gpu.lowering_config<{padding = [3, 7, 11]}> + +func.func @matmul(%a: tensor<32x1024xf32>, %b: tensor<1024x128xf32>) -> tensor<32x128xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %empty = tensor.empty() : tensor<32x128xf32> + %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<32x128xf32>) -> tensor<32x128xf32> + %mm = linalg.matmul {lowering_config = #lowering_config} + ins(%a, %b : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%fill : tensor<32x128xf32>) -> tensor<32x128xf32> + return %mm : tensor<32x128xf32> +} + +// CHECK-LABEL: func.func @matmul +// CHECK-SAME: %[[A:[A-Za-z0-9]+]]: tensor<32x1024xf32> +// CHECK-SAME: %[[B:[A-Za-z0-9]+]]: tensor<1024x128xf32> +// CHECK: %[[FILL:.+]] = linalg.fill {{.*}} -> tensor<32x128xf32> +// CHECK: %[[PADDED_LHS:.+]] = tensor.pad %[[A]] low[0, 0] high[1, 10] +// CHECK: %[[PADDED_RHS:.+]] = tensor.pad %[[B]] low[0, 0] high[10, 5] +// CHECK: %[[PADDED_INIT:.+]] = tensor.pad %[[FILL]] low[0, 0] high[1, 5] +// CHECK: %[[PADDED_RESULT:.+]] = linalg.matmul +// CHECK-SAME: ins(%[[PADDED_LHS]], %[[PADDED_RHS]] : tensor<33x1034xf32>, tensor<1034x133xf32>) +// CHECK-SAME: outs(%[[PADDED_INIT]] : tensor<33x133xf32>) -> tensor<33x133xf32> +// CHECK: %[[EXTRACT:.+]] = tensor.extract_slice %[[PADDED_RESULT]][0, 0] [32, 128] [1, 1] +// CHECK-SAME: : tensor<33x133xf32> to tensor<32x128xf32> +// CHECK: return %[[EXTRACT]] : tensor<32x128xf32> diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.cpp 
b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.cpp index 23e50be4e31b..2d3a08f7ad38 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.cpp @@ -87,4 +87,14 @@ void setPromotedOperandList(MLIRContext *context, b.getI64ArrayAttr(operands)); } +constexpr StringLiteral kPaddingName = "padding"; + +std::optional> getPaddingList(LoweringConfigAttr config) { + auto array = config.getAttributes().getAs(kPaddingName); + if (!array) { + return std::nullopt; + } + return getIntegerVector(array); +} + } // namespace mlir::iree_compiler::IREE::GPU diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h index 25240907ba4c..ee8c7ab18fea 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h @@ -35,6 +35,9 @@ void setPromotedOperandList(MLIRContext *context, SmallVectorImpl &attrs, ArrayRef operands); +/// Helper to retrieve list of operand to pad. +std::optional> getPaddingList(LoweringConfigAttr config); + } // namespace mlir::iree_compiler::IREE::GPU #endif // IREE_COMPILER_CODEGEN_DIALECT_GPU_IR_GPULOWERINGCONFIGUTILS_H_ diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp index b10a567f1ca2..fd671b4e6913 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp @@ -108,10 +108,9 @@ setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target, /// Given a target and a matmul problem, try to find an MMA schedule for the /// problem based on the available mma intrinsics. -static std::optional -getMmaScheduleFromProblemAndTarget(IREE::GPU::TargetAttr target, - GPUMatmulShapeType problem, - bool transposedLhs, bool transposedRhs) { +static std::optional getMmaScheduleFromProblemAndTarget( + IREE::GPU::TargetAttr target, GPUMatmulShapeType problem, + bool transposedLhs, bool transposedRhs, bool mustBeAligned = true) { const int64_t targetSubgroupSize = target.getPreferredSubgroupSize(); SmallVector intrinsics; for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { @@ -153,14 +152,16 @@ getMmaScheduleFromProblemAndTarget(IREE::GPU::TargetAttr target, int64_t maxSharedMemoryBytes = target.getWgp().getMaxWorkgroupMemoryBytes(); // First try to find a schedule with an exactly matching intrinsic. - std::optional schedule = - deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes, - targetSubgroupSize, transposedLhs, transposedRhs); + std::optional schedule = deduceMMASchedule( + problem, intrinsics, seeds, maxSharedMemoryBytes, targetSubgroupSize, + transposedLhs, transposedRhs, /*canUpcastAcc=*/false, + /*mustBeAligned*/ mustBeAligned); if (!schedule) { // Then try again by allowing upcasting accumulator. 
schedule = deduceMMASchedule( problem, intrinsics, seeds, maxSharedMemoryBytes, targetSubgroupSize, - transposedLhs, transposedRhs, /*canUpcastAcc=*/true); + transposedLhs, transposedRhs, /*canUpcastAcc=*/true, + /*mustBeAligned*/ mustBeAligned); } return schedule; } @@ -173,7 +174,8 @@ static FailureOr> getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, ArrayRef maps, ArrayRef operands, - IREE::GPU::TargetAttr target) { + IREE::GPU::TargetAttr target, + bool hasFusedLeadingOp) { if (target.getWgp().getMma().empty()) return failure(); @@ -238,9 +240,23 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, nDims.back() != llvm::cast(maps[1].getResults().back()).getPosition(); + bool mustBeAligned = true; std::optional schedule = getMmaScheduleFromProblemAndTarget( target, problem, transposedLhs, transposedRhs); + // TODO (nirvedhmeshram, jerryyin): Support all GEMM types. + // TODO (nirvedhmeshram): Support fused leading op. + // TODO (nirvedhmeshram, qedawkins): The performance with this will be bad if + // the GEMM is accumulating (i.e doesnt have a zero fill dpsInit) as that + // buffer currently gets materialized as private memory. We need to add + // missing patterns to fix that. + if (!schedule && !contractionDims.batch.empty() && !hasFusedLeadingOp) { + LDBG("Attempting to deduce unaligned TileAndFuse MMA schedulee"); + mustBeAligned = false; + schedule = getMmaScheduleFromProblemAndTarget( + target, problem, transposedLhs, transposedRhs, mustBeAligned); + } + if (!schedule) { LDBG("Failed to deduce TileAndFuse MMA schedule"); return failure(); @@ -270,14 +286,6 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, reductionTileSizes[k] = 1; } - // Adjust the inner bound size for packing to intrinsic shapes, since tiling - // happens after packing. - assert(bounds[mDims.back()] % schedule->mSize == 0 && - bounds[nDims.back()] % schedule->nSize == 0 && - "expected inner bound to be evenly divisible by schedule sizes."); - bounds[mDims.back()] /= schedule->mSize; - bounds[nDims.back()] /= schedule->nSize; - // Compute the M/N dimension tile sizes by multiplying subgroup information. for (auto [i, mDim] : llvm::enumerate(mDims)) { workgroupTileSizes[mDim] = @@ -318,7 +326,22 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, attrs.emplace_back(StringAttr::get(context, "subgroup"), b.getI64ArrayAttr(subgroupTileSizes)); attrs.emplace_back(StringAttr::get(context, "mma_kind"), mmaKind); - GPU::setPromotedOperandList(context, attrs, {0, 1}); + if (mustBeAligned) { + GPU::setPromotedOperandList(context, attrs, {0, 1}); + } else { + // TODO (nirvedhmeshram, Max191, jerryyin) : Add support so that unaligned + // shapes do not require c promotion. + // TODO (nirvedhmeshram, jerryyin) : When using c promotion the heuristics + // used during finding a schedule need to be updated to account for the + // extra shared memory for the result. 
+ GPU::setPromotedOperandList(context, attrs, {0, 1, 2}); + SmallVector paddingTileSizes = workgroupTileSizes; + int64_t innerKDim = contractionDims.k.back(); + int64_t kPackFactor = std::get<2>(mmaKind.getMNKShape()); + paddingTileSizes[innerKDim] = reductionTileSizes[innerKDim] * kPackFactor; + attrs.emplace_back(StringAttr::get(context, "padding"), + b.getI64ArrayAttr(paddingTileSizes)); + } auto configDict = DictionaryAttr::get(context, attrs); auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict); int64_t flatWorkgroupSize = @@ -357,7 +380,8 @@ setIGEMMConvolutionLoweringConfig(IREE::GPU::TargetAttr target, SmallVector bounds = igemmLoopBounds.value(); FailureOr> configAndWgSize = getMatmulLoweringConfigAndWorkgroupSize( - bounds, igemmContractionMaps.value(), igemmOperands.value(), target); + bounds, igemmContractionMaps.value(), igemmOperands.value(), target, + /*hasFusedLeadingOp=*/true); if (failed(configAndWgSize)) { return failure(); } @@ -400,7 +424,8 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target, LDBG("Matmul TileAndFuse Config"); FailureOr> configAndWgSize = - getMatmulLoweringConfigAndWorkgroupSize(bounds, maps, operands, target); + getMatmulLoweringConfigAndWorkgroupSize(bounds, maps, operands, target, + hasFusedLeadingOp(linalgOp)); if (failed(configAndWgSize)) { return failure(); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 26dced54768f..b79abdd0eb19 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -351,6 +351,7 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager, /*convertToDpsOptions=*/std::nullopt); // Step 1. Promote matmul operands and pack to intrinsic shapes. + funcPassManager.addPass(createGPUPadOperandsPass()); funcPassManager.addPass(createGPUPromoteMatmulOperandsPass()); funcPassManager.addPass(createGPUPackToIntrinsicsPass()); // Decompose packs and unpacks that are at the function boundary. 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir index c976a334be9b..ddb7dc4ac5b5 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir @@ -256,3 +256,25 @@ module { // CHECK: iree_gpu.multi_mma {{.*}}lowering_config = #iree_gpu.lowering_config // CHECK-SAME: reduction = [0, 0, 1] // CHECK-SAME: workgroup = [1, 1, 0] + +// ----- + +module { +func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x577x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> { + %c0 = arith.constant 0.0 : f32 + %empty = tensor.empty() : tensor<12x577x577xf32> + %fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> + %mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x577x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> + return %mm : tensor<12x577x577xf32> +} +} + +// CHECK-LABEL: func.func @unaligned_to_intrinsic_batched_matmul +// CHECK-SAME: #iree_codegen.translation_info} +// CHECK: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config +// CHECK-SAME: padding = [1, 16, 16, 4] +// CHECK-SAME: promote_operands = [0, 1, 2] +// CHECK-SAME: reduction = [0, 0, 0, 1] +// CHECK-SAME: subgroup = [0, 1, 1, 0] +// CHECK-SAME: workgroup = [1, 16, 16, 0] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir index 3f5b280b6342..a716c6b7579c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir @@ -1087,3 +1087,82 @@ hal.executable public @main { // CHECK: vector.transfer_write %[[SHARED_READ]], %[[B2]] // CHECK: } // CHECK: } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} + +// ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding +]> +#config = #iree_gpu.lowering_config<{ + mma_kind = #iree_gpu.mma_layout, + padding = [1, 16, 64, 4], + promote_operands = [0, 1, 2], + reduction = [0, 0, 0, 1], + subgroup = [0, 1, 1, 0], + workgroup = [1, 16, 64, 0] +}> +#translation = #iree_codegen.translation_info + } +> +hal.executable public @main { + hal.executable.variant public @rocm_hsaco_fb target(#hal.executable.target<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @unaligned_to_intrinsic_batched_matmul_dispatch_0_batch_matmul_12x577x577x577_f32 ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @unaligned_to_intrinsic_batched_matmul() attributes {translation_info = #translation} { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : 
!flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 577, 577], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<12x577x577xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [12, 577, 577], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<12x577x577xf32> + %5 = tensor.empty() : tensor<12x577x577xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> + %7 = linalg.batch_matmul {lowering_config = #config} ins(%3, %4 : tensor<12x577x577xf32>, tensor<12x577x577xf32>) outs(%6 : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [12, 577, 577], strides = [1, 1, 1] : tensor<12x577x577xf32> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CHECK-LABEL: func @unaligned_to_intrinsic_batched_matmul +// CHECK-DAG: %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0) +// CHECK-DAG: %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1) +// CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2) +// CHECK-DAG: memref.alloc() : memref<1x4x66xf32, #gpu.address_space> +// CHECK-DAG: memref.alloc() : memref<1x16x6xf32, #gpu.address_space> +// CHECK-DAG: memref.alloc() : memref<1x1x16x4x16xf32, #gpu.address_space> +// CHECK: scf.forall ({{.*}}) in (12, 37, 10) { +// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c145 step %c1 {{.*}} -> (vector<1x1x1x4x1xf32>) +// CHECK: gpu.barrier +// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read {{.*}} vector<4xf32> +// CHECK-DAG: vector.transfer_write %[[LHS_RD]] +// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read {{.*}} vector<1xf32> +// CHECK-DAG: vector.transfer_write %[[RHS_RD]] +// CHECK: gpu.barrier +// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space>, vector<1xf32> +// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space>, vector<1xf32> +// CHECK-COUNT-1: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 4 : i32, m = 16 : i32, n = 16 : i32 +// CHECK: scf.yield +// CHECK: %[[LOOP_T:.+]] = vector.shape_cast %[[LOOP]] : vector<1x1x1x4x1xf32> to vector<4x1x1xf32> +// CHECK: vector.transfer_write %[[LOOP_T]] +// CHECK: scf.for {{.*}} { +// CHECK: %[[SHARED_READ:.+]] = vector.transfer_read {{.*}} #gpu.address_space>, vector<1xf32> +// CHECK: vector.transfer_write %[[SHARED_READ]], %[[B2]] +// CHECK: } +// CHECK: } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} From 6ff85dfbb829b24f630e2fb143323f7f4e92f4a1 Mon Sep 17 00:00:00 2001 From: Han-Chung Wang Date: Tue, 3 Dec 2024 15:35:18 -0800 Subject: [PATCH 46/54] [Codegen][NFC] Remove duplicate c++ constants. (#19340) The constant is defined in [IREECodegenAttrs.h](https://github.com/iree-org/iree/blob/f2abfa8b5bdf17ad363cad0af198278c2e700113/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h#L41). The revision removes the one in the `.cpp` file. 
Signed-off-by: hanhanW --- .../compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp index 3def02aee6df..9b88f13a4d22 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp @@ -22,7 +22,6 @@ #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp.inc" #include "iree/compiler/Codegen/Dialect/Codegen/IR/LoweringConfigEnums.cpp.inc" -static const char kConfigAttrName[] = "lowering_config"; static const char kTranslationInfoAttrName[] = "translation_info"; static const char kCompilationInfoAttrName[] = "compilation_info"; From 29229dfb84c36d90a7d60b0e03f61a7b6e0a8d58 Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Wed, 4 Dec 2024 00:02:44 +0000 Subject: [PATCH 47/54] [GPU] Add gather fusion tests for vector distribution (#19209) VectorDistribution now supports gather fusion on producers. This pr adds pipeline tests for that. There are still numerical issues being tracked seperatly,related to distribution of gather. --- .../pipeline_vector_distribute_gfx942.mlir | 142 ++++++++++++++++++ 1 file changed, 142 insertions(+) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir index 184d49799faf..4396888ad90b 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir @@ -884,6 +884,77 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // ----- +#map = affine_map<(d0, d1) -> (d0, d1)> +#map1 = affine_map<(d0, d1, d2) -> (d0, d2)> +#map2 = affine_map<(d0, d1, d2) -> (d1, d2)> +#map3 = affine_map<(d0, d1, d2) -> (d0, d1)> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding]> +#translation = #iree_codegen.translation_info +#config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, promote_operands = [0, 1], reduction = [0, 0, 64], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 128, 0]}> + +hal.executable public @matmul_gather_rhs { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @matmul_gather_rhs ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matmul_gather_rhs() attributes {translation_info = #translation} { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %4 = 
flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4096, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x64xf16> + %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x64xi64> + %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x64xf16> + %7 = tensor.empty() : tensor<4096x4096xf16> + %8 = tensor.empty() : tensor<4096x4096xf32> + %9 = tensor.empty() : tensor<4096x64xf16> + %10 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<4096x64xi64>) outs(%9 : tensor<4096x64xf16>) { + ^bb0(%in: i64, %out: f16): + %14 = linalg.index 0 : index + %15 = arith.index_cast %in : i64 to index + %extracted = tensor.extract %4[%14, %15] : tensor<4096x64xf16> + linalg.yield %extracted : f16 + } -> tensor<4096x64xf16> + %11 = linalg.fill ins(%cst : f32) outs(%8 : tensor<4096x4096xf32>) -> tensor<4096x4096xf32> + %12 = linalg.generic {indexing_maps = [#map1, #map2, #map3], + iterator_types = ["parallel", "parallel", "reduction"]} + ins(%6, %10 : tensor<4096x64xf16>, tensor<4096x64xf16>) + outs(%11 : tensor<4096x4096xf32>) + attrs = {lowering_config = #config} { + ^bb0(%in: f16, %in_0: f16, %out: f32): + %14 = arith.extf %in : f16 to f32 + %15 = arith.extf %in_0 : f16 to f32 + %16 = arith.mulf %14, %15 : f32 + %17 = arith.addf %out, %16 : f32 + linalg.yield %17 : f32 + } -> tensor<4096x4096xf32> + %13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<4096x4096xf32>) outs(%7 : tensor<4096x4096xf16>) { + ^bb0(%in: f32, %out: f16): + %14 = arith.truncf %in : f32 to f16 + linalg.yield %14 : f16 + } -> tensor<4096x4096xf16> + flow.dispatch.tensor.store %13, %3, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : tensor<4096x4096xf16> -> !flow.dispatch.tensor> + return + } + } + } +} + +// CHECK-LABEL: func.func @matmul_gather_rhs +// CHECK: vector.gather +// CHECK-COUNT-32: amdgpu.mfma + +// ----- + #config = #iree_gpu.lowering_config<{workgroup = [1, 64, 0, 0, 64], reduction = [0, 0, 0, 64, 0], promote_operands = [0, 1, 2]}> #translation = #iree_codegen.translation_info @@ -1169,3 +1240,74 @@ hal.executable private @online_attention_split_k2 { // MEMORY-LABEL: func.func @online_attention_split_k2() // MEMORY-COUNT-3: memref.alloc // MEMORY-NOT: memref.alloc + +// ----- + +#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb"> +#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d4)> +#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d5, d3)> +#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> ()> +#map5 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding, #hal.pipeline.binding, #hal.pipeline.binding]> +#translation = #iree_codegen.translation_info + +#qk_config = {attention_qk_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, promote_operands = [0, 1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} +#pv_config = {attention_pv_matmul, lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, promote_operands = [1], subgroup_m_count = 2 : i64, subgroup_n_count = 1 : i64}>} +#config = 
#iree_gpu.lowering_config<{promote_operands = [0, 1, 2], reduction = [0, 0, 0, 0, 0, 64], workgroup = [1, 1, 64, 64, 0, 0]}> + +module { + hal.executable public @attention_gather_k { + hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { + hal.executable.export public @attention_gather_k ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @attention_gather_k() attributes {translation_info = #translation} { + %cst = arith.constant 1.250000e-01 : f16 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 10, 4096, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x10x4096x64xf16> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [2, 10, 4096, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x10x4096x64xi64> + %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 10, 4096, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x10x4096x64xf16> + %8 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [2, 10, 4096, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x10x4096x64xf16> + %9 = tensor.empty() : tensor<2x10x4096x64xf16> + %10 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<2x10x4096x64xi64>) outs(%9 : tensor<2x10x4096x64xf16>) { + ^bb0(%in: i64, %out: f16): + %12 = linalg.index 0 : index + %13 = linalg.index 1 : index + %14 = arith.index_cast %in : i64 to index + %15 = linalg.index 3 : index + %extracted = tensor.extract %5[%12, %13, %14, %15] : tensor<2x10x4096x64xf16> + linalg.yield %extracted : f16 + } -> tensor<2x10x4096x64xf16> + %11 = iree_linalg_ext.attention { + indexing_maps = [#map1, #map2, #map3, #map4, #map5], + decomposition_config = { qk_attrs = #qk_config, pv_attrs = #pv_config }, + lowering_config = #config} ins(%7, %10, %8, %cst : tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, tensor<2x10x4096x64xf16>, f16) outs(%9 : tensor<2x10x4096x64xf16>) { + ^bb0(%arg0: f32): + iree_linalg_ext.yield %arg0 : f32 + } -> tensor<2x10x4096x64xf16> + flow.dispatch.tensor.store %11, %4, offsets = [0, 0, 0, 0], sizes = [2, 10, 4096, 64], strides = [1, 1, 1, 1] : tensor<2x10x4096x64xf16> -> !flow.dispatch.tensor> + return + } + } + } + } +} + +// CHECK-LABEL: func.func @attention_gather_k +// CHECK: scf.for %{{.*}} = %c0 to %c4096 step %c64 +// CHECK: vector.gather +// CHECK-SAME: into vector<4x1x1x1x1x8xf16> +// CHECK: scf.yield + +// MEMORY-LABEL: func.func @attention_gather_k +// MEMORY-COUNT-3: memref.alloc From 939984cf4812d07feb962887cb15972432049f1a Mon Sep 17 00:00:00 2001 From: Prashant Kumar 
Date: Wed, 4 Dec 2024 09:17:28 +0530 Subject: [PATCH 48/54] [LLVMCPU] Update the tile&fuse MultiTilingExpert pipeline (#19352) The tilePass is updated with tileRootAndFuseInputOperands pass. --- .../iree/compiler/Codegen/LLVMCPU/Passes.cpp | 4 +-- .../Codegen/LLVMCPU/test/pipeline_tests.mlir | 26 +++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp index 3753700c6111..3d5ed20d145a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp @@ -405,11 +405,11 @@ void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager, // SplitReductionPass takes care of banked-tiling. funcPassManager.addPass( createLLVMCPUSplitReductionPass(clEnableReassociateFpReductions)); - funcPassManager.addPass(createLLVMCPUTilePass(i)); + funcPassManager.addPass(createLLVMCPUTileRootAndFuseInputOperands(i)); continue; } - funcPassManager.addPass(createLLVMCPUTilePass(i)); + funcPassManager.addPass(createLLVMCPUTileRootAndFuseInputOperands(i)); } } diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir index e927b2fe8799..6936adef7e01 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir @@ -395,3 +395,29 @@ func.func @dequant_matmul() attributes {hal.executable.target = #executable_targ // CHECK: scf.for // CHECK: arith.uitofp // CHECK: vector.fma + +// ----- + +#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+fma,+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> +func.func @fuse_inputs_reduction() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0, 0], sizes = [64, 1, 1, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<64x1x1x16x16xf32> + %3 = tensor.empty() : tensor<64x16x16xf32> + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<64x16x16xf32>) -> tensor<64x16x16xf32> + %unpack = tensor.unpack %2 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %3 : tensor<64x1x1x16x16xf32> -> tensor<64x16x16xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%unpack : tensor<64x16x16xf32>) outs(%4 : tensor<64x16x16xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = arith.addf %out, %in : f32 + linalg.yield %6 : f32 + } -> tensor<64x16x16xf32> + flow.dispatch.tensor.store %5, %1, offsets = [0, 0, 0], sizes = [64, 16, 16], strides = [1, 1, 1] : tensor<64x16x16xf32> -> 
!flow.dispatch.tensor> + return +} +// CHECK-LABEL: func.func @fuse_inputs_reduction +// CHECK: scf.for +// CHECK: vector.load +// CHECK-NOT: scf.for +// CHECK: arith.addf From 05ef33014589ad9f3a6659e41bf762d881b09df2 Mon Sep 17 00:00:00 2001 From: Prashant Kumar Date: Wed, 4 Dec 2024 18:11:00 +0530 Subject: [PATCH 49/54] Add 'pashu123' as a co-owner of CodeGen/LLVMCPU (#19368) --- .github/CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 6edcff51e7ad..373013b7a9d0 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -58,7 +58,7 @@ /compiler/src/iree/compiler/Codegen/Common @hanhanW /compiler/src/iree/compiler/Codegen/Common/GPU @antiagainst @qedawkins /compiler/src/iree/compiler/Codegen/Dialect/GPU @antiagainst @qedawkins -/compiler/src/iree/compiler/Codegen/LLVMCPU/ @hanhanW @MaheshRavishankar +/compiler/src/iree/compiler/Codegen/LLVMCPU/ @hanhanW @MaheshRavishankar @pashu123 /compiler/src/iree/compiler/Codegen/LLVMGPU/ @MaheshRavishankar @qedawkins @kuhar @Groverkss /compiler/src/iree/compiler/Codegen/SPIRV/ @antiagainst @MaheshRavishankar @kuhar /compiler/src/iree/compiler/ConstEval/ @hanhanW @stellaraccident From e61707092020fbbf8272fe7f514dcae01c6519ee Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 4 Dec 2024 11:46:39 -0500 Subject: [PATCH 50/54] rename `IREEGPU_I32MmaEnumAttr` to `IREEGPU_I32EnumAttr` (#19364) Signed-off-by: Benoit Jacob --- .../compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td index b174ceaac916..0a9c1f6a4515 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td @@ -92,7 +92,7 @@ def IREEGPU_DotProductOps : I32BitEnumAttr< //===----------------------------------------------------------------------===// // MMA intrinsic -class IREEGPU_I32MmaEnumAttr cases> +class IREEGPU_I32EnumAttr cases> : I32EnumAttr { let cppNamespace = "::mlir::iree_compiler::IREE::GPU"; let genSpecializedAttr = 0; @@ -176,7 +176,7 @@ def WMMA_I32_16x16x16_I8 : I32EnumAttrCase<"WMMA_I32_16x16x16_I8", 0x18C0>; def NV_WMMA_F32_16x16x16_F16 : I32EnumAttrCase<"NV_WMMA_F32_16x16x16_F16", 0x2020>; def NV_WMMA_F16_16x16x16_F16 : I32EnumAttrCase<"NV_WMMA_F16_16x16x16_F16", 0x2021>; -def IREEGPU_MMAIntrinsic : IREEGPU_I32MmaEnumAttr<"MMAIntrinsic", +def IREEGPU_MMAIntrinsic : IREEGPU_I32EnumAttr<"MMAIntrinsic", "Descriptor for different MMA intrinsics", [ // Introduced in CDNA1 MFMA_F32_16x16x4_F32, @@ -221,7 +221,7 @@ def VMFMA_F32_32x32x16_F16 : I32EnumAttrCase<"VMFMA_F32_32x32x16_F16", 1>; def VMFMA_F32_16x16x32_F8E4M3FNUZ : I32EnumAttrCase<"VMFMA_F32_16x16x32_F8E4M3FNUZ", 2>; def VMFMA_F32_32x32x16_F8E4M3FNUZ : I32EnumAttrCase<"VMFMA_F32_32x32x16_F8E4M3FNUZ", 3>; -def IREEGPU_VirtualMMAIntrinsic : IREEGPU_I32MmaEnumAttr<"VirtualMMAIntrinsic", +def IREEGPU_VirtualMMAIntrinsic : IREEGPU_I32EnumAttr<"VirtualMMAIntrinsic", "Descriptor for different Virtual MMA intrinsics", [ VMFMA_F32_16x16x32_F16, VMFMA_F32_32x32x16_F16, @@ -233,7 +233,7 @@ def MMA_LHS : I32EnumAttrCase<"Lhs", 0>; def MMA_RHS : I32EnumAttrCase<"Rhs", 1>; def MMA_ACC : I32EnumAttrCase<"Acc", 2>; -def IREEGPU_MMAFragment : IREEGPU_I32MmaEnumAttr<"MMAFragment", +def IREEGPU_MMAFragment : IREEGPU_I32EnumAttr<"MMAFragment", "Descriptor for a particular 
fragment of an MMA operation", [ MMA_LHS, MMA_RHS, @@ -243,7 +243,7 @@ def IREEGPU_MMAFragment : IREEGPU_I32MmaEnumAttr<"MMAFragment", def MMA_Workgroup : I32EnumAttrCase<"Workgroup", 0>; def MMA_Subgroup : I32EnumAttrCase<"Subgroup", 1>; -def IREEGPU_MMAScope : IREEGPU_I32MmaEnumAttr<"MMAScope", +def IREEGPU_MMAScope : IREEGPU_I32EnumAttr<"MMAScope", "Descriptor for a particular scope of an MMA operation", [ MMA_Workgroup, MMA_Subgroup @@ -263,7 +263,7 @@ def Lane : I32EnumAttrCase<"Lane", 5>; /// Note that `Thread` tiling is mutually exclusive with `Subgroup` and /// `Lane` tiling, and `Lane` tiling is only legal if the same operation /// is also tiled or fused to subgroups. -def IREEGPU_TilingLevel : IREEGPU_I32MmaEnumAttr<"TilingLevel", +def IREEGPU_TilingLevel : IREEGPU_I32EnumAttr<"TilingLevel", "Descriptor for tiling levels for GPU lowering configs", [ Workgroup, Reduction, From 8894f5acd1f045ae2539217ccff17ea364e71fda Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Wed, 4 Dec 2024 12:16:37 -0500 Subject: [PATCH 51/54] [Codegen][Tuner] Clarify tuning spec linking order. NFC. (#19370) This clarification was suggested in https://github.com/iree-org/iree/pull/19337#discussion_r1868494935. --- compiler/src/iree/compiler/Codegen/Common/Passes.h | 3 ++- compiler/src/iree/compiler/Codegen/Common/Passes.td | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.h b/compiler/src/iree/compiler/Codegen/Common/Passes.h index 2938bdd87da5..83a8206b5869 100644 --- a/compiler/src/iree/compiler/Codegen/Common/Passes.h +++ b/compiler/src/iree/compiler/Codegen/Common/Passes.h @@ -56,7 +56,8 @@ void addEncodingToNopPasses(FunctionLikeNest &passManager); /// Links nested transform dialect tuning specs named sequences into a single /// entry point. Returns the new named sequence op (inserted into the `module`) /// that includes the nested tuning specs, or a null op when no nested named -/// sequences were found. +/// sequences were found. The order of inclusion is the same as the order in +/// which these nested tuning specs appear in the IR. FailureOr linkTuningSpecs(ModuleOp module); //------------------------------------------------------------------------------ diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.td b/compiler/src/iree/compiler/Codegen/Common/Passes.td index 5471c95b0cad..5571aba9b1e4 100644 --- a/compiler/src/iree/compiler/Codegen/Common/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Common/Passes.td @@ -412,7 +412,7 @@ def LinkTuningSpecsPass : Pass<"iree-codegen-link-tuning-specs", "ModuleOp"> { let description = [{ Given a module with multiple nested tuning specs, introduce a new named sequence that includes all the other tuning spec entry points. The order of inclusion is the same - as the in which these nested tuning specs appear in the IR. + as the order in which these nested tuning specs appear in the IR. A tuning spec entry point is a `transform.named_sequence` op annotated with the `iree_codegen.tuning_spec` unit attribute. We require it to perform in-place op From c3db7106df931e767a560b0c100bcf5c2b77c888 Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Wed, 4 Dec 2024 17:33:51 +0000 Subject: [PATCH 52/54] [VectorDistribution] Add distribution for trivial vector.extract (#19318) This patch adds a distribution pattern for vector.extract when the list of indices is zero. This arises in the case of a scalar extract for 0-d vectors. 
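As an illustrative sketch (not part of this patch), the kind of 0-d extract the new
pattern distributes looks like the following; the f16 element type is an arbitrary
choice for the example:

  func.func @extract_0d(%v: vector<f16>) -> f16 {
    // A 0-rank source vector carries no indices, so the extract is "trivial".
    %s = vector.extract %v[] : f16 from vector<f16>
    return %s : f16
  }

The lit test added below exercises the same shape through layout analysis and the
distribution patterns.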
--- .../Common/GPU/GPUDistributionPatterns.cpp | 29 ++++++++++++++++++- .../GPU/test/gpu_vector_distribution.mlir | 17 +++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp index 276b7fe11d4b..9cc704b196b2 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp @@ -303,10 +303,37 @@ struct DistributeGather final : OpDistributionPattern { } }; +/// Distribute a 0-rank vector to scalar vector.extract conversion. +struct DistributeTrivialExtract final + : OpDistributionPattern { + using OpDistributionPattern::OpDistributionPattern; + + LogicalResult matchAndRewrite(vector::ExtractOp extractOp, + DistributionSignature &signature, + PatternRewriter &rewriter) const override { + if (extractOp.getSourceVectorType().getRank() != 0) { + return rewriter.notifyMatchFailure( + extractOp, "Only 0-rank vector extractions supported"); + } + + VectorValue source = extractOp.getVector(); + VectorLayoutInterface sourceLayout = signature[source]; + + Value distributed = rewriter.create( + extractOp.getLoc(), getDistributed(rewriter, source, sourceLayout), + ArrayRef{}); + + replaceOpWithDistributedValues(rewriter, extractOp, distributed); + + return success(); + } +}; + } // namespace void populateGPUDistributionPatterns(RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); + patterns.add( + patterns.getContext()); // Elementwise patterns. patterns.add(patterns.getContext()); patterns.add(patterns.getContext()); diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir index 1f0833fa8768..c893ba30d0b5 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir @@ -96,6 +96,23 @@ func.func @distribute_scf_for_0d(%a: vector, %b: vector) -> vector } +// CHECK-LABEL: @distribute_scalar_extract +func.func @distribute_scalar_extract(%a: f16, %b: vector) -> f16 { + %c0 = arith.constant 0 : index + %cst_0 = arith.constant 0.0 : f16 + // CHECK: %[[ROOT:.*]] = arith.constant dense<0.000000e+00> : vector + %root = arith.constant dense<0.0> : vector + %rootl = iree_vector_ext.to_layout %root to layout(#layout_0d) : vector + // CHECK-DAG: %[[B:.*]] = iree_vector_ext.to_simt %{{.*}} : vector -> vector + // CHECK-DAG: %[[C:.*]] = arith.mulf %[[B]], %[[ROOT]] : vector + // CHECK-DAG: %[[SCALAR:.*]] = vector.extract %[[C]][] : f16 from vector + %c = arith.mulf %rootl, %b : vector + %scalar = vector.extract %c[] : f16 from vector + // CHECK-DAG: %[[D:.*]] = arith.addf %[[SCALAR]], %{{.*}} : f16 + %d = arith.addf %scalar, %a : f16 + return %d : f16 +} + builtin.module attributes { transform.with_named_sequence } { transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op From 62bccc9548439f9b4962709cf471ad3c847bfce9 Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Wed, 4 Dec 2024 10:07:10 -0800 Subject: [PATCH 53/54] Split bazel_to_cmake pre-commit hook into two. (#19373) Tentative fix for https://github.com/iree-org/iree/issues/18014. 
--- .pre-commit-config.yaml | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1d2402e7a9bf..efc1b6372075 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -83,14 +83,21 @@ repos: # probably changed again in a future version, whatever. language_version: "1.16" - - id: bazel_to_cmake - name: Run bazel_to_cmake.py + # Convert BUILD.bazel files to CMakeLists.txt files in two passes to + # avoid potential race conditions. + # Note: this passes file names to the tool. The tool can also be run + # manually with no arguments specified to walk directories on its own. + # Keep the top level directories here in sync with .bazel_to_cmake.cfg.py. + - id: bazel_to_cmake_1 + name: Run bazel_to_cmake.py on BUILD.bazel files language: python - # Note: this passes file names to the tool. The tool can also be run - # manually with no arguments specified to walk directories on its own. entry: ./build_tools/bazel_to_cmake/bazel_to_cmake.py - # Keep the top level directories here in sync with .bazel_to_cmake.cfg.py. - files: '^(compiler|runtime|samples|tests|tools)/(.*/)?(BUILD\.bazel|CMakeLists.txt)$' + files: '^(compiler|runtime|samples|tests|tools)/(.*/)?(BUILD\.bazel)$' + - id: bazel_to_cmake_2 + name: Run bazel_to_cmake.py on CMakeLists.txt files + language: python + entry: ./build_tools/bazel_to_cmake/bazel_to_cmake.py + files: "^(compiler|runtime|samples|tests|tools)/(.*/)?(CMakeLists.txt)$" - id: check_path_lengths name: Check for excessively long path lengths From 9f8aad85b469b224bb9594d002f35bd8febebbf9 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Wed, 4 Dec 2024 13:09:28 -0500 Subject: [PATCH 54/54] [Codegen][Tuner] Allow tuning specs in the LLVMGPU pipeline (#19359) This adds the `materialize-tuning-specs` pass to the LLVMGPU executable configuration pipelines. Add a test that shows that the tuning spec gets applied and picked up in the ROCDL pipeline. Also, replace the print-based checks in existing tests with op remarks on transform strategy application in `materialize-user-configs`. 
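For reference, a minimal sketch (module and sequence names are placeholders, not
taken from this patch) of the entry-point shape that the configuration pipelines
now pick up:

  module @example_spec attributes { transform.with_named_sequence } {
    transform.named_sequence @main(%arg0: !transform.any_op {transform.readonly}) -> ()
        attributes { iree_codegen.tuning_spec_entrypoint } {
      // Match ops here and annotate them with a compilation_info / lowering_config.
      transform.yield
    }
  }

The full spec used by the new ROCDL test, including the matcher and the annotated
lowering config, is added below in tuning_spec_mmt_tile_and_fuse.mlir.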
--- compiler/plugins/target/CUDA/BUILD.bazel | 1 + compiler/plugins/target/CUDA/CMakeLists.txt | 1 + compiler/plugins/target/CUDA/CUDATarget.cpp | 3 ++ compiler/plugins/target/ROCM/BUILD.bazel | 1 + compiler/plugins/target/ROCM/CMakeLists.txt | 1 + compiler/plugins/target/ROCM/ROCMTarget.cpp | 3 ++ compiler/plugins/target/ROCM/test/BUILD.bazel | 4 ++ .../plugins/target/ROCM/test/CMakeLists.txt | 3 ++ .../lowering_strategy_from_tuning_spec.mlir | 48 +++++++++++++++++++ .../test/tuning_spec_mmt_tile_and_fuse.mlir | 24 ++++++++++ .../Codegen/Common/MaterializeUserConfigs.cpp | 16 +++++++ ...erialize_user_config_from_tuning_spec.mlir | 13 ++--- .../iree/compiler/Codegen/LLVMGPU/Passes.cpp | 2 + 13 files changed, 112 insertions(+), 8 deletions(-) create mode 100644 compiler/plugins/target/ROCM/test/lowering_strategy_from_tuning_spec.mlir create mode 100644 compiler/plugins/target/ROCM/test/tuning_spec_mmt_tile_and_fuse.mlir diff --git a/compiler/plugins/target/CUDA/BUILD.bazel b/compiler/plugins/target/CUDA/BUILD.bazel index b694187f7325..2af2c29883bc 100644 --- a/compiler/plugins/target/CUDA/BUILD.bazel +++ b/compiler/plugins/target/CUDA/BUILD.bazel @@ -28,6 +28,7 @@ iree_compiler_cc_library( ], deps = [ "//compiler/src/iree/compiler/Codegen", + "//compiler/src/iree/compiler/Codegen/Common", "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect", "//compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils:KnownTargets", "//compiler/src/iree/compiler/Codegen/LLVMGPU", diff --git a/compiler/plugins/target/CUDA/CMakeLists.txt b/compiler/plugins/target/CUDA/CMakeLists.txt index 70c6dc6b8a5b..e3e86c00e54f 100644 --- a/compiler/plugins/target/CUDA/CMakeLists.txt +++ b/compiler/plugins/target/CUDA/CMakeLists.txt @@ -52,6 +52,7 @@ iree_cc_library( MLIRTransformDialect iree::base::internal::flatcc::building iree::compiler::Codegen + iree::compiler::Codegen::Common iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect iree::compiler::Codegen::Dialect::GPU::TargetUtils::KnownTargets iree::compiler::Codegen::LLVMGPU diff --git a/compiler/plugins/target/CUDA/CUDATarget.cpp b/compiler/plugins/target/CUDA/CUDATarget.cpp index ffc49b57fa7d..fe41cb44f8f4 100644 --- a/compiler/plugins/target/CUDA/CUDATarget.cpp +++ b/compiler/plugins/target/CUDA/CUDATarget.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "./SetBlockIdsRangePass.h" +#include "iree/compiler/Codegen/Common/Passes.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" #include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.h" #include "iree/compiler/Codegen/LLVMGPU/Passes.h" @@ -448,6 +449,8 @@ class CUDATargetBackend final : public TargetBackend { mlir::registerBuiltinDialectTranslation(registry); mlir::registerLLVMDialectTranslation(registry); mlir::registerNVVMDialectTranslation(registry); + // Configuration may load and manipulate transform dialect libraries. 
+ registerTransformDialectTranslationDependentDialects(registry); } void diff --git a/compiler/plugins/target/ROCM/BUILD.bazel b/compiler/plugins/target/ROCM/BUILD.bazel index 48dfeb3ff401..682806e23539 100644 --- a/compiler/plugins/target/ROCM/BUILD.bazel +++ b/compiler/plugins/target/ROCM/BUILD.bazel @@ -28,6 +28,7 @@ iree_compiler_cc_library( ], deps = [ "//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_bitcode", + "//compiler/src/iree/compiler/Codegen/Common", "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect", "//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect", "//compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils:KnownTargets", diff --git a/compiler/plugins/target/ROCM/CMakeLists.txt b/compiler/plugins/target/ROCM/CMakeLists.txt index 96c3305d936d..69204abd3d13 100644 --- a/compiler/plugins/target/ROCM/CMakeLists.txt +++ b/compiler/plugins/target/ROCM/CMakeLists.txt @@ -52,6 +52,7 @@ iree_cc_library( MLIRROCDLToLLVMIRTranslation MLIRSupport MLIRTargetLLVMIRExport + iree::compiler::Codegen::Common iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect iree::compiler::Codegen::Dialect::GPU::IR::IREEGPUDialect iree::compiler::Codegen::Dialect::GPU::TargetUtils::KnownTargets diff --git a/compiler/plugins/target/ROCM/ROCMTarget.cpp b/compiler/plugins/target/ROCM/ROCMTarget.cpp index 48ef62e07220..c175ab02029a 100644 --- a/compiler/plugins/target/ROCM/ROCMTarget.cpp +++ b/compiler/plugins/target/ROCM/ROCMTarget.cpp @@ -9,6 +9,7 @@ #include #include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_bitcode.h" +#include "iree/compiler/Codegen/Common/Passes.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h" @@ -268,6 +269,8 @@ class ROCMTargetBackend final : public TargetBackend { registry.insert(); registry.insert(); registry.insert(); + // Configuration may load and manipulate transform dialect libraries. 
+ registerTransformDialectTranslationDependentDialects(registry); } void diff --git a/compiler/plugins/target/ROCM/test/BUILD.bazel b/compiler/plugins/target/ROCM/test/BUILD.bazel index bf9a18d582bd..ebf4dfd7463e 100644 --- a/compiler/plugins/target/ROCM/test/BUILD.bazel +++ b/compiler/plugins/target/ROCM/test/BUILD.bazel @@ -16,9 +16,13 @@ iree_lit_test_suite( name = "lit", srcs = [ "gpu_lower_to_ukernels.mlir", + "lowering_strategy_from_tuning_spec.mlir", "ukernel_pipeline_transform.mlir", ], cfg = "//compiler:lit.cfg.py", + data = [ + "tuning_spec_mmt_tile_and_fuse.mlir", + ], tools = [ "//tools:iree-opt", "@llvm-project//llvm:FileCheck", diff --git a/compiler/plugins/target/ROCM/test/CMakeLists.txt b/compiler/plugins/target/ROCM/test/CMakeLists.txt index 6d2199d8c4bb..38158aac8c5b 100644 --- a/compiler/plugins/target/ROCM/test/CMakeLists.txt +++ b/compiler/plugins/target/ROCM/test/CMakeLists.txt @@ -15,10 +15,13 @@ iree_lit_test_suite( lit SRCS "gpu_lower_to_ukernels.mlir" + "lowering_strategy_from_tuning_spec.mlir" "ukernel_pipeline_transform.mlir" TOOLS FileCheck iree-opt + DATA + tuning_spec_mmt_tile_and_fuse.mlir ) ### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### diff --git a/compiler/plugins/target/ROCM/test/lowering_strategy_from_tuning_spec.mlir b/compiler/plugins/target/ROCM/test/lowering_strategy_from_tuning_spec.mlir new file mode 100644 index 000000000000..6f7cf092242e --- /dev/null +++ b/compiler/plugins/target/ROCM/test/lowering_strategy_from_tuning_spec.mlir @@ -0,0 +1,48 @@ +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 \ +// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-configure-target-executable-variants{target=rocm})))" \ +// RUN: --iree-codegen-tuning-spec-path=%p/tuning_spec_mmt_tile_and_fuse.mlir \ +// RUN: --iree-codegen-notify-transform-strategy-application \ +// RUN: --verify-diagnostics %s | FileCheck %s + +// Make sure we can apply the lowering strategy from the specified tuning spec. 
+ +// CHECK: #translation = #iree_codegen.translation_info +// CHECK: func.func @matmul_transpose_b +// CHECK-SAME: translation_info = #translation +// CHECK: linalg.generic +// CHECK-SAME: __tuning_spec_applied__ +// CHECK-SAME: lowering_config = #iree_gpu.lowering_config< + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding +]> +hal.executable public @main { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @matmul_transpose_b ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + // expected-remark@+1 {{Applied transform configuration strategy @iree_linked_tuning_spec::@__kernel_config}} + func.func @matmul_transpose_b() { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x1280xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<10240x1280xf16> + %5 = tensor.empty() : tensor<2048x10240xf32> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2048x10240xf32>) -> tensor<2048x10240xf32> + %7 = linalg.matmul_transpose_b + ins(%3, %4 : tensor<2048x1280xf16>, tensor<10240x1280xf16>) + outs(%6 : tensor<2048x10240xf32>) -> tensor<2048x10240xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 10240], strides = [1, 1] : tensor<2048x10240xf32> -> !flow.dispatch.tensor> + return + } + } + } +} diff --git a/compiler/plugins/target/ROCM/test/tuning_spec_mmt_tile_and_fuse.mlir b/compiler/plugins/target/ROCM/test/tuning_spec_mmt_tile_and_fuse.mlir new file mode 100644 index 000000000000..24f0c3a200ad --- /dev/null +++ b/compiler/plugins/target/ROCM/test/tuning_spec_mmt_tile_and_fuse.mlir @@ -0,0 +1,24 @@ +// RUN: iree-opt %s + +module @mmt_tile_and_fuse_spec attributes { transform.with_named_sequence } { + transform.named_sequence @main(%arg0: !transform.any_op {transform.readonly}) -> () + attributes { iree_codegen.tuning_spec_entrypoint } { + %mmt = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op + // transform.print %mmt {name="MMT"} : !transform.any_op + %config = transform.param.constant #iree_codegen.compilation_info< + lowering_config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], + reduction = [0, 0, 4], + thread = [8, 4], + promote_operands = [0, 1]}>, + translation_info = #iree_codegen.translation_info + > -> !transform.any_param + transform.annotate %mmt "compilation_info" = %config : !transform.any_op, !transform.any_param + // Add a dummy unit attribute to be sure that the tuning spec applied. + // Otherwise it would be difficult to tell if the lowering config attribute + // comes from our tuning spec or if the compiler heuristic happened to produce + // the same config as this script. 
+ transform.annotate %mmt "__tuning_spec_applied__" : !transform.any_op + transform.yield + } +} diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp index 21fee4a3f065..92e719b76dbd 100644 --- a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp @@ -32,6 +32,13 @@ llvm::cl::opt clCodegenTransformDialectLibraryFileName( "this will default to `__kernel_config`."), llvm::cl::init("")); +llvm::cl::opt clCodegenNotifyTransformDialectLibraryApplication( + "iree-codegen-notify-transform-strategy-application", + llvm::cl::desc( + "Emit a remark when a transform configuration strategy successfully " + "applies on a function. This is intended for testing/debuging."), + llvm::cl::init(false)); + #define GEN_PASS_DEF_MATERIALIZEUSERCONFIGSPASS #include "iree/compiler/Codegen/Common/Passes.h.inc" @@ -194,6 +201,9 @@ struct MaterializeUserConfigsPass final // ``` LDBG("MaterializeUserConfigsPass on function: " << funcOp); if (succeeded(userTransformLibrary)) { + StringRef libraryModuleName = + userTransformLibrary->transformLibrary.getSymName().value_or( + ""); StringRef entrySequenceName = userTransformLibrary->entrypointName; auto runResult = runTransformConfigurationStrategy( funcOp, entrySequenceName, userTransformLibrary->transformLibrary); @@ -207,6 +217,12 @@ struct MaterializeUserConfigsPass final << entrySequenceName << "` failed to apply"; return signalPassFailure(); } + + if (clCodegenNotifyTransformDialectLibraryApplication) { + funcOp->emitRemark() + << "Applied transform configuration strategy @" + << libraryModuleName << "::@" << entrySequenceName; + } } /// Nothing to do if the export already has a config. diff --git a/compiler/src/iree/compiler/Codegen/Common/test/materialize_user_config_from_tuning_spec.mlir b/compiler/src/iree/compiler/Codegen/Common/test/materialize_user_config_from_tuning_spec.mlir index 08f52791de3f..4e4bba81f056 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/materialize_user_config_from_tuning_spec.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/materialize_user_config_from_tuning_spec.mlir @@ -1,10 +1,12 @@ // RUN: iree-opt --pass-pipeline='builtin.module(builtin.module(iree-codegen-materialize-tuning-specs,iree-codegen-materialize-user-configs))' \ // RUN: --iree-codegen-tuning-spec-path=%p/tuning_spec.mlir \ -// RUN: --mlir-disable-threading --no-implicit-module %s | FileCheck %s +// RUN: --iree-codegen-notify-transform-strategy-application \ +// RUN: --no-implicit-module --verify-diagnostics %s | FileCheck %s // RUN: iree-opt --pass-pipeline='builtin.module(iree-codegen-materialize-tuning-specs,builtin.module(iree-codegen-materialize-user-configs))' \ // RUN: --iree-codegen-tuning-spec-path=%p/tuning_spec.mlir \ -// RUN: --mlir-disable-threading --no-implicit-module %s | FileCheck %s --check-prefix=PARENT +// RUN: --iree-codegen-notify-transform-strategy-application \ +// RUN: --no-implicit-module --verify-diagnostics %s | FileCheck %s --check-prefix=PARENT // (1) We start by running the `Materialize Tuning Specs` pass to embed the // transform dialect library into the module. Doing it by hand hand is not @@ -13,9 +15,6 @@ // Check that the transform spec gets executed and that it does not remain as // a module attribute after `Materialize User Configs`. 
-// CHECK-LABEL: [ IR printer: Hello Tuning Spec top-level ] -// CHECK-NEXT: func.func @main_0 -// // CHECK-LABEL: module @parent { // CHECK-LABEL: module @child { // CHECK: func.func @main_0 @@ -25,9 +24,6 @@ // (conservatively) only remove tuning spec from the module passed // to the `materialize-user-configs` pass. -// PARENT-LABEL: [ IR printer: Hello Tuning Spec top-level ] -// PARENT-NEXT: func.func @main_0 -// // PARENT-LABEL: module @parent attributes { // PARENT-SAME: iree_codegen.tuning_spec_mlirbc = dense< // PARENT-LABEL: module @child { @@ -35,6 +31,7 @@ module @parent { module @child { + // expected-remark@+1 {{Applied transform configuration strategy @iree_linked_tuning_spec::@__kernel_config}} func.func @main_0() { return } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index b79abdd0eb19..53e49efbf66a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -1178,6 +1178,7 @@ static void buildLLVMGPUCodegenConfigurationPassPipelineImpl( funcPassManager.addPass(createConfigTrackingCanonicalizerPass); funcPassManager.addPass(createCSEPass); } + modulePassManager.addPass(createMaterializeTuningSpecsPass()); modulePassManager.addPass(createMaterializeUserConfigsPass()); modulePassManager.addPass(createLLVMGPUSelectLoweringStrategyPass()); } @@ -1245,6 +1246,7 @@ static void buildROCDLCodegenConfigurationPassPipelineImpl( funcPassManager.addPass(createGPUGeneralizeNamedOpsPass); addCommonTargetExecutablePreprocessingPasses(funcPassManager); } + modulePassManager.addPass(createMaterializeTuningSpecsPass()); modulePassManager.addPass(createMaterializeUserConfigsPass()); modulePassManager.addPass(createROCDLSelectLoweringStrategyPass());