diff --git a/.github/workflows/pkgci_regression_test.yml b/.github/workflows/pkgci_regression_test.yml
index c6b7805d67ada..a11107771012f 100644
--- a/.github/workflows/pkgci_regression_test.yml
+++ b/.github/workflows/pkgci_regression_test.yml
@@ -222,7 +222,7 @@ jobs:
             --goldentime-rocm-vae-ms 337.0 \
             --goldendispatch-rocm-unet 1531 \
             --goldendispatch-rocm-clip 1139 \
-            --goldendispatch-rocm-vae 245 \
+            --goldendispatch-rocm-vae 246 \
             --goldensize-rocm-unet-bytes 2280000 \
             --goldensize-rocm-clip-bytes 860000 \
             --goldensize-rocm-vae-bytes 840000 \
@@ -243,7 +243,7 @@ jobs:
             --goldentime-rocm-vae-ms 80.0 \
             --goldendispatch-rocm-unet 1531 \
             --goldendispatch-rocm-clip 1139 \
-            --goldendispatch-rocm-vae 245 \
+            --goldendispatch-rocm-vae 246 \
             --goldensize-rocm-unet-bytes 2270000 \
             --goldensize-rocm-clip-bytes 860000 \
             --goldensize-rocm-vae-bytes 840000 \
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/pipeline_tests.mlir
index 0c4430fe7dfec..8973ba5a0278f 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/pipeline_tests.mlir
@@ -80,13 +80,13 @@ util.func public @grouped_quantized_matmul(%arg0: tensor<4096x32x128xi4>, %arg1:
 // CHECK: flow.executable private @[[EXECUTABLE0:[a-zA-Z0-9_]+]]
 // CHECK: func.func @[[FUNC0:[a-zA-Z0-9_x]+]]
 // CHECK: %[[GEN0:.+]] = linalg.generic
-// CHECK-SAME: ["parallel", "parallel", "parallel", "parallel", "parallel"]
+// CHECK-SAME: ["parallel", "parallel", "parallel"]
 // CHECK: arith.extui
 // CHECK: arith.uitofp
 // CHECK: arith.subf
 // CHECK: arith.mulf
 // CHECK: %[[GEN1:.+]] = linalg.generic
-// CHECK-SAME: ["parallel", "parallel", "parallel", "reduction", "reduction"]
+// CHECK-SAME: ["parallel", "reduction", "reduction"]
 // CHECK-SAME: ins(
 // CHECK-SAME: %[[GEN0]]
 // CHECK-SAME: outs(
@@ -95,4 +95,5 @@ util.func public @grouped_quantized_matmul(%arg0: tensor<4096x32x128xi4>, %arg1:
 // CHECK: flow.dispatch.tensor.store %[[GEN1]]
 // CHECK: util.func public @grouped_quantized_matmul(
 // CHECK: %[[T0:.+]] = flow.dispatch @[[EXECUTABLE0]]::@[[FUNC0]]
-// CHECK: util.return %[[T0]]
+// CHECK: %[[RS:.+]] = flow.tensor.reshape %[[T0]] : tensor<4096xf32> -> tensor<1x1x4096xf32>
+// CHECK: util.return %[[RS]]
diff --git a/compiler/src/iree/compiler/DispatchCreation/BubbleUpExpandShapes.cpp b/compiler/src/iree/compiler/DispatchCreation/BubbleUpExpandShapes.cpp
index 9ee67d637c060..79ae8d3b2ba8b 100644
--- a/compiler/src/iree/compiler/DispatchCreation/BubbleUpExpandShapes.cpp
+++ b/compiler/src/iree/compiler/DispatchCreation/BubbleUpExpandShapes.cpp
@@ -57,8 +57,12 @@ void BubbleUpExpandShapesPass::runOnOperation() {
       return false;
     }
 
+    // Do not fuse producer generic op if it has more than one user
+    // or any reduction iterators.
     if (auto producerGenericOp = dyn_cast<linalg::GenericOp>(producer)) {
-      return true;
+      return producerGenericOp->hasOneUse() &&
+             llvm::all_of(producerGenericOp.getIteratorTypesArray(),
+                          linalg::isParallelIterator);
     }
 
     // Do not fuse with any producer linalg named ops for now.
@@ -66,9 +70,11 @@ void BubbleUpExpandShapesPass::runOnOperation() {
       return false;
    }
 
-    // Do not fuse with consumer linalg named ops.
+    // Do not fuse with consumer linalg named ops or reductions.
     if (auto consumerLinalgOp = dyn_cast<linalg::LinalgOp>(consumer)) {
-      return isa<linalg::GenericOp>(consumerLinalgOp);
+      return isa<linalg::GenericOp>(consumerLinalgOp) &&
+             llvm::all_of(consumerLinalgOp.getIteratorTypesArray(),
+                          linalg::isParallelIterator);
     }
     // Fuse in all other cases.
     return true;
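For readers outside the IREE tree, the control function patched above boils down to the standalone predicate below. This is a minimal sketch assuming only upstream MLIR linalg APIs; the helper names hasOnlyParallelLoops and canBubbleAcross are illustrative and do not appear in the patch:

  #include "mlir/Dialect/Linalg/IR/Linalg.h"
  #include "mlir/Dialect/Linalg/Utils/Utils.h"

  using namespace mlir;

  // True iff every loop of `op` is parallel, i.e. no reduction iterators.
  static bool hasOnlyParallelLoops(linalg::LinalgOp op) {
    return llvm::all_of(op.getIteratorTypesArray(),
                        linalg::isParallelIterator);
  }

  // Illustrative predicate mirroring the patched fusion control: bubble a
  // reshape across a producer linalg.generic only when the producer has a
  // single user and no reductions, refuse named linalg producers outright,
  // and accept a linalg consumer only when it is an all-parallel generic.
  static bool canBubbleAcross(Operation *producer, Operation *consumer) {
    if (auto producerGenericOp = dyn_cast<linalg::GenericOp>(producer)) {
      return producerGenericOp->hasOneUse() &&
             hasOnlyParallelLoops(producerGenericOp);
    }
    if (isa<linalg::LinalgOp>(producer)) {
      return false; // Named linalg producers are never fused for now.
    }
    if (auto consumerLinalgOp = dyn_cast<linalg::LinalgOp>(consumer)) {
      return isa<linalg::GenericOp>(consumerLinalgOp) &&
             hasOnlyParallelLoops(consumerLinalgOp);
    }
    return true; // Fuse in all other cases.
  }

The test updates reflect this tightening: the grouped quantized matmul generics now keep their collapsed three-iterator form instead of absorbing the expand_shape, a flow.tensor.reshape (tensor<4096xf32> -> tensor<1x1x4096xf32>) materializes after the dispatch, and the golden VAE dispatch count in the regression workflow moves from 245 to 246.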