Commit 90f1f14

[CPU] Improve distribution tile sizes selection.

hanhanW committed Nov 6, 2023
1 parent fde520e
Showing 3 changed files with 128 additions and 62 deletions.
151 changes: 90 additions & 61 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -371,81 +371,40 @@ static unsigned getReferenceTypeLengthInBytes(func::FuncOp entryPointFn) {
   return referenceTypeLengthInBytes;
 }
 
-/// Returns the default tile sizes to use for the loops that are distributed.
-static SmallVector<int64_t>
-getDefaultDistributionTileSizes(ArrayRef<int64_t> lbs, ArrayRef<int64_t> ubs,
-                                ArrayRef<int64_t> minTileSizes,
-                                ArrayRef<int64_t> maxTileSizes,
-                                ArrayRef<int64_t> vectorSizeHints) {
-  assert(lbs.size() == ubs.size() && lbs.size() == minTileSizes.size() &&
-         lbs.size() == maxTileSizes.size() &&
-         "expected all vectors to be of equal size");
-
-  size_t numDims = lbs.size();
-  // Set all the distribution tile sizes to zero if thread distribution is
-  // disabled.
-  if (clDisableDistribution) {
-    return SmallVector<int64_t>(numDims, 0);
-  }
-
-  SmallVector<int64_t> distributedTileSizes(numDims, 1);
-  SmallVector<int64_t> numWorkgroupsPerDim(numDims, 1);
-  SmallVector<int64_t> workload(numDims, 1);
-  for (auto i : llvm::seq<size_t>(0, numDims)) {
-    if (maxTileSizes[i] == 0 || ShapedType::isDynamic(lbs[i]) ||
-        ShapedType::isDynamic(ubs[i])) {
-      distributedTileSizes[i] = maxTileSizes[i];
-      workload[i] = ShapedType::kDynamic;
+// Reduces the number of workgroups in cases where we are dividing the work too
+// much. Over-provision the number of workgroups to twice the number of
+// threads.
+static void reduceDistributionWorkgroups(
+    ArrayRef<int64_t> workload, SmallVectorImpl<int64_t> &distributedTileSizes,
+    std::optional<ArrayRef<int64_t>> maxTileSizes = std::nullopt,
+    std::optional<ArrayRef<int64_t>> vectorSizeHints = std::nullopt) {
+  assert(workload.size() == distributedTileSizes.size());
+  SmallVector<int64_t> numWorkgroupsPerDim(workload.size(), 1);
+  for (auto [idx, value] : llvm::enumerate(workload)) {
+    if (distributedTileSizes[idx] == 0 || ShapedType::isDynamic(value)) {
       continue;
     }
-
-    assert(lbs[i] <= ubs[i]);
-    workload[i] = ubs[i] - lbs[i];
-    int64_t candidateTileSize = 1;
-    int64_t targetSize = std::min(workload[i] / 2, maxTileSizes[i]);
-    int64_t vectorSize = vectorSizeHints[i];
-    if (vectorSize > 1) {
-      // Pick the factor of dim which is closest to the target tile size and
-      // is a multiple of vector size.
-      for (int64_t k = vectorSize; k <= targetSize; k += vectorSize) {
-        if (workload[i] % k == 0 && k >= minTileSizes[i]) {
-          candidateTileSize = k;
-        }
-      }
-    }
-    // Fall back to power of 2 if there's no hint or we can't find the ideal
-    // size.
-    if (vectorSize <= 1 || candidateTileSize == 1) {
-      candidateTileSize = std::max<int64_t>(
-          llvm::bit_floor<uint64_t>(targetSize), minTileSizes[i]);
-    }
-
-    // Cap the tile size at the given max to keep the work per invocation
-    // reasonable.
-    distributedTileSizes[i] =
-        std::min<int64_t>(candidateTileSize, maxTileSizes[i]);
-    numWorkgroupsPerDim[i] =
-        llvm::divideCeil(workload[i], distributedTileSizes[i]);
+    numWorkgroupsPerDim[idx] =
+        llvm::divideCeil(value, distributedTileSizes[idx]);
   }
 
-  // Reduce the number of workgroups in cases where we are dividing the work too
-  // much. Over-provision the number of workgroups to twice the number of
-  // threads.
   int64_t numWorkgroupsLimit = 2 * clNumberOfRuntimeThreads;
   int64_t numWorkgroups =
       std::accumulate(numWorkgroupsPerDim.begin(), numWorkgroupsPerDim.end(),
                       1LL, std::multiplies<int64_t>{});
-  unsigned currDim = numDims;
+  unsigned currDim = workload.size();
   while (numWorkgroups > numWorkgroupsLimit && currDim > 0) {
     unsigned index = currDim - 1;
     int64_t currSize = distributedTileSizes[index];
     if (workload[index] == ShapedType::kDynamic ||
-        currSize >= maxTileSizes[index] || currSize >= workload[index]) {
+        (maxTileSizes && currSize >= maxTileSizes.value()[index]) ||
+        currSize >= workload[index]) {
      currDim--;
      continue;
    }

    int64_t newSize = std::min<int64_t>(currSize * 2, workload[index]);
-    int64_t vectorSize = vectorSizeHints[index];
+    int64_t vectorSize = vectorSizeHints ? vectorSizeHints.value()[index] : 0;
 
     // Check if it's the ideal size with vector size hint. And skip if the new
     // size will break the ideal size.
@@ -470,22 +429,80 @@ getDefaultDistributionTileSizes(ArrayRef<int64_t> lbs, ArrayRef<int64_t> ubs,

   // Final fixup for dividing workload evenly.
   for (auto i : llvm::seq<unsigned>(0, distributedTileSizes.size())) {
-    if (distributedTileSizes[i] == 0 || ShapedType::isDynamic(workload[i]))
+    if (distributedTileSizes[i] == 0 || ShapedType::isDynamic(workload[i])) {
       continue;
+    }
 
     int64_t nwg = llvm::divideCeil(workload[i], distributedTileSizes[i]);
     int64_t newSize = llvm::divideCeil(workload[i], nwg);
 
     // Check if it's the ideal size with vector size hint. And skip if the new
     // size will break the ideal size.
-    int64_t vectorSize = vectorSizeHints[i];
+    int64_t vectorSize = vectorSizeHints ? vectorSizeHints.value()[i] : 0;
     if (vectorSize > 1 &&
         (newSize % vectorSize != 0 || workload[i] % newSize != 0)) {
       continue;
     }
 
     distributedTileSizes[i] = newSize;
   }
+}
+
+/// Returns the default tile sizes to use for the loops that are distributed.
+static SmallVector<int64_t>
+getDefaultDistributionTileSizes(ArrayRef<int64_t> lbs, ArrayRef<int64_t> ubs,
+                                ArrayRef<int64_t> minTileSizes,
+                                ArrayRef<int64_t> maxTileSizes,
+                                ArrayRef<int64_t> vectorSizeHints) {
+  assert(lbs.size() == ubs.size() && lbs.size() == minTileSizes.size() &&
+         lbs.size() == maxTileSizes.size() &&
+         "expected all vectors to be of equal size");
+
+  size_t numDims = lbs.size();
+  // Set all the distribution tile sizes to zero if thread distribution is
+  // disabled.
+  if (clDisableDistribution) {
+    return SmallVector<int64_t>(numDims, 0);
+  }
+
+  SmallVector<int64_t> distributedTileSizes(numDims, 1);
+  SmallVector<int64_t> workload(numDims, 1);
+  for (auto i : llvm::seq<size_t>(0, numDims)) {
+    if (maxTileSizes[i] == 0 || ShapedType::isDynamic(lbs[i]) ||
+        ShapedType::isDynamic(ubs[i])) {
+      distributedTileSizes[i] = maxTileSizes[i];
+      workload[i] = ShapedType::kDynamic;
+      continue;
+    }
+
+    assert(lbs[i] <= ubs[i]);
+    workload[i] = ubs[i] - lbs[i];
+    int64_t candidateTileSize = 1;
+    int64_t targetSize = std::min(workload[i] / 2, maxTileSizes[i]);
+    int64_t vectorSize = vectorSizeHints[i];
+    if (vectorSize > 1) {
+      // Pick the factor of dim which is closest to the target tile size and
+      // is a multiple of vector size.
+      for (int64_t k = vectorSize; k <= targetSize; k += vectorSize) {
+        if (workload[i] % k == 0 && k >= minTileSizes[i]) {
+          candidateTileSize = k;
+        }
+      }
+    }
+    // Fall back to power of 2 if there's no hint or we can't find the ideal
+    // size.
+    if (vectorSize <= 1 || candidateTileSize == 1) {
+      candidateTileSize = std::max<int64_t>(
+          llvm::bit_floor<uint64_t>(targetSize), minTileSizes[i]);
+    }
+
+    // Cap the tile size at the given max to keep the work per invocation
+    // reasonable.
+    distributedTileSizes[i] =
+        std::min<int64_t>(candidateTileSize, maxTileSizes[i]);
+  }
+
+  reduceDistributionWorkgroups(workload, distributedTileSizes, maxTileSizes,
+                               vectorSizeHints);
+
   return distributedTileSizes;
 }
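
Editor's note: the while-loop above is the heart of the refactor. Starting from the candidate tile sizes, it walks dimensions from innermost to outermost and doubles a dimension's tile size (halving its workgroup count) until the total fits the 2x-threads budget or nothing can grow further. Below is a minimal standalone sketch of that behavior, not the IREE code itself: the MLIR/LLVM dependencies are dropped, dynamic shapes are ignored, kNumThreads stands in for clNumberOfRuntimeThreads, and the vector-hint check is a simplified assumption since the exact condition sits in a collapsed region of this diff.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

constexpr int64_t kNumThreads = 8; // stand-in for clNumberOfRuntimeThreads

int64_t ceilDiv(int64_t a, int64_t b) { return (a + b - 1) / b; }

// Doubles tile sizes, innermost dim first, until the workgroup count drops
// to at most twice the thread count or no dim can grow further.
void reduceWorkgroups(const std::vector<int64_t> &workload,
                      std::vector<int64_t> &tileSizes,
                      const std::vector<int64_t> &vectorSizeHints) {
  const int64_t limit = 2 * kNumThreads;
  auto numWorkgroups = [&] {
    int64_t n = 1;
    for (std::size_t i = 0; i < workload.size(); ++i)
      n *= ceilDiv(workload[i], tileSizes[i]);
    return n;
  };
  std::size_t currDim = workload.size();
  while (numWorkgroups() > limit && currDim > 0) {
    std::size_t index = currDim - 1;
    int64_t currSize = tileSizes[index];
    if (currSize >= workload[index]) {
      --currDim; // this dim is exhausted; move outward
      continue;
    }
    int64_t newSize = std::min<int64_t>(currSize * 2, workload[index]);
    // Simplified hint check (assumption): only grow to sizes that stay
    // vector-aligned and divide the workload evenly.
    int64_t vectorSize = vectorSizeHints[index];
    if (vectorSize > 1 &&
        (newSize % vectorSize != 0 || workload[index] % newSize != 0)) {
      --currDim;
      continue;
    }
    tileSizes[index] = newSize;
  }
}

int main() {
  std::vector<int64_t> workload = {128, 128};
  std::vector<int64_t> tileSizes = {8, 8}; // initial candidates
  std::vector<int64_t> hints = {1, 1};     // no vector size hints
  reduceWorkgroups(workload, tileSizes, hints);
  std::cout << tileSizes[0] << " " << tileSizes[1] << "\n"; // prints: 8 128
}

On this input the inner dimension is doubled four times (8 -> 128), leaving 16 x 1 = 16 workgroups, exactly the 2 x 8 budget; the final divide-evenly fixup from the real code is omitted here.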
@@ -1358,7 +1375,9 @@ static SmallVector<int64_t> getPackVectorTileSizes(func::FuncOp entryPointFn,
   SmallVector<int64_t> tileSizes(op.getSourceRank(), 1);
   auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(entryPointFn);
   int64_t vectorSize = getVectorSize(entryPointFn, op.getSourceType());
-  if (hasAVX512fFeature(targetAttr) && isPackMatmulLHS(op)) {
+  // TODO(#15421): Improve tile sizes selection for non f32 cases.
+  if (op.getSourceType().getElementType().isF32() &&
+      hasAVX512fFeature(targetAttr) && isPackMatmulLHS(op)) {
     tileSizes.back() = vectorSize;
   }
   return tileSizes;
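
Editor's note, for concreteness: with the 64-byte native vector used in the tests below, getVectorSize yields 64 / 4 = 16 lanes for f32 (an inference from the target attributes, not spelled out in this diff), so an f32 matmul-LHS pack on an AVX-512 target now gets inner tile sizes [1, ..., 1, 16], while non-f32 packs keep all ones until TODO(#15421) is resolved.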
@@ -1370,6 +1389,16 @@ static LogicalResult setRootConfig(func::FuncOp entryPointFn,
   SmallVector<int64_t> distTileSizes =
       getDefaultDistributionTileSizes(cast<TilingInterface>(op.getOperation()));
 
+  int64_t vectorSize = getVectorSize(entryPointFn, op.getSourceType());
+  SmallVector<int64_t> vectorSizeHints(op.getSourceRank(), 1);
+  for (auto dim : op.getInnerDimsPos()) {
+    vectorSizeHints[dim] = vectorSize;
+  }
+
+  SmallVector<int64_t> workload(op.getSourceType().getShape());
+  reduceDistributionWorkgroups(workload, distTileSizes,
+                               /*maxTileSizes=*/std::nullopt, vectorSizeHints);
+
   // The default function aims to return the workload per workgroup, but it
   // does not know that it is working on a packed domain. We need to take the
   // inner tile sizes into account and adjust the distribution tile sizes.
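Editor's note: the new setRootConfig block above seeds per-dimension vector size hints from the pack op's inner_dims_pos, then re-runs the workgroup reduction over the source shape. A sketch of just that mapping, with the op replaced by plain vectors (the function and parameter names are illustrative, not IREE API):

#include <cstdint>
#include <vector>

// Dims tiled by the pack (listed in innerDimsPos) are hinted with the target
// vector size; every other dim gets a neutral hint of 1.
std::vector<int64_t> makeVectorSizeHints(int64_t sourceRank,
                                         const std::vector<int64_t> &innerDimsPos,
                                         int64_t vectorSize) {
  std::vector<int64_t> hints(sourceRank, 1);
  for (int64_t dim : innerDimsPos)
    hints[dim] = vectorSize;
  return hints;
}

For the pack_many_elements test added below (rank-2 source, inner_dims_pos = [1, 0], f32 on AVX-512 so vectorSize = 16), this produces hints of {16, 16} for the workload {1200, 500000}.
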
@@ -498,7 +498,7 @@ hal.executable private @pack {
     }
   }
 }
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[8, 64], [1, 1]]>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[2, 40], [1, 1]]>
 // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling>
 // CHECK: hal.executable.export public @pack
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
@@ -1574,6 +1574,43 @@ hal.executable private @pack {

 // -----
 
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+hal.executable private @pack_many_elements {
+  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {
+    cpu_features = "+avx512f",
+    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+    native_vector_size = 64 : index,
+    target_triple = "x86_64-none-elf"
+  }>) {
+    hal.executable.export public @pack_many_elements layout(#pipeline_layout)
+    builtin.module {
+      func.func @pack_many_elements() {
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1200x500000xf32>>
+        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<31250x1200x16x1xf32>>
+        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1200, 500000], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1200x500000xf32>> -> tensor<1200x500000xf32>
+        %3 = tensor.empty() : tensor<31250x1200x16x1xf32>
+        %pack = tensor.pack %2 outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %3 : tensor<1200x500000xf32> -> tensor<31250x1200x16x1xf32>
+        flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [31250, 1200, 16, 1], strides = [1, 1, 1, 1] : tensor<31250x1200x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<31250x1200x16x1xf32>>
+        return
+      }
+    }
+  }
+}
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[128, 31250], [1, 1]]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling>
+// CHECK: hal.executable.export public @pack_many_elements
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK: tensor.pack
+// CHECK-SAME: lowering_config = #[[CONFIG]]
+
+// -----
+
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   <0, bindings = [
     <0, storage_buffer, ReadOnly>,
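
Editor's note, checking the arithmetic behind the new expectations: packing tensor<1200x500000xf32> with inner_tiles = [16, 1] along inner_dims_pos = [1, 0] yields 31250x1200x16x1 because 500000 / 16 = 31250. With the chosen distribution tile sizes [128, 31250] over the 1200x500000 source domain, the dispatch gets ceil(1200/128) x (500000/31250) = 10 x 16 = 160 workgroups, rather than the tens of thousands a small fixed tile cap (e.g. a 64 x 64 default, an assumption here) would produce for this many elements, which is exactly what the pack_many_elements test exercises.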