Commit 90f1f14

[CPU] Improve distribution tile sizes selection.

hanhanW committed Nov 6, 2023
1 parent fde520e
Showing 3 changed files with 128 additions and 62 deletions.
151 changes: 90 additions & 61 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -371,81 +371,40 @@ static unsigned getReferenceTypeLengthInBytes(func::FuncOp entryPointFn) {
   return referenceTypeLengthInBytes;
 }
 
-/// Returns the default tile sizes to use for the loops that are distributed.
-static SmallVector<int64_t>
-getDefaultDistributionTileSizes(ArrayRef<int64_t> lbs, ArrayRef<int64_t> ubs,
-                                ArrayRef<int64_t> minTileSizes,
-                                ArrayRef<int64_t> maxTileSizes,
-                                ArrayRef<int64_t> vectorSizeHints) {
-  assert(lbs.size() == ubs.size() && lbs.size() == minTileSizes.size() &&
-         lbs.size() == maxTileSizes.size() &&
-         "expected all vectors to be of equal size");
-
-  size_t numDims = lbs.size();
-  // Set all the distribution tile sizes to zero if thread distribution is
-  // disabled.
-  if (clDisableDistribution) {
-    return SmallVector<int64_t>(numDims, 0);
-  }
-
-  SmallVector<int64_t> distributedTileSizes(numDims, 1);
-  SmallVector<int64_t> numWorkgroupsPerDim(numDims, 1);
-  SmallVector<int64_t> workload(numDims, 1);
-  for (auto i : llvm::seq<size_t>(0, numDims)) {
-    if (maxTileSizes[i] == 0 || ShapedType::isDynamic(lbs[i]) ||
-        ShapedType::isDynamic(ubs[i])) {
-      distributedTileSizes[i] = maxTileSizes[i];
-      workload[i] = ShapedType::kDynamic;
+// Reduces the number of workgroups in cases where we are dividing the work too
+// much. Over-provision the number of workgroups to twice the number of
+// threads.
+static void reduceDistributionWorkgroups(
+    ArrayRef<int64_t> workload, SmallVectorImpl<int64_t> &distributedTileSizes,
+    std::optional<ArrayRef<int64_t>> maxTileSizes = std::nullopt,
+    std::optional<ArrayRef<int64_t>> vectorSizeHints = std::nullopt) {
+  assert(workload.size() == distributedTileSizes.size());
+  SmallVector<int64_t> numWorkgroupsPerDim(workload.size(), 1);
+  for (auto [idx, value] : llvm::enumerate(workload)) {
+    if (distributedTileSizes[idx] == 0 || ShapedType::isDynamic(value)) {
       continue;
     }
-
-    assert(lbs[i] <= ubs[i]);
-    workload[i] = ubs[i] - lbs[i];
-    int64_t candidateTileSize = 1;
-    int64_t targetSize = std::min(workload[i] / 2, maxTileSizes[i]);
-    int64_t vectorSize = vectorSizeHints[i];
-    if (vectorSize > 1) {
-      // Pick the factor of dim which is closest to the target tile size and
-      // is a multiple of vector size.
-      for (int64_t k = vectorSize; k <= targetSize; k += vectorSize) {
-        if (workload[i] % k == 0 && k >= minTileSizes[i]) {
-          candidateTileSize = k;
-        }
-      }
-    }
-    // Fall back to power of 2 if there's no hint or we can't find the ideal
-    // size.
-    if (vectorSize <= 1 || candidateTileSize == 1) {
-      candidateTileSize = std::max<int64_t>(
-          llvm::bit_floor<uint64_t>(targetSize), minTileSizes[i]);
-    }
-
-    // Cap the tile size at the given max to keep the work per invocation
-    // reasonable.
-    distributedTileSizes[i] =
-        std::min<int64_t>(candidateTileSize, maxTileSizes[i]);
-    numWorkgroupsPerDim[i] =
-        llvm::divideCeil(workload[i], distributedTileSizes[i]);
+    numWorkgroupsPerDim[idx] =
+        llvm::divideCeil(value, distributedTileSizes[idx]);
   }
 
-  // Reduce the number of workgroups in cases where we are dividing the work too
-  // much. Over-provision the number of workgroups to twice the number of
-  // threads.
   int64_t numWorkgroupsLimit = 2 * clNumberOfRuntimeThreads;
   int64_t numWorkgroups =
       std::accumulate(numWorkgroupsPerDim.begin(), numWorkgroupsPerDim.end(),
                       1LL, std::multiplies<int64_t>{});
-  unsigned currDim = numDims;
+  unsigned currDim = workload.size();
   while (numWorkgroups > numWorkgroupsLimit && currDim > 0) {
     unsigned index = currDim - 1;
     int64_t currSize = distributedTileSizes[index];
     if (workload[index] == ShapedType::kDynamic ||
-        currSize >= maxTileSizes[index] || currSize >= workload[index]) {
+        (maxTileSizes && currSize >= maxTileSizes.value()[index]) ||
+        currSize >= workload[index]) {
      currDim--;
      continue;
    }

    int64_t newSize = std::min<int64_t>(currSize * 2, workload[index]);
-    int64_t vectorSize = vectorSizeHints[index];
+    int64_t vectorSize = vectorSizeHints ? vectorSizeHints.value()[index] : 0;
 
     // Check if it's the ideal size with vector size hint. And skip if the new
     // size will break the ideal size.
@@ -470,22 +429,80 @@ getDefaultDistributionTileSizes(ArrayRef<int64_t> lbs, ArrayRef<int64_t> ubs,

   // Final fixup for dividing workload evenly.
   for (auto i : llvm::seq<unsigned>(0, distributedTileSizes.size())) {
-    if (distributedTileSizes[i] == 0 || ShapedType::isDynamic(workload[i]))
+    if (distributedTileSizes[i] == 0 || ShapedType::isDynamic(workload[i])) {
       continue;
+    }
 
     int64_t nwg = llvm::divideCeil(workload[i], distributedTileSizes[i]);
     int64_t newSize = llvm::divideCeil(workload[i], nwg);
 
     // Check if it's the ideal size with vector size hint. And skip if the new
     // size will break the ideal size.
-    int64_t vectorSize = vectorSizeHints[i];
+    int64_t vectorSize = vectorSizeHints ? vectorSizeHints.value()[i] : 0;
     if (vectorSize > 1 &&
         (newSize % vectorSize != 0 || workload[i] % newSize != 0)) {
       continue;
     }
 
     distributedTileSizes[i] = newSize;
   }
+}
+
+/// Returns the default tile sizes to use for the loops that are distributed.
+static SmallVector<int64_t>
+getDefaultDistributionTileSizes(ArrayRef<int64_t> lbs, ArrayRef<int64_t> ubs,
+                                ArrayRef<int64_t> minTileSizes,
+                                ArrayRef<int64_t> maxTileSizes,
+                                ArrayRef<int64_t> vectorSizeHints) {
+  assert(lbs.size() == ubs.size() && lbs.size() == minTileSizes.size() &&
+         lbs.size() == maxTileSizes.size() &&
+         "expected all vectors to be of equal size");
+
+  size_t numDims = lbs.size();
+  // Set all the distribution tile sizes to zero if thread distribution is
+  // disabled.
+  if (clDisableDistribution) {
+    return SmallVector<int64_t>(numDims, 0);
+  }
+
+  SmallVector<int64_t> distributedTileSizes(numDims, 1);
+  SmallVector<int64_t> workload(numDims, 1);
+  for (auto i : llvm::seq<size_t>(0, numDims)) {
+    if (maxTileSizes[i] == 0 || ShapedType::isDynamic(lbs[i]) ||
+        ShapedType::isDynamic(ubs[i])) {
+      distributedTileSizes[i] = maxTileSizes[i];
+      workload[i] = ShapedType::kDynamic;
+      continue;
+    }
+
+    assert(lbs[i] <= ubs[i]);
+    workload[i] = ubs[i] - lbs[i];
+    int64_t candidateTileSize = 1;
+    int64_t targetSize = std::min(workload[i] / 2, maxTileSizes[i]);
+    int64_t vectorSize = vectorSizeHints[i];
+    if (vectorSize > 1) {
+      // Pick the factor of dim which is closest to the target tile size and
+      // is a multiple of vector size.
+      for (int64_t k = vectorSize; k <= targetSize; k += vectorSize) {
+        if (workload[i] % k == 0 && k >= minTileSizes[i]) {
+          candidateTileSize = k;
+        }
+      }
+    }
+    // Fall back to power of 2 if there's no hint or we can't find the ideal
+    // size.
+    if (vectorSize <= 1 || candidateTileSize == 1) {
+      candidateTileSize = std::max<int64_t>(
+          llvm::bit_floor<uint64_t>(targetSize), minTileSizes[i]);
+    }
+
+    // Cap the tile size at the given max to keep the work per invocation
+    // reasonable.
+    distributedTileSizes[i] =
+        std::min<int64_t>(candidateTileSize, maxTileSizes[i]);
+  }
+
+  reduceDistributionWorkgroups(workload, distributedTileSizes, maxTileSizes,
+                               vectorSizeHints);
+
   return distributedTileSizes;
 }
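
Editor's note: the while-loop above is the heart of the refactor. Starting from the candidate tile sizes, it walks dimensions from innermost to outermost and doubles a dimension's tile size (halving its workgroup count) until the total fits the 2x-threads budget or nothing can grow further. Below is a minimal standalone sketch of that behavior, not the IREE code itself: the MLIR/LLVM dependencies are dropped, dynamic shapes are ignored, kNumThreads stands in for clNumberOfRuntimeThreads, and the vector-hint check is a simplified assumption since the exact condition sits in a collapsed region of this diff.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

constexpr int64_t kNumThreads = 8; // stand-in for clNumberOfRuntimeThreads

int64_t ceilDiv(int64_t a, int64_t b) { return (a + b - 1) / b; }

// Doubles tile sizes, innermost dim first, until the workgroup count drops
// to at most twice the thread count or no dim can grow further.
void reduceWorkgroups(const std::vector<int64_t> &workload,
                      std::vector<int64_t> &tileSizes,
                      const std::vector<int64_t> &vectorSizeHints) {
  const int64_t limit = 2 * kNumThreads;
  auto numWorkgroups = [&] {
    int64_t n = 1;
    for (std::size_t i = 0; i < workload.size(); ++i)
      n *= ceilDiv(workload[i], tileSizes[i]);
    return n;
  };
  std::size_t currDim = workload.size();
  while (numWorkgroups() > limit && currDim > 0) {
    std::size_t index = currDim - 1;
    int64_t currSize = tileSizes[index];
    if (currSize >= workload[index]) {
      --currDim; // this dim is exhausted; move outward
      continue;
    }
    int64_t newSize = std::min<int64_t>(currSize * 2, workload[index]);
    // Simplified hint check (assumption): only grow to sizes that stay
    // vector-aligned and divide the workload evenly.
    int64_t vectorSize = vectorSizeHints[index];
    if (vectorSize > 1 &&
        (newSize % vectorSize != 0 || workload[index] % newSize != 0)) {
      --currDim;
      continue;
    }
    tileSizes[index] = newSize;
  }
}

int main() {
  std::vector<int64_t> workload = {128, 128};
  std::vector<int64_t> tileSizes = {8, 8}; // initial candidates
  std::vector<int64_t> hints = {1, 1};     // no vector size hints
  reduceWorkgroups(workload, tileSizes, hints);
  std::cout << tileSizes[0] << " " << tileSizes[1] << "\n"; // prints: 8 128
}

On this input the inner dimension is doubled four times (8 -> 128), leaving 16 x 1 = 16 workgroups, exactly the 2 x 8 budget; the final divide-evenly fixup from the real code is omitted here.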
@@ -1358,7 +1375,9 @@ static SmallVector<int64_t> getPackVectorTileSizes(func::FuncOp entryPointFn,
   SmallVector<int64_t> tileSizes(op.getSourceRank(), 1);
   auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(entryPointFn);
   int64_t vectorSize = getVectorSize(entryPointFn, op.getSourceType());
-  if (hasAVX512fFeature(targetAttr) && isPackMatmulLHS(op)) {
+  // TODO(#15421): Improve tile sizes selection for non f32 cases.
+  if (op.getSourceType().getElementType().isF32() &&
+      hasAVX512fFeature(targetAttr) && isPackMatmulLHS(op)) {
     tileSizes.back() = vectorSize;
   }
   return tileSizes;
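
Editor's note, for concreteness: with the 64-byte native vector used in the tests below, getVectorSize yields 64 / 4 = 16 lanes for f32 (an inference from the target attributes, not spelled out in this diff), so an f32 matmul-LHS pack on an AVX-512 target now gets inner tile sizes [1, ..., 1, 16], while non-f32 packs keep all ones until TODO(#15421) is resolved.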
@@ -1370,6 +1389,16 @@ static LogicalResult setRootConfig(func::FuncOp entryPointFn,
   SmallVector<int64_t> distTileSizes =
       getDefaultDistributionTileSizes(cast<TilingInterface>(op.getOperation()));
 
+  int64_t vectorSize = getVectorSize(entryPointFn, op.getSourceType());
+  SmallVector<int64_t> vectorSizeHints(op.getSourceRank(), 1);
+  for (auto dim : op.getInnerDimsPos()) {
+    vectorSizeHints[dim] = vectorSize;
+  }
+
+  SmallVector<int64_t> workload(op.getSourceType().getShape());
+  reduceDistributionWorkgroups(workload, distTileSizes,
+                               /*maxTileSizes=*/std::nullopt, vectorSizeHints);
+
   // The default function aims to return the workload per workgroup, but it
   // does not know that it is working on a packed domain. We need to take the
   // inner tile sizes into account and adjust the distribution tile sizes.
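Editor's note: the new setRootConfig block above seeds per-dimension vector size hints from the pack op's inner_dims_pos, then re-runs the workgroup reduction over the source shape. A sketch of just that mapping, with the op replaced by plain vectors (the function and parameter names are illustrative, not IREE API):

#include <cstdint>
#include <vector>

// Dims tiled by the pack (listed in innerDimsPos) are hinted with the target
// vector size; every other dim gets a neutral hint of 1.
std::vector<int64_t> makeVectorSizeHints(int64_t sourceRank,
                                         const std::vector<int64_t> &innerDimsPos,
                                         int64_t vectorSize) {
  std::vector<int64_t> hints(sourceRank, 1);
  for (int64_t dim : innerDimsPos)
    hints[dim] = vectorSize;
  return hints;
}

For the pack_many_elements test added below (rank-2 source, inner_dims_pos = [1, 0], f32 on AVX-512 so vectorSize = 16), this produces hints of {16, 16} for the workload {1200, 500000}.
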
@@ -498,7 +498,7 @@ hal.executable private @pack {
     }
   }
 }
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[8, 64], [1, 1]]>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[2, 40], [1, 1]]>
 // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling>
 // CHECK: hal.executable.export public @pack
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
@@ -1574,6 +1574,43 @@ hal.executable private @pack {

 // -----
 
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+hal.executable private @pack_many_elements {
+  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {
+    cpu_features = "+avx512f",
+    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+    native_vector_size = 64 : index,
+    target_triple = "x86_64-none-elf"
+  }>) {
+    hal.executable.export public @pack_many_elements layout(#pipeline_layout)
+    builtin.module {
+      func.func @pack_many_elements() {
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1200x500000xf32>>
+        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<31250x1200x16x1xf32>>
+        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1200, 500000], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1200x500000xf32>> -> tensor<1200x500000xf32>
+        %3 = tensor.empty() : tensor<31250x1200x16x1xf32>
+        %pack = tensor.pack %2 outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %3 : tensor<1200x500000xf32> -> tensor<31250x1200x16x1xf32>
+        flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [31250, 1200, 16, 1], strides = [1, 1, 1, 1] : tensor<31250x1200x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<31250x1200x16x1xf32>>
+        return
+      }
+    }
+  }
+}
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[128, 31250], [1, 1]]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling>
+// CHECK: hal.executable.export public @pack_many_elements
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK: tensor.pack
+// CHECK-SAME: lowering_config = #[[CONFIG]]
+
+// -----
+
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   <0, bindings = [
     <0, storage_buffer, ReadOnly>,
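
Editor's note, checking the arithmetic behind the new expectations: packing tensor<1200x500000xf32> with inner_tiles = [16, 1] along inner_dims_pos = [1, 0] yields 31250x1200x16x1 because 500000 / 16 = 31250. With the chosen distribution tile sizes [128, 31250] over the 1200x500000 source domain, the dispatch gets ceil(1200/128) x (500000/31250) = 10 x 16 = 160 workgroups, rather than the tens of thousands a small fixed tile cap (e.g. a 64 x 64 default, an assumption here) would produce for this many elements, which is exactly what the pack_many_elements test exercises.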