[LLVMCPU] Add LLVMCPU support for winograd.filter_transform op (#17105)
This PR adds a tiling configuration for the `winograd.filter_transform`
op in the LLVMCPU winograd pipeline.
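
For orientation, `winograd.filter_transform` moves a convolution filter into the Winograd domain. With `output_tile_size(6)` and `kernel_size(3)`, each 3x3 kernel slice expands to an 8x8 tile, since the transformed tile size is 6 + 3 - 1 = 8. Below is a minimal sketch of the op after this configuration runs, with SSA names and shapes reused from the new test case in this commit and attribute values taken from its CHECK lines (the [32, 64] distribution tiles are heuristic and target-dependent; the [1, 1] vector tiles are the fixed part):

#config = #iree_codegen.lowering_config<tile_sizes = [[32, 64], [1, 1]]>
%4 = iree_linalg_ext.winograd.filter_transform {lowering_config = #config}
       output_tile_size(6) kernel_size(3) kernel_dimensions([0, 1])
       ins(%2 : tensor<3x3x64x128xf32>)
       outs(%3 : tensor<8x8x64x128xf32>) -> tensor<8x8x64x128xf32>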
Max191 authored Apr 26, 2024
1 parent 1ac066a commit ab54a60
Showing 2 changed files with 51 additions and 32 deletions.
58 changes: 26 additions & 32 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -1672,45 +1672,37 @@ static LogicalResult setRootConfig(mlir::FunctionOpInterface entryPointFn,
       entryPointFn, fftOp, tileSizes, DispatchLoweringPassPipeline::CPUDefault);
 }
 
-/// Sets the lowering configuration for dispatch region for
-/// linalg_ext.winograd.input_transform root op.
+/// Sets the lowering configuration for dispatch region for winograd ops:
+///   linalg_ext.winograd.filter_transform
+///   linalg_ext.winograd.input_transform
+///   linalg_ext.winograd.output_transform
+/// The vector tile sizes should be 1 for each dim here, because
+/// the winograd decomposition relies on these unit dimensions.
+template <typename WinogradOp>
 static LogicalResult
-setRootConfig(mlir::FunctionOpInterface entryPointFn,
-              IREE::LinalgExt::WinogradInputTransformOp inputOp) {
-  assert(!getLoweringConfig(inputOp) && "expected lowering_config is not set");
-  auto iterationRank = inputOp.getIterationDomainRank();
-  SmallVector<int64_t> vecSizeHints(iterationRank, 1);
-  DistributionHeuristicConfig distConfig;
-  distConfig.vectorSizeHints = vecSizeHints;
-  SmallVector<int64_t> distTileSizes =
-      getDefaultDistributedLevelTileSizes(inputOp, distConfig);
-  TileSizesListType tileSizes;
-  tileSizes.push_back(distTileSizes);
-  SmallVector<int64_t> vecTileSizes(iterationRank, 1);
-  tileSizes.push_back(vecTileSizes);
-  return setOpConfigAndEntryPointFnTranslation(
-      entryPointFn, inputOp, tileSizes,
-      DispatchLoweringPassPipeline::CPULinalgExtTileAndVectorize);
-}
-
-/// Sets the lowering configuration for dispatch region for
-/// linalg_ext.winograd.input_transform root op.
-static LogicalResult
-setRootConfig(mlir::FunctionOpInterface entryPointFn,
-              IREE::LinalgExt::WinogradOutputTransformOp outputOp) {
-  assert(!getLoweringConfig(outputOp) && "expected lowering_config is not set");
-  auto iterationRank = outputOp.getIterationDomainRank();
+setWinogradRootConfig(mlir::FunctionOpInterface entryPointFn,
+                      WinogradOp winogradOp) {
+  static_assert(
+      std::is_same<WinogradOp, IREE::LinalgExt::WinogradInputTransformOp>() ||
+          std::is_same<WinogradOp,
+                       IREE::LinalgExt::WinogradOutputTransformOp>() ||
+          std::is_same<WinogradOp,
+                       IREE::LinalgExt::WinogradFilterTransformOp>(),
+      "op expected to be a winograd op");
+  assert(!getLoweringConfig(winogradOp) &&
+         "expected lowering_config is not set");
+  auto iterationRank = winogradOp.getIterationDomainRank();
   SmallVector<int64_t> vecSizeHints(iterationRank, 1);
   DistributionHeuristicConfig distConfig;
   distConfig.vectorSizeHints = vecSizeHints;
   SmallVector<int64_t> distTileSizes =
-      getDefaultDistributedLevelTileSizes(outputOp, distConfig);
+      getDefaultDistributedLevelTileSizes(winogradOp, distConfig);
   TileSizesListType tileSizes;
   tileSizes.push_back(distTileSizes);
   SmallVector<int64_t> vecTileSizes(iterationRank, 1);
   tileSizes.push_back(vecTileSizes);
   return setOpConfigAndEntryPointFnTranslation(
-      entryPointFn, outputOp, tileSizes,
+      entryPointFn, winogradOp, tileSizes,
       DispatchLoweringPassPipeline::CPULinalgExtTileAndVectorize);
 }
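
The templated helper above folds what were two near-identical overloads for the input and output transforms into one and extends the same recipe to the filter transform. Its observable result is a two-level tile-size list plus a pipeline selection; a sketch with the values the new test expects (level-0 distribution tiles come from getDefaultDistributedLevelTileSizes and vary by target; the level-1 unit tiles are what the winograd decomposition relies on):

// Level 0 distributes the iteration domain across workgroups; for the
// 3x3x64x128 filter in the new test this is the 2-D 64x128 C x F space.
// Level 1 holds the unit vector tiles.
#iree_codegen.lowering_config<tile_sizes = [[32, 64], [1, 1]]>
// Pipeline set by setOpConfigAndEntryPointFnTranslation:
#iree_codegen.translation_info<CPULinalgExtTileAndVectorize>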

@@ -2301,11 +2293,13 @@ setRootConfigImpl(mlir::FunctionOpInterface entryPointFn, Operation *op,
                              targetMLTransInfo);
         })
         .Case<IREE::LinalgExt::AttentionOp, IREE::LinalgExt::FftOp,
-              IREE::LinalgExt::WinogradInputTransformOp,
-              IREE::LinalgExt::WinogradOutputTransformOp, tensor::PackOp,
-              tensor::PadOp, tensor::UnPackOp, linalg::Mmt4DOp,
+              tensor::PackOp, tensor::PadOp, tensor::UnPackOp, linalg::Mmt4DOp,
               linalg::BatchMmt4DOp>(
             [&](auto op) { return setRootConfig(entryPointFn, op); })
+        .Case<IREE::LinalgExt::WinogradFilterTransformOp,
+              IREE::LinalgExt::WinogradInputTransformOp,
+              IREE::LinalgExt::WinogradOutputTransformOp>(
+            [&](auto op) { return setWinogradRootConfig(entryPointFn, op); })
         .Case<linalg::Conv2DNhwcHwcfOp, linalg::Conv2DNchwFchwOp,
               linalg::PoolingNhwcSumOp, linalg::PoolingNhwcMaxOp,
               linalg::PoolingNhwcMaxUnsignedOp, linalg::PoolingNhwcMinOp,
25 changes: 25 additions & 0 deletions in the second changed file (an LLVMCPU lowering-strategy lit test)
@@ -1587,6 +1587,31 @@ module {
 
 // -----
 
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
+  cpu = "generic", cpu_features = "",
+  data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+  native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
+module {
+  func.func @winograd_filter_transform() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+    %c0 = arith.constant 0 : index
+    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x3x64x128xf32>>
+    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x8x64x128xf32>>
+    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [3, 3, 64, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x64x128xf32>> -> tensor<3x3x64x128xf32>
+    %3 = tensor.empty() : tensor<8x8x64x128xf32>
+    %4 = iree_linalg_ext.winograd.filter_transform output_tile_size(6) kernel_size(3) kernel_dimensions([0, 1]) ins(%2 : tensor<3x3x64x128xf32>) outs(%3 : tensor<8x8x64x128xf32>) -> tensor<8x8x64x128xf32>
+    flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [8, 8, 64, 128], strides = [1, 1, 1, 1] : tensor<8x8x64x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<8x8x64x128xf32>>
+    return
+  }
+}
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 64], [1, 1]]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPULinalgExtTileAndVectorize>
+// CHECK: func.func @winograd_filter_transform()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK: iree_linalg_ext.winograd.filter_transform
+// CHECK-SAME: {lowering_config = #[[CONFIG]]}
+
+// -----
+
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
   cpu = "generic", cpu_features = "",
   data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
