
Commit

[MLIR] Use test-lower-to-nvvm for sm_90 Integration Tests on GitHub (#68184)

This PR enables the `test-lower-to-nvvm` pass pipeline for the integration
tests targeting the NVIDIA sm_90 architecture.

It adjusts the `test-lower-to-nvvm` pipeline in two ways:

1) Runs `createConvertNVGPUToNVVMPass` before kernel outlining (see the
sketch after this list). This pass generates both device and host code: on
the host, the generated code calls the CUDA driver to build the TMA
descriptor (`cuTensorMap`), so the pass must run while host and device code
still live in the same function.

2) Integrates `createConvertNVVMToLLVMPass`, which generates PTX (as LLVM
inline assembly) for NVVM ops.
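
A minimal sketch of the ordering constraint in (1), using MLIR's
pass-manager API. The function name buildSm90Prefix is illustrative, and
the surrounding pipeline is elided:

#include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"

using namespace mlir;

// Illustrative sketch: before outlining, host and device code share one
// func.func, so the NVGPU lowering can emit the host-side cuTensorMap setup
// next to the kernel code it serves.
void buildSm90Prefix(OpPassManager &pm) {
  ConvertNVGPUToNVVMPassOptions nvgpuOptions;
  nvgpuOptions.useOpaquePointers = true;
  pm.addNestedPass<func::FuncOp>(createConvertNVGPUToNVVMPass(nvgpuOptions));
  // Only after NVGPU lowering are kernels split out into gpu.module.
  pm.addPass(createGpuKernelOutliningPass());
}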
grypp authored Oct 4, 2023
1 parent 20fc2ff commit afe4006
Showing 4 changed files with 27 additions and 48 deletions.
@@ -1,25 +1,11 @@
 // RUN: mlir-opt %s \
-// RUN: -convert-nvgpu-to-nvvm \
-// RUN: -gpu-kernel-outlining \
-// RUN: -convert-vector-to-scf \
-// RUN: -convert-scf-to-cf \
-// RUN: -convert-nvvm-to-llvm \
-// RUN: -convert-vector-to-llvm \
-// RUN: -convert-index-to-llvm=index-bitwidth=32 \
-// RUN: -convert-arith-to-llvm \
-// RUN: -finalize-memref-to-llvm='use-opaque-pointers=1' \
-// RUN: -convert-func-to-llvm \
-// RUN: -canonicalize -cse \
-// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts \
+// RUN: -test-lower-to-nvvm="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_runner_utils \
 // RUN: --entry-point-result=void \
 // RUN: | FileCheck %s
 
-
 // Test swizzling with TMA load
 // 128B Swizzle Each numbered cell is 16 byte
 // |-------------------------------|
@@ -1,19 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: -convert-nvgpu-to-nvvm \
-// RUN: -canonicalize -cse \
-// RUN: -gpu-kernel-outlining \
-// RUN: -convert-vector-to-scf \
-// RUN: -convert-scf-to-cf \
-// RUN: -convert-nvvm-to-llvm \
-// RUN: -convert-vector-to-llvm \
-// RUN: -convert-index-to-llvm=index-bitwidth=32 \
-// RUN: -convert-arith-to-llvm \
-// RUN: -finalize-memref-to-llvm='use-opaque-pointers=1' \
-// RUN: -convert-func-to-llvm \
-// RUN: -canonicalize -cse \
-// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
+// RUN: -test-lower-to-nvvm="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_runner_utils \
@@ -1,16 +1,10 @@
-// RUN: mlir-opt %s --convert-nvgpu-to-nvvm \
-// RUN: -gpu-kernel-outlining \
-// RUN: -convert-nvvm-to-llvm \
-// RUN: -convert-scf-to-cf \
-// RUN: -convert-vector-to-llvm \
-// RUN: -convert-index-to-llvm=index-bitwidth=32 \
-// RUN: -convert-arith-to-llvm \
-// RUN: -finalize-memref-to-llvm='use-opaque-pointers=1' \
-// RUN: -convert-func-to-llvm \
-// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
-// RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX
+// RUN: mlir-opt %s \
+// RUN: -test-lower-to-nvvm="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
+// RUN: | mlir-cpu-runner \
+// RUN: --shared-libs=%mlir_cuda_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
 
 // Basic PTX check to make sure we are generating the right instructions.
 
mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp (23 changes: 18 additions & 5 deletions)
@@ -20,6 +20,7 @@
 #include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
 #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
 #include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
+#include "mlir/Conversion/NVVMToLLVM/NVVMToLLVM.h"
 #include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
 #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h"
@@ -143,11 +144,6 @@ void buildGpuPassPipeline(OpPassManager &pm,
   pm.addNestedPass<gpu::GPUModuleOp>(
       createConvertGpuOpsToNVVMOps(convertGpuOpsToNVVMOpsOptions));
 
-  // TODO: C++20 designated initializers.
-  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
-  convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
   pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
 
   // Convert vector to LLVM (always needed).
@@ -157,6 +153,9 @@
   pm.addNestedPass<gpu::GPUModuleOp>(
       createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
 
+  // This pass is needed for PTX building
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertNVVMToLLVMPass());
+
   // Sprinkle some cleanups.
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createCSEPass());
@@ -167,6 +166,20 @@

 void buildLowerToNVVMPassPipeline(OpPassManager &pm,
                                   const TestLowerToNVVMOptions &options) {
+  // Start with a cleanup pass.
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(createCSEPass());
+
+  //===----------------------------------------------------------------------===//
+  // NVGPU lowers device code as well as host code to the driver, so must run
+  // before outlining.
+  //===----------------------------------------------------------------------===//
+  // TODO: C++20 designated initializers.
+  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
+  convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
+  pm.addNestedPass<func::FuncOp>(
+      createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
+
   //===----------------------------------------------------------------------===//
   // Host-specific stuff.
   //===----------------------------------------------------------------------===//
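
The second adjustment from the commit message, sketched the same way. The
function name buildGpuModuleTail is illustrative; the real
buildGpuPassPipeline contains many more passes around it:

#include "mlir/Conversion/NVVMToLLVM/NVVMToLLVM.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"

using namespace mlir;

// Illustrative sketch: NVVM ops without a direct LLVM intrinsic equivalent
// are rewritten as LLVM inline assembly carrying PTX, which is what lets
// gpu-module-to-binary serialize the module afterwards.
void buildGpuModuleTail(OpPassManager &pm) {
  pm.addNestedPass<gpu::GPUModuleOp>(createConvertNVVMToLLVMPass());
  // Cleanups, mirroring the surrounding test pipeline.
  pm.addPass(createCanonicalizerPass());
  pm.addPass(createCSEPass());
}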
