[Proton][Dialect] Add Initial Frontend and Target Backend Infrastructure For Proton Dialect #5506
Diff in the Python IR bindings (the file the review thread below refers to as `ir.cc`):

```diff
@@ -31,6 +31,8 @@
 #include "triton/Tools/Sys/GetEnv.hpp"
 #include "llvm/Support/SourceMgr.h"

+#include "third_party/proton/dialect/include/Dialect/Proton/IR/Dialect.h"
+
 namespace {

 namespace py = pybind11;
@@ -235,7 +237,8 @@ void init_triton_ir(py::module &&m) {
   registry.insert<TritonDialect, ::mlir::triton::gpu::TritonGPUDialect,
                   math::MathDialect, arith::ArithDialect, scf::SCFDialect,
                   ::mlir::gpu::GPUDialect, cf::ControlFlowDialect,
-                  LLVM::LLVMDialect, mlir::ub::UBDialect>();
+                  ::mlir::triton::proton::ProtonDialect, LLVM::LLVMDialect,
+                  mlir::ub::UBDialect>();
   mlir::LLVM::registerInlinerInterface(registry);
   registerBuiltinDialectTranslation(registry);
   registerLLVMDialectTranslation(registry);
@@ -1603,6 +1606,10 @@ void init_triton_ir(py::module &&m) {
                       llvm::StringRef(prefix));
             self.create<PrintOp>(prefixAttr, hex, values, isSigned);
           })
+      .def("create_proton_record",
+           [](TritonOpBuilder &self, bool isStart, int32_t regionId) -> void {
+             self.create<mlir::triton::proton::RecordOp>(isStart, regionId);
+           })
       .def("create_assert",
            [](TritonOpBuilder &self, Value &condition,
               const std::string &message) -> void {
```

Review conversation on the `create_proton_record` binding:

Reviewer: Let's take the Python API changes out of this PR. It's not determined yet and probably not very important in the short term. Users, like @pawelszczerbuk, can modify the GPU IR to instrument record ops.

Author (CRobeck): This is mostly just to have a template in place for future Proton frontend development to interact with the upper-level Triton Python kernel code. We can always modify the interface later, but how the Proton Python frontend fits together with the Triton IR builder wasn't super intuitive at first, so having this here is helpful for the moment, even if we ultimately change the API in the future. It also gives us some more testing avenues. What do you think?

Reviewer: The concern I have is that we don't have any real functionality associated with these record ops anyway. Usually we add a frontend op when there is at least some functionality implemented. For example, the most recent …

Author: What about putting things in a … ? Then that gives us an easy method to develop and test, in a way that turning them "on" is just a matter of moving them from the …

Reviewer: It's much better now! Let's just move it under … Also I think …

Author: Updated to be under proton.language.

Author: I think this would cause a lot of code disruption. TritonOpBuilder is a unique_ptr wrapper around the MLIR OpBuilder, so no matter what we'd have to find a way to pass that object to triton_proton.cc, either by moving the definition of TritonOpBuilder out of ir.cc into a header file, or possibly through a helper function defined in ir.cc and called from triton_proton.cc, which seems like an equal or greater amount of complexity than the existing code. Maybe we could just invoke the generic MLIR OpBuilder from triton_proton.cc? But then I think we'd be mixing and matching builder objects in the code generator.

Reviewer: I get it now. Let's move it to the last line of all ops and leave a comment: … It's a problem for all third-party backends that they cannot define custom ops. We will have to think about a better solution.

Author: Updated.

Author: Right, there has to be an upper-level Triton op that the backends can specialize, but I could not find a way to have a wholly custom third-party backend op. Given that TritonOpBuilder is defined in ir.cc, the only solution I see is to pull that class out into a standalone module that can be passed to each backend, so that they all operate on the same MLIR builder instance.
Diff in the AMD backend build configuration:

```diff
@@ -29,4 +29,5 @@ add_triton_library(TritonAMDGPUToLLVM
   LINK_LIBS PUBLIC
     TritonGPUToLLVM
     TritonAMDGPUIR
+    TritonProtonToLLVM
 )
```
Diff in the NVIDIA backend build configuration:

```diff
@@ -25,4 +25,5 @@ add_triton_library(TritonNVIDIAGPUToLLVM

   LINK_LIBS PUBLIC
     TritonGPUToLLVM
+    TritonProtonToLLVM
 )
```
New header declaring the lowering-pattern entry point (`@@ -0,0 +1,20 @@`):

```cpp
#ifndef TRITON_CONVERSION_TRITONPROTON_TO_LLVM_PATTERNS_TRITON_PROTON_OP_TO_LLVM_H
#define TRITON_CONVERSION_TRITONPROTON_TO_LLVM_PATTERNS_TRITON_PROTON_OP_TO_LLVM_H

#include "mlir/Conversion/LLVMCommon/TypeConverter.h"

using namespace mlir;
using namespace mlir::triton;

namespace mlir {
namespace triton {
namespace proton {
void populateRecordOpToLLVMPattern(LLVMTypeConverter &typeConverter,
                                   RewritePatternSet &patterns,
                                   const TargetInfoBase &targetInfo,
                                   PatternBenefit benefit);
} // namespace proton
} // namespace triton
} // namespace mlir

#endif
```
Diff registering the new conversion subdirectory:

```diff
@@ -1 +1,2 @@
 add_subdirectory(Dialect)
+add_subdirectory(TritonProtonToLLVM)
```
New build file for the conversion library (`@@ -0,0 +1,20 @@`):

```cmake
add_triton_library(TritonProtonToLLVM
    RecordOpToLLVM.cpp

    DEPENDS
    TritonGPUConversionPassIncGen

    LINK_LIBS PUBLIC
    MLIRIR
    MLIRPass
    MLIRGPUDialect
    MLIRGPUToNVVMTransforms
    MLIRGPUToROCDLTransforms
    MLIRGPUTransforms
    TritonAnalysis
    TritonIR
    ProtonIR
    TritonGPUIR
    TritonGPUTransforms
    TritonNvidiaGPUTransforms
)
```
New conversion pattern, RecordOpToLLVM.cpp (`@@ -0,0 +1,41 @@`):

```cpp
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/IR/PatternMatch.h"
#include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"
#include "triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h"
#include "triton/Conversion/TritonGPUToLLVM/Utility.h"
#include "triton/Dialect/Triton/IR/Dialect.h"

#include "third_party/proton/dialect/include/Dialect/Proton/IR/Dialect.h"
#include "third_party/proton/dialect/include/TritonProtonToLLVM/PatternTritonProtonOpToLLVM.h"

namespace {

struct RecordOpConversion
    : public ConvertOpToLLVMPattern<mlir::triton::proton::RecordOp> {
  explicit RecordOpConversion(LLVMTypeConverter &typeConverter,
                              const TargetInfoBase &targetInfo,
                              PatternBenefit benefit)
      : mlir::ConvertOpToLLVMPattern<mlir::triton::proton::RecordOp>(
            typeConverter, benefit),
        targetInfo(targetInfo) {}

  LogicalResult
  matchAndRewrite(mlir::triton::proton::RecordOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    rewriter.eraseOp(op);
    return success();
  }

protected:
  const TargetInfoBase &targetInfo;
};

} // namespace

void mlir::triton::proton::populateRecordOpToLLVMPattern(
    LLVMTypeConverter &typeConverter, RewritePatternSet &patterns,
    const TargetInfoBase &targetInfo, PatternBenefit benefit) {
  patterns.add<RecordOpConversion>(typeConverter, targetInfo, benefit);
}
```
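The conversion above carries no lowering logic yet: `matchAndRewrite` simply erases the op, so record markers vanish during LLVM lowering. A pure-Python miniature of that "match and erase" behavior (hypothetical helper operating on ops-as-strings, not the MLIR API):

```python
# Hypothetical miniature of the rewrite: a module is a flat list of op strings,
# and the "pattern" for proton.record deletes the op, mirroring
# rewriter.eraseOp(op) in RecordOpConversion.
def erase_record_ops(ops: list) -> list:
    return [op for op in ops if not op.startswith("proton.record")]


module = [
    "tt.load %x",
    "proton.record() {isStart = true, regionId = 0 : i32}",
    "tt.load %y",
    "proton.record() {isStart = false, regionId = 0 : i32}",
    "arith.addf %x, %y",
]
# Compute ops survive; record markers are dropped, as in the real lowering.
print(erase_record_ops(module))
```

In the real pass the pattern runs under the dialect-conversion driver with a benefit and type converter; this sketch only shows the net effect on the op stream.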
Diff in the profiler package exports:

```diff
@@ -7,5 +7,6 @@
     deactivate,
     finalize,
     profile,
+    dev,
     DEFAULT_PROFILE_NAME,
 )
```
New frontend wrapper (`@@ -0,0 +1,6 @@`):

```python
from triton._C.libtriton import ir
from triton.language import core as tl


def proton_record(isStart: bool, regionId: int, builder: ir.builder) -> tl.tensor:
    return tl.tensor(builder.create_proton_record(isStart, regionId), tl.void)
```
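A record op marks either the start (`isStart = true`) or the end (`isStart = false`) of a profiled region identified by `regionId`. The PR does not state a nesting rule, but the pairing semantics the API implies can be sketched with a hypothetical checker that verifies start/end markers bracket properly per region:

```python
# Hypothetical checker (not part of the PR): verifies that record events are
# properly nested, i.e. every end marker closes the most recently opened region.
def records_balanced(events: list) -> bool:
    open_regions = []  # stack of currently open regionIds
    for is_start, region_id in events:
        if is_start:
            open_regions.append(region_id)
        else:
            if not open_regions or open_regions[-1] != region_id:
                return False  # end without matching start
            open_regions.pop()
    return not open_regions  # every opened region must be closed


# Mirrors the test kernel's record(True, 0) ... record(False, 0) pair.
print(records_balanced([(True, 0), (False, 0)]))  # prints True
```

An unmatched start, e.g. `[(True, 0)]`, or a mismatched end, e.g. `[(True, 0), (False, 1)]`, would fail this check.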
New test (`@@ -0,0 +1,41 @@`):

```python
import torch
import pytest
import pathlib

import triton
import triton.language as tl
import triton.profiler as proton


def test_proton_record(tmp_path: pathlib.Path):

    @triton.jit
    def add_kernel(
        x_ptr,
        y_ptr,
        output_ptr,
        n_elements,
        BLOCK_SIZE: tl.constexpr,
    ):
        pid = tl.program_id(axis=0)
        block_start = pid * BLOCK_SIZE
        offsets = block_start + tl.arange(0, BLOCK_SIZE)
        mask = offsets < n_elements
        x = tl.load(x_ptr + offsets, mask=mask)
        proton.dev.record(True, 0)
        y = tl.load(y_ptr + offsets, mask=mask)
        proton.dev.record(False, 0)
        output = x + y
        tl.store(output_ptr + offsets, output, mask=mask)

    torch.manual_seed(0)
    size = 2**12
    x = torch.rand(size, device='cuda')
    y = torch.rand(size, device='cuda')
    output = torch.empty_like(x)
    n_elements = output.numel()
    grid = (1, 1, 1)
    pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
    ttir = pgm.asm['ttir']
    assert "proton.record() {isStart = true, regionId = 0 : i32}" in ttir
    assert "proton.record() {isStart = false, regionId = 0 : i32}" in ttir
```
Review conversation:

Reviewer: This can be taken out if we don't consider the frontend changes for now.

Reviewer: Maybe we don't need any conversion here anyway, because Proton's inputs are scalars, not tensors. Could you please double check? @CRobeck

Author (CRobeck): They are scalars, but I could envision a case where we pass in tensors in the future, for example passing in a Triton tensor object for object-specific tracing. I don't think the conversion hurts anything here, but I acknowledge it could potentially be misleading. You're right, though, that I can remove this entire function (populateProtonPatterns) and the test still passes.