diff --git a/third_party/proton/csrc/include/Data/Metric.h b/third_party/proton/csrc/include/Data/Metric.h index 0e22f7a0504d..8cc2a1bc5544 100644 --- a/third_party/proton/csrc/include/Data/Metric.h +++ b/third_party/proton/csrc/include/Data/Metric.h @@ -7,7 +7,7 @@ namespace proton { -enum class MetricKind { Flexible, Kernel, Count }; +enum class MetricKind { Flexible, Kernel, PCSampling, Count }; using MetricValueType = std::variant; @@ -148,6 +148,76 @@ class KernelMetric : public Metric { }; }; +class PCSamplingMetric : public Metric { +public: + enum PCSamplingMetricKind : int { + NumSamples, + NumStalledSamples, + StalledBranchResolving, + StalledNoInstruction, + StalledShortScoreboard, + StalledWait, + StalledLongScoreboard, + StalledTexThrottle, + StalledBarrier, + StalledMembar, + StalledIMCMiss, + StalledMIOThrottle, + StalledMathPipeThrottle, + StalledDrain, + StalledLGThrottle, + StalledNotSelected, + StalledMisc, + StalledDispatchStall, + StalledSleeping, + StalledSelected, + Count, + }; + + PCSamplingMetric() + : Metric(MetricKind::PCSampling, PCSamplingMetricKind::Count) {} + + PCSamplingMetric(PCSamplingMetricKind kind, uint64_t samples, + uint64_t stalledSamples) + : PCSamplingMetric() { + this->values[kind] = stalledSamples; + this->values[PCSamplingMetricKind::NumSamples] = samples; + this->values[PCSamplingMetricKind::NumStalledSamples] = stalledSamples; + } + + virtual const std::string getName() const { return "PCSamplingMetric"; } + + virtual const std::string getValueName(int valueId) const { + return VALUE_NAMES[valueId]; + } + + virtual bool isAggregable(int valueId) const { return true; } + +private: + const static inline std::string VALUE_NAMES[PCSamplingMetricKind::Count] = { + "NumSamples", + "NumStalledSamples", + "StalledBranchResolving", + "StalledNoInstruction", + "StalledShortScoreboard", + "StalledWait", + "StalledLongScoreboard", + "StalledTexThrottle", + "StalledBarrier", + "StalledMembar", + "StalledIMCMiss", + "StalledMIOThrottle", + "StalledMathPipeThrottle", + "StalledDrain", + "StalledLGThrottle", + "StalledNotSelected", + "StalledMisc", + "StalledDispatchStall", + "StalledSleeping", + "StalledSelected", + }; +}; + } // namespace proton #endif // PROTON_DATA_METRIC_H_ diff --git a/third_party/proton/csrc/include/Data/TreeData.h b/third_party/proton/csrc/include/Data/TreeData.h index 0250f2647e36..0ad17a0a4794 100644 --- a/third_party/proton/csrc/include/Data/TreeData.h +++ b/third_party/proton/csrc/include/Data/TreeData.h @@ -33,6 +33,7 @@ class TreeData : public Data { void init(); void dumpHatchet(std::ostream &os) const; void doDump(std::ostream &os, OutputFormat outputFormat) const override; + size_t getValidScopeId(size_t scopeId) const; class Tree; std::unique_ptr tree; diff --git a/third_party/proton/csrc/include/Driver/GPU/CuptiApi.h b/third_party/proton/csrc/include/Driver/GPU/CuptiApi.h index 845b415bd52f..70210215daad 100644 --- a/third_party/proton/csrc/include/Driver/GPU/CuptiApi.h +++ b/third_party/proton/csrc/include/Driver/GPU/CuptiApi.h @@ -2,11 +2,15 @@ #define PROTON_DRIVER_GPU_CUPTI_H_ #include "cupti.h" +#include "cupti_pcsampling.h" namespace proton { namespace cupti { +template +CUptiResult getContextId(CUcontext context, uint32_t *pCtxId); + template CUptiResult activityRegisterCallbacks( CUpti_BuffersCallbackRequestFunc funcBufferRequested, @@ -66,6 +70,40 @@ CUptiResult getGraphExecId(CUgraphExec graph, uint32_t *pId); template CUptiResult getGraphId(CUgraph graph, uint32_t *pId); +template +CUptiResult getCubinCrc(CUpti_GetCubinCrcParams *pParams); + +template +CUptiResult +getSassToSourceCorrelation(CUpti_GetSassToSourceCorrelationParams *pParams); + +template +CUptiResult +pcSamplingGetNumStallReasons(CUpti_PCSamplingGetNumStallReasonsParams *pParams); + +template +CUptiResult +pcSamplingGetStallReasons(CUpti_PCSamplingGetStallReasonsParams *pParams); + +template +CUptiResult pcSamplingSetConfigurationAttribute( + CUpti_PCSamplingConfigurationInfoParams *pParams); + +template +CUptiResult pcSamplingEnable(CUpti_PCSamplingEnableParams *pParams); + +template +CUptiResult pcSamplingDisable(CUpti_PCSamplingDisableParams *pParams); + +template +CUptiResult pcSamplingGetData(CUpti_PCSamplingGetDataParams *pParams); + +template +CUptiResult pcSamplingStart(CUpti_PCSamplingStartParams *pParams); + +template +CUptiResult pcSamplingStop(CUpti_PCSamplingStopParams *pParams); + } // namespace cupti } // namespace proton diff --git a/third_party/proton/csrc/include/Profiler/Cupti/CuptiPCSampling.h b/third_party/proton/csrc/include/Profiler/Cupti/CuptiPCSampling.h new file mode 100644 index 000000000000..c9b6cc3a9519 --- /dev/null +++ b/third_party/proton/csrc/include/Profiler/Cupti/CuptiPCSampling.h @@ -0,0 +1,132 @@ +#ifndef PROTON_PROFILER_CUPTI_PC_SAMPLING_H_ +#define PROTON_PROFILER_CUPTI_PC_SAMPLING_H_ + +#include "CuptiProfiler.h" +#include "Driver/GPU/CudaApi.h" +#include "Driver/GPU/CuptiApi.h" +#include "Utility/Map.h" +#include "Utility/Singleton.h" +#include +#include + +namespace proton { + +struct CubinData { + size_t cubinCrc; + const char *cubin; + size_t cubinSize; + + struct LineInfoKey { + uint32_t functionIndex; + uint64_t pcOffset; + + bool operator<(const LineInfoKey &other) const { + return functionIndex < other.functionIndex || + (functionIndex == other.functionIndex && + pcOffset < other.pcOffset); + } + }; + + struct LineInfoValue { + uint32_t lineNumber{}; + const std::string functionName{}; + const std::string dirName{}; + const std::string fileName{}; + + LineInfoValue() = default; + + LineInfoValue(uint32_t lineNumber, const std::string &functionName, + const std::string &dirName, const std::string &fileName) + : lineNumber(lineNumber), functionName(functionName), dirName(dirName), + fileName(fileName) {} + }; + + std::map lineInfo; +}; + +struct ConfigureData { + ConfigureData() = default; + + ~ConfigureData() { + if (stallReasonNames) { + for (size_t i = 0; i < numStallReasons; i++) { + if (stallReasonNames[i]) + std::free(stallReasonNames[i]); + } + std::free(stallReasonNames); + } + if (stallReasonIndices) + std::free(stallReasonIndices); + } + + void initialize(CUcontext context); + + CUpti_PCSamplingConfigurationInfo configureStallReasons(); + CUpti_PCSamplingConfigurationInfo configureSamplingPeriod(); + CUpti_PCSamplingConfigurationInfo configureSamplingBuffer(); + CUpti_PCSamplingConfigurationInfo configureScratchBuffer(); + CUpti_PCSamplingConfigurationInfo configureHardwareBufferSize(); + CUpti_PCSamplingConfigurationInfo configureStartStopControl(); + CUpti_PCSamplingConfigurationInfo configureCollectionMode(); + + // The amount of data reserved on the GPU + static constexpr size_t HardwareBufferSize = 128 * 1024 * 1024; + // The amount of data copied from the hardware buffer each time + static constexpr size_t ScratchBufferSize = 16 * 1024 * 1024; + // The number of PCs copied from the scratch buffer each time + static constexpr size_t DataBufferPCCount = 1024; + // The sampling period in cycles = 2^frequency + static constexpr uint32_t DefaultFrequency = 10; + + CUcontext context{}; + uint32_t contextId; + uint32_t numStallReasons{}; + uint32_t numValidStallReasons{}; + char **stallReasonNames{}; + uint32_t *stallReasonIndices{}; + std::map stallReasonIndexToMetricIndex{}; + std::set notIssuedStallReasonIndices{}; + CUpti_PCSamplingData pcSamplingData{}; + // The memory storing configuration information has to be kept alive during + // the profiling session + std::vector configurationInfos; +}; + +class CuptiPCSampling : public Singleton { + +public: + CuptiPCSampling() = default; + virtual ~CuptiPCSampling() = default; + + void initialize(CUcontext context); + + void start(CUcontext context); + + void stop(CUcontext context, uint64_t externId, bool isAPI); + + void finalize(CUcontext context); + + void loadModule(CUpti_ResourceData *resourceData); + + void unloadModule(CUpti_ResourceData *resourceData); + +private: + ConfigureData *getConfigureData(CUcontext context); + + CubinData *getCubinData(uint64_t cubinCrc); + + void processPCSamplingData(ConfigureData *configureData, uint64_t externId, + bool isAPI); + + ThreadSafeMap contextIdToConfigureData; + ThreadSafeMap cubinCrcToCubinData; + ThreadSafeSet contextInitialized; + + std::atomic pcSamplingStarted{false}; + std::mutex pcSamplingMutex{}; + std::mutex contextMutex{}; +}; + +} // namespace proton + +#endif // PROTON_PROFILER_CUPTI_PC_SAMPLING_H_ diff --git a/third_party/proton/csrc/include/Profiler/CuptiProfiler.h b/third_party/proton/csrc/include/Profiler/Cupti/CuptiProfiler.h similarity index 90% rename from third_party/proton/csrc/include/Profiler/CuptiProfiler.h rename to third_party/proton/csrc/include/Profiler/Cupti/CuptiProfiler.h index 344d0fd4b9df..c443ec2e398f 100644 --- a/third_party/proton/csrc/include/Profiler/CuptiProfiler.h +++ b/third_party/proton/csrc/include/Profiler/Cupti/CuptiProfiler.h @@ -1,7 +1,7 @@ #ifndef PROTON_PROFILER_CUPTI_PROFILER_H_ #define PROTON_PROFILER_CUPTI_PROFILER_H_ -#include "GPUProfiler.h" +#include "Profiler/GPUProfiler.h" namespace proton { diff --git a/third_party/proton/csrc/include/Profiler/GPUProfiler.h b/third_party/proton/csrc/include/Profiler/GPUProfiler.h index 26c6d10b5d50..d5033b06aa63 100644 --- a/third_party/proton/csrc/include/Profiler/GPUProfiler.h +++ b/third_party/proton/csrc/include/Profiler/GPUProfiler.h @@ -31,6 +31,16 @@ class GPUProfiler : public Profiler, std::unordered_map>>; using ApiExternIdSet = ThreadSafeSet>; + ConcreteProfilerT &enablePCSampling() { + pcSamplingEnabled = true; + return dynamic_cast(*this); + } + ConcreteProfilerT &disablePCSampling() { + pcSamplingEnabled = false; + return dynamic_cast(*this); + } + bool isPCSamplingEnabled() const { return pcSamplingEnabled; } + protected: // OpInterface void startOp(const Scope &scope) override { @@ -140,6 +150,8 @@ class GPUProfiler : public Profiler, ConcreteProfilerT &profiler; }; std::unique_ptr pImpl; + + bool pcSamplingEnabled{false}; }; } // namespace proton diff --git a/third_party/proton/csrc/include/Profiler/RoctracerProfiler.h b/third_party/proton/csrc/include/Profiler/Roctracer/RoctracerProfiler.h similarity index 91% rename from third_party/proton/csrc/include/Profiler/RoctracerProfiler.h rename to third_party/proton/csrc/include/Profiler/Roctracer/RoctracerProfiler.h index 2f1791dcb506..b9bc08de8e83 100644 --- a/third_party/proton/csrc/include/Profiler/RoctracerProfiler.h +++ b/third_party/proton/csrc/include/Profiler/Roctracer/RoctracerProfiler.h @@ -1,7 +1,7 @@ #ifndef PROTON_PROFILER_ROCTRACER_PROFILER_H_ #define PROTON_PROFILER_ROCTRACER_PROFILER_H_ -#include "GPUProfiler.h" +#include "Profiler/GPUProfiler.h" namespace proton { diff --git a/third_party/proton/csrc/include/Utility/Atomic.h b/third_party/proton/csrc/include/Utility/Atomic.h index d7e40e73cd24..e0bd8c9dcac5 100644 --- a/third_party/proton/csrc/include/Utility/Atomic.h +++ b/third_party/proton/csrc/include/Utility/Atomic.h @@ -1,3 +1,6 @@ +#ifndef PROTON_UTILITY_ATOMIC_H_ +#define PROTON_UTILITY_ATOMIC_H_ + #include namespace proton { @@ -17,3 +20,5 @@ template T atomicMin(std::atomic &target, T value) { } } // namespace proton + +#endif // PROTON_UTILITY_ATOMIC_H_ diff --git a/third_party/proton/csrc/include/Utility/Errors.h b/third_party/proton/csrc/include/Utility/Errors.h index 62d4f3f6650b..094723d6f7e8 100644 --- a/third_party/proton/csrc/include/Utility/Errors.h +++ b/third_party/proton/csrc/include/Utility/Errors.h @@ -1,3 +1,6 @@ +#ifndef PROTON_UTILITY_ERRORS_H_ +#define PROTON_UTILITY_ERRORS_H_ + #include namespace proton { @@ -8,3 +11,5 @@ class NotImplemented : public std::logic_error { }; } // namespace proton + +#endif // PROTON_UTILITY_ERRORS_H_ diff --git a/third_party/proton/csrc/include/Utility/String.h b/third_party/proton/csrc/include/Utility/String.h index b7d45ae1f74f..817ae3b0981f 100644 --- a/third_party/proton/csrc/include/Utility/String.h +++ b/third_party/proton/csrc/include/Utility/String.h @@ -13,6 +13,17 @@ inline std::string toLower(const std::string &str) { return lower; } +inline std::string replace(const std::string &str, const std::string &src, + const std::string &dst) { + std::string replaced = str; + size_t pos = 0; + while ((pos = replaced.find(src, pos)) != std::string::npos) { + replaced.replace(pos, src.length(), dst); + pos += dst.length(); + } + return replaced; +} + } // namespace proton #endif // PROTON_UTILITY_STRING_H_ diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index b12427f77774..738924a61a1b 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -180,66 +180,75 @@ void TreeData::dumpHatchet(std::ostream &os) const { jsonNodes[Tree::TreeNode::RootId] = &(output.back()); std::set valueNames; std::map> deviceIds; - this->tree->template walk( - [&](Tree::TreeNode &treeNode) { - const auto contextName = treeNode.name; - auto contextId = treeNode.id; - json *jsonNode = jsonNodes[contextId]; - (*jsonNode)["frame"] = {{"name", contextName}, {"type", "function"}}; - (*jsonNode)["metrics"] = json::object(); - for (auto [metricKind, metric] : treeNode.metrics) { - if (metricKind == MetricKind::Kernel) { - auto kernelMetric = std::dynamic_pointer_cast(metric); - auto duration = std::get( - kernelMetric->getValue(KernelMetric::Duration)); - auto invocations = std::get( - kernelMetric->getValue(KernelMetric::Invocations)); - auto deviceId = std::get( - kernelMetric->getValue(KernelMetric::DeviceId)); - auto deviceType = std::get( - kernelMetric->getValue(KernelMetric::DeviceType)); - auto deviceTypeName = - getDeviceTypeString(static_cast(deviceType)); - (*jsonNode)["metrics"] - [kernelMetric->getValueName(KernelMetric::Duration)] = - duration; - (*jsonNode)["metrics"] - [kernelMetric->getValueName(KernelMetric::Invocations)] = - invocations; - (*jsonNode)["metrics"] - [kernelMetric->getValueName(KernelMetric::DeviceId)] = - std::to_string(deviceId); - (*jsonNode)["metrics"] - [kernelMetric->getValueName(KernelMetric::DeviceType)] = - deviceTypeName; - valueNames.insert( - kernelMetric->getValueName(KernelMetric::Duration)); - valueNames.insert( - kernelMetric->getValueName(KernelMetric::Invocations)); - deviceIds.insert({deviceType, {deviceId}}); - } else { - throw std::runtime_error("MetricKind not supported"); - } - } - for (auto [_, flexibleMetric] : treeNode.flexibleMetrics) { - auto valueName = flexibleMetric.getValueName(0); + this->tree->template walk([&](Tree::TreeNode + &treeNode) { + const auto contextName = treeNode.name; + auto contextId = treeNode.id; + json *jsonNode = jsonNodes[contextId]; + (*jsonNode)["frame"] = {{"name", contextName}, {"type", "function"}}; + (*jsonNode)["metrics"] = json::object(); + for (auto [metricKind, metric] : treeNode.metrics) { + if (metricKind == MetricKind::Kernel) { + auto kernelMetric = std::dynamic_pointer_cast(metric); + auto duration = + std::get(kernelMetric->getValue(KernelMetric::Duration)); + auto invocations = std::get( + kernelMetric->getValue(KernelMetric::Invocations)); + auto deviceId = + std::get(kernelMetric->getValue(KernelMetric::DeviceId)); + auto deviceType = std::get( + kernelMetric->getValue(KernelMetric::DeviceType)); + auto deviceTypeName = + getDeviceTypeString(static_cast(deviceType)); + (*jsonNode)["metrics"] + [kernelMetric->getValueName(KernelMetric::Duration)] = + duration; + (*jsonNode)["metrics"] + [kernelMetric->getValueName(KernelMetric::Invocations)] = + invocations; + (*jsonNode)["metrics"] + [kernelMetric->getValueName(KernelMetric::DeviceId)] = + std::to_string(deviceId); + (*jsonNode)["metrics"] + [kernelMetric->getValueName(KernelMetric::DeviceType)] = + deviceTypeName; + valueNames.insert(kernelMetric->getValueName(KernelMetric::Duration)); + valueNames.insert( + kernelMetric->getValueName(KernelMetric::Invocations)); + deviceIds.insert({deviceType, {deviceId}}); + } else if (metricKind == MetricKind::PCSampling) { + auto pcSamplingMetric = + std::dynamic_pointer_cast(metric); + for (size_t i = 0; i < PCSamplingMetric::Count; i++) { + auto valueName = pcSamplingMetric->getValueName(i); valueNames.insert(valueName); std::visit( [&](auto &&value) { (*jsonNode)["metrics"][valueName] = value; }, - flexibleMetric.getValues()[0]); - } - (*jsonNode)["children"] = json::array(); - auto children = treeNode.children; - for (auto _ : children) { - (*jsonNode)["children"].push_back(json::object()); + pcSamplingMetric->getValues()[i]); } - auto idx = 0; - for (auto child : children) { - auto [index, childId] = child; - jsonNodes[childId] = &(*jsonNode)["children"][idx]; - idx++; - } - }); + } else { + throw std::runtime_error("MetricKind not supported"); + } + } + for (auto [_, flexibleMetric] : treeNode.flexibleMetrics) { + auto valueName = flexibleMetric.getValueName(0); + valueNames.insert(valueName); + std::visit( + [&](auto &&value) { (*jsonNode)["metrics"][valueName] = value; }, + flexibleMetric.getValues()[0]); + } + (*jsonNode)["children"] = json::array(); + auto children = treeNode.children; + for (auto _ : children) { + (*jsonNode)["children"].push_back(json::object()); + } + auto idx = 0; + for (auto child : children) { + auto [index, childId] = child; + jsonNodes[childId] = &(*jsonNode)["children"][idx]; + idx++; + } + }); // Hints for all available metrics for (auto valueName : valueNames) { output[Tree::TreeNode::RootId]["metrics"][valueName] = 0; diff --git a/third_party/proton/csrc/lib/Driver/GPU/CuptiApi.cpp b/third_party/proton/csrc/lib/Driver/GPU/CuptiApi.cpp index f9c8617ff528..7512f6eb4dfc 100644 --- a/third_party/proton/csrc/lib/Driver/GPU/CuptiApi.cpp +++ b/third_party/proton/csrc/lib/Driver/GPU/CuptiApi.cpp @@ -15,6 +15,9 @@ struct ExternLibCupti : public ExternLibBase { void *ExternLibCupti::lib = nullptr; +DEFINE_DISPATCH(ExternLibCupti, getContextId, cuptiGetContextId, CUcontext, + uint32_t *); + DEFINE_DISPATCH(ExternLibCupti, activityRegisterCallbacks, cuptiActivityRegisterCallbacks, CUpti_BuffersCallbackRequestFunc, @@ -70,6 +73,40 @@ DEFINE_DISPATCH(ExternLibCupti, getGraphExecId, cuptiGetGraphExecId, DEFINE_DISPATCH(ExternLibCupti, getGraphId, cuptiGetGraphId, CUgraph, uint32_t *); +DEFINE_DISPATCH(ExternLibCupti, getCubinCrc, cuptiGetCubinCrc, + CUpti_GetCubinCrcParams *); + +DEFINE_DISPATCH(ExternLibCupti, getSassToSourceCorrelation, + cuptiGetSassToSourceCorrelation, + CUpti_GetSassToSourceCorrelationParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingGetNumStallReasons, + cuptiPCSamplingGetNumStallReasons, + CUpti_PCSamplingGetNumStallReasonsParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingGetStallReasons, + cuptiPCSamplingGetStallReasons, + CUpti_PCSamplingGetStallReasonsParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingSetConfigurationAttribute, + cuptiPCSamplingSetConfigurationAttribute, + CUpti_PCSamplingConfigurationInfoParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingEnable, cuptiPCSamplingEnable, + CUpti_PCSamplingEnableParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingDisable, cuptiPCSamplingDisable, + CUpti_PCSamplingDisableParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingGetData, cuptiPCSamplingGetData, + CUpti_PCSamplingGetDataParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingStart, cuptiPCSamplingStart, + CUpti_PCSamplingStartParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingStop, cuptiPCSamplingStop, + CUpti_PCSamplingStopParams *); + } // namespace cupti } // namespace proton diff --git a/third_party/proton/csrc/lib/Profiler/CuptiPCSampling.cpp b/third_party/proton/csrc/lib/Profiler/CuptiPCSampling.cpp new file mode 100644 index 000000000000..6a9ffadbbd86 --- /dev/null +++ b/third_party/proton/csrc/lib/Profiler/CuptiPCSampling.cpp @@ -0,0 +1,427 @@ +#include "Profiler/Cupti/CuptiPCSampling.h" +#include "Data/Metric.h" +#include "Driver/GPU/CudaApi.h" +#include "Driver/GPU/CuptiApi.h" +#include "Utility/Atomic.h" +#include "Utility/Map.h" +#include "Utility/String.h" +#include +#include + +namespace proton { + +namespace { + +uint64_t getCubinCrc(const char *cubin, size_t size) { + CUpti_GetCubinCrcParams cubinCrcParams = { + .size = CUpti_GetCubinCrcParamsSize, + .cubinSize = size, + .cubin = cubin, + .cubinCrc = 0, + }; + cupti::getCubinCrc(&cubinCrcParams); + return cubinCrcParams.cubinCrc; +} + +size_t getNumStallReasons(CUcontext context) { + size_t numStallReasons = 0; + CUpti_PCSamplingGetNumStallReasonsParams numStallReasonsParams = { + .size = CUpti_PCSamplingGetNumStallReasonsParamsSize, + .pPriv = NULL, + .ctx = context, + .numStallReasons = &numStallReasons}; + cupti::pcSamplingGetNumStallReasons(&numStallReasonsParams); + return numStallReasons; +} + +std::tuple +getSassToSourceCorrelation(const char *functionName, uint64_t pcOffset, + const char *cubin, size_t cubinSize) { + CUpti_GetSassToSourceCorrelationParams sassToSourceParams = { + .size = CUpti_GetSassToSourceCorrelationParamsSize, + .cubin = cubin, + .functionName = functionName, + .cubinSize = cubinSize, + .lineNumber = 0, + .pcOffset = pcOffset, + .fileName = NULL, + .dirName = NULL, + }; + // Get source can fail if the line mapping is not available so we don't check + cupti::getSassToSourceCorrelation(&sassToSourceParams); + return std::make_tuple(sassToSourceParams.lineNumber, + sassToSourceParams.dirName, + sassToSourceParams.fileName); +} + +std::pair +getStallReasonNamesAndIndices(CUcontext context, size_t numStallReasons) { + char **stallReasonNames = + static_cast(std::calloc(numStallReasons, sizeof(char *))); + for (size_t i = 0; i < numStallReasons; i++) { + stallReasonNames[i] = static_cast( + std::calloc(CUPTI_STALL_REASON_STRING_SIZE, sizeof(char))); + } + uint32_t *stallReasonIndices = + static_cast(std::calloc(numStallReasons, sizeof(uint32_t))); + // Initialize the names with 128 characters to avoid buffer overflow + CUpti_PCSamplingGetStallReasonsParams stallReasonsParams = { + .size = CUpti_PCSamplingGetStallReasonsParamsSize, + .pPriv = NULL, + .ctx = context, + .numStallReasons = numStallReasons, + .stallReasonIndex = stallReasonIndices, + .stallReasons = stallReasonNames, + }; + cupti::pcSamplingGetStallReasons(&stallReasonsParams); + return std::make_pair(stallReasonNames, stallReasonIndices); +} + +size_t matchStallReasonsToIndices( + size_t numStallReasons, char **stallReasonNames, + uint32_t *stallReasonIndices, + std::map &stallReasonIndexToMetricIndex, + std::set ¬IssuedStallReasonIndices) { + // In case there's any invalid stall reasons, we only collect valid ones. + // Invalid ones are swapped to the end of the list + std::vector validIndex(numStallReasons, false); + size_t numValidStalls = 0; + for (size_t i = 0; i < numStallReasons; i++) { + bool notIssued = std::string(stallReasonNames[i]).find("not_issued") != + std::string::npos; + auto cuptiStallName = replace(stallReasonNames[i], "_", ""); + for (size_t j = 0; j < PCSamplingMetric::PCSamplingMetricKind::Count; j++) { + auto metricName = toLower(PCSamplingMetric().getValueName(j)); + if (cuptiStallName.find(metricName) != std::string::npos) { + if (notIssued) + notIssuedStallReasonIndices.insert(stallReasonIndices[i]); + stallReasonIndexToMetricIndex[stallReasonIndices[i]] = j; + validIndex[i] = true; + numValidStalls++; + break; + } + } + } + int invalidIndex = -1; + for (size_t i = 0; i < numStallReasons; i++) { + if (invalidIndex == -1 && !validIndex[i]) { + invalidIndex = i; + } else if (invalidIndex != -1 && validIndex[i]) { + std::swap(stallReasonIndices[invalidIndex], stallReasonIndices[i]); + std::swap(stallReasonNames[invalidIndex], stallReasonNames[i]); + validIndex[invalidIndex] = true; + invalidIndex++; + } + } + return numValidStalls; +} + +CUpti_PCSamplingData allocPCSamplingData(size_t collectNumPCs, + size_t numValidStallReasons) { +// Check cuda api version >= 12.4 +// If so, we subtract 4 bytes from the size of CUpti_PCSamplingPCData +// because it introduces a new field at the end of the struct, which is not +// compatible with the previous versions. +#if CUDA_VERSION >= 12040 + size_t pcDataSize = sizeof(CUpti_PCSamplingPCData) - sizeof(uint32_t); +#else + size_t pcDataSize = sizeof(CUpti_PCSamplingPCData); +#endif + CUpti_PCSamplingData pcSamplingData{ + .size = sizeof(CUpti_PCSamplingData), + .collectNumPcs = collectNumPCs, + .pPcData = static_cast( + std::calloc(collectNumPCs, pcDataSize))}; + for (size_t i = 0; i < collectNumPCs; ++i) { + pcSamplingData.pPcData[i].stallReason = + static_cast(std::calloc( + numValidStallReasons, sizeof(CUpti_PCSamplingStallReason))); + } + return pcSamplingData; +} + +void enablePCSampling(CUcontext context) { + CUpti_PCSamplingEnableParams params = { + .size = CUpti_PCSamplingEnableParamsSize, + .pPriv = NULL, + .ctx = context, + }; + cupti::pcSamplingEnable(¶ms); +} + +void disablePCSampling(CUcontext context) { + CUpti_PCSamplingDisableParams params = { + .size = CUpti_PCSamplingDisableParamsSize, + .pPriv = NULL, + .ctx = context, + }; + cupti::pcSamplingDisable(¶ms); +} + +void startPCSampling(CUcontext context) { + CUpti_PCSamplingStartParams params = { + .size = CUpti_PCSamplingStartParamsSize, + .pPriv = NULL, + .ctx = context, + }; + cupti::pcSamplingStart(¶ms); +} + +void stopPCSampling(CUcontext context) { + CUpti_PCSamplingStopParams params = { + .size = CUpti_PCSamplingStopParamsSize, + .pPriv = NULL, + .ctx = context, + }; + cupti::pcSamplingStop(¶ms); +} + +void getPCSamplingData(CUcontext context, + CUpti_PCSamplingData *pcSamplingData) { + CUpti_PCSamplingGetDataParams params = { + .size = CUpti_PCSamplingGetDataParamsSize, + .pPriv = NULL, + .ctx = context, + .pcSamplingData = pcSamplingData, + }; + cupti::pcSamplingGetData(¶ms); +} + +void setConfigurationAttribute( + CUcontext context, + std::vector &configurationInfos) { + CUpti_PCSamplingConfigurationInfoParams infoParams = { + .size = CUpti_PCSamplingConfigurationInfoParamsSize, + .pPriv = NULL, + .ctx = context, + .numAttributes = configurationInfos.size(), + .pPCSamplingConfigurationInfo = configurationInfos.data(), + }; + cupti::pcSamplingSetConfigurationAttribute(&infoParams); +} + +} // namespace + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureStallReasons() { + numStallReasons = getNumStallReasons(context); + std::tie(this->stallReasonNames, this->stallReasonIndices) = + getStallReasonNamesAndIndices(context, numStallReasons); + numValidStallReasons = matchStallReasonsToIndices( + numStallReasons, stallReasonNames, stallReasonIndices, + stallReasonIndexToMetricIndex, notIssuedStallReasonIndices); + CUpti_PCSamplingConfigurationInfo stallReasonInfo{}; + stallReasonInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON; + stallReasonInfo.attributeData.stallReasonData.stallReasonCount = + numValidStallReasons; + stallReasonInfo.attributeData.stallReasonData.pStallReasonIndex = + stallReasonIndices; + return stallReasonInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureSamplingPeriod() { + CUpti_PCSamplingConfigurationInfo samplingPeriodInfo{}; + samplingPeriodInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD; + samplingPeriodInfo.attributeData.samplingPeriodData.samplingPeriod = + DefaultFrequency; + return samplingPeriodInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureSamplingBuffer() { + CUpti_PCSamplingConfigurationInfo sampleBufferInfo{}; + sampleBufferInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER; + this->pcSamplingData = + allocPCSamplingData(DataBufferPCCount, numValidStallReasons); + sampleBufferInfo.attributeData.samplingDataBufferData.samplingDataBuffer = + &this->pcSamplingData; + return sampleBufferInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureScratchBuffer() { + CUpti_PCSamplingConfigurationInfo scratchBufferInfo{}; + scratchBufferInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE; + scratchBufferInfo.attributeData.scratchBufferSizeData.scratchBufferSize = + ScratchBufferSize; + return scratchBufferInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureHardwareBufferSize() { + CUpti_PCSamplingConfigurationInfo hardwareBufferInfo{}; + hardwareBufferInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE; + hardwareBufferInfo.attributeData.hardwareBufferSizeData.hardwareBufferSize = + HardwareBufferSize; + return hardwareBufferInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureStartStopControl() { + CUpti_PCSamplingConfigurationInfo startStopControlInfo{}; + startStopControlInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL; + startStopControlInfo.attributeData.enableStartStopControlData + .enableStartStopControl = true; + return startStopControlInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureCollectionMode() { + CUpti_PCSamplingConfigurationInfo collectionModeInfo{}; + collectionModeInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE; + collectionModeInfo.attributeData.collectionModeData.collectionMode = + CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS; + return collectionModeInfo; +} + +void ConfigureData::initialize(CUcontext context) { + this->context = context; + cupti::getContextId(context, &contextId); + configurationInfos.emplace_back(configureStallReasons()); + configurationInfos.emplace_back(configureSamplingPeriod()); + configurationInfos.emplace_back(configureHardwareBufferSize()); + configurationInfos.emplace_back(configureScratchBuffer()); + configurationInfos.emplace_back(configureSamplingBuffer()); + configurationInfos.emplace_back(configureStartStopControl()); + configurationInfos.emplace_back(configureCollectionMode()); + setConfigurationAttribute(context, configurationInfos); +} + +ConfigureData *CuptiPCSampling::getConfigureData(CUcontext context) { + uint32_t contextId; + cupti::getContextId(context, &contextId); + return &contextIdToConfigureData[contextId]; +} + +CubinData *CuptiPCSampling::getCubinData(uint64_t cubinCrc) { + return &cubinCrcToCubinData[cubinCrc]; +} + +void CuptiPCSampling::initialize(CUcontext context) { + uint32_t contextId = 0; + cupti::getContextId(context, &contextId); + if (contextInitialized.contain(contextId)) + return; + std::unique_lock lock(contextMutex); + if (contextInitialized.contain(contextId)) + return; + enablePCSampling(context); + getConfigureData(context)->initialize(context); + contextInitialized.insert(contextId); +} + +void CuptiPCSampling::start(CUcontext context) { + if (pcSamplingStarted) + return; + std::unique_lock lock(pcSamplingMutex); + if (pcSamplingStarted) + return; + initialize(context); + // Ensure all previous operations are completed + cuda::ctxSynchronize(); + // Clean up previous records + auto *configureData = getConfigureData(context); + configureData->pcSamplingData.totalNumPcs = 0; + configureData->pcSamplingData.remainingNumPcs = 0; + startPCSampling(context); + pcSamplingStarted = true; +} + +void CuptiPCSampling::processPCSamplingData(ConfigureData *configureData, + uint64_t externId, bool isAPI) { + auto *pcSamplingData = &configureData->pcSamplingData; + auto &profiler = CuptiProfiler::instance(); + auto dataSet = profiler.getDataSet(); + while ((pcSamplingData->totalNumPcs > 0 || + pcSamplingData->remainingNumPcs > 0)) { + // Handle data + for (size_t i = 0; i < pcSamplingData->totalNumPcs; ++i) { + auto *pcData = pcSamplingData->pPcData; + auto *cubinData = getCubinData(pcData->cubinCrc); + auto key = + CubinData::LineInfoKey{pcData->functionIndex, pcData->pcOffset}; + if (cubinData->lineInfo.find(key) == cubinData->lineInfo.end()) { + auto [lineNumber, fileName, dirName] = + getSassToSourceCorrelation(pcData->functionName, pcData->pcOffset, + cubinData->cubin, cubinData->cubinSize); + cubinData->lineInfo.try_emplace( + key, lineNumber, + pcData->functionName ? std::string(pcData->functionName) : "", + fileName ? std::string(fileName) : "", + dirName ? std::string(dirName) : ""); + } + auto &lineInfo = cubinData->lineInfo[key]; + for (size_t j = 0; j < pcData->stallReasonCount; ++j) { + auto *stallReason = &pcData->stallReason[j]; + if (!configureData->stallReasonIndexToMetricIndex.count( + stallReason->pcSamplingStallReasonIndex)) + throw std::runtime_error("Invalid stall reason index"); + for (auto *data : dataSet) { + auto scopeId = externId; + if (isAPI) + scopeId = data->addScope(externId, lineInfo.functionName); + if (lineInfo.fileName.size()) + scopeId = data->addScope( + scopeId, lineInfo.fileName + ":" + lineInfo.functionName + "@" + + std::to_string(lineInfo.lineNumber)); + auto metricKind = static_cast( + configureData->stallReasonIndexToMetricIndex + [stallReason->pcSamplingStallReasonIndex]); + auto samples = stallReason->samples; + auto stalledSamples = + configureData->notIssuedStallReasonIndices.count( + stallReason->pcSamplingStallReasonIndex) + ? 0 + : samples; + auto metric = std::make_shared(metricKind, samples, + stalledSamples); + data->addMetric(scopeId, metric); + } + } + } + if (pcSamplingData->remainingNumPcs > 0) + getPCSamplingData(configureData->context, pcSamplingData); + else + break; + } +} + +void CuptiPCSampling::stop(CUcontext context, uint64_t externId, bool isAPI) { + if (!pcSamplingStarted) + return; + std::unique_lock lock(pcSamplingMutex); + if (!pcSamplingStarted) + return; + stopPCSampling(context); + auto *configureData = getConfigureData(context); + processPCSamplingData(configureData, externId, isAPI); + pcSamplingStarted = false; +} + +void CuptiPCSampling::finalize(CUcontext context) { + auto *configureData = getConfigureData(context); + auto contextId = configureData->contextId; + contextIdToConfigureData.erase(contextId); + contextInitialized.erase(contextId); + disablePCSampling(context); +} + +void CuptiPCSampling::loadModule(CUpti_ResourceData *resourceData) { + auto *cubinResource = + static_cast(resourceData->resourceDescriptor); + auto cubinCrc = getCubinCrc(cubinResource->pCubin, cubinResource->cubinSize); + auto *cubinData = getCubinData(cubinCrc); + cubinData->cubinCrc = cubinCrc; + cubinData->cubinSize = cubinResource->cubinSize; + cubinData->cubin = cubinResource->pCubin; +} + +void CuptiPCSampling::unloadModule(CUpti_ResourceData *resourceData) { + auto *cubinResource = + static_cast(resourceData->resourceDescriptor); + auto cubinCrc = getCubinCrc(cubinResource->pCubin, cubinResource->cubinSize); + cubinCrcToCubinData.erase(cubinCrc); +} + +} // namespace proton diff --git a/third_party/proton/csrc/lib/Profiler/CuptiProfiler.cpp b/third_party/proton/csrc/lib/Profiler/CuptiProfiler.cpp index 489e93d7abbd..5ba456ffe8a0 100644 --- a/third_party/proton/csrc/lib/Profiler/CuptiProfiler.cpp +++ b/third_party/proton/csrc/lib/Profiler/CuptiProfiler.cpp @@ -1,9 +1,10 @@ -#include "Profiler/CuptiProfiler.h" +#include "Profiler/Cupti/CuptiProfiler.h" #include "Context/Context.h" #include "Data/Metric.h" #include "Driver/Device.h" #include "Driver/GPU/CudaApi.h" #include "Driver/GPU/CuptiApi.h" +#include "Profiler/Cupti/CuptiPCSampling.h" #include "Utility/Map.h" #include @@ -162,6 +163,18 @@ void setGraphCallbacks(CUpti_SubscriberHandle subscriber, bool enable) { #undef CALLBACK_ENABLE } +void setResourceCallbacks(CUpti_SubscriberHandle subscriber, bool enable) { +#define CALLBACK_ENABLE(id) \ + cupti::enableCallback(static_cast(enable), subscriber, \ + CUPTI_CB_DOMAIN_RESOURCE, id) + + CALLBACK_ENABLE(CUPTI_CBID_RESOURCE_MODULE_LOADED); + CALLBACK_ENABLE(CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING); + CALLBACK_ENABLE(CUPTI_CBID_RESOURCE_CONTEXT_CREATED); + CALLBACK_ENABLE(CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING); +#undef CALLBACK_ENABLE +} + } // namespace struct CuptiProfiler::CuptiProfilerPimpl @@ -186,6 +199,7 @@ struct CuptiProfiler::CuptiProfilerPimpl static constexpr size_t AttributeSize = sizeof(size_t); CUpti_SubscriberHandle subscriber{}; + CuptiPCSampling pcSampling; ThreadSafeMap> graphIdToNumInstances; @@ -241,33 +255,44 @@ void CuptiProfiler::CuptiProfilerPimpl::callbackFn(void *userData, if (domain == CUPTI_CB_DOMAIN_RESOURCE) { auto *resourceData = static_cast(const_cast(cbData)); - auto *graphData = - static_cast(resourceData->resourceDescriptor); auto *pImpl = dynamic_cast(profiler.pImpl.get()); - uint32_t graphId = 0; - uint32_t graphExecId = 0; - if (graphData->graph) - cupti::getGraphId(graphData->graph, &graphId); - if (graphData->graphExec) - cupti::getGraphExecId(graphData->graphExec, &graphExecId); - if (cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_CREATED || - cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_CLONED) { - if (!pImpl->graphIdToNumInstances.contain(graphId)) - pImpl->graphIdToNumInstances[graphId] = 1; - else - pImpl->graphIdToNumInstances[graphId]++; - } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_DESTROY_STARTING) { - pImpl->graphIdToNumInstances[graphId]--; - } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHEXEC_CREATED) { - pImpl->graphExecIdToGraphId[graphExecId] = graphId; - } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHEXEC_DESTROY_STARTING) { - pImpl->graphExecIdToGraphId.erase(graphExecId); - } else if (cbId == CUPTI_CBID_RESOURCE_GRAPH_DESTROY_STARTING) { - pImpl->graphIdToNumInstances.erase(graphId); + if (cbId == CUPTI_CBID_RESOURCE_MODULE_LOADED) { + pImpl->pcSampling.loadModule(resourceData); + } else if (cbId == CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING) { + pImpl->pcSampling.unloadModule(resourceData); + } else if (cbId == CUPTI_CBID_RESOURCE_CONTEXT_CREATED) { + pImpl->pcSampling.initialize(resourceData->context); + } else if (cbId == CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING) { + pImpl->pcSampling.finalize(resourceData->context); + } else { + auto *graphData = + static_cast(resourceData->resourceDescriptor); + uint32_t graphId = 0; + uint32_t graphExecId = 0; + if (graphData->graph) + cupti::getGraphId(graphData->graph, &graphId); + if (graphData->graphExec) + cupti::getGraphExecId(graphData->graphExec, &graphExecId); + if (cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_CREATED || + cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_CLONED) { + if (!pImpl->graphIdToNumInstances.contain(graphId)) + pImpl->graphIdToNumInstances[graphId] = 1; + else + pImpl->graphIdToNumInstances[graphId]++; + } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_DESTROY_STARTING) { + pImpl->graphIdToNumInstances[graphId]--; + } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHEXEC_CREATED) { + pImpl->graphExecIdToGraphId[graphExecId] = graphId; + } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHEXEC_DESTROY_STARTING) { + pImpl->graphExecIdToGraphId.erase(graphExecId); + } else if (cbId == CUPTI_CBID_RESOURCE_GRAPH_DESTROY_STARTING) { + pImpl->graphIdToNumInstances.erase(graphId); + } } } else { const CUpti_CallbackData *callbackData = static_cast(cbData); + auto *pImpl = dynamic_cast(profiler.pImpl.get()); if (callbackData->callbackSite == CUPTI_API_ENTER) { auto scopeId = Scope::getNewScopeId(); threadState.record(scopeId); @@ -275,7 +300,6 @@ void CuptiProfiler::CuptiProfilerPimpl::callbackFn(void *userData, size_t numInstances = 1; if (cbId == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch || cbId == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz) { - auto *pImpl = dynamic_cast(profiler.pImpl.get()); auto graphExec = static_cast( callbackData->functionParams) ->hGraph; @@ -298,7 +322,15 @@ void CuptiProfiler::CuptiProfilerPimpl::callbackFn(void *userData, << std::endl; } profiler.correlation.correlate(callbackData->correlationId, numInstances); + if (profiler.isPCSamplingEnabled()) + pImpl->pcSampling.start(callbackData->context); } else if (callbackData->callbackSite == CUPTI_API_EXIT) { + if (profiler.isPCSamplingEnabled()) { + // XXX: Conservatively stop every GPU kernel for now + auto scopeId = profiler.correlation.externIdQueue.back(); + pImpl->pcSampling.stop(callbackData->context, scopeId, + !profiler.isOpInProgress()); + } threadState.exitOp(); profiler.correlation.submit(callbackData->correlationId); } @@ -306,10 +338,13 @@ void CuptiProfiler::CuptiProfilerPimpl::callbackFn(void *userData, } void CuptiProfiler::CuptiProfilerPimpl::doStart() { - cupti::activityRegisterCallbacks(allocBuffer, completeBuffer); - cupti::activityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); - // TODO: switch to directly subscribe the APIs and measure overhead cupti::subscribe(&subscriber, callbackFn, nullptr); + if (profiler.isPCSamplingEnabled()) { + setResourceCallbacks(subscriber, /*enable=*/true); + } else { + cupti::activityRegisterCallbacks(allocBuffer, completeBuffer); + cupti::activityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); + } setGraphCallbacks(subscriber, /*enable=*/true); setRuntimeCallbacks(subscriber, /*enable=*/true); setDriverCallbacks(subscriber, /*enable=*/true); @@ -328,20 +363,28 @@ void CuptiProfiler::CuptiProfilerPimpl::doFlush() { cuda::ctxGetCurrent(&cuContext); if (cuContext) cuda::ctxSynchronize(); - profiler.correlation.flush( - /*maxRetries=*/100, /*sleepMs=*/10, - /*flush=*/[]() { - cupti::activityFlushAll( - /*flag=*/0); - }); - // CUPTI_ACTIVITY_FLAG_FLUSH_FORCED is used to ensure that even incomplete - // activities are flushed so that the next profiling session can start with - // new activities. - cupti::activityFlushAll(/*flag=*/CUPTI_ACTIVITY_FLAG_FLUSH_FORCED); + if (profiler.isPCSamplingEnabled()) { + pcSampling.finalize(cuContext); + } else { + profiler.correlation.flush( + /*maxRetries=*/100, /*sleepMs=*/10, + /*flush=*/[]() { + cupti::activityFlushAll( + /*flag=*/0); + }); + // CUPTI_ACTIVITY_FLAG_FLUSH_FORCED is used to ensure that even incomplete + // activities are flushed so that the next profiling session can start with + // new activities. + cupti::activityFlushAll(/*flag=*/CUPTI_ACTIVITY_FLAG_FLUSH_FORCED); + } } void CuptiProfiler::CuptiProfilerPimpl::doStop() { - cupti::activityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); + if (profiler.isPCSamplingEnabled()) { + setResourceCallbacks(subscriber, /*enable=*/false); + } else { + cupti::activityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); + } setGraphCallbacks(subscriber, /*enable=*/false); setRuntimeCallbacks(subscriber, /*enable=*/false); setDriverCallbacks(subscriber, /*enable=*/false); diff --git a/third_party/proton/csrc/lib/Profiler/RoctracerProfiler.cpp b/third_party/proton/csrc/lib/Profiler/RoctracerProfiler.cpp index 24fca2b12469..8b4f4a42b3bd 100644 --- a/third_party/proton/csrc/lib/Profiler/RoctracerProfiler.cpp +++ b/third_party/proton/csrc/lib/Profiler/RoctracerProfiler.cpp @@ -1,4 +1,4 @@ -#include "Profiler/RoctracerProfiler.h" +#include "Profiler/Roctracer/RoctracerProfiler.h" #include "Context/Context.h" #include "Data/Metric.h" #include "Driver/GPU/HipApi.h" diff --git a/third_party/proton/csrc/lib/Session/Session.cpp b/third_party/proton/csrc/lib/Session/Session.cpp index 0265981e5af3..dec2fcac4874 100644 --- a/third_party/proton/csrc/lib/Session/Session.cpp +++ b/third_party/proton/csrc/lib/Session/Session.cpp @@ -2,8 +2,8 @@ #include "Context/Python.h" #include "Context/Shadow.h" #include "Data/TreeData.h" -#include "Profiler/CuptiProfiler.h" -#include "Profiler/RoctracerProfiler.h" +#include "Profiler/Cupti/CuptiProfiler.h" +#include "Profiler/Roctracer/RoctracerProfiler.h" #include "Utility/String.h" namespace proton { @@ -13,6 +13,9 @@ Profiler *getProfiler(const std::string &profilerName) { if (proton::toLower(profilerName) == "cupti") { return &CuptiProfiler::instance(); } + if (proton::toLower(profilerName) == "cupti_pcsampling") { + return &CuptiProfiler::instance().enablePCSampling(); + } if (proton::toLower(profilerName) == "roctracer") { return &RoctracerProfiler::instance(); } diff --git a/third_party/proton/proton/profile.py b/third_party/proton/proton/profile.py index 2bf7938a5967..b9c55d64314d 100644 --- a/third_party/proton/proton/profile.py +++ b/third_party/proton/proton/profile.py @@ -42,7 +42,7 @@ def start( name (str, optional): The name (with path) of the profiling session. If not provided, the default name is "~/proton.hatchet". backend (str, optional): The backend to use for profiling. - Available options are ["cupti"]. + Available options are ["cupti", "cupti_pcsampling", "roctracer"]. Defaults to None, which automatically selects the backend matching the current active runtime. context (str, optional): The context to use for profiling. Available options are ["shadow", "python"]. diff --git a/third_party/proton/proton/proton.py b/third_party/proton/proton/proton.py index f27fdcc93d01..42ba458c77ad 100644 --- a/third_party/proton/proton/proton.py +++ b/third_party/proton/proton/proton.py @@ -13,7 +13,8 @@ def parse_arguments(): python -m triton.profiler.proton [options] script.py [script_args] [script_options] """, formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("-n", "--name", type=str, help="Name of the profiling session") - parser.add_argument("-b", "--backend", type=str, help="Profiling backend", default=None, choices=["cupti"]) + parser.add_argument("-b", "--backend", type=str, help="Profiling backend", default=None, + choices=["cupti", "cupti_pcsampling", "roctracer"]) parser.add_argument("-c", "--context", type=str, help="Profiling context", default="shadow", choices=["shadow", "python"]) parser.add_argument("-d", "--data", type=str, help="Profiling data", default="tree", choices=["tree"]) diff --git a/third_party/proton/proton/viewer.py b/third_party/proton/proton/viewer.py index 42110025a16d..7423a178cd32 100644 --- a/third_party/proton/proton/viewer.py +++ b/third_party/proton/proton/viewer.py @@ -91,10 +91,10 @@ def get_min_time_bytes(df, device_info): def derive_metrics(gf, metrics, raw_metrics, device_info): derived_metrics = [] original_metrics = [] - time_metric_name = match_available_metrics([time_factor_dict.name], raw_metrics)[0] - time_unit = (time_factor_dict.name + "/" + time_metric_name.split("(")[1].split(")")[0]) for metric in metrics: if metric == "util": # Tensor core only + time_metric_name = match_available_metrics([time_factor_dict.name], raw_metrics)[0] + time_unit = (time_factor_dict.name + "/" + time_metric_name.split("(")[1].split(")")[0]) min_time_bytes = get_min_time_bytes(gf.dataframe, device_info) min_time_flops = get_min_time_flops(gf.dataframe, device_info) time_sec = gf.dataframe[time_metric_name] * (time_factor_dict.factor[time_unit] / @@ -102,6 +102,8 @@ def derive_metrics(gf, metrics, raw_metrics, device_info): gf.dataframe["util (inc)"] = min_time_flops["min_time"].combine(min_time_bytes["min_time"], max) / time_sec derived_metrics.append("util (inc)") elif metric in derivable_metrics: + time_metric_name = match_available_metrics([time_factor_dict.name], raw_metrics)[0] + time_unit = (time_factor_dict.name + "/" + time_metric_name.split("(")[1].split(")")[0]) deriveable_metric = derivable_metrics[metric] metric_name = deriveable_metric.name metric_factor_dict = deriveable_metric.factor @@ -111,6 +113,8 @@ def derive_metrics(gf, metrics, raw_metrics, device_info): metric_factor_dict[metric]) derived_metrics.append(f"{metric} (inc)") elif metric in time_factor_dict.factor: + time_metric_name = match_available_metrics([time_factor_dict.name], raw_metrics)[0] + time_unit = (time_factor_dict.name + "/" + time_metric_name.split("(")[1].split(")")[0]) metric_time_unit = time_factor_dict.name + "/" + metric.split("/")[1] gf.dataframe[f"{metric} (inc)"] = gf.dataframe[time_metric_name] * ( time_factor_dict.factor[time_unit] / time_factor_dict.factor[metric_time_unit]) diff --git a/third_party/proton/test/test_profile.py b/third_party/proton/test/test_profile.py index 48235cdbfc08..1013d24498c2 100644 --- a/third_party/proton/test/test_profile.py +++ b/third_party/proton/test/test_profile.py @@ -197,3 +197,27 @@ def foo(x, size: tl.constexpr, y): assert data[0]["children"][0]["children"][0]["frame"]["name"] == "foo_test_1ctas_1elems" assert data[0]["children"][0]["children"][0]["metrics"]["flops32"] == 1.0 assert data[0]["children"][0]["children"][0]["metrics"]["Time (ns)"] > 0 + + +def test_pcsampling(): + if is_hip(): + pytest.skip("HIP backend does not support pc sampling") + + @triton.jit + def foo(x, y, size: tl.constexpr): + offs = tl.arange(0, size) + for _ in range(1000): + tl.store(y + offs, tl.load(x + offs)) + + x = torch.ones((1024, ), device="cuda", dtype=torch.float32) + y = torch.zeros_like(x) + with tempfile.NamedTemporaryFile(delete=True, suffix=".hatchet") as f: + proton.start(f.name.split(".")[0], hook="triton", backend="cupti_pcsampling") + with proton.scope("test0"): + foo[(1, )](x, y, x.size()[0], num_warps=4) + x.zero_() + proton.finalize() + data = json.load(f) + assert "foo@210" in data[0]["children"][0]["children"][0]["children"]["frame"]["name"] + assert data[0]["children"][0]["children"][0]["children"]["metrics"]["NumSamples"] > 0 + assert data[0]["children"][0]["children"][0]["children"]["metrics"]["StalledLongScoreboard"] > 0