From f32d9622d9580172fa54faf4b438dc8393386062 Mon Sep 17 00:00:00 2001 From: NaveenElumalaiAMD Date: Mon, 13 Jan 2025 14:28:00 -0600 Subject: [PATCH 1/2] Add tensilelite clients to hipblaslt-bench --- clients/benchmarks/client.cpp | 10 +- clients/common/argument_model.cpp | 50 +- clients/common/performance_monitor.cpp | 628 ++++++++++++++++++ clients/include/argument_model.hpp | 5 +- clients/include/performance_monitor.hpp | 56 ++ clients/include/testing_matmul.hpp | 20 +- library/include/hipblaslt-ext-op.h | 12 + library/src/amd_detail/hipblaslt-ext-op.cpp | 118 +++- .../rocblaslt/src/include/utility.hpp | 20 +- .../amd_detail/rocblaslt/src/tensile_host.cpp | 41 +- .../src/amd_detail/rocblaslt/src/utility.cpp | 11 + 11 files changed, 904 insertions(+), 67 deletions(-) create mode 100644 clients/common/performance_monitor.cpp create mode 100644 clients/include/performance_monitor.hpp diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 91bf257926..0e365b9166 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -39,7 +39,7 @@ #include #include -#include "frequency_monitor.hpp" +#include "performance_monitor.hpp" #include "testing_matmul.hpp" @@ -50,7 +50,7 @@ using namespace roc; // For emulated program_options using namespace std::literals; // For std::string literals of form "str"s -struct perf_matmul: hipblaslt_test_valid +struct perf_matmul : hipblaslt_test_valid { void operator()(const Arguments& arg) { @@ -741,8 +741,8 @@ try throw std::invalid_argument("Invalid Device ID"); set_device(device_id); - FrequencyMonitor& freq_monitor = getFrequencyMonitor(); - freq_monitor.set_device_id(device_id); + PerformanceMonitor& perf_monitor = getPerformanceMonitor(); + perf_monitor.set_device_id(device_id); if(datafile) return hipblaslt_bench_datafile(filter, any_stride); @@ -873,7 +873,7 @@ try arg.norm_check_assert = false; int status = run_bench_test(arg, filter, any_stride); - freeFrequencyMonitor(); + freePerformanceMonitor(); return status; } catch(const std::invalid_argument& exp) diff --git a/clients/common/argument_model.cpp b/clients/common/argument_model.cpp index 6975834c09..df7bba13ca 100644 --- a/clients/common/argument_model.cpp +++ b/clients/common/argument_model.cpp @@ -25,7 +25,7 @@ *******************************************************************************/ #include "argument_model.hpp" -#include "frequency_monitor.hpp" +#include "performance_monitor.hpp" // this should have been a member variable but due to the complex variadic template this singleton allows global control @@ -41,31 +41,59 @@ bool ArgumentModel_get_log_function_name() return log_function_name; } -void ArgumentModel_log_frequencies(hipblaslt_internal_ostream& name_line, +void ArgumentModel_log_performance(hipblaslt_internal_ostream& name_line, hipblaslt_internal_ostream& val_line) { - FrequencyMonitor& frequency_monitor = getFrequencyMonitor(); - if(!frequency_monitor.enabled()) + PerformanceMonitor& performance_monitor = getPerformanceMonitor(); + if(!performance_monitor.enabled()) return; - if(!frequency_monitor.detailedReport()) + + name_line << ",Total Granularity"; + val_line << "," << performance_monitor.getTotalGranularityValue(); + + name_line << ",Tiles Per-CU"; + val_line << "," << performance_monitor.getTilesPerCuValue(); + + name_line << ",Tile-0 Granularity"; + val_line << "," << performance_monitor.getTile0Granularity(); + + name_line << ",Tile-1 Granularity"; + val_line << "," << performance_monitor.getTile1Granularity(); + + name_line << ",CU granularity"; + val_line << "," << performance_monitor.getCuGranularity(); + + name_line << ",Wave granularity"; + val_line << "," << performance_monitor.getWaveGranularity(); + + name_line << ",#CU's"; + val_line << "," << performance_monitor.getCUs(); + + name_line << ",mem-read-bytes"; + val_line << "," << performance_monitor.getMemReadBytes(); + + name_line << ",mem-write-bytes"; + val_line << "," << performance_monitor.getMemWriteBytesD(); + + if(!performance_monitor.detailedReport()) { name_line << ",lowest-avg-freq"; - val_line << "," << frequency_monitor.getLowestAverageSYSCLK(); + val_line << "," << performance_monitor.getLowestAverageSYSCLK(); name_line << ",lowest-median-freq"; - val_line << "," << frequency_monitor.getLowestMedianSYSCLK(); + val_line << "," << performance_monitor.getLowestMedianSYSCLK(); } else { - auto allAvgSYSCLK = frequency_monitor.getAllAverageSYSCLK(); + auto allAvgSYSCLK = performance_monitor.getAllAverageSYSCLK(); for(int i = 0; i < allAvgSYSCLK.size(); i++) { name_line << ",avg-freq_" << i; val_line << "," << allAvgSYSCLK[i]; } - auto allMedianSYSCLK = frequency_monitor.getAllMedianSYSCLK(); + auto allMedianSYSCLK = performance_monitor.getAllMedianSYSCLK(); for(int i = 0; i < allMedianSYSCLK.size(); i++) { name_line << ",median-freq_" << i; @@ -74,8 +102,8 @@ void ArgumentModel_log_frequencies(hipblaslt_internal_ostream& name_line, } name_line << ",avg-MCLK"; - val_line << "," << frequency_monitor.getAverageMEMCLK(); + val_line << "," << performance_monitor.getAverageMEMCLK(); name_line << ",median-MCLK"; - val_line << "," << frequency_monitor.getMedianMEMCLK(); + val_line << "," << performance_monitor.getMedianMEMCLK(); } diff --git a/clients/common/performance_monitor.cpp b/clients/common/performance_monitor.cpp new file mode 100644 index 0000000000..3b04587281 --- /dev/null +++ b/clients/common/performance_monitor.cpp @@ -0,0 +1,628 @@ + +/* ************************************************************************ + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- + * ies of the Software, and to permit persons to whom the Software is furnished + * to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- + * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- + * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ************************************************************************ */ + +#include "performance_monitor.hpp" +#include "hipblaslt-ext-op.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifndef _WIN32 + +#include +#include +#include + +template +inline std::ostream& stream_write(std::ostream& stream, T&& val) +{ + return stream << std::forward(val); +} + +template +inline std::ostream& stream_write(std::ostream& stream, T&& val, Ts&&... vals) +{ + return stream_write(stream << std::forward(val), std::forward(vals)...); +} + +template +inline std::string concatenate(Ts&&... vals) +{ + std::ostringstream msg; + stream_write(msg, std::forward(vals)...); + + return msg.str(); +} + +#define HIP_CHECK_EXC(expr) \ + do \ + { \ + hipError_t e = (expr); \ + if(e) \ + { \ + const char* errName = hipGetErrorName(e); \ + const char* errMsg = hipGetErrorString(e); \ + std::ostringstream msg; \ + msg << "Error " << e << "(" << errName << ") " << __FILE__ << ":" << __LINE__ << ": " \ + << std::endl \ + << #expr << std::endl \ + << errMsg << std::endl; \ + throw std::runtime_error(msg.str()); \ + } \ + } while(0) + +#define RSMI_CHECK_EXC(expr) \ + do \ + { \ + rsmi_status_t e = (expr); \ + if(e) \ + { \ + const char* errName = nullptr; \ + rsmi_status_string(e, &errName); \ + std::ostringstream msg; \ + msg << "Error " << e << "(" << errName << ") " << __FILE__ << ":" << __LINE__ << ": " \ + << std::endl \ + << #expr << std::endl; \ + throw std::runtime_error(msg.str()); \ + } \ + } while(0) + +#endif + +class PerformanceMonitorImp : public PerformanceMonitor +{ +public: + const double cHzToMHz = 0.000001; + const double cMhzToHz = 1000000; + + // deleting copy constructor + PerformanceMonitorImp(const PerformanceMonitorImp& obj) = delete; + +#ifndef _WIN32 + + bool enabled() + { + static const char* env1 = getenv("HIPBLASLT_BENCH_PERF"); + static const char* env2 = getenv("HIPBLASLT_BENCH_PERF_ALL"); + return env1 != nullptr || (env2 != nullptr && m_isMultiXCDSupported); + } + + bool detailedReport() + { + static const char* env2 = getenv("HIPBLASLT_BENCH_PERF_ALL"); + return (env2 != nullptr && m_isMultiXCDSupported); + } + + PerformanceMonitorImp() + { + initThread(); + } + + ~PerformanceMonitorImp() + { + m_stop = true; + m_exit = true; + + m_cv.notify_all(); + m_thread.join(); + } + + void set_device_id(int deviceId) + { + m_smiDeviceIndex = GetROCmSMIIndex(deviceId); + m_XCDCount = 1; + +#if rocm_smi_VERSION_MAJOR >= 7 + auto status2 = rsmi_dev_metrics_xcd_counter_get(m_smiDeviceIndex, &m_XCDCount); + + if(status2 != RSMI_STATUS_SUCCESS) + { + m_XCDCount = 1; + } +#endif + } + + void start() + { + if(!enabled()) + return; + + clearValues(); + runBetweenEvents(); + } + + void stop() + { + if(!enabled()) + return; + + assertActive(); + m_stop = true; + wait(); + } + + double averageValueMHz(double sum, std::vector& data) + { + assertNotActive(); + if(enabled() && data.empty()) + return 0.0; + + double averageFrequency = static_cast(sum / data.size()); + return averageFrequency * cHzToMHz; + } + + double medianValueMHz(std::vector& data) + { + assertNotActive(); + + double median = 0.0; + if(enabled() && data.empty()) + return 0.0; + + size_t num_datapoints = data.size(); + if(num_datapoints) + { + std::sort(data.begin(), data.end()); + + median = static_cast(data[(num_datapoints - 1) / 2]); + if(num_datapoints % 2 == 0) + { + median = static_cast(median + data[(num_datapoints - 1) / 2 + 1]) / 2.0; + } + } + return median * cHzToMHz; + } + + double getLowestAverageSYSCLK() + { + std::vector allAvgSYSCLK = getAllAverageSYSCLK(); + double minAvgSYSCLK = allAvgSYSCLK[0]; + for(int i = 1; i < m_XCDCount; i++) + { + if(allAvgSYSCLK[i] <= 0) + continue; + minAvgSYSCLK = min(minAvgSYSCLK, allAvgSYSCLK[i]); + } + return minAvgSYSCLK; + } + + double getLowestMedianSYSCLK() + { + std::vector allMedianSYSCLK = getAllMedianSYSCLK(); + double minMedianSYSCLK = allMedianSYSCLK[0]; + for(int i = 1; i < m_XCDCount; i++) + { + if(allMedianSYSCLK[i] <= 0) + continue; + minMedianSYSCLK = min(minMedianSYSCLK, allMedianSYSCLK[i]); + } + return minMedianSYSCLK; + } + + std::vector getAllAverageSYSCLK() + { + std::vector avgSYSCLK(m_XCDCount, 0.0); + for(int i = 0; i < m_XCDCount; i++) + { + avgSYSCLK[i] = averageValueMHz(m_SYSCLK_sum[i], m_SYSCLK_array[i]); + } + return avgSYSCLK; + } + + std::vector getAllMedianSYSCLK() + { + std::vector medianSYSCLK(m_XCDCount, 0.0); + for(int i = 0; i < m_XCDCount; i++) + { + medianSYSCLK[i] = medianValueMHz(m_SYSCLK_array[i]); + } + return medianSYSCLK; + } + + double getAverageMEMCLK() + { + return averageValueMHz(m_MEMCLK_sum, m_MEMCLK_array); + } + + double getMedianMEMCLK() + { + return medianValueMHz(m_MEMCLK_array); + } + + double getTotalGranularityValue() + { + return hipblasltGetTotalGranularityValue(); + } + + double getTilesPerCuValue() + { + return hipblasltGetTilesPerCuValue(); + } + + double getTile0Granularity() + { + return hipblasltGetTile0Granularity(); + } + + double getTile1Granularity() + { + return hipblasltGetTile1Granularity(); + } + + double getCuGranularity() + { + return hipblasltGetCuGranularity(); + } + + double getWaveGranularity() + { + return hipblasltGetWaveGranularity(); + } + + int getCUs() + { + return hipblasltGetCUs(); + } + + size_t getMemWriteBytesD() + { + return hipblasltGetMemWriteBytesD(); + } + + size_t getMemReadBytes() + { + return hipblasltGetMemReadBytes(); + } + +private: + void initThread() + { + m_stop = false; + m_exit = false; + + m_isMultiXCDSupported = false; +#if rocm_smi_VERSION_MAJOR >= 7 + m_isMultiXCDSupported = true; +#endif + + m_thread = std::thread([this]() { this->runLoop(); }); + return; + } + + void runBetweenEvents() + { + assertNotActive(); + { + std::unique_lock lock(m_mutex); + + m_task = std::move(Task([this]() { this->collect(); })); + m_future = m_task.get_future(); + + m_stop = false; + m_exit = false; + } + m_cv.notify_all(); + } + + void runLoop() + { + std::unique_lock lock(m_mutex); + while(!m_exit) + { + + while(!m_task.valid() && !m_exit) + { + m_cv.wait(lock); + } + + if(m_exit) + { + return; + } + + m_task(); + m_task = std::move(Task()); + } + return; + } + + void collect() + { + rsmi_frequencies_t freq; + do + { +#if rocm_smi_VERSION_MAJOR >= 7 + // multi_XCD + rsmi_gpu_metrics_t gpuMetrics; + auto status1 = rsmi_dev_gpu_metrics_info_get(m_smiDeviceIndex, &gpuMetrics); + if(status1 == RSMI_STATUS_SUCCESS) + { + for(int i = 0; i < m_XCDCount; i++) + { + m_SYSCLK_sum[i] += gpuMetrics.current_gfxclks[i] * cMhzToHz; + m_SYSCLK_array[i].push_back(gpuMetrics.current_gfxclks[i] * cMhzToHz); + } + } +#else + //XCD 0 + auto status1 = rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, RSMI_CLK_TYPE_SYS, &freq); + if(status1 == RSMI_STATUS_SUCCESS) + { + m_SYSCLK_sum[0] += freq.frequency[freq.current]; + m_SYSCLK_array[0].push_back(freq.frequency[freq.current]); + } +#endif + + auto status2 = rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, RSMI_CLK_TYPE_MEM, &freq); + if(status2 == RSMI_STATUS_SUCCESS) + { + m_MEMCLK_sum += freq.frequency[freq.current]; + m_MEMCLK_array.push_back(freq.frequency[freq.current]); + } + + // collect freq every 50ms regardless of success + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + } while(!m_stop && !m_exit); + } + + void assertActive() + { + if(!m_future.valid()) + throw std::runtime_error("Monitor is not active."); + } + + void assertNotActive() + { + if(m_future.valid()) + throw std::runtime_error("Monitor is active."); + } + + void clearValues() + { + m_SYSCLK_sum = std::vector(m_XCDCount, 0); + m_SYSCLK_array = std::vector>(m_XCDCount, std::vector{}); + m_MEMCLK_sum = 0; + m_MEMCLK_array.clear(); + } + + void wait() + { + if(!m_future.valid()) + return; + + if(!m_stop) + throw std::runtime_error("Waiting for monitoring to stop with no end condition."); + + m_future.wait(); + m_future = std::move(std::future()); + } + + void InitROCmSMI() + { + static rsmi_status_t status = rsmi_init(0); + RSMI_CHECK_EXC(status); + } + + uint32_t GetROCmSMIIndex(int hipDeviceIndex) + { + InitROCmSMI(); + + hipDeviceProp_t props; + + HIP_CHECK_EXC(hipGetDeviceProperties(&props, hipDeviceIndex)); +#if HIP_VERSION >= 50220730 + int hip_version; + HIP_CHECK_EXC(hipRuntimeGetVersion(&hip_version)); + if(hip_version >= 50220730) + { + HIP_CHECK_EXC(hipDeviceGetAttribute(&props.multiProcessorCount, + hipDeviceAttributePhysicalMultiProcessorCount, + hipDeviceIndex)); + } +#endif + + uint64_t hipPCIID = 0; + // hipPCIID |= props.pciDeviceID & 0xFF; + // hipPCIID |= ((props.pciBusID & 0xFF) << 8); + // hipPCIID |= (props.pciDomainID) << 16; + + hipPCIID |= (((uint64_t)props.pciDomainID & 0xffffffff) << 32); + hipPCIID |= ((props.pciBusID & 0xff) << 8); + hipPCIID |= ((props.pciDeviceID & 0x1f) << 3); + + uint32_t smiCount = 0; + + RSMI_CHECK_EXC(rsmi_num_monitor_devices(&smiCount)); + + std::ostringstream msg; + msg << "PCI IDs: [" << std::endl; + + for(uint32_t smiIndex = 0; smiIndex < smiCount; smiIndex++) + { + uint64_t rsmiPCIID = 0; + + RSMI_CHECK_EXC(rsmi_dev_pci_id_get(smiIndex, &rsmiPCIID)); + + msg << smiIndex << ": " << rsmiPCIID << std::endl; + + if(hipPCIID == rsmiPCIID) + return smiIndex; + } + + msg << "]" << std::endl; + + throw std::runtime_error(concatenate("RSMI Can't find a device with PCI ID ", + hipPCIID, + "(", + props.pciDomainID, + "-", + props.pciBusID, + "-", + props.pciDeviceID, + ")\n", + msg.str())); + } + + using Task = std::packaged_task; + Task m_task; + std::atomic m_exit; + std::atomic m_stop; + std::future m_future; + std::thread m_thread; + std::condition_variable m_cv; + std::mutex m_mutex; + uint32_t m_smiDeviceIndex; + bool m_isMultiXCDSupported; + uint16_t m_XCDCount; + + std::vector m_SYSCLK_sum; + std::vector> m_SYSCLK_array; + uint64_t m_MEMCLK_sum; + std::vector m_MEMCLK_array; + +#else // WIN32 + + // not supporting windows for now + +public: + PerformanceMonitorImp() {} + + ~PerformanceMonitorImp() {} + + void set_device_id(int deviceId) {} + + void start() {} + + void stop() {} + + bool enabled() + { + return false; + } + + bool detailedReport() + { + return false; + } + + double getLowestAverageSYSCLK() + { + return 0.0; + } + + double getLowestMedianSYSCLK() + { + return 0.0; + } + + std::vector getAllAverageSYSCLK() + { + return std::vector(); + } + + std::vector getAllMedianSYSCLK() + { + return std::vector(); + } + + double getAverageMEMCLK() + { + return 0.0; + } + + double getMedianMEMCLK() + { + return 0.0; + } + double getTotalGranularityValue() + { + return 0.0; + } + + double getTilesPerCuValue() + { + return 0.0; + } + + double getTile0Granularity() + { + return 0.0; + } + + double getTile1Granularity() + { + return 0.0; + } + + double getCuGranularity() + { + return 0.0; + } + + double getWaveGranularity() + { + return 0.0; + } + + int getCUs() + { + return 0.0; + } + + size_t getMemWriteBytesD() + { + return 0.0; + } + + size_t getMemReadBytes() + { + return 0.0; + } +#endif +}; + +static PerformanceMonitorImp* g_PerfMonitorInstance{nullptr}; + +PerformanceMonitor& getPerformanceMonitor() +{ + if(g_PerfMonitorInstance == nullptr) + { + g_PerfMonitorInstance = new PerformanceMonitorImp(); + } + return *g_PerfMonitorInstance; +} + +void freePerformanceMonitor() +{ + if(g_PerfMonitorInstance != nullptr) + { + delete g_PerfMonitorInstance; + g_PerfMonitorInstance = nullptr; + } +} diff --git a/clients/include/argument_model.hpp b/clients/include/argument_model.hpp index 40cf84d82f..b6a741fd06 100644 --- a/clients/include/argument_model.hpp +++ b/clients/include/argument_model.hpp @@ -27,6 +27,7 @@ #pragma once #include "hipblaslt_arguments.hpp" +#include "performance_monitor.hpp" #include #include @@ -38,7 +39,7 @@ namespace ArgumentLogging void ArgumentModel_set_log_function_name(bool f); bool ArgumentModel_get_log_function_name(); -void ArgumentModel_log_frequencies(hipblaslt_internal_ostream& name_line, +void ArgumentModel_log_performance(hipblaslt_internal_ostream& name_line, hipblaslt_internal_ostream& val_line); // ArgumentModel template has a variadic list of argument enums @@ -69,7 +70,7 @@ class ArgumentModel double rtol) { // requires enablement for frequency logging - ArgumentModel_log_frequencies(name_line, val_line); + ArgumentModel_log_performance(name_line, val_line); constexpr bool has_batch_count = has(e_batch_count); int64_t batch_count = has_batch_count ? arg.batch_count : 1; diff --git a/clients/include/performance_monitor.hpp b/clients/include/performance_monitor.hpp new file mode 100644 index 0000000000..0c32f2ba34 --- /dev/null +++ b/clients/include/performance_monitor.hpp @@ -0,0 +1,56 @@ + +/* ************************************************************************ + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- + * ies of the Software, and to permit persons to whom the Software is furnished + * to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- + * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- + * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * + * ************************************************************************/ +#pragma once + +#include +class PerformanceMonitor +{ +public: + virtual bool enabled() = 0; + virtual bool detailedReport() = 0; + + virtual void set_device_id(int deviceId) = 0; + + virtual void start() = 0; + virtual void stop() = 0; + + virtual double getLowestAverageSYSCLK() = 0; + virtual double getLowestMedianSYSCLK() = 0; + virtual std::vector getAllAverageSYSCLK() = 0; + virtual std::vector getAllMedianSYSCLK() = 0; + virtual double getAverageMEMCLK() = 0; + virtual double getMedianMEMCLK() = 0; + virtual double getTotalGranularityValue() = 0; + virtual double getTilesPerCuValue() = 0; + virtual double getTile0Granularity() = 0; + virtual double getTile1Granularity() = 0; + virtual double getCuGranularity() = 0; + virtual double getWaveGranularity() = 0; + virtual int getCUs() = 0; + virtual size_t getMemWriteBytesD() = 0; + virtual size_t getMemReadBytes() = 0; +}; + +PerformanceMonitor& getPerformanceMonitor(); +void freePerformanceMonitor(); diff --git a/clients/include/testing_matmul.hpp b/clients/include/testing_matmul.hpp index 7d3bba18b2..23aa1015af 100644 --- a/clients/include/testing_matmul.hpp +++ b/clients/include/testing_matmul.hpp @@ -29,7 +29,6 @@ #include "allclose.hpp" #include "cblas_interface.hpp" #include "flops.hpp" -#include "frequency_monitor.hpp" #include "hipBuffer.hpp" #include "hipblaslt_datatype2string.hpp" #include "hipblaslt_init.hpp" @@ -39,6 +38,7 @@ #include "hipblaslt_vector.hpp" #include "near.hpp" #include "norm.hpp" +#include "performance_monitor.hpp" #include "unit.hpp" #include "utility.hpp" #include @@ -2997,7 +2997,7 @@ void testing_matmul_with_bias(const Arguments& arg, } if(!do_grouped_gemm) { - FrequencyMonitor& freq_monitor = getFrequencyMonitor(); + PerformanceMonitor& perf_monitor = getPerformanceMonitor(); if(arg.use_ext) { for(int32_t b = 0; b < block_count; b++) @@ -3034,7 +3034,7 @@ void testing_matmul_with_bias(const Arguments& arg, continue; } } - freq_monitor.start(); + perf_monitor.start(); pre_gpu_time(arg.use_gpu_timer, event_gpu_time_start, gpu_time_used, stream); for(int i = 0; i < number_hot_calls; i++) @@ -3102,7 +3102,7 @@ void testing_matmul_with_bias(const Arguments& arg, continue; } } - freq_monitor.start(); + perf_monitor.start(); pre_gpu_time(arg.use_gpu_timer, event_gpu_time_start, gpu_time_used, stream); for(int i = 0; i < number_hot_calls; i++) { @@ -3143,11 +3143,11 @@ void testing_matmul_with_bias(const Arguments& arg, event_gpu_time_end, gpu_time_used, stream); - freq_monitor.stop(); + perf_monitor.stop(); } else { - FrequencyMonitor& freq_monitor = getFrequencyMonitor(); + PerformanceMonitor& perf_monitor = getPerformanceMonitor(); if(arg.use_user_args) { std::vector d_userArgsVec(block_count); @@ -3197,7 +3197,7 @@ void testing_matmul_with_bias(const Arguments& arg, continue; } } - freq_monitor.start(); + perf_monitor.start(); pre_gpu_time(arg.use_gpu_timer, event_gpu_time_start, gpu_time_used, stream); for(int i = 0; i < number_hot_calls; i++) @@ -3209,7 +3209,7 @@ void testing_matmul_with_bias(const Arguments& arg, event_gpu_time_end, gpu_time_used, stream); - freq_monitor.stop(); + perf_monitor.stop(); } else { @@ -3251,7 +3251,7 @@ void testing_matmul_with_bias(const Arguments& arg, continue; } } - freq_monitor.start(); + perf_monitor.start(); pre_gpu_time(arg.use_gpu_timer, event_gpu_time_start, gpu_time_used, stream); for(int i = 0; i < number_hot_calls; i++) @@ -3262,7 +3262,7 @@ void testing_matmul_with_bias(const Arguments& arg, event_gpu_time_end, gpu_time_used, stream); - freq_monitor.stop(); + perf_monitor.stop(); } } diff --git a/library/include/hipblaslt-ext-op.h b/library/include/hipblaslt-ext-op.h index a9d94482a6..1dadbff0d7 100644 --- a/library/include/hipblaslt-ext-op.h +++ b/library/include/hipblaslt-ext-op.h @@ -222,6 +222,18 @@ HIPBLASLT_EXPORT hipblasStatus_t hipblasltExtAMaxWithScale(const hipDataType dat uint32_t m, uint32_t n, hipStream_t stream); + +// For internal use only. +HIPBLASLT_EXPORT double hipblasltGetTotalGranularityValue(); +HIPBLASLT_EXPORT double hipblasltGetTilesPerCuValue(); +HIPBLASLT_EXPORT double hipblasltGetTile0Granularity(); +HIPBLASLT_EXPORT double hipblasltGetTile1Granularity(); +HIPBLASLT_EXPORT double hipblasltGetCuGranularity(); +HIPBLASLT_EXPORT double hipblasltGetWaveGranularity(); +HIPBLASLT_EXPORT int hipblasltGetCUs(); +HIPBLASLT_EXPORT size_t hipblasltGetMemWriteBytesD(); +HIPBLASLT_EXPORT size_t hipblasltGetMemReadBytes(); + #ifdef __cplusplus } #endif diff --git a/library/src/amd_detail/hipblaslt-ext-op.cpp b/library/src/amd_detail/hipblaslt-ext-op.cpp index 271b483fad..0b4cd52b9f 100644 --- a/library/src/amd_detail/hipblaslt-ext-op.cpp +++ b/library/src/amd_detail/hipblaslt-ext-op.cpp @@ -298,12 +298,13 @@ hipblasStatus_t hipblasltSoftmaxRun(hipDataType datatype, auto gpu = TensileLite::hip::GetCurrentDevice(); const auto archName = trimArchName(gpu->archName()); auto& masterLib = getExtOpMasterLibrary(); - const auto& lib - = masterLib - .getLibrary(archName, hipblaslt_ext::SoftmaxSolutionLibrary::opName, hipDataTypeo_char(datatype)) - ->as(); - auto sol - = lib.findBestSolution(hipblaslt_ext::SoftmaxProblem(m, n, hipDataType_to_tensile_type(datatype)), *gpu); + const auto& lib = masterLib + .getLibrary(archName, + hipblaslt_ext::SoftmaxSolutionLibrary::opName, + hipDataTypeo_char(datatype)) + ->as(); + auto sol = lib.findBestSolution( + hipblaslt_ext::SoftmaxProblem(m, n, hipDataType_to_tensile_type(datatype)), *gpu); const auto kernelName = sol->name(); err = adapter->initKernel(kernelName); TensileLite::KernelArguments kArgs(false); @@ -311,15 +312,15 @@ hipblasStatus_t hipblasltSoftmaxRun(hipDataType datatype, kArgs.append("output", output); kArgs.append("m", m); kArgs.append("n", n); - const auto numWorkgroups = getSoftmaxNumWorkgroups(m, tileM); + const auto numWorkgroups = getSoftmaxNumWorkgroups(m, tileM); TensileLite::KernelInvocation invocation{kernelName, - sol->getCodeObjectPath(), - false, - {WORKGROUP_SIZE, 1, 1}, - {numWorkgroups, 1, 1}, - {numWorkgroups * WORKGROUP_SIZE, 1, 1}, - getLdsUsageByte(datatype, tileM, tileN), - kArgs}; + sol->getCodeObjectPath(), + false, + {WORKGROUP_SIZE, 1, 1}, + {numWorkgroups, 1, 1}, + {numWorkgroups * WORKGROUP_SIZE, 1, 1}, + getLdsUsageByte(datatype, tileM, tileN), + kArgs}; err = adapter->launchKernel(invocation, stream, nullptr, nullptr); @@ -358,12 +359,13 @@ hipblasStatus_t hipblasltLayerNormRun(hipDataType datatype, auto gpu = TensileLite::hip::GetCurrentDevice(); const auto archName = trimArchName(gpu->archName()); auto& masterLib = getExtOpMasterLibrary(); - const auto& lib - = masterLib - .getLibrary(archName, hipblaslt_ext::LayerNormSolutionLibrary::opName, hipDataTypeo_char(datatype)) - ->as(); - auto sol - = lib.findBestSolution(hipblaslt_ext::LayerNormProblem(m, n, hipDataType_to_tensile_type(datatype)), *gpu); + const auto& lib = masterLib + .getLibrary(archName, + hipblaslt_ext::LayerNormSolutionLibrary::opName, + hipDataTypeo_char(datatype)) + ->as(); + auto sol = lib.findBestSolution( + hipblaslt_ext::LayerNormProblem(m, n, hipDataType_to_tensile_type(datatype)), *gpu); const auto kernelName = sol->name(); err = adapter->initKernel(kernelName); const auto numWorkgroups = m; @@ -426,13 +428,15 @@ hipblasStatus_t hipblasltAMaxRun(const hipDataType datatype, auto gpu = TensileLite::hip::GetCurrentDevice(); const auto archName = trimArchName(gpu->archName()); auto& masterLib = getExtOpMasterLibrary(); - const auto& lib - = masterLib.getLibrary(archName, hipblaslt_ext::AMaxSolutionLibrary::opName, hipDataTypeo_char(datatype)) - ->as(); - auto sol = lib.findBestSolution(hipblaslt_ext::AMaxProblem(len, - hipDataType_to_tensile_type(datatype), - hipDataType_to_tensile_type(outDatatype)), - *gpu); + const auto& lib = masterLib + .getLibrary(archName, + hipblaslt_ext::AMaxSolutionLibrary::opName, + hipDataTypeo_char(datatype)) + ->as(); + auto sol = lib.findBestSolution( + hipblaslt_ext::AMaxProblem( + len, hipDataType_to_tensile_type(datatype), hipDataType_to_tensile_type(outDatatype)), + *gpu); const auto kernelName = sol->name(); err = adapter->initKernel(kernelName); @@ -494,15 +498,18 @@ hipblasStatus_t hipblasltAMaxWithScaleRun(const hipDataType datatype, auto gpu = TensileLite::hip::GetCurrentDevice(); const auto archName = trimArchName(gpu->archName()); auto& masterLib = getExtOpMasterLibrary(); - const auto& lib - = masterLib.getLibrary(archName, hipblaslt_ext::AMaxSolutionLibrary::opName, hipDataTypeo_char(datatype)) - ->as(); - auto sol = lib.findBestSolution(hipblaslt_ext::AMaxProblem(len, - hipDataType_to_tensile_type(datatype), - hipDataType_to_tensile_type(outDatatype), - hipDataType_to_tensile_type(scaleDatatype), - true), - *gpu); + const auto& lib = masterLib + .getLibrary(archName, + hipblaslt_ext::AMaxSolutionLibrary::opName, + hipDataTypeo_char(datatype)) + ->as(); + auto sol = lib.findBestSolution( + hipblaslt_ext::AMaxProblem(len, + hipDataType_to_tensile_type(datatype), + hipDataType_to_tensile_type(outDatatype), + hipDataType_to_tensile_type(scaleDatatype), + true), + *gpu); if(!sol) { @@ -543,3 +550,42 @@ hipblasStatus_t hipblasltAMaxWithScaleRun(const hipDataType datatype, return HIPBLAS_STATUS_SUCCESS; } + +double hipblasltGetTotalGranularityValue() +{ + return hipblasltClientPerformanceArgs::totalGranularity; +} + +double hipblasltGetTilesPerCuValue() +{ + return hipblasltClientPerformanceArgs::tilesPerCu; +} + +double hipblasltGetTile0Granularity() +{ + return hipblasltClientPerformanceArgs::tile0Granularity; +} +double hipblasltGetTile1Granularity() +{ + return hipblasltClientPerformanceArgs::tile1Granularity; +} +double hipblasltGetCuGranularity() +{ + return hipblasltClientPerformanceArgs::cuGranularity; +} +double hipblasltGetWaveGranularity() +{ + return hipblasltClientPerformanceArgs::waveGranularity; +} +int hipblasltGetCUs() +{ + return hipblasltClientPerformanceArgs::CUs; +} +size_t hipblasltGetMemWriteBytesD() +{ + return hipblasltClientPerformanceArgs::memWriteBytesD; +} +size_t hipblasltGetMemReadBytes() +{ + return hipblasltClientPerformanceArgs::memReadBytes; +} diff --git a/library/src/amd_detail/rocblaslt/src/include/utility.hpp b/library/src/amd_detail/rocblaslt/src/include/utility.hpp index 85c46b59bd..a30485fb29 100644 --- a/library/src/amd_detail/rocblaslt/src/include/utility.hpp +++ b/library/src/amd_detail/rocblaslt/src/include/utility.hpp @@ -206,7 +206,7 @@ void log_base(rocblaslt_layer_mode layer_mode, const char* func, H head, Ts&&... if(get_logger_layer_mode() & layer_mode) { std::lock_guard lock(log_mutex); - std::string comma_separator = " "; + std::string comma_separator = " "; std::ostream* os = get_logger_os(); @@ -275,7 +275,7 @@ template void log_bench(const char* func, Ts&&... xs) { std::lock_guard lock(log_mutex); - std::ostream* os = get_logger_os(); + std::ostream* os = get_logger_os(); *os << "hipblaslt-bench "; log_arguments_bench(*os, std::forward(xs)...); *os << std::endl; @@ -483,4 +483,20 @@ bool rocblaslt_internal_tensile_supports_ldc_ne_ldd(rocblaslt_handle handle); // for internal use during testing, fetch arch name //std::string rocblaslt_internal_get_arch_name(); +//! Estimates based on problem size, solution tile, and machine hardware +//! charz: +struct hipblasltClientPerformanceArgs +{ + //! Granularity is measured 0..1 with 1.0 meaning no granularity loss + static double totalGranularity; + static double tilesPerCu; + static double tile0Granularity; // loss due to tile0 + static double tile1Granularity; + static double cuGranularity; + static double waveGranularity; + static int CUs; + static size_t memWriteBytesD; //! Estimated memory writes D + static size_t memReadBytes; +}; + #endif // UTILITY_H diff --git a/library/src/amd_detail/rocblaslt/src/tensile_host.cpp b/library/src/amd_detail/rocblaslt/src/tensile_host.cpp index 599244d431..429d037e1b 100644 --- a/library/src/amd_detail/rocblaslt/src/tensile_host.cpp +++ b/library/src/amd_detail/rocblaslt/src/tensile_host.cpp @@ -1326,7 +1326,8 @@ namespace {TensileLite::DataType::Double, (double)0.0}, }; - if (argument_vals.find(compute_type) == argument_vals.end()) { + if(argument_vals.find(compute_type) == argument_vals.end()) + { log_error(__func__, "Unsupported compute type"); throw std::runtime_error("[GetTensileInputs] unsupported compute type."); } @@ -1978,6 +1979,7 @@ rocblaslt_status runContractionProblem(rocblaslt_handle handle int* solutionIndex = (int*)algo->data; data->algoIndex = *solutionIndex; data->inputs = GetTensileInputs(prob); + if(get_logger_layer_mode() & rocblaslt_layer_mode_log_bench) { logBenchFromTensileDataGemm(data->problem, data->inputs, data->algoIndex, false); @@ -1987,7 +1989,44 @@ rocblaslt_status runContractionProblem(rocblaslt_handle handle { logProfileFromTensileDataGemm(data->problem, data->inputs, false); } + auto solution = library->getSolutionByIndex(data->problem, *hardware, *solutionIndex); + + if(getenv("HIPBLASLT_BENCH_PERF") != nullptr + || getenv("HIPBLASLT_BENCH_PERF_ALL") != nullptr) + { + auto Granularity = solution->computeGranularities( + *hardware, + data->problem.c().sizes()[0], + data->problem.c().sizes()[1], + data->problem.a().sizes()[data->problem.boundIndices()[0].a], + data->problem.batchSize(0)); + + hipblasltClientPerformanceArgs::totalGranularity = Granularity.totalGranularity; + hipblasltClientPerformanceArgs::tilesPerCu = Granularity.tilesPerCu; + hipblasltClientPerformanceArgs::tile0Granularity + = Granularity.tile0Granularity; // loss due to tile0 + hipblasltClientPerformanceArgs::tile1Granularity = Granularity.tile1Granularity; + hipblasltClientPerformanceArgs::cuGranularity = Granularity.cuGranularity; + hipblasltClientPerformanceArgs::waveGranularity = Granularity.waveGranularity; + hipblasltClientPerformanceArgs::CUs = Granularity.CUs; + + auto staticPerformanceModel = solution->staticPerformanceModel( + data->problem.c().sizes()[0], + data->problem.c().sizes()[1], + data->problem.a().sizes()[data->problem.boundIndices()[0].a], + data->problem.batchSize(0), + Granularity.MT0, + Granularity.MT1, + Granularity.CUs, + Granularity.totalGranularity, + solution->sizeMapping.globalSplitU); + + hipblasltClientPerformanceArgs::memWriteBytesD + = staticPerformanceModel.memWriteBytesD; //! Estimated memory writes D + hipblasltClientPerformanceArgs::memReadBytes = staticPerformanceModel.memReadBytes; + } + if(!solution) { #if 0 diff --git a/library/src/amd_detail/rocblaslt/src/utility.cpp b/library/src/amd_detail/rocblaslt/src/utility.cpp index f6c8c97064..feb40a91d4 100644 --- a/library/src/amd_detail/rocblaslt/src/utility.cpp +++ b/library/src/amd_detail/rocblaslt/src/utility.cpp @@ -345,3 +345,14 @@ std::string rocblaslt_matmul_desc_to_string(rocblaslt_matmul_desc matmul_desc) hipDataType_to_string(matmul_desc->bias_type)); return std::string(buf.get()); } + +// Define and initialize static members of struct hipblasltClientPerformanceArgs +double hipblasltClientPerformanceArgs::totalGranularity = 0.0; +double hipblasltClientPerformanceArgs::tilesPerCu = 0.0; +double hipblasltClientPerformanceArgs::tile0Granularity = 0.0; // loss due to tile0 +double hipblasltClientPerformanceArgs::tile1Granularity = 0.0; +double hipblasltClientPerformanceArgs::cuGranularity = 0.0; +double hipblasltClientPerformanceArgs::waveGranularity = 0.0; +int hipblasltClientPerformanceArgs::CUs = 0; +size_t hipblasltClientPerformanceArgs::memWriteBytesD = 0.0; //! Estimated memory writes D +size_t hipblasltClientPerformanceArgs::memReadBytes = 0.0; \ No newline at end of file From b26eabcac969e142e5e76dc1fede52589663b47b Mon Sep 17 00:00:00 2001 From: NaveenElumalaiAMD Date: Mon, 13 Jan 2025 14:32:02 -0600 Subject: [PATCH 2/2] More changes --- clients/CMakeLists.txt | 2 +- clients/include/frequency_monitor.hpp | 47 --------------------------- 2 files changed, 1 insertion(+), 48 deletions(-) delete mode 100644 clients/include/frequency_monitor.hpp diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt index 02ae47d344..7e7ff8a785 100755 --- a/clients/CMakeLists.txt +++ b/clients/CMakeLists.txt @@ -105,7 +105,7 @@ if( BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_TESTS) set( hipblaslt_test_bench_common ../common/singletons.cpp ../common/utility.cpp - ../common/frequency_monitor.cpp + ../common/performance_monitor.cpp ../common/cblas_interface.cpp ../common/argument_model.cpp ../common/hipblaslt_parse_data.cpp diff --git a/clients/include/frequency_monitor.hpp b/clients/include/frequency_monitor.hpp deleted file mode 100644 index 2f6b34c854..0000000000 --- a/clients/include/frequency_monitor.hpp +++ /dev/null @@ -1,47 +0,0 @@ - -/* ************************************************************************ - * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- - * ies of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- - * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS - * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR - * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER - * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- - * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * - * ************************************************************************/ -#pragma once - -#include -class FrequencyMonitor -{ -public: - virtual bool enabled() = 0; - virtual bool detailedReport() = 0; - - virtual void set_device_id(int deviceId) = 0; - - virtual void start() = 0; - virtual void stop() = 0; - - virtual double getLowestAverageSYSCLK() = 0; - virtual double getLowestMedianSYSCLK() = 0; - virtual std::vector getAllAverageSYSCLK() = 0; - virtual std::vector getAllMedianSYSCLK() = 0; - virtual double getAverageMEMCLK() = 0; - virtual double getMedianMEMCLK() = 0; -}; - -FrequencyMonitor& getFrequencyMonitor(); -void freeFrequencyMonitor();