From 22023fb9073b0c06efd057bebfe7844483a34ff0 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Wed, 1 Jun 2022 12:30:48 -0700 Subject: [PATCH] [PROFILER] Add configuration information to profiler Configuration is a place to store extra information related to the specific profiler run. Right now it is just the executor used and the number of threads. The roofline analysis also adds peak flops and peak bandwidth. --- include/tvm/runtime/profiling.h | 17 ++- python/tvm/runtime/profiling/__init__.py | 11 +- python/tvm/utils/roofline.py | 5 +- src/node/structural_hash.cc | 1 + .../debug/graph_executor_debug.cc | 2 +- src/runtime/profiling.cc | 111 ++++++++++++------ src/runtime/vm/profiler/vm.cc | 6 +- .../python/unittest/test_runtime_profiling.py | 3 + 8 files changed, 109 insertions(+), 47 deletions(-) diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h index 0163f0c2e49e..83c26933be45 100644 --- a/include/tvm/runtime/profiling.h +++ b/include/tvm/runtime/profiling.h @@ -25,6 +25,7 @@ #define TVM_RUNTIME_PROFILING_H_ #include +#include #include #include #include @@ -192,6 +193,11 @@ class ReportNode : public Object { * because these metrics include the overhead of the executor. */ Map> device_metrics; + /*! Configuration used for this profiling run. Includes number of threads, executor. + * + * Values must be an object type that can be used with device_metrics. + */ + Map configuration; /*! \brief Output `calls` in CSV format. * * Note that this does not include `device_metrics`, it only includes per-call metrics. @@ -255,9 +261,11 @@ class Report : public ObjectRef { /*! Construct a Report from a set of calls (with associated metrics) and per-device metrics. * \param calls Function calls and associated metrics. * \param device_metrics Per-device metrics for overall execution. + * \param configuration Configuration data specific to this profiling run. */ explicit Report(Array> calls, - Map> device_metrics); + Map> device_metrics, + Map configuration); /*! Deserialize a Report from a JSON object. Needed for sending the report over RPC. * \param json Serialized json report from `ReportNode::AsJSON`. @@ -366,8 +374,10 @@ class Profiler { * \param devs The list of devices the profiler will be running on. Should * include all devices used by profiled operators. * \param metric_collectors Additional `MetricCollector`s to use with this profiler. + * \param configuration Additional configuration data to add to the outputted profiling report. */ - explicit Profiler(std::vector devs, std::vector metric_collectors); + explicit Profiler(std::vector devs, std::vector metric_collectors, + std::unordered_map configuration = {}); /*! \brief Start the profiler. * * This function should only be called once per object. @@ -400,7 +410,7 @@ class Profiler { * \returns A `Report` that can either be formatted as CSV (with `.AsCSV`) * or as a human readable table (with `.AsTable`). */ - profiling::Report Report(bool aggregate = true, bool sort = true); + profiling::Report Report(); /*! \brief Check if the profiler is currently running. * \returns Whether or not the profiler is running. */ @@ -412,6 +422,7 @@ class Profiler { std::vector calls_; std::stack in_flight_; std::vector collectors_; + std::unordered_map configuration_; }; /* \brief A duration in time. */ diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py index 573779037827..347d8b9f94f1 100644 --- a/python/tvm/runtime/profiling/__init__.py +++ b/python/tvm/runtime/profiling/__init__.py @@ -36,7 +36,10 @@ class Report(Object): """ def __init__( - self, calls: Sequence[Dict[str, Object]], device_metrics: Dict[str, Dict[str, Object]] + self, + calls: Sequence[Dict[str, Object]], + device_metrics: Dict[str, Dict[str, Object]], + configuration: Dict[str, Object], ): """Construct a profiling report from a list of metrics and per-device metrics. @@ -47,8 +50,12 @@ def __init__( device_metrics : Dict[str, Dict[str, Object]] Per device metrics. + + configuration : Dict[str, Object] + Configuration of TVM for this profiling run. Includes number of + threads, executor. """ - self.__init_handle_by_constructor__(_ffi_api.Report, calls, device_metrics) + self.__init_handle_by_constructor__(_ffi_api.Report, calls, device_metrics, configuration) def csv(self): """Convert this profiling report into CSV format. diff --git a/python/tvm/utils/roofline.py b/python/tvm/utils/roofline.py index 6d1ac753e27e..adb8bf5cc97c 100644 --- a/python/tvm/utils/roofline.py +++ b/python/tvm/utils/roofline.py @@ -400,7 +400,10 @@ def roofline_from_existing( new_calls.append(call) else: new_calls.append(call) - return profiling.Report(new_calls, report.device_metrics) + new_configuration = dict(report.configuration.items()) + new_configuration["Estimated Peak FMA FLOP/s"] = profiling.Ratio(peak_flops) + new_configuration["Estimated Peak Bandwidth (byte/second)"] = profiling.Ratio(peak_bandwidth) + return profiling.Report(new_calls, report.device_metrics, new_configuration) def roofline_analysis( diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc index e97e5f41bfc2..23811e219078 100644 --- a/src/node/structural_hash.cc +++ b/src/node/structural_hash.cc @@ -521,6 +521,7 @@ struct ReportNodeTrait { static void VisitAttrs(runtime::profiling::ReportNode* report, AttrVisitor* attrs) { attrs->Visit("calls", &report->calls); attrs->Visit("device_metrics", &report->device_metrics); + attrs->Visit("configuration", &report->configuration); } static constexpr std::nullptr_t SEqualReduce = nullptr; static constexpr std::nullptr_t SHashReduce = nullptr; diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc index bd3b0db0403f..4a950153954f 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -294,7 +294,7 @@ class GraphExecutorDebug : public GraphExecutor { */ profiling::Report Profile(Array collectors) { std::vector cs(collectors.begin(), collectors.end()); - profiling::Profiler prof(devices_, cs); + profiling::Profiler prof(devices_, cs, {{String("Executor"), String("Graph")}}); // warm up. 1 iteration does not seem enough. for (int i = 0; i < 3; i++) { diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 9499a6e7a5bb..9f95bf18f74b 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -105,8 +105,9 @@ TVM_REGISTER_GLOBAL("profiling.start_timer").set_body_typed(Timer::Start); namespace profiling { -Profiler::Profiler(std::vector devs, std::vector metric_collectors) - : devs_(devs), collectors_(metric_collectors) { +Profiler::Profiler(std::vector devs, std::vector metric_collectors, + std::unordered_map configuration) + : devs_(devs), collectors_(metric_collectors), configuration_(configuration) { is_running_ = false; std::vector wrapped_devs; for (auto dev : devs) { @@ -117,6 +118,9 @@ Profiler::Profiler(std::vector devs, std::vector metric } // reset the thread pool so that PAPI eventset hooks are set in all threads. threading::ResetThreadPool(); + + configuration_[String("Number of threads")] = + ObjectRef(make_object(threading::NumThreads())); } void Profiler::Start() { @@ -279,7 +283,7 @@ String ReportNode::AsCSV() const { } namespace { -void print_metric(std::ostream& os, ObjectRef o) { +void metric_as_json(std::ostream& os, ObjectRef o) { if (o.as()) { os << "{\"string\":" << "\"" << Downcast(o) << "\"" @@ -309,13 +313,14 @@ String ReportNode::AsJSON() const { // value we want to print. Instead we construct the json by hand because it // is easier. s << "{"; + s << "\"calls\":["; for (size_t i = 0; i < calls.size(); i++) { size_t j = 0; s << "{"; for (const auto& kv : calls[i]) { s << "\"" << kv.first << "\":"; - print_metric(s, kv.second); + metric_as_json(s, kv.second); if (j < calls[i].size() - 1) { s << ","; } @@ -326,7 +331,8 @@ String ReportNode::AsJSON() const { s << ","; } } - s << "],"; + s << "],"; // end calls + s << "\"device_metrics\":{"; size_t i = 0; for (const auto& dev_kv : device_metrics) { @@ -334,7 +340,7 @@ String ReportNode::AsJSON() const { s << "\"" << dev_kv.first << "\":{"; for (const auto& metric_kv : dev_kv.second) { s << "\"" << metric_kv.first << "\":"; - print_metric(s, metric_kv.second); + metric_as_json(s, metric_kv.second); if (j < dev_kv.second.size() - 1) { s << ","; } @@ -346,7 +352,20 @@ String ReportNode::AsJSON() const { } i++; } - s << "}}"; + s << "},"; // end device metrics + + s << "\"configuration\":{"; + size_t k = 0; + for (const auto& kv : configuration) { + s << "\"" << kv.first << "\":"; + metric_as_json(s, kv.second); + if (k < configuration.size() - 1) { + s << ","; + } + k++; + } + s << "}"; // end configuration + s << "}"; return s.str(); } @@ -395,6 +414,35 @@ ObjectRef AggregateMetric(const std::vector& metrics) { } } +static String print_metric(ObjectRef metric) { + std::string val; + if (metric.as()) { + std::stringstream s; + s.imbue(std::locale("")); // for 1000s seperators + s << std::fixed << metric.as()->value; + val = s.str(); + } else if (metric.as()) { + std::stringstream s; + s.imbue(std::locale("")); // for 1000s seperators + s << std::fixed << std::setprecision(2) << metric.as()->microseconds; + val = s.str(); + } else if (metric.as()) { + std::stringstream s; + s << std::fixed << std::setprecision(2) << metric.as()->percent; + val = s.str(); + } else if (metric.as()) { + std::stringstream s; + s.imbue(std::locale("")); // for 1000s seperators + s << std::setprecision(2) << metric.as()->ratio; + val = s.str(); + } else if (metric.as()) { + val = Downcast(metric); + } else { + LOG(FATAL) << "Cannot print metric of type " << metric->GetTypeKey(); + } + return val; +} + String ReportNode::AsTable(bool sort, bool aggregate, bool compute_col_sums) const { // aggregate calls by op hash (or op name if hash is not set) + argument shapes std::vector> aggregated_calls; @@ -533,30 +581,7 @@ String ReportNode::AsTable(bool sort, bool aggregate, bool compute_col_sums) con // fill empty data with empty strings cols[i].push_back(""); } else { - std::string val; - if ((*it).second.as()) { - std::stringstream s; - s.imbue(std::locale("")); // for 1000s seperators - s << std::fixed << (*it).second.as()->value; - val = s.str(); - } else if ((*it).second.as()) { - std::stringstream s; - s.imbue(std::locale("")); // for 1000s seperators - s << std::fixed << std::setprecision(2) << (*it).second.as()->microseconds; - val = s.str(); - } else if ((*it).second.as()) { - std::stringstream s; - s << std::fixed << std::setprecision(2) << (*it).second.as()->percent; - val = s.str(); - } else if ((*it).second.as()) { - std::stringstream s; - s.imbue(std::locale("")); // for 1000s seperators - s << std::setprecision(2) << (*it).second.as()->ratio; - val = s.str(); - } else if ((*it).second.as()) { - val = Downcast((*it).second); - } - cols[i].push_back(val); + cols[i].push_back(print_metric((*it).second)); } } } @@ -592,6 +617,12 @@ String ReportNode::AsTable(bool sort, bool aggregate, bool compute_col_sums) con } s << std::endl; } + + // Add configuration information. It will not be aligned with the columns. + s << std::endl << "Configuration" << std::endl << "-------------" << std::endl; + for (auto kv : configuration) { + s << kv.first << ": " << print_metric(kv.second) << std::endl; + } return s.str(); } @@ -599,7 +630,7 @@ std::string DeviceString(Device dev) { return DeviceName(dev.device_type) + std::to_string(dev.device_id); } -Report Profiler::Report(bool aggregate, bool sort) { +Report Profiler::Report() { // sync all timers and normalize rows std::vector> rows; for (auto& cf : calls_) { @@ -638,14 +669,16 @@ Report Profiler::Report(bool aggregate, bool sort) { converted_rows.push_back(row); } - return profiling::Report(converted_rows, device_metrics); + return profiling::Report(converted_rows, device_metrics, configuration_); } Report::Report(Array> calls, - Map> device_metrics) { + Map> device_metrics, + Map configuration) { auto node = make_object(); node->calls = std::move(calls); node->device_metrics = std::move(device_metrics); + node->configuration = std::move(configuration); data_ = std::move(node); } @@ -697,6 +730,7 @@ Report Report::FromJSON(String json) { std::string key; Array> calls; Map> device_metrics; + Map configuration; reader.BeginObject(); while (reader.NextObjectItem(&key)) { @@ -713,10 +747,12 @@ Report Report::FromJSON(String json) { device_metrics.Set(device_name, parse_metrics(&reader)); } // reader.EndObject(); + } else if (key == "configuration") { + configuration = parse_metrics(&reader); } } - return Report(calls, device_metrics); + return Report(calls, device_metrics, configuration); } TVM_REGISTER_OBJECT_TYPE(DurationNode); @@ -855,8 +891,9 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, TVM_REGISTER_GLOBAL("runtime.profiling.Report") .set_body_typed([](Array> calls, - Map> device_metrics) { - return Report(calls, device_metrics); + Map> device_metrics, + Map configuration) { + return Report(calls, device_metrics, configuration); }); TVM_REGISTER_GLOBAL("runtime.profiling.Count").set_body_typed([](int64_t count) { diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc index 393d1b399878..0ace910b5c53 100644 --- a/src/runtime/vm/profiler/vm.cc +++ b/src/runtime/vm/profiler/vm.cc @@ -58,9 +58,9 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name, // on remotes, we accept a nullptr for collectors. if (collectors.defined()) { std::vector cs(collectors.begin(), collectors.end()); - prof_ = profiling::Profiler(devices, cs); + prof_ = profiling::Profiler(devices, cs, {{String("Executor"), String("VM")}}); } else { - prof_ = profiling::Profiler(devices, {}); + prof_ = profiling::Profiler(devices, {}, {{String("Executor"), String("VM")}}); } auto invoke = VirtualMachine::GetFunction("invoke", sptr_to_self); @@ -77,7 +77,7 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name, return report; }); } else if (name == "profile_rpc") { - // We cannot return a Report over RPC because TMV RPC mechanism only + // We cannot return a Report over RPC because TVM RPC mechanism only // supports a subset of Object classes. Instead we serialize it on the // remote (here) and deserialize it on the other end. return TypedPackedFunc([sptr_to_self, this](std::string arg_name) { diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py index 919057f08d27..a701be7c3c9b 100644 --- a/tests/python/unittest/test_runtime_profiling.py +++ b/tests/python/unittest/test_runtime_profiling.py @@ -69,6 +69,7 @@ def test_vm(target, dev): assert "Total" in str(report) assert "AllocTensorReg" in str(report) assert "AllocStorage" in str(report) + assert report.configuration["Executor"] == "VM" csv = read_csv(report) assert "Hash" in csv.keys() @@ -102,6 +103,7 @@ def test_graph_executor(target, dev): assert "fused_nn_softmax" in str(report) assert "Total" in str(report) assert "Hash" in str(report) + assert "Graph" in str(report) @tvm.testing.parametrize_targets("cuda", "llvm") @@ -147,6 +149,7 @@ def test_json(): parsed = json.loads(report.json()) assert "device_metrics" in parsed assert "calls" in parsed + assert "configuration" in parsed assert "Duration (us)" in parsed["calls"][0] assert "microseconds" in parsed["calls"][0]["Duration (us)"] assert len(parsed["calls"]) > 0