From eb52609c1eef34a9235ecdd70019084e08d4569e Mon Sep 17 00:00:00 2001 From: eyes_on_me Date: Fri, 14 Feb 2025 14:50:35 +0800 Subject: [PATCH] [BugFix] add some defensive codes to table metrics (#55839) Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> --- be/src/common/config.h | 2 ++ be/src/exec/tablet_sink.cpp | 1 - be/src/http/action/metrics_action.cpp | 7 +++++++ be/src/service/staros_worker.cpp | 15 +++++++++++++-- be/src/util/table_metrics.h | 12 ++++++++++-- 5 files changed, 32 insertions(+), 5 deletions(-) diff --git a/be/src/common/config.h b/be/src/common/config.h index cdbde7dd7dbd98..e90c7dd0152d40 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1563,4 +1563,6 @@ CONF_mInt32(json_parse_many_batch_size, "1000000"); CONF_mBool(enable_dynamic_batch_size_for_json_parse_many, "true"); CONF_mInt32(put_combined_txn_log_thread_pool_num_max, "64"); CONF_mBool(enable_put_combinded_txn_log_parallel, "false"); +// used to control whether the metrics/ interface collects table metrics +CONF_mBool(enable_collect_table_metrics, "true"); } // namespace starrocks::config diff --git a/be/src/exec/tablet_sink.cpp b/be/src/exec/tablet_sink.cpp index 44556c432a4961..e4ce1581e013e7 100644 --- a/be/src/exec/tablet_sink.cpp +++ b/be/src/exec/tablet_sink.cpp @@ -65,7 +65,6 @@ #include "util/compression/compression_utils.h" #include "util/defer_op.h" #include "util/stack_util.h" -#include "util/starrocks_metrics.h" #include "util/thread.h" #include "util/thrift_rpc_helper.h" #include "util/uid_util.h" diff --git a/be/src/http/action/metrics_action.cpp b/be/src/http/action/metrics_action.cpp index 7a00a3d6eabd44..e97561af5d382e 100644 --- a/be/src/http/action/metrics_action.cpp +++ b/be/src/http/action/metrics_action.cpp @@ -117,6 +117,7 @@ const std::string SimpleCoreMetricsVisitor::MAX_DISK_IO_UTIL_PERCENT = "max_disk const std::string SimpleCoreMetricsVisitor::MAX_NETWORK_SEND_BYTES_RATE = "max_network_send_bytes_rate"; const std::string SimpleCoreMetricsVisitor::MAX_NETWORK_RECEIVE_BYTES_RATE = "max_network_receive_bytes_rate"; +const std::string TableMetricsPrefix = "table_"; void PrometheusMetricsVisitor::visit(const std::string& prefix, const std::string& name, MetricCollector* collector) { if (collector->empty() || name.empty()) { return; @@ -127,6 +128,9 @@ void PrometheusMetricsVisitor::visit(const std::string& prefix, const std::strin } else { metric_name = prefix + "_" + name; } + if (!config::enable_collect_table_metrics && name.starts_with(TableMetricsPrefix)) { + return; + } // Output metric type _ss << "# TYPE " << metric_name << " " << collector->type() << "\n"; switch (collector->type()) { @@ -290,6 +294,9 @@ void JsonMetricsVisitor::visit(const std::string& prefix, const std::string& nam if (collector->empty() || name.empty()) { return; } + if (!config::enable_collect_table_metrics && name.starts_with(TableMetricsPrefix)) { + return; + } rapidjson::Document::AllocatorType& allocator = doc.GetAllocator(); switch (collector->type()) { diff --git a/be/src/service/staros_worker.cpp b/be/src/service/staros_worker.cpp index 7986f247b0ae1d..eaa71f3febdcaf 100644 --- a/be/src/service/staros_worker.cpp +++ b/be/src/service/staros_worker.cpp @@ -81,10 +81,21 @@ StarOSWorker::StarOSWorker() : _mtx(), _shards(), _fs_cache(new_lru_cache(1024)) StarOSWorker::~StarOSWorker() = default; +static const uint64_t kUnknownTableId = UINT64_MAX; uint64_t StarOSWorker::get_table_id(const ShardInfo& shard) { const auto& properties = shard.properties; - CHECK(properties.contains("tableId")); - return std::stoull(properties.at("tableId")); + auto iter = properties.find("tableId"); + if (iter == properties.end()) { + DCHECK(false) << "tableId doesn't exist in shard properties"; + return kUnknownTableId; + } + const auto& tableId = properties.at("tableId"); + try { + return std::stoull(tableId); + } catch (const std::exception& e) { + DCHECK(false) << "failed to parse tableId: " << tableId << ", " << e.what(); + return kUnknownTableId; + } } absl::Status StarOSWorker::add_shard(const ShardInfo& shard) { diff --git a/be/src/util/table_metrics.h b/be/src/util/table_metrics.h index 989effa827df29..fddec4c9ece2c0 100644 --- a/be/src/util/table_metrics.h +++ b/be/src/util/table_metrics.h @@ -69,8 +69,11 @@ class TableMetricsManager { TableMetricsPtr get_table_metrics(uint64_t table_id) { std::shared_lock l(_mu); - DCHECK(_metrics_map.contains(table_id)); - return _metrics_map.at(table_id); + auto iter = _metrics_map.find(table_id); + if (iter != _metrics_map.end()) { + return iter->second; + } + return _blackhole_metrics; } void cleanup(); @@ -79,6 +82,11 @@ class TableMetricsManager { MetricRegistry* _metrics; std::shared_mutex _mu; phmap::flat_hash_map _metrics_map; + // In some cases, we may not be able to obtain the metrics for the corresponding table id, + // For example, when drop tablet and data load concurrently, + // the Tablets may have been deleted before the load begins, and the table metrics may be cleared. + // In such a scenario, we return blackhole metrics to ensure that subsequent processes can work well. + TableMetricsPtr _blackhole_metrics = std::make_shared(); // used for cleanup int64_t _last_cleanup_ts = 0;