Skip to content

Commit

Permalink
libcommon: Provide RSS details metrics (#9218)
Browse files Browse the repository at this point in the history
ref #9032

Signed-off-by: “EricZequan” <[email protected]>

Co-authored-by: JaySon <[email protected]>
  • Loading branch information
EricZequan and JaySon-Huang authored Aug 26, 2024
1 parent 55cb9b9 commit bfb8c5d
Show file tree
Hide file tree
Showing 12 changed files with 229 additions and 168 deletions.
89 changes: 58 additions & 31 deletions dbms/src/Common/ProcessCollector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,44 +18,71 @@
namespace DB
{

// Reads the process metrics once at construction and stores the reported
// start time in the start_time gauge.
ProcessCollector::ProcessCollector()
{
    start_time.Set(get_process_metrics().start_time);
}

// Takes a snapshot of the process metrics via get_process_metrics() and returns
// it as a list of Prometheus metric families.
//
// NOTE(review): this text was captured from a rendered commit diff ("58 additions
// & 31 deletions") in which removed and added lines are shown together without
// +/- markers. The statements below therefore appear to mix two revisions of this
// function and should not be read as a single compilable body -- confirm against
// the repository before editing.
std::vector<prometheus::MetricFamily> ProcessCollector::Collect() const
{
// Single snapshot; every value reported below is read from it.
auto new_info = get_process_metrics();

// Gauge is thread safe, no need to lock.
// cpu_total is brought up to the new reading by incrementing it with the
// difference between the snapshot and the gauge's current value.
// NOTE(review): Value() and Increment() are two separate calls; confirm that
// concurrent Collect() calls interleaving between them is acceptable.
auto past_cpu_total = cpu_total.Value();
cpu_total.Increment(new_info.cpu_total - past_cpu_total);
vsize.Set(new_info.vsize);
rss.Set(new_info.rss);

// Families built from the member gauges (cpu_total, vsize, rss).
std::vector<prometheus::MetricFamily> familes;
familes.reserve(4);
familes.emplace_back(prometheus::MetricFamily{
CPU_METRIC_NAME,
CPU_METRIC_HELP,
prometheus::MetricType::Gauge,
std::vector<prometheus::ClientMetric>{cpu_total.Collect()}});
familes.emplace_back(prometheus::MetricFamily{
VSIZE_METRIC_NAME,
VSIZE_METRIC_HELP,
prometheus::MetricType::Gauge,
std::vector<prometheus::ClientMetric>{vsize.Collect()}});
familes.emplace_back(prometheus::MetricFamily{
RSS_METRIC_NAME,
RSS_METRIC_HELP,
prometheus::MetricType::Gauge,
std::vector<prometheus::ClientMetric>{rss.Collect()}});

// The following metrics shadow the TiFlash proxy metrics, so that they remain available
// in disaggregated mode, where the TiFlash proxy may not start at all.
// Note that, even in non-disaggregated mode, duplicates are fine when collected by Prometheus,
// because the TiFlash proxy and TiFlash have different metrics endpoints. However, Grafana would
// show multiple endpoints, because both the TiFlash proxy and TiFlash use the same metric names.
// To avoid duplicates in Grafana, these families are only emitted when include_proxy_metrics is set.
if (include_proxy_metrics)
{
// Each family below carries one unlabeled gauge sample whose value is taken
// directly from the snapshot and converted to double.
familes.emplace_back(prometheus::MetricFamily{
"tiflash_proxy_process_cpu_seconds_total",
"Total user and system CPU time spent in seconds.",
prometheus::MetricType::Gauge,
{
prometheus::ClientMetric{.gauge = {static_cast<double>(new_info.cpu_total)}},
}});

familes.emplace_back(prometheus::MetricFamily{
"tiflash_proxy_process_virtual_memory_bytes",
"Virtual memory size in bytes.",
prometheus::MetricType::Gauge,
{
prometheus::ClientMetric{.gauge = {static_cast<double>(new_info.vsize)}},
}});
familes.emplace_back(prometheus::MetricFamily{
"tiflash_proxy_process_resident_memory_bytes",
"Resident memory size in bytes.",
prometheus::MetricType::Gauge,
{
prometheus::ClientMetric{.gauge = {static_cast<double>(new_info.rss)}},
}});
familes.emplace_back(prometheus::MetricFamily{
"tiflash_proxy_process_start_time_seconds",
"Start time of the process since unix epoch in seconds.",
prometheus::MetricType::Gauge,
{
prometheus::ClientMetric{.gauge = {static_cast<double>(new_info.start_time)}},
}});
}

// The following metrics are TiFlash specific process metrics.
// NOTE(review): the two identifiers and the two string literals that follow look
// like the old and new name/help arguments shown side by side by the diff view,
// and the same applies to the two sample lists after MetricType::Gauge -- confirm
// which pair belongs to the current revision.
familes.emplace_back(prometheus::MetricFamily{
START_TIME_METRIC_NAME,
START_TIME_METRIC_HELP,
"tiflash_process_rss_by_type_bytes",
"Resident memory size by type in bytes.",
prometheus::MetricType::Gauge,
std::vector<prometheus::ClientMetric>{start_time.Collect()}});
{
// One sample per value of the "type" label: anon, file and shared.
prometheus::ClientMetric{
.label = {{"type", "anon"}},
.gauge = {static_cast<double>(new_info.rss_anon)},
},
prometheus::ClientMetric{
.label = {{"type", "file"}},
.gauge = {static_cast<double>(new_info.rss_file)},
},
prometheus::ClientMetric{
.label = {{"type", "shared"}},
.gauge = {static_cast<double>(new_info.rss_shared)},
},
}});

return familes;
}

Expand Down
23 changes: 5 additions & 18 deletions dbms/src/Common/ProcessCollector.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@

#pragma once

#include <Common/ProcessCollector_fwd.h>
#include <ProcessMetrics/ProcessMetrics.h>
#include <prometheus/counter.h>
#include <prometheus/family.h>
#include <prometheus/metric_family.h>
#include <prometheus/registry.h>

Expand All @@ -27,28 +29,13 @@ namespace DB
// Just like the original tiflash-proxy logic.
// 2. The current implementation of the async_metrics interval is 15s, which is too large. This interval also affects the pushgateway interval.
// So better not to mix cpu/mem metrics with async_metrics.
// The difference between ProcessCollector and prometheus::Registry:
// 1. ProcessCollector **updates** its Gauges and then collects them. prometheus::Registry only collects Gauges.
// A prometheus::Collectable whose Collect() override returns process-level
// metric families (see ProcessCollector.cpp).
//
// NOTE(review): this declaration was captured from a rendered commit diff in which
// removed and added lines are shown together without +/- markers. The members
// below (name/help constants, gauges, constructor, include_proxy_metrics) may
// therefore belong to two different revisions -- confirm against the repository.
class ProcessCollector : public prometheus::Collectable
{
public:
// Name and help text for each process metric family.
static constexpr auto CPU_METRIC_NAME = "tiflash_proxy_process_cpu_seconds_total";
static constexpr auto CPU_METRIC_HELP = "Total user and system CPU time spent in seconds.";
static constexpr auto VSIZE_METRIC_NAME = "tiflash_proxy_process_virtual_memory_bytes";
static constexpr auto VSIZE_METRIC_HELP = "Virtual memory size in bytes.";
static constexpr auto RSS_METRIC_NAME = "tiflash_proxy_process_resident_memory_bytes";
static constexpr auto RSS_METRIC_HELP = "Resident memory size in bytes.";
static constexpr auto START_TIME_METRIC_NAME = "tiflash_proxy_process_start_time_seconds";
static constexpr auto START_TIME_METRIC_HELP = "Start time of the process since unix epoch in seconds.";

ProcessCollector();

// Returns the current process metrics as Prometheus metric families.
std::vector<prometheus::MetricFamily> Collect() const override;

private:
// Declared mutable because the const Collect() updates these gauges before
// collecting them.
mutable prometheus::Gauge cpu_total;
mutable prometheus::Gauge vsize;
mutable prometheus::Gauge rss;
// Set in the constructor from the process metrics' start_time.
prometheus::Gauge start_time;
public:
// When true (the default), Collect() also emits the tiflash_proxy_process_*
// families. Written by TiFlashMetrics::setProvideProxyProcessMetrics.
// NOTE(review): <atomic> is not among the includes visible in this header;
// confirm it is available transitively.
mutable std::atomic<bool> include_proxy_metrics = {true};
};

} // namespace DB
22 changes: 22 additions & 0 deletions dbms/src/Common/ProcessCollector_fwd.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

namespace DB
{

// Forward declaration only: lets a header refer to ProcessCollector (for example
// as the pointee of a std::shared_ptr member) without including
// Common/ProcessCollector.h.
class ProcessCollector;

} // namespace DB
10 changes: 9 additions & 1 deletion dbms/src/Common/TiFlashMetrics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// limitations under the License.

#include <Common/CurrentMetrics.h>
#include <Common/ProcessCollector.h>
#include <Common/ProfileEvents.h>
#include <Common/TiFlashMetrics.h>
#include <common/defines.h>
Expand All @@ -27,6 +28,8 @@ TiFlashMetrics & TiFlashMetrics::instance()

TiFlashMetrics::TiFlashMetrics()
{
process_collector = std::make_shared<ProcessCollector>();

registered_profile_events.reserve(ProfileEvents::end());
for (ProfileEvents::Event event = 0; event < ProfileEvents::end(); event++)
{
Expand Down Expand Up @@ -202,4 +205,9 @@ void TiFlashMetrics::registerStorageThreadMemory(const std::string & k)
}
}

} // namespace DB
// Sets the process collector's include_proxy_metrics flag, which decides whether
// its Collect() also emits the tiflash_proxy_process_* metric families.
void TiFlashMetrics::setProvideProxyProcessMetrics(bool v)
{
    process_collector->include_proxy_metrics.store(v);
}

} // namespace DB
9 changes: 4 additions & 5 deletions dbms/src/Common/TiFlashMetrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#include <Common/ComputeLabelHolder.h>
#include <Common/Exception.h>
#include <Common/ProcessCollector.h>
#include <Common/ProcessCollector_fwd.h>
#include <Common/TiFlashBuildInfo.h>
#include <Common/nocopyable.h>
#include <common/types.h>
Expand Down Expand Up @@ -1089,6 +1089,7 @@ struct MetricFamily
return *(resource_group_metrics_map[resource_group_name][idx]);
}


private:
void addMetricsForResourceGroup(const String & resource_group_name)
{
Expand Down Expand Up @@ -1143,6 +1144,7 @@ class TiFlashMetrics
double getStorageThreadMemory(MemoryAllocType type, const std::string & k);
void registerProxyThreadMemory(const std::string & k);
void registerStorageThreadMemory(const std::string & k);
void setProvideProxyProcessMetrics(bool v);

private:
TiFlashMetrics();
Expand All @@ -1157,10 +1159,7 @@ class TiFlashMetrics
static constexpr auto storages_thread_memory_usage = "tiflash_storages_thread_memory_usage";

std::shared_ptr<prometheus::Registry> registry = std::make_shared<prometheus::Registry>();
// Here we add a ProcessCollector to collect cpu/rss/vsize/start_time information.
// Normally, these metrics will be collected by tiflash-proxy,
// but in disaggregated compute mode with AutoScaler, tiflash-proxy will not start, so tiflash will collect these metrics itself.
std::shared_ptr<ProcessCollector> cn_process_collector = std::make_shared<ProcessCollector>();
std::shared_ptr<ProcessCollector> process_collector;

std::vector<prometheus::Gauge *> registered_profile_events;
std::vector<prometheus::Gauge *> registered_current_metrics;
Expand Down
27 changes: 11 additions & 16 deletions dbms/src/Server/MetricsPrometheus.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include <Common/CurrentMetrics.h>
#include <Common/FunctionTimerTask.h>
#include <Common/ProcessCollector.h>
#include <Common/ProfileEvents.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/TiFlashMetrics.h>
Expand Down Expand Up @@ -207,6 +208,11 @@ MetricsPrometheus::MetricsPrometheus(Context & context, const AsynchronousMetric
auto & tiflash_metrics = TiFlashMetrics::instance();
auto & conf = context.getConfigRef();

bool should_provide_proxy_metrics
= (context.getSharedContextDisagg()->isDisaggregatedComputeMode()
&& context.getSharedContextDisagg()->use_autoscaler);
tiflash_metrics.setProvideProxyProcessMetrics(should_provide_proxy_metrics);

// Interval to collect `ProfileEvents::Event`/`CurrentMetrics::Metric`/`AsynchronousMetrics`
// When push mode is enabled, it also defines the interval at which the Prometheus client pushes to the pushgateway.
metrics_interval = conf.getInt(status_metrics_interval, 15);
Expand Down Expand Up @@ -245,11 +251,7 @@ MetricsPrometheus::MetricsPrometheus(Context & context, const AsynchronousMetric
const auto & labels = prometheus::Gateway::GetInstanceLabel(getInstanceValue(conf));
gateway = std::make_shared<prometheus::Gateway>(host, port, job_name, labels);
gateway->RegisterCollectable(tiflash_metrics.registry);
if (context.getSharedContextDisagg()->isDisaggregatedComputeMode()
&& context.getSharedContextDisagg()->use_autoscaler)
{
gateway->RegisterCollectable(tiflash_metrics.cn_process_collector);
}
gateway->RegisterCollectable(tiflash_metrics.process_collector);

LOG_INFO(log, "Enable prometheus push mode; interval = {}; addr = {}", metrics_interval, metrics_addr);
}
Expand All @@ -268,12 +270,9 @@ MetricsPrometheus::MetricsPrometheus(Context & context, const AsynchronousMetric
addr = listen_host + ":" + metrics_port;
if (context.getSecurityConfig()->hasTlsConfig() && !conf.getBool(status_disable_metrics_tls, false))
{
std::vector<std::weak_ptr<prometheus::Collectable>> collectables{tiflash_metrics.registry};
if (context.getSharedContextDisagg()->isDisaggregatedComputeMode()
&& context.getSharedContextDisagg()->use_autoscaler)
{
collectables.push_back(tiflash_metrics.cn_process_collector);
}
std::vector<std::weak_ptr<prometheus::Collectable>> collectables{
tiflash_metrics.registry,
tiflash_metrics.process_collector};
server = getHTTPServer(context, collectables, addr);
server->start();
LOG_INFO(
Expand All @@ -286,11 +285,7 @@ MetricsPrometheus::MetricsPrometheus(Context & context, const AsynchronousMetric
{
exposer = std::make_shared<prometheus::Exposer>(addr);
exposer->RegisterCollectable(tiflash_metrics.registry);
if (context.getSharedContextDisagg()->isDisaggregatedComputeMode()
&& context.getSharedContextDisagg()->use_autoscaler)
{
exposer->RegisterCollectable(tiflash_metrics.cn_process_collector);
}
exposer->RegisterCollectable(tiflash_metrics.process_collector);
LOG_INFO(
log,
"Enable prometheus pull mode; Listen Host = {}, Metrics Port = {}",
Expand Down
2 changes: 2 additions & 0 deletions libs/libprocess_metrics/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/target

1 change: 1 addition & 0 deletions libs/libprocess_metrics/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ add_custom_command(OUTPUT ${_PROCESS_METRICS_LIBRARY}
COMMENT "Building process_metrics"
COMMAND cargo build --release --target-dir ${CMAKE_CURRENT_BINARY_DIR}
VERBATIM
USES_TERMINAL
WORKING_DIRECTORY ${_PROCESS_METRICS_SOURCE_DIR}
DEPENDS "${_PROCESS_METRICS_SRCS}"
"${_PROCESS_METRICS_HEADERS}"
Expand Down
Loading

0 comments on commit bfb8c5d

Please sign in to comment.