Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

libcommon: Provide RSS details metrics #9218

Merged
merged 4 commits into from
Aug 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 58 additions & 31 deletions dbms/src/Common/ProcessCollector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,44 +18,71 @@
namespace DB
{

// Capture the process start time exactly once at construction; Collect() only
// re-reads the live metrics (cpu/rss/vsize) on every scrape.
// NOTE(review): the header diff in this PR removes this constructor together
// with the `start_time` gauge — this hunk shows the pre-change code. Confirm
// against the merged tree before relying on it.
ProcessCollector::ProcessCollector()
{
auto info = get_process_metrics();
start_time.Set(info.start_time);
}

// Snapshot process-level metrics (CPU seconds, virtual/resident memory, start
// time, RSS-by-type) and return them as prometheus metric families.
// Called by the prometheus-cpp exposer/gateway on every scrape.
// NOTE(review): this is a diff-view transcription; some lines below are the
// pre-change version fused with the post-change version (flagged inline).
std::vector<prometheus::MetricFamily> ProcessCollector::Collect() const
{
// Fresh readings from the process_metrics library on every scrape.
auto new_info = get_process_metrics();

// Gauge is thread safe, no need to lock.
// NOTE(review): the gauges below (cpu_total/vsize/rss) are removed by this
// PR's header change — these look like pre-change lines kept by the diff view.
auto past_cpu_total = cpu_total.Value();
cpu_total.Increment(new_info.cpu_total - past_cpu_total);
vsize.Set(new_info.vsize);
rss.Set(new_info.rss);

// NOTE(review): "familes" is a long-standing typo for "families"; left as-is
// here because this transcription must stay byte-identical to the diff.
std::vector<prometheus::MetricFamily> familes;
familes.reserve(4);
familes.emplace_back(prometheus::MetricFamily{
CPU_METRIC_NAME,
CPU_METRIC_HELP,
prometheus::MetricType::Gauge,
std::vector<prometheus::ClientMetric>{cpu_total.Collect()}});
familes.emplace_back(prometheus::MetricFamily{
VSIZE_METRIC_NAME,
VSIZE_METRIC_HELP,
prometheus::MetricType::Gauge,
std::vector<prometheus::ClientMetric>{vsize.Collect()}});
familes.emplace_back(prometheus::MetricFamily{
RSS_METRIC_NAME,
RSS_METRIC_HELP,
prometheus::MetricType::Gauge,
std::vector<prometheus::ClientMetric>{rss.Collect()}});

// The following metrics shadow TiFlash proxy metrics, so that we ensure these metrics are available
// in disaggregated mode, where TiFlash proxy may not start at all.
// Note that, even in non-disaggregated mode, duplicates are fine when being collected by Prometheus,
// because TiFlash proxy and TiFlash have different metrics endpoints. However we will see multiple
// endpoints in the Grafana, because both TiFlash proxy and TiFlash use the same metric name.
// To avoid duplicates in Grafana, we will only include proxy metrics when proxy is not enabled.
if (include_proxy_metrics)
{
familes.emplace_back(prometheus::MetricFamily{
"tiflash_proxy_process_cpu_seconds_total",
"Total user and system CPU time spent in seconds.",
prometheus::MetricType::Gauge,
{
prometheus::ClientMetric{.gauge = {static_cast<double>(new_info.cpu_total)}},
}});

familes.emplace_back(prometheus::MetricFamily{
"tiflash_proxy_process_virtual_memory_bytes",
"Virtual memory size in bytes.",
prometheus::MetricType::Gauge,
{
prometheus::ClientMetric{.gauge = {static_cast<double>(new_info.vsize)}},
}});
familes.emplace_back(prometheus::MetricFamily{
"tiflash_proxy_process_resident_memory_bytes",
"Resident memory size in bytes.",
prometheus::MetricType::Gauge,
{
prometheus::ClientMetric{.gauge = {static_cast<double>(new_info.rss)}},
}});
familes.emplace_back(prometheus::MetricFamily{
"tiflash_proxy_process_start_time_seconds",
"Start time of the process since unix epoch in seconds.",
prometheus::MetricType::Gauge,
{
prometheus::ClientMetric{.gauge = {static_cast<double>(new_info.start_time)}},
}});
}

// The following metrics are TiFlash specific process metrics.
familes.emplace_back(prometheus::MetricFamily{
// NOTE(review): the next two lines (START_TIME_*) are the removed version
// and the two after them are the added version — the diff view interleaves
// both. The merged code presumably keeps only the rss_by_type pair.
START_TIME_METRIC_NAME,
START_TIME_METRIC_HELP,
"tiflash_process_rss_by_type_bytes",
"Resident memory size by type in bytes.",
prometheus::MetricType::Gauge,
// NOTE(review): `start_time.Collect()` below is the removed line; the
// brace-initializer list that follows is the added replacement.
std::vector<prometheus::ClientMetric>{start_time.Collect()}});
{
prometheus::ClientMetric{
.label = {{"type", "anon"}},
.gauge = {static_cast<double>(new_info.rss_anon)},
},
prometheus::ClientMetric{
.label = {{"type", "file"}},
.gauge = {static_cast<double>(new_info.rss_file)},
},
prometheus::ClientMetric{
.label = {{"type", "shared"}},
.gauge = {static_cast<double>(new_info.rss_shared)},
},
}});

return familes;
}

Expand Down
23 changes: 5 additions & 18 deletions dbms/src/Common/ProcessCollector.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@

#pragma once

#include <Common/ProcessCollector_fwd.h>
#include <ProcessMetrics/ProcessMetrics.h>
#include <prometheus/counter.h>
#include <prometheus/family.h>
#include <prometheus/metric_family.h>
#include <prometheus/registry.h>

Expand All @@ -27,28 +29,13 @@ namespace DB
// Just like the original tiflash-proxy logic.
// 2. Current implementation of async_metrics interval is 15s; it's too large. And this interval also affects the pushgateway interval.
// So better not to mix cpu/mem metrics with async_metrics.
// The difference between ProcessCollector and prometheus::Registry:
// 1. ProcessCollector will **update** Gauge then collect. prometheus::Registry only collects Gauge.
// Collector for process-level metrics (CPU time, memory, start time),
// registered with both the pull exposer and the push gateway.
// NOTE(review): this diff view shows removed members (metric-name constants,
// the constructor, and the four Gauge fields) alongside the added
// `include_proxy_metrics` flag; the merged class keeps only the latter.
class ProcessCollector : public prometheus::Collectable
{
public:
static constexpr auto CPU_METRIC_NAME = "tiflash_proxy_process_cpu_seconds_total";
static constexpr auto CPU_METRIC_HELP = "Total user and system CPU time spent in seconds.";
static constexpr auto VSIZE_METRIC_NAME = "tiflash_proxy_process_virtual_memory_bytes";
static constexpr auto VSIZE_METRIC_HELP = "Virtual memory size in bytes.";
static constexpr auto RSS_METRIC_NAME = "tiflash_proxy_process_resident_memory_bytes";
static constexpr auto RSS_METRIC_HELP = "Resident memory size in bytes.";
static constexpr auto START_TIME_METRIC_NAME = "tiflash_proxy_process_start_time_seconds";
static constexpr auto START_TIME_METRIC_HELP = "Start time of the process since unix epoch in seconds.";

ProcessCollector();

// Called by prometheus-cpp on every scrape; must be const, hence the
// mutable members below.
std::vector<prometheus::MetricFamily> Collect() const override;

private:
mutable prometheus::Gauge cpu_total;
mutable prometheus::Gauge vsize;
mutable prometheus::Gauge rss;
prometheus::Gauge start_time;
public:
// When true, Collect() also emits tiflash_proxy_process_* families that
// shadow the proxy's own metrics (used when the proxy does not run, e.g.
// disaggregated compute mode with AutoScaler). Atomic: toggled from the
// metrics-setup thread while scrapes may be in flight.
mutable std::atomic<bool> include_proxy_metrics = {true};
};

} // namespace DB
22 changes: 22 additions & 0 deletions dbms/src/Common/ProcessCollector_fwd.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

// Forward-declaration header for ProcessCollector, so that headers such as
// TiFlashMetrics.h can hold a shared_ptr<ProcessCollector> without pulling in
// prometheus-cpp. Include <Common/ProcessCollector.h> for the full definition.
namespace DB
{

class ProcessCollector;

} // namespace DB
10 changes: 9 additions & 1 deletion dbms/src/Common/TiFlashMetrics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// limitations under the License.

#include <Common/CurrentMetrics.h>
#include <Common/ProcessCollector.h>
#include <Common/ProfileEvents.h>
#include <Common/TiFlashMetrics.h>
#include <common/defines.h>
Expand All @@ -27,6 +28,8 @@ TiFlashMetrics & TiFlashMetrics::instance()

TiFlashMetrics::TiFlashMetrics()
{
process_collector = std::make_shared<ProcessCollector>();

registered_profile_events.reserve(ProfileEvents::end());
for (ProfileEvents::Event event = 0; event < ProfileEvents::end(); event++)
{
Expand Down Expand Up @@ -202,4 +205,9 @@ void TiFlashMetrics::registerStorageThreadMemory(const std::string & k)
}
}

} // namespace DB
// Toggle whether the process collector also emits the tiflash_proxy_process_*
// metric families (enabled when the TiFlash proxy does not run and cannot
// report them itself).
// `include_proxy_metrics` is a std::atomic<bool>, so an explicit store is
// sufficient — no locking required even while a scrape is in progress.
void TiFlashMetrics::setProvideProxyProcessMetrics(bool v)
{
    process_collector->include_proxy_metrics.store(v);
}

} // namespace DB
9 changes: 4 additions & 5 deletions dbms/src/Common/TiFlashMetrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#include <Common/ComputeLabelHolder.h>
#include <Common/Exception.h>
#include <Common/ProcessCollector.h>
#include <Common/ProcessCollector_fwd.h>
#include <Common/TiFlashBuildInfo.h>
#include <Common/nocopyable.h>
#include <common/types.h>
Expand Down Expand Up @@ -1089,6 +1089,7 @@ struct MetricFamily
return *(resource_group_metrics_map[resource_group_name][idx]);
}


private:
void addMetricsForResourceGroup(const String & resource_group_name)
{
Expand Down Expand Up @@ -1143,6 +1144,7 @@ class TiFlashMetrics
double getStorageThreadMemory(MemoryAllocType type, const std::string & k);
void registerProxyThreadMemory(const std::string & k);
void registerStorageThreadMemory(const std::string & k);
void setProvideProxyProcessMetrics(bool v);

private:
TiFlashMetrics();
Expand All @@ -1157,10 +1159,7 @@ class TiFlashMetrics
static constexpr auto storages_thread_memory_usage = "tiflash_storages_thread_memory_usage";

std::shared_ptr<prometheus::Registry> registry = std::make_shared<prometheus::Registry>();
// Here we add a ProcessCollector to collect cpu/rss/vsize/start_time information.
// Normally, these metrics will be collected by tiflash-proxy,
// but in disaggregated compute mode with AutoScaler, tiflash-proxy will not start, so tiflash will collect these metrics itself.
std::shared_ptr<ProcessCollector> cn_process_collector = std::make_shared<ProcessCollector>();
std::shared_ptr<ProcessCollector> process_collector;

std::vector<prometheus::Gauge *> registered_profile_events;
std::vector<prometheus::Gauge *> registered_current_metrics;
Expand Down
27 changes: 11 additions & 16 deletions dbms/src/Server/MetricsPrometheus.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include <Common/CurrentMetrics.h>
#include <Common/FunctionTimerTask.h>
#include <Common/ProcessCollector.h>
#include <Common/ProfileEvents.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/TiFlashMetrics.h>
Expand Down Expand Up @@ -207,6 +208,11 @@ MetricsPrometheus::MetricsPrometheus(Context & context, const AsynchronousMetric
auto & tiflash_metrics = TiFlashMetrics::instance();
auto & conf = context.getConfigRef();

bool should_provide_proxy_metrics
= (context.getSharedContextDisagg()->isDisaggregatedComputeMode()
&& context.getSharedContextDisagg()->use_autoscaler);
tiflash_metrics.setProvideProxyProcessMetrics(should_provide_proxy_metrics);

// Interval to collect `ProfileEvents::Event`/`CurrentMetrics::Metric`/`AsynchronousMetrics`
// When push mode is enabled, it also define the interval that Prometheus client push to pushgateway.
metrics_interval = conf.getInt(status_metrics_interval, 15);
Expand Down Expand Up @@ -245,11 +251,7 @@ MetricsPrometheus::MetricsPrometheus(Context & context, const AsynchronousMetric
const auto & labels = prometheus::Gateway::GetInstanceLabel(getInstanceValue(conf));
gateway = std::make_shared<prometheus::Gateway>(host, port, job_name, labels);
gateway->RegisterCollectable(tiflash_metrics.registry);
if (context.getSharedContextDisagg()->isDisaggregatedComputeMode()
&& context.getSharedContextDisagg()->use_autoscaler)
{
gateway->RegisterCollectable(tiflash_metrics.cn_process_collector);
}
gateway->RegisterCollectable(tiflash_metrics.process_collector);

LOG_INFO(log, "Enable prometheus push mode; interval = {}; addr = {}", metrics_interval, metrics_addr);
}
Expand All @@ -268,12 +270,9 @@ MetricsPrometheus::MetricsPrometheus(Context & context, const AsynchronousMetric
addr = listen_host + ":" + metrics_port;
if (context.getSecurityConfig()->hasTlsConfig() && !conf.getBool(status_disable_metrics_tls, false))
{
std::vector<std::weak_ptr<prometheus::Collectable>> collectables{tiflash_metrics.registry};
if (context.getSharedContextDisagg()->isDisaggregatedComputeMode()
&& context.getSharedContextDisagg()->use_autoscaler)
{
collectables.push_back(tiflash_metrics.cn_process_collector);
}
std::vector<std::weak_ptr<prometheus::Collectable>> collectables{
tiflash_metrics.registry,
tiflash_metrics.process_collector};
server = getHTTPServer(context, collectables, addr);
server->start();
LOG_INFO(
Expand All @@ -286,11 +285,7 @@ MetricsPrometheus::MetricsPrometheus(Context & context, const AsynchronousMetric
{
exposer = std::make_shared<prometheus::Exposer>(addr);
exposer->RegisterCollectable(tiflash_metrics.registry);
if (context.getSharedContextDisagg()->isDisaggregatedComputeMode()
&& context.getSharedContextDisagg()->use_autoscaler)
{
exposer->RegisterCollectable(tiflash_metrics.cn_process_collector);
}
exposer->RegisterCollectable(tiflash_metrics.process_collector);
LOG_INFO(
log,
"Enable prometheus pull mode; Listen Host = {}, Metrics Port = {}",
Expand Down
2 changes: 2 additions & 0 deletions libs/libprocess_metrics/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/target

1 change: 1 addition & 0 deletions libs/libprocess_metrics/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ add_custom_command(OUTPUT ${_PROCESS_METRICS_LIBRARY}
COMMENT "Building process_metrics"
COMMAND cargo build --release --target-dir ${CMAKE_CURRENT_BINARY_DIR}
VERBATIM
USES_TERMINAL
WORKING_DIRECTORY ${_PROCESS_METRICS_SOURCE_DIR}
DEPENDS "${_PROCESS_METRICS_SRCS}"
"${_PROCESS_METRICS_HEADERS}"
Expand Down
Loading