Skip to content

Commit

Permalink
feat: expose tx_queue_len metric
Browse files Browse the repository at this point in the history
This metric shows how heavily loaded Dragonfly's transaction queue is.

Signed-off-by: Roman Gershman <[email protected]>
  • Loading branch information
romange committed Dec 23, 2023
1 parent 1376295 commit ba68939
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 10 deletions.
19 changes: 11 additions & 8 deletions src/server/server_family.cc
Original file line number Diff line number Diff line change
Expand Up @@ -753,9 +753,9 @@ void PrintPrometheusMetrics(const Metrics& m, StringResponse* resp) {
// Server metrics
AppendMetricHeader("version", "", MetricType::GAUGE, &resp->body());
AppendMetricValue("version", 1, {"version"}, {GetVersion()}, &resp->body());
AppendMetricHeader("role", "", MetricType::GAUGE, &resp->body());
AppendMetricValue("role", 1, {"role"}, {m.is_master ? "master" : "replica"}, &resp->body());
AppendMetricWithoutLabels("master", "1 if master 0 if replica", m.is_master ? 1 : 0,

bool is_master = ServerState::tlocal()->is_master;
AppendMetricWithoutLabels("master", "1 if master 0 if replica", is_master ? 1 : 0,
MetricType::GAUGE, &resp->body());
AppendMetricWithoutLabels("uptime_in_seconds", "", m.uptime, MetricType::COUNTER, &resp->body());

Expand Down Expand Up @@ -883,7 +883,7 @@ void PrintPrometheusMetrics(const Metrics& m, StringResponse* resp) {
double longrun_seconds = m.fiber_longrun_usec * 1e-6;
AppendMetricWithoutLabels("fiber_longrun_seconds_total", "", longrun_seconds, MetricType::COUNTER,
&resp->body());

AppendMetricWithoutLabels("tx_queue_len", "", m.tx_queue_len, MetricType::GAUGE, &resp->body());
absl::StrAppend(&resp->body(), db_key_metrics);
absl::StrAppend(&resp->body(), db_key_expire_metrics);
}
Expand Down Expand Up @@ -1495,6 +1495,8 @@ Metrics ServerFamily::GetMetrics() const {

result.traverse_ttl_per_sec += shard->GetMovingSum6(EngineShard::TTL_TRAVERSE);
result.delete_ttl_per_sec += shard->GetMovingSum6(EngineShard::TTL_DELETE);
if (result.tx_queue_len < shard->txq()->size())
result.tx_queue_len = shard->txq()->size();
}

service_.mutable_registry()->MergeCallStats(index, cmd_stat_cb);
Expand All @@ -1507,11 +1509,12 @@ Metrics ServerFamily::GetMetrics() const {
result.traverse_ttl_per_sec /= 6;
result.delete_ttl_per_sec /= 6;

result.is_master = ServerState::tlocal() && ServerState::tlocal()->is_master;
if (result.is_master)
bool is_master = ServerState::tlocal() && ServerState::tlocal()->is_master;
if (is_master)
result.replication_metrics = dfly_cmd_->GetReplicasRoleInfo();

// Update peak stats
// Update peak stats. We rely on the fact that GetMetrics is called frequently enough to
// update peak_stats_ from it.
lock_guard lk{peak_stats_mu_};
UpdateMax(&peak_stats_.conn_dispatch_queue_bytes, result.conn_stats.dispatch_queue_bytes);
UpdateMax(&peak_stats_.conn_read_buf_capacity, result.conn_stats.read_buf_capacity);
Expand Down Expand Up @@ -1628,7 +1631,7 @@ void ServerFamily::Info(CmdArgList args, ConnectionContext* cntx) {
append("maxmemory_policy", "noeviction");
}

if (m.is_master && !m.replication_metrics.empty()) {
if (!m.replication_metrics.empty()) {
ReplicationMemoryStats repl_mem;
dfly_cmd_->GetReplicationMemoryStats(&repl_mem);
append("replication_streaming_buffer_bytes", repl_mem.streamer_buf_capacity_bytes_);
Expand Down
5 changes: 3 additions & 2 deletions src/server/server_family.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,11 @@ struct Metrics {
uint64_t fiber_longrun_cnt = 0;
uint64_t fiber_longrun_usec = 0;

// Maximum length among all the tx shard-queues.
uint32_t tx_queue_len = 0;

// command call frequencies (count, aggregated latency in usec).
std::map<std::string, std::pair<uint64_t, uint64_t>> cmd_stats_map;

bool is_master = true;
std::vector<ReplicaRoleInfo> replication_metrics;
};

Expand Down

0 comments on commit ba68939

Please sign in to comment.