Skip to content

Commit

Permalink
Export last replay age in replication collector (#1085)
Browse files Browse the repository at this point in the history
The exported replication lag does not handle all failure modes, and can
report 0 for replicas that are out of sync and incapable of recovery.

A proper replacement for that metric would require a different approach
(see e.g. #1007), but for many users, simply exporting the age of the
last replay can provide a fairly strong signal that something is
amiss.

I think this solution might be preferable to #977, though the lag
metric needs to be fixed or abandoned eventually.

Signed-off-by: Conrad Hoffmann <[email protected]>
  • Loading branch information
bitfehler authored Feb 15, 2025
1 parent 2ee2a8f commit c3885e8
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 4 deletions.
19 changes: 17 additions & 2 deletions collector/pg_replication.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,15 @@ var (
"Indicates if the server is a replica",
[]string{}, nil,
)
// pgReplicationLastReplay describes the label-less metric
// "<namespace>_<replicationSubsystem>_last_replay_seconds", which reports the
// age of the last replay in seconds.
pgReplicationLastReplay = prometheus.NewDesc(
prometheus.BuildFQName(
namespace,
replicationSubsystem,
"last_replay_seconds",
),
"Age of last replay in seconds",
[]string{}, nil,
)

pgReplicationQuery = `SELECT
CASE
Expand All @@ -61,7 +70,8 @@ var (
CASE
WHEN pg_is_in_recovery() THEN 1
ELSE 0
END as is_replica`
END as is_replica,
GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) as last_replay`
)

func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance, ch chan<- prometheus.Metric) error {
Expand All @@ -72,7 +82,8 @@ func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance,

var lag float64
var isReplica int64
err := row.Scan(&lag, &isReplica)
var replayAge float64
err := row.Scan(&lag, &isReplica, &replayAge)
if err != nil {
return err
}
Expand All @@ -84,5 +95,9 @@ func (c *PGReplicationCollector) Update(ctx context.Context, instance *instance,
pgReplicationIsReplica,
prometheus.GaugeValue, float64(isReplica),
)
ch <- prometheus.MustNewConstMetric(
pgReplicationLastReplay,
prometheus.GaugeValue, replayAge,
)
return nil
}
5 changes: 3 additions & 2 deletions collector/pg_replication_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ func TestPgReplicationCollector(t *testing.T) {

inst := &instance{db: db}

columns := []string{"lag", "is_replica"}
columns := []string{"lag", "is_replica", "last_replay"}
rows := sqlmock.NewRows(columns).
AddRow(1000, 1)
AddRow(1000, 1, 3)
mock.ExpectQuery(sanitizeQuery(pgReplicationQuery)).WillReturnRows(rows)

ch := make(chan prometheus.Metric)
Expand All @@ -49,6 +49,7 @@ func TestPgReplicationCollector(t *testing.T) {
expected := []MetricResult{
{labels: labelMap{}, value: 1000, metricType: dto.MetricType_GAUGE},
{labels: labelMap{}, value: 1, metricType: dto.MetricType_GAUGE},
{labels: labelMap{}, value: 3, metricType: dto.MetricType_GAUGE},
}

convey.Convey("Metrics comparison", t, func() {
Expand Down

0 comments on commit c3885e8

Please sign in to comment.