Remove user from state key metric value #5453

1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -52,6 +52,7 @@
* [BUGFIX] Store Gateway: Fix bug in store gateway ring comparison logic. #5426
* [BUGFIX] Ring: Fix bug in consistency of Get func in a scaling zone-aware ring. #5429
* [BUGFIX] Query Frontend: Fix bug of failing to cancel downstream request context in query frontend v2 mode (query scheduler enabled). #5447
* [BUGFIX] Alertmanager: Remove the user id from the state replication key metric label value. #5453

## 1.15.1 2023-04-26

16 changes: 8 additions & 8 deletions pkg/alertmanager/alertmanager_metrics.go
@@ -168,19 +168,19 @@ func newAlertmanagerMetrics() *alertmanagerMetrics {
partialMerges: prometheus.NewDesc(
"cortex_alertmanager_partial_state_merges_total",
"Number of times we have received a partial state to merge for a key.",
[]string{"user"}, nil),
[]string{"user", "type"}, nil),
partialMergesFailed: prometheus.NewDesc(
"cortex_alertmanager_partial_state_merges_failed_total",
"Number of times we have failed to merge a partial state received for a key.",
[]string{"user"}, nil),
[]string{"user", "type"}, nil),
replicationTotal: prometheus.NewDesc(
"cortex_alertmanager_state_replication_total",
"Number of times we have tried to replicate a state to other alertmanagers",
[]string{"user"}, nil),
[]string{"user", "type"}, nil),
replicationFailed: prometheus.NewDesc(
"cortex_alertmanager_state_replication_failed_total",
"Number of times we have failed to replicate a state to other alertmanagers",
[]string{"user"}, nil),
[]string{"user", "type"}, nil),
fetchReplicaStateTotal: prometheus.NewDesc(
"cortex_alertmanager_state_fetch_replica_state_total",
"Number of times we have tried to read and merge the full state from another replica.",
@@ -317,10 +317,10 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {

data.SendMaxOfGaugesPerUser(out, m.configHashValue, "alertmanager_config_hash")

data.SendSumOfCountersPerUser(out, m.partialMerges, "alertmanager_partial_state_merges_total")
data.SendSumOfCountersPerUser(out, m.partialMergesFailed, "alertmanager_partial_state_merges_failed_total")
data.SendSumOfCountersPerUser(out, m.replicationTotal, "alertmanager_state_replication_total")
data.SendSumOfCountersPerUser(out, m.replicationFailed, "alertmanager_state_replication_failed_total")
data.SendSumOfCountersPerUserWithLabels(out, m.partialMerges, "alertmanager_partial_state_merges_total", "type")
data.SendSumOfCountersPerUserWithLabels(out, m.partialMergesFailed, "alertmanager_partial_state_merges_failed_total", "type")
data.SendSumOfCountersPerUserWithLabels(out, m.replicationTotal, "alertmanager_state_replication_total", "type")
data.SendSumOfCountersPerUserWithLabels(out, m.replicationFailed, "alertmanager_state_replication_failed_total", "type")
data.SendSumOfCounters(out, m.fetchReplicaStateTotal, "alertmanager_state_fetch_replica_state_total")
data.SendSumOfCounters(out, m.fetchReplicaStateFailed, "alertmanager_state_fetch_replica_state_failed_total")
data.SendSumOfCounters(out, m.initialSyncTotal, "alertmanager_state_initial_sync_total")
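The Collect() changes above switch from SendSumOfCountersPerUser to SendSumOfCountersPerUserWithLabels so the new "type" label survives per-tenant aggregation instead of being summed away. Below is a minimal, self-contained sketch of the output shape this is expected to produce, using only client_golang (not the Cortex util helpers); the tenant names and values are illustrative, not taken from the PR.

```go
// Sketch only: re-export per-tenant counters, keyed by state type, with both
// "user" and "type" label values, as the aggregated cortex_* metrics now do.
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
)

func main() {
	desc := prometheus.NewDesc(
		"cortex_alertmanager_partial_state_merges_total",
		"Number of times we have received a partial state to merge for a key.",
		[]string{"user", "type"}, nil)

	// Per-tenant counter values by state type, as they might look inside each
	// tenant's Alertmanager registry (illustrative values).
	samples := []struct {
		user      string
		stateType string
		value     float64
	}{
		{"user1", "nfl", 3},
		{"user2", "nfl", 30},
	}

	for _, s := range samples {
		m := prometheus.MustNewConstMetric(desc, prometheus.CounterValue, s.value, s.user, s.stateType)
		var out dto.Metric
		if err := m.Write(&out); err != nil {
			panic(err)
		}
		fmt.Printf("user=%s type=%s value=%v\n", s.user, s.stateType, out.GetCounter().GetValue())
	}
}
```

The point of the change is that the aggregated series now carry both the tenant and the state type, while the raw replication key, which embeds the tenant id, no longer appears as a label value.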
56 changes: 56 additions & 0 deletions pkg/alertmanager/alertmanager_metrics_test.go
@@ -211,6 +211,17 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
cortex_alertmanager_notifications_total{integration="wechat",user="user2"} 20
cortex_alertmanager_notifications_total{integration="wechat",user="user3"} 200

# HELP cortex_alertmanager_partial_state_merges_failed_total Number of times we have failed to merge a partial state received for a key.
# TYPE cortex_alertmanager_partial_state_merges_failed_total counter
cortex_alertmanager_partial_state_merges_failed_total{type="nfl",user="user1"} 2
cortex_alertmanager_partial_state_merges_failed_total{type="nfl",user="user2"} 20
cortex_alertmanager_partial_state_merges_failed_total{type="nfl",user="user3"} 200
# HELP cortex_alertmanager_partial_state_merges_total Number of times we have received a partial state to merge for a key.
# TYPE cortex_alertmanager_partial_state_merges_total counter
cortex_alertmanager_partial_state_merges_total{type="nfl",user="user1"} 3
cortex_alertmanager_partial_state_merges_total{type="nfl",user="user2"} 30
cortex_alertmanager_partial_state_merges_total{type="nfl",user="user3"} 300

# HELP cortex_alertmanager_silences How many silences by state.
# TYPE cortex_alertmanager_silences gauge
cortex_alertmanager_silences{state="active",user="user1"} 1
@@ -506,6 +517,17 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
cortex_alertmanager_notifications_total{integration="wechat",user="user2"} 20
cortex_alertmanager_notifications_total{integration="wechat",user="user3"} 200

# HELP cortex_alertmanager_partial_state_merges_failed_total Number of times we have failed to merge a partial state received for a key.
# TYPE cortex_alertmanager_partial_state_merges_failed_total counter
cortex_alertmanager_partial_state_merges_failed_total{type="nfl",user="user1"} 2
cortex_alertmanager_partial_state_merges_failed_total{type="nfl",user="user2"} 20
cortex_alertmanager_partial_state_merges_failed_total{type="nfl",user="user3"} 200
# HELP cortex_alertmanager_partial_state_merges_total Number of times we have received a partial state to merge for a key.
# TYPE cortex_alertmanager_partial_state_merges_total counter
cortex_alertmanager_partial_state_merges_total{type="nfl",user="user1"} 3
cortex_alertmanager_partial_state_merges_total{type="nfl",user="user2"} 30
cortex_alertmanager_partial_state_merges_total{type="nfl",user="user3"} 300

# HELP cortex_alertmanager_silences How many silences by state.
# TYPE cortex_alertmanager_silences gauge
cortex_alertmanager_silences{state="active",user="user1"} 1
@@ -758,6 +780,15 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
cortex_alertmanager_notifications_total{integration="wechat",user="user1"} 2
cortex_alertmanager_notifications_total{integration="wechat",user="user2"} 20

# HELP cortex_alertmanager_partial_state_merges_failed_total Number of times we have failed to merge a partial state received for a key.
# TYPE cortex_alertmanager_partial_state_merges_failed_total counter
cortex_alertmanager_partial_state_merges_failed_total{type="nfl",user="user1"} 2
cortex_alertmanager_partial_state_merges_failed_total{type="nfl",user="user2"} 20
# HELP cortex_alertmanager_partial_state_merges_total Number of times we have received a partial state to merge for a key.
# TYPE cortex_alertmanager_partial_state_merges_total counter
cortex_alertmanager_partial_state_merges_total{type="nfl",user="user1"} 3
cortex_alertmanager_partial_state_merges_total{type="nfl",user="user2"} 30

# HELP cortex_alertmanager_silences How many silences by state.
# TYPE cortex_alertmanager_silences gauge
cortex_alertmanager_silences{state="active",user="user1"} 1
@@ -898,6 +929,10 @@ func populateAlertmanager(base float64) *prometheus.Registry {
lm.size.Set(100 * base)
lm.insertFailures.Add(7 * base)

sr := newStateReplicationMetrics(reg)
sr.partialStateMergesFailed.WithLabelValues("nfl").Add(base * 2)
sr.partialStateMergesTotal.WithLabelValues("nfl").Add(base * 3)

return reg
}

@@ -1130,3 +1165,24 @@ func newLimiterMetrics(r prometheus.Registerer) *limiterMetrics {
insertFailures: insertAlertFailures,
}
}

type stateReplicationMetrics struct {
partialStateMergesTotal *prometheus.CounterVec
partialStateMergesFailed *prometheus.CounterVec
}

func newStateReplicationMetrics(r prometheus.Registerer) *stateReplicationMetrics {
partialStateMergesTotal := promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Name: "alertmanager_partial_state_merges_total",
Help: "Number of times we have received a partial state to merge for a key.",
}, []string{"type"})
partialStateMergesFailed := promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Name: "alertmanager_partial_state_merges_failed_total",
Help: "Number of times we have failed to merge a partial state received for a key.",
}, []string{"type"})

return &stateReplicationMetrics{
partialStateMergesTotal: partialStateMergesTotal,
partialStateMergesFailed: partialStateMergesFailed,
}
}
39 changes: 26 additions & 13 deletions pkg/alertmanager/state_replication.go
@@ -3,6 +3,7 @@ package alertmanager
import (
"context"
"fmt"
"strings"
"sync"
"time"

@@ -79,19 +80,19 @@ func newReplicatedStates(userID string, rf int, re Replicator, st alertstore.Ale
partialStateMergesTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Name: "alertmanager_partial_state_merges_total",
Help: "Number of times we have received a partial state to merge for a key.",
}, []string{"key"}),
}, []string{"type"}),
partialStateMergesFailed: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Name: "alertmanager_partial_state_merges_failed_total",
Help: "Number of times we have failed to merge a partial state received for a key.",
}, []string{"key"}),
}, []string{"type"}),
stateReplicationTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Name: "alertmanager_state_replication_total",
Help: "Number of times we have tried to replicate a state to other alertmanagers.",
}, []string{"key"}),
}, []string{"type"}),
stateReplicationFailed: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Name: "alertmanager_state_replication_failed_total",
Help: "Number of times we have failed to replicate a state to other alertmanagers.",
}, []string{"key"}),
}, []string{"type"}),
fetchReplicaStateTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "alertmanager_state_fetch_replica_state_total",
Help: "Number of times we have tried to read and merge the full state from another replica.",
@@ -131,10 +132,11 @@ func (s *state) AddState(key string, cs cluster.State, _ prometheus.Registerer)

s.states[key] = cs

s.partialStateMergesTotal.WithLabelValues(key)
s.partialStateMergesFailed.WithLabelValues(key)
s.stateReplicationTotal.WithLabelValues(key)
s.stateReplicationFailed.WithLabelValues(key)
stateType := getStateTypeFromKey(key)
s.partialStateMergesTotal.WithLabelValues(stateType)
s.partialStateMergesFailed.WithLabelValues(stateType)
s.stateReplicationTotal.WithLabelValues(stateType)
s.stateReplicationFailed.WithLabelValues(stateType)

return &stateChannel{
s: s,
@@ -144,18 +146,19 @@ func (s *state) AddState(key string, cs cluster.State, _ prometheus.Registerer)

// MergePartialState merges a received partial message with an internal state.
func (s *state) MergePartialState(p *clusterpb.Part) error {
s.partialStateMergesTotal.WithLabelValues(p.Key).Inc()
stateType := getStateTypeFromKey(p.Key)
s.partialStateMergesTotal.WithLabelValues(stateType).Inc()

s.mtx.Lock()
defer s.mtx.Unlock()
st, ok := s.states[p.Key]
if !ok {
s.partialStateMergesFailed.WithLabelValues(p.Key).Inc()
s.partialStateMergesFailed.WithLabelValues(stateType).Inc()
return fmt.Errorf("key not found while merging")
}

if err := st.Merge(p.Data); err != nil {
s.partialStateMergesFailed.WithLabelValues(p.Key).Inc()
s.partialStateMergesFailed.WithLabelValues(stateType).Inc()
return err
}

@@ -285,9 +288,10 @@ func (s *state) running(ctx context.Context) error {
return nil
}

s.stateReplicationTotal.WithLabelValues(p.Key).Inc()
stateType := getStateTypeFromKey(p.Key)
s.stateReplicationTotal.WithLabelValues(stateType).Inc()
if err := s.replicator.ReplicateStateForUser(ctx, s.userID, p); err != nil {
s.stateReplicationFailed.WithLabelValues(p.Key).Inc()
s.stateReplicationFailed.WithLabelValues(stateType).Inc()
level.Error(s.logger).Log("msg", "failed to replicate state to other alertmanagers", "user", s.userID, "key", p.Key, "err", err)
}
case <-ctx.Done():
@@ -314,3 +318,12 @@ type stateChannel struct {
func (c *stateChannel) Broadcast(b []byte) {
c.s.broadcast(c.key, b)
}

// getStateTypeFromKey extracts the state type from a state key (the part before the first ':').
func getStateTypeFromKey(key string) string {
index := strings.IndexByte(key, ':')
if index < 0 {
return key
}
return key[:index]
}
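For illustration, here is a standalone sketch of how the helper above behaves for the key shapes exercised in this PR ("<type>:<userID>", e.g. "nflog:user-1" in the tests below) and for a key with no separator; the helper body is copied so the snippet compiles on its own, and the sample keys are only examples.

```go
// Sketch only: demonstrate getStateTypeFromKey on a few example keys.
package main

import (
	"fmt"
	"strings"
)

// Copied from state_replication.go above so this example is self-contained.
func getStateTypeFromKey(key string) string {
	index := strings.IndexByte(key, ':')
	if index < 0 {
		return key
	}
	return key[:index]
}

func main() {
	fmt.Println(getStateTypeFromKey("nflog:user-1")) // "nflog" — the user id is dropped from the label value
	fmt.Println(getStateTypeFromKey("nfl:user-2"))   // "nfl"
	fmt.Println(getStateTypeFromKey("nflog"))        // "nflog" — no separator, key returned unchanged
}
```

Returning the whole key when no ':' is present keeps the metric usable for any key that does not embed a user id.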
10 changes: 5 additions & 5 deletions pkg/alertmanager/state_replication_test.go
@@ -143,7 +143,7 @@ func TestStateReplication(t *testing.T) {
require.NoError(t, s.WaitReady(ctx))
}

ch := s.AddState("nflog", &fakeState{}, reg)
ch := s.AddState("nflog:user-1", &fakeState{}, reg)

part := tt.message
d, err := part.Marshal()
@@ -166,10 +166,10 @@ alertmanager_state_fetch_replica_state_failed_total 0
alertmanager_state_fetch_replica_state_total 1
# HELP alertmanager_partial_state_merges_failed_total Number of times we have failed to merge a partial state received for a key.
# TYPE alertmanager_partial_state_merges_failed_total counter
alertmanager_partial_state_merges_failed_total{key="nflog"} 0
alertmanager_partial_state_merges_failed_total{type="nflog"} 0
# HELP alertmanager_partial_state_merges_total Number of times we have received a partial state to merge for a key.
# TYPE alertmanager_partial_state_merges_total counter
alertmanager_partial_state_merges_total{key="nflog"} 0
alertmanager_partial_state_merges_total{type="nflog"} 0
# HELP alertmanager_state_initial_sync_completed_total Number of times we have completed syncing initial state for each possible outcome.
# TYPE alertmanager_state_initial_sync_completed_total counter
alertmanager_state_initial_sync_completed_total{outcome="failed"} 0
@@ -181,10 +181,10 @@ alertmanager_state_initial_sync_completed_total{outcome="user-not-found"} 0
alertmanager_state_initial_sync_total 1
# HELP alertmanager_state_replication_failed_total Number of times we have failed to replicate a state to other alertmanagers.
# TYPE alertmanager_state_replication_failed_total counter
alertmanager_state_replication_failed_total{key="nflog"} 0
alertmanager_state_replication_failed_total{type="nflog"} 0
# HELP alertmanager_state_replication_total Number of times we have tried to replicate a state to other alertmanagers.
# TYPE alertmanager_state_replication_total counter
alertmanager_state_replication_total{key="nflog"} 1
alertmanager_state_replication_total{type="nflog"} 1
`),
"alertmanager_state_fetch_replica_state_failed_total",
"alertmanager_state_fetch_replica_state_total",