diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go index 639d6827ab..510b166beb 100644 --- a/cmd/alertmanager/main.go +++ b/cmd/alertmanager/main.go @@ -63,6 +63,8 @@ var ( Name: "alertmanager_config_last_reload_success_timestamp_seconds", Help: "Timestamp of the last successful configuration reload.", }) + alertsActive prometheus.GaugeFunc + alertsSuppressed prometheus.GaugeFunc ) func init() { @@ -72,6 +74,27 @@ func init() { prometheus.MustRegister(version.NewCollector("alertmanager")) } +func newAlertMetricByState(marker types.Marker, st types.AlertState) prometheus.GaugeFunc { + return prometheus.NewGaugeFunc( + prometheus.GaugeOpts{ + Name: "alertmanager_alerts", + Help: "How many alerts by state.", + ConstLabels: prometheus.Labels{"state": string(st)}, + }, + func() float64 { + return float64(marker.Count(st)) + }, + ) +} + +func newMarkerMetrics(marker types.Marker) { + alertsActive = newAlertMetricByState(marker, types.AlertStateActive) + alertsSuppressed = newAlertMetricByState(marker, types.AlertStateSuppressed) + + prometheus.MustRegister(alertsActive) + prometheus.MustRegister(alertsSuppressed) +} + func main() { peers := &stringset{} var ( @@ -148,6 +171,7 @@ func main() { } marker := types.NewMarker() + newMarkerMetrics(marker) silenceOpts := silence.Options{ SnapshotFile: filepath.Join(*dataDir, "silences"), diff --git a/silence/silence.go b/silence/silence.go index 1a869264f1..1d48009636 100644 --- a/silence/silence.go +++ b/silence/silence.go @@ -113,9 +113,29 @@ type metrics struct { queriesTotal prometheus.Counter queryErrorsTotal prometheus.Counter queryDuration prometheus.Histogram + silencesActive prometheus.GaugeFunc + silencesPending prometheus.GaugeFunc + silencesExpired prometheus.GaugeFunc +} + +func newSilenceMetricByState(s *Silences, st SilenceState) prometheus.GaugeFunc { + return prometheus.NewGaugeFunc( + prometheus.GaugeOpts{ + Name: "alertmanager_silences", + Help: "How many silences by state.", + ConstLabels: prometheus.Labels{"state": string(st)}, + }, + func() float64 { + count, err := s.CountState(st) + if err != nil { + s.logger.With("err", err).Error("counting silences failed") + } + return float64(count) + }, + ) } -func newMetrics(r prometheus.Registerer) *metrics { +func newMetrics(r prometheus.Registerer, s *Silences) *metrics { m := &metrics{} m.gcDuration = prometheus.NewSummary(prometheus.SummaryOpts{ @@ -138,6 +158,11 @@ func newMetrics(r prometheus.Registerer) *metrics { Name: "alertmanager_silences_query_duration_seconds", Help: "Duration of silence query evaluation.", }) + if s != nil { + m.silencesActive = newSilenceMetricByState(s, StateActive) + m.silencesPending = newSilenceMetricByState(s, StatePending) + m.silencesExpired = newSilenceMetricByState(s, StateExpired) + } if r != nil { r.MustRegister( @@ -146,6 +171,9 @@ func newMetrics(r prometheus.Registerer) *metrics { m.queriesTotal, m.queryErrorsTotal, m.queryDuration, + m.silencesActive, + m.silencesPending, + m.silencesExpired, ) } return m @@ -195,12 +223,13 @@ func New(o Options) (*Silences, error) { s := &Silences{ mc: matcherCache{}, logger: log.NewNopLogger(), - metrics: newMetrics(o.Metrics), retention: o.Retention, now: utcNow, gossip: nopGossip{}, st: newGossipData(), } + s.metrics = newMetrics(o.Metrics, s) + if o.Logger != nil { s.logger = o.Logger } @@ -587,6 +616,16 @@ func (s *Silences) Query(params ...QueryParam) ([]*pb.Silence, error) { return sils, err } +// Count silences by state. +func (s *Silences) CountState(states ...SilenceState) (int, error) { + // This could probably be optimized. + sils, err := s.Query(QState(states...)) + if err != nil { + return -1, err + } + return len(sils), nil +} + func (s *Silences) query(q *query, now time.Time) ([]*pb.Silence, error) { // If we have an ID constraint, all silences are our base set. // This and the use of post-filter functions is the diff --git a/silence/silence_test.go b/silence/silence_test.go index 6fae48a5cb..ea0bd2d4f2 100644 --- a/silence/silence_test.go +++ b/silence/silence_test.go @@ -142,7 +142,7 @@ func TestSilencesSnapshot(t *testing.T) { f, err := ioutil.TempFile("", "snapshot") require.NoError(t, err, "creating temp file failed") - s1 := &Silences{st: newGossipData(), metrics: newMetrics(nil)} + s1 := &Silences{st: newGossipData(), metrics: newMetrics(nil, nil)} // Setup internal state manually. for _, e := range c.entries { s1.st.data[e.Silence.Id] = e @@ -778,6 +778,10 @@ func TestSilenceExpire(t *testing.T) { }, } + count, err := s.CountState(StatePending) + require.NoError(t, err) + require.Equal(t, 1, count) + require.NoError(t, s.expire("pending")) require.NoError(t, s.expire("active")) @@ -794,6 +798,11 @@ func TestSilenceExpire(t *testing.T) { EndsAt: now, UpdatedAt: now, }, sil) + + count, err = s.CountState(StatePending) + require.NoError(t, err) + require.Equal(t, 0, count) + // Expiring a pending Silence should make the API return the // SilenceStateExpired Silence state. silenceState := types.CalcSilenceState(sil.StartsAt, sil.EndsAt) diff --git a/types/types.go b/types/types.go index 4677866693..159f9557d1 100644 --- a/types/types.go +++ b/types/types.go @@ -45,6 +45,8 @@ type Marker interface { SetInhibited(alert model.Fingerprint, ids ...string) SetSilenced(alert model.Fingerprint, ids ...string) + Count(...AlertState) int + Status(model.Fingerprint) AlertStatus Delete(model.Fingerprint) @@ -67,6 +69,27 @@ type memMarker struct { mtx sync.RWMutex } +// Count alerts of a given state. +func (m *memMarker) Count(states ...AlertState) int { + count := 0 + + m.mtx.RLock() + defer m.mtx.RUnlock() + + if len(states) == 0 { + count = len(m.m) + } else { + for _, status := range m.m { + for _, state := range states { + if status.State == state { + count += 1 + } + } + } + } + return count +} + // SetSilenced sets the AlertStatus to suppressed and stores the associated silence IDs. func (m *memMarker) SetSilenced(alert model.Fingerprint, ids ...string) { m.mtx.Lock()