Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

silence|alerts: add metrics about current silences and alerts #998

Merged
merged 1 commit into from
Oct 5, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions cmd/alertmanager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ var (
Name: "alertmanager_config_last_reload_success_timestamp_seconds",
Help: "Timestamp of the last successful configuration reload.",
})
alertsActive prometheus.GaugeFunc
alertsSuppressed prometheus.GaugeFunc
)

func init() {
Expand All @@ -72,6 +74,27 @@ func init() {
prometheus.MustRegister(version.NewCollector("alertmanager"))
}

// newAlertMetricByState builds the "alertmanager_alerts" gauge for a single
// alert state. The state is attached as the constant "state" label, and the
// gauge value is obtained from marker.Count(st) every time the gauge function
// is invoked.
func newAlertMetricByState(marker types.Marker, st types.AlertState) prometheus.GaugeFunc {
	opts := prometheus.GaugeOpts{
		Name:        "alertmanager_alerts",
		Help:        "How many alerts by state.",
		ConstLabels: prometheus.Labels{"state": string(st)},
	}

	// The closure captures marker and st, so the count is computed lazily
	// rather than once at construction time.
	countAlerts := func() float64 {
		return float64(marker.Count(st))
	}

	return prometheus.NewGaugeFunc(opts, countAlerts)
}

// newMarkerMetrics creates the per-state alert gauges backed by marker, stores
// them in the alertsActive and alertsSuppressed variables, and registers both
// through prometheus.MustRegister.
func newMarkerMetrics(marker types.Marker) {
	alertsActive = newAlertMetricByState(marker, types.AlertStateActive)
	alertsSuppressed = newAlertMetricByState(marker, types.AlertStateSuppressed)

	// Both gauges are assigned before either one is registered; the
	// registration order is active first, then suppressed.
	prometheus.MustRegister(alertsActive, alertsSuppressed)
}

func main() {
peers := &stringset{}
var (
Expand Down Expand Up @@ -148,6 +171,7 @@ func main() {
}

marker := types.NewMarker()
newMarkerMetrics(marker)

silenceOpts := silence.Options{
SnapshotFile: filepath.Join(*dataDir, "silences"),
Expand Down
43 changes: 41 additions & 2 deletions silence/silence.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,29 @@ type metrics struct {
queriesTotal prometheus.Counter
queryErrorsTotal prometheus.Counter
queryDuration prometheus.Histogram
silencesActive prometheus.GaugeFunc
silencesPending prometheus.GaugeFunc
silencesExpired prometheus.GaugeFunc
}

// newSilenceMetricByState builds the "alertmanager_silences" gauge for a
// single silence state. The state is attached as the constant "state" label.
// Each time the gauge function is invoked it calls s.CountState(st); a failure
// is logged through s.logger, and the count returned alongside the error is
// still reported as the gauge value.
func newSilenceMetricByState(s *Silences, st SilenceState) prometheus.GaugeFunc {
	opts := prometheus.GaugeOpts{
		Name:        "alertmanager_silences",
		Help:        "How many silences by state.",
		ConstLabels: prometheus.Labels{"state": string(st)},
	}

	countSilences := func() float64 {
		n, err := s.CountState(st)
		if err != nil {
			// s.logger is read at call time, not captured at construction.
			s.logger.With("err", err).Error("counting silences failed")
		}
		return float64(n)
	}

	return prometheus.NewGaugeFunc(opts, countSilences)
}

func newMetrics(r prometheus.Registerer) *metrics {
func newMetrics(r prometheus.Registerer, s *Silences) *metrics {
m := &metrics{}

m.gcDuration = prometheus.NewSummary(prometheus.SummaryOpts{
Expand All @@ -138,6 +158,11 @@ func newMetrics(r prometheus.Registerer) *metrics {
Name: "alertmanager_silences_query_duration_seconds",
Help: "Duration of silence query evaluation.",
})
if s != nil {
m.silencesActive = newSilenceMetricByState(s, StateActive)
m.silencesPending = newSilenceMetricByState(s, StatePending)
m.silencesExpired = newSilenceMetricByState(s, StateExpired)
}

if r != nil {
r.MustRegister(
Expand All @@ -146,6 +171,9 @@ func newMetrics(r prometheus.Registerer) *metrics {
m.queriesTotal,
m.queryErrorsTotal,
m.queryDuration,
m.silencesActive,
m.silencesPending,
m.silencesExpired,
)
}
return m
Expand Down Expand Up @@ -195,12 +223,13 @@ func New(o Options) (*Silences, error) {
s := &Silences{
mc: matcherCache{},
logger: log.NewNopLogger(),
metrics: newMetrics(o.Metrics),
retention: o.Retention,
now: utcNow,
gossip: nopGossip{},
st: newGossipData(),
}
s.metrics = newMetrics(o.Metrics, s)

if o.Logger != nil {
s.logger = o.Logger
}
Expand Down Expand Up @@ -587,6 +616,16 @@ func (s *Silences) Query(params ...QueryParam) ([]*pb.Silence, error) {
return sils, err
}

// CountState counts silences by state.
func (s *Silences) CountState(states ...SilenceState) (int, error) {
// This could probably be optimized.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm hoping that one day we'll be able to index the silence and alert in-memory databases, so we can search, count, and query a bit more efficiently.

sils, err := s.Query(QState(states...))
if err != nil {
return -1, err
}
return len(sils), nil
}

func (s *Silences) query(q *query, now time.Time) ([]*pb.Silence, error) {
// If we have an ID constraint, all silences are our base set.
// This and the use of post-filter functions is the
Expand Down
11 changes: 10 additions & 1 deletion silence/silence_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ func TestSilencesSnapshot(t *testing.T) {
f, err := ioutil.TempFile("", "snapshot")
require.NoError(t, err, "creating temp file failed")

s1 := &Silences{st: newGossipData(), metrics: newMetrics(nil)}
s1 := &Silences{st: newGossipData(), metrics: newMetrics(nil, nil)}
// Setup internal state manually.
for _, e := range c.entries {
s1.st.data[e.Silence.Id] = e
Expand Down Expand Up @@ -778,6 +778,10 @@ func TestSilenceExpire(t *testing.T) {
},
}

count, err := s.CountState(StatePending)
require.NoError(t, err)
require.Equal(t, 1, count)

require.NoError(t, s.expire("pending"))
require.NoError(t, s.expire("active"))

Expand All @@ -794,6 +798,11 @@ func TestSilenceExpire(t *testing.T) {
EndsAt: now,
UpdatedAt: now,
}, sil)

count, err = s.CountState(StatePending)
require.NoError(t, err)
require.Equal(t, 0, count)

// Expiring a pending Silence should make the API return the
// SilenceStateExpired Silence state.
silenceState := types.CalcSilenceState(sil.StartsAt, sil.EndsAt)
Expand Down
23 changes: 23 additions & 0 deletions types/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ type Marker interface {
SetInhibited(alert model.Fingerprint, ids ...string)
SetSilenced(alert model.Fingerprint, ids ...string)

Count(...AlertState) int

Status(model.Fingerprint) AlertStatus
Delete(model.Fingerprint)

Expand All @@ -67,6 +69,27 @@ type memMarker struct {
mtx sync.RWMutex
}

// Count returns the number of entries in the marker whose status is in one of
// the given states. When no states are given it returns the total number of
// entries held by the marker.
//
// Each entry is counted at most once, even if the same state is passed more
// than once.
func (m *memMarker) Count(states ...AlertState) int {
	m.mtx.RLock()
	defer m.mtx.RUnlock()

	// No filter: every entry counts.
	if len(states) == 0 {
		return len(m.m)
	}

	count := 0
	for _, status := range m.m {
		for _, state := range states {
			if status.State == state {
				count++
				// Stop at the first match so that a state listed more than
				// once in states does not count this entry again.
				break
			}
		}
	}
	return count
}

// SetSilenced sets the AlertStatus to suppressed and stores the associated silence IDs.
func (m *memMarker) SetSilenced(alert model.Fingerprint, ids ...string) {
m.mtx.Lock()
Expand Down