Skip to content

Commit

Permalink
Add limits for silences (prometheus#3852)
Browse files Browse the repository at this point in the history
* Add limits for silences

This commit adds limits for silences including the maximum number
of active and pending silences, and the maximum size per silence
(in bytes).

Signed-off-by: George Robinson <[email protected]>

* Remove default limits

Signed-off-by: George Robinson <[email protected]>

* Allow expiration of silences that exceed max size

---------

Signed-off-by: George Robinson <[email protected]>
  • Loading branch information
grobinson-grafana committed May 31, 2024
1 parent c33c6b5 commit b0de8f9
Show file tree
Hide file tree
Showing 4 changed files with 134 additions and 7 deletions.
10 changes: 8 additions & 2 deletions cmd/alertmanager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ func run() int {
dataDir = kingpin.Flag("storage.path", "Base path for data storage.").Default("data/").String()
retention = kingpin.Flag("data.retention", "How long to keep data for.").Default("120h").Duration()
maintenanceInterval = kingpin.Flag("data.maintenance-interval", "Interval between garbage collection and snapshotting to disk of the silences and the notification logs.").Default("15m").Duration()
maxSilences = kingpin.Flag("silences.max-silences", "Maximum number of active and pending silences, excluding expired silences. If negative or zero, no limit is set.").Default("0").Int()
maxPerSilenceBytes = kingpin.Flag("silences.max-per-silence-bytes", "Maximum per silence size in bytes. If negative or zero, no limit is set.").Default("0").Int()
alertGCInterval = kingpin.Flag("alerts.gc-interval", "Interval between alert GC.").Default("30m").Duration()

webConfig = webflag.AddFlags(kingpin.CommandLine, ":9093")
Expand Down Expand Up @@ -258,8 +260,12 @@ func run() int {
silenceOpts := silence.Options{
SnapshotFile: filepath.Join(*dataDir, "silences"),
Retention: *retention,
Logger: log.With(logger, "component", "silences"),
Metrics: prometheus.DefaultRegisterer,
Limits: silence.Limits{
MaxSilences: *maxSilences,
MaxPerSilenceBytes: *maxPerSilenceBytes,
},
Logger: log.With(logger, "component", "silences"),
Metrics: prometheus.DefaultRegisterer,
}

silences, err := silence.New(silenceOpts)
Expand Down
11 changes: 11 additions & 0 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,17 @@ is not well-formed, the changes will not be applied and an error is logged.
A configuration reload is triggered by sending a `SIGHUP` to the process or
sending an HTTP POST request to the `/-/reload` endpoint.

## Limits

Alertmanager supports a number of configurable limits via command-line flags.

To limit the maximum number of active and pending silences, excluding expired ones,
use the `--silences.max-silences` flag.
To limit the maximum size of an individual silence, use the
`--silences.max-per-silence-bytes` flag. The size is specified in bytes.

Both limits are disabled by default. Setting either flag to zero or a negative value disables the corresponding limit.

## Configuration file introduction

To specify which configuration file to load, use the `--config.file` flag.
Expand Down
51 changes: 46 additions & 5 deletions silence/silence.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ type Silences struct {
logger log.Logger
metrics *metrics
retention time.Duration
limits Limits

mtx sync.RWMutex
st state
Expand All @@ -201,6 +202,16 @@ type Silences struct {
mc matcherCache
}

// Limits contains the limits for silences. A limit is only enforced when its
// value is greater than zero; zero or negative values disable it.
type Limits struct {
// MaxSilences limits the maximum number of active and pending silences.
// It does not include expired silences.
MaxSilences int
// MaxPerSilenceBytes is the maximum size of an individual silence as
// stored on disk. Silences that have already ended are exempt from this
// check so that they can still be expired after the limit has been reduced.
MaxPerSilenceBytes int
}

// MaintenanceFunc is the function run as part of the periodic maintenance for silences.
// It returns the size of the snapshot taken, or an error if it failed.
type MaintenanceFunc func() (int64, error)
Expand Down Expand Up @@ -318,6 +329,7 @@ type Options struct {
// Retention time for newly created Silences. Silences may be
// garbage collected after the given duration after they ended.
Retention time.Duration
Limits Limits

// A logger used by background processing.
Logger log.Logger
Expand All @@ -342,6 +354,7 @@ func New(o Options) (*Silences, error) {
mc: matcherCache{},
logger: log.NewNopLogger(),
retention: o.Retention,
limits: o.Limits,
broadcast: func([]byte) {},
st: state{},
}
Expand Down Expand Up @@ -569,6 +582,13 @@ func (s *Silences) setSilence(sil *pb.Silence, now time.Time, skipValidate bool)
return err
}

// Check the limit unless the silence has been expired. This is to avoid
// situations where silences cannot be expired after the limit has been
// reduced.
if n := msil.Size(); s.limits.MaxPerSilenceBytes > 0 && n > s.limits.MaxPerSilenceBytes && sil.EndsAt.After(now) {
return fmt.Errorf("silence exceeded maximum size: %d bytes (limit: %d bytes)", n, s.limits.MaxPerSilenceBytes)
}

if s.st.merge(msil, now) {
s.version++
}
Expand Down Expand Up @@ -608,10 +628,10 @@ func (s *Silences) Set(sil *pb.Silence) (string, error) {
func (s *Silences) set(sil *pb.Silence) (string, error) {
now := s.nowUTC()
prev, ok := s.getSilence(sil.Id)

if sil.Id != "" && !ok {
return "", ErrNotFound
}

if ok {
if canUpdate(prev, sil, now) {
return sil.Id, s.setSilence(sil, now, false)
Expand All @@ -623,7 +643,24 @@ func (s *Silences) set(sil *pb.Silence) (string, error) {
}
}
}

// If we got here it's either a new silence or a replacing one.
if s.limits.MaxSilences > 0 {
// Get the number of active and pending silences to enforce limits.
q := &query{}
err := QState(types.SilenceStateActive, types.SilenceStatePending)(q)
if err != nil {
return "", fmt.Errorf("unable to query silences while checking limits: %w", err)
}
sils, _, err := s.query(q, s.nowUTC())
if err != nil {
return "", fmt.Errorf("unable to query silences while checking limits: %w", err)
}
if len(sils)+1 > s.limits.MaxSilences {
return "", fmt.Errorf("exceeded maximum number of silences: %d (limit: %d)", len(sils), s.limits.MaxSilences)
}
}

uid, err := uuid.NewV4()
if err != nil {
return "", fmt.Errorf("generate uuid: %w", err)
Expand All @@ -634,7 +671,11 @@ func (s *Silences) set(sil *pb.Silence) (string, error) {
sil.StartsAt = now
}

return sil.Id, s.setSilence(sil, now, false)
if err = s.setSilence(sil, now, false); err != nil {
return "", err
}

return sil.Id, nil
}

// canUpdate returns true if silence a can be updated to b without
Expand Down Expand Up @@ -778,6 +819,9 @@ func (s *Silences) QueryOne(params ...QueryParam) (*pb.Silence, error) {
// Query for silences based on the given query parameters. It returns the
// resulting silences and the state version the result is based on.
func (s *Silences) Query(params ...QueryParam) ([]*pb.Silence, int, error) {
s.mtx.Lock()
defer s.mtx.Unlock()

s.metrics.queriesTotal.Inc()
defer prometheus.NewTimer(s.metrics.queryDuration).ObserveDuration()

Expand Down Expand Up @@ -817,9 +861,6 @@ func (s *Silences) query(q *query, now time.Time) ([]*pb.Silence, int, error) {
// the use of post-filter functions is the trivial solution for now.
var res []*pb.Silence

s.mtx.Lock()
defer s.mtx.Unlock()

if q.ids != nil {
for _, id := range q.ids {
if s, ok := s.st[id]; ok {
Expand Down
69 changes: 69 additions & 0 deletions silence/silence_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"os"
"runtime"
"sort"
"strings"
"sync"
"testing"
"time"
Expand Down Expand Up @@ -458,6 +459,74 @@ func TestSilenceSet(t *testing.T) {
require.Equal(t, want, s.st, "unexpected state after silence creation")
}

func TestSilenceLimits(t *testing.T) {
s, err := New(Options{
Limits: Limits{
MaxSilences: 1,
MaxPerSilenceBytes: 2 << 11, // 4KB
},
})
require.NoError(t, err)

// Insert sil1 should succeed without error.
sil1 := &pb.Silence{
Matchers: []*pb.Matcher{{Name: "a", Pattern: "b"}},
StartsAt: time.Now(),
EndsAt: time.Now().Add(5 * time.Minute),
}
id1, err := s.Set(sil1)
require.NoError(t, err)
require.NotEqual(t, "", id1)

// Insert sil2 should fail because maximum number of silences
// has been exceeded.
sil2 := &pb.Silence{
Matchers: []*pb.Matcher{{Name: "a", Pattern: "b"}},
StartsAt: time.Now(),
EndsAt: time.Now().Add(5 * time.Minute),
}
id2, err := s.Set(sil2)
require.EqualError(t, err, "exceeded maximum number of silences: 1 (limit: 1)")
require.Equal(t, "", id2)

// Expire sil1. This should allow sil2 to be inserted.
require.NoError(t, s.Expire(id1))
id2, err = s.Set(sil2)
require.NoError(t, err)
require.NotEqual(t, "", id2)

// Should be able to update sil2 without hitting the limit.
_, err = s.Set(sil2)
require.NoError(t, err)

// Expire sil2.
require.NoError(t, s.Expire(id2))

// Insert sil3 should fail because it exceeds maximum size.
sil3 := &pb.Silence{
Matchers: []*pb.Matcher{
{
Name: strings.Repeat("a", 2<<9),
Pattern: strings.Repeat("b", 2<<9),
},
{
Name: strings.Repeat("c", 2<<9),
Pattern: strings.Repeat("d", 2<<9),
},
},
CreatedBy: strings.Repeat("e", 2<<9),
Comment: strings.Repeat("f", 2<<9),
StartsAt: time.Now(),
EndsAt: time.Now().Add(5 * time.Minute),
}
id3, err := s.Set(sil3)
require.Error(t, err)
// Do not check the exact size as it can change between consecutive runs
// due to padding.
require.Contains(t, err.Error(), "silence exceeded maximum size")
require.Equal(t, "", id3)
}

func TestSilenceUpsert(t *testing.T) {
s, err := New(Options{
Retention: time.Hour,
Expand Down

0 comments on commit b0de8f9

Please sign in to comment.