From 03ed067e3bfddf2d43179279f8d949fca8bb3a53 Mon Sep 17 00:00:00 2001 From: George Robinson Date: Wed, 26 Jun 2024 08:37:58 +0100 Subject: [PATCH] Fix per-tenant silence limits not reloaded during runtime (#8456) * Fix per-tenant silence limits not reloaded during runtime This commit fixes a bug where per-tenant silence limits were not reloaded during runtime. This happened because per-tenant limits were read at initialization time for a per-tenant Alertmanager and then never again. Instead, silence limits are read each time they are needed. Mimir uses an in-memory cache for per-tenant limits, and silence limits are only checked when creating new silences or updating existing silences. * Update go.mod (cherry picked from commit 837a3753b819761dfda69d92177fb0ab40f38c24) --- go.mod | 2 +- go.sum | 4 +-- pkg/alertmanager/alertmanager.go | 4 +-- .../alertmanager/silence/silence.go | 25 +++++++++++-------- vendor/modules.txt | 4 +-- 5 files changed, 21 insertions(+), 18 deletions(-) diff --git a/go.mod b/go.mod index 49b78bfd8e6..c33854e4556 100644 --- a/go.mod +++ b/go.mod @@ -292,4 +292,4 @@ replace github.com/opentracing-contrib/go-stdlib => github.com/grafana/opentraci replace github.com/opentracing-contrib/go-grpc => github.com/charleskorn/go-grpc v0.0.0-20231024023642-e9298576254f // Replacing prometheus/alertmanager with our fork. -replace github.com/prometheus/alertmanager => github.com/grafana/prometheus-alertmanager v0.25.1-0.20240605141526-70d9d63f74fc +replace github.com/prometheus/alertmanager => github.com/grafana/prometheus-alertmanager v0.25.1-0.20240621085530-d75ea57467b1 diff --git a/go.sum b/go.sum index 1b32b1bfcdc..e45f9c24cf6 100644 --- a/go.sum +++ b/go.sum @@ -521,8 +521,8 @@ github.com/grafana/mimir-prometheus v0.0.0-20240611092206-171cafc36ca7 h1:8FxtzV github.com/grafana/mimir-prometheus v0.0.0-20240611092206-171cafc36ca7/go.mod h1:hvYx75l+0dZUWI20CXlutLQDnyTnB8SXVSMX2sTF3Xo= github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956 h1:em1oddjXL8c1tL0iFdtVtPloq2hRPen2MJQKoAWpxu0= github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956/go.mod h1:qtI1ogk+2JhVPIXVc6q+NHziSmy2W5GbdQZFUHADCBU= -github.com/grafana/prometheus-alertmanager v0.25.1-0.20240605141526-70d9d63f74fc h1:VoEf4wNiS3hCPxxmFdEvyeZJA3eI4Wb4gAlzYZwh52A= -github.com/grafana/prometheus-alertmanager v0.25.1-0.20240605141526-70d9d63f74fc/go.mod h1:01sXtHoRwI8W324IPAzuxDFOmALqYLCOhvSC2fUHWXc= +github.com/grafana/prometheus-alertmanager v0.25.1-0.20240621085530-d75ea57467b1 h1:osfUzwxT9P3Iu5cIBLfAtyb5ybj/PB1rnFXLYPciPKE= +github.com/grafana/prometheus-alertmanager v0.25.1-0.20240621085530-d75ea57467b1/go.mod h1:01sXtHoRwI8W324IPAzuxDFOmALqYLCOhvSC2fUHWXc= github.com/grafana/pyroscope-go/godeltaprof v0.1.6 h1:nEdZ8louGAplSvIJi1HVp7kWvFvdiiYg3COLlTwJiFo= github.com/grafana/pyroscope-go/godeltaprof v0.1.6/go.mod h1:Tk376Nbldo4Cha9RgiU7ik8WKFkNpfds98aUzS8omLE= github.com/grafana/regexp v0.0.0-20240531075221-3685f1377d7b h1:oMAq12GxTpwo9jxbnG/M4F/HdpwbibTaVoxNA0NZprY= diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index 83e12531038..87863fcd980 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -232,8 +232,8 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { SnapshotFile: silencesFile, Retention: cfg.Retention, Limits: silence.Limits{ - MaxSilences: cfg.Limits.AlertmanagerMaxSilencesCount(cfg.UserID), - MaxPerSilenceBytes: cfg.Limits.AlertmanagerMaxSilenceSizeBytes(cfg.UserID), + MaxSilences: func() int { return cfg.Limits.AlertmanagerMaxSilencesCount(cfg.UserID) }, + MaxSilenceSizeBytes: func() int { return cfg.Limits.AlertmanagerMaxSilenceSizeBytes(cfg.UserID) }, }, Logger: log.With(am.logger, "component", "silences"), Metrics: am.registry, diff --git a/vendor/github.com/prometheus/alertmanager/silence/silence.go b/vendor/github.com/prometheus/alertmanager/silence/silence.go index 67ba97dbb2c..5c52e853e8b 100644 --- a/vendor/github.com/prometheus/alertmanager/silence/silence.go +++ b/vendor/github.com/prometheus/alertmanager/silence/silence.go @@ -44,10 +44,10 @@ import ( ) // ErrNotFound is returned if a silence was not found. -var ErrNotFound = fmt.Errorf("silence not found") +var ErrNotFound = errors.New("silence not found") // ErrInvalidState is returned if the state isn't valid. -var ErrInvalidState = fmt.Errorf("invalid state") +var ErrInvalidState = errors.New("invalid state") type matcherCache map[*pb.Silence]labels.Matchers @@ -206,10 +206,10 @@ type Silences struct { type Limits struct { // MaxSilences limits the maximum number of silences, including expired // silences. - MaxSilences int - // MaxPerSilenceBytes is the maximum size of an individual silence as + MaxSilences func() int + // MaxSilenceSizeBytes is the maximum size of an individual silence as // stored on disk. - MaxPerSilenceBytes int + MaxSilenceSizeBytes func() int } // MaintenanceFunc represents the function to run as part of the periodic maintenance for silences. @@ -338,7 +338,7 @@ type Options struct { func (o *Options) validate() error { if o.SnapshotFile != "" && o.SnapshotReader != nil { - return fmt.Errorf("only one of SnapshotFile and SnapshotReader must be set") + return errors.New("only one of SnapshotFile and SnapshotReader must be set") } return nil } @@ -585,8 +585,11 @@ func (s *Silences) setSilence(sil *pb.Silence, now time.Time, skipValidate bool) // Check the limit unless the silence has been expired. This is to avoid // situations where silences cannot be expired after the limit has been // reduced. - if n := msil.Size(); s.limits.MaxPerSilenceBytes > 0 && n > s.limits.MaxPerSilenceBytes && sil.EndsAt.After(now) { - return fmt.Errorf("silence exceeded maximum size: %d bytes (limit: %d bytes)", n, s.limits.MaxPerSilenceBytes) + if s.limits.MaxSilenceSizeBytes != nil { + n := msil.Size() + if m := s.limits.MaxSilenceSizeBytes(); m > 0 && n > m && sil.EndsAt.After(now) { + return fmt.Errorf("silence exceeded maximum size: %d bytes (limit: %d bytes)", n, m) + } } if s.st.merge(msil, now) { @@ -645,9 +648,9 @@ func (s *Silences) set(sil *pb.Silence) (string, error) { } // If we got here it's either a new silence or a replacing one. - if s.limits.MaxSilences > 0 { - if len(s.st)+1 > s.limits.MaxSilences { - return "", fmt.Errorf("exceeded maximum number of silences: %d (limit: %d)", len(s.st), s.limits.MaxSilences) + if s.limits.MaxSilences != nil { + if m := s.limits.MaxSilences(); m > 0 && len(s.st)+1 > m { + return "", fmt.Errorf("exceeded maximum number of silences: %d (limit: %d)", len(s.st), m) } } diff --git a/vendor/modules.txt b/vendor/modules.txt index e5ba77a97ce..291e1ad8877 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -890,7 +890,7 @@ github.com/pmezard/go-difflib/difflib # github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c ## explicit; go 1.14 github.com/power-devops/perfstat -# github.com/prometheus/alertmanager v0.27.0 => github.com/grafana/prometheus-alertmanager v0.25.1-0.20240605141526-70d9d63f74fc +# github.com/prometheus/alertmanager v0.27.0 => github.com/grafana/prometheus-alertmanager v0.25.1-0.20240621085530-d75ea57467b1 ## explicit; go 1.21 github.com/prometheus/alertmanager/api github.com/prometheus/alertmanager/api/metrics @@ -1615,4 +1615,4 @@ sigs.k8s.io/yaml/goyaml.v3 # github.com/munnerz/goautoneg => github.com/grafana/goautoneg v0.0.0-20240607115440-f335c04c58ce # github.com/opentracing-contrib/go-stdlib => github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956 # github.com/opentracing-contrib/go-grpc => github.com/charleskorn/go-grpc v0.0.0-20231024023642-e9298576254f -# github.com/prometheus/alertmanager => github.com/grafana/prometheus-alertmanager v0.25.1-0.20240605141526-70d9d63f74fc +# github.com/prometheus/alertmanager => github.com/grafana/prometheus-alertmanager v0.25.1-0.20240621085530-d75ea57467b1