Skip to content

Commit

Permalink
Fixes a number of bugs in silences
Browse files Browse the repository at this point in the history
This commit fixes the following bugs in silences:

- prometheus/alertmanager#3877
- prometheus/alertmanager#3898
- prometheus/alertmanager#3897

which could cause an existing silence to be deleted/expired
when updating the silence failed. This could be because
the replacing silence exceeded limits or was invalid.

additional tests in upstream.
  • Loading branch information
grobinson-grafana committed Jun 26, 2024
1 parent 36f7af3 commit d84bd0d
Show file tree
Hide file tree
Showing 10 changed files with 178 additions and 121 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
* [BUGFIX] Query scheduler: fix a panic in request queueing. #8451
* [BUGFIX] Querier: fix issue where "context canceled" is logged for trace spans for requests to store-gateways that return no series when chunks streaming is enabled. #8510
* [BUGFIX] Alertmanager: Fix per-tenant silence limits not reloaded during runtime. #8456
* [BUGFIX] Alertmanager: Fixes a number of bugs in silences which could cause an existing silence to be deleted/expired when updating the silence failed. This could happen when the replacing silence was invalid or exceeded limits. #8525

### Mixin

Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ require (
github.com/google/go-github/v57 v57.0.0
github.com/google/uuid v1.6.0
github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc
github.com/grafana/alerting v0.0.0-20240607182251-835aff588914
github.com/grafana/alerting v0.0.0-20240626080128-8299cb22b8df
github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc
github.com/hashicorp/golang-lru/v2 v2.0.7
github.com/hashicorp/vault/api v1.10.0
Expand Down Expand Up @@ -292,4 +292,4 @@ replace github.com/opentracing-contrib/go-stdlib => github.com/grafana/opentraci
replace github.com/opentracing-contrib/go-grpc => github.com/charleskorn/go-grpc v0.0.0-20231024023642-e9298576254f

// Replacing prometheus/alertmanager with our fork.
replace github.com/prometheus/alertmanager => github.com/grafana/prometheus-alertmanager v0.25.1-0.20240621085530-d75ea57467b1
replace github.com/prometheus/alertmanager => github.com/grafana/prometheus-alertmanager v0.25.1-0.20240625192351-66ec17e3aa45
8 changes: 4 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -505,8 +505,8 @@ github.com/gosimple/slug v1.1.1 h1:fRu/digW+NMwBIP+RmviTK97Ho/bEj/C9swrCspN3D4=
github.com/gosimple/slug v1.1.1/go.mod h1:ER78kgg1Mv0NQGlXiDe57DpCyfbNywXXZ9mIorhxAf0=
github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc h1:PXZQA2WCxe85Tnn+WEvr8fDpfwibmEPgfgFEaC87G24=
github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc/go.mod h1:AHHlOEv1+GGQ3ktHMlhuTUwo3zljV3QJbC0+8o2kn+4=
github.com/grafana/alerting v0.0.0-20240607182251-835aff588914 h1:WXLbSnnomltxdNcE20CI8RD8quZ/L0YpXP0WK+0S1BU=
github.com/grafana/alerting v0.0.0-20240607182251-835aff588914/go.mod h1:U7Ta3K4T7jVgqGSYuPsfuPKHFiL2GbCZSHa3nHjmCos=
github.com/grafana/alerting v0.0.0-20240626080128-8299cb22b8df h1:HgsjWyuUhJQ3jgu+mumqKuPl/KLLuOoIoCBaSliSZNY=
github.com/grafana/alerting v0.0.0-20240626080128-8299cb22b8df/go.mod h1:hWW8UNwXZCRT7SfsBRGygGfBYj2QwsnTM+OEraRiov0=
github.com/grafana/dskit v0.0.0-20240614072459-4beef469a18e h1:x2OBF2vc9dtvvOEcndIM+cFDAfZ7K3GnKe6gdL0FSIU=
github.com/grafana/dskit v0.0.0-20240614072459-4beef469a18e/go.mod h1:HvSf3uf8Ps2vPpzHeAFyZTdUcbVr+Rxpq1xcx7J/muc=
github.com/grafana/e2e v0.1.2-0.20240118170847-db90b84177fc h1:BW+LjKJDz0So5LI8UZfW5neWeKpSkWqhmGjQFzcFfLM=
Expand All @@ -521,8 +521,8 @@ github.com/grafana/mimir-prometheus v0.0.0-20240620082736-3d8577bc0dfb h1:RRarht
github.com/grafana/mimir-prometheus v0.0.0-20240620082736-3d8577bc0dfb/go.mod h1:ZrBwbXc+KqeAQT4QXHKfi68+7vaOzoSdrkk90RRgdkE=
github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956 h1:em1oddjXL8c1tL0iFdtVtPloq2hRPen2MJQKoAWpxu0=
github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956/go.mod h1:qtI1ogk+2JhVPIXVc6q+NHziSmy2W5GbdQZFUHADCBU=
github.com/grafana/prometheus-alertmanager v0.25.1-0.20240621085530-d75ea57467b1 h1:osfUzwxT9P3Iu5cIBLfAtyb5ybj/PB1rnFXLYPciPKE=
github.com/grafana/prometheus-alertmanager v0.25.1-0.20240621085530-d75ea57467b1/go.mod h1:01sXtHoRwI8W324IPAzuxDFOmALqYLCOhvSC2fUHWXc=
github.com/grafana/prometheus-alertmanager v0.25.1-0.20240625192351-66ec17e3aa45 h1:AJKOtDKAOg8XNFnIZSmqqqutoTSxVlRs6vekL2p2KEY=
github.com/grafana/prometheus-alertmanager v0.25.1-0.20240625192351-66ec17e3aa45/go.mod h1:01sXtHoRwI8W324IPAzuxDFOmALqYLCOhvSC2fUHWXc=
github.com/grafana/pyroscope-go/godeltaprof v0.1.6 h1:nEdZ8louGAplSvIJi1HVp7kWvFvdiiYg3COLlTwJiFo=
github.com/grafana/pyroscope-go/godeltaprof v0.1.6/go.mod h1:Tk376Nbldo4Cha9RgiU7ik8WKFkNpfds98aUzS8omLE=
github.com/grafana/regexp v0.0.0-20240531075221-3685f1377d7b h1:oMAq12GxTpwo9jxbnG/M4F/HdpwbibTaVoxNA0NZprY=
Expand Down
2 changes: 1 addition & 1 deletion integration/alertmanager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ func TestAlertmanagerClassicMode(t *testing.T) {
StartsAt: time.Now(),
EndsAt: time.Now().Add(time.Minute),
})
require.EqualError(t, err, "creating the silence failed with status 400 and error \"silence invalid: invalid label matcher 0: invalid label name \\\"bar🙂\\\"\"\n")
require.EqualError(t, err, "creating the silence failed with status 400 and error \"invalid silence: invalid label matcher 0: invalid label name \\\"bar🙂\\\"\"\n")
require.Empty(t, silenceID)

// Should be able to post alerts with classic labels but not UTF-8 labels.
Expand Down
142 changes: 102 additions & 40 deletions pkg/alertmanager/alertmanager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"github.com/grafana/dskit/test"
"github.com/prometheus/alertmanager/cluster/clusterpb"
"github.com/prometheus/alertmanager/featurecontrol"
"github.com/prometheus/alertmanager/silence"
"github.com/prometheus/alertmanager/silence/silencepb"
"github.com/prometheus/alertmanager/types"
"github.com/prometheus/client_golang/prometheus"
Expand Down Expand Up @@ -330,17 +331,33 @@ func testLimiter(t *testing.T, limits Limits, ops []callbackOp) {
}
}

// cloneSilence returns a shallow copy of a silence. It is used in tests.
func cloneSilence(t *testing.T, sil *silencepb.Silence) *silencepb.Silence {
t.Helper()
s := *sil
return &s
}

func toMeshSilence(t *testing.T, sil *silencepb.Silence, retention time.Duration) *silencepb.MeshSilence {
t.Helper()
return &silencepb.MeshSilence{
Silence: sil,
ExpiresAt: sil.EndsAt.Add(retention),
}
}

func TestSilenceLimits(t *testing.T) {
user := "test"

r := prometheus.NewPedanticRegistry()
limits := mockAlertManagerLimits{
maxSilencesCount: 1,
maxSilenceSizeBytes: 2 << 11, // 4KB,
}
am, err := New(&Config{
UserID: user,
Logger: log.NewNopLogger(),
Limits: &mockAlertManagerLimits{
maxSilencesCount: 1,
maxSilenceSizeBytes: 2 << 11, // 4KB,
},
UserID: user,
Logger: log.NewNopLogger(),
Limits: &limits,
Features: featurecontrol.NoopFlags{},
TenantDataDir: t.TempDir(),
ExternalURL: &url.URL{Path: "/am"},
Expand All @@ -364,38 +381,26 @@ func TestSilenceLimits(t *testing.T) {
StartsAt: time.Now(),
EndsAt: time.Now().Add(5 * time.Minute),
}
id1, err := am.silences.Set(sil1)
require.NoError(t, err)
require.NotEqual(t, "", id1)
require.NoError(t, am.silences.Set(sil1))

// Insert sil2 should fail because maximum number of silences
// has been exceeded.
// Insert sil2 should fail because maximum number of silences has been
// exceeded.
sil2 := &silencepb.Silence{
Matchers: []*silencepb.Matcher{{Name: "a", Pattern: "b"}},
Matchers: []*silencepb.Matcher{{Name: "c", Pattern: "d"}},
StartsAt: time.Now(),
EndsAt: time.Now().Add(5 * time.Minute),
}
id2, err := am.silences.Set(sil2)
require.EqualError(t, err, "exceeded maximum number of silences: 1 (limit: 1)")
require.Equal(t, "", id2)
require.EqualError(t, am.silences.Set(sil2), "exceeded maximum number of silences: 1 (limit: 1)")

// Expire sil1 and run the GC. This should allow sil2 to be
// inserted.
require.NoError(t, am.silences.Expire(id1))
// Expire sil1 and run the GC. This should allow sil2 to be inserted.
require.NoError(t, am.silences.Expire(sil1.Id))
n, err := am.silences.GC()
require.NoError(t, err)
require.Equal(t, 1, n)
require.NoError(t, am.silences.Set(sil2))

id2, err = am.silences.Set(sil2)
require.NoError(t, err)
require.NotEqual(t, "", id2)

// Should be able to update sil2 without hitting the limit.
_, err = am.silences.Set(sil2)
require.NoError(t, err)

// Expire sil2.
require.NoError(t, am.silences.Expire(id2))
// Expire sil2 and run the GC.
require.NoError(t, am.silences.Expire(sil2.Id))
n, err = am.silences.GC()
require.NoError(t, err)
require.Equal(t, 1, n)
Expand All @@ -404,25 +409,82 @@ func TestSilenceLimits(t *testing.T) {
sil3 := &silencepb.Silence{
Matchers: []*silencepb.Matcher{
{
Name: strings.Repeat("a", 2<<9),
Pattern: strings.Repeat("b", 2<<9),
Name: strings.Repeat("e", 2<<9),
Pattern: strings.Repeat("f", 2<<9),
},
{
Name: strings.Repeat("c", 2<<9),
Pattern: strings.Repeat("d", 2<<9),
Name: strings.Repeat("g", 2<<9),
Pattern: strings.Repeat("h", 2<<9),
},
},
CreatedBy: strings.Repeat("e", 2<<9),
Comment: strings.Repeat("f", 2<<9),
CreatedBy: strings.Repeat("i", 2<<9),
Comment: strings.Repeat("j", 2<<9),
StartsAt: time.Now(),
EndsAt: time.Now().Add(5 * time.Minute),
}
id3, err := am.silences.Set(sil3)
require.Error(t, err)
// Do not check the exact size as it can change between consecutive runs
// due to padding.
require.Contains(t, err.Error(), "silence exceeded maximum size")
require.Equal(t, "", id3)
require.EqualError(t, am.silences.Set(sil3), fmt.Sprintf("silence exceeded maximum size: %d bytes (limit: 4096 bytes)", toMeshSilence(t, sil3, 0).Size()))

// Should be able to insert sil4.
sil4 := &silencepb.Silence{
Matchers: []*silencepb.Matcher{{Name: "k", Pattern: "l"}},
StartsAt: time.Now(),
EndsAt: time.Now().Add(5 * time.Minute),
}
require.NoError(t, am.silences.Set(sil4))

// Should be able to update sil4 without modifications. It is expected to
// keep the same ID.
sil5 := cloneSilence(t, sil4)
require.NoError(t, am.silences.Set(sil5))
require.Equal(t, sil4.Id, sil5.Id)

// Should be able to update the comment. It is also expected to keep the
// same ID.
sil6 := cloneSilence(t, sil5)
sil6.Comment = "m"
require.NoError(t, am.silences.Set(sil6))
require.Equal(t, sil5.Id, sil6.Id)

// Should not be able to update the start and end time as this requires
// sil6 to be expired and a new silence to be created. However, this would
// exceed the maximum number of silences, which counts both active and
// expired silences.
sil7 := cloneSilence(t, sil6)
sil7.StartsAt = time.Now().Add(5 * time.Minute)
sil7.EndsAt = time.Now().Add(10 * time.Minute)
require.EqualError(t, am.silences.Set(sil7), "exceeded maximum number of silences: 1 (limit: 1)")

// sil6 should not be expired because the update failed.
sils, _, err := am.silences.Query(silence.QState(types.SilenceStateExpired))
require.NoError(t, err)
require.Len(t, sils, 0)

// Should not be able to update with a comment that exceeds maximum size.
// Need to increase the maximum number of silences to test this.
limits.maxSilencesCount = 2
sil8 := cloneSilence(t, sil6)
sil8.Comment = strings.Repeat("m", 2<<11)
require.EqualError(t, am.silences.Set(sil8), fmt.Sprintf("silence exceeded maximum size: %d bytes (limit: 4096 bytes)", toMeshSilence(t, sil8, 0).Size()))

// sil6 should not be expired because the update failed.
sils, _, err = am.silences.Query(silence.QState(types.SilenceStateExpired))
require.NoError(t, err)
require.Len(t, sils, 0)

// Should not be able to replace with a silence that exceeds maximum size.
// This is different from the previous assertion as unlike when adding or
// updating a comment, changing the matchers for a silence should expire
// the existing silence, unless the silence that is replacing it exceeds
// limits, in which case the operation should fail and the existing silence
// should still be active.
sil9 := cloneSilence(t, sil8)
sil9.Matchers = []*silencepb.Matcher{{Name: "n", Pattern: "o"}}
require.EqualError(t, am.silences.Set(sil9), fmt.Sprintf("silence exceeded maximum size: %d bytes (limit: 4096 bytes)", toMeshSilence(t, sil9, 0).Size()))

// sil6 should not be expired because the update failed.
sils, _, err = am.silences.Query(silence.QState(types.SilenceStateExpired))
require.NoError(t, err)
require.Len(t, sils, 0)
}

func TestExperimentalReceiversAPI(t *testing.T) {
Expand Down
10 changes: 4 additions & 6 deletions vendor/github.com/grafana/alerting/notify/silences.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions vendor/github.com/prometheus/alertmanager/api/v2/api.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit d84bd0d

Please sign in to comment.