From a87c25fd994a2eec5ab5af0a920cc529b17c0030 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Mon, 24 Aug 2020 17:16:12 +0100 Subject: [PATCH] Fixes the registration of the Alertmanager alert receiving API metrics (#3065) * Remove TODO about prometheus/alertmanager#2182 as it got merged Signed-off-by: gotjosh * Remove TODO about prometheus/alertmanager#2200 as it got merged Signed-off-by: gotjosh * Register the Alertmanager API metrics Signed-off-by: gotjosh * Add a changelog entry Signed-off-by: gotjosh * Fix tests Signed-off-by: gotjosh --- CHANGELOG.md | 1 + integration/alertmanager_test.go | 8 ++++++++ integration/e2ecortex/client.go | 27 +++++++++++++++++++++++++++ pkg/alertmanager/alertmanager.go | 16 +++++++--------- pkg/alertmanager/multitenant_test.go | 4 ---- 5 files changed, 43 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9eb63f7884..6cf25d0dea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ * [BUGFIX] Experimental blocks storage: Ingester is less likely to hit gRPC message size limit when streaming data to queriers. #3015 * [BUGFIX] Fix configuration for TLS server validation, TLS skip verify was hardcoded to true for all TLS configurations and prevented validation of server certificates. #3030 * [BUGFIX] Fixes the Alertmanager panicking when no `-alertmanager.web.external-url` is provided. #3017 +* [BUGFIX] Fixes the registration of the Alertmanager API metrics `cortex_alertmanager_alerts_received_total` and `cortex_alertmanager_alerts_invalid_total`. #3065 ## 1.3.0 / 2020-08-21 diff --git a/integration/alertmanager_test.go b/integration/alertmanager_test.go index 4883f15394..a59009ceeb 100644 --- a/integration/alertmanager_test.go +++ b/integration/alertmanager_test.go @@ -6,6 +6,7 @@ import ( "context" "testing" + "github.com/prometheus/common/model" "github.com/prometheus/prometheus/pkg/labels" "github.com/stretchr/testify/require" @@ -95,6 +96,13 @@ func TestAlertmanagerStoreAPI(t *testing.T) { require.Len(t, cfg.Receivers, 1) require.Equal(t, "example_receiver", cfg.Receivers[0].Name) + err = c.SendAlertToAlermanager(context.Background(), &model.Alert{Labels: model.LabelSet{"foo": "bar"}}) + require.NoError(t, err) + + require.NoError(t, am.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_alertmanager_alerts_received_total"}, + e2e.WithLabelMatchers(labels.MustNewMatcher(labels.MatchEqual, "user", "user-1")), + e2e.WaitMissingMetrics)) + err = c.DeleteAlertmanagerConfig(context.Background()) require.NoError(t, err) diff --git a/integration/e2ecortex/client.go b/integration/e2ecortex/client.go index da321e34ac..95192be366 100644 --- a/integration/e2ecortex/client.go +++ b/integration/e2ecortex/client.go @@ -15,6 +15,7 @@ import ( "github.com/gogo/protobuf/proto" "github.com/golang/snappy" alertConfig "github.com/prometheus/alertmanager/config" + "github.com/prometheus/alertmanager/types" promapi "github.com/prometheus/client_golang/api" promv1 "github.com/prometheus/client_golang/api/prometheus/v1" "github.com/prometheus/common/model" @@ -380,6 +381,32 @@ func (c *Client) DeleteAlertmanagerConfig(ctx context.Context) error { return nil } +// SendAlertToAlermanager sends alerts to the Alertmanager API +func (c *Client) SendAlertToAlermanager(ctx context.Context, alert *model.Alert) error { + u := c.alertmanagerClient.URL("/api/prom/api/v1/alerts", nil) + + data, err := json.Marshal([]types.Alert{{Alert: *alert}}) + if err != nil { + return fmt.Errorf("error marshaling the alert: %v", err) + } + + req, err := http.NewRequest(http.MethodPost, u.String(), bytes.NewReader(data)) + if err != nil { + return fmt.Errorf("error creating request: %v", err) + } + + resp, body, err := c.alertmanagerClient.Do(ctx, req) + if err != nil { + return err + } + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("sending alert failed with status %d and error %v", resp.StatusCode, string(body)) + } + + return nil +} + func (c *Client) PostRequest(url string, body io.Reader) (*http.Response, error) { req, err := http.NewRequest("POST", url, body) if err != nil { diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index 587bdb8ad7..98b507b034 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -67,20 +67,16 @@ type Alertmanager struct { mux *http.ServeMux registry *prometheus.Registry + // The Dispatcher is the only component we need to recreate when we call ApplyConfig. + // Given its metrics don't have any variable labels we need to re-use the same metrics. + dispatcherMetrics *dispatch.DispatcherMetrics + activeMtx sync.Mutex active bool } var ( webReload = make(chan chan error) - - // In order to workaround a bug in the alertmanager, which doesn't register the - // metrics in the input registry but to the global default one, we do define a - // singleton dispatcher metrics instance that is going to be shared across all - // tenants alertmanagers. - // TODO change this once the vendored alertmanager will have this PR merged into: - // https://github.com/prometheus/alertmanager/pull/2200 - dispatcherMetrics = dispatch.NewDispatcherMetrics(prometheus.NewRegistry()) ) func init() { @@ -158,6 +154,7 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { Silences: am.silences, StatusFunc: am.marker.Status, Peer: cfg.Peer, + Registry: am.registry, Logger: log.With(am.logger, "component", "api"), GroupFunc: func(f1 func(*dispatch.Route) bool, f2 func(*types.Alert, time.Time) bool) (dispatch.AlertGroups, map[model.Fingerprint][]string) { return am.dispatcher.Groups(f1, f2) @@ -172,6 +169,7 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { ui.Register(router, webReload, log.With(am.logger, "component", "ui")) am.mux = am.api.Register(router, am.cfg.ExternalURL.Path) + am.dispatcherMetrics = dispatch.NewDispatcherMetrics(am.registry) return am, nil } @@ -240,7 +238,7 @@ func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config) error { am.marker, timeoutFunc, log.With(am.logger, "component", "dispatcher"), - dispatcherMetrics, + am.dispatcherMetrics, ) go am.dispatcher.Run() diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index fa712ec41f..b3e18307ea 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -1,5 +1,3 @@ -// +build !race - package alertmanager import ( @@ -57,8 +55,6 @@ func (m *mockAlertStore) DeleteAlertConfig(ctx context.Context, user string) err return fmt.Errorf("not implemented") } -// TestLoadAllConfigs ensures the multitenant alertmanager can properly load configs from a local backend store. -// It is excluded from the race detector due to a vendored race issue https://github.com/prometheus/alertmanager/issues/2182 func TestLoadAllConfigs(t *testing.T) { mockStore := &mockAlertStore{ configs: map[string]alerts.AlertConfigDesc{