stateless ruler restores alert state
Signed-off-by: Ben Ye <[email protected]>
Ben Ye committed Jul 2, 2022
1 parent 5d05c1f commit b1bce0f
Showing 7 changed files with 370 additions and 42 deletions.
3 changes: 3 additions & 0 deletions cmd/thanos/config.go
@@ -173,6 +173,7 @@ type queryConfig struct {
dnsSDInterval time.Duration
httpMethod string
dnsSDResolver string
step time.Duration
}

func (qc *queryConfig) registerFlag(cmd extkingpin.FlagClause) *queryConfig {
@@ -189,6 +190,8 @@ func (qc *queryConfig) registerFlag(cmd extkingpin.FlagClause) *queryConfig {
Default("POST").EnumVar(&qc.httpMethod, "GET", "POST")
cmd.Flag("query.sd-dns-resolver", "Resolver to use. Possible options: [golang, miekgdns]").
Default("golang").Hidden().StringVar(&qc.dnsSDResolver)
cmd.Flag("query.default-step", "Default range query step to use. This is only used in stateless Ruler and alert state restoration.").
Default("1s").DurationVar(&qc.step)
return qc
}

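For illustration (not part of this diff): the configured step is a Go `time.Duration`, and the restore querier later truncates it to whole seconds (see `pkg/rules/queryable.go` below), so sub-second values end up as a step of 0. A minimal sketch of that conversion with a placeholder value:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Placeholder standing in for the --query.default-step value (default "1s").
	step := 1 * time.Second

	// Truncate to whole seconds, mirroring int64(q.step / time.Second) in the
	// restore querier; a value like 500ms would truncate to 0 here.
	stepSeconds := int64(step / time.Second)
	fmt.Println(stepSeconds) // prints 1
}
```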
69 changes: 33 additions & 36 deletions cmd/thanos/rule.go
@@ -85,12 +85,15 @@ type ruleConfig struct {

rwConfig *extflag.PathOrContent

resendDelay time.Duration
evalInterval time.Duration
ruleFiles []string
objStoreConfig *extflag.PathOrContent
dataDir string
lset labels.Labels
resendDelay time.Duration
evalInterval time.Duration
outageTolerance time.Duration
forGracePeriod time.Duration
ruleFiles []string
objStoreConfig *extflag.PathOrContent
dataDir string
lset labels.Labels
restoreIgnoreLabels []string
}

func (rc *ruleConfig) registerFlag(cmd extkingpin.FlagClause) {
@@ -126,6 +129,12 @@ func registerRule(app *extkingpin.App) {
Default("1m").DurationVar(&conf.resendDelay)
cmd.Flag("eval-interval", "The default evaluation interval to use.").
Default("1m").DurationVar(&conf.evalInterval)
cmd.Flag("for-outage-tolerance", "Max time to tolerate prometheus outage for restoring \"for\" state of alert.").
Default("1h").DurationVar(&conf.outageTolerance)
cmd.Flag("for-grace-period", "Minimum duration between alert and restored \"for\" state. This is maintained only for alerts with configured \"for\" time greater than grace period.").
Default("10m").DurationVar(&conf.forGracePeriod)
cmd.Flag("restore-ignored-label", "Labels to be ignored when restoring alerts from the remote storage. This is only used in stateless mode.").
StringsVar(&conf.restoreIgnoreLabels)

conf.rwConfig = extflag.RegisterPathOrContent(cmd, "remote-write.config", "YAML config for the remote-write configurations, that specify servers where samples should be sent to (see https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write). This automatically enables stateless mode for ruler and no series will be stored in the ruler's TSDB. If an empty config (or file) is provided, the flag is ignored and ruler is run with its own TSDB.", extflag.WithEnvSubstitution())

@@ -321,7 +330,10 @@ func runRule(
extprom.WrapRegistererWithPrefix("thanos_rule_query_apis_", reg),
dns.ResolverType(conf.query.dnsSDResolver),
)
var queryClients []*httpconfig.Client
var (
queryClients []*httpconfig.Client
promClients []*promclient.Client
)
queryClientMetrics := extpromhttp.NewClientMetrics(extprom.WrapRegistererWith(prometheus.Labels{"client": "query"}, reg))
for _, cfg := range queryCfg {
cfg.HTTPClientConfig.ClientMetrics = queryClientMetrics
@@ -335,6 +347,7 @@
return err
}
queryClients = append(queryClients, queryClient)
promClients = append(promClients, promclient.NewClient(queryClient, logger, "thanos-rule"))
// Discover and resolve query addresses.
addDiscoveryGroups(g, queryClient, conf.query.dnsSDInterval)
}
@@ -377,7 +390,8 @@
}
fanoutStore := storage.NewFanout(logger, agentDB, remoteStore)
appendable = fanoutStore
queryable = fanoutStore
// Use a separate queryable to restore the ALERTS firing states.
queryable = thanosrules.NewPromClientsQueryable(logger, queryClients, promClients, conf.query.httpMethod, conf.query.step, conf.restoreIgnoreLabels)
} else {
tsdbDB, err = tsdb.Open(conf.dataDir, log.With(logger, "component", "tsdb"), reg, tsdbOpts, nil)
if err != nil {
@@ -495,14 +509,16 @@ func runRule(
reg,
conf.dataDir,
rules.ManagerOptions{
NotifyFunc: notifyFunc,
Logger: logger,
Appendable: appendable,
ExternalURL: nil,
Queryable: queryable,
ResendDelay: conf.resendDelay,
NotifyFunc: notifyFunc,
Logger: logger,
Appendable: appendable,
ExternalURL: nil,
Queryable: queryable,
ResendDelay: conf.resendDelay,
OutageTolerance: conf.outageTolerance,
ForGracePeriod: conf.forGracePeriod,
},
queryFuncCreator(logger, queryClients, metrics.duplicatedQuery, metrics.ruleEvalWarnings, conf.query.httpMethod),
queryFuncCreator(logger, queryClients, promClients, metrics.duplicatedQuery, metrics.ruleEvalWarnings, conf.query.httpMethod),
conf.lset,
// In our case the querying URL is the external URL because in Prometheus
// --web.external-url points to it i.e. it points at something where the user
@@ -772,24 +788,10 @@ func labelsTSDBToProm(lset labels.Labels) (res labels.Labels) {
return res
}

func removeDuplicateQueryEndpoints(logger log.Logger, duplicatedQueriers prometheus.Counter, urls []*url.URL) []*url.URL {
set := make(map[string]struct{})
deduplicated := make([]*url.URL, 0, len(urls))
for _, u := range urls {
if _, ok := set[u.String()]; ok {
level.Warn(logger).Log("msg", "duplicate query address is provided", "addr", u.String())
duplicatedQueriers.Inc()
continue
}
deduplicated = append(deduplicated, u)
set[u.String()] = struct{}{}
}
return deduplicated
}

func queryFuncCreator(
logger log.Logger,
queriers []*httpconfig.Client,
promClients []*promclient.Client,
duplicatedQuery prometheus.Counter,
ruleEvalWarnings *prometheus.CounterVec,
httpMethod string,
@@ -810,15 +812,10 @@ func queryFuncCreator(
panic(errors.Errorf("unknown partial response strategy %v", partialResponseStrategy).Error())
}

promClients := make([]*promclient.Client, 0, len(queriers))
for _, q := range queriers {
promClients = append(promClients, promclient.NewClient(q, logger, "thanos-rule"))
}

return func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
for _, i := range rand.Perm(len(queriers)) {
promClient := promClients[i]
endpoints := removeDuplicateQueryEndpoints(logger, duplicatedQuery, queriers[i].Endpoints())
endpoints := thanosrules.RemoveDuplicateQueryEndpoints(logger, duplicatedQuery, queriers[i].Endpoints())
for _, i := range rand.Perm(len(endpoints)) {
span, ctx := tracing.StartSpan(ctx, spanID)
v, warns, err := promClient.PromqlQueryInstant(ctx, endpoints[i], q, t, promclient.QueryOptions{
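Background for the new wiring (summarized from Prometheus' rule manager behavior, not from this diff): on startup the manager restores each alert's "for" state by reading the synthetic `ALERTS_FOR_STATE` series through the configured `Queryable`; `OutageTolerance` bounds how long an outage may be before restoration is skipped, and `ForGracePeriod` enforces a minimum wait for alerts whose `for` duration exceeds the grace period. A rough sketch of the kind of selector that restore query uses, with a placeholder alert name:

```go
package main

import (
	"fmt"

	"github.com/prometheus/prometheus/model/labels"
)

func main() {
	// "HighErrorRate" is a placeholder; the rule manager builds one such selector
	// per alerting rule when restoring "for" state.
	matchers := []*labels.Matcher{
		labels.MustNewMatcher(labels.MatchEqual, labels.MetricName, "ALERTS_FOR_STATE"),
		labels.MustNewMatcher(labels.MatchEqual, "alertname", "HighErrorRate"),
	}
	for _, m := range matchers {
		fmt.Println(m.String())
	}
}
```

In stateless mode this query is served by the new `promClientsQueryable` (see `pkg/rules/queryable.go` below) instead of the local TSDB.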
13 changes: 13 additions & 0 deletions docs/components/rule.md
@@ -311,6 +311,12 @@ Flags:
prefix for the regular Alertmanager API path.
--data-dir="data/" data directory
--eval-interval=1m The default evaluation interval to use.
--for-grace-period=10m Minimum duration between alert and restored
"for" state. This is maintained only for alerts
with configured "for" time greater than grace
period.
--for-outage-tolerance=1h Max time to tolerate prometheus outage for
restoring "for" state of alert.
--grpc-address="0.0.0.0:10901"
Listen ip:port address for gRPC endpoints
(StoreAPI). Make sure this address is routable
@@ -385,6 +391,9 @@ Flags:
https://thanos.io/tip/components/rule.md/#configuration.
If defined, it takes precedence over the
'--query' and '--query.sd-files' flags.
--query.default-step=1s Default range query step to use. This is only
used in stateless Ruler and alert state
restoration.
--query.http-method=POST HTTP method to use when sending queries.
Possible options: [GET, POST]
--query.sd-dns-interval=30s
@@ -429,6 +438,10 @@ Flags:
https://thanos.io/tip/thanos/logging.md/#configuration
--resend-delay=1m Minimum amount of time to wait before resending
an alert to Alertmanager.
--restore-ignored-label=RESTORE-IGNORED-LABEL ...
Labels to be ignored when restoring alerts from
the remote storage. This is only used in
stateless mode.
--rule-file=rules/ ... Rule files that should be used by rule manager.
Can be in glob format (repeated). Note that
rules are not automatically detected, use
23 changes: 23 additions & 0 deletions pkg/promclient/promclient.go
@@ -778,6 +778,29 @@ func (c *Client) RulesInGRPC(ctx context.Context, base *url.URL, typeRules strin
return m.Data.Groups, nil
}

// AlertsInGRPC returns the alerts from the Prometheus alerts API. It uses gRPC errors.
// NOTE: This method is tested in pkg/store/prometheus_test.go against Prometheus.
func (c *Client) AlertsInGRPC(ctx context.Context, base *url.URL) ([]*rulespb.AlertInstance, error) {
u := *base
u.Path = path.Join(u.Path, "/api/v1/alerts")

var m struct {
Data struct {
Alerts []*rulespb.AlertInstance `json:"alerts"`
} `json:"data"`
}

if err := c.get2xxResultWithGRPCErrors(ctx, "/prom_alerts HTTP[client]", &u, &m); err != nil {
return nil, err
}

// Prometheus does not support PartialResponseStrategy and probably never will. Default to Abort.
for _, g := range m.Data.Alerts {
g.PartialResponseStrategy = storepb.PartialResponseStrategy_ABORT
}
return m.Data.Alerts, nil
}

// MetricMetadataInGRPC returns the metadata from Prometheus metric metadata API. It uses gRPC errors.
func (c *Client) MetricMetadataInGRPC(ctx context.Context, base *url.URL, metric string, limit int) (map[string][]metadatapb.Meta, error) {
u := *base
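A minimal usage sketch for the new method; the endpoint URL is a placeholder, and constructing the client via `promclient.NewDefaultClient()` is an assumption about the surrounding package, not something shown in this diff:

```go
package main

import (
	"context"
	"fmt"
	"net/url"

	"github.com/thanos-io/thanos/pkg/promclient"
)

func main() {
	// Placeholder endpoint; anything serving the Prometheus /api/v1/alerts API works.
	base, err := url.Parse("http://prometheus.example.com:9090")
	if err != nil {
		panic(err)
	}

	client := promclient.NewDefaultClient()
	alerts, err := client.AlertsInGRPC(context.Background(), base)
	if err != nil {
		panic(err)
	}
	fmt.Printf("fetched %d active alerts\n", len(alerts))
}
```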
150 changes: 150 additions & 0 deletions pkg/rules/queryable.go
@@ -0,0 +1,150 @@
// Copyright (c) The Thanos Authors.
// Licensed under the Apache License 2.0.

package rules

import (
"context"
"math/rand"
"net/url"
"strings"
"time"

"github.com/cortexproject/cortex/pkg/querier/series"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/storage"

"github.com/thanos-io/thanos/pkg/httpconfig"
"github.com/thanos-io/thanos/pkg/promclient"
"github.com/thanos-io/thanos/pkg/store/storepb"
)

type promClientsQueryable struct {
httpMethod string
step time.Duration

logger log.Logger
promClients []*promclient.Client
queryClients []*httpconfig.Client
restoreIgnoreLabels []string

duplicatedQuery prometheus.Counter
}
type promClientsQuerier struct {
ctx context.Context
mint, maxt int64
step int64
httpMethod string

logger log.Logger
promClients []*promclient.Client
queryClients []*httpconfig.Client
restoreIgnoreLabels []string

// We use a dummy counter here because duplicated
// addresses are already tracked by the rule evaluation path.
duplicatedQuery prometheus.Counter
}

// NewPromClientsQueryable creates a queryable backed by the given query endpoints and their Prometheus API clients.
func NewPromClientsQueryable(logger log.Logger, queryClients []*httpconfig.Client, promClients []*promclient.Client,
httpMethod string, step time.Duration, restoreIgnoreLabels []string) *promClientsQueryable {
return &promClientsQueryable{
logger: logger,
queryClients: queryClients,
promClients: promClients,
duplicatedQuery: promauto.With(nil).NewCounter(prometheus.CounterOpts{}),
httpMethod: httpMethod,
step: step,
restoreIgnoreLabels: restoreIgnoreLabels,
}
}

// Querier returns a new Querier for the given time range.
func (q *promClientsQueryable) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) {
return &promClientsQuerier{
ctx: ctx,
mint: mint,
maxt: maxt,
step: int64(q.step / time.Second),
httpMethod: q.httpMethod,
logger: q.logger,
queryClients: q.queryClients,
promClients: q.promClients,
restoreIgnoreLabels: q.restoreIgnoreLabels,
}, nil
}

// Select implements storage.Querier interface.
func (q *promClientsQuerier) Select(_ bool, _ *storage.SelectHints, matchers ...*labels.Matcher) storage.SeriesSet {
query := storepb.PromMatchersToString(matchers...)

for _, i := range rand.Perm(len(q.queryClients)) {
promClient := q.promClients[i]
endpoints := RemoveDuplicateQueryEndpoints(q.logger, q.duplicatedQuery, q.queryClients[i].Endpoints())
for _, i := range rand.Perm(len(endpoints)) {
m, warns, err := promClient.QueryRange(q.ctx, endpoints[i], query, q.mint, q.maxt, q.step, promclient.QueryOptions{
Deduplicate: true,
Method: q.httpMethod,
})

if err != nil {
level.Error(q.logger).Log("err", err, "query", query)
continue
}
if len(warns) > 0 {
level.Warn(q.logger).Log("warnings", strings.Join(warns, ", "), "query", query)
}
matrix := make([]*model.SampleStream, 0, m.Len())
for _, metric := range m {
for _, label := range q.restoreIgnoreLabels {
delete(metric.Metric, model.LabelName(label))
}

matrix = append(matrix, &model.SampleStream{
Metric: metric.Metric,
Values: metric.Values,
})
}

return series.MatrixToSeriesSet(matrix)
}
}
return storage.NoopSeriesSet()
}

// LabelValues implements storage.LabelQuerier interface.
func (q *promClientsQuerier) LabelValues(name string, matchers ...*labels.Matcher) ([]string, storage.Warnings, error) {
return nil, nil, nil
}

// LabelNames implements storage.LabelQuerier interface.
func (q *promClientsQuerier) LabelNames(matchers ...*labels.Matcher) ([]string, storage.Warnings, error) {
return nil, nil, nil
}

// Close implements storage.LabelQuerier interface.
func (q *promClientsQuerier) Close() error {
return nil
}

// RemoveDuplicateQueryEndpoints removes duplicate endpoints from the list of urls.
func RemoveDuplicateQueryEndpoints(logger log.Logger, duplicatedQueriers prometheus.Counter, urls []*url.URL) []*url.URL {
set := make(map[string]struct{})
deduplicated := make([]*url.URL, 0, len(urls))
for _, u := range urls {
if _, ok := set[u.String()]; ok {
level.Warn(logger).Log("msg", "duplicate query address is provided", "addr", u.String())
duplicatedQueriers.Inc()
continue
}
deduplicated = append(deduplicated, u)
set[u.String()] = struct{}{}
}
return deduplicated
}
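A hedged usage sketch of the new queryable, written as a standalone helper: the client slices are assumed to be built the same way `runRule` builds them, and the time window, step, ignored label, and function name are illustrative only.

```go
package rulerexample

import (
	"context"
	"net/http"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/model/timestamp"

	"github.com/thanos-io/thanos/pkg/httpconfig"
	"github.com/thanos-io/thanos/pkg/promclient"
	"github.com/thanos-io/thanos/pkg/rules"
)

// restoreSketch selects ALERTS_FOR_STATE over the last hour through the restore
// queryable, the same path the rule manager takes when restoring alert state.
func restoreSketch(
	logger log.Logger,
	queryClients []*httpconfig.Client,
	promClients []*promclient.Client,
) error {
	q := rules.NewPromClientsQueryable(
		logger, queryClients, promClients,
		http.MethodPost,     // --query.http-method
		time.Second,         // --query.default-step
		[]string{"replica"}, // --restore-ignored-label (placeholder value)
	)

	maxt := time.Now()
	mint := maxt.Add(-1 * time.Hour)
	querier, err := q.Querier(context.Background(), timestamp.FromTime(mint), timestamp.FromTime(maxt))
	if err != nil {
		return err
	}
	defer func() { _ = querier.Close() }()

	set := querier.Select(false, nil,
		labels.MustNewMatcher(labels.MatchEqual, labels.MetricName, "ALERTS_FOR_STATE"))
	for set.Next() {
		level.Info(logger).Log("msg", "restorable series", "labels", set.At().Labels().String())
	}
	return set.Err()
}
```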
(2 of the 7 changed files are not shown in this view.)
