stateless ruler restores alert state
Signed-off-by: Ben Ye <[email protected]>
Ben Ye committed Mar 13, 2022
1 parent 6eb5ce6 commit 3e2375c
Showing 4 changed files with 171 additions and 36 deletions.
3 changes: 3 additions & 0 deletions cmd/thanos/config.go
@@ -173,6 +173,7 @@ type queryConfig struct {
dnsSDInterval time.Duration
httpMethod string
dnsSDResolver string
step time.Duration
}

func (qc *queryConfig) registerFlag(cmd extkingpin.FlagClause) *queryConfig {
@@ -189,6 +190,8 @@ func (qc *queryConfig) registerFlag(cmd extkingpin.FlagClause) *queryConfig {
Default("POST").EnumVar(&qc.httpMethod, "GET", "POST")
cmd.Flag("query.sd-dns-resolver", "Resolver to use. Possible options: [golang, miekgdns]").
Default("golang").Hidden().StringVar(&qc.dnsSDResolver)
cmd.Flag("query.default-step", "Default range query step to use. This is only used in stateless Ruler and alert state restoration.").
Default("1s").DurationVar(&qc.step)
return qc
}

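
The new step field is a plain time.Duration; the restoration path later converts it into the integer millisecond step that the range-query API expects (see pkg/rules/queryable.go below). A minimal sketch of that conversion, with an illustrative helper name that is not taken from the commit:

package main

import (
	"fmt"
	"time"
)

// stepToMillis converts the --query.default-step duration into the
// millisecond step passed to a Prometheus-style range-query API.
// The helper name is illustrative; the commit does this conversion inline.
func stepToMillis(step time.Duration) int64 {
	return int64(step / time.Millisecond)
}

func main() {
	fmt.Println(stepToMillis(time.Second))      // 1000 (the flag default)
	fmt.Println(stepToMillis(30 * time.Second)) // 30000
}
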
66 changes: 30 additions & 36 deletions cmd/thanos/rule.go
@@ -84,12 +84,14 @@ type ruleConfig struct {

rwConfig *extflag.PathOrContent

resendDelay time.Duration
evalInterval time.Duration
ruleFiles []string
objStoreConfig *extflag.PathOrContent
dataDir string
lset labels.Labels
resendDelay time.Duration
evalInterval time.Duration
outageTolerance time.Duration
forGracePeriod time.Duration
ruleFiles []string
objStoreConfig *extflag.PathOrContent
dataDir string
lset labels.Labels
}

func (rc *ruleConfig) registerFlag(cmd extkingpin.FlagClause) {
@@ -125,6 +127,10 @@ func registerRule(app *extkingpin.App) {
Default("1m").DurationVar(&conf.resendDelay)
cmd.Flag("eval-interval", "The default evaluation interval to use.").
Default("30s").DurationVar(&conf.evalInterval)
cmd.Flag("for-outage-tolerance", "Max time to tolerate prometheus outage for restoring \"for\" state of alert.").
Default("1h").DurationVar(&conf.outageTolerance)
cmd.Flag("for-grace-period", "Minimum duration between alert and restored \"for\" state. This is maintained only for alerts with configured \"for\" time greater than grace period.").
Default("10s").DurationVar(&conf.forGracePeriod)

conf.rwConfig = extflag.RegisterPathOrContent(cmd, "remote-write.config", "YAML config for the remote-write configurations, that specify servers where samples should be sent to (see https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write). This automatically enables stateless mode for ruler and no series will be stored in the ruler's TSDB. If an empty config (or file) is provided, the flag is ignored and ruler is run with its own TSDB.", extflag.WithEnvSubstitution())

@@ -320,7 +326,10 @@ func runRule(
extprom.WrapRegistererWithPrefix("thanos_rule_query_apis_", reg),
dns.ResolverType(conf.query.dnsSDResolver),
)
var queryClients []*httpconfig.Client
var (
queryClients []*httpconfig.Client
promClients []*promclient.Client
)
queryClientMetrics := extpromhttp.NewClientMetrics(extprom.WrapRegistererWith(prometheus.Labels{"client": "query"}, reg))
for _, cfg := range queryCfg {
cfg.HTTPClientConfig.ClientMetrics = queryClientMetrics
@@ -334,6 +343,7 @@
return err
}
queryClients = append(queryClients, queryClient)
promClients = append(promClients, promclient.NewClient(queryClient, logger, "thanos-rule"))
// Discover and resolve query addresses.
addDiscoveryGroups(g, queryClient, conf.query.dnsSDInterval)
}
@@ -377,7 +387,8 @@ }
}
fanoutStore := storage.NewFanout(logger, agentDB, remoteStore)
appendable = fanoutStore
queryable = fanoutStore
// Use a separate queryable to restore the ALERTS firing states.
queryable = thanosrules.NewPromClientsQueryable(logger, queryClients, promClients, conf.query.httpMethod, conf.query.step)
} else {
tsdbDB, err = tsdb.Open(conf.dataDir, log.With(logger, "component", "tsdb"), reg, tsdbOpts, nil)
if err != nil {
@@ -495,14 +506,16 @@ )
reg,
conf.dataDir,
rules.ManagerOptions{
NotifyFunc: notifyFunc,
Logger: logger,
Appendable: appendable,
ExternalURL: nil,
Queryable: queryable,
ResendDelay: conf.resendDelay,
NotifyFunc: notifyFunc,
Logger: logger,
Appendable: appendable,
ExternalURL: nil,
Queryable: queryable,
ResendDelay: conf.resendDelay,
OutageTolerance: conf.outageTolerance,
ForGracePeriod: conf.forGracePeriod,
},
queryFuncCreator(logger, queryClients, metrics.duplicatedQuery, metrics.ruleEvalWarnings, conf.query.httpMethod),
queryFuncCreator(logger, queryClients, promClients, metrics.duplicatedQuery, metrics.ruleEvalWarnings, conf.query.httpMethod),
conf.lset,
// In our case the querying URL is the external URL because in Prometheus
// --web.external-url points to it i.e. it points at something where the user
@@ -769,24 +782,10 @@ func labelsTSDBToProm(lset labels.Labels) (res labels.Labels) {
return res
}

func removeDuplicateQueryEndpoints(logger log.Logger, duplicatedQueriers prometheus.Counter, urls []*url.URL) []*url.URL {
set := make(map[string]struct{})
deduplicated := make([]*url.URL, 0, len(urls))
for _, u := range urls {
if _, ok := set[u.String()]; ok {
level.Warn(logger).Log("msg", "duplicate query address is provided", "addr", u.String())
duplicatedQueriers.Inc()
continue
}
deduplicated = append(deduplicated, u)
set[u.String()] = struct{}{}
}
return deduplicated
}

func queryFuncCreator(
logger log.Logger,
queriers []*httpconfig.Client,
promClients []*promclient.Client,
duplicatedQuery prometheus.Counter,
ruleEvalWarnings *prometheus.CounterVec,
httpMethod string,
@@ -807,15 +806,10 @@ func queryFuncCreator(
panic(errors.Errorf("unknown partial response strategy %v", partialResponseStrategy).Error())
}

promClients := make([]*promclient.Client, 0, len(queriers))
for _, q := range queriers {
promClients = append(promClients, promclient.NewClient(q, logger, "thanos-rule"))
}

return func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
for _, i := range rand.Perm(len(queriers)) {
promClient := promClients[i]
endpoints := removeDuplicateQueryEndpoints(logger, duplicatedQuery, queriers[i].Endpoints())
endpoints := thanosrules.RemoveDuplicateQueryEndpoints(logger, duplicatedQuery, queriers[i].Endpoints())
for _, i := range rand.Perm(len(endpoints)) {
span, ctx := tracing.StartSpan(ctx, spanID)
v, warns, err := promClient.PromqlQueryInstant(ctx, endpoints[i], q, t, promclient.QueryOptions{
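
Taken together, these rule.go changes wire the stateless ruler so that evaluation results are appended to a fanout of the WAL-only agent database and remote-write storage, while the reads used to restore alert "for" state go through the new Prometheus-clients queryable rather than a local TSDB. A condensed, hedged sketch of that wiring; the function and parameter names are illustrative, and only the constructors and ManagerOptions fields come from the diff:

package main

import (
	"time"

	"github.com/go-kit/log"
	"github.com/prometheus/prometheus/rules"
	"github.com/prometheus/prometheus/storage"

	"github.com/thanos-io/thanos/pkg/httpconfig"
	"github.com/thanos-io/thanos/pkg/promclient"
	thanosrules "github.com/thanos-io/thanos/pkg/rules"
)

// statelessManagerOptions sketches the wiring above: evaluation results are
// appended to a fanout of the WAL-only agent DB and remote-write storage,
// while the Queryable used to restore alert "for" state is backed by the
// configured query endpoints. Names and simplified types are illustrative.
func statelessManagerOptions(
	logger log.Logger,
	agentDB, remoteStore storage.Storage, // simplified types for the sketch
	queryClients []*httpconfig.Client,
	promClients []*promclient.Client,
	httpMethod string,
	step, resendDelay, outageTolerance, forGracePeriod time.Duration,
) rules.ManagerOptions {
	fanout := storage.NewFanout(logger, agentDB, remoteStore)
	return rules.ManagerOptions{
		Logger:     logger,
		Appendable: fanout, // rule results flow out via remote write
		// A separate queryable restores the ALERTS firing states from queriers.
		Queryable:       thanosrules.NewPromClientsQueryable(logger, queryClients, promClients, httpMethod, step),
		ResendDelay:     resendDelay,
		OutageTolerance: outageTolerance, // --for-outage-tolerance
		ForGracePeriod:  forGracePeriod,  // --for-grace-period
	}
}

func main() {}
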
9 changes: 9 additions & 0 deletions docs/components/rule.md
@@ -311,6 +311,12 @@ Flags:
prefix for the regular Alertmanager API path.
--data-dir="data/" data directory
--eval-interval=30s The default evaluation interval to use.
--for-grace-period=10s Minimum duration between alert and restored
"for" state. This is maintained only for alerts
with configured "for" time greater than grace
period.
--for-outage-tolerance=1h Max time to tolerate prometheus outage for
restoring "for" state of alert.
--grpc-address="0.0.0.0:10901"
Listen ip:port address for gRPC endpoints
(StoreAPI). Make sure this address is routable
@@ -385,6 +391,9 @@ Flags:
https://thanos.io/tip/components/rule.md/#configuration.
If defined, it takes precedence over the
'--query' and '--query.sd-files' flags.
--query.default-step=1s Default range query step to use. This is only
used in stateless Ruler and alert state
restoration.
--query.http-method=POST HTTP method to use when sending queries.
Possible options: [GET, POST]
--query.sd-dns-interval=30s
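
One sizing implication of these defaults (an estimate, assuming the state-restoration read spans the full --for-outage-tolerance window; the docs do not state this): the 1h default at the 1s --query.default-step default corresponds to up to 3600 points per restored series, so a coarser step lightens the restoration range queries at the cost of resolution. The arithmetic:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Rough upper bound on samples fetched per restored series, assuming the
	// restoration query covers the whole outage-tolerance window.
	outageTolerance := time.Hour // --for-outage-tolerance default
	step := time.Second          // --query.default-step default
	fmt.Println(int64(outageTolerance / step)) // 3600
}
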
129 changes: 129 additions & 0 deletions pkg/rules/queryable.go
@@ -0,0 +1,129 @@
package rules

import (
"context"
"math/rand"
"net/url"
"strings"
"time"

"github.com/cortexproject/cortex/pkg/querier/series"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/storage"

"github.com/thanos-io/thanos/pkg/httpconfig"
"github.com/thanos-io/thanos/pkg/promclient"
"github.com/thanos-io/thanos/pkg/store/storepb"
)

type promClientsQueryable struct {
httpMethod string
step time.Duration

logger log.Logger
promClients []*promclient.Client
queryClients []*httpconfig.Client

duplicatedQuery prometheus.Counter
}
type promClientsQuerier struct {
ctx context.Context
mint, maxt int64
step int64
httpMethod string

logger log.Logger
promClients []*promclient.Client
queryClients []*httpconfig.Client

// A dummy counter is used here because duplicated
// addresses are already tracked by the rule-evaluation path.
duplicatedQuery prometheus.Counter
}

// NewPromClientsQueryable creates a queryable that issues queries against the configured query endpoints using Prometheus HTTP clients.
func NewPromClientsQueryable(logger log.Logger, queryClients []*httpconfig.Client, promClients []*promclient.Client, httpMethod string, step time.Duration) *promClientsQueryable {
return &promClientsQueryable{
logger: logger,
queryClients: queryClients,
promClients: promClients,
duplicatedQuery: prometheus.NewCounter(prometheus.CounterOpts{}),
httpMethod: httpMethod,
step: step,
}
}

// Querier returns a new Querier for the given time range.
func (q *promClientsQueryable) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) {
return &promClientsQuerier{
ctx: ctx,
mint: mint,
maxt: maxt,
step: int64(q.step / time.Millisecond),
httpMethod: q.httpMethod,
logger: q.logger,
queryClients: q.queryClients,
promClients: q.promClients,
}, nil
}

// Select implements storage.Querier interface.
func (q *promClientsQuerier) Select(_ bool, _ *storage.SelectHints, matchers ...*labels.Matcher) storage.SeriesSet {
query := storepb.PromMatchersToString(matchers...)

for _, i := range rand.Perm(len(q.queryClients)) {
promClient := q.promClients[i]
endpoints := RemoveDuplicateQueryEndpoints(q.logger, q.duplicatedQuery, q.queryClients[i].Endpoints())
for _, i := range rand.Perm(len(endpoints)) {
m, warns, err := promClient.QueryRange(q.ctx, endpoints[i], query, q.mint, q.maxt, q.step, promclient.QueryOptions{
Deduplicate: true,
Method: q.httpMethod,
})

if err != nil {
level.Error(q.logger).Log("err", err, "query", query)
continue
}
if len(warns) > 0 {
level.Warn(q.logger).Log("warnings", strings.Join(warns, ", "), "query", query)
}

return series.MatrixToSeriesSet(m)
}
}
return storage.NoopSeriesSet()
}

// LabelValues implements storage.LabelQuerier interface.
func (q *promClientsQuerier) LabelValues(name string, matchers ...*labels.Matcher) ([]string, storage.Warnings, error) {
return nil, nil, nil
}

// LabelNames implements storage.LabelQuerier interface.
func (q *promClientsQuerier) LabelNames(matchers ...*labels.Matcher) ([]string, storage.Warnings, error) {
return nil, nil, nil
}

// Close implements storage.LabelQuerier interface.
func (q *promClientsQuerier) Close() error {
return nil
}

// RemoveDuplicateQueryEndpoints removes duplicate endpoints from the list of urls.
func RemoveDuplicateQueryEndpoints(logger log.Logger, duplicatedQueriers prometheus.Counter, urls []*url.URL) []*url.URL {
set := make(map[string]struct{})
deduplicated := make([]*url.URL, 0, len(urls))
for _, u := range urls {
if _, ok := set[u.String()]; ok {
level.Warn(logger).Log("msg", "duplicate query address is provided", "addr", u.String())
duplicatedQueriers.Inc()
continue
}
deduplicated = append(deduplicated, u)
set[u.String()] = struct{}{}
}
return deduplicated
}
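
For illustration, a small self-contained sketch of how the new queryable can be exercised. The ALERTS_FOR_STATE matcher and the one-hour window reflect how Prometheus's for-state restoration is expected to call it (an assumption, not spelled out in this file), and the empty client slices simply keep the example runnable:

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/go-kit/log"
	"github.com/prometheus/prometheus/model/labels"

	"github.com/thanos-io/thanos/pkg/httpconfig"
	"github.com/thanos-io/thanos/pkg/promclient"
	thanosrules "github.com/thanos-io/thanos/pkg/rules"
)

func main() {
	logger := log.NewNopLogger()

	// In runRule these are built from the --query* configuration; empty
	// slices keep the sketch self-contained (Select then returns a no-op set).
	var queryClients []*httpconfig.Client
	var promClients []*promclient.Client

	queryable := thanosrules.NewPromClientsQueryable(logger, queryClients, promClients, "POST", time.Second)

	// The rule manager requests a querier covering the outage-tolerance
	// window ending at restoration time.
	maxt := time.Now()
	mint := maxt.Add(-time.Hour)
	q, err := queryable.Querier(context.Background(), mint.UnixMilli(), maxt.UnixMilli())
	if err != nil {
		panic(err)
	}
	defer q.Close()

	// Matcher shown for illustration: Prometheus restores "for" state from
	// the ALERTS_FOR_STATE series it wrote before the restart.
	set := q.Select(false, nil, labels.MustNewMatcher(labels.MatchEqual, labels.MetricName, "ALERTS_FOR_STATE"))
	for set.Next() {
		fmt.Println(set.At().Labels())
	}
}
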
