Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Exclude vector queries from being counted in metric for rules with zero fetched series #6544

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

* [CHANGE] Ingester: Increase default value of `-blocks-storage.tsdb.head-postings-for-matchers-cache-max-bytes` and `-blocks-storage.tsdb.block-postings-for-matchers-cache-max-bytes` to 100 MiB (previous default value was 10 MiB). #6764
* [ENHANCEMENT] PromQL: ignore small errors for bucketQuantile #6766
* [ENHANCEMENT] Ruler: exclude vector queries from being tracked in `cortex_ruler_queries_zero_fetched_series_total`. #6544
* [BUGFIX] Ingester: don't ignore errors encountered while iterating through chunks or samples in response to a query request. #6451
* [BUGFIX] Fix issue where queries can fail or omit OOO samples if OOO head compaction occurs between creating a querier and reading chunks #6766
* [BUGFIX] Fix issue where concatenatingChunkIterator can obscure errors #6766
Expand Down
68 changes: 66 additions & 2 deletions integration/ruler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -694,10 +694,58 @@ func TestRulerMetricsForInvalidQueriesAndNoFetchedSeries(t *testing.T) {

deleteRuleAndWait(groupName)

const groupName2 = "good_rule_with_no_fetched_series_and_no_samples"
const expression2 = `sum(metric{foo="10000"})`
const groupName2 = "good_rule_with_fetched_series_and_samples"
const expression2 = `sum(metric{foo=~"1|2"})`
addNewRuleAndWait(groupName2, expression2, false)

// Ensure that samples were returned.
sum, err = ruler.SumMetrics([]string{"cortex_prometheus_last_evaluation_samples"})
require.NoError(t, err)
require.Less(t, float64(0), sum[0])

// Ensure that the metric for no fetched series was not incremented.
sum, err = ruler.SumMetrics([]string{"cortex_ruler_queries_zero_fetched_series_total"})
require.NoError(t, err)
require.Equal(t, float64(0), sum[0])

deleteRuleAndWait(groupName2)

const groupName3 = "good_rule_with_no_series_selector"
const expression3 = `vector(1.2345)`
addNewRuleAndWait(groupName3, expression3, false)

// Ensure that samples were returned.
sum, err = ruler.SumMetrics([]string{"cortex_prometheus_last_evaluation_samples"})
require.NoError(t, err)
require.Less(t, float64(0), sum[0])

// Ensure that the metric for no fetched series was not incremented.
sum, err = ruler.SumMetrics([]string{"cortex_ruler_queries_zero_fetched_series_total"})
require.NoError(t, err)
require.Equal(t, float64(0), sum[0])

deleteRuleAndWait(groupName3)

const groupName4 = "good_rule_with_fetched_series_and_samples_and_non_series_selector"
const expression4 = `sum(metric{foo=~"1|2"}) + vector(1.2345)`
addNewRuleAndWait(groupName4, expression4, false)

// Ensure that samples were returned.
sum, err = ruler.SumMetrics([]string{"cortex_prometheus_last_evaluation_samples"})
require.NoError(t, err)
require.Less(t, float64(0), sum[0])

// Ensure that the metric for no fetched series was not incremented.
sum, err = ruler.SumMetrics([]string{"cortex_ruler_queries_zero_fetched_series_total"})
require.NoError(t, err)
require.Equal(t, float64(0), sum[0])

deleteRuleAndWait(groupName4)

const groupName5 = "good_rule_with_no_fetched_series_and_no_samples_and_non_series_selector"
const expression5 = `sum(metric{foo="10000"}) + vector(1.2345)`
addNewRuleAndWait(groupName5, expression5, false)

// Ensure that no samples were returned.
sum, err = ruler.SumMetrics([]string{"cortex_prometheus_last_evaluation_samples"})
require.NoError(t, err)
Expand All @@ -707,6 +755,22 @@ func TestRulerMetricsForInvalidQueriesAndNoFetchedSeries(t *testing.T) {
sum, err = ruler.SumMetrics([]string{"cortex_ruler_queries_zero_fetched_series_total"})
require.NoError(t, err)
require.Equal(t, float64(1), sum[0])

deleteRuleAndWait(groupName5)

const groupName6 = "good_rule_with_no_fetched_series_and_no_samples"
const expression6 = `sum(metric{foo="10000"})`
addNewRuleAndWait(groupName6, expression6, false)

// Ensure that no samples were returned.
sum, err = ruler.SumMetrics([]string{"cortex_prometheus_last_evaluation_samples"})
require.NoError(t, err)
require.Equal(t, float64(0), sum[0])

// Ensure that the metric for no fetched series was incremented.
sum, err = ruler.SumMetrics([]string{"cortex_ruler_queries_zero_fetched_series_total"})
require.NoError(t, err)
require.Equal(t, float64(2), sum[0])
})

// Now let's stop ingester, and recheck metrics. This should increase cortex_ruler_queries_failed_total failures.
Expand Down
11 changes: 9 additions & 2 deletions pkg/ruler/compat.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"github.com/prometheus/prometheus/model/metadata"
"github.com/prometheus/prometheus/notifier"
"github.com/prometheus/prometheus/promql"
"github.com/prometheus/prometheus/promql/parser"
"github.com/prometheus/prometheus/rules"
"github.com/prometheus/prometheus/storage"

Expand Down Expand Up @@ -208,8 +209,14 @@ func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, queryTime, zeroFetchedS
shardedQueries := stats.LoadShardedQueries()

queryTime.Add(wallTime.Seconds())
if err == nil && numSeries == 0 { // Do not count queries with errors for zero fetched series.
zeroFetchedSeriesCount.Add(1)
// Do not count queries with errors for zero fetched series, or queries
// with no selectors that are not meant to fetch any series.
if err == nil && numSeries == 0 {
if expr, err := parser.ParseExpr(qs); err == nil {
if len(parser.ExtractSelectors(expr)) > 0 {
zeroFetchedSeriesCount.Add(1)
}
}
}

// Log ruler query stats.
Expand Down
28 changes: 27 additions & 1 deletion pkg/ruler/compat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -354,13 +354,39 @@ func TestRecordAndReportRuleQueryMetrics(t *testing.T) {
}
qf := RecordAndReportRuleQueryMetrics(mockFunc, queryTime.WithLabelValues("userID"), zeroFetchedSeriesCount.WithLabelValues("userID"), log.NewNopLogger())

// Ensure we start with counters at 0.
require.LessOrEqual(t, float64(0), testutil.ToFloat64(queryTime.WithLabelValues("userID")))
require.Equal(t, float64(0), testutil.ToFloat64(zeroFetchedSeriesCount.WithLabelValues("userID")))

// Increment zeroFetchedSeriesCount for non-existent series.
_, _ = qf(context.Background(), "test", time.Now())
require.LessOrEqual(t, float64(1), testutil.ToFloat64(queryTime.WithLabelValues("userID")))
require.Equal(t, float64(1), testutil.ToFloat64(zeroFetchedSeriesCount.WithLabelValues("userID")))

_, _ = qf(context.Background(), "test", time.Now())
// Increment zeroFetchedSeriesCount for another non-existent series.
_, _ = qf(context.Background(), "test2", time.Now())
require.LessOrEqual(t, float64(2), testutil.ToFloat64(queryTime.WithLabelValues("userID")))
require.Equal(t, float64(2), testutil.ToFloat64(zeroFetchedSeriesCount.WithLabelValues("userID")))

// Don't increment zeroFetchedSeriesCount for query without series selectors.
_, _ = qf(context.Background(), "vector(0.995)", time.Now())
require.LessOrEqual(t, float64(3), testutil.ToFloat64(queryTime.WithLabelValues("userID")))
require.Equal(t, float64(2), testutil.ToFloat64(zeroFetchedSeriesCount.WithLabelValues("userID")))

// Don't increment zeroFetchedSeriesCount for another query without series selectors.
_, _ = qf(context.Background(), "vector(2.4192e+15 / 1e+09)", time.Now())
require.LessOrEqual(t, float64(4), testutil.ToFloat64(queryTime.WithLabelValues("userID")))
require.Equal(t, float64(2), testutil.ToFloat64(zeroFetchedSeriesCount.WithLabelValues("userID")))

// Increment zeroFetchedSeriesCount for non-existent series even when combined with a non-series selector.
_, _ = qf(context.Background(), "test + vector(0.995)", time.Now())
require.LessOrEqual(t, float64(5), testutil.ToFloat64(queryTime.WithLabelValues("userID")))
require.Equal(t, float64(3), testutil.ToFloat64(zeroFetchedSeriesCount.WithLabelValues("userID")))

// Don't increment zeroFetchedSeriesCount for queries with errors.
_, _ = qf(context.Background(), "rate(test)", time.Now())
require.LessOrEqual(t, float64(6), testutil.ToFloat64(queryTime.WithLabelValues("userID")))
require.Equal(t, float64(3), testutil.ToFloat64(zeroFetchedSeriesCount.WithLabelValues("userID")))
}

// TestManagerFactory_CorrectQueryableUsed ensures that when evaluating a group with non-empty SourceTenants
Expand Down