Skip to content

Commit

Permalink
Compact: Replace group with resolution in downsample metrics
Browse files Browse the repository at this point in the history
Compaction downsample metrics have too high a cardinality, causing metric
bloat on large installations. The group information is better suited to logs.
* Replace with a resolution label to reduce cardinality.

Fixes: #5841

Signed-off-by: SuperQ <[email protected]>
  • Loading branch information
SuperQ committed Apr 16, 2024
1 parent 5fb0c69 commit c1872c7
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 15 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re

- [#7123](https://github.com/thanos-io/thanos/pull/7123) Rule: Change default Alertmanager API version to v2.
- [#7223](https://github.com/thanos-io/thanos/pull/7223) Automatic detection of memory limits and configure GOMEMLIMIT to match.
- [#TBD](https://github.com/thanos-io/thanos/pull/TBD) Compact: *breaking :warning:* Replace group with resolution in compact downsample metrics to avoid cardinality explosion with large numbers of groups.

### Removed

Expand Down
6 changes: 3 additions & 3 deletions cmd/thanos/compact.go
Original file line number Diff line number Diff line change
Expand Up @@ -456,9 +456,9 @@ func runCompact(
}

for _, meta := range filteredMetas {
groupKey := meta.Thanos.GroupKey()
downsampleMetrics.downsamples.WithLabelValues(groupKey)
downsampleMetrics.downsampleFailures.WithLabelValues(groupKey)
resolutionLabel := meta.Thanos.ResolutionString()
downsampleMetrics.downsamples.WithLabelValues(resolutionLabel)
downsampleMetrics.downsampleFailures.WithLabelValues(resolutionLabel)
}

if err := downsampleBucket(
Expand Down
18 changes: 9 additions & 9 deletions cmd/thanos/downsample.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,16 @@ func newDownsampleMetrics(reg *prometheus.Registry) *DownsampleMetrics {
m.downsamples = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
Name: "thanos_compact_downsample_total",
Help: "Total number of downsampling attempts.",
}, []string{"group"})
}, []string{"resolution"})
m.downsampleFailures = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
Name: "thanos_compact_downsample_failures_total",
Help: "Total number of failed downsampling attempts.",
}, []string{"group"})
}, []string{"resolution"})
m.downsampleDuration = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
Name: "thanos_compact_downsample_duration_seconds",
Help: "Duration of downsample runs",
Buckets: []float64{60, 300, 900, 1800, 3600, 7200, 14400}, // 1m, 5m, 15m, 30m, 60m, 120m, 240m
}, []string{"group"})
}, []string{"resolution"})

return m
}
Expand Down Expand Up @@ -130,9 +130,9 @@ func RunDownsample(
}

for _, meta := range metas {

Check failure on line 132 in cmd/thanos/downsample.go

View workflow job for this annotation

GitHub Actions / Documentation check

meta declared and not used

Check failure on line 132 in cmd/thanos/downsample.go

View workflow job for this annotation

GitHub Actions / Linters (Static Analysis) for Go

meta declared and not used

Check failure on line 132 in cmd/thanos/downsample.go

View workflow job for this annotation

GitHub Actions / Linters (Static Analysis) for Go

meta declared and not used

Check failure on line 132 in cmd/thanos/downsample.go

View workflow job for this annotation

GitHub Actions / Linters (Static Analysis) for Go

meta declared and not used

Check failure on line 132 in cmd/thanos/downsample.go

View workflow job for this annotation

GitHub Actions / Linters (Static Analysis) for Go

meta declared and not used

Check failure on line 132 in cmd/thanos/downsample.go

View workflow job for this annotation

GitHub Actions / Go build for different platforms

meta declared and not used
groupKey := meta.Thanos.GroupKey()
metrics.downsamples.WithLabelValues(groupKey)
metrics.downsampleFailures.WithLabelValues(groupKey)
resolutionLabel := m.Thanos.ResolutionString()

Check failure on line 133 in cmd/thanos/downsample.go

View workflow job for this annotation

GitHub Actions / Documentation check

undefined: m

Check failure on line 133 in cmd/thanos/downsample.go

View workflow job for this annotation

GitHub Actions / Linters (Static Analysis) for Go

undefined: m

Check failure on line 133 in cmd/thanos/downsample.go

View workflow job for this annotation

GitHub Actions / Linters (Static Analysis) for Go

undefined: m

Check failure on line 133 in cmd/thanos/downsample.go

View workflow job for this annotation

GitHub Actions / Linters (Static Analysis) for Go

undefined: m

Check failure on line 133 in cmd/thanos/downsample.go

View workflow job for this annotation

GitHub Actions / Linters (Static Analysis) for Go

undefined: m

Check failure on line 133 in cmd/thanos/downsample.go

View workflow job for this annotation

GitHub Actions / Go build for different platforms

undefined: m
metrics.downsamples.WithLabelValues(resolutionLabel)
metrics.downsampleFailures.WithLabelValues(resolutionLabel)
}
if err := downsampleBucket(ctx, logger, metrics, insBkt, metas, dataDir, downsampleConcurrency, blockFilesConcurrency, hashFunc, false); err != nil {
return errors.Wrap(err, "downsampling failed")
Expand Down Expand Up @@ -263,11 +263,11 @@ func downsampleBucket(
errMsg = "downsampling to 60 min"
}
if err := processDownsampling(workerCtx, logger, bkt, m, dir, resolution, hashFunc, metrics, acceptMalformedIndex, blockFilesConcurrency); err != nil {
metrics.downsampleFailures.WithLabelValues(m.Thanos.GroupKey()).Inc()
metrics.downsampleFailures.WithLabelValues(m.Thanos.ResolutionString()).Inc()
errCh <- errors.Wrap(err, errMsg)

}
metrics.downsamples.WithLabelValues(m.Thanos.GroupKey()).Inc()
metrics.downsamples.WithLabelValues(m.Thanos.ResolutionString()).Inc()
}
}()
}
Expand Down Expand Up @@ -391,7 +391,7 @@ func processDownsampling(
downsampleDuration := time.Since(begin)
level.Info(logger).Log("msg", "downsampled block",
"from", m.ULID, "to", id, "duration", downsampleDuration, "duration_ms", downsampleDuration.Milliseconds())
metrics.downsampleDuration.WithLabelValues(m.Thanos.GroupKey()).Observe(downsampleDuration.Seconds())
metrics.downsampleDuration.WithLabelValues(m.Thanos.ResolutionString()).Observe(downsampleDuration.Seconds())

stats, err := block.GatherIndexHealthStats(ctx, logger, filepath.Join(resdir, block.IndexFilename), m.MinTime, m.MaxTime)
if err == nil {
Expand Down
6 changes: 3 additions & 3 deletions cmd/thanos/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ func TestRegression4960_Deadlock(t *testing.T) {
testutil.Ok(t, err)

metrics := newDownsampleMetrics(prometheus.NewRegistry())
testutil.Equals(t, 0.0, promtest.ToFloat64(metrics.downsamples.WithLabelValues(meta.Thanos.GroupKey())))
testutil.Equals(t, 0.0, promtest.ToFloat64(metrics.downsamples.WithLabelValues(meta.Thanos.ResolutionString())))
baseBlockIDsFetcher := block.NewConcurrentLister(logger, bkt)
metaFetcher, err := block.NewMetaFetcher(nil, block.FetcherConcurrency, bkt, baseBlockIDsFetcher, "", nil, nil)
testutil.Ok(t, err)
Expand Down Expand Up @@ -197,15 +197,15 @@ func TestCleanupDownsampleCacheFolder(t *testing.T) {
testutil.Ok(t, err)

metrics := newDownsampleMetrics(prometheus.NewRegistry())
testutil.Equals(t, 0.0, promtest.ToFloat64(metrics.downsamples.WithLabelValues(meta.Thanos.GroupKey())))
testutil.Equals(t, 0.0, promtest.ToFloat64(metrics.downsamples.WithLabelValues(meta.Thanos.ResolutionString())))
baseBlockIDsFetcher := block.NewConcurrentLister(logger, bkt)
metaFetcher, err := block.NewMetaFetcher(nil, block.FetcherConcurrency, bkt, baseBlockIDsFetcher, "", nil, nil)
testutil.Ok(t, err)

metas, _, err := metaFetcher.Fetch(ctx)
testutil.Ok(t, err)
testutil.Ok(t, downsampleBucket(ctx, logger, metrics, bkt, metas, dir, 1, 1, metadata.NoneFunc, false))
testutil.Equals(t, 1.0, promtest.ToFloat64(metrics.downsamples.WithLabelValues(meta.Thanos.GroupKey())))
testutil.Equals(t, 1.0, promtest.ToFloat64(metrics.downsamples.WithLabelValues(meta.Thanos.ResolutionString())))

_, err = os.Stat(dir)
testutil.Assert(t, os.IsNotExist(err), "index cache dir should not exist at the end of execution")
Expand Down

0 comments on commit c1872c7

Please sign in to comment.