From 1f39ade37918b32d641ad86ced44d10acbc133cb Mon Sep 17 00:00:00 2001 From: Andy Asp Date: Mon, 15 Jan 2024 15:52:23 -0500 Subject: [PATCH 1/2] Reduce sync concurrency in store-gateway by default to reduce disk contention --- CHANGELOG.md | 1 + cmd/mimir/config-descriptor.json | 4 ++-- cmd/mimir/help-all.txt.tmpl | 4 ++-- .../sources/mimir/configure/configuration-parameters/index.md | 4 ++-- operations/mimir/config.libsonnet | 2 -- operations/mimir/store-gateway.libsonnet | 3 --- pkg/storage/tsdb/config.go | 4 ++-- 7 files changed, 9 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e33e553afd3..f8a56abd30f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ * `prometheus_sd_refresh_failures_total` renamed to `cortex_prometheus_sd_refresh_failures_total` * `prometheus_sd_refresh_duration_seconds` renamed to `cortex_prometheus_sd_refresh_duration_seconds` * [CHANGE] Query-frontend: the default value for `-query-frontend.not-running-timeout` has been changed from 0 (disabled) to 2s. The configuration option has also been moved from "experimental" to "advanced". #7126 +* [CHANGE] Store-gateway: to reduce disk contention the default value for `blocks-storage.bucket-store.tenant-sync-concurrency` has been changed from `10` to `1` and the default value for `blocks-storage.bucket-store.block-sync-concurrency` has been changed from `20` to `4`. #7136 * [FEATURE] Introduce `-tenant-federation.max-tenants` option to limit the max number of tenants allowed for requests when federation is enabled. #6959 * [FEATURE] Cardinality API: added a new `count_method` parameter which enables counting active label values. #7085 * [FEATURE] Querier / query-frontend: added `-querier.promql-experimental-functions-enabled` CLI flag (and respective YAML config option) to enable experimental PromQL functions. The experimental functions introduced are: `mad_over_time()`, `sort_by_label()` and `sort_by_label_desc()`. 
#7057 diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 31913fe11f4..4a16404a814 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -6366,7 +6366,7 @@ "required": false, "desc": "Maximum number of concurrent tenants synching blocks.", "fieldValue": null, - "fieldDefaultValue": 10, + "fieldDefaultValue": 1, "fieldFlag": "blocks-storage.bucket-store.tenant-sync-concurrency", "fieldType": "int", "fieldCategory": "advanced" @@ -6377,7 +6377,7 @@ "required": false, "desc": "Maximum number of concurrent blocks synching per tenant.", "fieldValue": null, - "fieldDefaultValue": 20, + "fieldDefaultValue": 4, "fieldFlag": "blocks-storage.bucket-store.block-sync-concurrency", "fieldType": "int", "fieldCategory": "advanced" diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 414e1a98b7c..6acef59128a 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -298,7 +298,7 @@ Usage of ./cmd/mimir/mimir: -blocks-storage.bucket-store.batch-series-size int This option controls how many series to fetch per batch. The batch size must be greater than 0. (default 5000) -blocks-storage.bucket-store.block-sync-concurrency int - Maximum number of concurrent blocks synching per tenant. (default 20) + Maximum number of concurrent blocks synching per tenant. (default 4) -blocks-storage.bucket-store.bucket-index.idle-timeout duration How long a unused bucket index should be cached. Once this timeout expires, the unused bucket index is removed from the in-memory cache. This option is used only by querier. (default 1h0m0s) -blocks-storage.bucket-store.bucket-index.max-stale-period duration @@ -660,7 +660,7 @@ Usage of ./cmd/mimir/mimir: -blocks-storage.bucket-store.sync-interval duration How frequently to scan the bucket, or to refresh the bucket index (if enabled), in order to look for changes (new blocks shipped by ingesters and blocks deleted by retention or compaction). 
(default 15m0s) -blocks-storage.bucket-store.tenant-sync-concurrency int - Maximum number of concurrent tenants synching blocks. (default 10) + Maximum number of concurrent tenants synching blocks. (default 1) -blocks-storage.filesystem.dir string Local filesystem storage directory. (default "blocks") -blocks-storage.gcs.bucket-name string diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 4a6bdf92bb9..2a7d234391c 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -3474,11 +3474,11 @@ bucket_store: # (advanced) Maximum number of concurrent tenants synching blocks. # CLI flag: -blocks-storage.bucket-store.tenant-sync-concurrency - [tenant_sync_concurrency: | default = 10] + [tenant_sync_concurrency: | default = 1] # (advanced) Maximum number of concurrent blocks synching per tenant. # CLI flag: -blocks-storage.bucket-store.block-sync-concurrency - [block_sync_concurrency: | default = 20] + [block_sync_concurrency: | default = 4] # (advanced) Number of Go routines to use when syncing block meta files from # object storage per tenant. diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet index 94d6f38282b..790f7f17a30 100644 --- a/operations/mimir/config.libsonnet +++ b/operations/mimir/config.libsonnet @@ -98,8 +98,6 @@ // When store_gateway_lazy_loading_enabled: true, block index-headers are pre-downloaded but lazy loaded at query time. // Enabling lazy loading results in faster startup times at the cost of some latency during query time. - // store_gateway_lazy_loading_enabled: false will also reduce the concurrency of blocks syncing; - // this improves startup times when running on HDDs instead of SSDs as it reduces random reads. 
store_gateway_lazy_loading_enabled: true, // Number of memcached replicas for each memcached statefulset diff --git a/operations/mimir/store-gateway.libsonnet b/operations/mimir/store-gateway.libsonnet index 53e35d4dd05..143bbac2bcf 100644 --- a/operations/mimir/store-gateway.libsonnet +++ b/operations/mimir/store-gateway.libsonnet @@ -37,9 +37,6 @@ 'blocks-storage.bucket-store.index-header.lazy-loading-idle-timeout': '60m', } else { 'blocks-storage.bucket-store.index-header.lazy-loading-enabled': 'false', - // Force fewer random disk reads; this increases throughoput and reduces i/o wait on HDDs. - 'blocks-storage.bucket-store.block-sync-concurrency': 4, - 'blocks-storage.bucket-store.tenant-sync-concurrency': 1, }) + $.blocks_chunks_concurrency_connection_config + $.blocks_chunks_caching_config + diff --git a/pkg/storage/tsdb/config.go b/pkg/storage/tsdb/config.go index a35179d36b2..8c3135b5dfe 100644 --- a/pkg/storage/tsdb/config.go +++ b/pkg/storage/tsdb/config.go @@ -444,8 +444,8 @@ func (cfg *BucketStoreConfig) RegisterFlags(f *flag.FlagSet) { f.DurationVar(&cfg.SyncInterval, "blocks-storage.bucket-store.sync-interval", 15*time.Minute, "How frequently to scan the bucket, or to refresh the bucket index (if enabled), in order to look for changes (new blocks shipped by ingesters and blocks deleted by retention or compaction).") f.Uint64Var(&cfg.SeriesHashCacheMaxBytes, "blocks-storage.bucket-store.series-hash-cache-max-size-bytes", uint64(1*units.Gibibyte), "Max size - in bytes - of the in-memory series hash cache. The cache is shared across all tenants and it's used only when query sharding is enabled.") f.IntVar(&cfg.MaxConcurrent, "blocks-storage.bucket-store.max-concurrent", 100, "Max number of concurrent queries to execute against the long-term storage. 
The limit is shared across all tenants.") - f.IntVar(&cfg.TenantSyncConcurrency, "blocks-storage.bucket-store.tenant-sync-concurrency", 10, "Maximum number of concurrent tenants synching blocks.") - f.IntVar(&cfg.BlockSyncConcurrency, "blocks-storage.bucket-store.block-sync-concurrency", 20, "Maximum number of concurrent blocks synching per tenant.") + f.IntVar(&cfg.TenantSyncConcurrency, "blocks-storage.bucket-store.tenant-sync-concurrency", 1, "Maximum number of concurrent tenants synching blocks.") + f.IntVar(&cfg.BlockSyncConcurrency, "blocks-storage.bucket-store.block-sync-concurrency", 4, "Maximum number of concurrent blocks synching per tenant.") f.IntVar(&cfg.MetaSyncConcurrency, "blocks-storage.bucket-store.meta-sync-concurrency", 20, "Number of Go routines to use when syncing block meta files from object storage per tenant.") f.DurationVar(&cfg.IgnoreDeletionMarksDelay, "blocks-storage.bucket-store.ignore-deletion-marks-delay", time.Hour*1, "Duration after which the blocks marked for deletion will be filtered out while fetching blocks. "+ "The idea of ignore-deletion-marks-delay is to ignore blocks that are marked for deletion with some delay. 
This ensures store can still serve blocks that are meant to be deleted but do not have a replacement yet.") From c871cef863e8eacb4d60e15d36793563ba8b9685 Mon Sep 17 00:00:00 2001 From: Andy Asp <90626759+andyasp@users.noreply.github.com> Date: Tue, 16 Jan 2024 08:51:45 -0500 Subject: [PATCH 2/2] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f8a56abd30f..2d4d55d1c4e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ * `prometheus_sd_refresh_failures_total` renamed to `cortex_prometheus_sd_refresh_failures_total` * `prometheus_sd_refresh_duration_seconds` renamed to `cortex_prometheus_sd_refresh_duration_seconds` * [CHANGE] Query-frontend: the default value for `-query-frontend.not-running-timeout` has been changed from 0 (disabled) to 2s. The configuration option has also been moved from "experimental" to "advanced". #7126 -* [CHANGE] Store-gateway: to reduce disk contention the default value for `blocks-storage.bucket-store.tenant-sync-concurrency` has been changed from `10` to `1` and the default value for `blocks-storage.bucket-store.block-sync-concurrency` has been changed from `20` to `4`. #7136 +* [CHANGE] Store-gateway: to reduce disk contention on HDDs, the default value for `-blocks-storage.bucket-store.tenant-sync-concurrency` has been changed from `10` to `1` and the default value for `-blocks-storage.bucket-store.block-sync-concurrency` has been changed from `20` to `4`. #7136 * [FEATURE] Introduce `-tenant-federation.max-tenants` option to limit the max number of tenants allowed for requests when federation is enabled. #6959 * [FEATURE] Cardinality API: added a new `count_method` parameter which enables counting active label values. #7085 * [FEATURE] Querier / query-frontend: added `-querier.promql-experimental-functions-enabled` CLI flag (and respective YAML config option) to enable experimental PromQL functions.
The experimental functions introduced are: `mad_over_time()`, `sort_by_label()` and `sort_by_label_desc()`. #7057