diff --git a/.bingo/Variables.mk b/.bingo/Variables.mk index 737d0eae12..47a509a3d9 100644 --- a/.bingo/Variables.mk +++ b/.bingo/Variables.mk @@ -1,4 +1,4 @@ -# Auto generated binary variables helper managed by https://github.com/bwplotka/bingo v0.4.0. DO NOT EDIT. +# Auto generated binary variables helper managed by https://github.com/bwplotka/bingo v0.4.3. DO NOT EDIT. # All tools are designed to be build inside $GOBIN. BINGO_DIR := $(dir $(lastword $(MAKEFILE_LIST))) GOPATH ?= $(shell go env GOPATH) @@ -23,17 +23,11 @@ $(ALERTMANAGER): $(BINGO_DIR)/alertmanager.mod @echo "(re)installing $(GOBIN)/alertmanager-v0.20.0" @cd $(BINGO_DIR) && $(GO) build -mod=mod -modfile=alertmanager.mod -o=$(GOBIN)/alertmanager-v0.20.0 "github.com/prometheus/alertmanager/cmd/alertmanager" -BINGO := $(GOBIN)/bingo-v0.3.0 +BINGO := $(GOBIN)/bingo-v0.4.3 $(BINGO): $(BINGO_DIR)/bingo.mod @# Install binary/ries using Go 1.14+ build command. This is using bwplotka/bingo-controlled, separate go module with pinned dependencies. - @echo "(re)installing $(GOBIN)/bingo-v0.3.0" - @cd $(BINGO_DIR) && $(GO) build -mod=mod -modfile=bingo.mod -o=$(GOBIN)/bingo-v0.3.0 "github.com/bwplotka/bingo" - -EMBEDMD := $(GOBIN)/embedmd-v0.0.0-20181127031020-97c13d6e4160 -$(EMBEDMD): $(BINGO_DIR)/embedmd.mod - @# Install binary/ries using Go 1.14+ build command. This is using bwplotka/bingo-controlled, separate go module with pinned dependencies. 
- @echo "(re)installing $(GOBIN)/embedmd-v0.0.0-20181127031020-97c13d6e4160" - @cd $(BINGO_DIR) && $(GO) build -mod=mod -modfile=embedmd.mod -o=$(GOBIN)/embedmd-v0.0.0-20181127031020-97c13d6e4160 "github.com/campoy/embedmd" + @echo "(re)installing $(GOBIN)/bingo-v0.4.3" + @cd $(BINGO_DIR) && $(GO) build -mod=mod -modfile=bingo.mod -o=$(GOBIN)/bingo-v0.4.3 "github.com/bwplotka/bingo" FAILLINT := $(GOBIN)/faillint-v1.5.0 $(FAILLINT): $(BINGO_DIR)/faillint.mod @@ -95,6 +89,12 @@ $(JSONNETFMT): $(BINGO_DIR)/jsonnetfmt.mod @echo "(re)installing $(GOBIN)/jsonnetfmt-v0.17.0" @cd $(BINGO_DIR) && $(GO) build -mod=mod -modfile=jsonnetfmt.mod -o=$(GOBIN)/jsonnetfmt-v0.17.0 "github.com/google/go-jsonnet/cmd/jsonnetfmt" +MDOX := $(GOBIN)/mdox-v0.2.2-0.20210617084122-22b44c491197 +$(MDOX): $(BINGO_DIR)/mdox.mod + @# Install binary/ries using Go 1.14+ build command. This is using bwplotka/bingo-controlled, separate go module with pinned dependencies. + @echo "(re)installing $(GOBIN)/mdox-v0.2.2-0.20210617084122-22b44c491197" + @cd $(BINGO_DIR) && $(GO) build -mod=mod -modfile=mdox.mod -o=$(GOBIN)/mdox-v0.2.2-0.20210617084122-22b44c491197 "github.com/bwplotka/mdox" + MINIO := $(GOBIN)/minio-v0.0.0-20200527010300-cccf2de129da $(MINIO): $(BINGO_DIR)/minio.mod @# Install binary/ries using Go 1.14+ build command. This is using bwplotka/bingo-controlled, separate go module with pinned dependencies. diff --git a/.bingo/bingo.mod b/.bingo/bingo.mod index 7007d9020f..0ef4a501e2 100644 --- a/.bingo/bingo.mod +++ b/.bingo/bingo.mod @@ -2,4 +2,4 @@ module _ // Auto generated by https://github.com/bwplotka/bingo. DO NOT EDIT go 1.14 -require github.com/bwplotka/bingo v0.3.0 +require github.com/bwplotka/bingo v0.4.3 diff --git a/.bingo/embedmd.mod b/.bingo/embedmd.mod deleted file mode 100644 index 98cae742d8..0000000000 --- a/.bingo/embedmd.mod +++ /dev/null @@ -1,5 +0,0 @@ -module _ // Auto generated by https://github.com/bwplotka/bingo. 
DO NOT EDIT - -go 1.14 - -require github.com/campoy/embedmd v0.0.0-20181127031020-97c13d6e4160 diff --git a/.bingo/mdox.mod b/.bingo/mdox.mod new file mode 100644 index 0000000000..540dbc6126 --- /dev/null +++ b/.bingo/mdox.mod @@ -0,0 +1,7 @@ +module _ // Auto generated by https://github.com/bwplotka/bingo. DO NOT EDIT + +go 1.16 + +replace github.com/Kunde21/markdownfmt/v2 => github.com/bwplotka/markdownfmt/v2 v2.0.0-20210616121647-559e77044d46 + +require github.com/bwplotka/mdox v0.2.2-0.20210617084122-22b44c491197 diff --git a/.bingo/variables.env b/.bingo/variables.env index 303a2725f2..ed3af1f84d 100644 --- a/.bingo/variables.env +++ b/.bingo/variables.env @@ -1,4 +1,4 @@ -# Auto generated binary variables helper managed by https://github.com/bwplotka/bingo v0.4.0. DO NOT EDIT. +# Auto generated binary variables helper managed by https://github.com/bwplotka/bingo v0.4.3. DO NOT EDIT. # All tools are designed to be build inside $GOBIN. # Those variables will work only until 'bingo get' was invoked, or if tools were installed via Makefile's Variables.mk. GOBIN=${GOBIN:=$(go env GOBIN)} @@ -10,9 +10,7 @@ fi ALERTMANAGER="${GOBIN}/alertmanager-v0.20.0" -BINGO="${GOBIN}/bingo-v0.3.0" - -EMBEDMD="${GOBIN}/embedmd-v0.0.0-20181127031020-97c13d6e4160" +BINGO="${GOBIN}/bingo-v0.4.3" FAILLINT="${GOBIN}/faillint-v1.5.0" @@ -34,6 +32,8 @@ JSONNET="${GOBIN}/jsonnet-v0.17.0" JSONNETFMT="${GOBIN}/jsonnetfmt-v0.17.0" +MDOX="${GOBIN}/mdox-v0.2.2-0.20210617084122-22b44c491197" + MINIO="${GOBIN}/minio-v0.0.0-20200527010300-cccf2de129da" PROMDOC="${GOBIN}/promdoc-v0.7.0" diff --git a/CHANGELOG.md b/CHANGELOG.md index ceec708910..dae1300c42 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,14 +1,12 @@ - # Changelog All notable changes to this project will be documented in this file. -The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) -and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 
+The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). NOTE: As semantic versioning states all 0.y.z releases can contain breaking changes in API (flags, grpc API, any backward compatibility) -We use _breaking :warning:_ to mark changes that are not backward compatible (relates only to v0.y.z releases.) +We use *breaking :warning:* to mark changes that are not backward compatible (relates only to v0.y.z releases.) ## Unreleased @@ -21,11 +19,11 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re ### Fixed -- +- ### Changed -- +- ## [v0.21.1](https://github.com/thanos-io/thanos/releases/tag/v0.21.1) - 2021.06.04 @@ -37,7 +35,7 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re ### Added -- [#4117](https://github.com/thanos-io/thanos/pull/4117) Mixin: new alert ThanosReceiveTrafficBelowThreshold to flag if the ingestion average of the last hour dips below 50% of the ingestion average for the last 12 hours. +- [#4117](https://github.com/thanos-io/thanos/pull/4117) Mixin: new alert ThanosReceiveTrafficBelowThreshold to flag if the ingestion average of the last hour dips below 50% of the ingestion average for the last 12 hours. - [#4107](https://github.com/thanos-io/thanos/pull/4107) Store: `LabelNames` and `LabelValues` now support label matchers. 
- [#3940](https://github.com/thanos-io/thanos/pull/3940) Sidecar: Added matchers support to `LabelValues` - [#4171](https://github.com/thanos-io/thanos/pull/4171) Docker: Busybox image updated to latest (1.33.1) @@ -51,6 +49,7 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re ### Fixed - [#4105](https://github.com/thanos-io/thanos/pull/4105) Tools: Add glob support for filepath in tools command + ### Changed - [#4223](https://github.com/thanos-io/thanos/pull/4223) Query: federated exemplars API only add replica labels to series labels, not to exemplar labels. @@ -68,9 +67,11 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re - [#4123](https://github.com/thanos-io/thanos/pull/4123) Query: match external labels for exemplars API. ### Changed -- +- + ### Removed -- +- + ## [v0.20.0](https://github.com/thanos-io/thanos/releases/tag/v0.20.0) - 2021.04.28 ### Added @@ -100,7 +101,7 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re - [#3929](https://github.com/thanos-io/thanos/pull/3929) Store: Adds the name of the instantiated memcached client to log info. - [#3827](https://github.com/thanos-io/thanos/pull/3827) Upgrade Go version to 1.16 - [#3948](https://github.com/thanos-io/thanos/pull/3948) Receiver: Adjust `http_request_duration_seconds` buckets for low latency requests. -- [#3856](https://github.com/thanos-io/thanos/pull/3856) Mixin: _breaking :warning:_ Introduce flexible multi-cluster/namespace mode for alerts and dashboards. Removes jobPrefix config option. Removes `namespace` by default. +- [#3856](https://github.com/thanos-io/thanos/pull/3856) Mixin: *breaking :warning:* Introduce flexible multi-cluster/namespace mode for alerts and dashboards. Removes jobPrefix config option. Removes `namespace` by default. - [#3937](https://github.com/thanos-io/thanos/pull/3937) Store: Reduce memory usage for range queries. 
- [#4045](https://github.com/thanos-io/thanos/pull/4045) UI: Enable Targets page in Querier UI. - [#4062](https://github.com/thanos-io/thanos/pull/4062) Flags: Sort flags alphabetically. @@ -165,9 +166,7 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re ### Changed - [#3496](https://github.com/thanos-io/thanos/pull/3496) S3: Respect SignatureV2 flag for all credential providers. -- [#2732](https://github.com/thanos-io/thanos/pull/2732) Swift: Switched to a new library [ncw/swift](https://github.com/ncw/swift) providing large objects support. - By default, segments will be uploaded to the same container directory `segments/` if the file is bigger than `1GB`. - To change the defaults see [the docs](./docs/storage.md#openstack-swift). +- [#2732](https://github.com/thanos-io/thanos/pull/2732) Swift: Switched to a new library [ncw/swift](https://github.com/ncw/swift) providing large objects support. By default, segments will be uploaded to the same container directory `segments/` if the file is bigger than `1GB`. To change the defaults see [the docs](docs/storage.md#openstack-swift). - [#3626](https://github.com/thanos-io/thanos/pull/3626) Shipper: Failed upload of `meta.json` file doesn't cause block cleanup anymore. This has a potential to generate corrupted blocks under specific conditions. Partial block is left in bucket for later cleanup. ## [v0.17.2](https://github.com/thanos-io/thanos/releases/tag/v0.17.2) - 2020.12.07 @@ -210,7 +209,7 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re - [#3431](https://github.com/thanos-io/thanos/pull/3431) Store: Added experimental support to lazy load index-headers at query time. When enabled via `--store.enable-index-header-lazy-reader` flag, the store-gateway will load into memory an index-header only once it's required at query time. Index-header will be automatically released after `--store.index-header-lazy-reader-idle-timeout` of inactivity. 
- This, generally, reduces baseline memory usage of store when inactive, as well as a total number of mapped files (which is limited to 64k in some systems. - [#3437](https://github.com/thanos-io/thanos/pull/3437) StoreAPI: Added `hints` field to `LabelNamesResponse` and `LabelValuesResponse`. Hints in an opaque data structure that can be used to carry additional information from the store and its content is implementation specific. - * This, generally, reduces baseline memory usage of store when inactive, as well as a total number of mapped files (which is limited to 64k in some systems. + * This, generally, reduces baseline memory usage of store when inactive, as well as a total number of mapped files (which is limited to 64k in some systems. - [#3415](https://github.com/thanos-io/thanos/pull/3415) Tools: Added `thanos tools bucket mark` command that allows to mark given block for deletion or for no-compact ### Fixed @@ -224,21 +223,18 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re ### Changed - [#3452](https://github.com/thanos-io/thanos/pull/3452) Store: Index cache posting compression is now enabled by default. Removed `experimental.enable-index-cache-postings-compression` flag. -- [#3410](https://github.com/thanos-io/thanos/pull/3410) Compactor: Changed metric `thanos_compactor_blocks_marked_for_deletion_total` to `thanos_compactor_blocks_marked_total` with `marker` label. - Compactor will now automatically disable compaction for blocks with large index that would output blocks after compaction larger than specified value (by default: 64GB). This automatically - handles the Promethus [format limit](https://github.com/thanos-io/thanos/issues/1424). +- [#3410](https://github.com/thanos-io/thanos/pull/3410) Compactor: Changed metric `thanos_compactor_blocks_marked_for_deletion_total` to `thanos_compactor_blocks_marked_total` with `marker` label. 
Compactor will now automatically disable compaction for blocks with large index that would output blocks after compaction larger than specified value (by default: 64GB). This automatically handles the Prometheus [format limit](https://github.com/thanos-io/thanos/issues/1424). - [#2906](https://github.com/thanos-io/thanos/pull/2906) Tools: Refactor Bucket replicate execution. Removed all `thanos_replicate_origin_.*` metrics. - `thanos_replicate_origin_meta_loads_total` can be replaced by `blocks_meta_synced{state="loaded"}`. - `thanos_replicate_origin_partial_meta_reads_total` can be replaced by `blocks_meta_synced{state="failed"}`. -- [#3309](https://github.com/thanos-io/thanos/pull/3309) Compact: _breaking :warning:_ Rename metrics to match naming convention. This includes metrics starting with `thanos_compactor` to `thanos_compact`, `thanos_querier` to `thanos_query` and `thanos_ruler` to `thanos_rule`. +- [#3309](https://github.com/thanos-io/thanos/pull/3309) Compact: *breaking :warning:* Rename metrics to match naming convention. This includes metrics starting with `thanos_compactor` to `thanos_compact`, `thanos_querier` to `thanos_query` and `thanos_ruler` to `thanos_rule`. ## [v0.16.0](https://github.com/thanos-io/thanos/releases/tag/v0.16.0) - 2020.10.26 Highlights: - New Thanos component, [Query Frontend](https://thanos.io/tip/components/query-frontend.md/) has more options and supports shared cache (currently: Memcached). -- Added debug mode in Thanos UI that allows to filter Stores to query from by their IPs from Store page (!). This helps enormously in e.g debugging the slowest store etc. All raw Thanos API allows passing - `storeMatch[]` arguments with `__address__` matchers. +- Added debug mode in Thanos UI that allows to filter Stores to query from by their IPs from Store page (!). This helps enormously in e.g. debugging the slowest store etc. All raw Thanos API allows passing `storeMatch[]` arguments with `__address__` matchers. 
- Improved debuggability on all Thanos components by exposing [off-CPU profiles thanks to fgprof endpoint](https://github.com/felixge/fgprof). - Significantly improved sidecar latency and CPU usage for metrics fetches. @@ -250,7 +246,7 @@ Highlights: ### Added - [#3114](https://github.com/thanos-io/thanos/pull/3114) Query Frontend: Added support for Memcached cache. - - **breaking** Renamed flag `log_queries_longer_than` to `log-queries-longer-than`. + - **breaking** Renamed flag `log_queries_longer_than` to `log-queries-longer-than`. - [#3166](https://github.com/thanos-io/thanos/pull/3166) UIs: Added UI for passing a `storeMatch[]` parameter to queries. - [#3181](https://github.com/thanos-io/thanos/pull/3181) Logging: Added debug level logging for responses between 300-399 - [#3133](https://github.com/thanos-io/thanos/pull/3133) Query: Allowed passing a `storeMatch[]` to Labels APIs; Time range metadata based store filtering is supported on Labels APIs. @@ -276,8 +272,7 @@ Highlights: Highlights: -- Added new Thanos component: [Query Frontend](https://thanos.io/v0.15/components/query-frontend/) responsible for response caching, - query scheduling and parallelization (based on Cortex Query Frontend). +- Added new Thanos component: [Query Frontend](https://thanos.io/v0.15/components/query-frontend/) responsible for response caching, query scheduling and parallelization (based on Cortex Query Frontend). - Added various new, improved UIs to Thanos based on React: Querier BuildInfo & Flags, Ruler UI, BlockViewer. - Optimized Sidecar, Store, Receive, Ruler data retrieval with new TSDB ChunkIterator (capping chunks to 120 samples), which fixed various leaks. - Fixed sample limit on Store Gateway. @@ -296,8 +291,7 @@ Highlights: - [#2936](https://github.com/thanos-io/thanos/pull/2936) Compact: Fix ReplicaLabelRemover panic when replicaLabels are not specified. - [#2956](https://github.com/thanos-io/thanos/pull/2956) Store: Fix fetching of chunks bigger than 16000 bytes. 
- [#2970](https://github.com/thanos-io/thanos/pull/2970) Store: Upgrade minio-go/v7 to fix slowness when running on EKS. -- [#2957](https://github.com/thanos-io/thanos/pull/2957) Rule: _breaking :warning:_ Now sets all of the relevant fields properly; avoids a panic when `/api/v1/rules` is called and the time zone is _not_ UTC; `rules` field is an empty array now if no rules have been defined in a rule group. - Thanos Rule's `/api/v1/rules` endpoint no longer returns the old, deprecated `partial_response_strategy`. The old, deprecated value has been fixed to `WARN` for quite some time. _Please_ use `partialResponseStrategy`. +- [#2957](https://github.com/thanos-io/thanos/pull/2957) Rule: *breaking :warning:* Now sets all of the relevant fields properly; avoids a panic when `/api/v1/rules` is called and the time zone is *not* UTC; `rules` field is an empty array now if no rules have been defined in a rule group. Thanos Rule's `/api/v1/rules` endpoint no longer returns the old, deprecated `partial_response_strategy`. The old, deprecated value has been fixed to `WARN` for quite some time. *Please* use `partialResponseStrategy`. - [#2976](https://github.com/thanos-io/thanos/pull/2976) Query: Better rounding for incoming query timestamps. - [#2929](https://github.com/thanos-io/thanos/pull/2929) Mixin: Fix expression for 'unhealthy sidecar' alert and increase the timeout for 10 minutes. - [#3024](https://github.com/thanos-io/thanos/pull/3024) Query: Consider group name and file for deduplication. @@ -327,9 +321,9 @@ Highlights: - [#2893](https://github.com/thanos-io/thanos/pull/2893) Store: Rename metric `thanos_bucket_store_cached_postings_compression_time_seconds` to `thanos_bucket_store_cached_postings_compression_time_seconds_total`. - [#2915](https://github.com/thanos-io/thanos/pull/2915) Receive,Ruler: Enable TSDB directory locking by default. Add a new flag (`--tsdb.no-lockfile`) to override behavior. 
- [#2902](https://github.com/thanos-io/thanos/pull/2902) Querier UI:Separate dedupe and partial response checkboxes per panel in new UI. -- [#2991](https://github.com/thanos-io/thanos/pull/2991) Store: _breaking :warning:_ `operation` label value `getrange` changed to `get_range` for `thanos_store_bucket_cache_operation_requests_total` and `thanos_store_bucket_cache_operation_hits_total` to be consistent with bucket operation metrics. +- [#2991](https://github.com/thanos-io/thanos/pull/2991) Store: *breaking :warning:* `operation` label value `getrange` changed to `get_range` for `thanos_store_bucket_cache_operation_requests_total` and `thanos_store_bucket_cache_operation_hits_total` to be consistent with bucket operation metrics. - [#2876](https://github.com/thanos-io/thanos/pull/2876) Receive,Ruler: Updated TSDB and switched to ChunkIterators instead of sample one, which avoids unnecessary decoding / encoding. -- [#3064](https://github.com/thanos-io/thanos/pull/3064) s3: _breaking :warning:_ Add SSE/SSE-KMS/SSE-C configuration. The S3 `encrypt_sse: true` option is now deprecated in favour of `sse_config`. If you used `encrypt_sse`, the migration strategy is to set up the following block: +- [#3064](https://github.com/thanos-io/thanos/pull/3064) s3: *breaking :warning:* Add SSE/SSE-KMS/SSE-C configuration. The S3 `encrypt_sse: true` option is now deprecated in favour of `sse_config`. If you used `encrypt_sse`, the migration strategy is to set up the following block: ```yaml sse_config: @@ -354,7 +348,7 @@ sse_config: - [#2667](https://github.com/thanos-io/thanos/pull/2667) Store: Removed support to the legacy `index.cache.json`. The hidden flag `--store.disable-index-header` was removed. - [#2613](https://github.com/thanos-io/thanos/pull/2613) Store: Renamed the caching bucket config option `chunk_object_size_ttl` to `chunk_object_attrs_ttl`. 
- [#2667](https://github.com/thanos-io/thanos/pull/2667) Compact: The deprecated flag `--index.generate-missing-cache-file` and the metric `thanos_compact_generated_index_total` were removed. -- [#2671](https://github.com/thanos-io/thanos/pull/2671) _breaking_ Tools: Bucket replicate flag `--resolution` is now in Go duration format. +- [#2671](https://github.com/thanos-io/thanos/pull/2671) *breaking* Tools: Bucket replicate flag `--resolution` is now in Go duration format. - [#2671](https://github.com/thanos-io/thanos/pull/2671) Tools: Bucket replicate now replicates by default all blocks. - [#2739](https://github.com/thanos-io/thanos/pull/2739) Changed `bucket tool bucket verify` `--id-whitelist` flag to `--id`. - [#2748](https://github.com/thanos-io/thanos/pull/2748) Upgrade Prometheus to [@66dfb951c4ca](https://github.com/prometheus/prometheus/commit/66dfb951c4ca2c1dd3f266172a48a925403b13a5) which is after v2.19.0. @@ -399,9 +393,7 @@ sse_config: - [#2194](https://github.com/thanos-io/thanos/pull/2194) Updated to golang v1.14.2. - [#2505](https://github.com/thanos.io/thanos/pull/2505) Store: Removed obsolete `thanos_store_node_info` metric. -- [#2513](https://github.com/thanos-io/thanos/pull/2513) Tools: Moved `thanos bucket` commands to `thanos tools bucket`, also - moved `thanos check rules` to `thanos tools rules-check`. `thanos tools rules-check` also takes rules by `--rules` repeated flag not argument - anymore. +- [#2513](https://github.com/thanos-io/thanos/pull/2513) Tools: Moved `thanos bucket` commands to `thanos tools bucket`, also moved `thanos check rules` to `thanos tools rules-check`. `thanos tools rules-check` also takes rules by `--rules` repeated flag not argument anymore. - [#2548](https://github.com/thanos-io/thanos/pull/2548/commits/53e69bd89b2b08c18df298eed7d90cb7179cc0ec) Store, Querier: remove duplicated chunks on StoreAPI. 
- [#2596](https://github.com/thanos-io/thanos/pull/2596) Updated Prometheus dependency to [@cd73b3d33e064bbd846fc7a26dc8c313d46af382](https://github.com/prometheus/prometheus/commit/cd73b3d33e064bbd846fc7a26dc8c313d46af382) which falls in between v2.17.0 and v2.18.0. - Receive,Rule: TSDB now supports isolation of append and queries. @@ -460,9 +452,8 @@ sse_config: ### Changed -- [#2136](https://github.com/thanos-io/thanos/pull/2136) _breaking_ Store, Compact, Bucket: schedule block deletion by adding deletion-mark.json. This adds a consistent way for multiple readers and writers to access object storage. - Since there are no consistency guarantees provided by some Object Storage providers, this PR adds a consistent lock-free way of dealing with Object Storage irrespective of the choice of object storage. In order to achieve this co-ordination, blocks are not deleted directly. Instead, blocks are marked for deletion by uploading the `deletion-mark.json` file for the block that was chosen to be deleted. This file contains Unix time of when the block was marked for deletion. If you want to keep existing behavior, you should add `--delete-delay=0s` as a flag. -- [#2090](https://github.com/thanos-io/thanos/issues/2090) _breaking_ Downsample command: the `downsample` command has moved and is now a sub-command of the `thanos bucket` sub-command; it cannot be called via `thanos downsample` any more. +- [#2136](https://github.com/thanos-io/thanos/pull/2136) *breaking* Store, Compact, Bucket: schedule block deletion by adding deletion-mark.json. This adds a consistent way for multiple readers and writers to access object storage. Since there are no consistency guarantees provided by some Object Storage providers, this PR adds a consistent lock-free way of dealing with Object Storage irrespective of the choice of object storage. In order to achieve this co-ordination, blocks are not deleted directly. 
Instead, blocks are marked for deletion by uploading the `deletion-mark.json` file for the block that was chosen to be deleted. This file contains Unix time of when the block was marked for deletion. If you want to keep existing behavior, you should add `--delete-delay=0s` as a flag. +- [#2090](https://github.com/thanos-io/thanos/issues/2090) *breaking* Downsample command: the `downsample` command has moved and is now a sub-command of the `thanos bucket` sub-command; it cannot be called via `thanos downsample` any more. - [#2294](https://github.com/thanos-io/thanos/pull/2294) Store: optimizations for fetching postings. Queries using `=~".*"` matchers or negation matchers (`!=...` or `!~...`) benefit the most. - [#2301](https://github.com/thanos-io/thanos/pull/2301) Ruler: exit with an error when initialization fails. - [#2310](https://github.com/thanos-io/thanos/pull/2310) Query: report timespan 0 to 0 when discovering no stores. @@ -493,7 +484,7 @@ sse_config: - [#1870](https://github.com/thanos-io/thanos/pull/1870) UI: Persist settings in query. - [#1969](https://github.com/thanos-io/thanos/pull/1969) Sidecar: allow setting http connection pool size via flags. - [#1967](https://github.com/thanos-io/thanos/issues/1967) Receive: Allow local TSDB compaction. -- [#1939](https://github.com/thanos-io/thanos/pull/1939) Ruler: Add TLS and authentication support for query endpoints with the `--query.config` and `--query.config-file` CLI flags. See [documentation](docs/components/rule.md/#configuration) for further information. +- [#1939](https://github.com/thanos-io/thanos/pull/1939) Ruler: Add TLS and authentication support for query endpoints with the `--query.config` and `--query.config-file` CLI flags. See [documentation](docs/components/rule.md#configuration) for further information. - [#1982](https://github.com/thanos-io/thanos/pull/1982) Ruler: Add support for Alertmanager v2 API endpoints. 
- [#2030](https://github.com/thanos-io/thanos/pull/2030) Query: Add `thanos_proxy_store_empty_stream_responses_total` metric for number of empty responses from stores. - [#2049](https://github.com/thanos-io/thanos/pull/2049) Tracing: Support sampling on Elastic APM with new sample_rate setting. @@ -502,7 +493,7 @@ sse_config: ### Changed -- [#1970](https://github.com/thanos-io/thanos/issues/1970) _breaking_ Receive: Use gRPC for forwarding requests between peers. Note that existing values for the `--receive.local-endpoint` flag and the endpoints in the hashring configuration file must now specify the receive gRPC port and must be updated to be a simple `host:port` combination, e.g. `127.0.0.1:10901`, rather than a full HTTP URL, e.g. `http://127.0.0.1:10902/api/v1/receive`. +- [#1970](https://github.com/thanos-io/thanos/issues/1970) *breaking* Receive: Use gRPC for forwarding requests between peers. Note that existing values for the `--receive.local-endpoint` flag and the endpoints in the hashring configuration file must now specify the receive gRPC port and must be updated to be a simple `host:port` combination, e.g. `127.0.0.1:10901`, rather than a full HTTP URL, e.g. `http://127.0.0.1:10902/api/v1/receive`. - [#1933](https://github.com/thanos-io/thanos/pull/1933) Add a flag `--tsdb.wal-compression` to configure whether to enable tsdb wal compression in ruler and receiver. - [#2021](https://github.com/thanos-io/thanos/pull/2021) Rename metric `thanos_query_duplicated_store_address` to `thanos_query_duplicated_store_addresses_total` and `thanos_rule_duplicated_query_address` to `thanos_rule_duplicated_query_addresses_total`. - [#2166](https://github.com/thanos-io/thanos/pull/2166) Bucket Web: improve the tooltip for the bucket UI; it was reconstructed and now exposes much more information about blocks. 
@@ -511,18 +502,15 @@ sse_config: ### Fixed -- [#2015](https://github.com/thanos-io/thanos/pull/2015) Sidecar: Querier /api/v1/series bug fixed when time range was ignored inside sidecar. - The bug was noticeable for example when using Grafana template variables. +- [#2015](https://github.com/thanos-io/thanos/pull/2015) Sidecar: Querier /api/v1/series bug fixed when time range was ignored inside sidecar. The bug was noticeable for example when using Grafana template variables. - [#2120](https://github.com/thanos-io/thanos/pull/2120) Bucket Web: Set state of status prober properly. ## [v0.10.0](https://github.com/thanos-io/thanos/releases/tag/v0.10.0) - 2020.01.13 ### Fixed -- [#1919](https://github.com/thanos-io/thanos/issues/1919) Compactor: Fixed potential data loss when uploading older blocks, or upload taking long time while compactor is - running. -- [#1937](https://github.com/thanos-io/thanos/pull/1937) Compactor: Improved synchronization of meta JSON files. - Compactor now properly handles partial block uploads for all operation like retention apply, downsampling and compaction. Additionally: +- [#1919](https://github.com/thanos-io/thanos/issues/1919) Compactor: Fixed potential data loss when uploading older blocks, or upload taking long time while compactor is running. +- [#1937](https://github.com/thanos-io/thanos/pull/1937) Compactor: Improved synchronization of meta JSON files. Compactor now properly handles partial block uploads for all operation like retention apply, downsampling and compaction. Additionally: - Removed `thanos_compact_sync_meta_*` metrics. Use `thanos_blocks_meta_*` metrics instead. - Added `thanos_consistency_delay_seconds` and `thanos_compactor_aborted_partial_uploads_deletion_attempts_total` metrics. 
@@ -541,9 +529,9 @@ sse_config: - [#1852](https://github.com/thanos-io/thanos/pull/1852) Add support for `AWS_CONTAINER_CREDENTIALS_FULL_URI` by upgrading to minio-go v6.0.44 - [#1854](https://github.com/thanos-io/thanos/pull/1854) Update Rule UI to support alerts count displaying and filtering. -- [#1838](https://github.com/thanos-io/thanos/pull/1838) Ruler: Add TLS and authentication support for Alertmanager with the `--alertmanagers.config` and `--alertmanagers.config-file` CLI flags. See [documentation](docs/components/rule.md/#configuration) for further information. +- [#1838](https://github.com/thanos-io/thanos/pull/1838) Ruler: Add TLS and authentication support for Alertmanager with the `--alertmanagers.config` and `--alertmanagers.config-file` CLI flags. See [documentation](docs/components/rule.md#configuration) for further information. - [#1838](https://github.com/thanos-io/thanos/pull/1838) Ruler: Add a new `--alertmanagers.sd-dns-interval` CLI option to specify the interval between DNS resolutions of Alertmanager hosts. -- [#1881](https://github.com/thanos-io/thanos/pull/1881) Store Gateway: memcached support for index cache. See [documentation](docs/components/store.md/#index-cache) for further information. +- [#1881](https://github.com/thanos-io/thanos/pull/1881) Store Gateway: memcached support for index cache. See [documentation](docs/components/store.md#index-cache) for further information. - [#1904](https://github.com/thanos-io/thanos/pull/1904) Add a skip-chunks option in Store Series API to improve the response time of `/api/v1/series` endpoint. - [#1910](https://github.com/thanos-io/thanos/pull/1910) Query: `/api/v1/labels` now understands `POST` - useful for sending bigger requests @@ -591,8 +579,7 @@ sse_config: ### Changed -- [#1666](https://github.com/thanos-io/thanos/pull/1666) Compact: `thanos_compact_group_compactions_total` now counts block compactions, so operations that resulted in a compacted block. 
The old behaviour - is now exposed by new metric: `thanos_compact_group_compaction_runs_started_total` and `thanos_compact_group_compaction_runs_completed_total` which counts compaction runs overall. +- [#1666](https://github.com/thanos-io/thanos/pull/1666) Compact: `thanos_compact_group_compactions_total` now counts block compactions, so operations that resulted in a compacted block. The old behaviour is now exposed by new metric: `thanos_compact_group_compaction_runs_started_total` and `thanos_compact_group_compaction_runs_completed_total` which counts compaction runs overall. - [#1748](https://github.com/thanos-io/thanos/pull/1748) Updated all dependencies. - [#1694](https://github.com/thanos-io/thanos/pull/1694) `prober_ready` and `prober_healthy` metrics are removed, for sake of `status`. Now `status` exposes same metric with a label, `check`. `check` can have "healty" or "ready" depending on status of the probe. - [#1790](https://github.com/thanos-io/thanos/pull/1790) Ruler: Fixes subqueries support for ruler. @@ -604,8 +591,7 @@ sse_config: - [#1632](https://github.com/thanos-io/thanos/issues/1632) Removes the duplicated external labels detection on Thanos Querier; warning only; Made Store Gateway compatible with older Querier versions. - NOTE: `thanos_store_nodes_grpc_connections` metric is now per `external_labels` and `store_type`. It is a recommended metric for Querier storeAPIs. `thanos_store_node_info` is marked as obsolete and will be removed in next release. - - NOTE2: Store Gateway is now advertising artificial: `"@thanos_compatibility_store_type=store"` label. This is to have the current Store Gateway compatible with Querier pre v0.8.0. - This label can be disabled by hidden `debug.advertise-compatibility-label=false` flag on Store Gateway. + - NOTE2: Store Gateway is now advertising artificial: `"@thanos_compatibility_store_type=store"` label. This is to have the current Store Gateway compatible with Querier pre v0.8.0. 
This label can be disabled by hidden `debug.advertise-compatibility-label=false` flag on Store Gateway. ## [v0.8.0](https://github.com/thanos-io/thanos/releases/tag/v0.8.0) - 2019.10.10 @@ -619,15 +605,13 @@ Lot's of improvements this release! Noteworthy items: - Sidecar exposed data from Prometheus can be now limited to given `min-time` (e.g 3h only). - Numerous Thanos Receive improvements. -Make sure you check out Prometheus 2.13.0 as well. New release drastically improves usage and resource consumption of -both Prometheus and sidecar with Thanos: https://prometheus.io/blog/2019/10/10/remote-read-meets-streaming/ +Make sure you check out Prometheus 2.13.0 as well. New release drastically improves usage and resource consumption of both Prometheus and sidecar with Thanos: https://prometheus.io/blog/2019/10/10/remote-read-meets-streaming/ ### Added - [#1619](https://github.com/thanos-io/thanos/pull/1619) Thanos sidecar allows to limit min time range for data it exposes from Prometheus. - [#1583](https://github.com/thanos-io/thanos/pull/1583) Thanos sharding: - - Add relabel config (`--selector.relabel-config-file` and `selector.relabel-config`) into Thanos Store and Compact components. - Selecting blocks to serve depends on the result of block labels relabeling. + - Add relabel config (`--selector.relabel-config-file` and `selector.relabel-config`) into Thanos Store and Compact components. Selecting blocks to serve depends on the result of block labels relabeling. - For store gateway, advertise labels from "approved" blocks. - [#1540](https://github.com/thanos-io/thanos/pull/1540) Thanos Downsample added `/-/ready` and `/-/healthy` endpoints. - [#1538](https://github.com/thanos-io/thanos/pull/1538) Thanos Rule added `/-/ready` and `/-/healthy` endpoints. @@ -636,8 +620,7 @@ both Prometheus and sidecar with Thanos: https://prometheus.io/blog/2019/10/10/r - [#1534](https://github.com/thanos-io/thanos/pull/1534) Thanos Query Added `/-/ready` and `/-/healthy` endpoints. 
- [#1533](https://github.com/thanos-io/thanos/pull/1533) Thanos inspect now supports the timeout flag. - [#1496](https://github.com/thanos-io/thanos/pull/1496) Thanos Receive now supports setting block duration. -- [#1362](https://github.com/thanos-io/thanos/pull/1362) Optional `replicaLabels` param for `/query` and - `/query_range` querier endpoints. When provided overwrite the `query.replica-label` cli flags. +- [#1362](https://github.com/thanos-io/thanos/pull/1362) Optional `replicaLabels` param for `/query` and `/query_range` querier endpoints. When provided overwrite the `query.replica-label` cli flags. - [#1482](https://github.com/thanos-io/thanos/pull/1482) Thanos now supports Elastic APM as tracing provider. - [#1612](https://github.com/thanos-io/thanos/pull/1612) Thanos Rule added `resendDelay` flag. - [#1480](https://github.com/thanos-io/thanos/pull/1480) Thanos Receive flushes storage on hashring change. @@ -645,8 +628,7 @@ both Prometheus and sidecar with Thanos: https://prometheus.io/blog/2019/10/10/r ### Changed -- [#1362](https://github.com/thanos-io/thanos/pull/1362) `query.replica-label` configuration can be provided more than - once for multiple deduplication labels like: `--query.replica-label=prometheus_replica --query.replica-label=service`. +- [#1362](https://github.com/thanos-io/thanos/pull/1362) `query.replica-label` configuration can be provided more than once for multiple deduplication labels like: `--query.replica-label=prometheus_replica --query.replica-label=service`. - [#1581](https://github.com/thanos-io/thanos/pull/1581) Thanos Store now can use smaller buffer sizes for Bytes pool; reducing memory for some requests. - [#1622](https://github.com/thanos-io/thanos/pull/1622) & [#1590](https://github.com/thanos-io/thanos/pull/1590) Upgraded to Go 1.13.1 - [#1498](https://github.com/thanos-io/thanos/pull/1498) Thanos Receive change flag `labels` to `label` to be consistent with other commands. 
@@ -666,16 +648,15 @@ both Prometheus and sidecar with Thanos: https://prometheus.io/blog/2019/10/10/r Accepted into CNCF: -- Thanos moved to new repository -- Docker images moved to and mirrored at -- Slack moved to `#thanos`/`#thanos-dev`/`#thanos-prs` +- Thanos moved to new repository https://github.com/thanos-io/thanos +- Docker images moved to https://quay.io/thanos/thanos and mirrored at https://hub.docker.com/r/thanosio/thanos +- Slack moved to https://slack.cncf.io `#thanos`/`#thanos-dev`/`#thanos-prs` ### Added - [#1478](https://github.com/thanos-io/thanos/pull/1478) Thanos components now exposes gRPC server metrics as soon as server starts, to provide more reliable data for instrumentation. - [#1378](https://github.com/thanos-io/thanos/pull/1378) Thanos Receive now exposes `thanos_receive_config_hash`, `thanos_receive_config_last_reload_successful` and `thanos_receive_config_last_reload_success_timestamp_seconds` metrics to track latest configuration change -- [#1268](https://github.com/thanos-io/thanos/pull/1268) Thanos Sidecar added support for newest Prometheus streaming remote read added [here](https://github.com/prometheus/prometheus/pull/5703). This massively improves memory required by single - request for both Prometheus and sidecar. Single requests now should take constant amount of memory on sidecar, so resource consumption prediction is now straightforward. This will be used if you have Prometheus `2.13` or `2.12-master`. +- [#1268](https://github.com/thanos-io/thanos/pull/1268) Thanos Sidecar added support for newest Prometheus streaming remote read added [here](https://github.com/prometheus/prometheus/pull/5703). This massively improves memory required by single request for both Prometheus and sidecar. Single requests now should take constant amount of memory on sidecar, so resource consumption prediction is now straightforward. This will be used if you have Prometheus `2.13` or `2.12-master`. 
- [#1358](https://github.com/thanos-io/thanos/pull/1358) Added `part_size` configuration option for HTTP multipart requests minimum part size for S3 storage type - [#1363](https://github.com/thanos-io/thanos/pull/1363) Thanos Receive now exposes `thanos_receive_hashring_nodes` and `thanos_receive_hashring_tenants` metrics to monitor status of hash-rings - [#1395](https://github.com/thanos-io/thanos/pull/1395) Thanos Sidecar added `/-/ready` and `/-/healthy` endpoints to Thanos sidecar. @@ -729,7 +710,7 @@ Accepted into CNCF: - [#1147](https://github.com/thanos-io/thanos/pull/1147) Support for the Jaeger tracer has been added! -_breaking_ New common flags were added for configuring tracing: `--tracing.config-file` and `--tracing.config`. You can either pass a file to Thanos with the tracing configuration or pass it in the command line itself. Old `--gcloudtrace.*` flags were removed :warning: +*breaking* New common flags were added for configuring tracing: `--tracing.config-file` and `--tracing.config`. You can either pass a file to Thanos with the tracing configuration or pass it in the command line itself. Old `--gcloudtrace.*` flags were removed :warning: To migrate over the old `--gcloudtrace.*` configuration, your tracing configuration should look like this: @@ -746,9 +727,7 @@ The other `type` you can use is `JAEGER` now. The `config` keys and values are J ### Changed -- [#1284](https://github.com/thanos-io/thanos/pull/1284) Add support for multiple label-sets in Info gRPC service. - This deprecates the single `Labels` slice of the `InfoResponse`, in a future release backward compatible handling for the single set of Labels will be removed. Upgrading to v0.6.0 or higher is advised. - _breaking_ If you run have duplicate queries in your Querier configuration with hierarchical federation of multiple Queries this PR makes Thanos Querier to detect this case and block all duplicates. Refer to 0.6.1 which at least allows for single replica to work. 
+- [#1284](https://github.com/thanos-io/thanos/pull/1284) Add support for multiple label-sets in Info gRPC service. This deprecates the single `Labels` slice of the `InfoResponse`, in a future release backward compatible handling for the single set of Labels will be removed. Upgrading to v0.6.0 or higher is advised. *breaking* If you have duplicate queries in your Querier configuration with hierarchical federation of multiple Queries this PR makes Thanos Querier detect this case and block all duplicates. Refer to 0.6.1 which at least allows for single replica to work. - [#1314](https://github.com/thanos-io/thanos/pull/1314) Removes `http_request_duration_microseconds` (Summary) and adds `http_request_duration_seconds` (Histogram) from http server instrumentation used in Thanos APIs and UIs. @@ -786,7 +765,7 @@ TL;DR: Store LRU cache is no longer leaking, Upgraded Thanos UI to Prometheus 2. This version moved tarballs to Golang 1.12.5 from 1.11 as well, so same warning applies if you use `container_memory_usage_bytes` from cadvisor. Use `container_memory_working_set_bytes` instead. -_breaking_ As announced couple of times this release also removes gossip with all configuration flags (`--cluster.*`). +*breaking* As announced couple of times this release also removes gossip with all configuration flags (`--cluster.*`). ### Fixed @@ -801,8 +780,7 @@ _breaking_ As announced couple of times this release also removes gossip with al ### Changed -- [#1118](https://github.com/thanos-io/thanos/pull/1118) _breaking_ swift: Added support for cross-domain authentication by introducing `userDomainID`, `userDomainName`, `projectDomainID`, `projectDomainName`. - The outdated terms `tenantID`, `tenantName` are deprecated and have been replaced by `projectID`, `projectName`.
+- [#1118](https://github.com/thanos-io/thanos/pull/1118) *breaking* swift: Added support for cross-domain authentication by introducing `userDomainID`, `userDomainName`, `projectDomainID`, `projectDomainName`. The outdated terms `tenantID`, `tenantName` are deprecated and have been replaced by `projectID`, `projectName`. - [#1066](https://github.com/thanos-io/thanos/pull/1066) Upgrade Thanos ui to Prometheus v2.9.1. @@ -842,14 +820,13 @@ _breaking_ As announced couple of times this release also removes gossip with al ## Deprecated -- [#1008](https://github.com/thanos-io/thanos/pull/1008) _breaking_ Removed Gossip implementation. All `--cluster.*` flags removed and Thanos will error out if any is provided. +- [#1008](https://github.com/thanos-io/thanos/pull/1008) *breaking* Removed Gossip implementation. All `--cluster.*` flags removed and Thanos will error out if any is provided. ## [v0.4.0](https://github.com/thanos-io/thanos/releases/tag/v0.4.0) - 2019.05.3 :warning: **IMPORTANT** :warning: This is the last release that supports gossip. From Thanos v0.5.0, gossip will be completely removed. -This release also disables gossip mode by default for all components. -See [this](docs/proposals/201809_gossip-removal.md) for more details. +This release also disables gossip mode by default for all components. See [this](docs/proposals/201809_gossip-removal.md) for more details. :warning: This release moves Thanos docker images (NOT artifacts by accident) to Golang 1.12. This release includes change in GC's memory release which gives following effect: @@ -878,26 +855,27 @@ New options: New Store flags: - * `--store.grpc.series-sample-limit` limits the amount of samples that might be retrieved on a single Series() call. By default it is 0. Consider enabling it by setting it to more than 0 if you are running on limited resources. - * `--store.grpc.series-max-concurrency` limits the number of concurrent Series() calls in Thanos Store. By default it is 20. 
Considering making it lower or bigger depending on the scale of your deployment. +``` +* `--store.grpc.series-sample-limit` limits the amount of samples that might be retrieved on a single Series() call. By default it is 0. Consider enabling it by setting it to more than 0 if you are running on limited resources. +* `--store.grpc.series-max-concurrency` limits the number of concurrent Series() calls in Thanos Store. By default it is 20. Considering making it lower or bigger depending on the scale of your deployment. +``` New Store metrics: - * `thanos_bucket_store_queries_dropped_total` shows how many queries were dropped due to the samples limit; - * `thanos_bucket_store_queries_concurrent_max` is a constant metric which shows how many Series() calls can concurrently be executed by Thanos Store; - * `thanos_bucket_store_queries_in_flight` shows how many queries are currently "in flight" i.e. they are being executed; - * `thanos_bucket_store_gate_duration_seconds` shows how many seconds it took for queries to pass through the gate in both cases - when that fails and when it does not. +``` +* `thanos_bucket_store_queries_dropped_total` shows how many queries were dropped due to the samples limit; +* `thanos_bucket_store_queries_concurrent_max` is a constant metric which shows how many Series() calls can concurrently be executed by Thanos Store; +* `thanos_bucket_store_queries_in_flight` shows how many queries are currently "in flight" i.e. they are being executed; +* `thanos_bucket_store_gate_duration_seconds` shows how many seconds it took for queries to pass through the gate in both cases - when that fails and when it does not. +``` New Store tracing span: \* `store_query_gate_ismyturn` shows how long it took for a query to pass (or not) through the gate. -- [#1016](https://github.com/thanos-io/thanos/pull/1016) Added option for another DNS resolver (miekg/dns client). 
- Note that this is required to have SRV resolution working on [Golang 1.11+ with KubeDNS below v1.14](https://github.com/golang/go/issues/27546) +- [#1016](https://github.com/thanos-io/thanos/pull/1016) Added option for another DNS resolver (miekg/dns client). Note that this is required to have SRV resolution working on [Golang 1.11+ with KubeDNS below v1.14](https://github.com/golang/go/issues/27546) New Querier and Ruler flag: `-- store.sd-dns-resolver` which allows to specify resolver to use. Either `golang` or `miekgdns` -- [#986](https://github.com/thanos-io/thanos/pull/986) Allow to save some startup & sync time in store gateway as it is no longer needed to compute index-cache from block index on its own for larger blocks. - The store Gateway still can do it, but it first checks bucket if there is index-cached uploaded already. - In the same time, compactor precomputes the index cache file on every compaction. +- [#986](https://github.com/thanos-io/thanos/pull/986) Allow to save some startup & sync time in store gateway as it is no longer needed to compute index-cache from block index on its own for larger blocks. The store Gateway still can do it, but it first checks bucket if there is index-cached uploaded already. In the same time, compactor precomputes the index cache file on every compaction. New Compactor flag: `--index.generate-missing-cache-file` was added to allow quicker addition of index cache files. If enabled it precomputes missing files on compactor startup. Note that it will take time and it's only one-off step per bucket. @@ -916,8 +894,7 @@ New Store tracing span: \* `store_query_gate_ismyturn` shows how long it took fo ### Changed -- [#970](https://github.com/thanos-io/thanos/pull/970) Deprecated `partial_response_disabled` proto field. Added `partial_response_strategy` instead. Both in gRPC and Query API. 
- No `PartialResponseStrategy` field for `RuleGroups` by default means `abort` strategy (old PartialResponse disabled) as this is recommended option for Rules and alerts. +- [#970](https://github.com/thanos-io/thanos/pull/970) Deprecated `partial_response_disabled` proto field. Added `partial_response_strategy` instead. Both in gRPC and Query API. No `PartialResponseStrategy` field for `RuleGroups` by default means `abort` strategy (old PartialResponse disabled) as this is recommended option for Rules and alerts. Metrics: @@ -948,12 +925,11 @@ New Store tracing span: \* `store_query_gate_ismyturn` shows how long it took fo - tooling: [FEATURE] New dump command to tsdb tool to dump all samples. - compactor: - [ENHANCEMENT] When closing the db any running compaction will be cancelled so it doesn't block. - - [CHANGE] _breaking_ Renamed flag `--sync-delay` to `--consistency-delay` [#1053](https://github.com/thanos-io/thanos/pull/1053) + - [CHANGE] *breaking* Renamed flag `--sync-delay` to `--consistency-delay` [#1053](https://github.com/thanos-io/thanos/pull/1053) For ruler essentially whole TSDB CHANGELOG applies between v0.4.0-v0.6.1: https://github.com/prometheus/tsdb/blob/master/CHANGELOG.md - Note that this was added on TSDB and Prometheus: [FEATURE] Time-ovelapping blocks are now allowed. #370 - Whoever due to nature of Thanos compaction (distributed systems), for safety reason this is disabled for Thanos compactor for now. + Note that this was added on TSDB and Prometheus: [FEATURE] Time-overlapping blocks are now allowed. #370 However, due to the nature of Thanos compaction (distributed systems), for safety reasons this is disabled for Thanos compactor for now. - [#868](https://github.com/thanos-io/thanos/pull/868) Go has been updated to 1.12. - [#1055](https://github.com/thanos-io/thanos/pull/1055) Gossip flags are now disabled by default and deprecated.
@@ -983,9 +959,7 @@ New Store tracing span: \* `store_query_gate_ismyturn` shows how long it took fo - [#851](https://github.com/thanos-io/thanos/pull/851) New read API endpoint for api/v1/rules and api/v1/alerts. - [#873](https://github.com/thanos-io/thanos/pull/873) Store: fix set index cache LRU -:warning: **WARNING** :warning: #873 fix fixes actual handling of `index-cache-size`. Handling of limit for this cache was -broken so it was unbounded all the time. From this release actual value matters and is extremely low by default. To "revert" -the old behaviour (no boundary), use a large enough value. +:warning: **WARNING** :warning: #873 fix fixes actual handling of `index-cache-size`. Handling of limit for this cache was broken so it was unbounded all the time. From this release actual value matters and is extremely low by default. To "revert" the old behaviour (no boundary), use a large enough value. ### Fixed @@ -1047,7 +1021,7 @@ the old behaviour (no boundary), use a large enough value. ### Deprecated -- Tests against Prometheus below v2.2.1. This does not mean _lack_ of support for those. Only that we don't tests the compatibility anymore. See [#758](https://github.com/thanos-io/thanos/issues/758) for details. +- Tests against Prometheus below v2.2.1. This does not mean *lack* of support for those. Only that we don't test the compatibility anymore. See [#758](https://github.com/thanos-io/thanos/issues/758) for details. ## [v0.2.1](https://github.com/thanos-io/thanos/releases/tag/v0.2.1) - 2018.12.27 @@ -1056,7 +1030,7 @@ the old behaviour (no boundary), use a large enough value. - Relabel drop for Thanos Ruler to enable replica label drop and alert deduplication on AM side. - Query: Stores UI page available at `/stores`.
-![](./docs/img/query_ui_stores.png) +![](docs/img/query_ui_stores.png) ### Fixed @@ -1073,7 +1047,7 @@ Note lots of necessary breaking changes in flags that relates to bucket configur ### Deprecated -- _breaking_: Removed all bucket specific flags as we moved to config files: +- *breaking*: Removed all bucket specific flags as we moved to config files: - --gcs-bucket=\ - --s3.bucket=\ - --s3.endpoint=\ @@ -1083,17 +1057,17 @@ Note lots of necessary breaking changes in flags that relates to bucket configur - --s3.encrypt-sse - --gcs-backup-bucket=\ - --s3-backup-bucket=\ -- _breaking_: Removed support of those environment variables for bucket: +- *breaking*: Removed support of those environment variables for bucket: - S3_BUCKET - S3_ENDPOINT - S3_ACCESS_KEY - S3_INSECURE - S3_SIGNATURE_VERSION2 -- _breaking_: Removed provider specific bucket metrics e.g `thanos_objstore_gcs_bucket_operations_total` in favor of of generic bucket operation metrics. +- *breaking*: Removed provider specific bucket metrics e.g `thanos_objstore_gcs_bucket_operations_total` in favor of generic bucket operation metrics. ### Changed -- _breaking_: Added `thanos_` prefix to memberlist (gossip) metrics. Make sure to update your dashboards and rules. +- *breaking*: Added `thanos_` prefix to memberlist (gossip) metrics. Make sure to update your dashboards and rules. - S3 provider: - Set `"X-Amz-Acl": "bucket-owner-full-control"` metadata for s3 upload operation. diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index f0db01ee15..f87eb555a2 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -4,15 +4,9 @@ Thanos follows the [CNCF Code of Conduct](https://github.com/cncf/foundation/blo # Enforcement -Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting -the project team members (e.g @bwplotka, @povilasv) at [CNCF slack workspace](https://slack.cncf.io/).
-The project team will review and investigate all complaints, -and will respond in a way that it deems appropriate to the circumstances. -The project team is obligated to maintain confidentiality with regard to the -reporter of an incident. Further details of specific enforcement policies may be posted separately. +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team members (e.g @bwplotka, @povilasv) at [CNCF slack workspace](https://slack.cncf.io/). The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. -Project maintainers who do not follow or enforce the Code of Conduct in good faith may -face temporary or permanent repercussions as determined by other members of the project's leadership. +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. ## Attribution diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c27f398aeb..ead79ad77e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,7 +2,7 @@ This document explain the process of contributing to the Thanos project. -First of all please follow the [CODE_OF_CONDUCT](/CODE_OF_CONDUCT.md) in all your interactions within the project. +First of all please follow the [CODE_OF_CONDUCT](CODE_OF_CONDUCT.md) in all your interactions within the project. ## Thanos Philosophy @@ -19,22 +19,17 @@ The philosophy of Thanos and our community borrows heavily from UNIX philosophy If you encounter any issue or you have an idea to improve, please: -* Search through Google and [existing open and closed GitHub Issues](https://github.com/thanos-io/thanos/issues) for the -answer first. 
If you find a relevant topic, please comment on the issue. -* If none of the issues are relevant, please add an issue to [GitHub issues](https://github.com/thanos-io/thanos/issues). Please provide -any relevant information as suggested by the Pull Request template. -* If you have a quick question you might want to also ask on #thanos or #thanos-dev slack channel in the CNCF workspace. -We recommend using GitHub issues for issues and feedback, because GitHub issues are trackable. +* Search through Google and [existing open and closed GitHub Issues](https://github.com/thanos-io/thanos/issues) for the answer first. If you find a relevant topic, please comment on the issue. +* If none of the issues are relevant, please add an issue to [GitHub issues](https://github.com/thanos-io/thanos/issues). Please provide any relevant information as suggested by the Pull Request template. +* If you have a quick question you might want to also ask on #thanos or #thanos-dev slack channel in the CNCF workspace. We recommend using GitHub issues for issues and feedback, because GitHub issues are trackable. -If you encounter a security vulnerability, please refer to [Reporting a Vulnerability process](/SECURITY.md#reporting-a-vulnerability) +If you encounter a security vulnerability, please refer to [Reporting a Vulnerability process](SECURITY.md#reporting-a-vulnerability) ## Adding New Features / Components -When contributing a complex change to Thanos repository, please -discuss the change you wish to make within a Github issue, in Slack, or by another -method with the owners of this repository before making the change. +When contributing a complex change to Thanos repository, please discuss the change you wish to make within a Github issue, in Slack, or by another method with the owners of this repository before making the change. 
-Adding a large new feature or/and component to Thanos should be done by first creating a [proposal](/docs/proposals) document outlining the design decisions of the change, motivations for the change, and any alternatives that might have been considered. +Adding a large new feature or/and component to Thanos should be done by first creating a [proposal](docs/contributing/proposal-process.md) document outlining the design decisions of the change, motivations for the change, and any alternatives that might have been considered. ## General Naming @@ -46,7 +41,7 @@ In the code and documentation prefer non-offensive terminology, for example: ## Components Naming -Thanos is a distributed system comprised of several services and CLI tools as listed [here](/cmd/thanos). +Thanos is a distributed system comprised of several services and CLI tools as listed [here](cmd/thanos). When we refer to them in a technical capacity we use the verbal form: `store`, `compact`, `rule`, `query`, `query_frontend`. This includes: @@ -77,15 +72,12 @@ The following section explains various suggestions and procedures to note during ### First Steps -It's key to get familiarized with the style guide and mechanics of Thanos, especially if your contribution touches more than one -component of the Thanos distributed system. We recommend: +It's key to get familiarized with the style guide and mechanics of Thanos, especially if your contribution touches more than one component of the Thanos distributed system. We recommend: * Reading the [getting started docs](docs/getting-started.md) and working through them, or alternatively working through the [Thanos tutorial](https://katacoda.com/thanos). * Familiarizing yourself with our [coding style guidelines.](docs/contributing/coding-style-guide.md). -* Familiarizing yourself with the [Makefile](Makefile) commands, for example `format`, `build`, `proto`, `docker` and `test`. -`make help` will print most of available commands with relevant details. 
-* Spin up a prebuilt dev environment using Gitpod.io -[![Gitpod Ready-to-Code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/thanos-io/thanos) +* Familiarizing yourself with the [Makefile](Makefile) commands, for example `format`, `build`, `proto`, `docker` and `test`. `make help` will print most of available commands with relevant details. +* Spin up a prebuilt dev environment using Gitpod.io [![Gitpod Ready-to-Code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/thanos-io/thanos) * In case you want to develop the project locally, install **Golang** in your machine. Here is a nice [gist](https://gist.github.com/nikhita/432436d570b89cab172dcf2894465753) for this purpose. ### Installing Project locally in your machine @@ -93,6 +85,7 @@ component of the Thanos distributed system. We recommend: * Find any directory in your system your want Thanos repo in. e.g `~/Repos` - * `cd ~/Repos` * Make sure that the GOBIN, GOPATH and GOPROXY (useful) environment variables are set and that GOBIN is included in your PATH. For example - + ``` export GOBIN="~/Repos/thanos/.bin" # It's nice to have local tooling installed and stored locally. @@ -111,8 +104,8 @@ component of the Thanos distributed system. We recommend: ### Building/Running/Developing * Run `make help` for getting a list of helper commands that will make your development life much more easy. Especially consider using `make lint` often. It provides auto **linting** and **formatter** for making sure the code quality meets the standards of contribution. -* Usually, while sending in a PR `make build`, `make format`, `make lint`, `make test`, `make docs`, `make check-docs`, `make quickstart` are the most used commands while developing Thanos. -* When you run `make build` from Thanos repo root, code is compiled and a binary named `thanos` is created and built into your `$GOBIN` or `$GOPATH/bin`. 
+* Usually, while sending in a PR `make build`, `make format`, `make lint`, `make test`, `make docs`, `make check-docs`, `make quickstart` are the most used commands while developing Thanos. +* When you run `make build` from Thanos repo root, code is compiled and a binary named `thanos` is created and built into your `$GOBIN` or `$GOPATH/bin`. * In case you are working on a component of Thanos, you would love it if you don’t have to set up the yaml configuration for Prometheus and other components, before you start running the component. This is a repetitive task, and the Thanos Community has provided commands/script for automating the running of components - * Run `make quickstart` for spinning up all components of Thanos quickly. * If you want to run specific components instead of all, feel free to use and edit - [quickstart.sh](https://github.com/thanos-io/thanos/blob/b08c0ea62abfe4dcf1400da0e37598f0cd8fa8cf/scripts/quickstart.sh) @@ -140,15 +133,15 @@ $ thanos -h **Signing your work: DCO (Developer Certificate of Origin) Process.** * By contributing to this project you agree to the [Developer Certificate of Origin](https://developercertificate.org/)(DCO). This document was created by the Linux Kernel community and is a simple statement that you, as a contributor, have the legal right to make the contribution. -* To signoff, you need to add `Signed-off-by: Your Name ` at the end of your commit messages. You can do this using [`git commit -s`](https://git-scm.com/docs/git-commit#Documentation/git-commit.txt--s). For example: +* To signoff, you need to add `Signed-off-by: Your Name ` at the end of your commit messages. You can do this using [`git commit -s`](https://git-scm.com/docs/git-commit#Documentation/git-commit.txt--s). For example: + ``` $ git commit -s -m 'This is my commit message' ``` -* You can also alias ``commit`` as `commit -s` in your `~/.gitconfig` to signoff all your future commits. 
-* If you have authored an unsigned commit, you can update it using ``git commit --amend --signoff``. If you've pushed your changes to GitHub already you'll need to force push your branch after this with ``git push -f``. +* You can also alias `commit` as `commit -s` in your `~/.gitconfig` to signoff all your future commits. +* If you have authored an unsigned commit, you can update it using `git commit --amend --signoff`. If you've pushed your changes to GitHub already you'll need to force push your branch after this with `git push -f`. -1. Keep PRs as small as possible. For each of your PRs, you create a new branch based on the latest main. -Chain them if needed (base one PR on other PRs). You can read more details about the workflow from [here](https://gist.github.com/Chaser324/ce0505fbed06b947d962). +1. Keep PRs as small as possible. For each of your PRs, you create a new branch based on the latest main. Chain them if needed (base one PR on other PRs). You can read more details about the workflow from [here](https://gist.github.com/Chaser324/ce0505fbed06b947d962). ```console $ git checkout main @@ -159,9 +152,8 @@ $ make build $ $ git push origin ``` -**Tests your changes** - +**Tests your changes** **Updating your branch** @@ -172,10 +164,9 @@ It is a good practice to keep your branch updated by rebasing your branch to mai **Changelog and Review Procedure** * If your change affects users (adds or removes feature) consider adding the item to the [CHANGELOG](CHANGELOG.md). -* You may merge the Pull Request once you have the sign-off of at least one developer with write access, or if you -do not have permission to do that, you may request the second reviewer to merge it for you. +* You may merge the Pull Request once you have the sign-off of at least one developer with write access, or if you do not have permission to do that, you may request the second reviewer to merge it for you. 
* If you feel like your PR is waiting too long for a review, feel free to ping the [`#thanos-dev`](https://slack.cncf.io/) channel on our slack for a review! -* If you are a new contributor with no write access, you can tag in the respective maintainer for the changes, but be patient enough for the reviews. _Remember, good things take time :)_ +* If you are a new contributor with no write access, you can tag in the respective maintainer for the changes, but be patient enough for the reviews. *Remember, good things take time :)* ### Dependency management @@ -205,27 +196,21 @@ You have to commit the changes to `go.mod` and `go.sum` before submitting the pu * Thanos provides make commands that help you run the tests locally. * If you don't have a live object store ready, you can use the `make test-local` command. -* **NOTE**: This command skips tests against live object storage systems by specifying environment variables; this causes the -store-specific tests to be run against memory and filesystem object storage types only. The CI tests run uses GCS, AWS and Swift. +* **NOTE**: This command skips tests against live object storage systems by specifying environment variables; this causes the store-specific tests to be run against memory and filesystem object storage types only. The CI tests run uses GCS, AWS and Swift. * Not specifying these variables will result in auth errors against GCS, AWS, Azure, COS etc. * If you have a decent hardware to run the tests, you can run them locally. * If you want to run the tests once in a while, it is suitable for you to send in a PR, the built in CI/CD setup runs the tests for you, which is nice for once in a while run. -* `make test`: Runs all Thanos Go unit tests against each supported version of Prometheus. This excludes tests in `./test/e2e`. -* `make test-local`: Runs test excluding tests for ALL object storage integrations. -* `make test-e2e`: Runs all Thanos e2e docker-based e2e tests from test/e2e. 
Required access to docker daemon. -* `make test-e2e-local`: Runs all thanos e2e tests locally. +* `make test`: Runs all Thanos Go unit tests against each supported version of Prometheus. This excludes tests in `./test/e2e`. +* `make test-local`: Runs test excluding tests for ALL object storage integrations. +* `make test-e2e`: Runs all Thanos e2e docker-based e2e tests from test/e2e. Required access to docker daemon. +* `make test-e2e-local`: Runs all thanos e2e tests locally. ### Advanced testing -At some point during development it is useful, in addition to running unit or e2e tests, to run and play with Thanos components manually. While you -can run any component manually by crafting specific flags for a test setup, there are already some nice tools and scripts available. -Consider the following methods: +At some point during development it is useful, in addition to running unit or e2e tests, to run and play with Thanos components manually. While you can run any component manually by crafting specific flags for a test setup, there are already some nice tools and scripts available. Consider the following methods: -* `make quickstart`: this command spins -up a simple setup of all Thanos components. -* `make test-e2e`: the e2e tests cover most of the setups and functionality Thanos offers. It's extremely easy to add `time.Sleep(10* time.Minutes)` -at certain points in the tests (e.g for compactor [here](https://github.com/thanos-io/thanos/blob/8f492a9f073f819019dd9f044e346a1e1fa730bc/test/e2e/compact_test.go#L379)). -This way when you run `make test-e2e`, the test will sleep for some time, allowing you to connect to the setup manually using the port printed in the logs. For example: +* `make quickstart`: this command spins up a simple setup of all Thanos components. +* `make test-e2e`: the e2e tests cover most of the setups and functionality Thanos offers. 
It's extremely easy to add `time.Sleep(10* time.Minutes)` at certain points in the tests (e.g for compactor [here](https://github.com/thanos-io/thanos/blob/8f492a9f073f819019dd9f044e346a1e1fa730bc/test/e2e/compact_test.go#L379)). This way when you run `make test-e2e`, the test will sleep for some time, allowing you to connect to the setup manually using the port printed in the logs. For example: ```bash querier-1: level=info name=querier-1 ts=2020-04-01T12:53:56.101029491Z caller=http.go:56 service=http/server component=query msg="listening for requests and metrics" address=:80 diff --git a/MAINTAINERS.md b/MAINTAINERS.md index 65a39551b8..ee24be5b8a 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -1,20 +1,18 @@ # Core Maintainers of this repository -| Name | Email | Slack | GitHub | Company | -|-----------------------|------------------------|--------------------------|---------------------------------------------|-------------------| -| Bartłomiej Płotka | bwplotka@gmail.com | `@bwplotka` | [@bwplotka](https://github.com/bwplotka) | Red Hat | -| Frederic Branczyk | fbranczyk@gmail.com | `@brancz` | [@brancz](https://github.com/brancz) | Polar Signals | -| Giedrius Statkevičius | giedriuswork@gmail.com | `@Giedrius Statkevičius` | [@GiedriusS](https://github.com/GiedriusS) | AdForm | -| Kemal Akkoyun | kakkoyun@gmail.com | `@kakkoyun` | [@kakkoyun](https://github.com/kakkoyun) | Red Hat | -| Lucas Servén Marín | lserven@gmail.com | `@squat` | [@squat](https://github.com/squat) | Red Hat | -| Prem Saraswat | prmsrswt@gmail.com | `@Prem Saraswat` | [@onprem](https://github.com/onprem) | Red Hat | -| Marco Pracucci | marco@pracucci.com | `@pracucci` | [@pracucci](https://github.com/pracucci) | Grafana Labs | -| Matthias Loibl | mail@matthiasloibl.com | `@metalmatze` | [@metalmatze](https://github.com/metalmatze)| Polar Signals | -| Ben Ye | yb532204897@gmail.com | `@yeya24` | [@yeya24](https://github.com/yeya24) | Red Hat | - -We are bunch of people from 
different companies with various interests and skills. -We are from different parts of the world: Germany, Italy, Lithuania, Poland, UK, India and China. -We have something in common though: We all share the love for OpenSource, Go, Prometheus, :coffee: and Observability topics. +| Name | Email | Slack | GitHub | Company | +|-----------------------|------------------------|--------------------------|----------------------------------------------|---------------| +| Bartłomiej Płotka | bwplotka@gmail.com | `@bwplotka` | [@bwplotka](https://github.com/bwplotka) | Red Hat | +| Frederic Branczyk | fbranczyk@gmail.com | `@brancz` | [@brancz](https://github.com/brancz) | Polar Signals | +| Giedrius Statkevičius | giedriuswork@gmail.com | `@Giedrius Statkevičius` | [@GiedriusS](https://github.com/GiedriusS) | AdForm | +| Kemal Akkoyun | kakkoyun@gmail.com | `@kakkoyun` | [@kakkoyun](https://github.com/kakkoyun) | Red Hat | +| Lucas Servén Marín | lserven@gmail.com | `@squat` | [@squat](https://github.com/squat) | Red Hat | +| Prem Saraswat | prmsrswt@gmail.com | `@Prem Saraswat` | [@onprem](https://github.com/onprem) | Red Hat | +| Marco Pracucci | marco@pracucci.com | `@pracucci` | [@pracucci](https://github.com/pracucci) | Grafana Labs | +| Matthias Loibl | mail@matthiasloibl.com | `@metalmatze` | [@metalmatze](https://github.com/metalmatze) | Polar Signals | +| Ben Ye | yb532204897@gmail.com | `@yeya24` | [@yeya24](https://github.com/yeya24) | Red Hat | + +We are bunch of people from different companies with various interests and skills. We are from different parts of the world: Germany, Italy, Lithuania, Poland, UK, India and China. We have something in common though: We all share the love for OpenSource, Go, Prometheus, :coffee: and Observability topics. As either Software Developers or SRE (or both!) we've chosen to maintain (mostly in our free time) Thanos, the de facto way to scale awesome [Prometheus](https://prometheus.io) project. 
@@ -26,25 +24,23 @@ You can reach us under `thanos-io@googlegroups.com` email. ## Triage -We also have some nice souls that help triaging issues and PRs. See [here](https://help.github.com/en/articles/repository-permission-levels-for-an-organization#permission-levels-for-repositories-owned-by-an-organization) -for details about the role's permission. +We also have some nice souls that help triaging issues and PRs. See [here](https://help.github.com/en/articles/repository-permission-levels-for-an-organization#permission-levels-for-repositories-owned-by-an-organization) for details about the role's permission. Full list of triage persons is displayed below: -| Name | Slack | GitHub | Company | -|-----------------------|--------------------------|------------------------------------------------------------|-----------------------| -| Adrien Fillon | `@Adrien F` | [@adrien-f](https://github.com/adrien-f) | | -| Martin Chodur | `@FUSAKLA` | [@fusakla](https://github.com/fusakla) | | -| Michael Dai | `@jojohappy` | [@jojohappy](https://github.com/jojohappy) | | -| Xiang Dai | `@daixiang0` | [@daixiang0](https://github.com/daixiang0) | | -| Wiard van Rij | `@wiard van Rij` | [@wiardvanrij](https://github.com/wiardvanrij) | Fullstaq | +| Name | Slack | GitHub | Company | +|---------------|------------------|------------------------------------------------|----------| +| Adrien Fillon | `@Adrien F` | [@adrien-f](https://github.com/adrien-f) | | +| Martin Chodur | `@FUSAKLA` | [@fusakla](https://github.com/fusakla) | | +| Michael Dai | `@jojohappy` | [@jojohappy](https://github.com/jojohappy) | | +| Xiang Dai | `@daixiang0` | [@daixiang0](https://github.com/daixiang0) | | +| Wiard van Rij | `@wiard van Rij` | [@wiardvanrij](https://github.com/wiardvanrij) | Fullstaq | Please reach any of the maintainer on slack or email if you want to help as well. 
### Triage labels -To improve navigating through issues and PRs we introduce various [labels](https://github.com/thanos-io/thanos/issues/labels). Part of triaging process for Triage and Maintainers -is to adjust those labels if needed manual. The `Prow` system can automate a portion of this in future. +To improve navigating through issues and PRs we introduce various [labels](https://github.com/thanos-io/thanos/issues/labels). Part of triaging process for Triage and Maintainers is to adjust those labels if needed manual. The `Prow` system can automate a portion of this in future. The main labels are: @@ -76,8 +72,7 @@ This helps to also estimate how long it can potentially take to review the PR or #### State -Github shows some basic states, but sometimes PR is stale due to requested changes, but it's not clear from first glance. -That's why `state: changes-requested` is helpful on those. +Github shows some basic states, but sometimes PR is stale due to requested changes, but it's not clear from first glance. That's why `state: changes-requested` is helpful on those. #### Other @@ -95,19 +90,16 @@ Maintainers of bucket storage clients are available [here](/docs/storage.md#impl ## How to be maintainer? -Any [contributor](/CONTRIBUTING.md) that shows effort and willingness in maintaining Thanos repository can join maintainer team. +Any [contributor](CONTRIBUTING.md) that shows effort and willingness in maintaining Thanos repository can join maintainer team. Open Source is all about the trust, which is the key factor in decision to add write permissions. -In time we plan to set up maintainers team that will be organization independent. Reach us if you have any questions or want to join -maintainer team. +In time we plan to set up maintainers team that will be organization independent. Reach us if you have any questions or want to join maintainer team. 
## Initial authors Fabian Reinartz @fabxc and Bartłomiej Płotka @bwplotka - ## Previous Maintainers -Dominic Green -Povilas Versockas +Dominic Green, Povilas Versockas diff --git a/Makefile b/Makefile index 5fb123e728..24b86946b4 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ include .bingo/Variables.mk FILES_TO_FMT ?= $(shell find . -path ./vendor -prune -o -name '*.go' -print) +MD_FILES_TO_FORMAT = $(shell find docs -name "*.md") $(shell ls *.md) DOCKER_IMAGE_REPO ?= quay.io/thanos/thanos DOCKER_IMAGE_TAG ?= $(subst /,-,$(shell git rev-parse --abbrev-ref HEAD))-$(shell date +%Y-%m-%d)-$(shell git rev-parse --short HEAD) @@ -167,28 +168,24 @@ docker-push: @docker push "$(DOCKER_IMAGE_REPO):$(DOCKER_IMAGE_TAG)" .PHONY: docs -docs: ## Regenerates flags in docs for all thanos commands. -docs: $(EMBEDMD) build +docs: ## Regenerates flags in docs for all thanos commands localise links, ensure GitHub format. +docs: $(MDOX) build @echo ">> generating docs" - @EMBEDMD_BIN="$(EMBEDMD)" SED_BIN="$(SED)" THANOS_BIN="$(GOBIN)/thanos" scripts/genflagdocs.sh - @echo ">> cleaning white noise" - @find . -type f -name "*.md" | SED_BIN="$(SED)" xargs scripts/cleanup-white-noise.sh + PATH=${PATH}:$(GOBIN) $(MDOX) fmt --links.localize.address-regex="https://thanos.io/.*" $(MD_FILES_TO_FORMAT) .PHONY: check-docs check-docs: ## checks docs against discrepancy with flags, links, white noise. -check-docs: $(EMBEDMD) build - @echo ">> checking docs generation" - @EMBEDMD_BIN="$(EMBEDMD)" SED_BIN="$(SED)" THANOS_BIN="$(GOBIN)/thanos" scripts/genflagdocs.sh check - @echo ">> checking links (DISABLED for now)" - @find . 
-type f -name "*.md" | SED_BIN="$(SED)" xargs scripts/cleanup-white-noise.sh +check-docs: $(MDOX) build + @echo ">> checking local links" + PATH=${PATH}:$(GOBIN) $(MDOX) fmt --check --links.localize.address-regex="https://thanos.io/.*" $(MD_FILES_TO_FORMAT) $(call require_clean_work_tree,'run make docs and commit changes') -.PHONY:shell-format +.PHONY: shell-format shell-format: $(SHFMT) @echo ">> formatting shell scripts" @$(SHFMT) -i 2 -ci -w -s $(shell find . -type f -name "*.sh" -not -path "*vendor*" -not -path "tmp/*") -.PHONY:format +.PHONY: format format: ## Formats code including imports and cleans up white noise. format: go-format shell-format @SED_BIN="$(SED)" scripts/cleanup-white-noise.sh $(FILES_TO_FMT) @@ -276,10 +273,15 @@ web-pre-process: web: ## Builds our website. web: web-pre-process $(HUGO) @echo ">> building documentation website" - # TODO(bwplotka): Make it --gc @rm -rf "$(WEB_DIR)/public" @cd $(WEB_DIR) && HUGO_ENV=production $(HUGO) --config hugo.yaml --minify -v -b $(WEBSITE_BASE_URL) +.PHONY: web-serve +web-serve: ## Builds and serves Thanos website on localhost. +web-serve: web-pre-process $(HUGO) + @echo ">> serving documentation website" + @cd $(WEB_DIR) && $(HUGO) --config hugo.yaml -v server + .PHONY:lint lint: ## Runs various static analysis against our code. lint: go-lint react-app-lint shell-lint @@ -314,18 +316,12 @@ sync/atomic=go.uber.org/atomic" ./... @$(MAKE) proto $(call require_clean_work_tree,'detected files without copyright, run make lint and commit changes') -.PHONY:shell-lint +.PHONY: shell-lint shell-lint: ## Runs static analysis against our shell scripts. shell-lint: $(SHELLCHECK) @echo ">> linting all of the shell script files" @$(SHELLCHECK) --severity=error -o all -s bash $(shell find . -type f -name "*.sh" -not -path "*vendor*" -not -path "tmp/*" -not -path "*node_modules*") -.PHONY: web-serve -web-serve: ## Builds and serves Thanos website on localhost. 
-web-serve: web-pre-process $(HUGO) - @echo ">> serving documentation website" - @cd $(WEB_DIR) && $(HUGO) --config hugo.yaml -v server - .PHONY: examples examples: jsonnet-vendor jsonnet-format $(EMBEDMD) ${THANOS_MIXIN}/README.md examples/alerts/alerts.md examples/alerts/alerts.yaml examples/alerts/rules.yaml examples/dashboards examples/tmp mixin/runbook.md $(EMBEDMD) -w examples/alerts/alerts.md diff --git a/README.md b/README.md index a83f5c0fea..dff0615dff 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,22 @@

Thanos Logo

-[![Latest Release](https://img.shields.io/github/release/thanos-io/thanos.svg?style=flat-square)](https://github.com/thanos-io/thanos/releases/latest) -[![Go Report Card](https://goreportcard.com/badge/github.com/thanos-io/thanos)](https://goreportcard.com/report/github.com/thanos-io/thanos) -[![Go Code reference](https://img.shields.io/badge/code%20reference-go.dev-darkblue.svg)](https://pkg.go.dev/github.com/thanos-io/thanos?tab=subdirectories) -[![Slack](https://img.shields.io/badge/join%20slack-%23thanos-brightgreen.svg)](https://slack.cncf.io/) -[![Netlify Status](https://api.netlify.com/api/v1/badges/664a5091-934c-4b0e-a7b6-bc12f822a590/deploy-status)](https://app.netlify.com/sites/thanos-io/deploys) -[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/3048/badge)](https://bestpractices.coreinfrastructure.org/projects/3048) - -[![CI](https://github.com/thanos-io/thanos/workflows/CI/badge.svg)](https://github.com/thanos-io/thanos/actions?query=workflow%3ACI) -[![CI](https://circleci.com/gh/thanos-io/thanos.svg?style=svg)](https://circleci.com/gh/thanos-io/thanos) -[![go](https://github.com/thanos-io/thanos/workflows/go/badge.svg)](https://github.com/thanos-io/thanos/actions?query=workflow%3Ago) -[![react](https://github.com/thanos-io/thanos/workflows/react/badge.svg)](https://github.com/thanos-io/thanos/actions?query=workflow%3Areact) -[![docs](https://github.com/thanos-io/thanos/workflows/docs/badge.svg)](https://github.com/thanos-io/thanos/actions?query=workflow%3Adocs) -[![Gitpod ready-to-code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/thanos-io/thanos) +[![Latest Release](https://img.shields.io/github/release/thanos-io/thanos.svg?style=flat-square)](https://github.com/thanos-io/thanos/releases/latest) [![Go Report Card](https://goreportcard.com/badge/github.com/thanos-io/thanos)](https://goreportcard.com/report/github.com/thanos-io/thanos) [![Go Code 
reference](https://img.shields.io/badge/code%20reference-go.dev-darkblue.svg)](https://pkg.go.dev/github.com/thanos-io/thanos?tab=subdirectories) [![Slack](https://img.shields.io/badge/join%20slack-%23thanos-brightgreen.svg)](https://slack.cncf.io/) [![Netlify Status](https://api.netlify.com/api/v1/badges/664a5091-934c-4b0e-a7b6-bc12f822a590/deploy-status)](https://app.netlify.com/sites/thanos-io/deploys) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/3048/badge)](https://bestpractices.coreinfrastructure.org/projects/3048) + +[![CI](https://github.com/thanos-io/thanos/workflows/CI/badge.svg)](https://github.com/thanos-io/thanos/actions?query=workflow%3ACI) [![CI](https://circleci.com/gh/thanos-io/thanos.svg?style=svg)](https://circleci.com/gh/thanos-io/thanos) [![go](https://github.com/thanos-io/thanos/workflows/go/badge.svg)](https://github.com/thanos-io/thanos/actions?query=workflow%3Ago) [![react](https://github.com/thanos-io/thanos/workflows/react/badge.svg)](https://github.com/thanos-io/thanos/actions?query=workflow%3Areact) [![docs](https://github.com/thanos-io/thanos/workflows/docs/badge.svg)](https://github.com/thanos-io/thanos/actions?query=workflow%3Adocs) [![Gitpod ready-to-code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/thanos-io/thanos) ## Overview -Thanos is a set of components that can be composed into a highly available metric -system with unlimited storage capacity, which can be added seamlessly on top of existing -Prometheus deployments. +Thanos is a set of components that can be composed into a highly available metric system with unlimited storage capacity, which can be added seamlessly on top of existing Prometheus deployments. Thanos is a [CNCF](https://www.cncf.io/) Incubating project. -Thanos leverages the Prometheus 2.0 storage format to cost-efficiently store historical metric -data in any object storage while retaining fast query latencies. 
Additionally, it provides -a global query view across all Prometheus installations and can merge data from Prometheus -HA pairs on the fly. +Thanos leverages the Prometheus 2.0 storage format to cost-efficiently store historical metric data in any object storage while retaining fast query latencies. Additionally, it provides a global query view across all Prometheus installations and can merge data from Prometheus HA pairs on the fly. Concretely the aims of the project are: 1. Global query view of metrics. -1. Unlimited retention of metrics. -1. High availability of components, including Prometheus. +2. Unlimited retention of metrics. +3. High availability of components, including Prometheus. ## Getting Started @@ -91,8 +76,7 @@ Contributions are very welcome! See our [CONTRIBUTING.md](CONTRIBUTING.md) for m ## Community -Thanos is an open source project and we value and welcome new contributors and members -of the community. Here are ways to get in touch with the community: +Thanos is an open source project and we value and welcome new contributors and members of the community. Here are ways to get in touch with the community: * Slack: [#thanos](https://slack.cncf.io/) * Issue Tracker: [GitHub Issues](https://github.com/thanos-io/thanos/issues) diff --git a/SECURITY.md b/SECURITY.md index 6c8962ef33..50b0651e63 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,38 +1,29 @@ # Security Policy -At the Thanos team we are not security experts. -However we try our best to avoid security concerns and to avoid writing features that handle sensitive information at all. +At the Thanos team we are not security experts. However we try our best to avoid security concerns and to avoid writing features that handle sensitive information at all. -It's worth noting that we assume metric data to be sensitive and important. -External labels and query API parameters are considered less sensitive, as they are logged and put into metrics/traces. 
+It's worth noting that we assume metric data to be sensitive and important. External labels and query API parameters are considered less sensitive, as they are logged and put into metrics/traces. ## What You CAN Expect: -* We follow best programming practices. -We test heavily, including e2e tests against major object storages. -We use vetting and static analysis tools on every pull request. -We use secure protocols for building processes, e.g. when producing Docker images. +* We follow best programming practices. We test heavily, including e2e tests against major object storages. We use vetting and static analysis tools on every pull request. We use secure protocols for building processes, e.g. when producing Docker images. * We don't put any data that is stored in the TSDB into logs or instrumentation. -* If we use crypto tools, we always rely on FLOSS and standard libraries, like the official [Go crypt](https://golang.org/pkg/crypto/) - library. +* If we use crypto tools, we always rely on FLOSS and standard libraries, like the official [Go crypt](https://golang.org/pkg/crypto/) library. * We always use TLS by default for communication with all object storages. -* We use stable Go versions to build our images and binaries. -We update Go as soon as a new version is released. +* We use stable Go versions to build our images and binaries. We update Go as soon as a new version is released. * We use only FLOSS tools. ## What We DON'T Do (yet): -* We don't encrypt metrics in local storage, i.e. on disk. -We don't do client-side encryption for object storage. -We recommend setting server-side encryption for object storage. +* We don't encrypt metrics in local storage, i.e. on disk. We don't do client-side encryption for object storage. We recommend setting server-side encryption for object storage. * We don't allow specifying authorization or TLS for Thanos server HTTP APIs. 
## Supported Versions -| Version | Supported | -| ------- | ------------------ | -| >= 0.10.1 | :white_check_mark: | -| < 0.10.1 | :x: | +| Version | Supported | +|-----------|--------------------| +| >= 0.10.1 | :white_check_mark: | +| < 0.10.1 | :x: | ## Reporting a Vulnerability diff --git a/docs/components/_index.md b/docs/components/_index.md index 869b07d88a..bc9d4437b5 100644 --- a/docs/components/_index.md +++ b/docs/components/_index.md @@ -1,3 +1,5 @@ --- -title: "Components:" +title: 'Components:' --- + + diff --git a/docs/components/compact.md b/docs/components/compact.md index 0afc283318..2a723793fa 100644 --- a/docs/components/compact.md +++ b/docs/components/compact.md @@ -1,13 +1,12 @@ --- -title: Compactor type: docs +title: Compactor menu: components --- # Compactor -The `thanos compact` command applies the compaction procedure of the Prometheus 2.0 storage engine to block data stored in object storage. -It is generally not semantically concurrency safe and must be deployed as a singleton against a bucket. +The `thanos compact` command applies the compaction procedure of the Prometheus 2.0 storage engine to block data stored in object storage. It is generally not semantically concurrency safe and must be deployed as a singleton against a bucket. Compactor is also responsible for downsampling of data: @@ -30,8 +29,7 @@ config: By default `thanos compact` will run to completion which makes it possible to execute in a cronjob. Using the arguments `--wait` and `--wait-interval=5m` it's possible to keep it running. -**Compactor, Sidecar, Receive and Ruler are the only Thanos component which should have a write access to object storage, -with only Compactor being able to delete data.** +**Compactor, Sidecar, Receive and Ruler are the only Thanos component which should have a write access to object storage, with only Compactor being able to delete data.** > **NOTE:** High availability for Compactor is generally not required. 
See [Availability](#availability) section. @@ -39,41 +37,30 @@ with only Compactor being able to delete data.** The Compactor, among other things, is responsible for compacting multiple blocks into one. -Why even compacting? This is a process, also done in Prometheus, to reduce number of blocks and compact index indices. We can compact -index quite well in most cases, because series usually live longer than the duration of the smallest block, so 2 hours. +Why even compacting? This is a process, also done in Prometheus, to reduce number of blocks and compact index indices. We can compact index quite well in most cases, because series usually live longer than the duration of the smallest block, so 2 hours. ### Compaction Groups / Block Streams -Usually those blocks come through the same source. We call blocks from a single source, a "stream" of blocks or compaction group. We distinguish streams by -`external labels`. Blocks with the same labels are considered as produced by a single source. +Usually those blocks come through the same source. We call blocks from a single source, a "stream" of blocks or compaction group. We distinguish streams by `external labels`. Blocks with the same labels are considered as produced by a single source. This is because `external_labels` are added by the Prometheus which produced the block. -⚠ This is why those labels on block must be both _unique_ and _persistent_ across different Prometheus instances. ⚠ +⚠ This is why those labels on block must be both *unique* and *persistent* across different Prometheus instances. ⚠ -* By _unique_, we mean that the set of labels in a Prometheus instance must be different from all other sets of labels of -your Prometheus instances, so that the compactor will be able to group blocks by Prometheus instance. 
-* By _persistent_, we mean that one Prometheus instance must keep the same labels if it restarts, so that the compactor will keep -compacting blocks from an instance even when a Prometheus instance goes down for some time. +* By *unique*, we mean that the set of labels in a Prometheus instance must be different from all other sets of labels of your Prometheus instances, so that the compactor will be able to group blocks by Prometheus instance. +* By *persistent*, we mean that one Prometheus instance must keep the same labels if it restarts, so that the compactor will keep compacting blocks from an instance even when a Prometheus instance goes down for some time. -Natively Prometheus does not store external labels anywhere. This is why external labels are added only on upload time to the `ThanosMeta` section -of `meta.json` in each block. +Natively Prometheus does not store external labels anywhere. This is why external labels are added only on upload time to the `ThanosMeta` section of `meta.json` in each block. -> **NOTE:** In default mode the state of two or more blocks having same external labels and overlapping in time is assumed as an unhealthy situation. -Refer to [Overlap Issue Troubleshooting](../operating/troubleshooting.md#overlaps) for more info. This results in compactor [halting](#halting). +> **NOTE:** In default mode the state of two or more blocks having same external labels and overlapping in time is assumed as an unhealthy situation. Refer to [Overlap Issue Troubleshooting](../operating/troubleshooting.md#overlaps) for more info. This results in compactor [halting](#halting). #### Warning: Only one Instance has to run against single stream of blocks in single Object Storage. :warning: :warning: :warning: -Because there is no safe locking mechanism for all object storage provides, currently, you need to ensure on your own that only -single Compactor is running against single stream of blocks on single bucket. 
Running more can result with [Overlap Issues](../operating/troubleshooting.md#overlaps) -that has to be resolved manually. +Because there is no safe locking mechanism for all object storage provides, currently, you need to ensure on your own that only single Compactor is running against single stream of blocks on single bucket. Running more can result with [Overlap Issues](../operating/troubleshooting.md#overlaps) that has to be resolved manually. -This rule, means also that there could be a problem when both compacted and non compacted blocks are being uploaded by sidecar. -This is why "upload compacted" flag is still under a separate `--shipper.upload-compacted` flag that helps to ensure that compacted blocks -are uploaded before anything else. The singleton rule is also why local Prometheus compaction has to be disabled in order to use sidecar with upload option. -Use hidden `--shipper.ignore-unequal-block-size` to override this check (on your own risk). +This rule, means also that there could be a problem when both compacted and non compacted blocks are being uploaded by sidecar. This is why "upload compacted" flag is still under a separate `--shipper.upload-compacted` flag that helps to ensure that compacted blocks are uploaded before anything else. The singleton rule is also why local Prometheus compaction has to be disabled in order to use sidecar with upload option. Use hidden `--shipper.ignore-unequal-block-size` to override this check (on your own risk). > **NOTE:** In further Thanos version it's possible that both restrictions will be removed with production status of [vertical compaction](#vertical-compactions) which is worked on. @@ -83,14 +70,11 @@ You can though run multiple Compactors against single Bucket as long as for sepa Thanos and Prometheus supports vertical compaction, so process of compacting multiple streams of blocks into one. 
-In Prometheus, this can be triggered by setting hidden flag in Prometheus and putting additional TSDB blocks within Prometheus -local directory. Extra blocks can overlap with existing ones. When Prometheus detects that situation it performs `vertical compaction` -which compacts overlapping blocks into single one. This is mainly used for **backfilling** purposes. +In Prometheus, this can be triggered by setting hidden flag in Prometheus and putting additional TSDB blocks within Prometheus local directory. Extra blocks can overlap with existing ones. When Prometheus detects that situation it performs `vertical compaction` which compacts overlapping blocks into single one. This is mainly used for **backfilling** purposes. In Thanos, it works similarly, but on bigger scale and using external labels for grouping as explained in [Compaction section](#compaction). -In both systems, series with the same labels are merged together. In prometheus, merging samples is **naive**. It works by deduplicating samples within -exactly the same timestamps. Otherwise samples are added in sorted by time order. Thanos also support a new penalty based samples merger and it is explained in [Deduplication](#Vertical Compaction Use Cases). +In both systems, series with the same labels are merged together. In Prometheus, merging samples is **naive**. It works by deduplicating samples within exactly the same timestamps. Otherwise samples are added in sorted by time order. Thanos also supports a new penalty based samples merger and it is explained in [Deduplication](#vertical-compaction-use-cases). > **NOTE:** Both Prometheus and Thanos default behaviour is to fail compaction if any overlapping blocks are spotted. (For Thanos, within the same external labels). @@ -98,25 +82,19 @@ exactly the same timestamps.
Otherwise samples are added in sorted by time order There can be few valid use cases for vertical compaction: -* Races between multiple compactions, for example multiple compactors or between compactor and Prometheus compactions. While this will have extra -computation overhead for Compactor it's safe to enable vertical compaction for this case. +* Races between multiple compactions, for example multiple compactors or between compactor and Prometheus compactions. While this will have extra computation overhead for Compactor it's safe to enable vertical compaction for this case. * Backfilling. If you want to add blocks of data to any stream where there is existing data already there for the time range, you will need enabled vertical compaction. * Offline deduplication of series. It's very common to have the same data replicated into multiple streams. We can distinguish two common series deduplications, `one-to-one` and `penalty`: - * `one-to-one` deduplication is when same series (series with the same labels from different blocks) for the same range have **exactly** the same samples: Same values and timestamps. -This is very common while using [Receivers](../components/receive.md) with replication greater than 1 as receiver replication copies exactly the same timestamps and values to different receive instances. - * `penalty` deduplication is when same series data is **logically duplicated**. For example, it comes from the same application, but scraped by two different Prometheus-es. Ideally -this requires more complex deduplication algorithms. For example one that is used to [deduplicate on the fly on the Querier](query.md#run-time-deduplication-of-ha-groups). This is common -case when Prometheus HA replicas are used. You can enable this deduplication via `--deduplication.func=penalty` flag. 
+ * `one-to-one` deduplication is when same series (series with the same labels from different blocks) for the same range have **exactly** the same samples: Same values and timestamps. This is very common while using [Receivers](receive.md) with replication greater than 1 as receiver replication copies exactly the same timestamps and values to different receive instances. + * `penalty` deduplication is when same series data is **logically duplicated**. For example, it comes from the same application, but scraped by two different Prometheus-es. Ideally this requires more complex deduplication algorithms. For example one that is used to [deduplicate on the fly on the Querier](query.md#run-time-deduplication-of-ha-groups). This is common case when Prometheus HA replicas are used. You can enable this deduplication via `--deduplication.func=penalty` flag. #### Vertical Compaction Risks The main risk is the **irreversible** implications of potential configuration errors: -* If you accidentally upload block with the same external labels but produced by totally different Prometheus for totally different applications, some metrics can overlap -and potentially can merge together making such series useless. +* If you accidentally upload block with the same external labels but produced by totally different Prometheus for totally different applications, some metrics can overlap and potentially can merge together making such series useless. * If you merge disjoint series in multiple of blocks together, there is currently no easy way to split them back. -* The `penalty` offline deduplication algorithm has its own limitation. Even though it has been battle-tested for quite a long time but still very few issues come up from time to time - such as https://github.com/thanos-io/thanos/issues/2890. If you'd like to enable this deduplication algorithm, please take the risk and make sure you back up your data. +* The `penalty` offline deduplication algorithm has its own limitation. 
Even though it has been battle-tested for quite a long time but still very few issues come up from time to time such as https://github.com/thanos-io/thanos/issues/2890. If you'd like to enable this deduplication algorithm, please take the risk and make sure you back up your data. #### Enabling Vertical Compaction @@ -151,8 +129,7 @@ If you need a different deduplication algorithm, use `deduplication.func` flag. By default, there is NO retention set for object storage data. This means that you store data for unlimited time, which is a valid and recommended way of running Thanos. -You can set retention by different resolutions using `--retention.resolution-raw` `--retention.resolution-5m` and `--retention.resolution-1h` flag. Not setting -them or setting to `0s` means no retention. +You can set retention by different resolutions using `--retention.resolution-raw` `--retention.resolution-5m` and `--retention.resolution-1h` flag. Not setting them or setting to `0s` means no retention. **NOTE:** ⚠ ️Retention is applied right after Compaction and Downsampling loops. If those are failing, data will be never deleted. @@ -164,8 +141,7 @@ To learn more see [video from KubeCon 2019](https://youtu.be/qQN0N14HXPM?t=714) ### TL;DR on how thanos downsampling works -Thanos Compactor takes "raw" resolution block and creates a new one with "downsampled" chunks. Downsampled chunk takes -on storage level form of "AggrChunk": +Thanos Compactor takes "raw" resolution block and creates a new one with "downsampled" chunks. Downsampled chunk takes on storage level form of "AggrChunk": ```proto message AggrChunk { @@ -181,8 +157,7 @@ message AggrChunk { } ``` -This means that for each series we collect various aggregations with given interval: 5m or 1h (depending on resolution) -This allows us to keep precision on large duration queries, without fetching too many samples. 
+This means that for each series we collect various aggregations with given interval: 5m or 1h (depending on resolution). This allows us to keep precision on large duration queries, without fetching too many samples. ### ⚠ ️Downsampling: Note About Resolution and Retention ⚠️ @@ -192,9 +167,7 @@ Resolution is a distance between data points on your graphs. E.g. * `5 minutes` - data point is every 5 minutes * `1 hour` - data point is every 1h -Keep in mind, that the initial goal of downsampling is not saving disk or object storage space. In fact, downsampling doesn't save you __any__ space but instead, -it adds 2 more blocks for each raw block which are only slightly smaller or relatively similar size to raw block. This is done by internal downsampling implementation -which to be mathematically correct holds various aggregations. This means that downsampling can increase the size of your storage a bit (~3x), if you choose to store all resolutions (recommended and by default). +Keep in mind, that the initial goal of downsampling is not saving disk or object storage space. In fact, downsampling doesn't save you **any** space but instead, it adds 2 more blocks for each raw block which are only slightly smaller or relatively similar size to raw block. This is done by internal downsampling implementation which to be mathematically correct holds various aggregations. This means that downsampling can increase the size of your storage a bit (~3x), if you choose to store all resolutions (recommended and by default). The goal of downsampling is to provide an opportunity to get fast results for range queries of big time intervals like months or years. In other words, if you set `--retention.resolution-raw` less than `--retention.resolution-5m` and `--retention.resolution-1h` - you might run into a problem of not being able to "zoom in" to your historical data.
@@ -210,9 +183,7 @@ Please note that blocks are only deleted after they completely "fall off" of the ## Deleting Aborted Partial Uploads -It can happen that any producer started uploading some block, but never finished and never will. Sidecars will retry in case of failures during upload or process (unless there was no persistent storage), -but very common case is with Compactor. If Compactor process crashes during upload of compacted block, whole compaction starts from scratch and new block ID is created. This means that -partial upload will be never retried. +It can happen that any producer started uploading some block, but never finished and never will. Sidecars will retry in case of failures during upload or process (unless there was no persistent storage), but very common case is with Compactor. If Compactor process crashes during upload of compacted block, whole compaction starts from scratch and new block ID is created. This means that partial upload will be never retried. To handle this case there is `--delete-delay=48h` flag that starts deletion of directories inside object storage without `meta.json` only after given time. @@ -220,8 +191,7 @@ This value has to be smaller than upload duration and [consistency delay](#consi ## Halting -Because of the very specific nature of Compactor which is writing to object storage, potentially deleting sensitive data, and downloading GBs of data, by default we halt Compactor on certain data failures. -This means that that Compactor does not crash on halt errors, but instead is kept running and does nothing with metric `thanos_compactor_halted` set to 1. +Because of the very specific nature of Compactor which is writing to object storage, potentially deleting sensitive data, and downloading GBs of data, by default we halt Compactor on certain data failures. This means that Compactor does not crash on halt errors, but instead is kept running and does nothing with metric `thanos_compactor_halted` set to 1.
Reason is that we don't want to retry compaction and all the computations if we know that, for example, there is already overlapped state in the object storage for some reason. @@ -246,8 +216,7 @@ Generally, the maximum memory utilization is exactly the same as for Prometheus You need to multiply this with X where X is `--compact.concurrency` (by default 1). -**NOTE:** Don't check heap memory only. Prometheus and Thanos compaction leverages `mmap` heavily which is outside of `Go` `runtime` stats. -Refer to process / OS memory used rather. On Linux/MacOS Go will also use as much as available, so utilization will be always near limit. +**NOTE:** Don't check heap memory only. Prometheus and Thanos compaction leverages `mmap` heavily which is outside of `Go` `runtime` stats. Refer to process / OS memory used rather. On Linux/MacOS Go will also use as much as available, so utilization will be always near limit. Generally, for medium-sized bucket limit of 10GB of memory should be enough to keep it working. @@ -255,91 +224,76 @@ Generally, for medium-sized bucket limit of 10GB of memory should be enough to k Overall Compactor is the component that might have the heaviest use of network against object storage, so place it near the bucket's zone/location. -It has to download each block needed for compaction / downsampling and it does that on every compaction / downsampling. It then uploads -computed blocks. It also refreshes the state of bucket often. +It has to download each block needed for compaction / downsampling and it does that on every compaction / downsampling. It then uploads computed blocks. It also refreshes the state of bucket often. ### Disk -The compactor needs local disk space to store intermediate data for its processing as well as bucket state cache. Generally, -for medium sized bucket about 100GB should be enough to keep working as the compacted time ranges grow over time. However, this highly depends on -size of the blocks. 
In worst case scenario compactor has to have space adequate to 2 times 2 weeks (if your maximum compaction level is 2 weeks) worth of smaller blocks to -perform compaction. First, to download all of those source blocks, second to build on disk output of 2 week block composed of those smaller ones. +The compactor needs local disk space to store intermediate data for its processing as well as bucket state cache. Generally, for medium sized bucket about 100GB should be enough to keep working as the compacted time ranges grow over time. However, this highly depends on size of the blocks. In worst case scenario compactor has to have space adequate to 2 times 2 weeks (if your maximum compaction level is 2 weeks) worth of smaller blocks to perform compaction. First, to download all of those source blocks, second to build on disk output of 2 week block composed of those smaller ones. You need to multiply this with X where X is `--compact.concurrency` (by default 1). -On-disk data is safe to delete between restarts and should be the first attempt to get crash-looping compactors unstuck. -However, it's recommended to give the Compactor persistent disk in order to effectively use bucket state cache between restarts. +On-disk data is safe to delete between restarts and should be the first attempt to get crash-looping compactors unstuck. However, it's recommended to give the Compactor persistent disk in order to effectively use bucket state cache between restarts. ## Availability Compactor, generally, does not need to be highly available. Compactions are needed from time to time, only when new blocks appear. 
-The only risk is that without compactor running for longer time (weeks) you might see reduced performance of your read path due to amount of small blocks, -lack of downsampled data and retention not enforced +The only risk is that without compactor running for longer time (weeks) you might see reduced performance of your read path due to the number of small blocks, lack of downsampled data and retention not enforced. ## Scalability The main and only `Service Level Indicator` for Compactor is how fast it can cope with uploaded TSDB blocks to the bucket. -To understand that you can use mix `thanos_objstore_bucket_last_successful_upload_time` being quite fresh, `thanos_compactor_halted` being non 1 -and `thanos_blocks_meta_synced{state="loaded"}` constantly increasing over days. +To understand that you can use mix `thanos_objstore_bucket_last_successful_upload_time` being quite fresh, `thanos_compactor_halted` being non 1 and `thanos_blocks_meta_synced{state="loaded"}` constantly increasing over days. ![Example view of compactor not coping with amount and size of incoming blocks](compactor_no_coping_with_load.png) Generally there two scalability directions: -1. Too many producers/sources (e.g Prometheus-es) are uploading to same object storage. Too many "streams" of work for Compactor. -Compactor has to scale with the number of producers in the bucket. +1. Too many producers/sources (e.g Prometheus-es) are uploading to same object storage. Too many "streams" of work for Compactor. Compactor has to scale with the number of producers in the bucket. -You should horizontally scale Compactor to cope with this using [label sharding](../sharding.md#compactor). This allows to assign -multiple streams to each instance of compactor. +You should horizontally scale Compactor to cope with this using [label sharding](../sharding.md#compactor). This allows to assign multiple streams to each instance of compactor. -2.
TSDB blocks from single stream is too big, it takes too much time or resources. +1. TSDB blocks from single stream is too big, it takes too much time or resources. -This is rare as first you would need to ingest that amount of data into Prometheus and it's usually not recommended to have bigger than 10 millions series -in the 2 hours blocks. However, with 2 weeks blocks, potential [Vertical Compaction](#vertical-compactions) enabled and other producers than Prometheus (e.g backfilling) -this scalability concern can appear as well. See [Limit size of blocks](https://github.com/thanos-io/thanos/issues/3068) ticket to track progress of solution if you are hitting this. +This is rare as first you would need to ingest that amount of data into Prometheus and it's usually not recommended to have bigger than 10 millions series in the 2 hours blocks. However, with 2 weeks blocks, potential [Vertical Compaction](#vertical-compactions) enabled and other producers than Prometheus (e.g backfilling) this scalability concern can appear as well. See [Limit size of blocks](https://github.com/thanos-io/thanos/issues/3068) ticket to track progress of solution if you are hitting this. ## Eventual Consistency -Depending on the Object Storage provider like S3, GCS, Ceph etc; we can divide the storages into strongly consistent or eventually consistent. -Since there are no consistency guarantees provided by some Object Storage providers, we have to make sure that we have a consistent lock-free way of dealing with Object Storage irrespective of the choice of object storage. +Depending on the Object Storage provider like S3, GCS, Ceph etc; we can divide the storages into strongly consistent or eventually consistent. Since there are no consistency guarantees provided by some Object Storage providers, we have to make sure that we have a consistent lock-free way of dealing with Object Storage irrespective of the choice of object storage. 
### Consistency Delay -In order to make sure we don't read partially uploaded block (or eventually visible fully in system) we established `--consistency-delay=30m` delay for all components -reading blocks. +In order to make sure we don't read partially uploaded block (or eventually visible fully in system) we established `--consistency-delay=30m` delay for all components reading blocks. This means that blocks are visible / loadable for compactor (and used for retention, compaction planning, etc), only after 30m from block upload start in object storage. ### Block Deletions -In order to achieve co-ordination between compactor and all object storage readers without any race, blocks are not deleted directly. Instead, blocks are marked for deletion by uploading -`deletion-mark.json` file for the block that was chosen to be deleted. This file contains unix time of when the block was marked for deletion. +In order to achieve co-ordination between compactor and all object storage readers without any race, blocks are not deleted directly. Instead, blocks are marked for deletion by uploading `deletion-mark.json` file for the block that was chosen to be deleted. This file contains unix time of when the block was marked for deletion. ## Flags -[embedmd]:# (flags/compact.txt $) -```$ +```$ mdox-exec="thanos compact --help" usage: thanos compact [] Continuously compacts blocks in an object store bucket. Flags: - --block-meta-fetch-concurrency=32 + --block-meta-fetch-concurrency=32 Number of goroutines to use when fetching block metadata from object storage. - --block-sync-concurrency=20 + --block-sync-concurrency=20 Number of goroutines to use when syncing block metadata from object storage. - --block-viewer.global.sync-block-interval=1m + --block-viewer.global.sync-block-interval=1m Repeat interval for syncing the blocks between local and remote view for /global Block Viewer UI. 
- --bucket-web-label=BUCKET-WEB-LABEL + --bucket-web-label=BUCKET-WEB-LABEL Prometheus label to use as timeline title in the bucket web UI - --compact.cleanup-interval=5m + --compact.cleanup-interval=5m How often we should clean up partially uploaded blocks and blocks with deletion mark in the background when --wait has been enabled. Setting @@ -362,7 +316,7 @@ Flags: algorithm will be used. At least one replica label has to be set via --deduplication.replica-label flag. - --deduplication.replica-label=DEDUPLICATION.REPLICA-LABEL ... + --deduplication.replica-label=DEDUPLICATION.REPLICA-LABEL ... Label to treat as a replica indicator of blocks that can be deduplicated (repeated flag). This will merge multiple replica blocks into one. @@ -401,7 +355,7 @@ Flags: values are: "", "SHA256". -h, --help Show context-sensitive help (also try --help-long and --help-man). - --http-address="0.0.0.0:10902" + --http-address="0.0.0.0:10902" Listen host:port for HTTP endpoints. --http-grace-period=2m Time to wait after an interrupt received for HTTP Server. @@ -411,29 +365,29 @@ Flags: --log.format=logfmt Log format to use. Possible options: logfmt or json. --log.level=info Log filtering level. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag (mutually exclusive). Content of YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --retention.resolution-1h=0d + --retention.resolution-1h=0d How long to retain samples of resolution 2 (1 hour) in bucket. Setting this to 0d will retain samples of this resolution forever - --retention.resolution-5m=0d + --retention.resolution-5m=0d How long to retain samples of resolution 1 (5 minutes) in bucket. 
Setting this to 0d will retain samples of this resolution forever - --retention.resolution-raw=0d + --retention.resolution-raw=0d How long to retain raw samples in bucket. Setting this to 0d will retain samples of this resolution forever - --selector.relabel-config= + --selector.relabel-config= Alternative to 'selector.relabel-config-file' flag (mutually exclusive). Content of YAML file that contains relabeling configuration that @@ -441,18 +395,18 @@ Flags: Prometheus relabel-config syntax. See format details: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config - --selector.relabel-config-file= + --selector.relabel-config-file= Path to YAML file that contains relabeling configuration that allows selecting blocks. It follows native Prometheus relabel-config syntax. See format details: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config - --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration diff --git a/docs/components/query-frontend.md b/docs/components/query-frontend.md index 93fbcc0048..e16228b670 100644 --- a/docs/components/query-frontend.md +++ b/docs/components/query-frontend.md @@ -1,13 +1,12 @@ --- -title: Query Frontend type: docs +title: Query Frontend menu: components --- # Query Frontend -The `thanos query-frontend` command implements a service that can be put in front of Thanos Queriers to improve the read path. It is -based on the [Cortex Query Frontend](https://cortexmetrics.io/docs/architecture/#query-frontend) component so you can find some common features like `Splitting` and `Results Caching`. 
+The `thanos query-frontend` command implements a service that can be put in front of Thanos Queriers to improve the read path. It is based on the [Cortex Query Frontend](https://cortexmetrics.io/docs/architecture/#query-frontend) component so you can find some common features like `Splitting` and `Results Caching`. Query Frontend is fully stateless and horizontally scalable. @@ -19,8 +18,7 @@ thanos query-frontend \ --query-frontend.downstream-url=":" ``` -_**NOTE:** Currently only range queries (`/api/v1/query_range` API call) are actually processed through Query Frontend. All other -API calls just directly go to the downstream Querier, which means only range queries are split and cached. But we are planning to support instant queries as well. +_**NOTE:** Currently only range queries (`/api/v1/query_range` API call) are actually processed through Query Frontend. All other API calls just directly go to the downstream Querier, which means only range queries are split and cached. But we are planning to support instant queries as well. For more information please check out [initial design proposal](https://thanos.io/tip/proposals/202004_embedd_cortex_frontend.md/). @@ -28,8 +26,7 @@ For more information please check out [initial design proposal](https://thanos.i ### Splitting -Query Frontend splits a long query into multiple short queries based on the configured `--query-range.split-interval` flag. The default value of `--query-range.split-interval` -is `24h`. When caching is enabled it should be greater than `0`. +Query Frontend splits a long query into multiple short queries based on the configured `--query-range.split-interval` flag. The default value of `--query-range.split-interval` is `24h`. When caching is enabled it should be greater than `0`. 
There are some benefits from query splitting: @@ -43,34 +40,29 @@ Query Frontend supports a retry mechanism to retry query when HTTP requests are ### Caching -Query Frontend supports caching query results and reuses them on subsequent queries. If the cached results are incomplete, -Query Frontend calculates the required subqueries and executes them in parallel on downstream queriers. -Query Frontend can optionally align queries with their step parameter to improve the cacheability of the query results. -Currently, in-memory cache (fifo cache) and memcached are supported. +Query Frontend supports caching query results and reuses them on subsequent queries. If the cached results are incomplete, Query Frontend calculates the required subqueries and executes them in parallel on downstream queriers. Query Frontend can optionally align queries with their step parameter to improve the cacheability of the query results. Currently, in-memory cache (fifo cache) and memcached are supported. #### In-memory -[embedmd]:# (../flags/config_response_cache_in_memory.txt yaml) -```yaml +```yaml mdox-exec="go run scripts/cfggen/main.go --name=queryfrontend.InMemoryResponseCacheConfig" type: IN-MEMORY config: max_size: "" max_size_items: 0 validity: 0s ``` + `max_size: ` Maximum memory size of the cache in bytes. A unit suffix (KB, MB, GB) may be applied. **_NOTE:** If both `max_size` and `max_size_items` are not set, then the *cache* would not be created. -If either of `max_size` or `max_size_items` is set, then there is no limit on other field. -For example - only set `max_size_item` to 1000, then `max_size` is unlimited. Similarly, if only `max_size` is set, then `max_size_items` is unlimited. +If either of `max_size` or `max_size_items` is set, then there is no limit on other field. For example - only set `max_size_item` to 1000, then `max_size` is unlimited. Similarly, if only `max_size` is set, then `max_size_items` is unlimited. 
Example configuration: [kube-thanos](https://github.com/thanos-io/kube-thanos/blob/master/examples/all/manifests/thanos-query-frontend-deployment.yaml#L50-L54) #### Memcached -[embedmd]:# (../flags/config_response_cache_memcached.txt yaml) -```yaml +```yaml mdox-exec="go run scripts/cfggen/main.go --name=queryfrontend.MemcachedResponseCacheConfig" type: MEMCACHED config: addresses: [] @@ -85,11 +77,11 @@ config: expiration: 0s ``` -`expiration` specifies memcached cache valid time. If set to 0s, so using a default of 24 hours expiration time. +`expiration` specifies memcached cache valid time. If set to 0s, so using a default of 24 hours expiration time. If a `set` operation is skipped because of the item size is larger than `max_item_size`, this event is tracked by a counter metric `cortex_memcache_client_set_skip_total`. -Other cache configuration parameters, you can refer to [memcached-index-cache]( https://thanos.io/tip/components/store.md/#memcached-index-cache). +Other cache configuration parameters, you can refer to [memcached-index-cache](https://thanos.io/tip/components/store.md/#memcached-index-cache). The default memcached config is: @@ -118,54 +110,53 @@ Naming is hard :) Please check [here](https://github.com/thanos-io/thanos/pull/2 ## Flags -[embedmd]:# (flags/query-frontend.txt $) -```$ +```$ mdox-exec="thanos query-frontend --help" usage: thanos query-frontend [] Query frontend command implements a service deployed in front of queriers to improve query parallelization and caching. Flags: - --cache-compression-type="" + --cache-compression-type="" Use compression in results cache. Supported values are: 'snappy' and ” (disable compression). -h, --help Show context-sensitive help (also try --help-long and --help-man). - --http-address="0.0.0.0:10902" + --http-address="0.0.0.0:10902" Listen host:port for HTTP endpoints. --http-grace-period=2m Time to wait after an interrupt received for HTTP Server. 
--http.config="" [EXPERIMENTAL] Path to the configuration file that can enable TLS or authentication for all HTTP endpoints. - --labels.default-time-range=24h + --labels.default-time-range=24h The default metadata time range duration for retrieving labels through Labels and Series API when the range parameters are not specified. - --labels.max-query-parallelism=14 + --labels.max-query-parallelism=14 Maximum number of labels requests will be scheduled in parallel by the Frontend. - --labels.max-retries-per-request=5 + --labels.max-retries-per-request=5 Maximum number of retries for a single label/series API request; beyond this, the downstream error is returned. --labels.partial-response Enable partial response for labels requests if no partial_response param is specified. --no-labels.partial-response for disabling. - --labels.response-cache-config= + --labels.response-cache-config= Alternative to 'labels.response-cache-config-file' flag (mutually exclusive). Content of YAML file that contains response cache configuration. - --labels.response-cache-config-file= + --labels.response-cache-config-file= Path to YAML file that contains response cache configuration. - --labels.response-cache-max-freshness=1m + --labels.response-cache-max-freshness=1m Most recent allowed cacheable result for labels requests, to prevent caching very recent results that might still be in flux. - --labels.split-interval=24h + --labels.split-interval=24h Split labels requests by an interval and execute in parallel, it should be greater than 0 when labels.response-cache-config is @@ -182,16 +173,16 @@ Flags: LogStartAndFinishCall : Logs the start and finish call of the requests. NoLogCall : Disable request logging. - --query-frontend.compress-responses + --query-frontend.compress-responses Compress HTTP responses. - --query-frontend.downstream-url="http://localhost:9090" + --query-frontend.downstream-url="http://localhost:9090" URL of downstream Prometheus Query compatible API. 
- --query-frontend.log-queries-longer-than=0 + --query-frontend.log-queries-longer-than=0 Log queries that are slower than the specified duration. Set to 0 to disable. Set to < 0 to enable on all queries. - --query-frontend.org-id-header= ... + --query-frontend.org-id-header= ... Request header names used to identify the source of slow queries (repeated flag). The values of the header will be added to the org @@ -199,63 +190,63 @@ Flags: headers match the request, the first matching arg specified will take precedence. If no headers match 'anonymous' will be used. - --query-range.align-range-with-step + --query-range.align-range-with-step Mutate incoming queries to align their start and end with their step for better cache-ability. Note: Grafana dashboards do that by default. - --query-range.max-query-length=0 + --query-range.max-query-length=0 Limit the query time range (end - start time) in the query-frontend, 0 disables it. - --query-range.max-query-parallelism=14 + --query-range.max-query-parallelism=14 Maximum number of query range requests will be scheduled in parallel by the Frontend. - --query-range.max-retries-per-request=5 + --query-range.max-retries-per-request=5 Maximum number of retries for a single query range request; beyond this, the downstream error is returned. - --query-range.partial-response + --query-range.partial-response Enable partial response for query range requests if no partial_response param is specified. --no-query-range.partial-response for disabling. - --query-range.request-downsampled + --query-range.request-downsampled Make additional query for downsampled data in case of empty or incomplete response to range request. - --query-range.response-cache-config= + --query-range.response-cache-config= Alternative to 'query-range.response-cache-config-file' flag (mutually exclusive). Content of YAML file that contains response cache configuration. 
- --query-range.response-cache-config-file= + --query-range.response-cache-config-file= Path to YAML file that contains response cache configuration. - --query-range.response-cache-max-freshness=1m + --query-range.response-cache-max-freshness=1m Most recent allowed cacheable result for query range requests, to prevent caching very recent results that might still be in flux. - --query-range.split-interval=24h + --query-range.split-interval=24h Split query range requests by an interval and execute in parallel, it should be greater than 0 when query-range.response-cache-config is configured. - --request.logging-config= + --request.logging-config= Alternative to 'request.logging-config-file' flag (mutually exclusive). Content of YAML file with request logging configuration. See format details: https://gist.github.com/yashrsharma44/02f5765c5710dd09ce5d14e854f22825 - --request.logging-config-file= + --request.logging-config-file= Path to YAML file with request logging configuration. See format details: https://gist.github.com/yashrsharma44/02f5765c5710dd09ce5d14e854f22825 - --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration diff --git a/docs/components/query.md b/docs/components/query.md index 5144ebaa67..348de99799 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -1,6 +1,6 @@ --- -title: Query type: docs +title: Query menu: components --- @@ -20,6 +20,7 @@ thanos query \ --store ":" \ --store ":" ``` + ## Querier use cases, why do I need this component? Thanos Querier essentially allows to aggregate and optionally deduplicate multiple metrics backends under single Prometheus Query endpoint. 
@@ -28,49 +29,41 @@ Thanos Querier essentially allows to aggregate and optionally deduplicate multip Since for Querier "a backend" is anything that implements gRPC StoreAPI we can aggregate data from any number of the different storages like: -* Prometheus (see [Sidecar](./sidecar.md)) -* Object Storage (see [Store Gateway](./store.md)) -* Global alerting/recording rules evaluations (see [Ruler](./rule.md)) -* Metrics received from Prometheus remote write streams (see [Receiver](./receive.md)) +* Prometheus (see [Sidecar](sidecar.md)) +* Object Storage (see [Store Gateway](store.md)) +* Global alerting/recording rules evaluations (see [Ruler](rule.md)) +* Metrics received from Prometheus remote write streams (see [Receiver](receive.md)) * Another Querier (you can stack Queriers on top of each other) * Non-Prometheus systems! - * e.g [OpenTSDB](../integrations.md#opentsdb) + * e.g [OpenTSDB](../integrations.md#opentsdb) Thanks to that, you can run queries (manually, from Grafana or via Alerting rule) that aggregate metrics from mix of those sources. Some examples: -* `sum(cpu_used{cluster=~"cluster-(eu1|eu2|eu3|us1|us2|us3)", job="service1"})` that will give you sum of CPU used inside all listed clusters for service `service1`. This will work -even if those clusters runs multiple Prometheus servers each. Querier will know which data sources to query. +* `sum(cpu_used{cluster=~"cluster-(eu1|eu2|eu3|us1|us2|us3)", job="service1"})` that will give you sum of CPU used inside all listed clusters for service `service1`. This will work even if those clusters runs multiple Prometheus servers each. Querier will know which data sources to query. * In single cluster you shard Prometheus functionally or have different Prometheus instances for different tenants. You can spin up Querier to have access to both within single Query evaluation. ### Run-time deduplication of HA groups -Prometheus is stateful and does not allow replicating its database. 
This means that increasing high availability by running multiple Prometheus replicas is not very easy to use. -Simple load balancing will not work as for example after some crash, replica might be up but querying such replica will result in small gap during the period it was down. You have a - second replica that maybe was up, but it could be down in other moment (e.g rolling restart), so load balancing on top of those is not working well. +Prometheus is stateful and does not allow replicating its database. This means that increasing high availability by running multiple Prometheus replicas is not easy. Simple load balancing will not work: for example, after a crash a replica might be up again, but querying it will show a small gap for the period it was down. A second replica may have been up during that period, but it could be down at another moment (e.g. during a rolling restart), so load balancing on top of those replicas does not work well. Thanos Querier instead pulls the data from both replicas, and deduplicate those signals, filling the gaps if any, transparently to the Querier consumer. ## Metric Query Flow Overview -querier-steps +querier-steps -Overall QueryAPI exposed by Thanos is guaranteed to be compatible with [Prometheus 2.x. API](https://prometheus.io/docs/prometheus/latest/querying/api/). -The above diagram shows what Querier does for each Prometheus query request. +Overall QueryAPI exposed by Thanos is guaranteed to be compatible with [Prometheus 2.x. API](https://prometheus.io/docs/prometheus/latest/querying/api/). The above diagram shows what Querier does for each Prometheus query request. See [here](https://thanos.io/tip/thanos/service-discovery.md/) on how to connect Querier with desired StoreAPIs. - +### Deduplication -### Deduplication +The query layer can deduplicate series that were collected from high-availability pairs of data sources such as Prometheus.
A fixed single or multiple replica labels must be chosen for the entire cluster and can then be passed to query nodes on startup. -The query layer can deduplicate series that were collected from high-availability pairs of data sources such as Prometheus. -A fixed single or multiple replica labels must be chosen for the entire cluster and can then be passed to query nodes on startup. - -Two or more series that are only distinguished by the given replica label, will be merged into a single time series. -This also hides gaps in collection of a single data source. +Two or more series that are only distinguished by the given replica label, will be merged into a single time series. This also hides gaps in collection of a single data source. ### An example with a single replica labels: @@ -90,14 +83,14 @@ thanos query \ And we query for metric `up{job="prometheus",env="2"}` with this option we will get 2 results: - * `up{job="prometheus",env="2",cluster="1"} 1` - * `up{job="prometheus",env="2",cluster="2"} 1` +* `up{job="prometheus",env="2",cluster="1"} 1` +* `up{job="prometheus",env="2",cluster="2"} 1` WITHOUT this replica flag (deduplication turned off), we will get 3 results: - * `up{job="prometheus",env="2",cluster="1",replica="A"} 1` - * `up{job="prometheus",env="2",cluster="1",replica="B"} 1` - * `up{job="prometheus",env="2",cluster="2",replica="A"} 1` +* `up{job="prometheus",env="2",cluster="1",replica="A"} 1` +* `up{job="prometheus",env="2",cluster="1",replica="B"} 1` +* `up{job="prometheus",env="2",cluster="2",replica="A"} 1` ### The same output will be present for this example with multiple replica labels: @@ -114,13 +107,11 @@ thanos query \ --store ":" \ ``` - This logic can also be controlled via parameter on QueryAPI. More details below. ## Query API Overview -As mentioned, Query API exposed by Thanos is guaranteed to be compatible with [Prometheus 2.x. API](https://prometheus.io/docs/prometheus/latest/querying/api/). 
-However for additional Thanos features on top of Prometheus, Thanos adds: +As mentioned, Query API exposed by Thanos is guaranteed to be compatible with [Prometheus 2.x. API](https://prometheus.io/docs/prometheus/latest/querying/api/). However for additional Thanos features on top of Prometheus, Thanos adds: * partial response behaviour * several additional parameters listed below @@ -130,15 +121,13 @@ Let's walk through all of those extensions: ### Partial Response -QueryAPI and StoreAPI has additional behaviour controlled via query parameter called [PartialResponseStrategy](/pkg/store/storepb/rpc.pb.go). +QueryAPI and StoreAPI have additional behaviour controlled via a query parameter called [PartialResponseStrategy](../../pkg/store/storepb/rpc.pb.go). This parameter controls tradeoff between accuracy and availability. -Partial response is a potentially missed result within query against QueryAPI or StoreAPI. This can happen if one -of StoreAPIs is returning error or timeout whereas couple of others returns success. It does not mean you are missing data, -you might lucky enough that you actually get the correct data as the broken StoreAPI did not have anything for your query. +A partial response is a potentially missed result within a query against QueryAPI or StoreAPI. This can happen if one of the StoreAPIs returns an error or times out whereas a couple of others return success. It does not necessarily mean you are missing data; you might be lucky enough to actually get the correct data because the broken StoreAPI did not have anything for your query. -If partial response happen QueryAPI returns human readable warnings explained [here](query.md#custom-response-fields). +If a partial response happens, QueryAPI returns human-readable warnings explained [here](#custom-response-fields).
Now it supports two strategies: * "warn" @@ -151,33 +140,32 @@ Querier also allows to configure different timeouts: * `--query.timeout` * `--store.response-timeout` -If you prefer availability over accuracy you can set tighter timeout to underlying StoreAPI than overall query timeout. If partial response -strategy is NOT `abort`, this will "ignore" slower StoreAPIs producing just warning with 200 status code response. +If you prefer availability over accuracy, you can set a tighter timeout on the underlying StoreAPI than the overall query timeout. If the partial response strategy is NOT `abort`, this will "ignore" slower StoreAPIs, producing just a warning with a 200 status code response. ### Deduplication replica labels. -| HTTP URL/FORM parameter | Type | Default | Example | -|----|----|----|----| -| `replicaLabels` | `[]string` | `query.replica-label` flag (default: empty). | `replicaLabels=replicaA&replicaLabels=replicaB` | -| | | | | +| HTTP URL/FORM parameter | Type | Default | Example | +|-------------------------|------------|----------------------------------------------|-------------------------------------------------| +| `replicaLabels` | `[]string` | `query.replica-label` flag (default: empty). | `replicaLabels=replicaA&replicaLabels=replicaB` | +| | | | | This overwrites the `query.replica-label` cli flag to allow dynamic replica labels at query time. ### Deduplication Enabled -| HTTP URL/FORM parameter | Type | Default | Example | -|----|----|----|----| -| `dedup` | `Boolean` | True, but effect depends on `query.replica` configuration flag. | `1, t, T, TRUE, true, True` for "True" | -| | | | | +| HTTP URL/FORM parameter | Type | Default | Example | +|-------------------------|-----------|-----------------------------------------------------------------------|----------------------------------------| +| `dedup` | `Boolean` | True, but effect depends on `query.replica-label` configuration flag.
| `1, t, T, TRUE, true, True` for "True" | +| | | | | This controls if query results should be deduplicated using the replica labels. ### Auto downsampling -| HTTP URL/FORM parameter | Type | Default | Example | -|----|----|----|----| -| `max_source_resolution` | `Float64/time.Duration/model.Duration` | `step / 5` or `0` if `query.auto-downsampling` is false (default: False) | `5m` | -| | | | | +| HTTP URL/FORM parameter | Type | Default | Example | +|-------------------------|----------------------------------------|--------------------------------------------------------------------------|---------| +| `max_source_resolution` | `Float64/time.Duration/model.Duration` | `step / 5` or `0` if `query.auto-downsampling` is false (default: False) | `5m` | +| | | | | Max source resolution is max resolution in seconds we want to use for data we query for. This means that for value: @@ -187,15 +175,14 @@ Max source resolution is max resolution in seconds we want to use for data we qu ### Partial Response Strategy -// TODO(bwplotka): Update. This will change to "strategy" soon as [PartialResponseStrategy enum here](/pkg/store/storepb/rpc.proto) +// TODO(bwplotka): Update. This will change to "strategy" soon as [PartialResponseStrategy enum here](../../pkg/store/storepb/rpc.proto) -| HTTP URL/FORM parameter | Type | Default | Example | -|----|----|----|----| -| `partial_response` | `Boolean` | `query.partial-response` flag (default: True) | `1, t, T, TRUE, true, True` for "True" | -| | | | | +| HTTP URL/FORM parameter | Type | Default | Example | +|-------------------------|-----------|-----------------------------------------------|----------------------------------------| +| `partial_response` | `Boolean` | `query.partial-response` flag (default: True) | `1, t, T, TRUE, true, True` for "True" | +| | | | | -If true, then all storeAPIs that will be unavailable (and thus return no data) will not cause query to fail, but instead -return warning. 
+If true, then any StoreAPIs that are unavailable (and thus return no data) will not cause the query to fail; a warning is returned instead. ### Custom Response Fields @@ -209,26 +196,22 @@ type queryData struct { Result promql.Value `json:"result"` // Additional Thanos Response field. - Warnings []error `json:"warnings,omitempty"` + Warnings []error `json:"warnings,omitempty"` } ``` -Additional field is `Warnings` that contains every error that occurred that is assumed non critical. `partial_response` -option controls if storeAPI unavailability is considered critical. +The additional field is `Warnings`, which contains every error that occurred and is considered non-critical. The `partial_response` option controls whether StoreAPI unavailability is considered critical. ### Concurrent Selects -Thanos Querier has the ability to perform concurrent select request per query. It dissects given PromQL statement and executes selectors concurrently against the discovered StoreAPIs. -The maximum number of concurrent requests are being made per query is controlled by `query.max-concurrent-select` flag. -Keep in mind that the maximum number of concurrent queries that are handled by querier is controlled by `query.max-concurrent`. Please consider implications of combined value while tuning the querier. +Thanos Querier has the ability to perform concurrent select requests per query. It dissects the given PromQL statement and executes selectors concurrently against the discovered StoreAPIs. The maximum number of concurrent requests made per query is controlled by the `query.max-concurrent-select` flag. Keep in mind that the maximum number of concurrent queries handled by the querier is controlled by `query.max-concurrent`. Please consider the implications of the combined value while tuning the querier. ### Store filtering -It's possible to provide a set of matchers to the Querier api to select specific stores to be used during the query using the `storeMatch[]` parameter.
It is useful when debugging a slow/broken store. -It uses the same format as the matcher of [Prometheus' federate api](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers). -Note that at the moment the querier only supports the `__address__` which contain the address of the store as it is shown on the `/stores` endpoint of the UI. +It's possible to provide a set of matchers to the Querier API to select the specific stores to be used during a query, using the `storeMatch[]` parameter. This is useful when debugging a slow/broken store. It uses the same format as the matcher of [Prometheus' federate api](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers). Note that at the moment the querier only supports `__address__`, which contains the address of the store as it is shown on the `/stores` endpoint of the UI. Example: + ``` - targets: - prometheus-foo.thanos-sidecar:10901 @@ -241,24 +224,11 @@ http://localhost:10901/api/v1/query?query=up&dedup=true&partial_response=true&st Will only return metrics from `prometheus-foo.thanos-sidecar:10901` - ## Expose UI on a sub-path -It is possible to expose thanos-query UI and optionally API on a sub-path. -The sub-path can be defined either statically or dynamically via an HTTP header. -Static path prefix definition follows the pattern used in Prometheus, -where `web.route-prefix` option defines HTTP request path prefix (endpoints prefix) -and `web.external-prefix` prefixes the URLs in HTML code and the HTTP redirect responses. - -Additionally, Thanos supports dynamic prefix configuration, which -[is not yet implemented by Prometheus](https://github.com/prometheus/prometheus/issues/3156).
-Dynamic prefixing simplifies setup when `thanos query` is exposed on a sub-path behind -a reverse proxy, for example, via a Kubernetes ingress controller -[Traefik](https://docs.traefik.io/routing/routers/) -or [nginx](https://github.com/kubernetes/ingress-nginx/pull/1805). -If `PathPrefixStrip: /some-path` option or `traefik.frontend.rule.type: PathPrefixStrip` -Kubernetes Ingress annotation is set, then `Traefik` writes the stripped prefix into X-Forwarded-Prefix header. -Then, `thanos query --web.prefix-header=X-Forwarded-Prefix` will serve correct HTTP redirects and links prefixed by the stripped path. +It is possible to expose thanos-query UI and optionally API on a sub-path. The sub-path can be defined either statically or dynamically via an HTTP header. Static path prefix definition follows the pattern used in Prometheus, where `web.route-prefix` option defines HTTP request path prefix (endpoints prefix) and `web.external-prefix` prefixes the URLs in HTML code and the HTTP redirect responses. + +Additionally, Thanos supports dynamic prefix configuration, which [is not yet implemented by Prometheus](https://github.com/prometheus/prometheus/issues/3156). Dynamic prefixing simplifies setup when `thanos query` is exposed on a sub-path behind a reverse proxy, for example, via a Kubernetes ingress controller [Traefik](https://docs.traefik.io/routing/routers/) or [nginx](https://github.com/kubernetes/ingress-nginx/pull/1805). If `PathPrefixStrip: /some-path` option or `traefik.frontend.rule.type: PathPrefixStrip` Kubernetes Ingress annotation is set, then `Traefik` writes the stripped prefix into X-Forwarded-Prefix header. Then, `thanos query --web.prefix-header=X-Forwarded-Prefix` will serve correct HTTP redirects and links prefixed by the stripped path. 
## File SD @@ -279,22 +249,20 @@ Example file SD file in YAML: - thanos-store.infra:10901 ``` - ## Flags -[embedmd]:# (flags/query.txt $) -```$ +```$ mdox-exec="thanos query --help" usage: thanos query [] Query node exposing PromQL enabled Query API with data retrieved from multiple store nodes. Flags: - --grpc-address="0.0.0.0:10901" + --grpc-address="0.0.0.0:10901" Listen ip:port address for gRPC endpoints (StoreAPI). Make sure this address is routable from other components. - --grpc-client-server-name="" + --grpc-client-server-name="" Server name to verify the hostname on the returned gRPC certificates. See https://tools.ietf.org/html/rfc4366#section-3.1 @@ -304,14 +272,14 @@ Flags: to the server --grpc-client-tls-key="" TLS Key for the client's certificate --grpc-client-tls-secure Use TLS when talking to the gRPC server - --grpc-client-tls-skip-verify + --grpc-client-tls-skip-verify Disable TLS certificate verification i.e self signed, signed by fake CA --grpc-grace-period=2m Time to wait after an interrupt received for GRPC Server. --grpc-server-tls-cert="" TLS Certificate for gRPC server, leave blank to disable TLS - --grpc-server-tls-client-ca="" + --grpc-server-tls-client-ca="" TLS CA to verify clients against. If no client CA is specified, there is no client verification on server side. (tls.NoClientCert) @@ -319,7 +287,7 @@ Flags: disable TLS -h, --help Show context-sensitive help (also try --help-long and --help-man). - --http-address="0.0.0.0:10902" + --http-address="0.0.0.0:10902" Listen host:port for HTTP endpoints. --http-grace-period=2m Time to wait after an interrupt received for HTTP Server. @@ -341,7 +309,7 @@ Flags: --query.auto-downsampling Enable automatic adjustment (step / 5) to what source of data should be used in store gateways if no max_source_resolution param is specified. - --query.default-evaluation-interval=1m + --query.default-evaluation-interval=1m Set default evaluation interval for sub queries. 
--query.default-step=1s Set default step for range queries. Default @@ -351,7 +319,7 @@ Flags: max(rangeSeconds / 250, defaultStep)). This will not work from Grafana, but Grafana has __step variable which can be used. - --query.lookback-delta=QUERY.LOOKBACK-DELTA + --query.lookback-delta=QUERY.LOOKBACK-DELTA The maximum lookback duration for retrieving metrics during expression evaluations. PromQL always evaluates the query for the certain @@ -366,10 +334,10 @@ Flags: it will use the promql default of 5m. --query.max-concurrent=20 Maximum number of queries processed concurrently by query node. - --query.max-concurrent-select=4 + --query.max-concurrent-select=4 Maximum number of select requests made concurrently per a query. - --query.metadata.default-time-range=0s + --query.metadata.default-time-range=0s The default metadata time range duration for retrieving labels through Labels and Series API when the range parameters are not specified. @@ -378,24 +346,24 @@ Flags: --query.partial-response Enable partial response for queries if no partial_response param is specified. --no-query.partial-response for disabling. - --query.replica-label=QUERY.REPLICA-LABEL ... + --query.replica-label=QUERY.REPLICA-LABEL ... Labels to treat as a replica indicator along which data is deduplicated. Still you will be able to query without deduplication using 'dedup=false' parameter. Data includes time series, recording rules, and alerting rules. --query.timeout=2m Maximum time to process query by query node. - --request.logging-config= + --request.logging-config= Alternative to 'request.logging-config-file' flag (mutually exclusive). Content of YAML file with request logging configuration. See format details: https://gist.github.com/yashrsharma44/02f5765c5710dd09ce5d14e854f22825 - --request.logging-config-file= + --request.logging-config-file= Path to YAML file with request logging configuration. 
See format details: https://gist.github.com/yashrsharma44/02f5765c5710dd09ce5d14e854f22825 - --selector-label=="" ... + --selector-label=="" ... Query selector labels that will be exposed in info endpoint (repeated). --store= ... Addresses of statically configured store API @@ -403,33 +371,33 @@ Flags: prefixed with 'dns+' or 'dnssrv+' to detect store API servers through respective DNS lookups. - --store-strict= ... + --store-strict= ... Addresses of only statically configured store API servers that are always used, even if the health check fails. Useful if you have a caching layer on top. - --store.response-timeout=0ms + --store.response-timeout=0ms If a Store doesn't send any data in this specified duration then a Store will be ignored and partial data will be returned if it's enabled. 0 disables timeout. - --store.sd-dns-interval=30s + --store.sd-dns-interval=30s Interval between DNS resolutions. - --store.sd-files= ... + --store.sd-files= ... Path to files that contain addresses of store API servers. The path can be a glob pattern (repeatable). --store.sd-interval=5m Refresh interval to re-read file SD files. It is used as a resync fallback. - --store.unhealthy-timeout=5m + --store.unhealthy-timeout=5m Timeout before an unhealthy store is cleaned from the store UI page. - --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. 
See format details: https://thanos.io/tip/thanos/tracing.md/#configuration diff --git a/docs/components/receive.md b/docs/components/receive.md index 0454abbd73..f46d840491 100644 --- a/docs/components/receive.md +++ b/docs/components/receive.md @@ -1,21 +1,20 @@ --- -title: Receiver type: docs +title: Receiver menu: components --- # Receiver -The `thanos receive` command implements the [Prometheus Remote Write API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write). It builds on top of existing Prometheus TSDB and retains its usefulness while extending its functionality with long-term-storage, horizontal scalability, and downsampling. It exposes the StoreAPI so that [Thanos Queriers](./query.md) can query received metrics in real-time. The [Thanos Sidecar](./sidecar.md) is not sufficient for this, as the system would always lag the block length behind (typically 2 hours). +The `thanos receive` command implements the [Prometheus Remote Write API](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write). It builds on top of existing Prometheus TSDB and retains its usefulness while extending its functionality with long-term-storage, horizontal scalability, and downsampling. It exposes the StoreAPI so that [Thanos Queriers](query.md) can query received metrics in real-time. The [Thanos Sidecar](sidecar.md) is not sufficient for this, as the system would always lag the block length behind (typically 2 hours). We recommend this component to users who can only push into a Thanos due to air-gapped, or egress only environments. Please note the [various pros and cons of pushing metrics](https://docs.google.com/document/d/1H47v7WfyKkSLMrR8_iku6u9VB73WrVzBHb2SB6dL9_g/edit#heading=h.2v27snv0lsur). Thanos Receive supports multi-tenancy by using labels. See [Multitenancy documentation here](../operating/multi-tenancy.md). 
-Thanos Receive supports ingesting [exemplars](https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#exemplars) via remote-write. By default, the exemplars are silently discarded as `--tsdb.max-exemplars` is set to `0`. To enable exemplars storage, set the `--tsdb.max-exemplars` flag to a non-zero value. It exposes the ExemplarsAPI so that the [Thanos Queriers](./query.md) can query the stored exemplars. Take a look at the documentation for [exemplars storage in Prometheus](https://prometheus.io/docs/prometheus/latest/disabled_features/#exemplars-storage) to know more about it. +Thanos Receive supports ingesting [exemplars](https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#exemplars) via remote-write. By default, the exemplars are silently discarded as `--tsdb.max-exemplars` is set to `0`. To enable exemplars storage, set the `--tsdb.max-exemplars` flag to a non-zero value. It exposes the ExemplarsAPI so that the [Thanos Queriers](query.md) can query the stored exemplars. Take a look at the documentation for [exemplars storage in Prometheus](https://prometheus.io/docs/prometheus/latest/disabled_features/#exemplars-storage) to know more about it. -For more information please check out [initial design proposal](../proposals/201812_thanos-remote-receive.md). -For further information on tuning Prometheus Remote Write [see remote write tuning document](https://prometheus.io/docs/practices/remote_write/). +For more information please check out [initial design proposal](../proposals/201812_thanos-remote-receive.md). For further information on tuning Prometheus Remote Write [see remote write tuning document](https://prometheus.io/docs/practices/remote_write/). > NOTE: As the block producer it's important to set correct "external labels" that will identify data block across Thanos clusters. See [external labels](../storage.md#external-labels) docs for details. 
@@ -46,10 +45,11 @@ where `` is an IP address reachable by Prometheus The example content of `bucket.yml`: -```yaml +```yaml mdox-exec="go run scripts/cfggen/main.go --name=gcs.Config" type: GCS config: - bucket: example-bucket + bucket: "" + service_account: "" ``` The example content of `hashring.json`: @@ -65,19 +65,18 @@ The example content of `hashring.json`: } ] ``` -With such configuration any receive listens for remote write on `10908/api/v1/receive` and will forward to correct one in hashring if needed -for tenancy and replication. + +With such configuration any receive listens for remote write on `10908/api/v1/receive` and will forward to correct one in hashring if needed for tenancy and replication. ## Flags -[embedmd]:# (flags/receive.txt $) -```$ +```$ mdox-exec="thanos receive --help" usage: thanos receive [] Accept Prometheus remote write API requests and write to local tsdb. Flags: - --grpc-address="0.0.0.0:10901" + --grpc-address="0.0.0.0:10901" Listen ip:port address for gRPC endpoints (StoreAPI). Make sure this address is routable from other components. @@ -85,7 +84,7 @@ Flags: GRPC Server. --grpc-server-tls-cert="" TLS Certificate for gRPC server, leave blank to disable TLS - --grpc-server-tls-client-ca="" + --grpc-server-tls-client-ca="" TLS CA to verify clients against. If no client CA is specified, there is no client verification on server side. (tls.NoClientCert) @@ -99,7 +98,7 @@ Flags: Possible values are: "", "SHA256". -h, --help Show context-sensitive help (also try --help-long and --help-man). - --http-address="0.0.0.0:10902" + --http-address="0.0.0.0:10902" Listen host:port for HTTP endpoints. --http-grace-period=2m Time to wait after an interrupt received for HTTP Server. @@ -112,90 +111,90 @@ Flags: --log.format=logfmt Log format to use. Possible options: logfmt or json. --log.level=info Log filtering level. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag (mutually exclusive). 
Content of YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --receive.default-tenant-id="default-tenant" + --receive.default-tenant-id="default-tenant" Default tenant ID to use when none is provided via a header. - --receive.hashrings= + --receive.hashrings= Alternative to 'receive.hashrings-file' flag (lower priority). Content of file that contains the hashring configuration. - --receive.hashrings-file= + --receive.hashrings-file= Path to file that contains the hashring configuration. A watcher is initialized to watch changes and update the hashring dynamically. - --receive.hashrings-file-refresh-interval=5m + --receive.hashrings-file-refresh-interval=5m Refresh interval to re-read the hashring configuration file. (used as a fallback) - --receive.local-endpoint=RECEIVE.LOCAL-ENDPOINT + --receive.local-endpoint=RECEIVE.LOCAL-ENDPOINT Endpoint of local receive node. Used to identify the local node in the hashring configuration. - --receive.replica-header="THANOS-REPLICA" + --receive.replica-header="THANOS-REPLICA" HTTP header specifying the replica number of a write request. - --receive.replication-factor=1 + --receive.replication-factor=1 How many times to replicate incoming write requests. - --receive.tenant-header="THANOS-TENANT" + --receive.tenant-header="THANOS-TENANT" HTTP header to determine tenant for write requests. - --receive.tenant-label-name="tenant_id" + --receive.tenant-label-name="tenant_id" Label name through which the tenant will be announced. - --remote-write.address="0.0.0.0:19291" + --remote-write.address="0.0.0.0:19291" Address to listen on for remote write requests. 
- --remote-write.client-server-name="" + --remote-write.client-server-name="" Server name to verify the hostname on the returned TLS certificates. See https://tools.ietf.org/html/rfc4366#section-3.1 - --remote-write.client-tls-ca="" + --remote-write.client-tls-ca="" TLS CA Certificates to use to verify servers. - --remote-write.client-tls-cert="" + --remote-write.client-tls-cert="" TLS Certificates to use to identify this client to the server. - --remote-write.client-tls-key="" + --remote-write.client-tls-key="" TLS Key for the client's certificate. - --remote-write.server-tls-cert="" + --remote-write.server-tls-cert="" TLS Certificate for HTTP server, leave blank to disable TLS. - --remote-write.server-tls-client-ca="" + --remote-write.server-tls-client-ca="" TLS CA to verify clients against. If no client CA is specified, there is no client verification on server side. (tls.NoClientCert) - --remote-write.server-tls-key="" + --remote-write.server-tls-key="" TLS Key for the HTTP server, leave blank to disable TLS. - --request.logging-config= + --request.logging-config= Alternative to 'request.logging-config-file' flag (mutually exclusive). Content of YAML file with request logging configuration. See format details: https://gist.github.com/yashrsharma44/02f5765c5710dd09ce5d14e854f22825 - --request.logging-config-file= + --request.logging-config-file= Path to YAML file with request logging configuration. See format details: https://gist.github.com/yashrsharma44/02f5765c5710dd09ce5d14e854f22825 - --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. 
See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tsdb.allow-overlapping-blocks + --tsdb.allow-overlapping-blocks Allow overlapping blocks, which in turn enables vertical compaction and vertical query merge. --tsdb.max-exemplars=0 Enables support for ingesting exemplars and diff --git a/docs/components/rule.md b/docs/components/rule.md index cf40e37b7c..605c12dd3b 100644 --- a/docs/components/rule.md +++ b/docs/components/rule.md @@ -1,14 +1,14 @@ --- -title: Rule type: docs +title: Rule menu: components --- # Rule (aka Ruler) -_**NOTE:** It is recommended to keep deploying rules inside the relevant Prometheus servers locally. Use ruler only on specific cases. Read details [below](rule.md#risk) why._ +***NOTE:** It is recommended to keep deploying rules inside the relevant Prometheus servers locally. Use ruler only on specific cases. Read details [below](#risk) why.* -_The rule component should in particular not be used to circumvent solving rule deployment properly at the configuration management level._ +*The rule component should in particular not be used to circumvent solving rule deployment properly at the configuration management level.* The `thanos rule` command evaluates Prometheus recording and alerting rules against chosen query API via repeated `--query` (or FileSD via `--query.sd`). If more than one query is passed, round robin balancing is performed. @@ -16,8 +16,7 @@ Rule results are written back to disk in the Prometheus 2.0 storage format. Rule You can think of Rule as a simplified Prometheus that does not require a sidecar and does not scrape and do PromQL evaluation (no QueryAPI). -The data of each Rule node can be labeled to satisfy the clusters labeling scheme. High-availability pairs can be run in parallel and should be distinguished by the designated replica label, just like regular Prometheus servers. 
-Read more about Ruler in HA [here](rule.md#ruler-ha) +The data of each Rule node can be labeled to satisfy the clusters labeling scheme. High-availability pairs can be run in parallel and should be distinguished by the designated replica label, just like regular Prometheus servers. Read more about Ruler in HA [here](#ruler-ha) ```bash thanos rule \ @@ -35,18 +34,14 @@ thanos rule \ ## Risk -Ruler has conceptual tradeoffs that might not be favorable for most use cases. The main tradeoff is its dependence on -query reliability. For Prometheus it is unlikely to have alert/recording rule evaluation failure as evaluation is local. +Ruler has conceptual tradeoffs that might not be favorable for most use cases. The main tradeoff is its dependence on query reliability. For Prometheus it is unlikely to have alert/recording rule evaluation failure as evaluation is local. For Ruler the read path is distributed, since most likely Ruler is querying Thanos Querier which gets data from remote Store APIs. -This means that **query failure** are more likely to happen, that's why clear strategy on what will happen to alert and during query -unavailability is the key. - +This means that **query failure** are more likely to happen, that's why clear strategy on what will happen to alert and during query unavailability is the key. ## Configuring Rules - Rule files use YAML, the syntax of a rule file is: ``` @@ -161,21 +156,13 @@ To be sure that alerting works it is essential to monitor Ruler and alert from a The most important metrics to alert on are: -* `thanos_alert_sender_alerts_dropped_total`. If greater than 0, it means that alerts triggered by Rule are not being sent to alertmanager which might -indicate connection, incompatibility or misconfiguration problems. +* `thanos_alert_sender_alerts_dropped_total`. If greater than 0, it means that alerts triggered by Rule are not being sent to alertmanager which might indicate connection, incompatibility or misconfiguration problems. 
-* `prometheus_rule_evaluation_failures_total`. If greater than 0, it means that that rule failed to be evaluated, which results in -either gap in rule or potentially ignored alert. This metric might indicate problems on the queryAPI endpoint you use. Alert heavily on this if this happens for longer than your alert thresholds. -`strategy` label will tell you if failures comes from rules that tolerate [partial response](rule.md#partial-response) or not. +* `prometheus_rule_evaluation_failures_total`. If greater than 0, it means that a rule failed to be evaluated, which results in either gap in rule or potentially ignored alert. This metric might indicate problems on the queryAPI endpoint you use. Alert heavily on this if this happens for longer than your alert thresholds. `strategy` label will tell you if failures come from rules that tolerate [partial response](#partial-response) or not. -* `prometheus_rule_group_last_duration_seconds < prometheus_rule_group_interval_seconds` If the difference is large, it means -that rule evaluation took more time than the scheduled interval. It can indicate that your query backend (e.g Querier) takes too much time -to evaluate the query, i.e. that it is not fast enough to fill the rule. This might indicate other problems like slow StoreAPis or -too complex query expression in rule. +* `prometheus_rule_group_last_duration_seconds < prometheus_rule_group_interval_seconds` If the difference is large, it means that rule evaluation took more time than the scheduled interval. It can indicate that your query backend (e.g Querier) takes too much time to evaluate the query, i.e. that it is not fast enough to fill the rule. This might indicate other problems like slow StoreAPIs or too complex query expression in rule. -* `thanos_rule_evaluation_with_warnings_total`.
If you choose to use Rules and Alerts with [partial response strategy's](rule.md#partial-response) -value as "warn", this metric will tell you how many evaluation ended up with some kind of warning. To see the actual warnings -see WARN log level. This might suggest that those evaluations return partial response and might not be accurate. +* `thanos_rule_evaluation_with_warnings_total`. If you choose to use Rules and Alerts with [partial response strategy's](#partial-response) value as "warn", this metric will tell you how many evaluation ended up with some kind of warning. To see the actual warnings see WARN log level. This might suggest that those evaluations return partial response and might not be accurate. Those metrics are important for vanilla Prometheus as well, but even more important when we rely on (sometimes WAN) network. @@ -183,18 +170,15 @@ Those metrics are important for vanilla Prometheus as well, but even more import See [alerts](/examples/alerts/alerts.md#Ruler) for more example alerts for ruler. -NOTE: It is also recommended to set a mocked Alert on Ruler that checks if Query is up. This might be something simple like `vector(1)` query, just -to check if Querier is live. +NOTE: It is also recommended to set a mocked Alert on Ruler that checks if Query is up. This might be something simple like `vector(1)` query, just to check if Querier is live. ## Performance. -As rule nodes outsource query processing to query nodes, they should generally experience little load. If necessary, functional sharding can be applied by splitting up the sets of rules between HA pairs. -Rules are processed with deduplicated data according to the replica label configured on query nodes. +As rule nodes outsource query processing to query nodes, they should generally experience little load. If necessary, functional sharding can be applied by splitting up the sets of rules between HA pairs. 
Rules are processed with deduplicated data according to the replica label configured on query nodes. ## External labels -It is *mandatory* to add certain external labels to indicate the ruler origin (e.g `label='replica="A"'` or for `cluster`). -Otherwise running multiple ruler replicas will be not possible, resulting in clash during compaction. +It is *mandatory* to add certain external labels to indicate the ruler origin (e.g `label='replica="A"'` or for `cluster`). Otherwise running multiple ruler replicas will be not possible, resulting in clash during compaction. NOTE: It is advised to put different external labels than labels given by other sources we are recording or alerting against. @@ -205,13 +189,11 @@ For example: * We configure `ScraperIsDown` alert that monitors service from `work1` cluster. * When triggered this alert results in `ScraperIsDown{cluster=mon1}` since external labels always *replace* source labels. -This effectively drops the important metadata and makes it impossible to tell in what exactly `cluster` the `ScraperIsDown` alert found problem -without falling back to manual query. +This effectively drops the important metadata and makes it impossible to tell in what exactly `cluster` the `ScraperIsDown` alert found problem without falling back to manual query. ## Ruler UI -On HTTP address Ruler exposes its UI that shows mainly Alerts and Rules page (similar to Prometheus Alerts page). -Each alert is linked to the query that the alert is performing, which you can click to navigate to the configured `alert.query-url`. +On HTTP address Ruler exposes its UI that shows mainly Alerts and Rules page (similar to Prometheus Alerts page). Each alert is linked to the query that the alert is performing, which you can click to navigate to the configured `alert.query-url`. ## Ruler HA @@ -219,39 +201,36 @@ Ruler aims to use a similar approach to the one that Prometheus has. 
You can con In case of Ruler in HA you need to make sure you have the following labelling setup: -* Labels that identify the HA group ruler and replica label with different value for each ruler instance, e.g: -`cluster="eu1", replica="A"` and `cluster=eu1, replica="B"` by using `--label` flag. -* Labels that need to be dropped just before sending to alermanager in order for alertmanager to deduplicate alerts e.g -`--alert.label-drop="replica"`. +* Labels that identify the HA group ruler and replica label with different value for each ruler instance, e.g: `cluster="eu1", replica="A"` and `cluster="eu1", replica="B"` by using `--label` flag. +* Labels that need to be dropped just before sending to alertmanager in order for alertmanager to deduplicate alerts e.g `--alert.label-drop="replica"`. Full relabelling is planned to be done in future and is tracked here: https://github.com/thanos-io/thanos/issues/660 ## Flags -[embedmd]:# (flags/rule.txt $) -```$ +```$ mdox-exec="thanos rule --help" usage: thanos rule [] Ruler evaluating Prometheus rules against given Query nodes, exposing Store API and storing old blocks in bucket. Flags: - --alert.label-drop=ALERT.LABEL-DROP ... + --alert.label-drop=ALERT.LABEL-DROP ... Labels by name to drop before sending to alertmanager. This allows alert to be deduplicated on replica label (repeated). Similar Prometheus alert relabelling - --alert.query-url=ALERT.QUERY-URL + --alert.query-url=ALERT.QUERY-URL The external Thanos Query URL that would be set in all alerts 'Source' field - --alert.relabel-config= + --alert.relabel-config= Alternative to 'alert.relabel-config-file' flag (mutually exclusive). Content of YAML file that contains alert relabelling configuration. - --alert.relabel-config-file= + --alert.relabel-config-file= Path to YAML file that contains alert relabelling configuration. - --alertmanagers.config= + --alertmanagers.config= Alternative to 'alertmanagers.config-file' flag (mutually exclusive).
Content of YAML file that contains alerting configuration. See format @@ -260,19 +239,19 @@ Flags: If defined, it takes precedence over the '--alertmanagers.url' and '--alertmanagers.send-timeout' flags. - --alertmanagers.config-file= + --alertmanagers.config-file= Path to YAML file that contains alerting configuration. See format details: https://thanos.io/tip/components/rule.md/#configuration. If defined, it takes precedence over the '--alertmanagers.url' and '--alertmanagers.send-timeout' flags. - --alertmanagers.sd-dns-interval=30s + --alertmanagers.sd-dns-interval=30s Interval between DNS resolutions of Alertmanager hosts. - --alertmanagers.send-timeout=10s + --alertmanagers.send-timeout=10s Timeout for sending alerts to Alertmanager - --alertmanagers.url=ALERTMANAGERS.URL ... + --alertmanagers.url=ALERTMANAGERS.URL ... Alertmanager replica URLs to push firing alerts. Ruler claims success if push to at least one alertmanager from discovered @@ -285,7 +264,7 @@ Flags: prefix for the regular Alertmanager API path. --data-dir="data/" data directory --eval-interval=30s The default evaluation interval to use. - --grpc-address="0.0.0.0:10901" + --grpc-address="0.0.0.0:10901" Listen ip:port address for gRPC endpoints (StoreAPI). Make sure this address is routable from other components. @@ -293,7 +272,7 @@ Flags: GRPC Server. --grpc-server-tls-cert="" TLS Certificate for gRPC server, leave blank to disable TLS - --grpc-server-tls-client-ca="" + --grpc-server-tls-client-ca="" TLS CA to verify clients against. If no client CA is specified, there is no client verification on server side. (tls.NoClientCert) @@ -307,14 +286,14 @@ Flags: Possible values are: "", "SHA256". -h, --help Show context-sensitive help (also try --help-long and --help-man). - --http-address="0.0.0.0:10902" + --http-address="0.0.0.0:10902" Listen host:port for HTTP endpoints. --http-grace-period=2m Time to wait after an interrupt received for HTTP Server. 
--http.config="" [EXPERIMENTAL] Path to the configuration file that can enable TLS or authentication for all HTTP endpoints. - --label=="" ... + --label=="" ... Labels to be applied to all generated metrics (repeated). Similar to external labels for Prometheus, used to identify ruler and its @@ -331,13 +310,13 @@ Flags: LogStartAndFinishCall: Logs the start and finish call of the requests. NoLogCall: Disable request logging. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag (mutually exclusive). Content of YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration @@ -353,7 +332,7 @@ Flags: https://thanos.io/tip/components/rule.md/#configuration. If defined, it takes precedence over the '--query' and '--query.sd-files' flags. - --query.config-file= + --query.config-file= Path to YAML file that contains query API servers configuration. See format details: https://thanos.io/tip/components/rule.md/#configuration. @@ -361,21 +340,21 @@ Flags: '--query' and '--query.sd-files' flags. --query.http-method=POST HTTP method to use when sending queries. Possible options: [GET, POST] - --query.sd-dns-interval=30s + --query.sd-dns-interval=30s Interval between DNS resolutions. - --query.sd-files= ... + --query.sd-files= ... Path to file that contains addresses of query API servers. The path can be a glob pattern (repeatable). --query.sd-interval=5m Refresh interval to re-read file SD files. (used as a fallback) - --request.logging-config= + --request.logging-config= Alternative to 'request.logging-config-file' flag (mutually exclusive). Content of YAML file with request logging configuration. 
See format details: https://gist.github.com/yashrsharma44/02f5765c5710dd09ce5d14e854f22825 - --request.logging-config-file= + --request.logging-config-file= Path to YAML file with request logging configuration. See format details: https://gist.github.com/yashrsharma44/02f5765c5710dd09ce5d14e854f22825 @@ -383,18 +362,18 @@ Flags: an alert to Alertmanager. --rule-file=rules/ ... Rule files that should be used by rule manager. Can be in glob format (repeated). - --shipper.upload-compacted + --shipper.upload-compacted If true shipper will try to upload compacted blocks as well. Useful for migration purposes. Works only if compaction is disabled on Prometheus. Do it once and then disable the flag when done. - --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration @@ -442,7 +421,6 @@ The `--alertmanagers.config` and `--alertmanagers.config-file` flags allow speci The configuration format is the following: -[embedmd]:# (../flags/config_rule_alerting.txt yaml) ```yaml alertmanagers: - http_config: @@ -477,7 +455,6 @@ The `--query.config` and `--query.config-file` flags allow specifying multiple q The configuration format is the following: -[embedmd]:# (../flags/config_rule_query.txt yaml) ```yaml - http_config: basic_auth: diff --git a/docs/components/sidecar.md b/docs/components/sidecar.md index 9b79500116..2fc1489e90 100644 --- a/docs/components/sidecar.md +++ b/docs/components/sidecar.md @@ -1,21 +1,21 @@ --- -title: Sidecar type: docs +title: Sidecar menu: components --- # Sidecar -The `thanos sidecar` command runs a component that gets deployed along with a Prometheus instance. 
This allows sidecar to optionally upload metrics to object storage and allow [Queriers](./query.md) to query Prometheus data with common, efficient StoreAPI. +The `thanos sidecar` command runs a component that gets deployed along with a Prometheus instance. This allows sidecar to optionally upload metrics to object storage and allow [Queriers](query.md) to query Prometheus data with common, efficient StoreAPI. In details: -* It implements Thanos' Store API on top of Prometheus' remote-read API. This allows [Queriers](./query.md) to treat Prometheus servers as yet another source of time series data without directly talking to its APIs. +* It implements Thanos' Store API on top of Prometheus' remote-read API. This allows [Queriers](query.md) to treat Prometheus servers as yet another source of time series data without directly talking to its APIs. * Optionally, the sidecar uploads TSDB blocks to an object storage bucket as Prometheus produces them every 2 hours. This allows Prometheus servers to be run with relatively low retention while their historic data is made durable and queryable via object storage. - NOTE: This still does NOT mean that Prometheus can be fully stateless, because if it crashes and restarts you will lose ~2 hours of metrics, so persistent disk for Prometheus is highly recommended. The closest to stateless you can get is using remote write (which Thanos supports, see [Receiver](./receive.md). Remote write has other risks and consequences, and still if crashed you lose in positive case seconds of metrics data, so persistent disk is recommended in all cases. + NOTE: This still does NOT mean that Prometheus can be fully stateless, because if it crashes and restarts you will lose ~2 hours of metrics, so persistent disk for Prometheus is highly recommended. The closest to stateless you can get is using remote write (which Thanos supports, see [Receiver](receive.md)).
Remote write has other risks and consequences, and still if crashed you lose in positive case seconds of metrics data, so persistent disk is recommended in all cases. -* Optionally Thanos sidecar is able to watch Prometheus rules and configuration, decompress and substitute environment variables if needed and ping Prometheus to reload them. Read more about this in [here](./sidecar.md#reloader-configuration) +* Optionally Thanos sidecar is able to watch Prometheus rules and configuration, decompress and substitute environment variables if needed and ping Prometheus to reload them. Read more about this in [here](#reloader-configuration) Prometheus servers connected to the Thanos cluster via the sidecar are subject to a few limitations and recommendations for safe operations: @@ -28,8 +28,7 @@ If you choose to use the sidecar to also upload data to object storage: * Must specify object storage (`--objstore.*` flags) * It only uploads uncompacted Prometheus blocks. For compacted blocks, see [Upload compacted blocks](./sidecar.md/#upload-compacted-blocks). -* The `--storage.tsdb.min-block-duration` and `--storage.tsdb.max-block-duration` must be set to equal values to disable local compaction on order to use Thanos sidecar upload, otherwise leave local compaction on if sidecar just exposes StoreAPI and your retention is normal. The default of `2h` is recommended. - Mentioned parameters set to equal values disable the internal Prometheus compaction, which is needed to avoid the uploaded data corruption when Thanos compactor does its job, this is critical for data consistency and should not be ignored if you plan to use Thanos compactor. 
Even though you set mentioned parameters equal, you might observe Prometheus internal metric `prometheus_tsdb_compactions_total` being incremented, don't be confused by that: Prometheus writes initial head block to filesystem via its internal compaction mechanism, but if you have followed recommendations - data won't be modified by Prometheus before the sidecar uploads it. Thanos sidecar will also check sanity of the flags set to Prometheus on the startup and log errors or warning if they have been configured improperly (#838). +* The `--storage.tsdb.min-block-duration` and `--storage.tsdb.max-block-duration` must be set to equal values to disable local compaction in order to use Thanos sidecar upload, otherwise leave local compaction on if sidecar just exposes StoreAPI and your retention is normal. The default of `2h` is recommended. Mentioned parameters set to equal values disable the internal Prometheus compaction, which is needed to avoid the uploaded data corruption when Thanos compactor does its job, this is critical for data consistency and should not be ignored if you plan to use Thanos compactor. Even though you set mentioned parameters equal, you might observe Prometheus internal metric `prometheus_tsdb_compactions_total` being incremented, don't be confused by that: Prometheus writes initial head block to filesystem via its internal compaction mechanism, but if you have followed recommendations - data won't be modified by Prometheus before the sidecar uploads it. Thanos sidecar will also check sanity of the flags set to Prometheus on the startup and log errors or warnings if they have been configured improperly (#838). * The retention of Prometheus is recommended to not be lower than three times of the min block duration, so 6 hours. This achieves resilience in the face of connectivity issues to the object storage since all local data will remain available within the Thanos cluster.
If connectivity gets restored the backlog of blocks gets uploaded to the object storage. ## Reloader Configuration @@ -58,10 +57,11 @@ thanos sidecar \ The example content of `bucket.yml`: -```yaml +```yaml mdox-exec="go run scripts/cfggen/main.go --name=gcs.Config" type: GCS config: - bucket: example-bucket + bucket: "" + service_account: "" ``` ## Upload compacted blocks @@ -75,14 +75,13 @@ To use this, the Prometheus compaction needs to be disabled. This can be done by ## Flags -[embedmd]:# (flags/sidecar.txt $) -```$ +```$ mdox-exec="thanos sidecar --help" usage: thanos sidecar [] Sidecar for Prometheus server. Flags: - --grpc-address="0.0.0.0:10901" + --grpc-address="0.0.0.0:10901" Listen ip:port address for gRPC endpoints (StoreAPI). Make sure this address is routable from other components. @@ -90,7 +89,7 @@ Flags: GRPC Server. --grpc-server-tls-cert="" TLS Certificate for gRPC server, leave blank to disable TLS - --grpc-server-tls-client-ca="" + --grpc-server-tls-client-ca="" TLS CA to verify clients against. If no client CA is specified, there is no client verification on server side. (tls.NoClientCert) @@ -104,7 +103,7 @@ Flags: Possible values are: "", "SHA256". -h, --help Show context-sensitive help (also try --help-long and --help-man). - --http-address="0.0.0.0:10902" + --http-address="0.0.0.0:10902" Listen host:port for HTTP endpoints. --http-grace-period=2m Time to wait after an interrupt received for HTTP Server. @@ -114,69 +113,69 @@ Flags: --log.format=logfmt Log format to use. Possible options: logfmt or json. --log.level=info Log filtering level. - --min-time=0000-01-01T00:00:00Z + --min-time=0000-01-01T00:00:00Z Start of time range limit to serve. Thanos sidecar will serve only metrics, which happened later than this value. Option can be a constant time in RFC3339 format or time duration relative to current time, such as -1d or 2h45m. Valid duration units are ms, s, m, h, d, w, y. 
- --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag (mutually exclusive). Content of YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --prometheus.ready_timeout=10m + --prometheus.ready_timeout=10m Maximum time to wait for the Prometheus instance to start up - --prometheus.url=http://localhost:9090 + --prometheus.url=http://localhost:9090 URL at which to reach Prometheus's API. For better performance use local network. - --receive.connection-pool-size=RECEIVE.CONNECTION-POOL-SIZE + --receive.connection-pool-size=RECEIVE.CONNECTION-POOL-SIZE Controls the http MaxIdleConns. Default is 0, which is unlimited - --receive.connection-pool-size-per-host=100 + --receive.connection-pool-size-per-host=100 Controls the http MaxIdleConnsPerHost - --reloader.config-envsubst-file="" + --reloader.config-envsubst-file="" Output file for environment variable substituted config file. --reloader.config-file="" Config file watched by the reloader. - --reloader.retry-interval=5s + --reloader.retry-interval=5s Controls how often reloader retries config reload in case of error. - --reloader.rule-dir=RELOADER.RULE-DIR ... + --reloader.rule-dir=RELOADER.RULE-DIR ... Rule directories for the reloader to refresh (repeated field). - --reloader.watch-interval=3m + --reloader.watch-interval=3m Controls how often reloader re-reads config and rules. - --request.logging-config= + --request.logging-config= Alternative to 'request.logging-config-file' flag (mutually exclusive). Content of YAML file with request logging configuration. 
See format details: https://gist.github.com/yashrsharma44/02f5765c5710dd09ce5d14e854f22825 - --request.logging-config-file= + --request.logging-config-file= Path to YAML file with request logging configuration. See format details: https://gist.github.com/yashrsharma44/02f5765c5710dd09ce5d14e854f22825 - --shipper.upload-compacted + --shipper.upload-compacted If true shipper will try to upload compacted blocks as well. Useful for migration purposes. Works only if compaction is disabled on Prometheus. Do it once and then disable the flag when done. - --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration diff --git a/docs/components/store.md b/docs/components/store.md index b654cce2e3..888a846991 100644 --- a/docs/components/store.md +++ b/docs/components/store.md @@ -1,13 +1,12 @@ --- -title: Store type: docs +title: Store menu: components --- # Store -The `thanos store` command (also known as Store Gateway) implements the Store API on top of historical data in an object storage bucket. It acts primarily as an API gateway and therefore does not need significant amounts of local disk space. It joins a Thanos cluster on startup and advertises the data it can access. -It keeps a small amount of information about all remote blocks on local disk and keeps it in sync with the bucket. This data is generally safe to delete across restarts at the cost of increased startup times. +The `thanos store` command (also known as Store Gateway) implements the Store API on top of historical data in an object storage bucket. It acts primarily as an API gateway and therefore does not need significant amounts of local disk space. 
It joins a Thanos cluster on startup and advertises the data it can access. It keeps a small amount of information about all remote blocks on local disk and keeps it in sync with the bucket. This data is generally safe to delete across restarts at the cost of increased startup times. ```bash thanos store \ @@ -17,28 +16,28 @@ thanos store \ The content of `bucket.yml`: -```yaml +```yaml mdox-exec="go run scripts/cfggen/main.go --name=gcs.Config" type: GCS config: - bucket: example-bucket + bucket: "" + service_account: "" ``` In general, an average of 6 MB of local disk space is required per TSDB block stored in the object storage bucket, but for high cardinality blocks with large label set it can even go up to 30MB and more. It is for the pre-computed index, which includes symbols and postings offsets as well as metadata JSON. ## Flags -[embedmd]:# (flags/store.txt $) -```$ +```$ mdox-exec="thanos store --help" usage: thanos store [] Store node giving access to blocks in a bucket provider. Now supported GCS, S3, Azure, Swift, Tencent COS and Aliyun OSS. Flags: - --block-meta-fetch-concurrency=32 + --block-meta-fetch-concurrency=32 Number of goroutines to use when fetching block metadata from object storage. - --block-sync-concurrency=20 + --block-sync-concurrency=20 Number of goroutines to use when constructing index-cache.json blocks from object storage. --chunk-pool-size=2GB Maximum size of concurrently allocatable bytes @@ -55,7 +54,7 @@ Flags: NOTE: Putting raw blocks here will not cause the store to read them. For such use cases use Prometheus + sidecar. - --grpc-address="0.0.0.0:10901" + --grpc-address="0.0.0.0:10901" Listen ip:port address for gRPC endpoints (StoreAPI). Make sure this address is routable from other components. @@ -63,7 +62,7 @@ Flags: GRPC Server. --grpc-server-tls-cert="" TLS Certificate for gRPC server, leave blank to disable TLS - --grpc-server-tls-client-ca="" + --grpc-server-tls-client-ca="" TLS CA to verify clients against. 
If no client CA is specified, there is no client verification on server side. (tls.NoClientCert) @@ -71,14 +70,14 @@ Flags: disable TLS -h, --help Show context-sensitive help (also try --help-long and --help-man). - --http-address="0.0.0.0:10902" + --http-address="0.0.0.0:10902" Listen host:port for HTTP endpoints. --http-grace-period=2m Time to wait after an interrupt received for HTTP Server. --http.config="" [EXPERIMENTAL] Path to the configuration file that can enable TLS or authentication for all HTTP endpoints. - --ignore-deletion-marks-delay=24h + --ignore-deletion-marks-delay=24h Duration after which the blocks marked for deletion will be filtered out while fetching blocks. The idea of ignore-deletion-marks-delay @@ -102,54 +101,54 @@ Flags: --index-cache-size=250MB Maximum size of items held in the in-memory index cache. Ignored if --index-cache.config or --index-cache.config-file option is specified. - --index-cache.config= + --index-cache.config= Alternative to 'index-cache.config-file' flag (mutually exclusive). Content of YAML file that contains index cache configuration. See format details: https://thanos.io/tip/components/store.md/#index-cache - --index-cache.config-file= + --index-cache.config-file= Path to YAML file that contains index cache configuration. See format details: https://thanos.io/tip/components/store.md/#index-cache --log.format=logfmt Log format to use. Possible options: logfmt or json. --log.level=info Log filtering level. - --max-time=9999-12-31T23:59:59Z + --max-time=9999-12-31T23:59:59Z End of time range limit to serve. Thanos Store will serve only blocks, which happened earlier than this value. Option can be a constant time in RFC3339 format or time duration relative to current time, such as -1d or 2h45m. Valid duration units are ms, s, m, h, d, w, y. - --min-time=0000-01-01T00:00:00Z + --min-time=0000-01-01T00:00:00Z Start of time range limit to serve. Thanos Store will serve only metrics, which happened later than this value. 
Option can be a constant time in RFC3339 format or time duration relative to current time, such as -1d or 2h45m. Valid duration units are ms, s, m, h, d, w, y. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag (mutually exclusive). Content of YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --request.logging-config= + --request.logging-config= Alternative to 'request.logging-config-file' flag (mutually exclusive). Content of YAML file with request logging configuration. See format details: https://gist.github.com/yashrsharma44/02f5765c5710dd09ce5d14e854f22825 - --request.logging-config-file= + --request.logging-config-file= Path to YAML file with request logging configuration. See format details: https://gist.github.com/yashrsharma44/02f5765c5710dd09ce5d14e854f22825 - --selector.relabel-config= + --selector.relabel-config= Alternative to 'selector.relabel-config-file' flag (mutually exclusive). Content of YAML file that contains relabeling configuration that @@ -157,19 +156,19 @@ Flags: Prometheus relabel-config syntax. See format details: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config - --selector.relabel-config-file= + --selector.relabel-config-file= Path to YAML file that contains relabeling configuration that allows selecting blocks. It follows native Prometheus relabel-config syntax. See format details: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config - --store.enable-index-header-lazy-reader + --store.enable-index-header-lazy-reader If true, Store Gateway will lazy memory map index-header only once the block is required by a query. 
- --store.grpc.series-max-concurrency=20 + --store.grpc.series-max-concurrency=20 Maximum number of concurrent Series calls. - --store.grpc.series-sample-limit=0 + --store.grpc.series-sample-limit=0 Maximum amount of samples returned via a single Series call. The Series call fails if this limit is exceeded. 0 means no limit. NOTE: For @@ -179,18 +178,18 @@ Flags: samples each chunk can contain), so the actual number of samples might be lower, even though the maximum could be hit. - --store.grpc.touched-series-limit=0 + --store.grpc.touched-series-limit=0 Maximum amount of touched series returned via a single Series call. The Series call fails if this limit is exceeded. 0 means no limit. --sync-block-duration=3m Repeat interval for syncing the blocks between local and remote view. - --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration @@ -254,7 +253,7 @@ Check more [here](https://thanos.io/tip/thanos/sharding.md/). Thanos Store Gateway supports an index cache to speed up postings and series lookups from TSDB blocks indexes. 
Two types of caches are supported: -- `in-memory` (_default_) +- `in-memory` (*default*) - `memcached` ### In-memory index cache @@ -263,8 +262,7 @@ The `in-memory` index cache is enabled by default and its max size can be config Alternatively, the `in-memory` index cache can also be configured using `--index-cache.config-file` to reference the configuration file or `--index-cache.config` to put yaml config directly: -[embedmd]:# (../flags/config_index_cache_in_memory.txt yaml) -```yaml +```yaml mdox-exec="go run scripts/cfggen/main.go --name=storecache.InMemoryIndexCacheConfig" type: IN-MEMORY config: max_size: 0 @@ -280,8 +278,7 @@ All the settings are **optional**: The `memcached` index cache allows to use [Memcached](https://memcached.org) as cache backend. This cache type is configured using `--index-cache.config-file` to reference the configuration file or `--index-cache.config` to put yaml config directly: -[embedmd]:# (../flags/config_index_cache_memcached.txt yaml) -```yaml +```yaml mdox-exec="go run scripts/cfggen/main.go --name=cacheutil.MemcachedClientConfig" type: MEMCACHED config: addresses: [] diff --git a/docs/components/tools.md b/docs/components/tools.md index b69c45d227..b3ba3998eb 100644 --- a/docs/components/tools.md +++ b/docs/components/tools.md @@ -1,20 +1,18 @@ --- -title: Tools type: docs +title: Tools menu: components --- # Tools -The `thanos tools` subcommand of Thanos is a set of additional CLI, short-living tools that -are meant to be ran for development or debugging purposes. +The `thanos tools` subcommand of Thanos is a set of additional CLI, short-living tools that are meant to be run for development or debugging purposes. All commands added as tools should land in `tools.go` or file with `tools_` prefix. ## Flags -[embedmd]:# (flags/tools.txt $) -```$ +```$ mdox-exec="thanos tools --help" usage: thanos tools [ ...] Tools utility commands @@ -24,12 +22,12 @@ Flags: --help-man). --log.format=logfmt Log format to use.
Possible options: logfmt or json. --log.level=info Log filtering level. - --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration @@ -90,8 +88,7 @@ Subcommands: ## Bucket -The `thanos tools bucket` subcommand of Thanos is a set of commands to inspect data in object storage buckets. -It is normally run as a standalone command to aid with troubleshooting. +The `thanos tools bucket` subcommand of Thanos is a set of commands to inspect data in object storage buckets. It is normally run as a standalone command to aid with troubleshooting. Example: @@ -101,17 +98,16 @@ thanos tools bucket verify --objstore.config-file=bucket.yml The content of `bucket.yml`: -```yaml +```yaml mdox-exec="go run scripts/cfggen/main.go --name=gcs.Config" type: GCS config: - bucket: example-bucket + bucket: "" + service_account: "" ``` -Bucket can be extended to add more subcommands that will be helpful when working with object storage buckets -by adding a new command within [`/cmd/thanos/tools_bucket.go`](/cmd/thanos/tools_bucket.go) . +Bucket can be extended to add more subcommands that will be helpful when working with object storage buckets by adding a new command within [`/cmd/thanos/tools_bucket.go`](../../cmd/thanos/tools_bucket.go) . -[embedmd]:# (flags/tools_bucket.txt $) -```$ +```$ mdox-exec="thanos tools bucket --help" usage: thanos tools bucket [] [ ...] Bucket utility commands @@ -121,21 +117,21 @@ Flags: --help-man). --log.format=logfmt Log format to use. Possible options: logfmt or json. --log.level=info Log filtering level. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag (mutually exclusive). 
Content of YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration @@ -197,7 +193,7 @@ Subcommands: This will start local webserver that will periodically update the view with given refresh. -web +web Example: @@ -205,8 +201,7 @@ Example: thanos tools bucket web --objstore.config-file="..." ``` -[embedmd]:# (flags/tools_bucket_web.txt $) -```$ +```$ mdox-exec="thanos tools bucket web --help" usage: thanos tools bucket web [] Web interface for remote storage bucket. @@ -214,7 +209,7 @@ Web interface for remote storage bucket. Flags: -h, --help Show context-sensitive help (also try --help-long and --help-man). - --http-address="0.0.0.0:10902" + --http-address="0.0.0.0:10902" Listen host:port for HTTP endpoints. --http-grace-period=2m Time to wait after an interrupt received for HTTP Server. @@ -225,25 +220,25 @@ Flags: --log.format=logfmt Log format to use. Possible options: logfmt or json. --log.level=info Log filtering level. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag (mutually exclusive). Content of YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. 
See format details: https://thanos.io/tip/thanos/storage.md/#configuration --refresh=30m Refresh interval to download metadata from remote storage --timeout=5m Timeout to download metadata from remote storage - --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration @@ -290,8 +285,7 @@ thanos tools bucket verify --objstore.config-file="..." When using the `--repair` option, make sure that the compactor job is disabled first. -[embedmd]:# (flags/tools_bucket_verify.txt $) -```$ +```$ mdox-exec="thanos tools bucket verify --help" usage: thanos tools bucket verify [] Verify all blocks in the bucket against specified issues. NOTE: Depending on @@ -316,14 +310,14 @@ Flags: --id=ID ... Block IDs to verify (and optionally repair) only. If none is specified, all blocks will be verified. Repeated field - -i, --issues=index_known_issues... ... + -i, --issues=index_known_issues... ... Issues to verify (and optionally repair). Possible issue to verify, without repair: [overlapped_blocks]; Possible issue to verify and repair: [index_known_issues duplicated_compaction] --log.format=logfmt Log format to use. Possible options: logfmt or json. --log.level=info Log filtering level. - --objstore-backup.config= + --objstore-backup.config= Alternative to 'objstore-backup.config-file' flag (mutually exclusive). Content of YAML file that contains object store-backup configuration. See @@ -331,29 +325,29 @@ Flags: https://thanos.io/tip/thanos/storage.md/#configuration Used for repair logic to backup blocks before removal. - --objstore-backup.config-file= + --objstore-backup.config-file= Path to YAML file that contains object store-backup configuration. 
See format details: https://thanos.io/tip/thanos/storage.md/#configuration Used for repair logic to backup blocks before removal. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag (mutually exclusive). Content of YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration -r, --repair Attempt to repair blocks for which issues were detected - --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration @@ -371,8 +365,7 @@ Example: thanos tools bucket ls -o json --objstore.config-file="..." ``` -[embedmd]:# (flags/tools_bucket_ls.txt $) -```$ +```$ mdox-exec="thanos tools bucket ls --help" usage: thanos tools bucket ls [] List all blocks in the bucket. @@ -382,24 +375,24 @@ Flags: --help-man). --log.format=logfmt Log format to use. Possible options: logfmt or json. --log.level=info Log filtering level. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag (mutually exclusive). Content of YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration -o, --output="" Optional format in which to print each block's information. Options are 'json', 'wide' or a custom template. 
- --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration @@ -417,8 +410,7 @@ Example: thanos tools bucket inspect -l environment=\"prod\" --objstore.config-file="..." ``` -[embedmd]:# (flags/tools_bucket_inspect.txt $) -```$ +```$ mdox-exec="thanos tools bucket inspect --help" usage: thanos tools bucket inspect [] Inspect all blocks in the bucket in detailed, table-like way. @@ -429,17 +421,17 @@ Flags: --log.format=logfmt Log format to use. Possible options: logfmt or json. --log.level=info Log filtering level. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag (mutually exclusive). Content of YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - -l, --selector==\"\" ... + -l, --selector==\"\" ... Selects blocks based on label, e.g. '-l key1=\"value1\" -l key2=\"value2\"'. All key value pairs must match. @@ -448,12 +440,12 @@ Flags: UNTIL'. I.e., if the 'FROM' value is equal the rows are then further sorted by the 'UNTIL' value. --timeout=5m Timeout to download metadata from remote storage - --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. 
See format details: https://thanos.io/tip/thanos/tracing.md/#configuration @@ -468,12 +460,12 @@ Flags: NOTE: Currently it works only with Thanos blocks (meta.json has to have Thanos metadata). Example: + ``` thanos tools bucket replicate --objstore.config-file="..." --objstore-to.config="..." ``` -[embedmd]:# (flags/tools_bucket_replicate.txt $) -```$ +```$ mdox-exec="thanos tools bucket replicate --help" usage: thanos tools bucket replicate [] Replicate data from one object storage to another. NOTE: Currently it works only @@ -484,7 +476,7 @@ Flags: be replicated. Repeated flag. -h, --help Show context-sensitive help (also try --help-long and --help-man). - --http-address="0.0.0.0:10902" + --http-address="0.0.0.0:10902" Listen host:port for HTTP endpoints. --http-grace-period=2m Time to wait after an interrupt received for HTTP Server. @@ -501,7 +493,7 @@ Flags: --log.level=info Log filtering level. --matcher=key="value" ... Only blocks whose external labels exactly match this matcher will be replicated. - --max-time=9999-12-31T23:59:59Z + --max-time=9999-12-31T23:59:59Z End of time range limit to replicate. Thanos Replicate will replicate only metrics, which happened earlier than this value. Option can be @@ -509,7 +501,7 @@ Flags: duration relative to current time, such as -1d or 2h45m. Valid duration units are ms, s, m, h, d, w, y. - --min-time=0000-01-01T00:00:00Z + --min-time=0000-01-01T00:00:00Z Start of time range limit to replicate. Thanos Replicate will replicate only metrics, which happened later than this value. Option can be a @@ -517,37 +509,37 @@ Flags: duration relative to current time, such as -1d or 2h45m. Valid duration units are ms, s, m, h, d, w, y. - --objstore-to.config= + --objstore-to.config= Alternative to 'objstore-to.config-file' flag (mutually exclusive). Content of YAML file that contains object store-to configuration. 
See format details: https://thanos.io/tip/thanos/storage.md/#configuration The object storage which replicate data to. - --objstore-to.config-file= + --objstore-to.config-file= Path to YAML file that contains object store-to configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration The object storage which replicate data to. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag (mutually exclusive). Content of YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration --resolution=0s... ... Only blocks with these resolutions will be replicated. Repeated flag. --single-run Run replication only one time, then exit. - --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration @@ -557,8 +549,7 @@ Flags: ### Bucket downsample -`tools bucket downsample` is used to downsample blocks in an object store bucket as a service. -It implements the downsample API on top of historical data in an object storage bucket. +`tools bucket downsample` is used to downsample blocks in an object store bucket as a service. It implements the downsample API on top of historical data in an object storage bucket. 
```bash thanos tools bucket downsample \ @@ -568,14 +559,14 @@ thanos tools bucket downsample \ The content of `bucket.yml`: -```yaml +```yaml mdox-exec="go run scripts/cfggen/main.go --name=gcs.Config" type: GCS config: - bucket: example-bucket + bucket: "" + service_account: "" ``` -[embedmd]:# (flags/tools_bucket_downsample.txt $) -```$ +```$ mdox-exec="thanos tools bucket downsample --help" usage: thanos tools bucket downsample [] Continuously downsamples blocks in an object store bucket. @@ -591,7 +582,7 @@ Flags: are: "", "SHA256". -h, --help Show context-sensitive help (also try --help-long and --help-man). - --http-address="0.0.0.0:10902" + --http-address="0.0.0.0:10902" Listen host:port for HTTP endpoints. --http-grace-period=2m Time to wait after an interrupt received for HTTP Server. @@ -601,22 +592,22 @@ Flags: --log.format=logfmt Log format to use. Possible options: logfmt or json. --log.level=info Log filtering level. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag (mutually exclusive). Content of YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. 
See format details: https://thanos.io/tip/thanos/tracing.md/#configuration @@ -638,14 +629,14 @@ thanos tools bucket mark \ The example content of `bucket.yml`: -```yaml +```yaml mdox-exec="go run scripts/cfggen/main.go --name=gcs.Config" type: GCS config: - bucket: example-bucket + bucket: "" + service_account: "" ``` -[embedmd]:# (flags/tools_bucket_mark.txt $) -```$ +```$ mdox-exec="thanos tools bucket mark --help" usage: thanos tools bucket mark --id=ID --marker=MARKER --details=DETAILS Mark block for deletion or no-compact in a safe way. NOTE: If the compactor is @@ -661,21 +652,21 @@ Flags: --log.format=logfmt Log format to use. Possible options: logfmt or json. --log.level=info Log filtering level. --marker=MARKER Marker to be put. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag (mutually exclusive). Content of YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration @@ -708,8 +699,7 @@ By default, rewrite also produces `change.log` in the tmp local dir. 
Look for lo ts=2020-11-09T00:40:13.703322181Z caller=level.go:63 level=info msg="changelog will be available" file=/tmp/thanos-rewrite/01EPN74E401ZD2SQXS4SRY6DZX/change.log` ``` -[embedmd]:# (flags/tools_bucket_rewrite.txt $) -```$ +```$ mdox-exec="thanos tools bucket rewrite --help" usage: thanos tools bucket rewrite --id=ID [] Rewrite chosen blocks in the bucket, while deleting or modifying series Resulted @@ -744,13 +734,13 @@ Flags: --log.format=logfmt Log format to use. Possible options: logfmt or json. --log.level=info Log filtering level. - --objstore.config= + --objstore.config= Alternative to 'objstore.config-file' flag (mutually exclusive). Content of YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration - --objstore.config-file= + --objstore.config-file= Path to YAML file that contains object store configuration. See format details: https://thanos.io/tip/thanos/storage.md/#configuration @@ -760,31 +750,31 @@ Flags: --rewrite.add-change-log If specified, all modifications are written to new block directory. Disable if latency is to high. - --rewrite.to-delete-config= + --rewrite.to-delete-config= Alternative to 'rewrite.to-delete-config-file' flag (mutually exclusive). Content of YAML file that contains []metadata.DeletionRequest that will be applied to blocks - --rewrite.to-delete-config-file= + --rewrite.to-delete-config-file= Path to YAML file that contains []metadata.DeletionRequest that will be applied to blocks - --rewrite.to-relabel-config= + --rewrite.to-relabel-config= Alternative to 'rewrite.to-relabel-config-file' flag (mutually exclusive). 
Content of YAML file that contains relabel configs that will be applied to blocks - --rewrite.to-relabel-config-file= + --rewrite.to-relabel-config-file= Path to YAML file that contains relabel configs that will be applied to blocks - --tmp.dir="/tmp/thanos-rewrite" + --tmp.dir="/tmp/thanos-rewrite" Working directory for temporary files - --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration @@ -798,8 +788,7 @@ The `tools rules-check` subcommand contains tools for validation of Prometheus r This is allowing to check the rules with the same validation as is used by the Thanos Ruler node. -NOTE: The check is equivalent to the `promtool check rules` with addition of Thanos Ruler extended rules file syntax, -which includes `partial_response_strategy` field which `promtool` does not allow. +NOTE: The check is equivalent to the `promtool check rules` with addition of Thanos Ruler extended rules file syntax, which includes `partial_response_strategy` field which `promtool` does not allow. If the check fails the command fails with exit code `1`, otherwise `0`. @@ -809,8 +798,7 @@ Example: ./thanos tools rules-check --rules cmd/thanos/testdata/rules-files/*.yaml ``` -[embedmd]:# (flags/tools_rules-check.txt $) -```$ +```$ mdox-exec="thanos tools rules-check --help" usage: thanos tools rules-check --rules=RULES Check if the rule files are valid or not. @@ -821,12 +809,12 @@ Flags: --log.format=logfmt Log format to use. Possible options: logfmt or json. --log.level=info Log filtering level. --rules=RULES ... The rule files glob to check (repeated). 
- --tracing.config= + --tracing.config= Alternative to 'tracing.config-file' flag (mutually exclusive). Content of YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration - --tracing.config-file= + --tracing.config-file= Path to YAML file with tracing configuration. See format details: https://thanos.io/tip/thanos/tracing.md/#configuration diff --git a/docs/contributing/_index.md b/docs/contributing/_index.md index 4e48cb2fd1..3ec7a0a4a2 100644 --- a/docs/contributing/_index.md +++ b/docs/contributing/_index.md @@ -1,3 +1,5 @@ --- -title: "Contributing:" +title: 'Contributing:' --- + + diff --git a/docs/contributing/coding-style-guide.md b/docs/contributing/coding-style-guide.md index 0fa43985af..8eb238ad26 100644 --- a/docs/contributing/coding-style-guide.md +++ b/docs/contributing/coding-style-guide.md @@ -1,14 +1,12 @@ --- -title: Coding Style Guide type: docs +title: Coding Style Guide menu: contributing --- # Thanos Coding Style Guide -This document details the official style guides for the various languages we use in the Thanos project. -Feel free to familiarize yourself with and refer to this document during code reviews. If something in our codebase does not match the style, it means it -was missed or it was written before this document. Help wanted to fix it! (: +This document details the official style guides for the various languages we use in the Thanos project. Feel free to familiarize yourself with and refer to this document during code reviews. If something in our codebase does not match the style, it means it was missed or it was written before this document. Help wanted to fix it! (: Generally, we care about: @@ -18,18 +16,13 @@ Generally, we care about: * Testability. Even if it means some changes to the production code, like `timeNow func() time.Time` mock. * Consistency: If some pattern repeats, it means fewer surprises. 
-Some style is enforced by our linters and is covered in separate smaller sections. Please look there if you want to -embrace some of the rules in your own project! For Thanos developers, we recommend reading sections about rules to manually apply during -development. Some of those are currently impossible to detect with linters. Ideally, everything would be automated. (: +Some style is enforced by our linters and is covered in separate smaller sections. Please look there if you want to embrace some of the rules in your own project! For Thanos developers, we recommend reading sections about rules to manually apply during development. Some of those are currently impossible to detect with linters. Ideally, everything would be automated. (: # Go -For code written in [Go](https://golang.org/) we use the standard Go style guides ([Effective Go](https://golang.org/doc/effective_go.html), -[CodeReviewComments](https://github.com/golang/go/wiki/CodeReviewComments)) with a few additional rules that make certain areas stricter -than the standard guides. This ensures even better consistency in modern distributed system databases like Thanos, where reliability, performance, -and maintainability are extremely important. +For code written in [Go](https://golang.org/) we use the standard Go style guides ([Effective Go](https://golang.org/doc/effective_go.html), [CodeReviewComments](https://github.com/golang/go/wiki/CodeReviewComments)) with a few additional rules that make certain areas stricter than the standard guides. This ensures even better consistency in modern distributed system databases like Thanos, where reliability, performance, and maintainability are extremely important. -Go in Thanos +Go in Thanos