From b9de64d509dc155841c86eddb2dcdc4334a70a18 Mon Sep 17 00:00:00 2001 From: Kemal Akkoyun Date: Wed, 16 Oct 2019 14:31:30 +0200 Subject: [PATCH 01/10] Start metric and status probe server as soon as possible Signed-off-by: Kemal Akkoyun --- cmd/thanos/compact.go | 1 + cmd/thanos/downsample.go | 11 ++++--- cmd/thanos/query.go | 62 ++++++++++++++++++------------------ cmd/thanos/receive.go | 12 +++---- cmd/thanos/rule.go | 69 ++++++++++++++++++++-------------------- cmd/thanos/sidecar.go | 1 + cmd/thanos/store.go | 9 +++--- 7 files changed, 85 insertions(+), 80 deletions(-) diff --git a/cmd/thanos/compact.go b/cmd/thanos/compact.go index 12f6573a0e..749a76ab8c 100644 --- a/cmd/thanos/compact.go +++ b/cmd/thanos/compact.go @@ -173,6 +173,7 @@ func runCompact( downsampleMetrics := newDownsampleMetrics(reg) + level.Debug(logger).Log("msg", "setting up http server") statusProber := prober.NewProber(component, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, component); err != nil { diff --git a/cmd/thanos/downsample.go b/cmd/thanos/downsample.go index b6b2397127..60da138382 100644 --- a/cmd/thanos/downsample.go +++ b/cmd/thanos/downsample.go @@ -95,7 +95,13 @@ func runDownsample( }() metrics := newDownsampleMetrics(reg) + level.Debug(logger).Log("msg", "setting up http server") statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) + // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. + if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, comp); err != nil { + return errors.Wrap(err, "schedule HTTP server with probe") + } + // Start cycle of syncing blocks from the bucket and garbage collecting the bucket. { ctx, cancel := context.WithCancel(context.Background()) @@ -122,11 +128,6 @@ func runDownsample( }) } - // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. - if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, comp); err != nil { - return errors.Wrap(err, "schedule HTTP server with probe") - } - level.Info(logger).Log("msg", "starting downsample node") return nil } diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index 6821e6fd36..5dafd9798a 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -342,6 +342,37 @@ func runQuery( stores.Close() }) } + // Start query API + UI, metrics and status probe HTTP server. + level.Debug(logger).Log("msg", "setting up http server") + statusProber := prober.NewProber(comp, logger, reg) + { + router := route.New() + + // Redirect from / to /webRoutePrefix. + if webRoutePrefix != "" { + router.Get("/", func(w http.ResponseWriter, r *http.Request) { + http.Redirect(w, r, webRoutePrefix, http.StatusFound) + }) + } + + flagsMap := map[string]string{ + // TODO(bplotka in PR #513 review): pass all flags, not only the flags needed by prefix rewriting. + "web.external-prefix": webExternalPrefix, + "web.prefix-header": webPrefixHeaderName, + } + + ins := extpromhttp.NewInstrumentationMiddleware(reg) + ui.NewQueryUI(logger, reg, stores, flagsMap).Register(router.WithPrefix(webRoutePrefix), ins) + + api := v1.NewAPI(logger, reg, engine, queryableCreator, enableAutodownsampling, enablePartialResponse, replicaLabels, instantDefaultMaxSourceResolution) + + api.Register(router.WithPrefix(path.Join(webRoutePrefix, "/api/v1")), tracer, logger, ins) + + // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. + if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, router, comp); err != nil { + return errors.Wrap(err, "schedule HTTP server with probes") + } + } // Run File Service Discovery and update the store set when the files are modified. if fileSD != nil { var fileSDUpdates chan []*targetgroup.Group @@ -389,37 +420,6 @@ func runQuery( cancel() }) } - // Start query API + UI HTTP server. - - statusProber := prober.NewProber(comp, logger, reg) - { - router := route.New() - - // Redirect from / to /webRoutePrefix. - if webRoutePrefix != "" { - router.Get("/", func(w http.ResponseWriter, r *http.Request) { - http.Redirect(w, r, webRoutePrefix, http.StatusFound) - }) - } - - flagsMap := map[string]string{ - // TODO(bplotka in PR #513 review): pass all flags, not only the flags needed by prefix rewriting. - "web.external-prefix": webExternalPrefix, - "web.prefix-header": webPrefixHeaderName, - } - - ins := extpromhttp.NewInstrumentationMiddleware(reg) - ui.NewQueryUI(logger, reg, stores, flagsMap).Register(router.WithPrefix(webRoutePrefix), ins) - - api := v1.NewAPI(logger, reg, engine, queryableCreator, enableAutodownsampling, enablePartialResponse, replicaLabels, instantDefaultMaxSourceResolution) - - api.Register(router.WithPrefix(path.Join(webRoutePrefix, "/api/v1")), tracer, logger, ins) - - // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. - if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, router, comp); err != nil { - return errors.Wrap(err, "schedule HTTP server with probes") - } - } // Start query (proxy) gRPC StoreAPI. { l, err := net.Listen("tcp", grpcBindAddr) diff --git a/cmd/thanos/receive.go b/cmd/thanos/receive.go index 9639154037..f8713ad9ae 100644 --- a/cmd/thanos/receive.go +++ b/cmd/thanos/receive.go @@ -163,7 +163,13 @@ func runReceive( Tracer: tracer, }) + level.Debug(logger).Log("msg", "setting up http server") statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) + // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. + if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, comp); err != nil { + return errors.Wrap(err, "schedule HTTP server with probes") + } + confContentYaml, err := objStoreConfig.Content() if err != nil { return err @@ -302,12 +308,6 @@ func runReceive( ) } - level.Debug(logger).Log("msg", "setting up http server") - // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. - if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, comp); err != nil { - return errors.Wrap(err, "schedule HTTP server with probes") - } - level.Debug(logger).Log("msg", "setting up grpc server") { var ( diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index b5cc568ebf..73973e2db3 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -483,6 +483,41 @@ func runRule( close(cancel) }) } + // Start query API + UI, metrics and status probe HTTP server. + level.Debug(logger).Log("msg", "setting up http server") + statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) + { + router := route.New() + + // Redirect from / to /webRoutePrefix. + if webRoutePrefix != "" { + router.Get("/", func(w http.ResponseWriter, r *http.Request) { + http.Redirect(w, r, webRoutePrefix, http.StatusFound) + }) + } + + router.WithPrefix(webRoutePrefix).Post("/-/reload", func(w http.ResponseWriter, r *http.Request) { + reload <- struct{}{} + }) + + flagsMap := map[string]string{ + // TODO(bplotka in PR #513 review): pass all flags, not only the flags needed by prefix rewriting. + "web.external-prefix": webExternalPrefix, + "web.prefix-header": webPrefixHeaderName, + } + + ins := extpromhttp.NewInstrumentationMiddleware(reg) + + ui.NewRuleUI(logger, reg, ruleMgrs, alertQueryURL.String(), flagsMap).Register(router.WithPrefix(webRoutePrefix), ins) + + api := v1.NewAPI(logger, reg, ruleMgrs) + api.Register(router.WithPrefix(path.Join(webRoutePrefix, "/api/v1")), tracer, logger, ins) + + // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. + if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, router, comp); err != nil { + return errors.Wrap(err, "schedule HTTP server with probes") + } + } // Periodically update the addresses from static flags and file SD by resolving them using DNS SD if necessary. { ctx, cancel := context.WithCancel(context.Background()) @@ -495,7 +530,6 @@ func runRule( cancel() }) } - statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) // Start gRPC server. { l, err := net.Listen("tcp", grpcBindAddr) @@ -519,39 +553,6 @@ func runRule( s.Stop() }) } - // Start UI & metrics HTTP server. - { - router := route.New() - - // Redirect from / to /webRoutePrefix. - if webRoutePrefix != "" { - router.Get("/", func(w http.ResponseWriter, r *http.Request) { - http.Redirect(w, r, webRoutePrefix, http.StatusFound) - }) - } - - router.WithPrefix(webRoutePrefix).Post("/-/reload", func(w http.ResponseWriter, r *http.Request) { - reload <- struct{}{} - }) - - flagsMap := map[string]string{ - // TODO(bplotka in PR #513 review): pass all flags, not only the flags needed by prefix rewriting. - "web.external-prefix": webExternalPrefix, - "web.prefix-header": webPrefixHeaderName, - } - - ins := extpromhttp.NewInstrumentationMiddleware(reg) - - ui.NewRuleUI(logger, reg, ruleMgrs, alertQueryURL.String(), flagsMap).Register(router.WithPrefix(webRoutePrefix), ins) - - api := v1.NewAPI(logger, reg, ruleMgrs) - api.Register(router.WithPrefix(path.Join(webRoutePrefix, "/api/v1")), tracer, logger, ins) - - // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. - if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, router, comp); err != nil { - return errors.Wrap(err, "schedule HTTP server with probes") - } - } confContentYaml, err := objStoreConfig.Content() if err != nil { diff --git a/cmd/thanos/sidecar.go b/cmd/thanos/sidecar.go index dfc2b4ac56..5c72f5a9b4 100644 --- a/cmd/thanos/sidecar.go +++ b/cmd/thanos/sidecar.go @@ -134,6 +134,7 @@ func runSidecar( uploads = false } + level.Debug(logger).Log("msg", "setting up http server") statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, comp); err != nil { diff --git a/cmd/thanos/store.go b/cmd/thanos/store.go index 139a7c8a74..1a1cad0706 100644 --- a/cmd/thanos/store.go +++ b/cmd/thanos/store.go @@ -126,7 +126,12 @@ func runStore( selectorRelabelConf *extflag.PathOrContent, advertiseCompatibilityLabel bool, ) error { + level.Debug(logger).Log("msg", "setting up http server") + // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. statusProber := prober.NewProber(component, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) + if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, component); err != nil { + return errors.Wrap(err, "schedule HTTP server") + } confContentYaml, err := objStoreConfig.Content() if err != nil { @@ -228,10 +233,6 @@ func runStore( s.Stop() }) - if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, component); err != nil { - return errors.Wrap(err, "schedule HTTP server") - } - level.Info(logger).Log("msg", "starting store node") return nil } From e8e0bb7566fc538424afdd8fadb7638dd152690e Mon Sep 17 00:00:00 2001 From: Kemal Akkoyun Date: Wed, 16 Oct 2019 14:46:13 +0200 Subject: [PATCH 02/10] Update changelog Signed-off-by: Kemal Akkoyun --- CHANGELOG.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7c45a703b..c1fab1aa2d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,10 @@ We use *breaking* word for marking changes that are not backward compatible (rel - [#1660](https://github.com/thanos-io/thanos/pull/1660) Add a new `--prometheus.ready_timeout` CLI option to the sidecar to set how long to wait until Prometheus starts up. +### Fixed + +- [#1656](https://github.com/thanos-io/thanos/pull/1656) Thanos components now starts metric and status probe HTTP server earlier in their start-up sequence. + ## [v0.8.1](https://github.com/thanos-io/thanos/releases/tag/v0.8.1) - 2019.10.14 ### Fixed @@ -23,12 +27,12 @@ We use *breaking* word for marking changes that are not backward compatible (rel * NOTE: `thanos_store_nodes_grpc_connections` metric is now per `external_labels` and `store_type`. It is a recommended metric for Querier storeAPIs. `thanos_store_node_info` is marked as obsolete and will be removed in next release. * NOTE2: Store Gateway is now advertising artificial: `"@thanos_compatibility_store_type=store"` label. This is to have the current Store Gateway compatible with Querier pre v0.8.0. This label can be disabled by hidden `debug.advertise-compatibility-label=false` flag on Store Gateway. - + ## [v0.8.0](https://github.com/thanos-io/thanos/releases/tag/v0.8.0) - 2019.10.10 Lot's of improvements this release! Noteworthy items: - First Katacoda tutorial! 🐱 -- Fixed Deletion order causing Compactor to produce not needed 👻 blocks with missing random files. +- Fixed Deletion order causing Compactor to produce not needed 👻 blocks with missing random files. - Store GW memory improvements (more to come!). - Querier allows multiple deduplication labels. - Both Compactor and Store Gateway can be **sharded** within the same bucket using relabelling! @@ -42,7 +46,7 @@ both Prometheus and sidecar with Thanos: https://prometheus.io/blog/2019/10/10/r - [#1619](https://github.com/thanos-io/thanos/pull/1619) Thanos sidecar allows to limit min time range for data it exposes from Prometheus. - [#1583](https://github.com/thanos-io/thanos/pull/1583) Thanos sharding: - - Add relabel config (`--selector.relabel-config-file` and `selector.relabel-config`) into Thanos Store and Compact components. + - Add relabel config (`--selector.relabel-config-file` and `selector.relabel-config`) into Thanos Store and Compact components. Selecting blocks to serve depends on the result of block labels relabeling. - For store gateway, advertise labels from "approved" blocks. - [#1540](https://github.com/thanos-io/thanos/pull/1540) Thanos Downsample added `/-/ready` and `/-/healthy` endpoints. @@ -55,8 +59,8 @@ Selecting blocks to serve depends on the result of block labels relabeling. - [#1362](https://github.com/thanos-io/thanos/pull/1362) Optional `replicaLabels` param for `/query` and `/query_range` querier endpoints. When provided overwrite the `query.replica-label` cli flags. - [#1482](https://github.com/thanos-io/thanos/pull/1482) Thanos now supports Elastic APM as tracing provider. -- [#1612](https://github.com/thanos-io/thanos/pull/1612) Thanos Rule added `resendDelay` flag. -- [#1480](https://github.com/thanos-io/thanos/pull/1480) Thanos Receive flushes storage on hashring change. +- [#1612](https://github.com/thanos-io/thanos/pull/1612) Thanos Rule added `resendDelay` flag. +- [#1480](https://github.com/thanos-io/thanos/pull/1480) Thanos Receive flushes storage on hashring change. - [#1613](https://github.com/thanos-io/thanos/pull/1613) Thanos Receive now traces forwarded requests. ### Changed @@ -76,7 +80,7 @@ once for multiple deduplication labels like: `--query.replica-label=prometheus_r - [#1544](https://github.com/thanos-io/thanos/pull/1544) Iterating over object store is resilient to the edge case for some providers. - [#1469](https://github.com/thanos-io/thanos/pull/1469) Fixed Azure potential failures (EOF) when requesting more data then blob has. - [#1512](https://github.com/thanos-io/thanos/pull/1512) Thanos Store fixed memory leak for chunk pool. -- [#1488](https://github.com/thanos-io/thanos/pull/1488) Thanos Rule now now correctly links to query URL from rules and alerts. +- [#1488](https://github.com/thanos-io/thanos/pull/1488) Thanos Rule now now correctly links to query URL from rules and alerts. ## [v0.7.0](https://github.com/thanos-io/thanos/releases/tag/v0.7.0) - 2019.09.02 From b6a22a0b199823e4e79aea3767d8b311161294e5 Mon Sep 17 00:00:00 2001 From: Kemal Akkoyun Date: Wed, 16 Oct 2019 16:23:44 +0200 Subject: [PATCH 03/10] Schedule a separate goroutine to start server Signed-off-by: Kemal Akkoyun --- cmd/thanos/main.go | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/cmd/thanos/main.go b/cmd/thanos/main.go index 74550c558b..a1e50dc4d6 100644 --- a/cmd/thanos/main.go +++ b/cmd/thanos/main.go @@ -333,7 +333,7 @@ func newStoreGRPCServer(logger log.Logger, reg prometheus.Registerer, tracer ope return s } -// scheduleHTTPServer starts a run.Group that servers HTTP endpoint with default endpoints providing Prometheus metrics, +// scheduleHTTPServer starts a goroutine that servers HTTP endpoint with default endpoints providing Prometheus metrics, // profiling and liveness/readiness probes. func scheduleHTTPServer(g *run.Group, logger log.Logger, reg *prometheus.Registry, readinessProber *prober.Prober, httpBindAddr string, handler http.Handler, comp component.Component) error { mux := http.NewServeMux() @@ -349,13 +349,29 @@ func scheduleHTTPServer(g *run.Group, logger log.Logger, reg *prometheus.Registr return errors.Wrap(err, "listen metrics address") } - g.Add(func() error { - level.Info(logger).Log("msg", "listening for requests and metrics", "component", comp.String(), "address", httpBindAddr) + srv := http.Server{Handler: mux} + // This spawns a goroutine independent from rungroup to start Server as soon as it's scheduled. + ctx, cancel := context.WithCancel(context.Background()) + go func() { readinessProber.SetHealthy() - return errors.Wrapf(http.Serve(l, mux), "serve %s and metrics", comp.String()) + level.Info(logger).Log("msg", "listening for requests and metrics", "component", comp.String(), "address", httpBindAddr) + if err := srv.Serve(l); err != http.ErrServerClosed { + level.Error(logger).Log(err, "serve %s and metrics", comp.String()) + } + cancel() + }() + + // Synchronize with rungroup to catch interrupts. + g.Add(func() error { + <-ctx.Done() + return ctx.Err() }, func(err error) { readinessProber.SetNotHealthy(err) runutil.CloseWithLogOnErr(logger, l, "%s and metric listener", comp.String()) + if err := srv.Shutdown(context.Background()); err != nil { + level.Error(logger).Log(err, "HTTP server Shutdown: %v", comp.String()) + } + cancel() }) return nil } From 22255ab4f3d5eb6c6ae09abdd225f64def76ee8d Mon Sep 17 00:00:00 2001 From: Kemal Akkoyun Date: Thu, 17 Oct 2019 14:57:10 +0200 Subject: [PATCH 04/10] Add InitSync to the rungroup Signed-off-by: Kemal Akkoyun --- cmd/thanos/compact.go | 13 ++++----- cmd/thanos/downsample.go | 2 +- cmd/thanos/main.go | 23 +++------------ cmd/thanos/query.go | 1 - cmd/thanos/receive.go | 15 +++++----- cmd/thanos/rule.go | 2 -- cmd/thanos/sidecar.go | 22 +++++++-------- cmd/thanos/store.go | 60 +++++++++++++++++++++++++--------------- 8 files changed, 66 insertions(+), 72 deletions(-) diff --git a/cmd/thanos/compact.go b/cmd/thanos/compact.go index 749a76ab8c..48563823f8 100644 --- a/cmd/thanos/compact.go +++ b/cmd/thanos/compact.go @@ -173,13 +173,6 @@ func runCompact( downsampleMetrics := newDownsampleMetrics(reg) - level.Debug(logger).Log("msg", "setting up http server") - statusProber := prober.NewProber(component, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) - // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. - if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, component); err != nil { - return errors.Wrap(err, "schedule HTTP server with probes") - } - confContentYaml, err := objStoreConfig.Content() if err != nil { return err @@ -248,6 +241,12 @@ func runCompact( return errors.Wrap(err, "create bucket compactor") } + statusProber := prober.NewProber(component, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) + // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. + if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, component); err != nil { + return errors.Wrap(err, "schedule HTTP server with probes") + } + if retentionByResolution[compact.ResolutionLevelRaw].Seconds() != 0 { level.Info(logger).Log("msg", "retention policy of raw samples is enabled", "duration", retentionByResolution[compact.ResolutionLevelRaw]) } diff --git a/cmd/thanos/downsample.go b/cmd/thanos/downsample.go index 60da138382..8cb5d029ed 100644 --- a/cmd/thanos/downsample.go +++ b/cmd/thanos/downsample.go @@ -95,7 +95,7 @@ func runDownsample( }() metrics := newDownsampleMetrics(reg) - level.Debug(logger).Log("msg", "setting up http server") + statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, comp); err != nil { diff --git a/cmd/thanos/main.go b/cmd/thanos/main.go index a1e50dc4d6..5a66c98961 100644 --- a/cmd/thanos/main.go +++ b/cmd/thanos/main.go @@ -349,29 +349,14 @@ func scheduleHTTPServer(g *run.Group, logger log.Logger, reg *prometheus.Registr return errors.Wrap(err, "listen metrics address") } - srv := http.Server{Handler: mux} - // This spawns a goroutine independent from rungroup to start Server as soon as it's scheduled. - ctx, cancel := context.WithCancel(context.Background()) - go func() { - readinessProber.SetHealthy() - level.Info(logger).Log("msg", "listening for requests and metrics", "component", comp.String(), "address", httpBindAddr) - if err := srv.Serve(l); err != http.ErrServerClosed { - level.Error(logger).Log(err, "serve %s and metrics", comp.String()) - } - cancel() - }() - - // Synchronize with rungroup to catch interrupts. g.Add(func() error { - <-ctx.Done() - return ctx.Err() + level.Info(logger).Log("msg", "listening for requests and metrics", "component", comp.String(), "address", httpBindAddr) + readinessProber.SetHealthy() + return errors.Wrapf(http.Serve(l, mux), "serve %s and metrics", comp.String()) }, func(err error) { readinessProber.SetNotHealthy(err) + l.Close() runutil.CloseWithLogOnErr(logger, l, "%s and metric listener", comp.String()) - if err := srv.Shutdown(context.Background()); err != nil { - level.Error(logger).Log(err, "HTTP server Shutdown: %v", comp.String()) - } - cancel() }) return nil } diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index 5dafd9798a..16850bbce8 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -343,7 +343,6 @@ func runQuery( }) } // Start query API + UI, metrics and status probe HTTP server. - level.Debug(logger).Log("msg", "setting up http server") statusProber := prober.NewProber(comp, logger, reg) { router := route.New() diff --git a/cmd/thanos/receive.go b/cmd/thanos/receive.go index f8713ad9ae..98bb98c4f3 100644 --- a/cmd/thanos/receive.go +++ b/cmd/thanos/receive.go @@ -9,6 +9,7 @@ import ( "time" "github.com/thanos-io/thanos/pkg/extflag" + "github.com/thanos-io/thanos/pkg/prober" "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" @@ -22,7 +23,6 @@ import ( "github.com/thanos-io/thanos/pkg/block/metadata" "github.com/thanos-io/thanos/pkg/component" "github.com/thanos-io/thanos/pkg/objstore/client" - "github.com/thanos-io/thanos/pkg/prober" "github.com/thanos-io/thanos/pkg/receive" "github.com/thanos-io/thanos/pkg/runutil" "github.com/thanos-io/thanos/pkg/shipper" @@ -163,13 +163,6 @@ func runReceive( Tracer: tracer, }) - level.Debug(logger).Log("msg", "setting up http server") - statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) - // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. - if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, comp); err != nil { - return errors.Wrap(err, "schedule HTTP server with probes") - } - confContentYaml, err := objStoreConfig.Content() if err != nil { return err @@ -180,6 +173,12 @@ func runReceive( upload = false } + statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) + // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. + if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, comp); err != nil { + return errors.Wrap(err, "schedule HTTP server with probes") + } + // Start all components while we wait for TSDB to open but only load // initial config and mark ourselves as ready after it completed. diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index 73973e2db3..391a20607c 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -280,7 +280,6 @@ func runRule( extprom.WrapRegistererWithPrefix("thanos_ruler_query_apis_", reg), dns.ResolverType(dnsSDResolver), ) - // Run rule evaluation and alert notifications. var ( alertmgrs = newAlertmanagerSet(logger, alertmgrURLs, dns.ResolverType(dnsSDResolver)) @@ -484,7 +483,6 @@ func runRule( }) } // Start query API + UI, metrics and status probe HTTP server. - level.Debug(logger).Log("msg", "setting up http server") statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) { router := route.New() diff --git a/cmd/thanos/sidecar.go b/cmd/thanos/sidecar.go index 5c72f5a9b4..0d02ffd5b6 100644 --- a/cmd/thanos/sidecar.go +++ b/cmd/thanos/sidecar.go @@ -112,6 +112,16 @@ func runSidecar( comp component.Component, limitMinTime thanosmodel.TimeOrDurationValue, ) error { + promUp := prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "thanos_sidecar_prometheus_up", + Help: "Boolean indicator whether the sidecar can reach its Prometheus peer.", + }) + lastHeartbeat := prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "thanos_sidecar_last_heartbeat_success_time_seconds", + Help: "Second timestamp of the last successful heartbeat.", + }) + reg.MustRegister(promUp, lastHeartbeat) + var m = &promMetadata{ promURL: promURL, @@ -134,7 +144,6 @@ func runSidecar( uploads = false } - level.Debug(logger).Log("msg", "setting up http server") statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, comp); err != nil { @@ -143,16 +152,6 @@ func runSidecar( // Setup all the concurrent groups. { - promUp := prometheus.NewGauge(prometheus.GaugeOpts{ - Name: "thanos_sidecar_prometheus_up", - Help: "Boolean indicator whether the sidecar can reach its Prometheus peer.", - }) - lastHeartbeat := prometheus.NewGauge(prometheus.GaugeOpts{ - Name: "thanos_sidecar_last_heartbeat_success_time_seconds", - Help: "Second timestamp of the last successful heartbeat.", - }) - reg.MustRegister(promUp, lastHeartbeat) - ctx, cancel := context.WithCancel(context.Background()) g.Add(func() error { // Only check Prometheus's flags when upload is enabled. @@ -221,7 +220,6 @@ func runSidecar( cancel() }) } - { l, err := net.Listen("tcp", grpcBindAddr) if err != nil { diff --git a/cmd/thanos/store.go b/cmd/thanos/store.go index 1a1cad0706..d206501b38 100644 --- a/cmd/thanos/store.go +++ b/cmd/thanos/store.go @@ -126,7 +126,6 @@ func runStore( selectorRelabelConf *extflag.PathOrContent, advertiseCompatibilityLabel bool, ) error { - level.Debug(logger).Log("msg", "setting up http server") // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. statusProber := prober.NewProber(component, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, component); err != nil { @@ -190,29 +189,45 @@ func runStore( return errors.Wrap(err, "create object storage store") } - begin := time.Now() - level.Debug(logger).Log("msg", "initializing bucket store") - if err := bs.InitialSync(context.Background()); err != nil { - return errors.Wrap(err, "bucket store initial sync") - } - level.Debug(logger).Log("msg", "bucket store ready", "init_duration", time.Since(begin).String()) - - ctx, cancel := context.WithCancel(context.Background()) - g.Add(func() error { - defer runutil.CloseWithLogOnErr(logger, bkt, "bucket client") - - err := runutil.Repeat(syncInterval, ctx.Done(), func() error { - if err := bs.SyncBlocks(ctx); err != nil { - level.Warn(logger).Log("msg", "syncing blocks failed", "err", err) + // bucketStoreReady signals when bucket store is ready. + bucketStoreReady := make(chan struct{}) + { + begin := time.Now() + ctx, cancel := context.WithCancel(context.Background()) + g.Add(func() error { + level.Info(logger).Log("msg", "initializing bucket store") + if err := bs.InitialSync(ctx); err != nil { + close(bucketStoreReady) + return errors.Wrap(err, "bucket store initial sync") } - return nil + level.Info(logger).Log("msg", "bucket store ready", "init_duration", time.Since(begin).String()) + close(bucketStoreReady) + <-ctx.Done() + return ctx.Err() + }, func(error) { + cancel() }) + } - runutil.CloseWithLogOnErr(logger, bs, "bucket store") - return err - }, func(error) { - cancel() - }) + { + ctx, cancel := context.WithCancel(context.Background()) + g.Add(func() error { + <-bucketStoreReady + defer runutil.CloseWithLogOnErr(logger, bkt, "bucket client") + + err := runutil.Repeat(syncInterval, ctx.Done(), func() error { + if err := bs.SyncBlocks(ctx); err != nil { + level.Warn(logger).Log("msg", "syncing blocks failed", "err", err) + } + return nil + }) + + runutil.CloseWithLogOnErr(logger, bs, "bucket store") + return err + }, func(error) { + cancel() + }) + } l, err := net.Listen("tcp", grpcBindAddr) if err != nil { @@ -226,7 +241,8 @@ func runStore( s := newStoreGRPCServer(logger, reg, tracer, bs, opts) g.Add(func() error { - level.Info(logger).Log("msg", "Listening for StoreAPI gRPC", "address", grpcBindAddr) + <-bucketStoreReady + level.Info(logger).Log("msg", "listening for StoreAPI gRPC", "address", grpcBindAddr) statusProber.SetReady() return errors.Wrap(s.Serve(l), "serve gRPC") }, func(error) { From dc6c73b8b69efb82e027b70dfdd32769679e413f Mon Sep 17 00:00:00 2001 From: Kemal Akkoyun Date: Thu, 17 Oct 2019 15:09:18 +0200 Subject: [PATCH 05/10] Fix linter pointed issues Signed-off-by: Kemal Akkoyun --- cmd/thanos/compact.go | 1 + cmd/thanos/main.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cmd/thanos/compact.go b/cmd/thanos/compact.go index 48563823f8..c3e8006e63 100644 --- a/cmd/thanos/compact.go +++ b/cmd/thanos/compact.go @@ -244,6 +244,7 @@ func runCompact( statusProber := prober.NewProber(component, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, component); err != nil { + cancel() return errors.Wrap(err, "schedule HTTP server with probes") } diff --git a/cmd/thanos/main.go b/cmd/thanos/main.go index 5a66c98961..3c31998499 100644 --- a/cmd/thanos/main.go +++ b/cmd/thanos/main.go @@ -333,7 +333,7 @@ func newStoreGRPCServer(logger log.Logger, reg prometheus.Registerer, tracer ope return s } -// scheduleHTTPServer starts a goroutine that servers HTTP endpoint with default endpoints providing Prometheus metrics, +// scheduleHTTPServer starts a run.Group that servers HTTP endpoint with default endpoints providing Prometheus metrics, // profiling and liveness/readiness probes. func scheduleHTTPServer(g *run.Group, logger log.Logger, reg *prometheus.Registry, readinessProber *prober.Prober, httpBindAddr string, handler http.Handler, comp component.Component) error { mux := http.NewServeMux() From 2d6ef8881963a2d502983fa58f224dbff4c4b8b7 Mon Sep 17 00:00:00 2001 From: Kemal Akkoyun Date: Thu, 17 Oct 2019 16:07:31 +0200 Subject: [PATCH 06/10] Move InitSync to alreay existed run.Group Signed-off-by: Kemal Akkoyun --- cmd/thanos/main.go | 1 - cmd/thanos/store.go | 14 ++------------ 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/cmd/thanos/main.go b/cmd/thanos/main.go index 3c31998499..74550c558b 100644 --- a/cmd/thanos/main.go +++ b/cmd/thanos/main.go @@ -355,7 +355,6 @@ func scheduleHTTPServer(g *run.Group, logger log.Logger, reg *prometheus.Registr return errors.Wrapf(http.Serve(l, mux), "serve %s and metrics", comp.String()) }, func(err error) { readinessProber.SetNotHealthy(err) - l.Close() runutil.CloseWithLogOnErr(logger, l, "%s and metric listener", comp.String()) }) return nil diff --git a/cmd/thanos/store.go b/cmd/thanos/store.go index d206501b38..496b0ff65c 100644 --- a/cmd/thanos/store.go +++ b/cmd/thanos/store.go @@ -195,6 +195,8 @@ func runStore( begin := time.Now() ctx, cancel := context.WithCancel(context.Background()) g.Add(func() error { + defer runutil.CloseWithLogOnErr(logger, bkt, "bucket client") + level.Info(logger).Log("msg", "initializing bucket store") if err := bs.InitialSync(ctx); err != nil { close(bucketStoreReady) @@ -202,18 +204,6 @@ func runStore( } level.Info(logger).Log("msg", "bucket store ready", "init_duration", time.Since(begin).String()) close(bucketStoreReady) - <-ctx.Done() - return ctx.Err() - }, func(error) { - cancel() - }) - } - - { - ctx, cancel := context.WithCancel(context.Background()) - g.Add(func() error { - <-bucketStoreReady - defer runutil.CloseWithLogOnErr(logger, bkt, "bucket client") err := runutil.Repeat(syncInterval, ctx.Done(), func() error { if err := bs.SyncBlocks(ctx); err != nil { From 8499f3bbea2a324ab185b0e2751cd94b9cae06f2 Mon Sep 17 00:00:00 2001 From: Kemal Akkoyun Date: Thu, 17 Oct 2019 16:24:55 +0200 Subject: [PATCH 07/10] Remove unnecessary changes and update CHANGELOG Signed-off-by: Kemal Akkoyun --- CHANGELOG.md | 2 +- cmd/thanos/compact.go | 13 ++++---- cmd/thanos/downsample.go | 11 +++---- cmd/thanos/query.go | 61 ++++++++++++++++++----------------- cmd/thanos/receive.go | 15 +++++---- cmd/thanos/rule.go | 69 ++++++++++++++++++++-------------------- cmd/thanos/sidecar.go | 21 ++++++------ 7 files changed, 97 insertions(+), 95 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c1fab1aa2d..7134f15635 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel ### Fixed -- [#1656](https://github.com/thanos-io/thanos/pull/1656) Thanos components now starts metric and status probe HTTP server earlier in their start-up sequence. +- [#1656](https://github.com/thanos-io/thanos/pull/1656) Thanos Store now starts metric and status probe HTTP server earlier in its start-up sequence. `/-/healthy` now endpoint starts to respond with success earlier. `/metrics` endpoint now starts serving earlier as well, as a result for your readiness probes you should rely on `/-/ready` rather than `/metrics`. ## [v0.8.1](https://github.com/thanos-io/thanos/releases/tag/v0.8.1) - 2019.10.14 diff --git a/cmd/thanos/compact.go b/cmd/thanos/compact.go index c3e8006e63..12f6573a0e 100644 --- a/cmd/thanos/compact.go +++ b/cmd/thanos/compact.go @@ -173,6 +173,12 @@ func runCompact( downsampleMetrics := newDownsampleMetrics(reg) + statusProber := prober.NewProber(component, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) + // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. + if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, component); err != nil { + return errors.Wrap(err, "schedule HTTP server with probes") + } + confContentYaml, err := objStoreConfig.Content() if err != nil { return err @@ -241,13 +247,6 @@ func runCompact( return errors.Wrap(err, "create bucket compactor") } - statusProber := prober.NewProber(component, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) - // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. - if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, component); err != nil { - cancel() - return errors.Wrap(err, "schedule HTTP server with probes") - } - if retentionByResolution[compact.ResolutionLevelRaw].Seconds() != 0 { level.Info(logger).Log("msg", "retention policy of raw samples is enabled", "duration", retentionByResolution[compact.ResolutionLevelRaw]) } diff --git a/cmd/thanos/downsample.go b/cmd/thanos/downsample.go index 8cb5d029ed..b6b2397127 100644 --- a/cmd/thanos/downsample.go +++ b/cmd/thanos/downsample.go @@ -95,13 +95,7 @@ func runDownsample( }() metrics := newDownsampleMetrics(reg) - statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) - // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. - if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, comp); err != nil { - return errors.Wrap(err, "schedule HTTP server with probe") - } - // Start cycle of syncing blocks from the bucket and garbage collecting the bucket. { ctx, cancel := context.WithCancel(context.Background()) @@ -128,6 +122,11 @@ func runDownsample( }) } + // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. + if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, comp); err != nil { + return errors.Wrap(err, "schedule HTTP server with probe") + } + level.Info(logger).Log("msg", "starting downsample node") return nil } diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index 16850bbce8..6821e6fd36 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -342,36 +342,6 @@ func runQuery( stores.Close() }) } - // Start query API + UI, metrics and status probe HTTP server. - statusProber := prober.NewProber(comp, logger, reg) - { - router := route.New() - - // Redirect from / to /webRoutePrefix. - if webRoutePrefix != "" { - router.Get("/", func(w http.ResponseWriter, r *http.Request) { - http.Redirect(w, r, webRoutePrefix, http.StatusFound) - }) - } - - flagsMap := map[string]string{ - // TODO(bplotka in PR #513 review): pass all flags, not only the flags needed by prefix rewriting. - "web.external-prefix": webExternalPrefix, - "web.prefix-header": webPrefixHeaderName, - } - - ins := extpromhttp.NewInstrumentationMiddleware(reg) - ui.NewQueryUI(logger, reg, stores, flagsMap).Register(router.WithPrefix(webRoutePrefix), ins) - - api := v1.NewAPI(logger, reg, engine, queryableCreator, enableAutodownsampling, enablePartialResponse, replicaLabels, instantDefaultMaxSourceResolution) - - api.Register(router.WithPrefix(path.Join(webRoutePrefix, "/api/v1")), tracer, logger, ins) - - // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. - if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, router, comp); err != nil { - return errors.Wrap(err, "schedule HTTP server with probes") - } - } // Run File Service Discovery and update the store set when the files are modified. if fileSD != nil { var fileSDUpdates chan []*targetgroup.Group @@ -419,6 +389,37 @@ func runQuery( cancel() }) } + // Start query API + UI HTTP server. + + statusProber := prober.NewProber(comp, logger, reg) + { + router := route.New() + + // Redirect from / to /webRoutePrefix. + if webRoutePrefix != "" { + router.Get("/", func(w http.ResponseWriter, r *http.Request) { + http.Redirect(w, r, webRoutePrefix, http.StatusFound) + }) + } + + flagsMap := map[string]string{ + // TODO(bplotka in PR #513 review): pass all flags, not only the flags needed by prefix rewriting. + "web.external-prefix": webExternalPrefix, + "web.prefix-header": webPrefixHeaderName, + } + + ins := extpromhttp.NewInstrumentationMiddleware(reg) + ui.NewQueryUI(logger, reg, stores, flagsMap).Register(router.WithPrefix(webRoutePrefix), ins) + + api := v1.NewAPI(logger, reg, engine, queryableCreator, enableAutodownsampling, enablePartialResponse, replicaLabels, instantDefaultMaxSourceResolution) + + api.Register(router.WithPrefix(path.Join(webRoutePrefix, "/api/v1")), tracer, logger, ins) + + // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. + if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, router, comp); err != nil { + return errors.Wrap(err, "schedule HTTP server with probes") + } + } // Start query (proxy) gRPC StoreAPI. { l, err := net.Listen("tcp", grpcBindAddr) diff --git a/cmd/thanos/receive.go b/cmd/thanos/receive.go index 98bb98c4f3..9639154037 100644 --- a/cmd/thanos/receive.go +++ b/cmd/thanos/receive.go @@ -9,7 +9,6 @@ import ( "time" "github.com/thanos-io/thanos/pkg/extflag" - "github.com/thanos-io/thanos/pkg/prober" "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" @@ -23,6 +22,7 @@ import ( "github.com/thanos-io/thanos/pkg/block/metadata" "github.com/thanos-io/thanos/pkg/component" "github.com/thanos-io/thanos/pkg/objstore/client" + "github.com/thanos-io/thanos/pkg/prober" "github.com/thanos-io/thanos/pkg/receive" "github.com/thanos-io/thanos/pkg/runutil" "github.com/thanos-io/thanos/pkg/shipper" @@ -163,6 +163,7 @@ func runReceive( Tracer: tracer, }) + statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) confContentYaml, err := objStoreConfig.Content() if err != nil { return err @@ -173,12 +174,6 @@ func runReceive( upload = false } - statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) - // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. - if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, comp); err != nil { - return errors.Wrap(err, "schedule HTTP server with probes") - } - // Start all components while we wait for TSDB to open but only load // initial config and mark ourselves as ready after it completed. @@ -307,6 +302,12 @@ func runReceive( ) } + level.Debug(logger).Log("msg", "setting up http server") + // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. + if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, nil, comp); err != nil { + return errors.Wrap(err, "schedule HTTP server with probes") + } + level.Debug(logger).Log("msg", "setting up grpc server") { var ( diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index 391a20607c..b5cc568ebf 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -280,6 +280,7 @@ func runRule( extprom.WrapRegistererWithPrefix("thanos_ruler_query_apis_", reg), dns.ResolverType(dnsSDResolver), ) + // Run rule evaluation and alert notifications. var ( alertmgrs = newAlertmanagerSet(logger, alertmgrURLs, dns.ResolverType(dnsSDResolver)) @@ -482,40 +483,6 @@ func runRule( close(cancel) }) } - // Start query API + UI, metrics and status probe HTTP server. - statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) - { - router := route.New() - - // Redirect from / to /webRoutePrefix. - if webRoutePrefix != "" { - router.Get("/", func(w http.ResponseWriter, r *http.Request) { - http.Redirect(w, r, webRoutePrefix, http.StatusFound) - }) - } - - router.WithPrefix(webRoutePrefix).Post("/-/reload", func(w http.ResponseWriter, r *http.Request) { - reload <- struct{}{} - }) - - flagsMap := map[string]string{ - // TODO(bplotka in PR #513 review): pass all flags, not only the flags needed by prefix rewriting. - "web.external-prefix": webExternalPrefix, - "web.prefix-header": webPrefixHeaderName, - } - - ins := extpromhttp.NewInstrumentationMiddleware(reg) - - ui.NewRuleUI(logger, reg, ruleMgrs, alertQueryURL.String(), flagsMap).Register(router.WithPrefix(webRoutePrefix), ins) - - api := v1.NewAPI(logger, reg, ruleMgrs) - api.Register(router.WithPrefix(path.Join(webRoutePrefix, "/api/v1")), tracer, logger, ins) - - // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. - if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, router, comp); err != nil { - return errors.Wrap(err, "schedule HTTP server with probes") - } - } // Periodically update the addresses from static flags and file SD by resolving them using DNS SD if necessary. { ctx, cancel := context.WithCancel(context.Background()) @@ -528,6 +495,7 @@ func runRule( cancel() }) } + statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg)) // Start gRPC server. { l, err := net.Listen("tcp", grpcBindAddr) @@ -551,6 +519,39 @@ func runRule( s.Stop() }) } + // Start UI & metrics HTTP server. + { + router := route.New() + + // Redirect from / to /webRoutePrefix. + if webRoutePrefix != "" { + router.Get("/", func(w http.ResponseWriter, r *http.Request) { + http.Redirect(w, r, webRoutePrefix, http.StatusFound) + }) + } + + router.WithPrefix(webRoutePrefix).Post("/-/reload", func(w http.ResponseWriter, r *http.Request) { + reload <- struct{}{} + }) + + flagsMap := map[string]string{ + // TODO(bplotka in PR #513 review): pass all flags, not only the flags needed by prefix rewriting. + "web.external-prefix": webExternalPrefix, + "web.prefix-header": webPrefixHeaderName, + } + + ins := extpromhttp.NewInstrumentationMiddleware(reg) + + ui.NewRuleUI(logger, reg, ruleMgrs, alertQueryURL.String(), flagsMap).Register(router.WithPrefix(webRoutePrefix), ins) + + api := v1.NewAPI(logger, reg, ruleMgrs) + api.Register(router.WithPrefix(path.Join(webRoutePrefix, "/api/v1")), tracer, logger, ins) + + // Initiate HTTP listener providing metrics endpoint and readiness/liveness probes. + if err := scheduleHTTPServer(g, logger, reg, statusProber, httpBindAddr, router, comp); err != nil { + return errors.Wrap(err, "schedule HTTP server with probes") + } + } confContentYaml, err := objStoreConfig.Content() if err != nil { diff --git a/cmd/thanos/sidecar.go b/cmd/thanos/sidecar.go index 0d02ffd5b6..dfc2b4ac56 100644 --- a/cmd/thanos/sidecar.go +++ b/cmd/thanos/sidecar.go @@ -112,16 +112,6 @@ func runSidecar( comp component.Component, limitMinTime thanosmodel.TimeOrDurationValue, ) error { - promUp := prometheus.NewGauge(prometheus.GaugeOpts{ - Name: "thanos_sidecar_prometheus_up", - Help: "Boolean indicator whether the sidecar can reach its Prometheus peer.", - }) - lastHeartbeat := prometheus.NewGauge(prometheus.GaugeOpts{ - Name: "thanos_sidecar_last_heartbeat_success_time_seconds", - Help: "Second timestamp of the last successful heartbeat.", - }) - reg.MustRegister(promUp, lastHeartbeat) - var m = &promMetadata{ promURL: promURL, @@ -152,6 +142,16 @@ func runSidecar( // Setup all the concurrent groups. { + promUp := prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "thanos_sidecar_prometheus_up", + Help: "Boolean indicator whether the sidecar can reach its Prometheus peer.", + }) + lastHeartbeat := prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "thanos_sidecar_last_heartbeat_success_time_seconds", + Help: "Second timestamp of the last successful heartbeat.", + }) + reg.MustRegister(promUp, lastHeartbeat) + ctx, cancel := context.WithCancel(context.Background()) g.Add(func() error { // Only check Prometheus's flags when upload is enabled. @@ -220,6 +220,7 @@ func runSidecar( cancel() }) } + { l, err := net.Listen("tcp", grpcBindAddr) if err != nil { From bbde009cc53b0cf9d0a3303f8eb00d49aac35aaa Mon Sep 17 00:00:00 2001 From: Kemal Akkoyun Date: Thu, 17 Oct 2019 16:36:13 +0200 Subject: [PATCH 08/10] Add simple explanation for probes Signed-off-by: Kemal Akkoyun --- docs/components/store.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/components/store.md b/docs/components/store.md index 5149fd17b4..a3d5614da5 100644 --- a/docs/components/store.md +++ b/docs/components/store.md @@ -122,11 +122,11 @@ Flags: ``` -## Time based partioning +## Time based partitioning By default Thanos Store Gateway looks at all the data in Object Store and returns it based on query's time range. -Thanos Store `--min-time`, `--max-time` flags allows you to shard Thanos Store based on constant time or time duration relative to current time. +Thanos Store `--min-time`, `--max-time` flags allows you to shard Thanos Store based on constant time or time duration relative to current time. For example setting: `--min-time=-6w` & `--max-time==-2w` will make Thanos Store Gateway return metrics that fall within `now - 6 weeks` up to `now - 2 weeks` time range. @@ -136,6 +136,12 @@ Thanos Store Gateway might not get new blocks immediately, as Time partitioning We recommend having overlapping time ranges with Thanos Sidecar and other Thanos Store gateways as this will improve your resiliency to failures. -Thanos Querier deals with overlapping time series by merging them together. +Thanos Querier deals with overlapping time series by merging them together. Filtering is done on a Chunk level, so Thanos Store might still return Samples which are outside of `--min-time` & `--max-time`. + +## Probes + +- Thanos Store exposes two endpoints for probing. + * `/-/healthy` starts as soon as initial setup completed. + * `/-/ready` starts after all the bootstrapping completed (e.g initial index building) and ready to serve traffic. From 65e9b0533ccffd55638a6262e7aa8e5c4323fed7 Mon Sep 17 00:00:00 2001 From: Kemal Akkoyun Date: Thu, 17 Oct 2019 20:52:50 +0200 Subject: [PATCH 09/10] Make requested changes Signed-off-by: Kemal Akkoyun --- cmd/thanos/store.go | 2 +- docs/components/store.md | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cmd/thanos/store.go b/cmd/thanos/store.go index 496b0ff65c..70b213937e 100644 --- a/cmd/thanos/store.go +++ b/cmd/thanos/store.go @@ -192,12 +192,12 @@ func runStore( // bucketStoreReady signals when bucket store is ready. bucketStoreReady := make(chan struct{}) { - begin := time.Now() ctx, cancel := context.WithCancel(context.Background()) g.Add(func() error { defer runutil.CloseWithLogOnErr(logger, bkt, "bucket client") level.Info(logger).Log("msg", "initializing bucket store") + begin := time.Now() if err := bs.InitialSync(ctx); err != nil { close(bucketStoreReady) return errors.Wrap(err, "bucket store initial sync") diff --git a/docs/components/store.md b/docs/components/store.md index a3d5614da5..9e5b7fe943 100644 --- a/docs/components/store.md +++ b/docs/components/store.md @@ -143,5 +143,7 @@ Filtering is done on a Chunk level, so Thanos Store might still return Samples w ## Probes - Thanos Store exposes two endpoints for probing. - * `/-/healthy` starts as soon as initial setup completed. - * `/-/ready` starts after all the bootstrapping completed (e.g initial index building) and ready to serve traffic. + - `/-/healthy` starts as soon as initial setup completed. + - `/-/ready` starts after all the bootstrapping completed (e.g initial index building) and ready to serve traffic. + +> NOTE: Metric endpoint starts immediately so, make sure you set up readiness probe on designated HTTP `/-/ready` path. From 7004a300f1b3c582d5d9b160bfe6543b6ab13675 Mon Sep 17 00:00:00 2001 From: Kemal Akkoyun Date: Fri, 18 Oct 2019 00:21:10 +0200 Subject: [PATCH 10/10] Update CHANGELOG.md Co-Authored-By: Martin Chodur Signed-off-by: Kemal Akkoyun --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7134f15635..eb96112711 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel ### Fixed -- [#1656](https://github.com/thanos-io/thanos/pull/1656) Thanos Store now starts metric and status probe HTTP server earlier in its start-up sequence. `/-/healthy` now endpoint starts to respond with success earlier. `/metrics` endpoint now starts serving earlier as well, as a result for your readiness probes you should rely on `/-/ready` rather than `/metrics`. +- [#1656](https://github.com/thanos-io/thanos/pull/1656) Thanos Store now starts metric and status probe HTTP server earlier in its start-up sequence. `/-/healthy` endpoint now starts to respond with success earlier. `/metrics` endpoint starts serving metrics earlier as well. Make sure to point your readiness probes to the `/-/ready` endpoint rather than `/metrics`. ## [v0.8.1](https://github.com/thanos-io/thanos/releases/tag/v0.8.1) - 2019.10.14