Skip to content

Commit

Permalink
Sidecar: mark as unqueryable if prometheus is down
Browse files Browse the repository at this point in the history
If the Prometheus that belongs to a sidecar is down, we don't need to
query the sidecar. This PR takes the sidecar out of the endpoint set in
that case.

Signed-off-by: Michael Hoffmann <[email protected]>
  • Loading branch information
MichaHoffmann committed Apr 22, 2024
1 parent c3cd031 commit 1df444c
Show file tree
Hide file tree
Showing 7 changed files with 144 additions and 49 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re
- [#7233](https://github.com/thanos-io/thanos/pull/7233): UI: Showing Block Size Stats
- [#7280](https://github.com/thanos-io/thanos/pull/7281): Adding User-Agent to request logs
- [#7219](https://github.com/thanos-io/thanos/pull/7219): Receive: add `--remote-write.client-tls-secure` and `--remote-write.client-tls-skip-verify` flags to stop relying on grpc server config to determine grpc client secure/skipVerify.
- [#7297](https://github.com/thanos-io/thanos/pull/7297): Sidecar: mark as not queryable if prometheus is down.

### Changed

Expand Down
2 changes: 1 addition & 1 deletion cmd/thanos/sidecar.go
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ func runSidecar(
TsdbInfos: promStore.TSDBInfos(),
}
}
return nil
return &infopb.StoreInfo{PrometheusDown: true}
}),
info.WithExemplarsInfoFunc(),
info.WithRulesInfoFunc(),
Expand Down
3 changes: 2 additions & 1 deletion pkg/info/info.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ package info
import (
"context"

"google.golang.org/grpc"

"github.com/thanos-io/thanos/pkg/info/infopb"
"github.com/thanos-io/thanos/pkg/store/labelpb"
"google.golang.org/grpc"
)

// InfoServer implements the corresponding protobuf interface
Expand Down
114 changes: 76 additions & 38 deletions pkg/info/infopb/rpc.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pkg/info/infopb/rpc.proto
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ message StoreInfo {

// TSDBInfos holds metadata for all TSDBs exposed by the store.
repeated TSDBInfo tsdb_infos = 6 [(gogoproto.nullable) = false];

// Valid for Sidecars. We can use this to indicate that the store is available
// but gRPC calls would fail since its downstream prometheus is not available.
bool prometheus_down = 7;
}

// RulesInfo holds the metadata related to Rules API exposed by the component.
Expand Down
18 changes: 15 additions & 3 deletions pkg/query/endpointset.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,14 @@ import (
"time"
"unicode/utf8"

"github.com/thanos-io/thanos/pkg/api/query/querypb"

"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/prometheus/model/labels"
"google.golang.org/grpc"

"github.com/thanos-io/thanos/pkg/api/query/querypb"
"github.com/thanos-io/thanos/pkg/component"
"github.com/thanos-io/thanos/pkg/exemplars/exemplarspb"
"github.com/thanos-io/thanos/pkg/info/infopb"
Expand Down Expand Up @@ -716,7 +715,20 @@ func (er *endpointRef) isQueryable() bool {
er.mtx.RLock()
defer er.mtx.RUnlock()

return er.isStrict || er.status.LastError == nil
if er.isStrict {
return true
}
if er.status.LastError != nil {
return false
}

// Don't query sidecars whose Prometheus is known to be unreachable.
if er.metadata != nil && er.metadata.ComponentType == component.Sidecar.String() {
if er.metadata.Store != nil {
return !er.metadata.Store.PrometheusDown
}
}
return true
}

func (er *endpointRef) ComponentType() component.Component {
Expand Down
51 changes: 45 additions & 6 deletions pkg/query/endpointset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,19 @@ import (
"testing"
"time"

"github.com/efficientgo/core/testutil"
"github.com/pkg/errors"
"github.com/stretchr/testify/require"

"github.com/prometheus/prometheus/model/labels"
"github.com/thanos-io/thanos/pkg/store"

"golang.org/x/sync/errgroup"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"

"github.com/efficientgo/core/testutil"
"github.com/pkg/errors"
promtestutil "github.com/prometheus/client_golang/prometheus/testutil"
"github.com/prometheus/prometheus/model/labels"

"github.com/thanos-io/thanos/pkg/component"
"github.com/thanos-io/thanos/pkg/info/infopb"
"github.com/thanos-io/thanos/pkg/store"
"github.com/thanos-io/thanos/pkg/store/labelpb"
"github.com/thanos-io/thanos/pkg/store/storepb"
)
Expand Down Expand Up @@ -497,6 +496,46 @@ func TestEndpointSetUpdate_EndpointGoingAway(t *testing.T) {
testutil.Equals(t, 0, len(endpointSet.GetStoreClients()))
}

// TestEndpointSetUpdate_SidecarPrometheusBecomingUnreachable verifies that a
// sidecar endpoint remains listed in the endpoint statuses but is no longer
// returned as a store client once its Info response reports PrometheusDown.
func TestEndpointSetUpdate_SidecarPrometheusBecomingUnreachable(t *testing.T) {
	// Keep a handle on the response handed to the test endpoint so that the
	// PrometheusDown flag can be flipped between two updates.
	sidecarInfo := &infopb.InfoResponse{
		ComponentType: component.Sidecar.String(),
		Store: &infopb.StoreInfo{
			MinTime: math.MinInt64,
			MaxTime: math.MaxInt64,
		},
		Exemplars:      &infopb.ExemplarsInfo{},
		Rules:          &infopb.RulesInfo{},
		MetricMetadata: &infopb.MetricMetadataInfo{},
		Targets:        &infopb.TargetsInfo{},
	}

	// External labels are derived from the address the endpoint listens on.
	extLabels := func(address string) []labelpb.ZLabelSet {
		lset := labels.FromStrings("addr", address, "a", "b")
		return labelpb.ZLabelSetsFromPromLabels(lset)
	}

	testEndpoints, err := startTestEndpoints([]testEndpointMeta{
		{InfoResponse: sidecarInfo, extlsetFn: extLabels},
	})
	testutil.Ok(t, err)
	defer testEndpoints.Close()

	set := makeEndpointSet(testEndpoints.EndpointAddresses(), false, time.Now)
	defer set.Close()

	ctx := context.Background()

	// First update: the sidecar is known and handed out as a store client.
	set.Update(ctx)
	testutil.Equals(t, 1, len(set.GetEndpointStatus()))
	testutil.Equals(t, 1, len(set.GetStoreClients()))

	// Second update, after the sidecar starts reporting PrometheusDown: the
	// status entry is still there, but no store client is returned for it.
	sidecarInfo.Store.PrometheusDown = true
	set.Update(ctx)
	testutil.Equals(t, 1, len(set.GetEndpointStatus()))
	testutil.Equals(t, 0, len(set.GetStoreClients()))
}

func TestEndpointSetUpdate_EndpointComingOnline(t *testing.T) {
endpoints, err := startTestEndpoints([]testEndpointMeta{
{
Expand Down

0 comments on commit 1df444c

Please sign in to comment.