shipper: Be strict about upload order unless it's specified so & cut v0.13.0-rc.2 #2765

Merged 2 commits on Jun 15, 2020
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -11,7 +11,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel

## Unreleased

## [v0.13.0](https://github.com/thanos-io/thanos/releases/tag/v0.13.0) - 2020.06.15
## [v0.13.0-rc.2](https://github.com/thanos-io/thanos/releases/tag/v0.13.0-rc.2) - 2020.06.15

### Fixed

@@ -26,6 +26,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel
- [#2416](https://github.com/thanos-io/thanos/pull/2416) Bucket: Fixed bug where `inspect --sort-by` did not work correctly in all cases.
- [#2719](https://github.com/thanos-io/thanos/pull/2719) Query: `irate` and `resets` now use counter downsampling aggregations.
- [#2705](https://github.com/thanos-io/thanos/pull/2705) minio-go: Added support for `af-south-1` and `eu-south-1` regions.
- [#2753](https://github.com/thanos-io/thanos/issues/2753) Sidecar, Receive, Rule: Fixed possibility of out of order uploads in error cases. This could potentially cause Compactor to create overlapping blocks.

### Added

Expand Down
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
0.13.0
0.13.0-rc.2
14 changes: 10 additions & 4 deletions cmd/thanos/config.go
@@ -112,17 +112,23 @@ func (rc *reloaderConfig) registerFlag(cmd *kingpin.CmdClause) *reloaderConfig {
}

type shipperConfig struct {
uploadCompacted bool
ignoreBlockSize bool
uploadCompacted bool
ignoreBlockSize bool
allowOutOfOrderUpload bool
}

func (sc *shipperConfig) registerFlag(cmd *kingpin.CmdClause) *shipperConfig {
cmd.Flag("shipper.upload-compacted",
"If true sidecar will try to upload compacted blocks as well. Useful for migration purposes. Works only if compaction is disabled on Prometheus. Do it once and then disable the flag when done.").
"If true shipper will try to upload compacted blocks as well. Useful for migration purposes. Works only if compaction is disabled on Prometheus. Do it once and then disable the flag when done.").
Default("false").BoolVar(&sc.uploadCompacted)
cmd.Flag("shipper.ignore-unequal-block-size",
"If true sidecar will not require prometheus min and max block size flags to be set to the same value. Only use this if you want to keep long retention and compaction enabled on your Prometheus instance, as in the worst case it can result in ~2h data loss for your Thanos bucket storage.").
"If true shipper will not require prometheus min and max block size flags to be set to the same value. Only use this if you want to keep long retention and compaction enabled on your Prometheus instance, as in the worst case it can result in ~2h data loss for your Thanos bucket storage.").
Default("false").Hidden().BoolVar(&sc.ignoreBlockSize)
cmd.Flag("shipper.allow-out-of-order-uploads",
"If true, shipper will skip failed block uploads in the given iteration and retry later. This means that some newer blocks might be uploaded sooner than older blocks."+
"This can trigger compaction without those blocks and as a result will create an overlap situation. Set it to true if you have vertical compaction enabled and wish to upload blocks as soon as possible without caring"+
"about order.").
Default("false").Hidden().BoolVar(&sc.allowOutOfOrderUpload)
return sc
}

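For orientation, here is a minimal, self-contained sketch (not the actual shipper code; every name in it is hypothetical) of the ordering policy this new flag toggles: blocks are shipped oldest-first, and a failed upload either aborts the whole iteration (the default, strict behaviour) or is skipped and retried on a later iteration when `shipper.allow-out-of-order-uploads` is set.

```go
// Package main sketches the upload-order policy; it is illustrative only.
package main

import (
	"fmt"
	"sort"
)

// block is a stand-in for a local TSDB block with its time range.
type block struct {
	id               string
	minTime, maxTime int64
}

// syncOnce uploads blocks oldest-first. With allowOutOfOrderUpload=false the
// first failure aborts the iteration, so no newer block is ever shipped before
// an older one; with true, failures are skipped and retried on the next iteration.
func syncOnce(blocks []block, upload func(block) error, allowOutOfOrderUpload bool) (uploaded int, err error) {
	sort.Slice(blocks, func(i, j int) bool { return blocks[i].minTime < blocks[j].minTime })
	for _, b := range blocks {
		if uploadErr := upload(b); uploadErr != nil {
			if !allowOutOfOrderUpload {
				// Strict mode: stop here and retry the whole iteration later.
				return uploaded, fmt.Errorf("upload of %s failed, aborting iteration: %w", b.id, uploadErr)
			}
			// Out-of-order mode: skip this block for now; it stays local and is retried later.
			continue
		}
		uploaded++
	}
	return uploaded, nil
}

func main() {
	blocks := []block{{"01B", 100, 200}, {"01A", 0, 100}}
	failOld := func(b block) error {
		if b.id == "01A" {
			return fmt.Errorf("transient object storage error")
		}
		return nil
	}
	n, err := syncOnce(blocks, failOld, false)
	fmt.Println(n, err) // 0 uploads: strict mode stops at the failed older block.
	n, err = syncOnce(blocks, failOld, true)
	fmt.Println(n, err) // 1 upload: the newer block ships ahead of the failed one.
}
```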
9 changes: 9 additions & 0 deletions cmd/thanos/receive.go
@@ -89,6 +89,12 @@ func registerReceive(m map[string]setupFunc, app *kingpin.Application) {

walCompression := cmd.Flag("tsdb.wal-compression", "Compress the tsdb WAL.").Default("true").Bool()

allowOutOfOrderUpload := cmd.Flag("shipper.allow-out-of-order-uploads",
"If true, shipper will skip failed block uploads in the given iteration and retry later. This means that some newer blocks might be uploaded sooner than older blocks."+
"This can trigger compaction without those blocks and as a result will create an overlap situation. Set it to true if you have vertical compaction enabled and wish to upload blocks as soon as possible without caring"+
"about order.").
Default("false").Hidden().Bool()

m[comp.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, _ bool) error {
lset, err := parseFlagLabels(*labelStrs)
if err != nil {
@@ -157,6 +163,7 @@ func registerReceive(m map[string]setupFunc, app *kingpin.Application) {
*replicationFactor,
time.Duration(*forwardTimeout),
comp,
*allowOutOfOrderUpload,
)
}
}
@@ -195,6 +202,7 @@ func runReceive(
replicationFactor uint64,
forwardTimeout time.Duration,
comp component.SourceStoreAPI,
allowOutOfOrderUpload bool,
) error {
logger = log.With(logger, "component", "receive")
level.Warn(logger).Log("msg", "setting up receive; the Thanos receive component is EXPERIMENTAL, it may break significantly without notice")
@@ -246,6 +254,7 @@
lset,
tenantLabelName,
bkt,
allowOutOfOrderUpload,
)
writer := receive.NewWriter(log.With(logger, "component", "receive-writer"), dbs)
webHandler := receive.NewHandler(log.With(logger, "component", "receive-handler"), &receive.Options{
10 changes: 9 additions & 1 deletion cmd/thanos/rule.go
@@ -114,6 +114,12 @@ func registerRule(m map[string]setupFunc, app *kingpin.Application) {
dnsSDResolver := cmd.Flag("query.sd-dns-resolver", "Resolver to use. Possible options: [golang, miekgdns]").
Default("golang").Hidden().String()

allowOutOfOrderUpload := cmd.Flag("shipper.allow-out-of-order-uploads",
"If true, shipper will skip failed block uploads in the given iteration and retry later. This means that some newer blocks might be uploaded sooner than older blocks."+
"This can trigger compaction without those blocks and as a result will create an overlap situation. Set it to true if you have vertical compaction enabled and wish to upload blocks as soon as possible without caring"+
"about order.").
Default("false").Hidden().Bool()

m[comp.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, reload <-chan struct{}, _ bool) error {
lset, err := parseFlagLabels(*labelStrs)
if err != nil {
@@ -197,6 +203,7 @@ func registerRule(m map[string]setupFunc, app *kingpin.Application) {
time.Duration(*dnsSDInterval),
*dnsSDResolver,
comp,
*allowOutOfOrderUpload,
)
}
}
@@ -283,6 +290,7 @@ func runRule(
dnsSDInterval time.Duration,
dnsSDResolver string,
comp component.Component,
allowOutOfOrderUpload bool,
) error {
metrics := newRuleMetrics(reg)

@@ -615,7 +623,7 @@ func runRule(
}
}()

s := shipper.New(logger, reg, dataDir, bkt, func() labels.Labels { return lset }, metadata.RulerSource)
s := shipper.New(logger, reg, dataDir, bkt, func() labels.Labels { return lset }, metadata.RulerSource, allowOutOfOrderUpload)

ctx, cancel := context.WithCancel(context.Background())

4 changes: 2 additions & 2 deletions cmd/thanos/sidecar.go
@@ -273,9 +273,9 @@ func runSidecar(

var s *shipper.Shipper
if conf.shipper.uploadCompacted {
s = shipper.NewWithCompacted(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource)
s = shipper.NewWithCompacted(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource, conf.shipper.allowOutOfOrderUpload)
} else {
s = shipper.New(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource)
s = shipper.New(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource, conf.shipper.allowOutOfOrderUpload)
}

return runutil.Repeat(30*time.Second, ctx.Done(), func() error {
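To make the call-site change concrete, the sketch below mirrors the sidecar diff above and simply passes the new trailing boolean into both constructors. The wrapper function and its parameter names are hypothetical, and the import paths assume the Thanos v0.13 module layout.

```go
package example

import (
	"github.com/go-kit/kit/log"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/prometheus/pkg/labels"

	"github.com/thanos-io/thanos/pkg/block/metadata"
	"github.com/thanos-io/thanos/pkg/objstore"
	"github.com/thanos-io/thanos/pkg/shipper"
)

// newShipper mirrors the construction in cmd/thanos/sidecar.go above:
// allowOutOfOrderUpload is simply passed as the new last argument.
func newShipper(
	logger log.Logger,
	reg prometheus.Registerer,
	dataDir string,
	bkt objstore.Bucket,
	lbls func() labels.Labels,
	uploadCompacted, allowOutOfOrderUpload bool,
) *shipper.Shipper {
	if uploadCompacted {
		// Also ship blocks already compacted by Prometheus (migration use case).
		return shipper.NewWithCompacted(logger, reg, dataDir, bkt, lbls, metadata.SidecarSource, allowOutOfOrderUpload)
	}
	// Default: strict oldest-first uploads.
	return shipper.New(logger, reg, dataDir, bkt, lbls, metadata.SidecarSource, allowOutOfOrderUpload)
}
```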
2 changes: 1 addition & 1 deletion docs/components/sidecar.md
@@ -151,7 +151,7 @@ Flags:
details:
https://thanos.io/storage.md/#configuration
--shipper.upload-compacted
If true sidecar will try to upload compacted
If true shipper will try to upload compacted
blocks as well. Useful for migration purposes.
Works only if compaction is disabled on
Prometheus. Do it once and then disable the
3 changes: 2 additions & 1 deletion docs/operating/troubleshooting.md
@@ -7,7 +7,6 @@ slug: /troubleshooting.md

# Troubleshooting; Common cases


## Overlaps

**Block overlap**: Set of blocks with exactly the same external labels in meta.json and for the same time or overlapping time period.
@@ -29,13 +28,15 @@ Checking producers log for such ULID, and checking meta.json (e.g if sample stat

### Reasons

- You are running Thanos (sidecar, ruler or receive) older than 0.13.0. During transient upload errors, overlaps can occur because the compactor is not aware of all blocks. See [this issue](https://github.com/thanos-io/thanos/issues/2753).
- Misconfiguration of sidecar/ruler: Same external labels or no external labels across many block producers.
- Running multiple compactors for single block "stream", even for short duration.
- Manually uploading blocks to the bucket.
- Eventually consistent block storage until we fully implement [RW for bucket](https://thanos.io/proposals/201901-read-write-operations-bucket.md)

### Solutions

- Upgrade sidecar, ruler and receive to 0.13.0+
- Compactor can be blocked for some time, but if it is urgent, mitigate by removing the overlap or, better, by backing the block up somewhere else (you can rename the block ULID to a non-ULID).
- Who uploaded the block? Search for logs with this ULID across all sidecars/rulers. Check access logs to object storage. Check debug/metas or meta.json of the problematic block to see what the block looks like and what its `source` is.
- Determine what you misconfigured.
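The overlap condition described in this troubleshooting section can be checked directly from the `minTime`/`maxTime` fields in each block's meta.json. Below is a minimal, self-contained sketch (assuming millisecond timestamps, as meta.json uses); it is illustrative only and not a substitute for the Thanos tooling.

```go
// Package main sketches an overlap check over block meta.json time ranges.
package main

import (
	"fmt"
	"sort"
)

// meta holds the fields of a block's meta.json relevant to overlap detection.
type meta struct {
	ulid             string
	minTime, maxTime int64 // milliseconds since epoch
}

// findOverlaps returns a description of every block whose time range overlaps
// an earlier block. Run it per "stream", i.e. per set of identical external labels.
func findOverlaps(metas []meta) []string {
	sort.Slice(metas, func(i, j int) bool { return metas[i].minTime < metas[j].minTime })
	var out []string
	var maxEnd int64
	var maxEndULID string
	for _, m := range metas {
		if maxEndULID != "" && m.minTime < maxEnd {
			out = append(out, fmt.Sprintf("%s [%d, %d) overlaps %s", m.ulid, m.minTime, m.maxTime, maxEndULID))
		}
		if m.maxTime > maxEnd {
			maxEnd, maxEndULID = m.maxTime, m.ulid
		}
	}
	return out
}

func main() {
	metas := []meta{
		{"BLOCK-A", 0, 7200000},
		{"BLOCK-B", 7200000, 14400000},
		{"BLOCK-C", 3600000, 10800000}, // overlaps both A and B
	}
	for _, o := range findOverlaps(metas) {
		fmt.Println(o)
	}
}
```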
26 changes: 15 additions & 11 deletions pkg/receive/multitsdb.go
@@ -38,8 +38,9 @@ type MultiTSDB struct {
labels labels.Labels
bucket objstore.Bucket

mtx *sync.RWMutex
tenants map[string]*tenant
mtx *sync.RWMutex
tenants map[string]*tenant
allowOutOfOrderUpload bool
}

func NewMultiTSDB(
@@ -50,21 +51,23 @@
labels labels.Labels,
tenantLabelName string,
bucket objstore.Bucket,
allowOutOfOrderUpload bool,
) *MultiTSDB {
if l == nil {
l = log.NewNopLogger()
}

return &MultiTSDB{
dataDir: dataDir,
logger: l,
reg: reg,
tsdbOpts: tsdbOpts,
mtx: &sync.RWMutex{},
tenants: map[string]*tenant{},
labels: labels,
tenantLabelName: tenantLabelName,
bucket: bucket,
dataDir: dataDir,
logger: l,
reg: reg,
tsdbOpts: tsdbOpts,
mtx: &sync.RWMutex{},
tenants: map[string]*tenant{},
labels: labels,
tenantLabelName: tenantLabelName,
bucket: bucket,
allowOutOfOrderUpload: allowOutOfOrderUpload,
}
}

@@ -256,6 +259,7 @@ func (t *MultiTSDB) getOrLoadTenant(tenantID string, blockingStart bool) (*tenan
t.bucket,
func() labels.Labels { return lbls },
metadata.ReceiveSource,
t.allowOutOfOrderUpload,
)
}

2 changes: 2 additions & 0 deletions pkg/receive/multitsdb_test.go
@@ -43,6 +43,7 @@ func TestMultiTSDB(t *testing.T) {
labels.FromStrings("replica", "01"),
"tenant_id",
nil,
false,
)
defer testutil.Ok(t, m.Flush())

@@ -109,6 +110,7 @@ func TestMultiTSDB(t *testing.T) {
labels.FromStrings("replica", "01"),
"tenant_id",
nil,
false,
)
defer testutil.Ok(t, m.Flush())
