From 1da135482dd3913261a065acaca85d6147439bf5 Mon Sep 17 00:00:00 2001 From: Andy Dunstall Date: Thu, 5 Dec 2024 10:37:36 +0000 Subject: [PATCH] backoff: fix enforcing max backoff (#200) * backoff: fix enforcing max backoff * backoff: log backoff time --- client/upstream.go | 7 ++++++- pkg/backoff/backoff.go | 18 +++++++----------- server/gossip/gossip.go | 17 +++++++++++++++-- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/client/upstream.go b/client/upstream.go index 8066ea2..7575a8c 100644 --- a/client/upstream.go +++ b/client/upstream.go @@ -146,14 +146,19 @@ func (u *Upstream) connect(ctx context.Context, endpointID string) (*yamux.Sessi return nil, err } + backoff, _ := backoff.Backoff() u.logger().Warn( "connect failed; retrying", zap.String("endpoint-id", endpointID), zap.String("url", url), + zap.String("backoff", backoff.String()), zap.Error(err), ) - if !backoff.Wait(ctx) { + select { + case <-time.After(backoff): + continue + case <-ctx.Done(): return nil, ctx.Err() } } diff --git a/pkg/backoff/backoff.go b/pkg/backoff/backoff.go index f5ed458..103014a 100644 --- a/pkg/backoff/backoff.go +++ b/pkg/backoff/backoff.go @@ -1,7 +1,6 @@ package backoff import ( - "context" "math/rand" "time" ) @@ -30,23 +29,17 @@ func New(retries int, minBackoff time.Duration, maxBackoff time.Duration) *Backo } } -// Wait blocks until the next retry. Returns false if the number of retries has -// been reached so the client should stop. -func (b *Backoff) Wait(ctx context.Context) bool { +// Backoff returns how long to back off for, and whether to retry (true) or abort (false). 
+func (b *Backoff) Backoff() (time.Duration, bool) { if b.retries != 0 && b.attempts > b.retries { - return false + return 0, false } b.attempts++ backoff := b.nextWait() b.lastBackoff = backoff - select { - case <-time.After(b.lastBackoff): - return true - case <-ctx.Done(): - return false - } + return backoff, true } func (b *Backoff) nextWait() time.Duration { @@ -56,6 +49,9 @@ func (b *Backoff) nextWait() time.Duration { } else { backoff = b.lastBackoff * 2 } + if backoff > b.maxBackoff { + backoff = b.maxBackoff + } jitterMultipler := 1.0 + (rand.Float64() * 0.1) return time.Duration(float64(backoff) * jitterMultipler) diff --git a/server/gossip/gossip.go b/server/gossip/gossip.go index 245aa03..b497a39 100644 --- a/server/gossip/gossip.go +++ b/server/gossip/gossip.go @@ -77,10 +77,23 @@ func (g *Gossip) JoinOnStartup(ctx context.Context, addrs []string) ([]string, e if err == nil { return nodeIDs, nil } - g.logger.Warn("failed to join cluster", zap.Error(err)) + + backoff, retry := backoff.Backoff() + if !retry { + return nil, lastErr + } + + g.logger.Warn( + "failed to join cluster; retrying", + zap.String("backoff", backoff.String()), + zap.Error(err), + ) lastErr = err - if !backoff.Wait(ctx) { + select { + case <-time.After(backoff): + continue + case <-ctx.Done(): return nil, lastErr } }