diff --git a/pkg/storage/client_merge_test.go b/pkg/storage/client_merge_test.go
index f51a7e97a6e5..21f16ae47567 100644
--- a/pkg/storage/client_merge_test.go
+++ b/pkg/storage/client_merge_test.go
@@ -1668,7 +1668,7 @@ func TestStoreRangeMergeAddReplicaRace(t *testing.T) {
 	}
 }
 
-func TestStoreRangeMergeSlowUnabandonedFollower(t *testing.T) {
+func TestStoreRangeMergeSlowUnabandonedFollower_NoSplit(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 
 	ctx := context.Background()
@@ -1726,6 +1726,80 @@ func TestStoreRangeMergeSlowUnabandonedFollower(t *testing.T) {
 	mtc.transferLease(ctx, lhsDesc.RangeID, 0, 2)
 }
 
+func TestStoreRangeMergeSlowUnabandonedFollower_WithSplit(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+
+	ctx := context.Background()
+	storeCfg := storage.TestStoreConfig(nil)
+	storeCfg.TestingKnobs.DisableReplicateQueue = true
+	mtc := &multiTestContext{storeConfig: &storeCfg}
+	mtc.Start(t, 3)
+	defer mtc.Stop()
+	store0, store2 := mtc.Store(0), mtc.Store(2)
+
+	mtc.replicateRange(roachpb.RangeID(1), 1, 2)
+	lhsDesc, rhsDesc, err := createSplitRanges(ctx, store0)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Wait for store2 to hear about the split.
+	testutils.SucceedsSoon(t, func() error {
+		_, err := store2.GetReplica(rhsDesc.RangeID)
+		return err
+	})
+
+	// Start dropping all Raft traffic to the LHS on store2 so that it won't be
+	// aware that there is a merge in progress.
+	mtc.transport.Listen(store2.Ident.StoreID, &unreliableRaftHandler{
+		rangeID:            lhsDesc.RangeID,
+		RaftMessageHandler: store2,
+	})
+
+	args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
+	_, pErr := client.SendWrapped(ctx, store0.TestSender(), args)
+	if pErr != nil {
+		t.Fatal(pErr)
+	}
+
+	// Now split the newly merged range back out at exactly the same key.
+	// When the replica GC queue looks in meta2 it will find the new RHS range, of
+	// which store2 is a member. Note that store2 does not yet have an initialized
+	// replica for this range, since it would intersect with the old RHS replica.
+	_, newRHSDesc, err := createSplitRanges(ctx, store0)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Remove and GC the LHS replica on store2. This ensures that the RHS replica
+	// on store2 will never be subsumed, because the merge trigger will never be
+	// applied by the LHS.
+	mtc.unreplicateRange(lhsDesc.RangeID, 2)
+	lhsRepl2 := store2.LookupReplica(lhsDesc.StartKey)
+	if err := store2.ManualReplicaGC(lhsRepl2); err != nil {
+		t.Fatal(err)
+	}
+	if _, err := store2.GetReplica(lhsDesc.RangeID); err == nil {
+		t.Fatal("lhs replica not destroyed on store2")
+	}
+
+	// Transfer the lease on the new RHS to store2 and wait for it to apply. This
+	// will force its replica of the new RHS to become up to date, which
+	// indirectly tests that the replica GC queue cleans up both the LHS replica
+	// and the old RHS replica.
+	mtc.transferLease(ctx, newRHSDesc.RangeID, 0, 2)
+	testutils.SucceedsSoon(t, func() error {
+		rhsRepl, err := store2.GetReplica(newRHSDesc.RangeID)
+		if err != nil {
+			return err
+		}
+		if !rhsRepl.OwnsValidLease(mtc.clock.Now()) {
+			return errors.New("rhs store does not own valid lease for rhs range")
+		}
+		return nil
+	})
+}
+
 func TestStoreRangeMergeSlowAbandonedFollower(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 
@@ -2341,6 +2415,15 @@ func (h *unreliableRaftHandler) HandleRaftRequest(
 	return h.RaftMessageHandler.HandleRaftRequest(ctx, req, respStream)
 }
 
+func (h *unreliableRaftHandler) HandleRaftResponse(
+	ctx context.Context, resp *storage.RaftMessageResponse,
+) error {
+	if resp.RangeID == h.rangeID {
+		return nil
+	}
+	return h.RaftMessageHandler.HandleRaftResponse(ctx, resp)
+}
+
 func TestStoreRangeMergeRaftSnapshot(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 
diff --git a/pkg/storage/replica_gc_queue.go b/pkg/storage/replica_gc_queue.go
index 28e3e6e90e2f..9effb72612d9 100644
--- a/pkg/storage/replica_gc_queue.go
+++ b/pkg/storage/replica_gc_queue.go
@@ -219,9 +219,7 @@ func (rgcq *replicaGCQueue) process(
 		// but also on how good a job the queue does at inspecting every
 		// Replica (see #8111) when inactive ones can be starved by
 		// event-driven additions.
-		if log.V(1) {
-			log.Infof(ctx, "not gc'able, replica is still in range descriptor: %v", currentDesc)
-		}
+		log.VEventf(ctx, 1, "not gc'able, replica is still in range descriptor: %v", currentDesc)
 		if err := repl.setLastReplicaGCTimestamp(ctx, repl.store.Clock().Now()); err != nil {
 			return err
 		}
@@ -229,36 +227,24 @@ func (rgcq *replicaGCQueue) process(
 		// We are no longer a member of this range, but the range still exists.
 		// Clean up our local data.
 		rgcq.metrics.RemoveReplicaCount.Inc(1)
-		if log.V(1) {
-			log.Infof(ctx, "destroying local data")
-		}
+		log.VEventf(ctx, 1, "destroying local data")
 		if err := repl.store.RemoveReplica(ctx, repl, replyDesc.NextReplicaID, RemoveOptions{
 			DestroyData: true,
 		}); err != nil {
 			return err
 		}
-	} else if currentMember {
-		// This store is a current member of a different range that overlaps with
-		// this one. This situation can only happen when we are a current member
-		// of the range that subsumed this one, but our replica of the subsuming
-		// range has not yet applied the merge trigger. This replica must be
-		// preserved so it can be subsumed.
-		if log.V(1) {
-			log.Infof(ctx, "range merged away; allowing merge trigger on LHS to subsume it")
-		}
 	} else {
-		// This case is tricky. This range has been merged away, and this store is
-		// not a member of the current range. It is likely that we can GC this
-		// replica, but we need to be careful. If this store has a replica of the
-		// subsuming range that has not yet applied the merge trigger, we must not
-		// GC this replica.
+		// This case is tricky. This range has been merged away, so it is likely
+		// that we can GC this replica, but we need to be careful. If this store has
+		// a replica of the subsuming range that has not yet applied the merge
+		// trigger, we must not GC this replica.
 		//
 		// We can't just ask our local left neighbor whether it has an unapplied
 		// merge, as if it's a slow follower it might not have learned about the
-		// merge yet! What we can do, though, is check whether the generation of
-		// our local left neighbor matches the generation of its meta2 descriptor.
-		// If it is generationally up-to-date, it has applied all splits and
-		// merges, and it is thus safe to remove this replica.
+		// merge yet! What we can do, though, is check whether the generation of our
+		// local left neighbor matches the generation of its meta2 descriptor. If it
+		// is generationally up-to-date, it has applied all splits and merges, and
+		// it is thus safe to remove this replica.
 		leftRepl := repl.store.lookupPrecedingReplica(desc.StartKey)
 		if leftRepl != nil {
 			leftDesc := leftRepl.Desc()
@@ -271,9 +257,8 @@ func (rgcq *replicaGCQueue) process(
 				return errors.Errorf("expected 1 range descriptor, got %d", len(rs))
 			}
 			if leftReplyDesc := rs[0]; !leftDesc.Equal(leftReplyDesc) {
-				if log.V(1) {
-					log.Infof(ctx, "left neighbor not up-to-date; cannot safely GC range yet")
-				}
+				log.VEventf(ctx, 1, "left neighbor %s not up-to-date with meta descriptor %s; cannot safely GC range yet",
+					leftDesc, leftReplyDesc)
 				return nil
 			}
 		}
diff --git a/pkg/storage/store_snapshot.go b/pkg/storage/store_snapshot.go
index a48e58ffbac4..9cefb6703ed9 100644
--- a/pkg/storage/store_snapshot.go
+++ b/pkg/storage/store_snapshot.go
@@ -437,20 +437,35 @@ func (s *Store) canApplySnapshotLocked(
 				if r.RaftStatus() == nil {
 					return true
 				}
+				// TODO(benesch): this check does not detect inactivity on replicas with
+				// epoch-based leases. Since the validity of an epoch-based lease is
+				// tied to the owning node's liveness, the lease can be valid well after
+				// the leader of the range has cut off communication with this replica.
+				// Expiration-based leases, by contrast, will expire quickly if the
+				// leader of the range stops sending this replica heartbeats.
 				lease, pendingLease := r.GetLease()
 				now := s.Clock().Now()
 				return !r.IsLeaseValid(lease, now) &&
 					(pendingLease == (roachpb.Lease{}) || !r.IsLeaseValid(pendingLease, now))
 			}
-
-			// If the existing range shows no signs of recent activity, give it a GC
-			// run.
+			// We unconditionally send this replica through the GC queue. It's
+			// reasonably likely that the GC queue will do nothing because the replica
+			// needs to split instead, but better to err on the side of queueing too
+			// frequently. Blocking Raft snapshots for too long can wedge a cluster,
+			// and if the replica does need to be GC'd, this might be the only code
+			// path that notices in a timely fashion.
+			//
+			// We're careful to avoid starving out other replicas in the GC queue by
+			// queueing at a low priority unless we can prove that the range is
+			// inactive and thus unlikely to be about to process a split.
+			gcPriority := replicaGCPriorityDefault
 			if inactive(exReplica) {
-				if _, err := s.replicaGCQueue.Add(exReplica, replicaGCPriorityCandidate); err != nil {
-					log.Errorf(ctx, "%s: unable to add replica to GC queue: %s", exReplica, err)
-				} else {
-					msg += "; initiated GC:"
-				}
+				gcPriority = replicaGCPriorityCandidate
+			}
+			if _, err := s.replicaGCQueue.Add(exReplica, gcPriority); err != nil {
+				log.Errorf(ctx, "%s: unable to add replica to GC queue: %s", exReplica, err)
+			} else {
+				msg += "; initiated GC:"
 			}
 		}
 		return nil, errors.Errorf("%s %v", msg, exReplica) // exReplica can be nil
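
Side note on the test hook extended above: unreliableRaftHandler works by plain interface embedding. It wraps the store's Raft message handler, silently drops any request whose range ID matches the targeted range, and with this change drops the corresponding responses as well, so the isolated replica never learns that its peers have moved past the merge. The sketch below reduces that pattern to self-contained toy types; the interface, struct, and method names here are illustrative assumptions for this sketch, not the CockroachDB API in pkg/storage.

package main

import "fmt"

// request and response stand in for the real Raft request/response messages;
// only the range ID matters for the filtering technique shown here.
type request struct{ rangeID int64 }
type response struct{ rangeID int64 }

// raftMessageHandler is a stand-in for the handler interface implemented by a
// store: it receives inbound Raft requests and responses.
type raftMessageHandler interface {
	handleRaftRequest(req request) error
	handleRaftResponse(resp response) error
}

// store is a toy downstream handler that just records deliveries.
type store struct{}

func (store) handleRaftRequest(req request) error {
	fmt.Printf("delivered Raft request for r%d\n", req.rangeID)
	return nil
}

func (store) handleRaftResponse(resp response) error {
	fmt.Printf("delivered Raft response for r%d\n", resp.rangeID)
	return nil
}

// unreliableHandler embeds the real handler and drops all traffic for one
// range, so that range's local replica stays oblivious to what its peers do.
type unreliableHandler struct {
	dropRangeID int64
	raftMessageHandler
}

func (h unreliableHandler) handleRaftRequest(req request) error {
	if req.rangeID == h.dropRangeID {
		return nil // drop silently, as if the message were lost on the wire
	}
	return h.raftMessageHandler.handleRaftRequest(req)
}

func (h unreliableHandler) handleRaftResponse(resp response) error {
	if resp.rangeID == h.dropRangeID {
		return nil // drop responses too, mirroring the HandleRaftResponse change
	}
	return h.raftMessageHandler.handleRaftResponse(resp)
}

func main() {
	h := unreliableHandler{dropRangeID: 2, raftMessageHandler: store{}}
	_ = h.handleRaftRequest(request{rangeID: 1})   // forwarded
	_ = h.handleRaftRequest(request{rangeID: 2})   // dropped
	_ = h.handleRaftResponse(response{rangeID: 2}) // dropped
}

In the real test, the equivalent wiring is the mtc.transport.Listen(store2.Ident.StoreID, &unreliableRaftHandler{...}) call shown in the diff, which interposes the filtering handler in front of store2.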