storage: smarten GC of orphaned replicas of subsumed ranges #31570

Merged · 3 commits · Oct 20, 2018
85 changes: 84 additions & 1 deletion pkg/storage/client_merge_test.go
@@ -1668,7 +1668,7 @@ func TestStoreRangeMergeAddReplicaRace(t *testing.T) {
}
}

-func TestStoreRangeMergeSlowUnabandonedFollower(t *testing.T) {
+func TestStoreRangeMergeSlowUnabandonedFollower_NoSplit(t *testing.T) {
defer leaktest.AfterTest(t)()

ctx := context.Background()
@@ -1726,6 +1726,80 @@ func TestStoreRangeMergeSlowUnabandonedFollower(t *testing.T) {
mtc.transferLease(ctx, lhsDesc.RangeID, 0, 2)
}

func TestStoreRangeMergeSlowUnabandonedFollower_WithSplit(t *testing.T) {
defer leaktest.AfterTest(t)()

ctx := context.Background()
storeCfg := storage.TestStoreConfig(nil)
storeCfg.TestingKnobs.DisableReplicateQueue = true
mtc := &multiTestContext{storeConfig: &storeCfg}
mtc.Start(t, 3)
defer mtc.Stop()
store0, store2 := mtc.Store(0), mtc.Store(2)

mtc.replicateRange(roachpb.RangeID(1), 1, 2)
lhsDesc, rhsDesc, err := createSplitRanges(ctx, store0)
if err != nil {
t.Fatal(err)
}

// Wait for store2 to hear about the split.
testutils.SucceedsSoon(t, func() error {
_, err := store2.GetReplica(rhsDesc.RangeID)
return err
})

// Start dropping all Raft traffic to the LHS on store2 so that it won't be
// aware that there is a merge in progress.
mtc.transport.Listen(store2.Ident.StoreID, &unreliableRaftHandler{
rangeID: lhsDesc.RangeID,
RaftMessageHandler: store2,
})

args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
_, pErr := client.SendWrapped(ctx, store0.TestSender(), args)
if pErr != nil {
t.Fatal(pErr)
}

// Now split the newly merged range back out at exactly the same key.
// When the replica GC queue looks in meta2 it will find the new RHS range, of
// which store2 is a member. Note that store2 does not yet have an initialized
// replica for this range, since it would intersect with the old RHS replica.
_, newRHSDesc, err := createSplitRanges(ctx, store0)
if err != nil {
t.Fatal(err)
}

// Remove and GC the LHS replica on store2. This ensures that the RHS replica
// on store2 will never be subsumed, because the merge trigger will never be
// applied by the LHS.
mtc.unreplicateRange(lhsDesc.RangeID, 2)
lhsRepl2 := store2.LookupReplica(lhsDesc.StartKey)
if err := store2.ManualReplicaGC(lhsRepl2); err != nil {
t.Fatal(err)
}
if _, err := store2.GetReplica(lhsDesc.RangeID); err == nil {
t.Fatal("lhs replica not destroyed on store2")
}

// Transfer the lease on the new RHS to store2 and wait for it to apply. This
// will force its replica of the new RHS to become up to date, which
// indirectly tests that the replica GC queue cleans up both the LHS replica
// and the old RHS replica.
mtc.transferLease(ctx, newRHSDesc.RangeID, 0, 2)
testutils.SucceedsSoon(t, func() error {
rhsRepl, err := store2.GetReplica(newRHSDesc.RangeID)
if err != nil {
return err
}
if !rhsRepl.OwnsValidLease(mtc.clock.Now()) {
return errors.New("rhs store does not own valid lease for rhs range")
}
return nil
})
}
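A note on the assertion strategy in `TestStoreRangeMergeSlowUnabandonedFollower_WithSplit`: the lease-transfer check above verifies the GC only indirectly. A hypothetical, more direct follow-up check (not part of this change; it reuses the test's `store2` and `rhsDesc`) would assert that the subsumed RHS replica itself is gone:

```go
// Hypothetical extra assertion, not in this PR: once the lease transfer has
// applied, store2's replica of the old, subsumed RHS (whose RangeID differs
// from that of the re-split RHS) must have been GC'd.
testutils.SucceedsSoon(t, func() error {
	if _, err := store2.GetReplica(rhsDesc.RangeID); err == nil {
		return errors.New("old rhs replica still present on store2")
	}
	return nil
})
```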

func TestStoreRangeMergeSlowAbandonedFollower(t *testing.T) {
defer leaktest.AfterTest(t)()

@@ -2341,6 +2415,15 @@ func (h *unreliableRaftHandler) HandleRaftRequest(
return h.RaftMessageHandler.HandleRaftRequest(ctx, req, respStream)
}

func (h *unreliableRaftHandler) HandleRaftResponse(
ctx context.Context, resp *storage.RaftMessageResponse,
) error {
if resp.RangeID == h.rangeID {
return nil
}
return h.RaftMessageHandler.HandleRaftResponse(ctx, resp)
}

func TestStoreRangeMergeRaftSnapshot(t *testing.T) {
defer leaktest.AfterTest(t)()

39 changes: 12 additions & 27 deletions pkg/storage/replica_gc_queue.go
@@ -219,46 +219,32 @@ func (rgcq *replicaGCQueue) process(
// but also on how good a job the queue does at inspecting every
// Replica (see #8111) when inactive ones can be starved by
// event-driven additions.
-if log.V(1) {
-log.Infof(ctx, "not gc'able, replica is still in range descriptor: %v", currentDesc)
-}
+log.VEventf(ctx, 1, "not gc'able, replica is still in range descriptor: %v", currentDesc)
if err := repl.setLastReplicaGCTimestamp(ctx, repl.store.Clock().Now()); err != nil {
return err
}
} else if desc.RangeID == replyDesc.RangeID {
// We are no longer a member of this range, but the range still exists.
// Clean up our local data.
rgcq.metrics.RemoveReplicaCount.Inc(1)
-if log.V(1) {
-log.Infof(ctx, "destroying local data")
-}
+log.VEventf(ctx, 1, "destroying local data")
if err := repl.store.RemoveReplica(ctx, repl, replyDesc.NextReplicaID, RemoveOptions{
DestroyData: true,
}); err != nil {
return err
}
-} else if currentMember {
-// This store is a current member of a different range that overlaps with
-// this one. This situation can only happen when we are a current member
-// of the range that subsumed this one, but our replica of the subsuming
-// range has not yet applied the merge trigger. This replica must be
-// preserved so it can be subsumed.
-if log.V(1) {
-log.Infof(ctx, "range merged away; allowing merge trigger on LHS to subsume it")
-}
} else {
-// This case is tricky. This range has been merged away, and this store is
-// not a member of the current range. It is likely that we can GC this
-// replica, but we need to be careful. If this store has a replica of the
-// subsuming range that has not yet applied the merge trigger, we must not
-// GC this replica.
+// This case is tricky. This range has been merged away, so it is likely
+// that we can GC this replica, but we need to be careful. If this store has
+// a replica of the subsuming range that has not yet applied the merge
+// trigger, we must not GC this replica.
//
// We can't just ask our local left neighbor whether it has an unapplied
// merge, as if it's a slow follower it might not have learned about the
-// merge yet! What we can do, though, is check whether the generation of
-// our local left neighbor matches the generation of its meta2 descriptor.
-// If it is generationally up-to-date, it has applied all splits and
-// merges, and it is thus safe to remove this replica.
+// merge yet! What we can do, though, is check whether the generation of our
+// local left neighbor matches the generation of its meta2 descriptor. If it
+// is generationally up-to-date, it has applied all splits and merges, and
+// it is thus safe to remove this replica.
leftRepl := repl.store.lookupPrecedingReplica(desc.StartKey)
if leftRepl != nil {
leftDesc := leftRepl.Desc()
@@ -271,9 +257,8 @@
return errors.Errorf("expected 1 range descriptor, got %d", len(rs))
}
if leftReplyDesc := rs[0]; !leftDesc.Equal(leftReplyDesc) {
-if log.V(1) {
-log.Infof(ctx, "left neighbor not up-to-date; cannot safely GC range yet")
-}
+log.VEventf(ctx, 1, "left neighbor %s not up-to-date with meta descriptor %s; cannot safely GC range yet",
+leftDesc, leftReplyDesc)
return nil
}
}
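The generation comparison described in the comment above is the heart of this change. Below is a self-contained toy model of the rule, with hypothetical types and values (the real code compares a `roachpb.RangeDescriptor` held by the local store against the authoritative copy fetched from meta2 via `client.RangeLookup`):

```go
package main

import "fmt"

// RangeDescriptor is a toy stand-in for roachpb.RangeDescriptor. The real
// Generation counter is bumped by every split and merge of the range.
type RangeDescriptor struct {
	RangeID    int64
	StartKey   string
	EndKey     string
	Generation int64
}

// canGCSubsumedReplica models the rule from replica_gc_queue.go: a replica
// of a merged-away range may be GC'd only if the local left neighbor's
// descriptor matches the authoritative meta2 copy. A match proves the
// neighbor has applied every split and merge, so no unapplied merge
// trigger can still need to subsume the candidate replica.
func canGCSubsumedReplica(localLeft, meta2Left RangeDescriptor) bool {
	return localLeft == meta2Left
}

func main() {
	meta2 := RangeDescriptor{RangeID: 7, StartKey: "a", EndKey: "m", Generation: 5}
	lagging := meta2
	lagging.Generation = 4 // slow follower: has not yet applied the merge
	fmt.Println(canGCSubsumedReplica(lagging, meta2)) // false: must wait
	fmt.Println(canGCSubsumedReplica(meta2, meta2))   // true: safe to GC
}
```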
31 changes: 23 additions & 8 deletions pkg/storage/store_snapshot.go
@@ -437,20 +437,35 @@ func (s *Store) canApplySnapshotLocked(
if r.RaftStatus() == nil {
return true
}
+// TODO(benesch): this check does not detect inactivity on replicas with
+// epoch-based leases. Since the validity of an epoch-based lease is
+// tied to the owning node's liveness, the lease can be valid well after
+// the leader of the range has cut off communication with this replica.
+// Expiration-based leases, by contrast, will expire quickly if the
+// leader of the range stops sending this replica heartbeats.
lease, pendingLease := r.GetLease()
now := s.Clock().Now()
return !r.IsLeaseValid(lease, now) &&
(pendingLease == (roachpb.Lease{}) || !r.IsLeaseValid(pendingLease, now))
}

-// If the existing range shows no signs of recent activity, give it a GC
-// run.
+// We unconditionally send this replica through the GC queue. It's
+// reasonably likely that the GC queue will do nothing because the replica
+// needs to split instead, but better to err on the side of queueing too
+// frequently. Blocking Raft snapshots for too long can wedge a cluster,
+// and if the replica does need to be GC'd, this might be the only code
+// path that notices in a timely fashion.
+//
+// We're careful to avoid starving out other replicas in the GC queue by
+// queueing at a low priority unless we can prove that the range is
+// inactive and thus unlikely to be about to process a split.
+gcPriority := replicaGCPriorityDefault
if inactive(exReplica) {
-if _, err := s.replicaGCQueue.Add(exReplica, replicaGCPriorityCandidate); err != nil {
-log.Errorf(ctx, "%s: unable to add replica to GC queue: %s", exReplica, err)
-} else {
-msg += "; initiated GC:"
-}
+gcPriority = replicaGCPriorityCandidate
}
+if _, err := s.replicaGCQueue.Add(exReplica, gcPriority); err != nil {
+log.Errorf(ctx, "%s: unable to add replica to GC queue: %s", exReplica, err)
+} else {
+msg += "; initiated GC:"
+}
}
return nil, errors.Errorf("%s %v", msg, exReplica) // exReplica can be nil
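The TODO in the `inactive` closure above deserves a closer look. Here is a toy model, with hypothetical types and fields (the real check is `Replica.IsLeaseValid`, and epoch validity is tied to node liveness), of why an epoch-based lease can hide a replica's inactivity while an expiration-based lease cannot:

```go
package main

import (
	"fmt"
	"time"
)

// Lease is a toy stand-in for roachpb.Lease. An expiration-based lease is
// valid only until its expiration timestamp; an epoch-based lease is valid
// as long as the owner's liveness epoch is current, regardless of whether
// any Raft traffic is reaching this replica.
type Lease struct {
	EpochBased bool
	Expiration time.Time
	Epoch      int64
}

func leaseValid(l Lease, now time.Time, ownerLivenessEpoch int64) bool {
	if l.EpochBased {
		return l.Epoch == ownerLivenessEpoch
	}
	return now.Before(l.Expiration)
}

func main() {
	now := time.Now()

	// The leader stopped renewing this expiration-based lease, so it lapsed
	// and the replica correctly looks inactive.
	exp := Lease{Expiration: now.Add(-time.Second)}
	fmt.Println(leaseValid(exp, now, 0)) // false: inactivity detected

	// An epoch-based lease stays valid while the owner node stays alive,
	// even if the leader cut off Raft communication to this replica long
	// ago, so the inactive() check above can miss an abandoned replica.
	epo := Lease{EpochBased: true, Epoch: 3}
	fmt.Println(leaseValid(epo, now, 3)) // true: inactivity goes undetected
}
```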
Expand Down