From c99737f1c5758272a9ae28928a7261db8bcd8bd4 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Mon, 7 Aug 2023 16:36:46 +0800 Subject: [PATCH 01/45] update checkpoint --- br/pkg/streamhelper/advancer.go | 91 ++++++++++++++++++++++++--------- 1 file changed, 68 insertions(+), 23 deletions(-) diff --git a/br/pkg/streamhelper/advancer.go b/br/pkg/streamhelper/advancer.go index c064f2dd5e670..f5462c543aeec 100644 --- a/br/pkg/streamhelper/advancer.go +++ b/br/pkg/streamhelper/advancer.go @@ -60,7 +60,7 @@ type CheckpointAdvancer struct { // the cached last checkpoint. // if no progress, this cache can help us don't to send useless requests. - lastCheckpoint uint64 + lastCheckpoint checkpoint checkpoints *spans.ValueSortedFull checkpointsMu sync.Mutex @@ -69,6 +69,45 @@ type CheckpointAdvancer struct { subscriberMu sync.Mutex } +// checkpoint represents the TS with specific range. +// it's only used in advancer.go. +type checkpoint struct { + StartKey []byte + EndKey []byte + TS uint64 + + // It's better to use PD timestamp in future, for now + // use local time to decide the time to resolve lock is ok. + generateTime time.Time +} + +func newCheckpointWithTS(ts uint64) checkpoint { + return checkpoint{ + TS: ts, + generateTime: time.Now(), + } +} + +func newCheckpointWithSpan(s spans.Valued) checkpoint { + return checkpoint{ + StartKey: s.Key.StartKey, + EndKey: s.Key.EndKey, + TS: s.Value, + generateTime: time.Now(), + } +} + +func (c checkpoint) safeTS() uint64 { + return c.TS - 1 +} + +// if a checkpoint stay in a time too long(1 min) +// we should try to resolve lock for the range +// to keep the RPO in a short value. +func (c checkpoint) needResolveLocks() bool { + return time.Since(c.generateTime) > time.Minute +} + // NewCheckpointAdvancer creates a checkpoint advancer with the env. func NewCheckpointAdvancer(env Env) *CheckpointAdvancer { return &CheckpointAdvancer{ @@ -180,7 +219,7 @@ func (c *CheckpointAdvancer) WithCheckpoints(f func(*spans.ValueSortedFull)) { } func (c *CheckpointAdvancer) CalculateGlobalCheckpointLight(ctx context.Context, - threshold time.Duration) (uint64, error) { + threshold time.Duration) (spans.Valued, error) { var targets []spans.Valued var minValue spans.Valued c.WithCheckpoints(func(vsf *spans.ValueSortedFull) { @@ -194,13 +233,13 @@ func (c *CheckpointAdvancer) CalculateGlobalCheckpointLight(ctx context.Context, zap.Stringer("min", minValue), zap.Int("for-polling", len(targets)), zap.String("min-ts", oracle.GetTimeFromTS(minValue.Value).Format(time.RFC3339))) if len(targets) == 0 { - return minValue.Value, nil + return minValue, nil } err := c.tryAdvance(ctx, len(targets), func(i int) kv.KeyRange { return targets[i].Key }) if err != nil { - return 0, err + return minValue, err } - return minValue.Value, nil + return minValue, nil } func (c *CheckpointAdvancer) consumeAllTask(ctx context.Context, ch <-chan TaskEvent) error { @@ -293,7 +332,7 @@ func (c *CheckpointAdvancer) onTaskEvent(ctx context.Context, e TaskEvent) error c.task = e.Info c.taskRange = spans.Collapse(len(e.Ranges), func(i int) kv.KeyRange { return e.Ranges[i] }) c.checkpoints = spans.Sorted(spans.NewFullWith(e.Ranges, 0)) - c.lastCheckpoint = e.Info.StartTs + c.lastCheckpoint = newCheckpointWithTS(e.Info.StartTs) p, err := c.env.BlockGCUntil(ctx, c.task.StartTs) if err != nil { log.Warn("failed to upload service GC safepoint, skipping.", logutil.ShortError(err)) @@ -323,23 +362,29 @@ func (c *CheckpointAdvancer) onTaskEvent(ctx context.Context, e TaskEvent) error return nil } -func (c *CheckpointAdvancer) setCheckpoint(cp uint64) bool { - if cp < c.lastCheckpoint { +func (c *CheckpointAdvancer) setCheckpoint(s spans.Valued) bool { + cp := newCheckpointWithSpan(s) + if cp.TS < c.lastCheckpoint.TS { log.Warn("failed to update global checkpoint: stale", - zap.Uint64("old", c.lastCheckpoint), zap.Uint64("new", cp)) + zap.Uint64("old", c.lastCheckpoint.TS), zap.Uint64("new", cp.TS)) return false } - if cp <= c.lastCheckpoint { + // lastCheckpoint is too long enough. + // assume the cluster has expired locks for whatever reasons. + if c.lastCheckpoint.needResolveLocks() { + + } + if cp.TS <= c.lastCheckpoint.TS { return false } c.lastCheckpoint = cp - metrics.LastCheckpoint.WithLabelValues(c.task.GetName()).Set(float64(c.lastCheckpoint)) + metrics.LastCheckpoint.WithLabelValues(c.task.GetName()).Set(float64(c.lastCheckpoint.TS)) return true } // advanceCheckpointBy advances the checkpoint by a checkpoint getter function. func (c *CheckpointAdvancer) advanceCheckpointBy(ctx context.Context, - getCheckpoint func(context.Context) (uint64, error)) error { + getCheckpoint func(context.Context) (spans.Valued, error)) error { start := time.Now() cp, err := getCheckpoint(ctx) if err != nil { @@ -348,8 +393,8 @@ func (c *CheckpointAdvancer) advanceCheckpointBy(ctx context.Context, if c.setCheckpoint(cp) { log.Info("uploading checkpoint for task", - zap.Stringer("checkpoint", oracle.GetTimeFromTS(cp)), - zap.Uint64("checkpoint", cp), + zap.Stringer("checkpoint", oracle.GetTimeFromTS(cp.Value)), + zap.Uint64("checkpoint", cp.Value), zap.String("task", c.task.Name), zap.Stringer("take", time.Since(start))) } @@ -403,24 +448,24 @@ func (c *CheckpointAdvancer) subscribeTick(ctx context.Context) error { func (c *CheckpointAdvancer) importantTick(ctx context.Context) error { c.checkpointsMu.Lock() - c.setCheckpoint(c.checkpoints.MinValue()) + c.setCheckpoint(c.checkpoints.Min()) c.checkpointsMu.Unlock() - if err := c.env.UploadV3GlobalCheckpointForTask(ctx, c.task.Name, c.lastCheckpoint); err != nil { + if err := c.env.UploadV3GlobalCheckpointForTask(ctx, c.task.Name, c.lastCheckpoint.TS); err != nil { return errors.Annotate(err, "failed to upload global checkpoint") } - p, err := c.env.BlockGCUntil(ctx, c.lastCheckpoint-1) + p, err := c.env.BlockGCUntil(ctx, c.lastCheckpoint.safeTS()) if err != nil { return errors.Annotatef(err, "failed to update service GC safe point, current checkpoint is %d, target checkpoint is %d", - c.lastCheckpoint-1, p) + c.lastCheckpoint.safeTS(), p) } - if p <= c.lastCheckpoint-1 { + if p <= c.lastCheckpoint.safeTS() { log.Info("updated log backup GC safe point.", - zap.Uint64("checkpoint", p), zap.Uint64("target", c.lastCheckpoint-1)) + zap.Uint64("checkpoint", p), zap.Uint64("target", c.lastCheckpoint.safeTS())) } - if p > c.lastCheckpoint-1 { + if p > c.lastCheckpoint.safeTS() { log.Warn("update log backup GC safe point failed: stale.", - zap.Uint64("checkpoint", p), zap.Uint64("target", c.lastCheckpoint-1)) + zap.Uint64("checkpoint", p), zap.Uint64("target", c.lastCheckpoint.safeTS())) } return nil } @@ -433,7 +478,7 @@ func (c *CheckpointAdvancer) optionalTick(cx context.Context) error { threshold = c.Config().GetSubscriberErrorStartPollThreshold() } - err := c.advanceCheckpointBy(cx, func(cx context.Context) (uint64, error) { + err := c.advanceCheckpointBy(cx, func(cx context.Context) (spans.Valued, error) { return c.CalculateGlobalCheckpointLight(cx, threshold) }) if err != nil { From 6ac2d25a8b62a1994172a1545bd95544bf4fb3cf Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 8 Aug 2023 11:41:42 +0800 Subject: [PATCH 02/45] refactor GCWorker for lockResolver --- store/gcworker/gc_worker.go | 126 +++++++++++++----------- store/gcworker/gc_worker_test.go | 160 +++++++++++++++++++------------ 2 files changed, 170 insertions(+), 116 deletions(-) diff --git a/store/gcworker/gc_worker.go b/store/gcworker/gc_worker.go index 43d95c1b7e2f5..f476a856cd6c2 100644 --- a/store/gcworker/gc_worker.go +++ b/store/gcworker/gc_worker.go @@ -63,22 +63,56 @@ import ( "golang.org/x/exp/slices" ) +type GCLockResolver interface { + LocateKey(bo *tikv.Backoffer, key []byte) (*tikv.KeyLocation, error) + + // Try batch resolve locks first. if not ok, fallback to normal resolve locks + ResolveLocks(bo *tikv.Backoffer, tryLocks []*txnlock.Lock, forceLocks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) + + // only used for mock test + ScanLocks(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock + + SendReq(bo *tikv.Backoffer, req *tikvrpc.Request, regionID tikv.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) +} + +type GCWorkerLockResolver struct { + tikvStore tikv.Storage +} + +func (w *GCWorkerLockResolver) LocateKey(bo *tikv.Backoffer, key []byte) (*tikv.KeyLocation, error) { + return w.tikvStore.GetRegionCache().LocateKey(bo, key) +} + +func (w *GCWorkerLockResolver) ResolveLocks(bo *tikv.Backoffer, tryLocks []*txnlock.Lock, forceLocks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) { + ok, err := w.tikvStore.GetLockResolver().BatchResolveLocks(bo, forceLocks, loc) + if err != nil || !ok { + return ok, err + } + // callerStartTS is always 0. so no need to make it a parameter here. + _, err = w.tikvStore.GetLockResolver().ResolveLocks(bo, 0, tryLocks) + return err == nil, errors.Trace(err) +} + +func (w *GCWorkerLockResolver) ScanLocks(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock { + return nil +} + +func (w *GCWorkerLockResolver) SendReq(bo *tikv.Backoffer, req *tikvrpc.Request, regionID tikv.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) { + return w.tikvStore.SendReq(bo, req, regionID, timeout) +} + // GCWorker periodically triggers GC process on tikv server. type GCWorker struct { - uuid string - desc string - store kv.Storage - tikvStore tikv.Storage - pdClient pd.Client - gcIsRunning bool - lastFinish time.Time - cancel context.CancelFunc - done chan error - testingKnobs struct { - scanLocks func(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock - batchResolveLocks func(locks []*txnlock.Lock, regionID tikv.RegionVerID, safepoint uint64) (ok bool, err error) - resolveLocks func(locks []*txnlock.Lock, lowResolutionTS uint64) (int64, error) - } + uuid string + desc string + store kv.Storage + tikvStore tikv.Storage + pdClient pd.Client + gcIsRunning bool + lastFinish time.Time + cancel context.CancelFunc + done chan error + lockResolver GCLockResolver logBackupEnabled bool // check log-backup task existed. } @@ -97,14 +131,15 @@ func NewGCWorker(store kv.Storage, pdClient pd.Client) (*GCWorker, error) { return nil, errors.New("GC should run against TiKV storage") } worker := &GCWorker{ - uuid: strconv.FormatUint(ver.Ver, 16), - desc: fmt.Sprintf("host:%s, pid:%d, start at %s", hostName, os.Getpid(), time.Now()), - store: store, - tikvStore: tikvStore, - pdClient: pdClient, - gcIsRunning: false, - lastFinish: time.Now(), - done: make(chan error), + uuid: strconv.FormatUint(ver.Ver, 16), + desc: fmt.Sprintf("host:%s, pid:%d, start at %s", hostName, os.Getpid(), time.Now()), + store: store, + tikvStore: tikvStore, + pdClient: pdClient, + gcIsRunning: false, + lastFinish: time.Now(), + lockResolver: &GCWorkerLockResolver{tikvStore}, + done: make(chan error), } variable.RegisterStatistics(worker) return worker, nil @@ -1217,7 +1252,7 @@ func (w *GCWorker) legacyResolveLocks( startTime := time.Now() handler := func(ctx context.Context, r tikvstore.KeyRange) (rangetask.TaskStat, error) { - return w.resolveLocksForRange(ctx, safePoint, tryResolveLocksTS, r.StartKey, r.EndKey) + return ResolveLocksForRange(ctx, w.uuid, w.lockResolver, safePoint, tryResolveLocksTS, r.StartKey, r.EndKey) } runner := rangetask.NewRangeTaskRunner("resolve-locks-runner", w.tikvStore, concurrency, handler) @@ -1258,7 +1293,8 @@ func (w *GCWorker) getTryResolveLocksTS() (uint64, error) { // it will rollback the txn, no matter the lock is expired of not. // 2. If the ts of lock is larger than forceResolveLocksTS, it will check status of the txn. // Resolve the lock if txn is expired, Or do nothing. -func (w *GCWorker) batchResolveExpiredLocks( +func batchResolveExpiredLocks( + lockResolver GCLockResolver, bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID, @@ -1285,29 +1321,13 @@ func (w *GCWorker) batchResolveExpiredLocks( zap.Int("force-resolve-locks-count", len(forceResolveLocks)), zap.Int("try-resolve-locks-count", len(tryResolveLocks))) - var ( - ok bool - err error - ) - if w.testingKnobs.batchResolveLocks != nil { - ok, err = w.testingKnobs.batchResolveLocks(forceResolveLocks, loc, forceResolveLocksTS) - } else { - ok, err = w.tikvStore.GetLockResolver().BatchResolveLocks(bo, forceResolveLocks, loc) - } - if err != nil || !ok { - return ok, err - } - - if w.testingKnobs.resolveLocks != nil { - _, err = w.testingKnobs.resolveLocks(tryResolveLocks, tryResolveLocksTS) - } else { - _, err = w.tikvStore.GetLockResolver().ResolveLocks(bo, 0, tryResolveLocks) - } - return err == nil, errors.Trace(err) + return lockResolver.ResolveLocks(bo, tryResolveLocks, forceResolveLocks, loc) } -func (w *GCWorker) resolveLocksForRange( +func ResolveLocksForRange( ctx context.Context, + uuid string, + lockResolver GCLockResolver, forceResolveLocksTS uint64, tryResolveLocksTS uint64, startKey []byte, @@ -1346,12 +1366,12 @@ retryScanAndResolve: } req.ScanLock().StartKey = key - loc, err := w.tikvStore.GetRegionCache().LocateKey(bo, key) + loc, err := lockResolver.LocateKey(bo, key) if err != nil { return stat, errors.Trace(err) } req.ScanLock().EndKey = loc.EndKey - resp, err := w.tikvStore.SendReq(bo, req, loc.Region, tikv.ReadTimeoutMedium) + resp, err := lockResolver.SendReq(bo, req, loc.Region, tikv.ReadTimeoutMedium) if err != nil { return stat, errors.Trace(err) } @@ -1378,12 +1398,10 @@ retryScanAndResolve: for _, li := range locksInfo { locks = append(locks, txnlock.NewLock(li)) } - if w.testingKnobs.scanLocks != nil { - locks = append(locks, w.testingKnobs.scanLocks(key, loc.Region.GetID(), tryResolveLocksTS)...) - } + locks = append(locks, lockResolver.ScanLocks(key, loc.Region.GetID(), tryResolveLocksTS)...) locForResolve := loc for { - ok, err1 := w.batchResolveExpiredLocks(bo, locks, locForResolve.Region, forceResolveLocksTS, tryResolveLocksTS) + ok, err1 := batchResolveExpiredLocks(lockResolver, bo, locks, locForResolve.Region, forceResolveLocksTS, tryResolveLocksTS) if err1 != nil { return stat, errors.Trace(err1) } @@ -1392,7 +1410,7 @@ retryScanAndResolve: if err != nil { return stat, errors.Trace(err) } - stillInSame, refreshedLoc, err := w.tryRelocateLocksRegion(bo, locks) + stillInSame, refreshedLoc, err := tryRelocateLocksRegion(bo, lockResolver, locks) if err != nil { return stat, errors.Trace(err) } @@ -1409,7 +1427,7 @@ retryScanAndResolve: key = loc.EndKey } else { logutil.Logger(ctx).Info("region has more than limit locks", zap.String("category", "gc worker"), - zap.String("uuid", w.uuid), + zap.String("uuid", uuid), zap.Uint64("region", locForResolve.Region.GetID()), zap.Int("scan lock limit", gcScanLockLimit)) metrics.GCRegionTooManyLocksCounter.Inc() @@ -1428,11 +1446,11 @@ retryScanAndResolve: return stat, nil } -func (w *GCWorker) tryRelocateLocksRegion(bo *tikv.Backoffer, locks []*txnlock.Lock) (stillInSameRegion bool, refreshedLoc *tikv.KeyLocation, err error) { +func tryRelocateLocksRegion(bo *tikv.Backoffer, lockResolver GCLockResolver, locks []*txnlock.Lock) (stillInSameRegion bool, refreshedLoc *tikv.KeyLocation, err error) { if len(locks) == 0 { return } - refreshedLoc, err = w.tikvStore.GetRegionCache().LocateKey(bo, locks[0].Key) + refreshedLoc, err = lockResolver.LocateKey(bo, locks[0].Key) if err != nil { return } diff --git a/store/gcworker/gc_worker_test.go b/store/gcworker/gc_worker_test.go index c6d7bc7ff234c..38dcf633f5a79 100644 --- a/store/gcworker/gc_worker_test.go +++ b/store/gcworker/gc_worker_test.go @@ -49,6 +49,37 @@ import ( pd "github.com/tikv/pd/client" ) +type mockGCWorkerLockResolver struct { + tikvStore tikv.Storage + forceResolveLocksTS uint64 + tryResolveLocksTS uint64 + scanLocks func(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock + batchResolveLocks func(locks []*txnlock.Lock, regionID tikv.RegionVerID, safepoint uint64) (ok bool, err error) + resolveLocks func(locks []*txnlock.Lock, lowResolutionTS uint64) (int64, error) +} + +func (l *mockGCWorkerLockResolver) LocateKey(bo *tikv.Backoffer, key []byte) (*tikv.KeyLocation, error) { + return l.tikvStore.GetRegionCache().LocateKey(bo, key) +} + +func (l *mockGCWorkerLockResolver) SendReq(bo *tikv.Backoffer, req *tikvrpc.Request, regionID tikv.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) { + return l.tikvStore.SendReq(bo, req, regionID, timeout) +} + +func (l *mockGCWorkerLockResolver) ScanLocks(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock { + return l.scanLocks(key, regionID, maxVersion) +} + +func (l *mockGCWorkerLockResolver) ResolveLocks(bo *tikv.Backoffer, tryLocks []*txnlock.Lock, forceLocks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) { + ok, err := l.batchResolveLocks(forceLocks, loc, l.forceResolveLocksTS) + if err != nil || !ok { + return ok, err + } + + _, err = l.resolveLocks(tryLocks, l.tryResolveLocksTS) + return err == nil, err +} + type mockGCWorkerClient struct { tikv.Client unsafeDestroyRangeHandler handler @@ -1027,7 +1058,7 @@ func TestResolveLockRangeInfine(t *testing.T) { require.NoError(t, failpoint.Disable("github.com/pingcap/tidb/store/gcworker/setGcResolveMaxBackoff")) }() - _, err := s.gcWorker.resolveLocksForRange(gcContext(), 1, 3, []byte{0}, []byte{1}) + _, err := ResolveLocksForRange(gcContext(), s.gcWorker.uuid, s.gcWorker.lockResolver, 1, 3, []byte{0}, []byte{1}) require.Error(t, err) } @@ -1074,51 +1105,54 @@ func TestResolveLockRangeMeetRegionCacheMiss(t *testing.T) { }, } - s.gcWorker.testingKnobs.scanLocks = func(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock { - *scanCntRef++ - - locks := make([]*txnlock.Lock, 0) - for _, l := range allLocks { - if l.TxnID <= maxVersion { - locks = append(locks, l) - scanLockCnt++ + mockLockResolver := &mockGCWorkerLockResolver{ + tikvStore: s.tikvStore, + forceResolveLocksTS: safepointTS, + tryResolveLocksTS: lowResolveTS, + scanLocks: func(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock { + *scanCntRef++ + + locks := make([]*txnlock.Lock, 0) + for _, l := range allLocks { + if l.TxnID <= maxVersion { + locks = append(locks, l) + scanLockCnt++ + } + } + return locks + }, + batchResolveLocks: func( + locks []*txnlock.Lock, + regionID tikv.RegionVerID, + safepoint uint64, + ) (ok bool, err error) { + *resolveCntRef++ + if *resolveCntRef == 1 { + s.gcWorker.tikvStore.GetRegionCache().InvalidateCachedRegion(regionID) + // mock the region cache miss error + return false, nil } - } - return locks - } - s.gcWorker.testingKnobs.batchResolveLocks = func( - locks []*txnlock.Lock, - regionID tikv.RegionVerID, - safepoint uint64, - ) (ok bool, err error) { - *resolveCntRef++ - if *resolveCntRef == 1 { - s.gcWorker.tikvStore.GetRegionCache().InvalidateCachedRegion(regionID) - // mock the region cache miss error - return false, nil - } - - resolveBeforeSafepointLockCnt = len(locks) - for _, l := range locks { - require.True(t, l.TxnID <= safepoint) - } - return true, nil - } - s.gcWorker.testingKnobs.resolveLocks = func( - locks []*txnlock.Lock, - lowResolutionTS uint64, - ) (int64, error) { - for _, l := range locks { - expiredTS := oracle.ComposeTS(oracle.ExtractPhysical(l.TxnID)+int64(l.TTL), oracle.ExtractLogical(l.TxnID)) - if expiredTS <= lowResolutionTS { - resolveAfterSafepointLockCnt++ + resolveBeforeSafepointLockCnt = len(locks) + for _, l := range locks { + require.True(t, l.TxnID <= safepoint) } - } - return 0, nil + return true, nil + }, + resolveLocks: func( + locks []*txnlock.Lock, + lowResolutionTS uint64, + ) (int64, error) { + for _, l := range locks { + expiredTS := oracle.ComposeTS(oracle.ExtractPhysical(l.TxnID)+int64(l.TTL), oracle.ExtractLogical(l.TxnID)) + if expiredTS <= lowResolutionTS { + resolveAfterSafepointLockCnt++ + } + } + return 0, nil + }, } - - _, err := s.gcWorker.resolveLocksForRange(gcContext(), safepointTS, lowResolveTS, []byte{0}, []byte{10}) + _, err := ResolveLocksForRange(gcContext(), s.gcWorker.uuid, mockLockResolver, safepointTS, lowResolveTS, []byte{0}, []byte{10}) require.NoError(t, err) require.Equal(t, 2, resolveCnt) require.Equal(t, 1, scanCnt) @@ -1148,18 +1182,27 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { newPeers := []uint64{s.cluster.AllocID(), s.cluster.AllocID(), s.cluster.AllocID()} s.cluster.Split(s.initRegion.regionID, region2, []byte("m"), newPeers, newPeers[0]) - // init a, b lock in region1 and o, p locks in region2 - s.gcWorker.testingKnobs.scanLocks = func(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock { - if regionID == s.initRegion.regionID { - return []*txnlock.Lock{{Key: []byte("a")}, {Key: []byte("b")}} - } - if regionID == region2 { - return []*txnlock.Lock{{Key: []byte("o")}, {Key: []byte("p")}} - } - return []*txnlock.Lock{} + mockGCLockResolver := &mockGCWorkerLockResolver{ + tikvStore: s.tikvStore, + forceResolveLocksTS: 1, + tryResolveLocksTS: 3, + scanLocks: func(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock { + if regionID == s.initRegion.regionID { + return []*txnlock.Lock{{Key: []byte("a")}, {Key: []byte("b")}} + } + if regionID == region2 { + return []*txnlock.Lock{{Key: []byte("o")}, {Key: []byte("p")}} + } + return []*txnlock.Lock{} + }, + resolveLocks: func( + locks []*txnlock.Lock, + lowResolutionTS uint64, + ) (int64, error) { + return 0, nil + }, } - - s.gcWorker.testingKnobs.batchResolveLocks = func( + mockGCLockResolver.batchResolveLocks = func( locks []*txnlock.Lock, regionID tikv.RegionVerID, safepoint uint64, @@ -1176,7 +1219,7 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { []*metapb.Region{regionMeta}) require.NoError(t, err) // also let region1 contains all 4 locks - s.gcWorker.testingKnobs.scanLocks = func(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock { + mockGCLockResolver.scanLocks = func(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock { if regionID == s.initRegion.regionID { locks := []*txnlock.Lock{ {Key: []byte("a")}, @@ -1199,14 +1242,7 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { } return true, nil } - s.gcWorker.testingKnobs.resolveLocks = func( - locks []*txnlock.Lock, - lowResolutionTS uint64, - ) (int64, error) { - return 0, nil - } - - _, err := s.gcWorker.resolveLocksForRange(gcContext(), 1, 3, []byte(""), []byte("z")) + _, err := ResolveLocksForRange(gcContext(), s.gcWorker.uuid, mockGCLockResolver, 1, 3, []byte(""), []byte("z")) require.NoError(t, err) require.Len(t, resolvedLock, 4) expects := [][]byte{[]byte("a"), []byte("b"), []byte("o"), []byte("p")} From 418bd47fc89ba3681806c49f0a02af975bc01a99 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 8 Aug 2023 12:11:47 +0800 Subject: [PATCH 03/45] decouple logbackup in GCWorker --- store/gcworker/gc_worker.go | 36 +++++++++----------------------- store/gcworker/gc_worker_test.go | 6 ------ 2 files changed, 10 insertions(+), 32 deletions(-) diff --git a/store/gcworker/gc_worker.go b/store/gcworker/gc_worker.go index f476a856cd6c2..8b6b8c5e65d6c 100644 --- a/store/gcworker/gc_worker.go +++ b/store/gcworker/gc_worker.go @@ -33,7 +33,6 @@ import ( "github.com/pingcap/kvproto/pkg/errorpb" "github.com/pingcap/kvproto/pkg/kvrpcpb" "github.com/pingcap/kvproto/pkg/metapb" - "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/ddl" "github.com/pingcap/tidb/ddl/label" "github.com/pingcap/tidb/ddl/placement" @@ -103,17 +102,16 @@ func (w *GCWorkerLockResolver) SendReq(bo *tikv.Backoffer, req *tikvrpc.Request, // GCWorker periodically triggers GC process on tikv server. type GCWorker struct { - uuid string - desc string - store kv.Storage - tikvStore tikv.Storage - pdClient pd.Client - gcIsRunning bool - lastFinish time.Time - cancel context.CancelFunc - done chan error - lockResolver GCLockResolver - logBackupEnabled bool // check log-backup task existed. + uuid string + desc string + store kv.Storage + tikvStore tikv.Storage + pdClient pd.Client + gcIsRunning bool + lastFinish time.Time + cancel context.CancelFunc + done chan error + lockResolver GCLockResolver } // NewGCWorker creates a GCWorker instance. @@ -450,17 +448,6 @@ func (w *GCWorker) leaderTick(ctx context.Context) error { metrics.GCJobFailureCounter.WithLabelValues("prepare").Inc() return errors.Trace(err) } else if !ok { - // If skip gc, it still needs to resolve locks with expired TTL, in order not to block log backup. - if w.logBackupEnabled { - tryResolveLocksTS, err := w.getTryResolveLocksTS() - if err != nil { - return errors.Trace(err) - } - // Set 0 to safepoint, which means resolving locks with expired TTL only. - if err = w.legacyResolveLocks(ctx, 0, tryResolveLocksTS, concurrency); err != nil { - return errors.Trace(err) - } - } return nil } // When the worker is just started, or an old GC job has just finished, @@ -1214,8 +1201,6 @@ func (w *GCWorker) resolveLocks(ctx context.Context, safePoint uint64, concurren if tryResolveLocksTS < safePoint { tryResolveLocksTS = safePoint - } else if !w.logBackupEnabled { - tryResolveLocksTS = safePoint } if !usePhysical { @@ -1947,7 +1932,6 @@ func (w *GCWorker) checkLeader(ctx context.Context) (bool, error) { se := createSession(w.store) defer se.Close() - w.logBackupEnabled = utils.IsLogBackupInUse(se) _, err := se.ExecuteInternal(ctx, "BEGIN") if err != nil { return false, errors.Trace(err) diff --git a/store/gcworker/gc_worker_test.go b/store/gcworker/gc_worker_test.go index 38dcf633f5a79..7744842e423d5 100644 --- a/store/gcworker/gc_worker_test.go +++ b/store/gcworker/gc_worker_test.go @@ -1948,8 +1948,6 @@ func TestGCLabelRules(t *testing.T) { func TestGCWithPendingTxn(t *testing.T) { s := createGCWorkerSuite(t) - // set to false gc worker won't resolve locks after safepoint. - s.gcWorker.logBackupEnabled = false ctx := gcContext() gcSafePointCacheInterval = 0 @@ -2001,8 +1999,6 @@ func TestGCWithPendingTxn(t *testing.T) { func TestGCWithPendingTxn2(t *testing.T) { s := createGCWorkerSuite(t) - // only when log backup enabled will scan locks after safepoint. - s.gcWorker.logBackupEnabled = true ctx := gcContext() gcSafePointCacheInterval = 0 @@ -2073,8 +2069,6 @@ func TestGCWithPendingTxn2(t *testing.T) { func TestSkipGCAndOnlyResolveLock(t *testing.T) { s := createGCWorkerSuite(t) - // only when log backup enabled will scan locks after safepoint. - s.gcWorker.logBackupEnabled = true ctx := gcContext() gcSafePointCacheInterval = 0 From 1bf06cd0dde90ecc08c24f63778867b1c556750a Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 8 Aug 2023 13:25:35 +0800 Subject: [PATCH 04/45] use GCLockResolver in advancer --- br/pkg/conn/conn.go | 5 +++++ br/pkg/streamhelper/advancer.go | 7 ++++++- br/pkg/streamhelper/advancer_env.go | 21 ++++++++++++++------- br/pkg/task/stream.go | 2 +- domain/domain.go | 2 +- store/gcworker/gc_worker.go | 10 +++++----- 6 files changed, 32 insertions(+), 15 deletions(-) diff --git a/br/pkg/conn/conn.go b/br/pkg/conn/conn.go index 1ecb1d732ac13..34ba7608c5b5a 100644 --- a/br/pkg/conn/conn.go +++ b/br/pkg/conn/conn.go @@ -245,6 +245,11 @@ func (mgr *Mgr) GetTLSConfig() *tls.Config { return mgr.StoreManager.TLSConfig() } +// GetStore gets the tikvStore. +func (mgr *Mgr) GetStore() tikv.Storage { + return mgr.tikvStore +} + // GetLockResolver gets the LockResolver. func (mgr *Mgr) GetLockResolver() *txnlock.LockResolver { return mgr.tikvStore.GetLockResolver() diff --git a/br/pkg/streamhelper/advancer.go b/br/pkg/streamhelper/advancer.go index f5462c543aeec..e4454103cbbf1 100644 --- a/br/pkg/streamhelper/advancer.go +++ b/br/pkg/streamhelper/advancer.go @@ -17,7 +17,10 @@ import ( "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/kv" "github.com/pingcap/tidb/metrics" + "github.com/pingcap/tidb/store/gcworker" + tikvstore "github.com/tikv/client-go/v2/kv" "github.com/tikv/client-go/v2/oracle" + "github.com/tikv/client-go/v2/txnkv/rangetask" "go.uber.org/multierr" "go.uber.org/zap" "golang.org/x/sync/errgroup" @@ -372,7 +375,9 @@ func (c *CheckpointAdvancer) setCheckpoint(s spans.Valued) bool { // lastCheckpoint is too long enough. // assume the cluster has expired locks for whatever reasons. if c.lastCheckpoint.needResolveLocks() { - + handler := func(ctx context.Context, r tikvstore.KeyRange) (rangetask.TaskStat, error) { + return gcworker.ResolveLocksForRange(ctx, w.uuid, c.env, safePoint, tryResolveLocksTS, r.StartKey, r.EndKey) + } } if cp.TS <= c.lastCheckpoint.TS { return false diff --git a/br/pkg/streamhelper/advancer_env.go b/br/pkg/streamhelper/advancer_env.go index 1527def725f9a..46fc5c4efb623 100644 --- a/br/pkg/streamhelper/advancer_env.go +++ b/br/pkg/streamhelper/advancer_env.go @@ -9,7 +9,9 @@ import ( logbackup "github.com/pingcap/kvproto/pkg/logbackuppb" "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/config" + "github.com/pingcap/tidb/store/gcworker" "github.com/pingcap/tidb/util/engine" + "github.com/tikv/client-go/v2/tikv" pd "github.com/tikv/pd/client" clientv3 "go.etcd.io/etcd/client/v3" "google.golang.org/grpc" @@ -29,6 +31,8 @@ type Env interface { LogBackupService // StreamMeta connects to the metadata service (normally PD). StreamMeta + // GCLockResolver try to resolve locks when region checkpoint stopped. + gcworker.GCLockResolver } // PDRegionScanner is a simple wrapper over PD @@ -83,6 +87,7 @@ type clusterEnv struct { clis *utils.StoreManager *AdvancerExt PDRegionScanner + *gcworker.GCWorkerLockResolver } // GetLogBackupClient gets the log backup client. @@ -98,16 +103,17 @@ func (t clusterEnv) GetLogBackupClient(ctx context.Context, storeID uint64) (log } // CliEnv creates the Env for CLI usage. -func CliEnv(cli *utils.StoreManager, etcdCli *clientv3.Client) Env { +func CliEnv(cli *utils.StoreManager, tikvStore tikv.Storage, etcdCli *clientv3.Client) Env { return clusterEnv{ - clis: cli, - AdvancerExt: &AdvancerExt{MetaDataClient: *NewMetaDataClient(etcdCli)}, - PDRegionScanner: PDRegionScanner{cli.PDClient()}, + clis: cli, + AdvancerExt: &AdvancerExt{MetaDataClient: *NewMetaDataClient(etcdCli)}, + PDRegionScanner: PDRegionScanner{cli.PDClient()}, + GCWorkerLockResolver: &gcworker.GCWorkerLockResolver{TiKvStore: tikvStore}, } } // TiDBEnv creates the Env by TiDB config. -func TiDBEnv(pdCli pd.Client, etcdCli *clientv3.Client, conf *config.Config) (Env, error) { +func TiDBEnv(tikvStore tikv.Storage, pdCli pd.Client, etcdCli *clientv3.Client, conf *config.Config) (Env, error) { tconf, err := conf.GetTiKVConfig().Security.ToTLSConfig() if err != nil { return nil, err @@ -117,8 +123,9 @@ func TiDBEnv(pdCli pd.Client, etcdCli *clientv3.Client, conf *config.Config) (En Time: time.Duration(conf.TiKVClient.GrpcKeepAliveTime) * time.Second, Timeout: time.Duration(conf.TiKVClient.GrpcKeepAliveTimeout) * time.Second, }, tconf), - AdvancerExt: &AdvancerExt{MetaDataClient: *NewMetaDataClient(etcdCli)}, - PDRegionScanner: PDRegionScanner{Client: pdCli}, + AdvancerExt: &AdvancerExt{MetaDataClient: *NewMetaDataClient(etcdCli)}, + PDRegionScanner: PDRegionScanner{Client: pdCli}, + GCWorkerLockResolver: &gcworker.GCWorkerLockResolver{TiKvStore: tikvStore}, }, nil } diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index 53ff04f079232..0a76690693894 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -876,7 +876,7 @@ func RunStreamAdvancer(c context.Context, g glue.Glue, cmdName string, cfg *Stre if err != nil { return err } - env := streamhelper.CliEnv(mgr.StoreManager, etcdCLI) + env := streamhelper.CliEnv(mgr.StoreManager, mgr.GetStore(), etcdCLI) advancer := streamhelper.NewCheckpointAdvancer(env) advancer.UpdateConfig(cfg.AdvancerCfg) advancerd := daemon.New(advancer, streamhelper.OwnerManagerForLogBackup(ctx, etcdCLI), cfg.AdvancerCfg.TickDuration) diff --git a/domain/domain.go b/domain/domain.go index c1bd3597d94f5..0490a3fcf1e0c 100644 --- a/domain/domain.go +++ b/domain/domain.go @@ -1274,7 +1274,7 @@ func (do *Domain) initLogBackup(ctx context.Context, pdClient pd.Client) error { log.Warn("pd / etcd client not provided, won't begin Advancer.") return nil } - env, err := streamhelper.TiDBEnv(pdClient, do.etcdClient, cfg) + env, err := streamhelper.TiDBEnv(do.Store, pdClient, do.etcdClient, cfg) if err != nil { return err } diff --git a/store/gcworker/gc_worker.go b/store/gcworker/gc_worker.go index 8b6b8c5e65d6c..1c6b5c86f020b 100644 --- a/store/gcworker/gc_worker.go +++ b/store/gcworker/gc_worker.go @@ -75,20 +75,20 @@ type GCLockResolver interface { } type GCWorkerLockResolver struct { - tikvStore tikv.Storage + TiKvStore tikv.Storage } func (w *GCWorkerLockResolver) LocateKey(bo *tikv.Backoffer, key []byte) (*tikv.KeyLocation, error) { - return w.tikvStore.GetRegionCache().LocateKey(bo, key) + return w.TiKvStore.GetRegionCache().LocateKey(bo, key) } func (w *GCWorkerLockResolver) ResolveLocks(bo *tikv.Backoffer, tryLocks []*txnlock.Lock, forceLocks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) { - ok, err := w.tikvStore.GetLockResolver().BatchResolveLocks(bo, forceLocks, loc) + ok, err := w.TiKvStore.GetLockResolver().BatchResolveLocks(bo, forceLocks, loc) if err != nil || !ok { return ok, err } // callerStartTS is always 0. so no need to make it a parameter here. - _, err = w.tikvStore.GetLockResolver().ResolveLocks(bo, 0, tryLocks) + _, err = w.TiKvStore.GetLockResolver().ResolveLocks(bo, 0, tryLocks) return err == nil, errors.Trace(err) } @@ -97,7 +97,7 @@ func (w *GCWorkerLockResolver) ScanLocks(key []byte, regionID uint64, maxVersion } func (w *GCWorkerLockResolver) SendReq(bo *tikv.Backoffer, req *tikvrpc.Request, regionID tikv.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) { - return w.tikvStore.SendReq(bo, req, regionID, timeout) + return w.TiKvStore.SendReq(bo, req, regionID, timeout) } // GCWorker periodically triggers GC process on tikv server. From a0742dc09d819fbdd42886b3593158f66202534c Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 8 Aug 2023 14:53:23 +0800 Subject: [PATCH 05/45] avoid cycle import --- br/pkg/streamhelper/advancer.go | 4 +- br/pkg/streamhelper/advancer_env.go | 58 ++++++- store/gcworker/gc_worker.go | 226 ++-------------------------- store/gcworker/gc_worker_test.go | 34 +---- util/gcutil/gcutil.go | 159 +++++++++++++++++++ 5 files changed, 232 insertions(+), 249 deletions(-) diff --git a/br/pkg/streamhelper/advancer.go b/br/pkg/streamhelper/advancer.go index e4454103cbbf1..0f38cb6e7dcd2 100644 --- a/br/pkg/streamhelper/advancer.go +++ b/br/pkg/streamhelper/advancer.go @@ -17,7 +17,7 @@ import ( "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/kv" "github.com/pingcap/tidb/metrics" - "github.com/pingcap/tidb/store/gcworker" + "github.com/pingcap/tidb/util/gcutil" tikvstore "github.com/tikv/client-go/v2/kv" "github.com/tikv/client-go/v2/oracle" "github.com/tikv/client-go/v2/txnkv/rangetask" @@ -376,7 +376,7 @@ func (c *CheckpointAdvancer) setCheckpoint(s spans.Valued) bool { // assume the cluster has expired locks for whatever reasons. if c.lastCheckpoint.needResolveLocks() { handler := func(ctx context.Context, r tikvstore.KeyRange) (rangetask.TaskStat, error) { - return gcworker.ResolveLocksForRange(ctx, w.uuid, c.env, safePoint, tryResolveLocksTS, r.StartKey, r.EndKey) + return gcutil.ResolveLocksForRange(ctx, "log backup advancer", c.env, safePoint, r.StartKey, r.EndKey) } } if cp.TS <= c.lastCheckpoint.TS { diff --git a/br/pkg/streamhelper/advancer_env.go b/br/pkg/streamhelper/advancer_env.go index 46fc5c4efb623..6885feb2ecf03 100644 --- a/br/pkg/streamhelper/advancer_env.go +++ b/br/pkg/streamhelper/advancer_env.go @@ -6,12 +6,15 @@ import ( "context" "time" + "github.com/pingcap/errors" logbackup "github.com/pingcap/kvproto/pkg/logbackuppb" "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/config" - "github.com/pingcap/tidb/store/gcworker" "github.com/pingcap/tidb/util/engine" + "github.com/pingcap/tidb/util/gcutil" "github.com/tikv/client-go/v2/tikv" + "github.com/tikv/client-go/v2/tikvrpc" + "github.com/tikv/client-go/v2/txnkv/txnlock" pd "github.com/tikv/pd/client" clientv3 "go.etcd.io/etcd/client/v3" "google.golang.org/grpc" @@ -32,7 +35,7 @@ type Env interface { // StreamMeta connects to the metadata service (normally PD). StreamMeta // GCLockResolver try to resolve locks when region checkpoint stopped. - gcworker.GCLockResolver + gcutil.GCLockResolver } // PDRegionScanner is a simple wrapper over PD @@ -87,7 +90,7 @@ type clusterEnv struct { clis *utils.StoreManager *AdvancerExt PDRegionScanner - *gcworker.GCWorkerLockResolver + *AdvancerLockResolver } // GetLogBackupClient gets the log backup client. @@ -108,7 +111,7 @@ func CliEnv(cli *utils.StoreManager, tikvStore tikv.Storage, etcdCli *clientv3.C clis: cli, AdvancerExt: &AdvancerExt{MetaDataClient: *NewMetaDataClient(etcdCli)}, PDRegionScanner: PDRegionScanner{cli.PDClient()}, - GCWorkerLockResolver: &gcworker.GCWorkerLockResolver{TiKvStore: tikvStore}, + AdvancerLockResolver: &AdvancerLockResolver{TiKvStore: tikvStore}, } } @@ -125,7 +128,7 @@ func TiDBEnv(tikvStore tikv.Storage, pdCli pd.Client, etcdCli *clientv3.Client, }, tconf), AdvancerExt: &AdvancerExt{MetaDataClient: *NewMetaDataClient(etcdCli)}, PDRegionScanner: PDRegionScanner{Client: pdCli}, - GCWorkerLockResolver: &gcworker.GCWorkerLockResolver{TiKvStore: tikvStore}, + AdvancerLockResolver: &AdvancerLockResolver{TiKvStore: tikvStore}, }, nil } @@ -144,3 +147,48 @@ type StreamMeta interface { // ClearV3GlobalCheckpointForTask clears the global checkpoint to the meta store. ClearV3GlobalCheckpointForTask(ctx context.Context, taskName string) error } + +type AdvancerLockResolver struct { + TiKvStore tikv.Storage +} + +func (w *AdvancerLockResolver) LocateKey(bo *tikv.Backoffer, key []byte) (*tikv.KeyLocation, error) { + return w.TiKvStore.GetRegionCache().LocateKey(bo, key) +} + +// ResolveLocks tries to resolve expired locks with batch method. +// Travesal the given locks and check that: +// 1. If the ts of lock is equal with or smaller than forceResolveLocksTS(acually equals safepoint), +// it will rollback the txn, no matter the lock is expired of not. +// 2. If the ts of lock is larger than forceResolveLocksTS, it will check status of the txn. +// Resolve the lock if txn is expired, Or do nothing. +func (w *AdvancerLockResolver) ResolveLocks(bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID, safePoint uint64) (bool, error) { + if len(locks) == 0 { + return true, nil + } + + forceResolveLocks := make([]*txnlock.Lock, 0, len(locks)) + tryResolveLocks := make([]*txnlock.Lock, 0, len(locks)) + for _, l := range locks { + if l.TxnID <= safePoint { + forceResolveLocks = append(forceResolveLocks, l) + } else { + tryResolveLocks = append(tryResolveLocks, l) + } + } + + ok, err := w.TiKvStore.GetLockResolver().BatchResolveLocks(bo, forceResolveLocks, loc) + if err != nil || !ok { + return ok, err + } + _, err = w.TiKvStore.GetLockResolver().ResolveLocks(bo, 0, tryResolveLocks) + return err == nil, errors.Trace(err) +} + +func (w *AdvancerLockResolver) ScanLocks(key []byte, regionID uint64) []*txnlock.Lock { + return nil +} + +func (w *AdvancerLockResolver) SendReq(bo *tikv.Backoffer, req *tikvrpc.Request, regionID tikv.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) { + return w.TiKvStore.SendReq(bo, req, regionID, timeout) +} diff --git a/store/gcworker/gc_worker.go b/store/gcworker/gc_worker.go index 1c6b5c86f020b..faedb029a2aec 100644 --- a/store/gcworker/gc_worker.go +++ b/store/gcworker/gc_worker.go @@ -48,6 +48,7 @@ import ( "github.com/pingcap/tidb/tablecodec" "github.com/pingcap/tidb/util/codec" "github.com/pingcap/tidb/util/dbterror" + "github.com/pingcap/tidb/util/gcutil" "github.com/pingcap/tidb/util/logutil" tikverr "github.com/tikv/client-go/v2/error" tikvstore "github.com/tikv/client-go/v2/kv" @@ -62,18 +63,6 @@ import ( "golang.org/x/exp/slices" ) -type GCLockResolver interface { - LocateKey(bo *tikv.Backoffer, key []byte) (*tikv.KeyLocation, error) - - // Try batch resolve locks first. if not ok, fallback to normal resolve locks - ResolveLocks(bo *tikv.Backoffer, tryLocks []*txnlock.Lock, forceLocks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) - - // only used for mock test - ScanLocks(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock - - SendReq(bo *tikv.Backoffer, req *tikvrpc.Request, regionID tikv.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) -} - type GCWorkerLockResolver struct { TiKvStore tikv.Storage } @@ -82,17 +71,11 @@ func (w *GCWorkerLockResolver) LocateKey(bo *tikv.Backoffer, key []byte) (*tikv. return w.TiKvStore.GetRegionCache().LocateKey(bo, key) } -func (w *GCWorkerLockResolver) ResolveLocks(bo *tikv.Backoffer, tryLocks []*txnlock.Lock, forceLocks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) { - ok, err := w.TiKvStore.GetLockResolver().BatchResolveLocks(bo, forceLocks, loc) - if err != nil || !ok { - return ok, err - } - // callerStartTS is always 0. so no need to make it a parameter here. - _, err = w.TiKvStore.GetLockResolver().ResolveLocks(bo, 0, tryLocks) - return err == nil, errors.Trace(err) +func (w *GCWorkerLockResolver) ResolveLocks(bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID, safePoint uint64) (bool, error) { + return w.TiKvStore.GetLockResolver().BatchResolveLocks(bo, locks, loc) } -func (w *GCWorkerLockResolver) ScanLocks(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock { +func (w *GCWorkerLockResolver) ScanLocks(key []byte, regionID uint64) []*txnlock.Lock { return nil } @@ -111,7 +94,7 @@ type GCWorker struct { lastFinish time.Time cancel context.CancelFunc done chan error - lockResolver GCLockResolver + lockResolver gcutil.GCLockResolver } // NewGCWorker creates a GCWorker instance. @@ -186,9 +169,6 @@ const ( gcTryResolveLocksIntervalFromNow = time.Minute * 5 - // We don't want gc to sweep out the cached info belong to other processes, like coprocessor. - gcScanLockLimit = txnlock.ResolvedCacheSize / 2 - gcEnableKey = "tikv_gc_enable" gcDefaultEnableValue = true @@ -1192,23 +1172,12 @@ func (w *GCWorker) checkUsePhysicalScanLock() (bool, error) { } func (w *GCWorker) resolveLocks(ctx context.Context, safePoint uint64, concurrency int, usePhysical bool) (bool, error) { - // tryResolveLocksTS is defined as `now() - gcTryResolveLocksIntervalFromNow`, - // it used for trying resolve locks, ts of which is smaller than tryResolveLocksTS and expired. - tryResolveLocksTS, err := w.getTryResolveLocksTS() - if err != nil { - return false, err - } - - if tryResolveLocksTS < safePoint { - tryResolveLocksTS = safePoint - } - if !usePhysical { - return false, w.legacyResolveLocks(ctx, safePoint, tryResolveLocksTS, concurrency) + return false, w.legacyResolveLocks(ctx, safePoint, concurrency) } // First try resolve locks with physical scan - err = w.resolveLocksPhysical(ctx, safePoint) + err := w.resolveLocksPhysical(ctx, safePoint) if err == nil { return true, nil } @@ -1216,28 +1185,25 @@ func (w *GCWorker) resolveLocks(ctx context.Context, safePoint uint64, concurren logutil.Logger(ctx).Error("resolve locks with physical scan failed, trying fallback to legacy resolve lock", zap.String("category", "gc worker"), zap.String("uuid", w.uuid), zap.Uint64("safePoint", safePoint), - zap.Uint64("try-resolve-locks-ts", tryResolveLocksTS), zap.Error(err)) - return false, w.legacyResolveLocks(ctx, safePoint, tryResolveLocksTS, concurrency) + return false, w.legacyResolveLocks(ctx, safePoint, concurrency) } func (w *GCWorker) legacyResolveLocks( ctx context.Context, safePoint uint64, - tryResolveLocksTS uint64, concurrency int, ) error { metrics.GCWorkerCounter.WithLabelValues("resolve_locks").Inc() logutil.Logger(ctx).Info("start resolve locks", zap.String("category", "gc worker"), zap.String("uuid", w.uuid), zap.Uint64("safePoint", safePoint), - zap.Uint64("try-resolve-locks-ts", tryResolveLocksTS), zap.Int("concurrency", concurrency)) startTime := time.Now() handler := func(ctx context.Context, r tikvstore.KeyRange) (rangetask.TaskStat, error) { - return ResolveLocksForRange(ctx, w.uuid, w.lockResolver, safePoint, tryResolveLocksTS, r.StartKey, r.EndKey) + return gcutil.ResolveLocksForRange(ctx, w.uuid, w.lockResolver, safePoint, r.StartKey, r.EndKey) } runner := rangetask.NewRangeTaskRunner("resolve-locks-runner", w.tikvStore, concurrency, handler) @@ -1254,7 +1220,6 @@ func (w *GCWorker) legacyResolveLocks( logutil.Logger(ctx).Info("finish resolve locks", zap.String("category", "gc worker"), zap.String("uuid", w.uuid), zap.Uint64("safePoint", safePoint), - zap.Uint64("try-resolve-locks-ts", tryResolveLocksTS), zap.Int("regions", runner.CompletedRegions())) metrics.GCHistogram.WithLabelValues("resolve_locks").Observe(time.Since(startTime).Seconds()) return nil @@ -1272,177 +1237,6 @@ func (w *GCWorker) getTryResolveLocksTS() (uint64, error) { return gcTryResolveLockTS, nil } -// batchResolveExpiredLocks tries to resolve expired locks with batch method. -// Travesal the given locks and check that: -// 1. If the ts of lock is equal with or smaller than forceResolveLocksTS(acually equals safepoint), -// it will rollback the txn, no matter the lock is expired of not. -// 2. If the ts of lock is larger than forceResolveLocksTS, it will check status of the txn. -// Resolve the lock if txn is expired, Or do nothing. -func batchResolveExpiredLocks( - lockResolver GCLockResolver, - bo *tikv.Backoffer, - locks []*txnlock.Lock, - loc tikv.RegionVerID, - forceResolveLocksTS uint64, - tryResolveLocksTS uint64, -) (bool, error) { - if len(locks) == 0 { - return true, nil - } - - forceResolveLocks := make([]*txnlock.Lock, 0, len(locks)) - tryResolveLocks := make([]*txnlock.Lock, 0, len(locks)) - for _, l := range locks { - if l.TxnID <= forceResolveLocksTS { - forceResolveLocks = append(forceResolveLocks, l) - } else { - tryResolveLocks = append(tryResolveLocks, l) - } - } - - logutil.BgLogger().Debug("batchResolveExpiredLocks", - zap.Uint64("force-resolve-locks-ts", forceResolveLocksTS), - zap.Uint64("try-resolve-locks-ts", tryResolveLocksTS), - zap.Int("force-resolve-locks-count", len(forceResolveLocks)), - zap.Int("try-resolve-locks-count", len(tryResolveLocks))) - - return lockResolver.ResolveLocks(bo, tryResolveLocks, forceResolveLocks, loc) -} - -func ResolveLocksForRange( - ctx context.Context, - uuid string, - lockResolver GCLockResolver, - forceResolveLocksTS uint64, - tryResolveLocksTS uint64, - startKey []byte, - endKey []byte, -) (rangetask.TaskStat, error) { - // for scan lock request, we must return all locks even if they are generated - // by the same transaction. because gc worker need to make sure all locks have been - // cleaned. - req := tikvrpc.NewRequest(tikvrpc.CmdScanLock, &kvrpcpb.ScanLockRequest{ - MaxVersion: tryResolveLocksTS, - Limit: gcScanLockLimit, - }, kvrpcpb.Context{ - RequestSource: tikvutil.RequestSourceFromCtx(ctx), - }) - - failpoint.Inject("lowScanLockLimit", func() { - req.ScanLock().Limit = 3 - }) - - var stat rangetask.TaskStat - key := startKey - bo := tikv.NewGcResolveLockMaxBackoffer(ctx) - failpoint.Inject("setGcResolveMaxBackoff", func(v failpoint.Value) { - sleep := v.(int) - // cooperate with github.com/tikv/client-go/v2/locate/invalidCacheAndRetry - //nolint: SA1029 - ctx = context.WithValue(ctx, "injectedBackoff", struct{}{}) - bo = tikv.NewBackofferWithVars(ctx, sleep, nil) - }) -retryScanAndResolve: - for { - select { - case <-ctx.Done(): - return stat, errors.New("[gc worker] gc job canceled") - default: - } - - req.ScanLock().StartKey = key - loc, err := lockResolver.LocateKey(bo, key) - if err != nil { - return stat, errors.Trace(err) - } - req.ScanLock().EndKey = loc.EndKey - resp, err := lockResolver.SendReq(bo, req, loc.Region, tikv.ReadTimeoutMedium) - if err != nil { - return stat, errors.Trace(err) - } - regionErr, err := resp.GetRegionError() - if err != nil { - return stat, errors.Trace(err) - } - if regionErr != nil { - err = bo.Backoff(tikv.BoRegionMiss(), errors.New(regionErr.String())) - if err != nil { - return stat, errors.Trace(err) - } - continue - } - if resp.Resp == nil { - return stat, errors.Trace(tikverr.ErrBodyMissing) - } - locksResp := resp.Resp.(*kvrpcpb.ScanLockResponse) - if locksResp.GetError() != nil { - return stat, errors.Errorf("unexpected scanlock error: %s", locksResp) - } - locksInfo := locksResp.GetLocks() - locks := make([]*txnlock.Lock, 0, len(locksInfo)) - for _, li := range locksInfo { - locks = append(locks, txnlock.NewLock(li)) - } - locks = append(locks, lockResolver.ScanLocks(key, loc.Region.GetID(), tryResolveLocksTS)...) - locForResolve := loc - for { - ok, err1 := batchResolveExpiredLocks(lockResolver, bo, locks, locForResolve.Region, forceResolveLocksTS, tryResolveLocksTS) - if err1 != nil { - return stat, errors.Trace(err1) - } - if !ok { - err = bo.Backoff(tikv.BoTxnLock(), errors.Errorf("remain locks: %d", len(locks))) - if err != nil { - return stat, errors.Trace(err) - } - stillInSame, refreshedLoc, err := tryRelocateLocksRegion(bo, lockResolver, locks) - if err != nil { - return stat, errors.Trace(err) - } - if stillInSame { - locForResolve = refreshedLoc - continue - } - continue retryScanAndResolve - } - break - } - if len(locks) < gcScanLockLimit { - stat.CompletedRegions++ - key = loc.EndKey - } else { - logutil.Logger(ctx).Info("region has more than limit locks", zap.String("category", "gc worker"), - zap.String("uuid", uuid), - zap.Uint64("region", locForResolve.Region.GetID()), - zap.Int("scan lock limit", gcScanLockLimit)) - metrics.GCRegionTooManyLocksCounter.Inc() - key = locks[len(locks)-1].Key - } - - if len(key) == 0 || (len(endKey) != 0 && bytes.Compare(key, endKey) >= 0) { - break - } - bo = tikv.NewGcResolveLockMaxBackoffer(ctx) - failpoint.Inject("setGcResolveMaxBackoff", func(v failpoint.Value) { - sleep := v.(int) - bo = tikv.NewBackofferWithVars(ctx, sleep, nil) - }) - } - return stat, nil -} - -func tryRelocateLocksRegion(bo *tikv.Backoffer, lockResolver GCLockResolver, locks []*txnlock.Lock) (stillInSameRegion bool, refreshedLoc *tikv.KeyLocation, err error) { - if len(locks) == 0 { - return - } - refreshedLoc, err = lockResolver.LocateKey(bo, locks[0].Key) - if err != nil { - return - } - stillInSameRegion = refreshedLoc.Contains(locks[len(locks)-1].Key) - return -} - // resolveLocksPhysical uses TiKV's `PhysicalScanLock` to scan stale locks in the cluster and resolve them. It tries to // ensure no lock whose ts <= safePoint is left. func (w *GCWorker) resolveLocksPhysical(ctx context.Context, safePoint uint64) error { @@ -2472,7 +2266,7 @@ func newMergeLockScanner(safePoint uint64, client tikv.Client, stores map[uint64 safePoint: safePoint, client: client, stores: stores, - scanLockLimit: gcScanLockLimit, + scanLockLimit: gcutil.GCScanLockLimit, } failpoint.Inject("lowPhysicalScanLockLimit", func() { scanner.scanLockLimit = 3 diff --git a/store/gcworker/gc_worker_test.go b/store/gcworker/gc_worker_test.go index 7744842e423d5..a4f1b9f20c863 100644 --- a/store/gcworker/gc_worker_test.go +++ b/store/gcworker/gc_worker_test.go @@ -53,7 +53,7 @@ type mockGCWorkerLockResolver struct { tikvStore tikv.Storage forceResolveLocksTS uint64 tryResolveLocksTS uint64 - scanLocks func(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock + scanLocks func(key []byte, regionID uint64) []*txnlock.Lock batchResolveLocks func(locks []*txnlock.Lock, regionID tikv.RegionVerID, safepoint uint64) (ok bool, err error) resolveLocks func(locks []*txnlock.Lock, lowResolutionTS uint64) (int64, error) } @@ -66,18 +66,12 @@ func (l *mockGCWorkerLockResolver) SendReq(bo *tikv.Backoffer, req *tikvrpc.Requ return l.tikvStore.SendReq(bo, req, regionID, timeout) } -func (l *mockGCWorkerLockResolver) ScanLocks(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock { - return l.scanLocks(key, regionID, maxVersion) +func (l *mockGCWorkerLockResolver) ScanLocks(key []byte, regionID uint64) []*txnlock.Lock { + return l.scanLocks(key, regionID) } -func (l *mockGCWorkerLockResolver) ResolveLocks(bo *tikv.Backoffer, tryLocks []*txnlock.Lock, forceLocks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) { - ok, err := l.batchResolveLocks(forceLocks, loc, l.forceResolveLocksTS) - if err != nil || !ok { - return ok, err - } - - _, err = l.resolveLocks(tryLocks, l.tryResolveLocksTS) - return err == nil, err +func (l *mockGCWorkerLockResolver) ResolveLocks(bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID, safePoint uint64) (bool, error) { + return l.batchResolveLocks(locks, loc, l.forceResolveLocksTS) } type mockGCWorkerClient struct { @@ -303,16 +297,6 @@ func TestGetOracleTime(t *testing.T) { timeEqual(t, t2, t1.Add(time.Second*10), time.Millisecond*10) } -func TestGetLowResolveTS(t *testing.T) { - s := createGCWorkerSuite(t) - - lowResolveTS, err := s.gcWorker.getTryResolveLocksTS() - require.NoError(t, err) - - lowResolveTime := oracle.GetTimeFromTS(lowResolveTS) - timeEqual(t, time.Now(), lowResolveTime.Add(gcTryResolveLocksIntervalFromNow), time.Millisecond*10) -} - func TestMinStartTS(t *testing.T) { s := createGCWorkerSuite(t) @@ -1109,15 +1093,13 @@ func TestResolveLockRangeMeetRegionCacheMiss(t *testing.T) { tikvStore: s.tikvStore, forceResolveLocksTS: safepointTS, tryResolveLocksTS: lowResolveTS, - scanLocks: func(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock { + scanLocks: func(key []byte, regionID uint64) []*txnlock.Lock { *scanCntRef++ locks := make([]*txnlock.Lock, 0) for _, l := range allLocks { - if l.TxnID <= maxVersion { - locks = append(locks, l) - scanLockCnt++ - } + locks = append(locks, l) + scanLockCnt++ } return locks }, diff --git a/util/gcutil/gcutil.go b/util/gcutil/gcutil.go index 9474cace52378..73509b74a854a 100644 --- a/util/gcutil/gcutil.go +++ b/util/gcutil/gcutil.go @@ -15,22 +15,48 @@ package gcutil import ( + "bytes" "context" + "time" "github.com/pingcap/errors" + "github.com/pingcap/failpoint" + "github.com/pingcap/kvproto/pkg/kvrpcpb" "github.com/pingcap/tidb/kv" + "github.com/pingcap/tidb/metrics" "github.com/pingcap/tidb/parser/model" "github.com/pingcap/tidb/sessionctx" "github.com/pingcap/tidb/sessionctx/variable" + "github.com/pingcap/tidb/util/logutil" "github.com/pingcap/tidb/util/sqlexec" + tikverr "github.com/tikv/client-go/v2/error" "github.com/tikv/client-go/v2/oracle" + "github.com/tikv/client-go/v2/tikv" + "github.com/tikv/client-go/v2/tikvrpc" + "github.com/tikv/client-go/v2/txnkv/rangetask" + "github.com/tikv/client-go/v2/txnkv/txnlock" "github.com/tikv/client-go/v2/util" + "go.uber.org/zap" ) const ( selectVariableValueSQL = `SELECT HIGH_PRIORITY variable_value FROM mysql.tidb WHERE variable_name=%?` + + // We don't want gc to sweep out the cached info belong to other processes, like coprocessor. + GCScanLockLimit = txnlock.ResolvedCacheSize / 2 ) +type GCLockResolver interface { + LocateKey(bo *tikv.Backoffer, key []byte) (*tikv.KeyLocation, error) + + ResolveLocks(bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID, safePoint uint64) (bool, error) + + // only used for mock test + ScanLocks(key []byte, regionID uint64) []*txnlock.Lock + + SendReq(bo *tikv.Backoffer, req *tikvrpc.Request, regionID tikv.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) +} + // CheckGCEnable is use to check whether GC is enable. func CheckGCEnable(ctx sessionctx.Context) (enable bool, err error) { val, err := ctx.GetSessionVars().GlobalVarsAccessor.GetGlobalSysVar(variable.TiDBGCEnable) @@ -89,3 +115,136 @@ func GetGCSafePoint(sctx sessionctx.Context) (uint64, error) { ts := oracle.GoTimeToTS(safePointTime) return ts, nil } + +func ResolveLocksForRange( + ctx context.Context, + uuid string, + lockResolver GCLockResolver, + safePoint uint64, + startKey []byte, + endKey []byte, +) (rangetask.TaskStat, error) { + // for scan lock request, we must return all locks even if they are generated + // by the same transaction. because gc worker need to make sure all locks have been + // cleaned. + req := tikvrpc.NewRequest(tikvrpc.CmdScanLock, &kvrpcpb.ScanLockRequest{ + MaxVersion: safePoint, + Limit: GCScanLockLimit, + }, kvrpcpb.Context{ + RequestSource: util.RequestSourceFromCtx(ctx), + }) + + failpoint.Inject("lowScanLockLimit", func() { + req.ScanLock().Limit = 3 + }) + + var stat rangetask.TaskStat + key := startKey + bo := tikv.NewGcResolveLockMaxBackoffer(ctx) + failpoint.Inject("setGcResolveMaxBackoff", func(v failpoint.Value) { + sleep := v.(int) + // cooperate with github.com/tikv/client-go/v2/locate/invalidCacheAndRetry + //nolint: SA1029 + ctx = context.WithValue(ctx, "injectedBackoff", struct{}{}) + bo = tikv.NewBackofferWithVars(ctx, sleep, nil) + }) +retryScanAndResolve: + for { + select { + case <-ctx.Done(): + return stat, errors.New("[gc worker] gc job canceled") + default: + } + + req.ScanLock().StartKey = key + loc, err := lockResolver.LocateKey(bo, key) + if err != nil { + return stat, errors.Trace(err) + } + req.ScanLock().EndKey = loc.EndKey + resp, err := lockResolver.SendReq(bo, req, loc.Region, tikv.ReadTimeoutMedium) + if err != nil { + return stat, errors.Trace(err) + } + regionErr, err := resp.GetRegionError() + if err != nil { + return stat, errors.Trace(err) + } + if regionErr != nil { + err = bo.Backoff(tikv.BoRegionMiss(), errors.New(regionErr.String())) + if err != nil { + return stat, errors.Trace(err) + } + continue + } + if resp.Resp == nil { + return stat, errors.Trace(tikverr.ErrBodyMissing) + } + locksResp := resp.Resp.(*kvrpcpb.ScanLockResponse) + if locksResp.GetError() != nil { + return stat, errors.Errorf("unexpected scanlock error: %s", locksResp) + } + locksInfo := locksResp.GetLocks() + locks := make([]*txnlock.Lock, 0, len(locksInfo)) + for _, li := range locksInfo { + locks = append(locks, txnlock.NewLock(li)) + } + locks = append(locks, lockResolver.ScanLocks(key, loc.Region.GetID())...) + locForResolve := loc + for { + ok, err1 := lockResolver.ResolveLocks(bo, locks, locForResolve.Region, safePoint) + if err1 != nil { + return stat, errors.Trace(err1) + } + if !ok { + err = bo.Backoff(tikv.BoTxnLock(), errors.Errorf("remain locks: %d", len(locks))) + if err != nil { + return stat, errors.Trace(err) + } + stillInSame, refreshedLoc, err := tryRelocateLocksRegion(bo, lockResolver, locks) + if err != nil { + return stat, errors.Trace(err) + } + if stillInSame { + locForResolve = refreshedLoc + continue + } + continue retryScanAndResolve + } + break + } + if len(locks) < GCScanLockLimit { + stat.CompletedRegions++ + key = loc.EndKey + } else { + logutil.Logger(ctx).Info("region has more than limit locks", zap.String("category", "gc worker"), + zap.String("uuid", uuid), + zap.Uint64("region", locForResolve.Region.GetID()), + zap.Int("scan lock limit", GCScanLockLimit)) + metrics.GCRegionTooManyLocksCounter.Inc() + key = locks[len(locks)-1].Key + } + + if len(key) == 0 || (len(endKey) != 0 && bytes.Compare(key, endKey) >= 0) { + break + } + bo = tikv.NewGcResolveLockMaxBackoffer(ctx) + failpoint.Inject("setGcResolveMaxBackoff", func(v failpoint.Value) { + sleep := v.(int) + bo = tikv.NewBackofferWithVars(ctx, sleep, nil) + }) + } + return stat, nil +} + +func tryRelocateLocksRegion(bo *tikv.Backoffer, lockResolver GCLockResolver, locks []*txnlock.Lock) (stillInSameRegion bool, refreshedLoc *tikv.KeyLocation, err error) { + if len(locks) == 0 { + return + } + refreshedLoc, err = lockResolver.LocateKey(bo, locks[0].Key) + if err != nil { + return + } + stillInSameRegion = refreshedLoc.Contains(locks[len(locks)-1].Key) + return +} From 885425f3a358153613549df6f7c1f6cdedcf0922 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 8 Aug 2023 15:36:55 +0800 Subject: [PATCH 06/45] expose tikvStore for lockResolver --- br/pkg/streamhelper/advancer.go | 27 +++++++++++++++++++++----- br/pkg/streamhelper/advancer_env.go | 30 ++++++++--------------------- domain/domain.go | 7 ++++++- store/gcworker/gc_worker.go | 18 +++++------------ util/gcutil/gcutil.go | 20 +++++++++++++------ 5 files changed, 55 insertions(+), 47 deletions(-) diff --git a/br/pkg/streamhelper/advancer.go b/br/pkg/streamhelper/advancer.go index 0f38cb6e7dcd2..428040568361b 100644 --- a/br/pkg/streamhelper/advancer.go +++ b/br/pkg/streamhelper/advancer.go @@ -17,6 +17,7 @@ import ( "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/kv" "github.com/pingcap/tidb/metrics" + "github.com/pingcap/tidb/util/codec" "github.com/pingcap/tidb/util/gcutil" tikvstore "github.com/tikv/client-go/v2/kv" "github.com/tikv/client-go/v2/oracle" @@ -365,19 +366,35 @@ func (c *CheckpointAdvancer) onTaskEvent(ctx context.Context, e TaskEvent) error return nil } -func (c *CheckpointAdvancer) setCheckpoint(s spans.Valued) bool { +func (c *CheckpointAdvancer) setCheckpoint(ctx context.Context, s spans.Valued) bool { cp := newCheckpointWithSpan(s) if cp.TS < c.lastCheckpoint.TS { log.Warn("failed to update global checkpoint: stale", zap.Uint64("old", c.lastCheckpoint.TS), zap.Uint64("new", cp.TS)) return false } - // lastCheckpoint is too long enough. + // lastCheckpoint is not increased too long enough. // assume the cluster has expired locks for whatever reasons. if c.lastCheckpoint.needResolveLocks() { handler := func(ctx context.Context, r tikvstore.KeyRange) (rangetask.TaskStat, error) { - return gcutil.ResolveLocksForRange(ctx, "log backup advancer", c.env, safePoint, r.StartKey, r.EndKey) + // we will scan all lock before cp.TS and try to resolve them by check txn status. + return gcutil.ResolveLocksForRange(ctx, "log backup advancer", c.env, cp.TS, r.StartKey, r.EndKey) } + runner := rangetask.NewRangeTaskRunner("advancer-resolve-locks-runner", c.env.GetStore(), config.DefaultMaxConcurrencyAdvance, handler) + // Run resolve lock on the whole TiKV cluster. it will use startKey/endKey to scan region in PD. so we need encode key here. + encodedStartKey := codec.EncodeBytes([]byte{}, []byte(c.lastCheckpoint.StartKey)) + encodedEndKey := codec.EncodeBytes([]byte{}, []byte(c.lastCheckpoint.EndKey)) + err := runner.RunOnRange(ctx, encodedStartKey, encodedEndKey) + if err != nil { + log.Error("resolve locks failed", zap.String("category", "advancer"), + zap.String("uuid", "log backup advancer"), + zap.Error(err)) + return false + } + log.Info("finish resolve locks", zap.String("category", "advancer"), + zap.String("uuid", "log backup advancer"), + zap.Int("regions", runner.CompletedRegions())) + return false } if cp.TS <= c.lastCheckpoint.TS { return false @@ -396,7 +413,7 @@ func (c *CheckpointAdvancer) advanceCheckpointBy(ctx context.Context, return err } - if c.setCheckpoint(cp) { + if c.setCheckpoint(ctx, cp) { log.Info("uploading checkpoint for task", zap.Stringer("checkpoint", oracle.GetTimeFromTS(cp.Value)), zap.Uint64("checkpoint", cp.Value), @@ -453,7 +470,7 @@ func (c *CheckpointAdvancer) subscribeTick(ctx context.Context) error { func (c *CheckpointAdvancer) importantTick(ctx context.Context) error { c.checkpointsMu.Lock() - c.setCheckpoint(c.checkpoints.Min()) + c.setCheckpoint(ctx, c.checkpoints.Min()) c.checkpointsMu.Unlock() if err := c.env.UploadV3GlobalCheckpointForTask(ctx, c.task.Name, c.lastCheckpoint.TS); err != nil { return errors.Annotate(err, "failed to upload global checkpoint") diff --git a/br/pkg/streamhelper/advancer_env.go b/br/pkg/streamhelper/advancer_env.go index 6885feb2ecf03..314f16f1bb359 100644 --- a/br/pkg/streamhelper/advancer_env.go +++ b/br/pkg/streamhelper/advancer_env.go @@ -156,32 +156,14 @@ func (w *AdvancerLockResolver) LocateKey(bo *tikv.Backoffer, key []byte) (*tikv. return w.TiKvStore.GetRegionCache().LocateKey(bo, key) } -// ResolveLocks tries to resolve expired locks with batch method. -// Travesal the given locks and check that: -// 1. If the ts of lock is equal with or smaller than forceResolveLocksTS(acually equals safepoint), -// it will rollback the txn, no matter the lock is expired of not. -// 2. If the ts of lock is larger than forceResolveLocksTS, it will check status of the txn. -// Resolve the lock if txn is expired, Or do nothing. -func (w *AdvancerLockResolver) ResolveLocks(bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID, safePoint uint64) (bool, error) { +// ResolveLocks tries to resolve expired locks with this method. +// It will check status of the txn. Resolve the lock if txn is expired, Or do nothing. +func (w *AdvancerLockResolver) ResolveLocks(bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) { if len(locks) == 0 { return true, nil } - forceResolveLocks := make([]*txnlock.Lock, 0, len(locks)) - tryResolveLocks := make([]*txnlock.Lock, 0, len(locks)) - for _, l := range locks { - if l.TxnID <= safePoint { - forceResolveLocks = append(forceResolveLocks, l) - } else { - tryResolveLocks = append(tryResolveLocks, l) - } - } - - ok, err := w.TiKvStore.GetLockResolver().BatchResolveLocks(bo, forceResolveLocks, loc) - if err != nil || !ok { - return ok, err - } - _, err = w.TiKvStore.GetLockResolver().ResolveLocks(bo, 0, tryResolveLocks) + _, err := w.TiKvStore.GetLockResolver().ResolveLocks(bo, 0, locks) return err == nil, errors.Trace(err) } @@ -192,3 +174,7 @@ func (w *AdvancerLockResolver) ScanLocks(key []byte, regionID uint64) []*txnlock func (w *AdvancerLockResolver) SendReq(bo *tikv.Backoffer, req *tikvrpc.Request, regionID tikv.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) { return w.TiKvStore.SendReq(bo, req, regionID, timeout) } + +func (w *AdvancerLockResolver) GetStore() tikv.Storage { + return w.TiKvStore +} diff --git a/domain/domain.go b/domain/domain.go index 0490a3fcf1e0c..f9a7d785800a9 100644 --- a/domain/domain.go +++ b/domain/domain.go @@ -1274,7 +1274,12 @@ func (do *Domain) initLogBackup(ctx context.Context, pdClient pd.Client) error { log.Warn("pd / etcd client not provided, won't begin Advancer.") return nil } - env, err := streamhelper.TiDBEnv(do.Store, pdClient, do.etcdClient, cfg) + tikvStore, ok := do.Store().(tikv.Storage) + if !ok { + log.Warn("non tikv store, stop begin Advancer.") + return nil + } + env, err := streamhelper.TiDBEnv(tikvStore, pdClient, do.etcdClient, cfg) if err != nil { return err } diff --git a/store/gcworker/gc_worker.go b/store/gcworker/gc_worker.go index faedb029a2aec..38a5ab89dcdf1 100644 --- a/store/gcworker/gc_worker.go +++ b/store/gcworker/gc_worker.go @@ -71,7 +71,7 @@ func (w *GCWorkerLockResolver) LocateKey(bo *tikv.Backoffer, key []byte) (*tikv. return w.TiKvStore.GetRegionCache().LocateKey(bo, key) } -func (w *GCWorkerLockResolver) ResolveLocks(bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID, safePoint uint64) (bool, error) { +func (w *GCWorkerLockResolver) ResolveLocks(bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) { return w.TiKvStore.GetLockResolver().BatchResolveLocks(bo, locks, loc) } @@ -83,6 +83,10 @@ func (w *GCWorkerLockResolver) SendReq(bo *tikv.Backoffer, req *tikvrpc.Request, return w.TiKvStore.SendReq(bo, req, regionID, timeout) } +func (w *GCWorkerLockResolver) GetStore() tikv.Storage { + return w.TiKvStore +} + // GCWorker periodically triggers GC process on tikv server. type GCWorker struct { uuid string @@ -1225,18 +1229,6 @@ func (w *GCWorker) legacyResolveLocks( return nil } -// getTryResolveLocksTS gets the TryResolveLocksTS -// that is defined as `now() - gcTryResolveLocksIntervalFromNow`. -func (w *GCWorker) getTryResolveLocksTS() (uint64, error) { - now, err := w.tikvStore.CurrentTimestamp(kv.GlobalTxnScope) - if err != nil { - return 0, err - } - - gcTryResolveLockTS := oracle.ComposeTS(oracle.ExtractPhysical(now)-gcTryResolveLocksIntervalFromNow.Milliseconds(), oracle.ExtractLogical(now)) - return gcTryResolveLockTS, nil -} - // resolveLocksPhysical uses TiKV's `PhysicalScanLock` to scan stale locks in the cluster and resolve them. It tries to // ensure no lock whose ts <= safePoint is left. func (w *GCWorker) resolveLocksPhysical(ctx context.Context, safePoint uint64) error { diff --git a/util/gcutil/gcutil.go b/util/gcutil/gcutil.go index 73509b74a854a..684c8795b3ad0 100644 --- a/util/gcutil/gcutil.go +++ b/util/gcutil/gcutil.go @@ -46,15 +46,22 @@ const ( GCScanLockLimit = txnlock.ResolvedCacheSize / 2 ) +// GCLockResolver is used for GCWorker and log backup advancer to resolve locks. +// #Note: Put it here to avoid cycle import type GCLockResolver interface { - LocateKey(bo *tikv.Backoffer, key []byte) (*tikv.KeyLocation, error) + LocateKey(*tikv.Backoffer, []byte) (*tikv.KeyLocation, error) - ResolveLocks(bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID, safePoint uint64) (bool, error) + ResolveLocks(*tikv.Backoffer, []*txnlock.Lock, tikv.RegionVerID) (bool, error) - // only used for mock test - ScanLocks(key []byte, regionID uint64) []*txnlock.Lock + // ScanLocks only used for mock test. + ScanLocks([]byte, uint64) []*txnlock.Lock - SendReq(bo *tikv.Backoffer, req *tikvrpc.Request, regionID tikv.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) + SendReq(*tikv.Backoffer, *tikvrpc.Request, tikv.RegionVerID, time.Duration) (*tikvrpc.Response, error) + + // We need to get tikvStore to build rangerunner. + // FIXME: the most code is in client.go and the store is only used to locate end keys of a region. + // maybe we can move GCLockResolver into client.go. + GetStore() tikv.Storage } // CheckGCEnable is use to check whether GC is enable. @@ -116,6 +123,7 @@ func GetGCSafePoint(sctx sessionctx.Context) (uint64, error) { return ts, nil } +// ResolveLocksForRange is used for GCWorker and log backup advancer. func ResolveLocksForRange( ctx context.Context, uuid string, @@ -192,7 +200,7 @@ retryScanAndResolve: locks = append(locks, lockResolver.ScanLocks(key, loc.Region.GetID())...) locForResolve := loc for { - ok, err1 := lockResolver.ResolveLocks(bo, locks, locForResolve.Region, safePoint) + ok, err1 := lockResolver.ResolveLocks(bo, locks, locForResolve.Region) if err1 != nil { return stat, errors.Trace(err1) } From e0df995680db542aca8bda4d514e1621c6e1189b Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 8 Aug 2023 15:46:49 +0800 Subject: [PATCH 07/45] update test --- store/gcworker/gc_worker.go | 3 +-- store/gcworker/gc_worker_test.go | 24 +++++++++++++++--------- util/gcutil/gcutil.go | 4 ++-- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/store/gcworker/gc_worker.go b/store/gcworker/gc_worker.go index 38a5ab89dcdf1..1aba1768cf85c 100644 --- a/store/gcworker/gc_worker.go +++ b/store/gcworker/gc_worker.go @@ -72,6 +72,7 @@ func (w *GCWorkerLockResolver) LocateKey(bo *tikv.Backoffer, key []byte) (*tikv. } func (w *GCWorkerLockResolver) ResolveLocks(bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) { + // Resolve locks without check txn status. it's safe because it use safepoint to scan locks. return w.TiKvStore.GetLockResolver().BatchResolveLocks(bo, locks, loc) } @@ -171,8 +172,6 @@ const ( gcMinConcurrency = 1 gcMaxConcurrency = 128 - gcTryResolveLocksIntervalFromNow = time.Minute * 5 - gcEnableKey = "tikv_gc_enable" gcDefaultEnableValue = true diff --git a/store/gcworker/gc_worker_test.go b/store/gcworker/gc_worker_test.go index a4f1b9f20c863..9125da4086068 100644 --- a/store/gcworker/gc_worker_test.go +++ b/store/gcworker/gc_worker_test.go @@ -39,6 +39,7 @@ import ( "github.com/pingcap/tidb/parser/model" "github.com/pingcap/tidb/session" "github.com/pingcap/tidb/store/mockstore" + "github.com/pingcap/tidb/util/gcutil" "github.com/stretchr/testify/require" "github.com/tikv/client-go/v2/oracle" "github.com/tikv/client-go/v2/oracle/oracles" @@ -54,7 +55,7 @@ type mockGCWorkerLockResolver struct { forceResolveLocksTS uint64 tryResolveLocksTS uint64 scanLocks func(key []byte, regionID uint64) []*txnlock.Lock - batchResolveLocks func(locks []*txnlock.Lock, regionID tikv.RegionVerID, safepoint uint64) (ok bool, err error) + batchResolveLocks func(locks []*txnlock.Lock, regionID tikv.RegionVerID) (ok bool, err error) resolveLocks func(locks []*txnlock.Lock, lowResolutionTS uint64) (int64, error) } @@ -70,8 +71,13 @@ func (l *mockGCWorkerLockResolver) ScanLocks(key []byte, regionID uint64) []*txn return l.scanLocks(key, regionID) } -func (l *mockGCWorkerLockResolver) ResolveLocks(bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID, safePoint uint64) (bool, error) { - return l.batchResolveLocks(locks, loc, l.forceResolveLocksTS) +func (l *mockGCWorkerLockResolver) ResolveLocks(bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) { + return l.batchResolveLocks(locks, loc) +} + +func (l *mockGCWorkerLockResolver) GetStore() tikv.Storage { + // no use for test + return l.tikvStore } type mockGCWorkerClient struct { @@ -1036,13 +1042,13 @@ func TestResolveLockRangeInfine(t *testing.T) { s := createGCWorkerSuite(t) require.NoError(t, failpoint.Enable("tikvclient/invalidCacheAndRetry", "return(true)")) - require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/store/gcworker/setGcResolveMaxBackoff", "return(1)")) + require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/util/gcutil/setGcResolveMaxBackoff", "return(1)")) defer func() { require.NoError(t, failpoint.Disable("tikvclient/invalidCacheAndRetry")) - require.NoError(t, failpoint.Disable("github.com/pingcap/tidb/store/gcworker/setGcResolveMaxBackoff")) + require.NoError(t, failpoint.Disable("github.com/pingcap/tidb/util/gcutil/setGcResolveMaxBackoff")) }() - _, err := ResolveLocksForRange(gcContext(), s.gcWorker.uuid, s.gcWorker.lockResolver, 1, 3, []byte{0}, []byte{1}) + _, err := gcutil.ResolveLocksForRange(gcContext(), s.gcWorker.uuid, s.gcWorker.lockResolver, 1, []byte{0}, []byte{1}) require.Error(t, err) } @@ -1168,7 +1174,7 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { tikvStore: s.tikvStore, forceResolveLocksTS: 1, tryResolveLocksTS: 3, - scanLocks: func(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock { + scanLocks: func(key []byte, regionID uint64) []*txnlock.Lock { if regionID == s.initRegion.regionID { return []*txnlock.Lock{{Key: []byte("a")}, {Key: []byte("b")}} } @@ -1201,7 +1207,7 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { []*metapb.Region{regionMeta}) require.NoError(t, err) // also let region1 contains all 4 locks - mockGCLockResolver.scanLocks = func(key []byte, regionID uint64, maxVersion uint64) []*txnlock.Lock { + mockGCLockResolver.scanLocks = func(key []byte, regionID uint64) []*txnlock.Lock { if regionID == s.initRegion.regionID { locks := []*txnlock.Lock{ {Key: []byte("a")}, @@ -1224,7 +1230,7 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { } return true, nil } - _, err := ResolveLocksForRange(gcContext(), s.gcWorker.uuid, mockGCLockResolver, 1, 3, []byte(""), []byte("z")) + _, err := gcutil.ResolveLocksForRange(gcContext(), s.gcWorker.uuid, mockGCLockResolver, 1, []byte(""), []byte("z")) require.NoError(t, err) require.Len(t, resolvedLock, 4) expects := [][]byte{[]byte("a"), []byte("b"), []byte("o"), []byte("p")} diff --git a/util/gcutil/gcutil.go b/util/gcutil/gcutil.go index 684c8795b3ad0..f52b9bf06343f 100644 --- a/util/gcutil/gcutil.go +++ b/util/gcutil/gcutil.go @@ -128,7 +128,7 @@ func ResolveLocksForRange( ctx context.Context, uuid string, lockResolver GCLockResolver, - safePoint uint64, + maxVersion uint64, startKey []byte, endKey []byte, ) (rangetask.TaskStat, error) { @@ -136,7 +136,7 @@ func ResolveLocksForRange( // by the same transaction. because gc worker need to make sure all locks have been // cleaned. req := tikvrpc.NewRequest(tikvrpc.CmdScanLock, &kvrpcpb.ScanLockRequest{ - MaxVersion: safePoint, + MaxVersion: maxVersion, Limit: GCScanLockLimit, }, kvrpcpb.Context{ RequestSource: util.RequestSourceFromCtx(ctx), From a209893143224299ca280d800bcd91b90bb87a54 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 8 Aug 2023 16:36:30 +0800 Subject: [PATCH 08/45] address comment --- br/pkg/streamhelper/advancer.go | 27 ---------------- br/pkg/streamhelper/advancer_daemon.go | 45 ++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 27 deletions(-) diff --git a/br/pkg/streamhelper/advancer.go b/br/pkg/streamhelper/advancer.go index 428040568361b..16c08162a34d0 100644 --- a/br/pkg/streamhelper/advancer.go +++ b/br/pkg/streamhelper/advancer.go @@ -17,11 +17,7 @@ import ( "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/kv" "github.com/pingcap/tidb/metrics" - "github.com/pingcap/tidb/util/codec" - "github.com/pingcap/tidb/util/gcutil" - tikvstore "github.com/tikv/client-go/v2/kv" "github.com/tikv/client-go/v2/oracle" - "github.com/tikv/client-go/v2/txnkv/rangetask" "go.uber.org/multierr" "go.uber.org/zap" "golang.org/x/sync/errgroup" @@ -373,29 +369,6 @@ func (c *CheckpointAdvancer) setCheckpoint(ctx context.Context, s spans.Valued) zap.Uint64("old", c.lastCheckpoint.TS), zap.Uint64("new", cp.TS)) return false } - // lastCheckpoint is not increased too long enough. - // assume the cluster has expired locks for whatever reasons. - if c.lastCheckpoint.needResolveLocks() { - handler := func(ctx context.Context, r tikvstore.KeyRange) (rangetask.TaskStat, error) { - // we will scan all lock before cp.TS and try to resolve them by check txn status. - return gcutil.ResolveLocksForRange(ctx, "log backup advancer", c.env, cp.TS, r.StartKey, r.EndKey) - } - runner := rangetask.NewRangeTaskRunner("advancer-resolve-locks-runner", c.env.GetStore(), config.DefaultMaxConcurrencyAdvance, handler) - // Run resolve lock on the whole TiKV cluster. it will use startKey/endKey to scan region in PD. so we need encode key here. - encodedStartKey := codec.EncodeBytes([]byte{}, []byte(c.lastCheckpoint.StartKey)) - encodedEndKey := codec.EncodeBytes([]byte{}, []byte(c.lastCheckpoint.EndKey)) - err := runner.RunOnRange(ctx, encodedStartKey, encodedEndKey) - if err != nil { - log.Error("resolve locks failed", zap.String("category", "advancer"), - zap.String("uuid", "log backup advancer"), - zap.Error(err)) - return false - } - log.Info("finish resolve locks", zap.String("category", "advancer"), - zap.String("uuid", "log backup advancer"), - zap.Int("regions", runner.CompletedRegions())) - return false - } if cp.TS <= c.lastCheckpoint.TS { return false } diff --git a/br/pkg/streamhelper/advancer_daemon.go b/br/pkg/streamhelper/advancer_daemon.go index 4e3b68eb3fbf5..d91b6a6a55bd6 100644 --- a/br/pkg/streamhelper/advancer_daemon.go +++ b/br/pkg/streamhelper/advancer_daemon.go @@ -4,12 +4,21 @@ package streamhelper import ( "context" + "math" + "time" "github.com/google/uuid" + "github.com/pingcap/log" + "github.com/pingcap/tidb/br/pkg/streamhelper/config" "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/metrics" "github.com/pingcap/tidb/owner" + "github.com/pingcap/tidb/util/codec" + "github.com/pingcap/tidb/util/gcutil" + tikvstore "github.com/tikv/client-go/v2/kv" + "github.com/tikv/client-go/v2/txnkv/rangetask" clientv3 "go.etcd.io/etcd/client/v3" + "go.uber.org/zap" ) const ( @@ -35,6 +44,42 @@ func (c *CheckpointAdvancer) OnStart(ctx context.Context) { func (c *CheckpointAdvancer) OnBecomeOwner(ctx context.Context) { metrics.AdvancerOwner.Set(1.0) c.spawnSubscriptionHandler(ctx) + go func() { + tick := time.NewTicker(30 * time.Second) + defer tick.Stop() + for { + select { + case <-ctx.Done(): + // no longger be an owner. return it. + return + case <-tick.C: + { + // lastCheckpoint is not increased too long enough. + // assume the cluster has expired locks for whatever reasons. + if c.lastCheckpoint.needResolveLocks() { + handler := func(ctx context.Context, r tikvstore.KeyRange) (rangetask.TaskStat, error) { + // we will scan all locks and try to resolve them by check txn status. + return gcutil.ResolveLocksForRange(ctx, "log backup advancer", c.env, math.MaxUint64, r.StartKey, r.EndKey) + } + runner := rangetask.NewRangeTaskRunner("advancer-resolve-locks-runner", c.env.GetStore(), config.DefaultMaxConcurrencyAdvance, handler) + // Run resolve lock on the whole TiKV cluster. it will use startKey/endKey to scan region in PD. so we need encode key here. + encodedStartKey := codec.EncodeBytes([]byte{}, []byte(c.lastCheckpoint.StartKey)) + encodedEndKey := codec.EncodeBytes([]byte{}, []byte(c.lastCheckpoint.EndKey)) + err := runner.RunOnRange(ctx, encodedStartKey, encodedEndKey) + if err != nil { + // wait for next tick + log.Error("resolve locks failed", zap.String("category", "advancer"), + zap.String("uuid", "log backup advancer"), + zap.Error(err)) + } + log.Info("finish resolve locks", zap.String("category", "advancer"), + zap.String("uuid", "log backup advancer"), + zap.Int("regions", runner.CompletedRegions())) + } + } + } + } + }() go func() { <-ctx.Done() c.onStop() From ae12e5c710163b19edf782b66c21a074a5480b32 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 9 Aug 2023 10:32:54 +0800 Subject: [PATCH 09/45] fix bazel --- br/pkg/streamhelper/BUILD.bazel | 5 +++++ br/pkg/streamhelper/advancer_daemon.go | 10 ++++++---- br/pkg/streamhelper/advancer_env.go | 7 +++++-- store/gcworker/BUILD.bazel | 5 +++-- util/gcutil/BUILD.bazel | 10 ++++++++++ util/gcutil/gcutil.go | 4 ++-- 6 files changed, 31 insertions(+), 10 deletions(-) diff --git a/br/pkg/streamhelper/BUILD.bazel b/br/pkg/streamhelper/BUILD.bazel index 7509c0098ebf7..bd14d9d43e567 100644 --- a/br/pkg/streamhelper/BUILD.bazel +++ b/br/pkg/streamhelper/BUILD.bazel @@ -29,6 +29,7 @@ go_library( "//owner", "//util/codec", "//util/engine", + "//util/gcutil", "//util/mathutil", "@com_github_gogo_protobuf//proto", "@com_github_golang_protobuf//proto", @@ -41,6 +42,10 @@ go_library( "@com_github_pingcap_log//:log", "@com_github_tikv_client_go_v2//kv", "@com_github_tikv_client_go_v2//oracle", + "@com_github_tikv_client_go_v2//tikv", + "@com_github_tikv_client_go_v2//tikvrpc", + "@com_github_tikv_client_go_v2//txnkv/rangetask", + "@com_github_tikv_client_go_v2//txnkv/txnlock", "@com_github_tikv_pd_client//:client", "@io_etcd_go_etcd_client_v3//:client", "@org_golang_google_grpc//:grpc", diff --git a/br/pkg/streamhelper/advancer_daemon.go b/br/pkg/streamhelper/advancer_daemon.go index d91b6a6a55bd6..f69938407859b 100644 --- a/br/pkg/streamhelper/advancer_daemon.go +++ b/br/pkg/streamhelper/advancer_daemon.go @@ -61,10 +61,12 @@ func (c *CheckpointAdvancer) OnBecomeOwner(ctx context.Context) { // we will scan all locks and try to resolve them by check txn status. return gcutil.ResolveLocksForRange(ctx, "log backup advancer", c.env, math.MaxUint64, r.StartKey, r.EndKey) } - runner := rangetask.NewRangeTaskRunner("advancer-resolve-locks-runner", c.env.GetStore(), config.DefaultMaxConcurrencyAdvance, handler) - // Run resolve lock on the whole TiKV cluster. it will use startKey/endKey to scan region in PD. so we need encode key here. - encodedStartKey := codec.EncodeBytes([]byte{}, []byte(c.lastCheckpoint.StartKey)) - encodedEndKey := codec.EncodeBytes([]byte{}, []byte(c.lastCheckpoint.EndKey)) + runner := rangetask.NewRangeTaskRunner("advancer-resolve-locks-runner", + c.env.GetStore(), config.DefaultMaxConcurrencyAdvance, handler) + // Run resolve lock on the whole TiKV cluster. + // it will use startKey/endKey to scan region in PD. so we need encode key here. + encodedStartKey := codec.EncodeBytes([]byte{}, c.lastCheckpoint.StartKey) + encodedEndKey := codec.EncodeBytes([]byte{}, c.lastCheckpoint.EndKey) err := runner.RunOnRange(ctx, encodedStartKey, encodedEndKey) if err != nil { // wait for next tick diff --git a/br/pkg/streamhelper/advancer_env.go b/br/pkg/streamhelper/advancer_env.go index 314f16f1bb359..e70abee084f5e 100644 --- a/br/pkg/streamhelper/advancer_env.go +++ b/br/pkg/streamhelper/advancer_env.go @@ -158,7 +158,8 @@ func (w *AdvancerLockResolver) LocateKey(bo *tikv.Backoffer, key []byte) (*tikv. // ResolveLocks tries to resolve expired locks with this method. // It will check status of the txn. Resolve the lock if txn is expired, Or do nothing. -func (w *AdvancerLockResolver) ResolveLocks(bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) { +func (w *AdvancerLockResolver) ResolveLocks( + bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) { if len(locks) == 0 { return true, nil } @@ -171,7 +172,9 @@ func (w *AdvancerLockResolver) ScanLocks(key []byte, regionID uint64) []*txnlock return nil } -func (w *AdvancerLockResolver) SendReq(bo *tikv.Backoffer, req *tikvrpc.Request, regionID tikv.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) { +func (w *AdvancerLockResolver) SendReq( + bo *tikv.Backoffer, req *tikvrpc.Request, + regionID tikv.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) { return w.TiKvStore.SendReq(bo, req, regionID, timeout) } diff --git a/store/gcworker/BUILD.bazel b/store/gcworker/BUILD.bazel index d6e494ea0291d..2c28f33096d28 100644 --- a/store/gcworker/BUILD.bazel +++ b/store/gcworker/BUILD.bazel @@ -6,7 +6,6 @@ go_library( importpath = "github.com/pingcap/tidb/store/gcworker", visibility = ["//visibility:public"], deps = [ - "//br/pkg/utils", "//ddl", "//ddl/label", "//ddl/placement", @@ -22,6 +21,7 @@ go_library( "//tablecodec", "//util/codec", "//util/dbterror", + "//util/gcutil", "//util/logutil", "@com_github_pingcap_errors//:errors", "@com_github_pingcap_failpoint//:failpoint", @@ -52,7 +52,7 @@ go_test( embed = [":gcworker"], flaky = True, race = "on", - shard_count = 30, + shard_count = 29, deps = [ "//ddl/placement", "//ddl/util", @@ -64,6 +64,7 @@ go_test( "//store/mockstore", "//testkit/testmain", "//testkit/testsetup", + "//util/gcutil", "@com_github_pingcap_errors//:errors", "@com_github_pingcap_failpoint//:failpoint", "@com_github_pingcap_kvproto//pkg/errorpb", diff --git a/util/gcutil/BUILD.bazel b/util/gcutil/BUILD.bazel index fc6c882078726..e8d1cba0722a4 100644 --- a/util/gcutil/BUILD.bazel +++ b/util/gcutil/BUILD.bazel @@ -7,12 +7,22 @@ go_library( visibility = ["//visibility:public"], deps = [ "//kv", + "//metrics", "//parser/model", "//sessionctx", "//sessionctx/variable", + "//util/logutil", "//util/sqlexec", "@com_github_pingcap_errors//:errors", + "@com_github_pingcap_failpoint//:failpoint", + "@com_github_pingcap_kvproto//pkg/kvrpcpb", + "@com_github_tikv_client_go_v2//error", "@com_github_tikv_client_go_v2//oracle", + "@com_github_tikv_client_go_v2//tikv", + "@com_github_tikv_client_go_v2//tikvrpc", + "@com_github_tikv_client_go_v2//txnkv/rangetask", + "@com_github_tikv_client_go_v2//txnkv/txnlock", "@com_github_tikv_client_go_v2//util", + "@org_uber_go_zap//:zap", ], ) diff --git a/util/gcutil/gcutil.go b/util/gcutil/gcutil.go index f52b9bf06343f..e6159dfa59a8d 100644 --- a/util/gcutil/gcutil.go +++ b/util/gcutil/gcutil.go @@ -42,7 +42,7 @@ import ( const ( selectVariableValueSQL = `SELECT HIGH_PRIORITY variable_value FROM mysql.tidb WHERE variable_name=%?` - // We don't want gc to sweep out the cached info belong to other processes, like coprocessor. + // GCScanLockLimit We don't want gc to sweep out the cached info belong to other processes, like coprocessor. GCScanLockLimit = txnlock.ResolvedCacheSize / 2 ) @@ -153,7 +153,7 @@ func ResolveLocksForRange( sleep := v.(int) // cooperate with github.com/tikv/client-go/v2/locate/invalidCacheAndRetry //nolint: SA1029 - ctx = context.WithValue(ctx, "injectedBackoff", struct{}{}) + ctx = context.WithValue(ctx, "injectedBackoff", struct{}{}) //nolint bo = tikv.NewBackofferWithVars(ctx, sleep, nil) }) retryScanAndResolve: From 5e4ba628aeb09116fed829281e28c9dc825b1ab1 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 9 Aug 2023 11:17:18 +0800 Subject: [PATCH 10/45] remove LocateKey/SendReq in GCLockResolver interface --- br/pkg/streamhelper/advancer_env.go | 11 ----------- br/pkg/streamhelper/basic_lib_for_test.go | 22 ++++++++++++++++++++++ store/gcworker/gc_worker.go | 8 -------- store/gcworker/gc_worker_test.go | 10 +--------- util/gcutil/gcutil.go | 19 +++++++++---------- 5 files changed, 32 insertions(+), 38 deletions(-) diff --git a/br/pkg/streamhelper/advancer_env.go b/br/pkg/streamhelper/advancer_env.go index e70abee084f5e..fd98cd26fc2bd 100644 --- a/br/pkg/streamhelper/advancer_env.go +++ b/br/pkg/streamhelper/advancer_env.go @@ -13,7 +13,6 @@ import ( "github.com/pingcap/tidb/util/engine" "github.com/pingcap/tidb/util/gcutil" "github.com/tikv/client-go/v2/tikv" - "github.com/tikv/client-go/v2/tikvrpc" "github.com/tikv/client-go/v2/txnkv/txnlock" pd "github.com/tikv/pd/client" clientv3 "go.etcd.io/etcd/client/v3" @@ -152,10 +151,6 @@ type AdvancerLockResolver struct { TiKvStore tikv.Storage } -func (w *AdvancerLockResolver) LocateKey(bo *tikv.Backoffer, key []byte) (*tikv.KeyLocation, error) { - return w.TiKvStore.GetRegionCache().LocateKey(bo, key) -} - // ResolveLocks tries to resolve expired locks with this method. // It will check status of the txn. Resolve the lock if txn is expired, Or do nothing. func (w *AdvancerLockResolver) ResolveLocks( @@ -172,12 +167,6 @@ func (w *AdvancerLockResolver) ScanLocks(key []byte, regionID uint64) []*txnlock return nil } -func (w *AdvancerLockResolver) SendReq( - bo *tikv.Backoffer, req *tikvrpc.Request, - regionID tikv.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) { - return w.TiKvStore.SendReq(bo, req, regionID, timeout) -} - func (w *AdvancerLockResolver) GetStore() tikv.Storage { return w.TiKvStore } diff --git a/br/pkg/streamhelper/basic_lib_for_test.go b/br/pkg/streamhelper/basic_lib_for_test.go index 0f8ffabf19e2b..787cf19d044b2 100644 --- a/br/pkg/streamhelper/basic_lib_for_test.go +++ b/br/pkg/streamhelper/basic_lib_for_test.go @@ -16,6 +16,7 @@ import ( "sync/atomic" "testing" + "github.com/pingcap/errors" backup "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/kvproto/pkg/errorpb" logbackup "github.com/pingcap/kvproto/pkg/logbackuppb" @@ -26,6 +27,8 @@ import ( "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/kv" "github.com/pingcap/tidb/util/codec" + "github.com/tikv/client-go/v2/tikv" + "github.com/tikv/client-go/v2/txnkv/txnlock" "go.uber.org/zap" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -576,6 +579,7 @@ func (f *fakeCluster) String() string { } type testEnv struct { + tikvStore tikv.Storage *fakeCluster checkpoint uint64 testCtx *testing.T @@ -635,3 +639,21 @@ func (t *testEnv) unregisterTask() { Name: "whole", } } + +func (t *testEnv) ResolveLocks( + bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) { + if len(locks) == 0 { + return true, nil + } + + _, err := t.tikvStore.GetLockResolver().ResolveLocks(bo, 0, locks) + return err == nil, errors.Trace(err) +} + +func (t *testEnv) ScanLocks(key []byte, regionID uint64) []*txnlock.Lock { + return nil +} + +func (t *testEnv) GetStore() tikv.Storage { + return t.tikvStore +} diff --git a/store/gcworker/gc_worker.go b/store/gcworker/gc_worker.go index 1aba1768cf85c..9ffcd040b76f2 100644 --- a/store/gcworker/gc_worker.go +++ b/store/gcworker/gc_worker.go @@ -67,10 +67,6 @@ type GCWorkerLockResolver struct { TiKvStore tikv.Storage } -func (w *GCWorkerLockResolver) LocateKey(bo *tikv.Backoffer, key []byte) (*tikv.KeyLocation, error) { - return w.TiKvStore.GetRegionCache().LocateKey(bo, key) -} - func (w *GCWorkerLockResolver) ResolveLocks(bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) { // Resolve locks without check txn status. it's safe because it use safepoint to scan locks. return w.TiKvStore.GetLockResolver().BatchResolveLocks(bo, locks, loc) @@ -80,10 +76,6 @@ func (w *GCWorkerLockResolver) ScanLocks(key []byte, regionID uint64) []*txnlock return nil } -func (w *GCWorkerLockResolver) SendReq(bo *tikv.Backoffer, req *tikvrpc.Request, regionID tikv.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) { - return w.TiKvStore.SendReq(bo, req, regionID, timeout) -} - func (w *GCWorkerLockResolver) GetStore() tikv.Storage { return w.TiKvStore } diff --git a/store/gcworker/gc_worker_test.go b/store/gcworker/gc_worker_test.go index 9125da4086068..f96bc8ab03cce 100644 --- a/store/gcworker/gc_worker_test.go +++ b/store/gcworker/gc_worker_test.go @@ -59,14 +59,6 @@ type mockGCWorkerLockResolver struct { resolveLocks func(locks []*txnlock.Lock, lowResolutionTS uint64) (int64, error) } -func (l *mockGCWorkerLockResolver) LocateKey(bo *tikv.Backoffer, key []byte) (*tikv.KeyLocation, error) { - return l.tikvStore.GetRegionCache().LocateKey(bo, key) -} - -func (l *mockGCWorkerLockResolver) SendReq(bo *tikv.Backoffer, req *tikvrpc.Request, regionID tikv.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) { - return l.tikvStore.SendReq(bo, req, regionID, timeout) -} - func (l *mockGCWorkerLockResolver) ScanLocks(key []byte, regionID uint64) []*txnlock.Lock { return l.scanLocks(key, regionID) } @@ -76,7 +68,7 @@ func (l *mockGCWorkerLockResolver) ResolveLocks(bo *tikv.Backoffer, locks []*txn } func (l *mockGCWorkerLockResolver) GetStore() tikv.Storage { - // no use for test + // no use in test return l.tikvStore } diff --git a/util/gcutil/gcutil.go b/util/gcutil/gcutil.go index e6159dfa59a8d..b75ab05b6ac6e 100644 --- a/util/gcutil/gcutil.go +++ b/util/gcutil/gcutil.go @@ -17,7 +17,6 @@ package gcutil import ( "bytes" "context" - "time" "github.com/pingcap/errors" "github.com/pingcap/failpoint" @@ -49,17 +48,17 @@ const ( // GCLockResolver is used for GCWorker and log backup advancer to resolve locks. // #Note: Put it here to avoid cycle import type GCLockResolver interface { - LocateKey(*tikv.Backoffer, []byte) (*tikv.KeyLocation, error) - + // ResolveLocks tries to resolve expired locks. + // 1. For GCWorker it will scan locks for all regions before *safepoint*, + // and force remove locks. rollback the txn, no matter the lock is expired of not. + // 2. For log backup advancer, it will scan all locks for a small range. + // and it will check status of the txn. resolve the locks if txn is expired, Or do nothing. ResolveLocks(*tikv.Backoffer, []*txnlock.Lock, tikv.RegionVerID) (bool, error) // ScanLocks only used for mock test. ScanLocks([]byte, uint64) []*txnlock.Lock - - SendReq(*tikv.Backoffer, *tikvrpc.Request, tikv.RegionVerID, time.Duration) (*tikvrpc.Response, error) - // We need to get tikvStore to build rangerunner. - // FIXME: the most code is in client.go and the store is only used to locate end keys of a region. + // TODO: the most code is in client.go and the store is only used to locate end keys of a region. // maybe we can move GCLockResolver into client.go. GetStore() tikv.Storage } @@ -165,12 +164,12 @@ retryScanAndResolve: } req.ScanLock().StartKey = key - loc, err := lockResolver.LocateKey(bo, key) + loc, err := lockResolver.GetStore().GetRegionCache().LocateKey(bo, key) if err != nil { return stat, errors.Trace(err) } req.ScanLock().EndKey = loc.EndKey - resp, err := lockResolver.SendReq(bo, req, loc.Region, tikv.ReadTimeoutMedium) + resp, err := lockResolver.GetStore().SendReq(bo, req, loc.Region, tikv.ReadTimeoutMedium) if err != nil { return stat, errors.Trace(err) } @@ -249,7 +248,7 @@ func tryRelocateLocksRegion(bo *tikv.Backoffer, lockResolver GCLockResolver, loc if len(locks) == 0 { return } - refreshedLoc, err = lockResolver.LocateKey(bo, locks[0].Key) + refreshedLoc, err = lockResolver.GetStore().GetRegionCache().LocateKey(bo, locks[0].Key) if err != nil { return } From 713b16fe74aceb665b5bdf54bd4b853c985c5379 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 9 Aug 2023 15:02:02 +0800 Subject: [PATCH 11/45] fix gc worker unit test --- br/pkg/streamhelper/advancer_test.go | 25 ++++++++++ br/pkg/streamhelper/basic_lib_for_test.go | 8 +-- store/gcworker/gc_worker_test.go | 61 +++-------------------- 3 files changed, 37 insertions(+), 57 deletions(-) diff --git a/br/pkg/streamhelper/advancer_test.go b/br/pkg/streamhelper/advancer_test.go index a01a13f96053e..b75ba6bec0d4e 100644 --- a/br/pkg/streamhelper/advancer_test.go +++ b/br/pkg/streamhelper/advancer_test.go @@ -16,6 +16,7 @@ import ( "github.com/pingcap/tidb/br/pkg/streamhelper/config" "github.com/pingcap/tidb/kv" "github.com/stretchr/testify/require" + "github.com/tikv/client-go/v2/txnkv/txnlock" "go.uber.org/zap/zapcore" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" @@ -278,3 +279,27 @@ func TestBlocked(t *testing.T) { }) req.ErrorIs(errors.Cause(err), context.DeadlineExceeded) } + +func TestResolveLock(t *testing.T) { + c := createFakeCluster(t, 4, false) + defer func() { + if t.Failed() { + fmt.Println(c) + } + }() + c.splitAndScatter("01", "02", "022", "023", "033", "04", "043") + ctx := context.Background() + minCheckpoint := c.advanceCheckpoints() + env := &testEnv{fakeCluster: c, testCtx: t} + env.scanLocks = func(key []byte, regionID uint64) []*txnlock.Lock { + return nil + } + adv := streamhelper.NewCheckpointAdvancer(env) + coll := streamhelper.NewClusterCollector(ctx, env) + err := adv.GetCheckpointInRange(ctx, []byte{}, []byte{}, coll) + require.NoError(t, err) + r, err := coll.Finish(ctx) + require.NoError(t, err) + require.Len(t, r.FailureSubRanges, 0) + require.Equal(t, r.Checkpoint, minCheckpoint, "%d %d", r.Checkpoint, minCheckpoint) +} diff --git a/br/pkg/streamhelper/basic_lib_for_test.go b/br/pkg/streamhelper/basic_lib_for_test.go index 787cf19d044b2..55babcf3c00e1 100644 --- a/br/pkg/streamhelper/basic_lib_for_test.go +++ b/br/pkg/streamhelper/basic_lib_for_test.go @@ -579,14 +579,14 @@ func (f *fakeCluster) String() string { } type testEnv struct { - tikvStore tikv.Storage *fakeCluster checkpoint uint64 testCtx *testing.T ranges []kv.KeyRange taskCh chan<- streamhelper.TaskEvent - mu sync.Mutex + scanLocks func(key []byte, regionID uint64) []*txnlock.Lock + mu sync.Mutex } func (t *testEnv) Begin(ctx context.Context, ch chan<- streamhelper.TaskEvent) error { @@ -651,9 +651,9 @@ func (t *testEnv) ResolveLocks( } func (t *testEnv) ScanLocks(key []byte, regionID uint64) []*txnlock.Lock { - return nil + return t.scanLocks(key, regionID) } func (t *testEnv) GetStore() tikv.Storage { - return t.tikvStore + return nil } diff --git a/store/gcworker/gc_worker_test.go b/store/gcworker/gc_worker_test.go index f96bc8ab03cce..084c235bcbd44 100644 --- a/store/gcworker/gc_worker_test.go +++ b/store/gcworker/gc_worker_test.go @@ -51,12 +51,9 @@ import ( ) type mockGCWorkerLockResolver struct { - tikvStore tikv.Storage - forceResolveLocksTS uint64 - tryResolveLocksTS uint64 - scanLocks func(key []byte, regionID uint64) []*txnlock.Lock - batchResolveLocks func(locks []*txnlock.Lock, regionID tikv.RegionVerID) (ok bool, err error) - resolveLocks func(locks []*txnlock.Lock, lowResolutionTS uint64) (int64, error) + tikvStore tikv.Storage + scanLocks func(key []byte, regionID uint64) []*txnlock.Lock + batchResolveLocks func(locks []*txnlock.Lock, regionID tikv.RegionVerID) (ok bool, err error) } func (l *mockGCWorkerLockResolver) ScanLocks(key []byte, regionID uint64) []*txnlock.Lock { @@ -1053,11 +1050,7 @@ func TestResolveLockRangeMeetRegionCacheMiss(t *testing.T) { resolveCnt int resolveCntRef = &resolveCnt - scanLockCnt int - resolveBeforeSafepointLockCnt int - resolveAfterSafepointLockCnt int - safepointTS uint64 = 434245550444904450 - lowResolveTS uint64 = 434245550449098752 + safepointTS uint64 = 434245550444904450 ) allLocks := []*txnlock.Lock{ @@ -1088,23 +1081,14 @@ func TestResolveLockRangeMeetRegionCacheMiss(t *testing.T) { } mockLockResolver := &mockGCWorkerLockResolver{ - tikvStore: s.tikvStore, - forceResolveLocksTS: safepointTS, - tryResolveLocksTS: lowResolveTS, + tikvStore: s.tikvStore, scanLocks: func(key []byte, regionID uint64) []*txnlock.Lock { *scanCntRef++ - - locks := make([]*txnlock.Lock, 0) - for _, l := range allLocks { - locks = append(locks, l) - scanLockCnt++ - } - return locks + return allLocks }, batchResolveLocks: func( locks []*txnlock.Lock, regionID tikv.RegionVerID, - safepoint uint64, ) (ok bool, err error) { *resolveCntRef++ if *resolveCntRef == 1 { @@ -1112,33 +1096,13 @@ func TestResolveLockRangeMeetRegionCacheMiss(t *testing.T) { // mock the region cache miss error return false, nil } - - resolveBeforeSafepointLockCnt = len(locks) - for _, l := range locks { - require.True(t, l.TxnID <= safepoint) - } return true, nil }, - resolveLocks: func( - locks []*txnlock.Lock, - lowResolutionTS uint64, - ) (int64, error) { - for _, l := range locks { - expiredTS := oracle.ComposeTS(oracle.ExtractPhysical(l.TxnID)+int64(l.TTL), oracle.ExtractLogical(l.TxnID)) - if expiredTS <= lowResolutionTS { - resolveAfterSafepointLockCnt++ - } - } - return 0, nil - }, } - _, err := ResolveLocksForRange(gcContext(), s.gcWorker.uuid, mockLockResolver, safepointTS, lowResolveTS, []byte{0}, []byte{10}) + _, err := gcutil.ResolveLocksForRange(gcContext(), s.gcWorker.uuid, mockLockResolver, safepointTS, []byte{0}, []byte{10}) require.NoError(t, err) require.Equal(t, 2, resolveCnt) require.Equal(t, 1, scanCnt) - require.Equal(t, 3, scanLockCnt) - require.Equal(t, 1, resolveBeforeSafepointLockCnt) - require.Equal(t, 1, resolveAfterSafepointLockCnt) } func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { @@ -1163,9 +1127,7 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { s.cluster.Split(s.initRegion.regionID, region2, []byte("m"), newPeers, newPeers[0]) mockGCLockResolver := &mockGCWorkerLockResolver{ - tikvStore: s.tikvStore, - forceResolveLocksTS: 1, - tryResolveLocksTS: 3, + tikvStore: s.tikvStore, scanLocks: func(key []byte, regionID uint64) []*txnlock.Lock { if regionID == s.initRegion.regionID { return []*txnlock.Lock{{Key: []byte("a")}, {Key: []byte("b")}} @@ -1175,17 +1137,10 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { } return []*txnlock.Lock{} }, - resolveLocks: func( - locks []*txnlock.Lock, - lowResolutionTS uint64, - ) (int64, error) { - return 0, nil - }, } mockGCLockResolver.batchResolveLocks = func( locks []*txnlock.Lock, regionID tikv.RegionVerID, - safepoint uint64, ) (ok bool, err error) { if regionID.GetID() == s.initRegion.regionID && *firstAccessRef { *firstAccessRef = false From 8d821a78e527a78fed21a1b96472a6d25b1a1552 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 9 Aug 2023 16:11:06 +0800 Subject: [PATCH 12/45] add ut for resolve lock --- br/pkg/streamhelper/advancer.go | 15 ++++- br/pkg/streamhelper/advancer_daemon.go | 12 +++- br/pkg/streamhelper/advancer_test.go | 47 ++++++++++++- br/pkg/streamhelper/basic_lib_for_test.go | 80 +++++++++++++++++++++-- 4 files changed, 144 insertions(+), 10 deletions(-) diff --git a/br/pkg/streamhelper/advancer.go b/br/pkg/streamhelper/advancer.go index 16c08162a34d0..5cb7b5f9e5595 100644 --- a/br/pkg/streamhelper/advancer.go +++ b/br/pkg/streamhelper/advancer.go @@ -4,11 +4,13 @@ package streamhelper import ( "context" + "fmt" "strings" "sync" "time" "github.com/pingcap/errors" + "github.com/pingcap/failpoint" backuppb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/log" "github.com/pingcap/tidb/br/pkg/logutil" @@ -88,7 +90,7 @@ func newCheckpointWithTS(ts uint64) checkpoint { } } -func newCheckpointWithSpan(s spans.Valued) checkpoint { +func NewCheckpointWithSpan(s spans.Valued) checkpoint { return checkpoint{ StartKey: s.Key.StartKey, EndKey: s.Key.EndKey, @@ -105,6 +107,9 @@ func (c checkpoint) safeTS() uint64 { // we should try to resolve lock for the range // to keep the RPO in a short value. func (c checkpoint) needResolveLocks() bool { + failpoint.Inject("NeedResolveLocks", func(val failpoint.Value) { + failpoint.Return(val.(bool)) + }) return time.Since(c.generateTime) > time.Minute } @@ -131,6 +136,11 @@ func (c *CheckpointAdvancer) UpdateConfigWith(f func(*config.Config)) { c.UpdateConfig(cfg) } +// only used for test +func (c *CheckpointAdvancer) UpdateLastCheckpoint(p checkpoint) { + c.lastCheckpoint = p +} + // Config returns the current config. func (c *CheckpointAdvancer) Config() config.Config { return c.cfg @@ -363,7 +373,8 @@ func (c *CheckpointAdvancer) onTaskEvent(ctx context.Context, e TaskEvent) error } func (c *CheckpointAdvancer) setCheckpoint(ctx context.Context, s spans.Valued) bool { - cp := newCheckpointWithSpan(s) + fmt.Println("enter setCheckpoint") + cp := NewCheckpointWithSpan(s) if cp.TS < c.lastCheckpoint.TS { log.Warn("failed to update global checkpoint: stale", zap.Uint64("old", c.lastCheckpoint.TS), zap.Uint64("new", cp.TS)) diff --git a/br/pkg/streamhelper/advancer_daemon.go b/br/pkg/streamhelper/advancer_daemon.go index f69938407859b..1d66608dc6284 100644 --- a/br/pkg/streamhelper/advancer_daemon.go +++ b/br/pkg/streamhelper/advancer_daemon.go @@ -8,6 +8,7 @@ import ( "time" "github.com/google/uuid" + "github.com/pingcap/failpoint" "github.com/pingcap/log" "github.com/pingcap/tidb/br/pkg/streamhelper/config" "github.com/pingcap/tidb/br/pkg/utils" @@ -26,6 +27,14 @@ const ( ownerPath = "/tidb/br-stream/owner" ) +func resolveLockTickTime() time.Duration { + failpoint.Inject("ResolveLockTickTime", func(val failpoint.Value) { + t := time.Duration(val.(int)) + failpoint.Return(t * time.Second) + }) + return 30 * time.Second +} + // OnTick advances the inner logic clock for the advancer. // It's synchronous: this would only return after the events triggered by the clock has all been done. // It's generally panic-free, you may not need to trying recover a panic here. @@ -45,7 +54,8 @@ func (c *CheckpointAdvancer) OnBecomeOwner(ctx context.Context) { metrics.AdvancerOwner.Set(1.0) c.spawnSubscriptionHandler(ctx) go func() { - tick := time.NewTicker(30 * time.Second) + t := resolveLockTickTime() + tick := time.NewTicker(t) defer tick.Stop() for { select { diff --git a/br/pkg/streamhelper/advancer_test.go b/br/pkg/streamhelper/advancer_test.go index b75ba6bec0d4e..4b64d0b1d1ac1 100644 --- a/br/pkg/streamhelper/advancer_test.go +++ b/br/pkg/streamhelper/advancer_test.go @@ -10,12 +10,15 @@ import ( "time" "github.com/pingcap/errors" + "github.com/pingcap/failpoint" logbackup "github.com/pingcap/kvproto/pkg/logbackuppb" "github.com/pingcap/log" "github.com/pingcap/tidb/br/pkg/streamhelper" "github.com/pingcap/tidb/br/pkg/streamhelper/config" + "github.com/pingcap/tidb/br/pkg/streamhelper/spans" "github.com/pingcap/tidb/kv" "github.com/stretchr/testify/require" + "github.com/tikv/client-go/v2/tikv" "github.com/tikv/client-go/v2/txnkv/txnlock" "go.uber.org/zap/zapcore" "google.golang.org/grpc/codes" @@ -287,19 +290,61 @@ func TestResolveLock(t *testing.T) { fmt.Println(c) } }() + require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/br/pkg/streamhelper/NeedResolveLocks", `return(true)`)) + require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/br/pkg/streamhelper/ResolveLockTickTime", `return(1)`)) + defer func() { + require.NoError(t, failpoint.Disable("github.com/pingcap/tidb/br/pkg/streamhelper/NeedResolveLocks")) + require.NoError(t, failpoint.Disable("github.com/pingcap/tidb/br/pkg/streamhelper/ResolveLockTickTime")) + }() + c.splitAndScatter("01", "02", "022", "023", "033", "04", "043") ctx := context.Background() minCheckpoint := c.advanceCheckpoints() env := &testEnv{fakeCluster: c, testCtx: t} + allLocks := []*txnlock.Lock{ + { + Key: []byte{1}, + // TxnID == minCheckpoint + TxnID: minCheckpoint, + }, + { + Key: []byte{2}, + // TxnID > minCheckpoint + TxnID: minCheckpoint + 1, + }, + } env.scanLocks = func(key []byte, regionID uint64) []*txnlock.Lock { - return nil + return allLocks + + } + // ensure resolve locks triggered and collect all locks from scan locks + resolveLockCnt := 0 + resolveLockCntRef := &resolveLockCnt + env.resolveLocks = func(locks []*txnlock.Lock, regionID tikv.RegionVerID) (ok bool, err error) { + *resolveLockCntRef += 1 + require.ElementsMatch(t, locks, allLocks) + return true, nil } adv := streamhelper.NewCheckpointAdvancer(env) + // make lastCheckpoint stuck at 123 + adv.UpdateLastCheckpoint(streamhelper.NewCheckpointWithSpan(spans.Valued{ + Key: kv.KeyRange{ + StartKey: kv.Key([]byte("1")), + EndKey: kv.Key([]byte("2")), + }, + Value: 123, + })) + adv.OnBecomeOwner(ctx) coll := streamhelper.NewClusterCollector(ctx, env) err := adv.GetCheckpointInRange(ctx, []byte{}, []byte{}, coll) + + require.Eventually(t, func() bool { return *resolveLockCntRef > 0 }, + 8*time.Second, 50*time.Microsecond) + require.NoError(t, err) r, err := coll.Finish(ctx) require.NoError(t, err) require.Len(t, r.FailureSubRanges, 0) require.Equal(t, r.Checkpoint, minCheckpoint, "%d %d", r.Checkpoint, minCheckpoint) + } diff --git a/br/pkg/streamhelper/basic_lib_for_test.go b/br/pkg/streamhelper/basic_lib_for_test.go index 55babcf3c00e1..ef7eab388f9a0 100644 --- a/br/pkg/streamhelper/basic_lib_for_test.go +++ b/br/pkg/streamhelper/basic_lib_for_test.go @@ -15,10 +15,11 @@ import ( "sync" "sync/atomic" "testing" + "time" - "github.com/pingcap/errors" backup "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/kvproto/pkg/errorpb" + "github.com/pingcap/kvproto/pkg/kvrpcpb" logbackup "github.com/pingcap/kvproto/pkg/logbackuppb" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/log" @@ -28,7 +29,9 @@ import ( "github.com/pingcap/tidb/kv" "github.com/pingcap/tidb/util/codec" "github.com/tikv/client-go/v2/tikv" + "github.com/tikv/client-go/v2/tikvrpc" "github.com/tikv/client-go/v2/txnkv/txnlock" + pd "github.com/tikv/pd/client" "go.uber.org/zap" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -585,8 +588,10 @@ type testEnv struct { ranges []kv.KeyRange taskCh chan<- streamhelper.TaskEvent - scanLocks func(key []byte, regionID uint64) []*txnlock.Lock - mu sync.Mutex + scanLocks func(key []byte, regionID uint64) []*txnlock.Lock + resolveLocks func(locks []*txnlock.Lock, regionID tikv.RegionVerID) (ok bool, err error) + + mu sync.Mutex } func (t *testEnv) Begin(ctx context.Context, ch chan<- streamhelper.TaskEvent) error { @@ -646,8 +651,7 @@ func (t *testEnv) ResolveLocks( return true, nil } - _, err := t.tikvStore.GetLockResolver().ResolveLocks(bo, 0, locks) - return err == nil, errors.Trace(err) + return t.resolveLocks(locks, loc) } func (t *testEnv) ScanLocks(key []byte, regionID uint64) []*txnlock.Lock { @@ -655,5 +659,69 @@ func (t *testEnv) ScanLocks(key []byte, regionID uint64) []*txnlock.Lock { } func (t *testEnv) GetStore() tikv.Storage { - return nil + // only used for GetRegionCache once in resolve lock + return &mockTiKVStore{regionCache: tikv.NewRegionCache(&mockPDClient{})} +} + +type mockKVStore struct { + kv.Storage +} + +type mockTiKVStore struct { + mockKVStore + tikv.Storage + regionCache *tikv.RegionCache +} + +func (s *mockTiKVStore) GetRegionCache() *tikv.RegionCache { + return s.regionCache +} + +func (s *mockTiKVStore) SendReq(bo *tikv.Backoffer, req *tikvrpc.Request, regionID tikv.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) { + scanResp := kvrpcpb.ScanLockResponse{ + // we don't need mock locks here, because we already have mock locks in testEnv.Scanlocks. + // this behaviour is align with gc_worker_test + Locks: nil, + RegionError: nil, + } + return &tikvrpc.Response{Resp: &scanResp}, nil +} + +type mockPDClient struct { + pd.Client +} + +func (p *mockPDClient) ScanRegions(ctx context.Context, key, endKey []byte, limit int) ([]*pd.Region, error) { + return []*pd.Region{ + newMockRegion(1, []byte("1"), []byte("3")), + }, nil +} + +func (c *mockPDClient) GetStore(_ context.Context, storeID uint64) (*metapb.Store, error) { + return &metapb.Store{ + Id: storeID, + Address: fmt.Sprintf("127.0.0.%d", storeID), + }, nil +} + +func (c *mockPDClient) GetRegion(ctx context.Context, key []byte, opts ...pd.GetRegionOption) (*pd.Region, error) { + return newMockRegion(1, []byte("1"), []byte("3")), nil +} + +func newMockRegion(regionID uint64, startKey []byte, endKey []byte) *pd.Region { + leader := &metapb.Peer{ + Id: regionID, + StoreId: 1, + Role: metapb.PeerRole_Voter, + } + + return &pd.Region{ + Meta: &metapb.Region{ + Id: regionID, + StartKey: startKey, + EndKey: endKey, + Peers: []*metapb.Peer{leader}, + }, + Leader: leader, + } } From e76a325fb7cab76f55266657986ad95a46068db2 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Thu, 10 Aug 2023 14:36:39 +0800 Subject: [PATCH 13/45] fix --- br/pkg/streamhelper/advancer.go | 12 +++++++++--- br/pkg/streamhelper/advancer_daemon.go | 13 +++++++------ store/gcworker/gc_worker.go | 2 +- util/gcutil/gcutil.go | 3 ++- 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/br/pkg/streamhelper/advancer.go b/br/pkg/streamhelper/advancer.go index 5cb7b5f9e5595..f686fedf409e6 100644 --- a/br/pkg/streamhelper/advancer.go +++ b/br/pkg/streamhelper/advancer.go @@ -3,8 +3,8 @@ package streamhelper import ( + "bytes" "context" - "fmt" "strings" "sync" "time" @@ -103,6 +103,11 @@ func (c checkpoint) safeTS() uint64 { return c.TS - 1 } +func (c checkpoint) equal(o checkpoint) bool { + return bytes.Equal(c.StartKey, o.StartKey) && + bytes.Equal(c.EndKey, o.EndKey) && c.TS == o.TS +} + // if a checkpoint stay in a time too long(1 min) // we should try to resolve lock for the range // to keep the RPO in a short value. @@ -373,14 +378,15 @@ func (c *CheckpointAdvancer) onTaskEvent(ctx context.Context, e TaskEvent) error } func (c *CheckpointAdvancer) setCheckpoint(ctx context.Context, s spans.Valued) bool { - fmt.Println("enter setCheckpoint") cp := NewCheckpointWithSpan(s) if cp.TS < c.lastCheckpoint.TS { log.Warn("failed to update global checkpoint: stale", zap.Uint64("old", c.lastCheckpoint.TS), zap.Uint64("new", cp.TS)) return false } - if cp.TS <= c.lastCheckpoint.TS { + // Need resolve lock for different range and same TS + // so check the range and TS here. + if cp.equal(c.lastCheckpoint) { return false } c.lastCheckpoint = cp diff --git a/br/pkg/streamhelper/advancer_daemon.go b/br/pkg/streamhelper/advancer_daemon.go index 1d66608dc6284..db1a4f2778579 100644 --- a/br/pkg/streamhelper/advancer_daemon.go +++ b/br/pkg/streamhelper/advancer_daemon.go @@ -10,11 +10,11 @@ import ( "github.com/google/uuid" "github.com/pingcap/failpoint" "github.com/pingcap/log" + "github.com/pingcap/tidb/br/pkg/logutil" "github.com/pingcap/tidb/br/pkg/streamhelper/config" "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/metrics" "github.com/pingcap/tidb/owner" - "github.com/pingcap/tidb/util/codec" "github.com/pingcap/tidb/util/gcutil" tikvstore "github.com/tikv/client-go/v2/kv" "github.com/tikv/client-go/v2/txnkv/rangetask" @@ -69,15 +69,14 @@ func (c *CheckpointAdvancer) OnBecomeOwner(ctx context.Context) { if c.lastCheckpoint.needResolveLocks() { handler := func(ctx context.Context, r tikvstore.KeyRange) (rangetask.TaskStat, error) { // we will scan all locks and try to resolve them by check txn status. - return gcutil.ResolveLocksForRange(ctx, "log backup advancer", c.env, math.MaxUint64, r.StartKey, r.EndKey) + return gcutil.ResolveLocksForRange(ctx, "log backup", "advancer", c.env, math.MaxUint64, r.StartKey, r.EndKey) } runner := rangetask.NewRangeTaskRunner("advancer-resolve-locks-runner", c.env.GetStore(), config.DefaultMaxConcurrencyAdvance, handler) // Run resolve lock on the whole TiKV cluster. - // it will use startKey/endKey to scan region in PD. so we need encode key here. - encodedStartKey := codec.EncodeBytes([]byte{}, c.lastCheckpoint.StartKey) - encodedEndKey := codec.EncodeBytes([]byte{}, c.lastCheckpoint.EndKey) - err := runner.RunOnRange(ctx, encodedStartKey, encodedEndKey) + // it will use startKey/endKey to scan region in PD. + // but regionCache already has a codecPDClient. so just use decode key here. + err := runner.RunOnRange(ctx, c.lastCheckpoint.StartKey, c.lastCheckpoint.EndKey) if err != nil { // wait for next tick log.Error("resolve locks failed", zap.String("category", "advancer"), @@ -86,6 +85,8 @@ func (c *CheckpointAdvancer) OnBecomeOwner(ctx context.Context) { } log.Info("finish resolve locks", zap.String("category", "advancer"), zap.String("uuid", "log backup advancer"), + logutil.Key("StartKey", c.lastCheckpoint.StartKey), + logutil.Key("EndKey", c.lastCheckpoint.EndKey), zap.Int("regions", runner.CompletedRegions())) } } diff --git a/store/gcworker/gc_worker.go b/store/gcworker/gc_worker.go index 9ffcd040b76f2..4f46be9d83609 100644 --- a/store/gcworker/gc_worker.go +++ b/store/gcworker/gc_worker.go @@ -1198,7 +1198,7 @@ func (w *GCWorker) legacyResolveLocks( startTime := time.Now() handler := func(ctx context.Context, r tikvstore.KeyRange) (rangetask.TaskStat, error) { - return gcutil.ResolveLocksForRange(ctx, w.uuid, w.lockResolver, safePoint, r.StartKey, r.EndKey) + return gcutil.ResolveLocksForRange(ctx, w.uuid, "gc worker", w.lockResolver, safePoint, r.StartKey, r.EndKey) } runner := rangetask.NewRangeTaskRunner("resolve-locks-runner", w.tikvStore, concurrency, handler) diff --git a/util/gcutil/gcutil.go b/util/gcutil/gcutil.go index b75ab05b6ac6e..99fe796219c11 100644 --- a/util/gcutil/gcutil.go +++ b/util/gcutil/gcutil.go @@ -126,6 +126,7 @@ func GetGCSafePoint(sctx sessionctx.Context) (uint64, error) { func ResolveLocksForRange( ctx context.Context, uuid string, + category string, lockResolver GCLockResolver, maxVersion uint64, startKey []byte, @@ -224,7 +225,7 @@ retryScanAndResolve: stat.CompletedRegions++ key = loc.EndKey } else { - logutil.Logger(ctx).Info("region has more than limit locks", zap.String("category", "gc worker"), + logutil.Logger(ctx).Info("region has more than limit locks", zap.String("category", category), zap.String("uuid", uuid), zap.Uint64("region", locForResolve.Region.GetID()), zap.Int("scan lock limit", GCScanLockLimit)) From 2adf397e742837ae214fe315f7030ee03fb18db5 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Fri, 11 Aug 2023 12:21:51 +0800 Subject: [PATCH 14/45] precise resolve lock ranges --- br/pkg/streamhelper/advancer.go | 20 ++++++----- br/pkg/streamhelper/advancer_daemon.go | 46 ++++++++++++++++++-------- util/gcutil/gcutil.go | 1 + 3 files changed, 45 insertions(+), 22 deletions(-) diff --git a/br/pkg/streamhelper/advancer.go b/br/pkg/streamhelper/advancer.go index f686fedf409e6..adc0652e9d671 100644 --- a/br/pkg/streamhelper/advancer.go +++ b/br/pkg/streamhelper/advancer.go @@ -80,22 +80,22 @@ type checkpoint struct { // It's better to use PD timestamp in future, for now // use local time to decide the time to resolve lock is ok. - generateTime time.Time + resolveLockTime time.Time } func newCheckpointWithTS(ts uint64) checkpoint { return checkpoint{ - TS: ts, - generateTime: time.Now(), + TS: ts, + resolveLockTime: time.Now(), } } func NewCheckpointWithSpan(s spans.Valued) checkpoint { return checkpoint{ - StartKey: s.Key.StartKey, - EndKey: s.Key.EndKey, - TS: s.Value, - generateTime: time.Now(), + StartKey: s.Key.StartKey, + EndKey: s.Key.EndKey, + TS: s.Value, + resolveLockTime: time.Now(), } } @@ -115,7 +115,7 @@ func (c checkpoint) needResolveLocks() bool { failpoint.Inject("NeedResolveLocks", func(val failpoint.Value) { failpoint.Return(val.(bool)) }) - return time.Since(c.generateTime) > time.Minute + return time.Since(c.resolveLockTime) > time.Minute } // NewCheckpointAdvancer creates a checkpoint advancer with the env. @@ -226,6 +226,10 @@ func tsoBefore(n time.Duration) uint64 { return oracle.ComposeTS(now.UnixMilli()-n.Milliseconds(), 0) } +func tsoAfter(ts uint64, n time.Duration) uint64 { + return oracle.GoTimeToTS(oracle.GetTimeFromTS(ts).Add(n)) +} + func (c *CheckpointAdvancer) WithCheckpoints(f func(*spans.ValueSortedFull)) { c.checkpointsMu.Lock() defer c.checkpointsMu.Unlock() diff --git a/br/pkg/streamhelper/advancer_daemon.go b/br/pkg/streamhelper/advancer_daemon.go index db1a4f2778579..8bf05e40399b1 100644 --- a/br/pkg/streamhelper/advancer_daemon.go +++ b/br/pkg/streamhelper/advancer_daemon.go @@ -12,6 +12,7 @@ import ( "github.com/pingcap/log" "github.com/pingcap/tidb/br/pkg/logutil" "github.com/pingcap/tidb/br/pkg/streamhelper/config" + "github.com/pingcap/tidb/br/pkg/streamhelper/spans" "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/metrics" "github.com/pingcap/tidb/owner" @@ -32,7 +33,7 @@ func resolveLockTickTime() time.Duration { t := time.Duration(val.(int)) failpoint.Return(t * time.Second) }) - return 30 * time.Second + return 10 * time.Second } // OnTick advances the inner logic clock for the advancer. @@ -67,27 +68,44 @@ func (c *CheckpointAdvancer) OnBecomeOwner(ctx context.Context) { // lastCheckpoint is not increased too long enough. // assume the cluster has expired locks for whatever reasons. if c.lastCheckpoint.needResolveLocks() { + var targets []spans.Valued + c.WithCheckpoints(func(vsf *spans.ValueSortedFull) { + // when get locks here. assume these locks are not belong to same txn, + // but startTS close to 1 minute. try resolve these locks at one time + vsf.TraverseValuesLessThan(tsoAfter(c.lastCheckpoint.TS, time.Minute), func(v spans.Valued) bool { + targets = append(targets, v) + return true + }) + }) handler := func(ctx context.Context, r tikvstore.KeyRange) (rangetask.TaskStat, error) { // we will scan all locks and try to resolve them by check txn status. return gcutil.ResolveLocksForRange(ctx, "log backup", "advancer", c.env, math.MaxUint64, r.StartKey, r.EndKey) } - runner := rangetask.NewRangeTaskRunner("advancer-resolve-locks-runner", - c.env.GetStore(), config.DefaultMaxConcurrencyAdvance, handler) - // Run resolve lock on the whole TiKV cluster. - // it will use startKey/endKey to scan region in PD. - // but regionCache already has a codecPDClient. so just use decode key here. - err := runner.RunOnRange(ctx, c.lastCheckpoint.StartKey, c.lastCheckpoint.EndKey) - if err != nil { - // wait for next tick - log.Error("resolve locks failed", zap.String("category", "advancer"), - zap.String("uuid", "log backup advancer"), - zap.Error(err)) + workerPool := utils.NewWorkerPool(uint(config.DefaultMaxConcurrencyAdvance), "advancer resolve locks") + for _, r := range targets { + targetRange := r + workerPool.Apply(func() { + // Run resolve lock on the whole TiKV cluster. + // it will use startKey/endKey to scan region in PD. + // but regionCache already has a codecPDClient. so just use decode key here. + // and it almost only include one region here. so set concurrency to 1. + runner := rangetask.NewRangeTaskRunner("advancer-resolve-locks-runner", + c.env.GetStore(), 1, handler) + err := runner.RunOnRange(ctx, targetRange.Key.StartKey, targetRange.Key.EndKey) + if err != nil { + // wait for next tick + log.Error("resolve locks failed", zap.String("category", "advancer"), + zap.String("uuid", "log backup advancer"), + zap.Error(err)) + } + }) } - log.Info("finish resolve locks", zap.String("category", "advancer"), + log.Info("finish resolve locks for checkpoint", zap.String("category", "advancer"), zap.String("uuid", "log backup advancer"), logutil.Key("StartKey", c.lastCheckpoint.StartKey), logutil.Key("EndKey", c.lastCheckpoint.EndKey), - zap.Int("regions", runner.CompletedRegions())) + zap.Int("targets", len(targets))) + c.lastCheckpoint.resolveLockTime = time.Now() } } } diff --git a/util/gcutil/gcutil.go b/util/gcutil/gcutil.go index 99fe796219c11..3b481ef8d4ade 100644 --- a/util/gcutil/gcutil.go +++ b/util/gcutil/gcutil.go @@ -171,6 +171,7 @@ retryScanAndResolve: } req.ScanLock().EndKey = loc.EndKey resp, err := lockResolver.GetStore().SendReq(bo, req, loc.Region, tikv.ReadTimeoutMedium) + logutil.Logger(ctx).Info("lockResolver resp", zap.Any("resp", resp)) if err != nil { return stat, errors.Trace(err) } From bf8b1d517acca81fea8a0871d24ab71072161b60 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Mon, 14 Aug 2023 11:55:53 +0800 Subject: [PATCH 15/45] update log --- br/pkg/streamhelper/advancer_daemon.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/br/pkg/streamhelper/advancer_daemon.go b/br/pkg/streamhelper/advancer_daemon.go index 8bf05e40399b1..3d3746295d452 100644 --- a/br/pkg/streamhelper/advancer_daemon.go +++ b/br/pkg/streamhelper/advancer_daemon.go @@ -33,7 +33,7 @@ func resolveLockTickTime() time.Duration { t := time.Duration(val.(int)) failpoint.Return(t * time.Second) }) - return 10 * time.Second + return 5 * time.Second } // OnTick advances the inner logic clock for the advancer. @@ -105,7 +105,7 @@ func (c *CheckpointAdvancer) OnBecomeOwner(ctx context.Context) { logutil.Key("StartKey", c.lastCheckpoint.StartKey), logutil.Key("EndKey", c.lastCheckpoint.EndKey), zap.Int("targets", len(targets))) - c.lastCheckpoint.resolveLockTime = time.Now() + //c.lastCheckpoint.resolveLockTime = time.Now() } } } From 1fe03aab1ff35ec02b93ea59c7f9e5bbbe246056 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 16 Aug 2023 10:37:30 +0800 Subject: [PATCH 16/45] udpate go mod for client-go --- br/pkg/streamhelper/advancer_daemon.go | 4 +-- br/pkg/streamhelper/advancer_env.go | 45 +++++++++++++++----------- go.mod | 1 + go.sum | 4 +-- store/gcworker/gc_worker.go | 6 ++-- 5 files changed, 34 insertions(+), 26 deletions(-) diff --git a/br/pkg/streamhelper/advancer_daemon.go b/br/pkg/streamhelper/advancer_daemon.go index 3d3746295d452..c30dcba4d5ab7 100644 --- a/br/pkg/streamhelper/advancer_daemon.go +++ b/br/pkg/streamhelper/advancer_daemon.go @@ -16,8 +16,8 @@ import ( "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/metrics" "github.com/pingcap/tidb/owner" - "github.com/pingcap/tidb/util/gcutil" tikvstore "github.com/tikv/client-go/v2/kv" + "github.com/tikv/client-go/v2/tikv" "github.com/tikv/client-go/v2/txnkv/rangetask" clientv3 "go.etcd.io/etcd/client/v3" "go.uber.org/zap" @@ -79,7 +79,7 @@ func (c *CheckpointAdvancer) OnBecomeOwner(ctx context.Context) { }) handler := func(ctx context.Context, r tikvstore.KeyRange) (rangetask.TaskStat, error) { // we will scan all locks and try to resolve them by check txn status. - return gcutil.ResolveLocksForRange(ctx, "log backup", "advancer", c.env, math.MaxUint64, r.StartKey, r.EndKey) + return tikv.ResolveLocksForRange(ctx, "log backup advancer", c.env, math.MaxUint64, r.StartKey, r.EndKey) } workerPool := utils.NewWorkerPool(uint(config.DefaultMaxConcurrencyAdvance), "advancer resolve locks") for _, r := range targets { diff --git a/br/pkg/streamhelper/advancer_env.go b/br/pkg/streamhelper/advancer_env.go index fd98cd26fc2bd..d2fde5fe6a23e 100644 --- a/br/pkg/streamhelper/advancer_env.go +++ b/br/pkg/streamhelper/advancer_env.go @@ -6,12 +6,10 @@ import ( "context" "time" - "github.com/pingcap/errors" logbackup "github.com/pingcap/kvproto/pkg/logbackuppb" "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/config" "github.com/pingcap/tidb/util/engine" - "github.com/pingcap/tidb/util/gcutil" "github.com/tikv/client-go/v2/tikv" "github.com/tikv/client-go/v2/txnkv/txnlock" pd "github.com/tikv/pd/client" @@ -34,7 +32,7 @@ type Env interface { // StreamMeta connects to the metadata service (normally PD). StreamMeta // GCLockResolver try to resolve locks when region checkpoint stopped. - gcutil.GCLockResolver + tikv.GCLockResolver } // PDRegionScanner is a simple wrapper over PD @@ -92,6 +90,8 @@ type clusterEnv struct { *AdvancerLockResolver } +var _ Env = &clusterEnv{} + // GetLogBackupClient gets the log backup client. func (t clusterEnv) GetLogBackupClient(ctx context.Context, storeID uint64) (logbackup.LogBackupClient, error) { var cli logbackup.LogBackupClient @@ -110,7 +110,7 @@ func CliEnv(cli *utils.StoreManager, tikvStore tikv.Storage, etcdCli *clientv3.C clis: cli, AdvancerExt: &AdvancerExt{MetaDataClient: *NewMetaDataClient(etcdCli)}, PDRegionScanner: PDRegionScanner{cli.PDClient()}, - AdvancerLockResolver: &AdvancerLockResolver{TiKvStore: tikvStore}, + AdvancerLockResolver: newAdvancerLockResolver(tikvStore), } } @@ -127,7 +127,7 @@ func TiDBEnv(tikvStore tikv.Storage, pdCli pd.Client, etcdCli *clientv3.Client, }, tconf), AdvancerExt: &AdvancerExt{MetaDataClient: *NewMetaDataClient(etcdCli)}, PDRegionScanner: PDRegionScanner{Client: pdCli}, - AdvancerLockResolver: &AdvancerLockResolver{TiKvStore: tikvStore}, + AdvancerLockResolver: newAdvancerLockResolver(tikvStore), }, nil } @@ -147,26 +147,33 @@ type StreamMeta interface { ClearV3GlobalCheckpointForTask(ctx context.Context, taskName string) error } +var _ tikv.GCLockResolver = &AdvancerLockResolver{} + type AdvancerLockResolver struct { - TiKvStore tikv.Storage + store tikv.Storage + *tikv.BaseLockResolver } -// ResolveLocks tries to resolve expired locks with this method. -// It will check status of the txn. Resolve the lock if txn is expired, Or do nothing. -func (w *AdvancerLockResolver) ResolveLocks( - bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) { - if len(locks) == 0 { - return true, nil +func newAdvancerLockResolver(store tikv.Storage) *AdvancerLockResolver { + return &AdvancerLockResolver{ + store: store, + BaseLockResolver: tikv.NewBaseLockResolver(store), } - - _, err := w.TiKvStore.GetLockResolver().ResolveLocks(bo, 0, locks) - return err == nil, errors.Trace(err) } -func (w *AdvancerLockResolver) ScanLocks(key []byte, regionID uint64) []*txnlock.Lock { - return nil +// ResolveLocks tries to resolve expired locks with this method. +// It will check status of the txn. Resolve the lock if txn is expired, Or do nothing. +func (l *AdvancerLockResolver) ResolveLocks(ctx context.Context, locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { + // renew backoffer + bo := tikv.NewGcResolveLockMaxBackoffer(ctx) + _, err := l.GetStore().GetLockResolver().ResolveLocks(bo, 0, locks) + if err != nil { + return nil, err + } + return loc, nil } -func (w *AdvancerLockResolver) GetStore() tikv.Storage { - return w.TiKvStore +// If we don't implement GetStore here, it won't complie. +func (l *AdvancerLockResolver) GetStore() tikv.Storage { + return l.store } diff --git a/go.mod b/go.mod index 9de839945ba11..6f670e9c68fd5 100644 --- a/go.mod +++ b/go.mod @@ -304,4 +304,5 @@ replace ( github.com/dgrijalva/jwt-go => github.com/form3tech-oss/jwt-go v3.2.6-0.20210809144907-32ab6a8243d7+incompatible github.com/go-ldap/ldap/v3 => github.com/YangKeao/ldap/v3 v3.4.5-0.20230421065457-369a3bab1117 github.com/pingcap/tidb/parser => ./parser + github.com/tikv/client-go/v2 => github.com/3pointer/client-go/v2 v2.0.1-0.20230816091421-4125f3e49bda ) diff --git a/go.sum b/go.sum index 4b2507954ed67..24fe3b033503f 100644 --- a/go.sum +++ b/go.sum @@ -47,6 +47,8 @@ cloud.google.com/go/storage v1.30.1 h1:uOdMxAs8HExqBlnLtnQyP0YkvbiDpdGShGKtx6U/o cloud.google.com/go/storage v1.30.1/go.mod h1:NfxhC0UJE1aXSx7CIIbCf7y9HKT7BiccwkR7+P7gN8E= contrib.go.opencensus.io/exporter/ocagent v0.4.12/go.mod h1:450APlNTSR6FrvC3CTRqYosuDstRB9un7SOx2k/9ckA= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= +github.com/3pointer/client-go/v2 v2.0.1-0.20230816091421-4125f3e49bda h1:EY4NdDiFOPNHe5NYbDTTreLjm9G2swaWvbPjtPWJzto= +github.com/3pointer/client-go/v2 v2.0.1-0.20230816091421-4125f3e49bda/go.mod h1:J17iHkj8buCLDF7lgKJLX5jq5aGozrbpa7+Ln6g8Xjc= github.com/AndreasBriese/bbloom v0.0.0-20190306092124-e2d15f34fcf9/go.mod h1:bOvUY6CB00SOBii9/FifXqc0awNKxLFCL/+pkDPuyl8= github.com/Azure/azure-sdk-for-go v23.2.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.6.0 h1:8kDqDngH+DmVBiCtIjCFTGa7MBnsIOkF9IccInFEbjk= @@ -991,8 +993,6 @@ github.com/tiancaiamao/appdash v0.0.0-20181126055449-889f96f722a2 h1:mbAskLJ0oJf github.com/tiancaiamao/appdash v0.0.0-20181126055449-889f96f722a2/go.mod h1:2PfKggNGDuadAa0LElHrByyrz4JPZ9fFx6Gs7nx7ZZU= github.com/tiancaiamao/gp v0.0.0-20221230034425-4025bc8a4d4a h1:J/YdBZ46WKpXsxsW93SG+q0F8KI+yFrcIDT4c/RNoc4= github.com/tiancaiamao/gp v0.0.0-20221230034425-4025bc8a4d4a/go.mod h1:h4xBhSNtOeEosLJ4P7JyKXX7Cabg7AVkWCK5gV2vOrM= -github.com/tikv/client-go/v2 v2.0.8-0.20230731032349-719e6456f7d5 h1:eHCUYIb9bH3TePiwnLTf40kFHSmiIO3oRg4gfzeowvQ= -github.com/tikv/client-go/v2 v2.0.8-0.20230731032349-719e6456f7d5/go.mod h1:J17iHkj8buCLDF7lgKJLX5jq5aGozrbpa7+Ln6g8Xjc= github.com/tikv/pd/client v0.0.0-20230728033905-31343e006842 h1:TwjBJvRx/DJbgMt7Vk5cFO7tG1DZnxR+22S2VmaNGRw= github.com/tikv/pd/client v0.0.0-20230728033905-31343e006842/go.mod h1:VJwM+qMcQxvGgyu9C6wU7fhjLaALs+odsOvpUMmnhHo= github.com/timakin/bodyclose v0.0.0-20230421092635-574207250966 h1:quvGphlmUVU+nhpFa4gg4yJyTRJ13reZMDHrKwYw53M= diff --git a/store/gcworker/gc_worker.go b/store/gcworker/gc_worker.go index 4f46be9d83609..ee17a89980a13 100644 --- a/store/gcworker/gc_worker.go +++ b/store/gcworker/gc_worker.go @@ -91,7 +91,7 @@ type GCWorker struct { lastFinish time.Time cancel context.CancelFunc done chan error - lockResolver gcutil.GCLockResolver + lockResolver tikv.GCLockResolver } // NewGCWorker creates a GCWorker instance. @@ -116,7 +116,7 @@ func NewGCWorker(store kv.Storage, pdClient pd.Client) (*GCWorker, error) { pdClient: pdClient, gcIsRunning: false, lastFinish: time.Now(), - lockResolver: &GCWorkerLockResolver{tikvStore}, + lockResolver: tikv.NewBaseLockResolver(tikvStore), done: make(chan error), } variable.RegisterStatistics(worker) @@ -1198,7 +1198,7 @@ func (w *GCWorker) legacyResolveLocks( startTime := time.Now() handler := func(ctx context.Context, r tikvstore.KeyRange) (rangetask.TaskStat, error) { - return gcutil.ResolveLocksForRange(ctx, w.uuid, "gc worker", w.lockResolver, safePoint, r.StartKey, r.EndKey) + return tikv.ResolveLocksForRange(ctx, w.uuid, w.lockResolver, safePoint, r.StartKey, r.EndKey) } runner := rangetask.NewRangeTaskRunner("resolve-locks-runner", w.tikvStore, concurrency, handler) From 7e02b0929c9bac14e923286753b40c21b48bd28a Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 16 Aug 2023 18:21:10 +0800 Subject: [PATCH 17/45] update test --- store/gcworker/gc_worker_test.go | 80 ++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/store/gcworker/gc_worker_test.go b/store/gcworker/gc_worker_test.go index 084c235bcbd44..d1dac0bb063e1 100644 --- a/store/gcworker/gc_worker_test.go +++ b/store/gcworker/gc_worker_test.go @@ -39,7 +39,6 @@ import ( "github.com/pingcap/tidb/parser/model" "github.com/pingcap/tidb/session" "github.com/pingcap/tidb/store/mockstore" - "github.com/pingcap/tidb/util/gcutil" "github.com/stretchr/testify/require" "github.com/tikv/client-go/v2/oracle" "github.com/tikv/client-go/v2/oracle/oracles" @@ -52,20 +51,21 @@ import ( type mockGCWorkerLockResolver struct { tikvStore tikv.Storage - scanLocks func(key []byte, regionID uint64) []*txnlock.Lock - batchResolveLocks func(locks []*txnlock.Lock, regionID tikv.RegionVerID) (ok bool, err error) + scanLocks func([]byte) ([]*txnlock.Lock, *tikv.KeyLocation) + batchResolveLocks func([]*txnlock.Lock, *tikv.KeyLocation) (*tikv.KeyLocation, error) } -func (l *mockGCWorkerLockResolver) ScanLocks(key []byte, regionID uint64) []*txnlock.Lock { - return l.scanLocks(key, regionID) +func (l *mockGCWorkerLockResolver) ScanLocks(ctx context.Context, key []byte, maxVersion uint64) ([]*txnlock.Lock, *tikv.KeyLocation, error) { + locks, loc := l.scanLocks(key) + return locks, loc, nil + } -func (l *mockGCWorkerLockResolver) ResolveLocks(bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) { +func (l *mockGCWorkerLockResolver) ResolveLocks(ctx context.Context, locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { return l.batchResolveLocks(locks, loc) } func (l *mockGCWorkerLockResolver) GetStore() tikv.Storage { - // no use in test return l.tikvStore } @@ -1031,13 +1031,11 @@ func TestResolveLockRangeInfine(t *testing.T) { s := createGCWorkerSuite(t) require.NoError(t, failpoint.Enable("tikvclient/invalidCacheAndRetry", "return(true)")) - require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/util/gcutil/setGcResolveMaxBackoff", "return(1)")) defer func() { require.NoError(t, failpoint.Disable("tikvclient/invalidCacheAndRetry")) - require.NoError(t, failpoint.Disable("github.com/pingcap/tidb/util/gcutil/setGcResolveMaxBackoff")) }() - _, err := gcutil.ResolveLocksForRange(gcContext(), s.gcWorker.uuid, s.gcWorker.lockResolver, 1, []byte{0}, []byte{1}) + _, err := tikv.ResolveLocksForRange(gcContext(), s.gcWorker.uuid, s.gcWorker.lockResolver, 1, []byte{0}, []byte{1}) require.Error(t, err) } @@ -1082,27 +1080,29 @@ func TestResolveLockRangeMeetRegionCacheMiss(t *testing.T) { mockLockResolver := &mockGCWorkerLockResolver{ tikvStore: s.tikvStore, - scanLocks: func(key []byte, regionID uint64) []*txnlock.Lock { + scanLocks: func(key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { *scanCntRef++ - return allLocks + return allLocks, &tikv.KeyLocation{ + Region: tikv.NewRegionVerID(s.initRegion.regionID, 0, 0), + } }, batchResolveLocks: func( locks []*txnlock.Lock, - regionID tikv.RegionVerID, - ) (ok bool, err error) { + loc *tikv.KeyLocation, + ) (*tikv.KeyLocation, error) { *resolveCntRef++ if *resolveCntRef == 1 { - s.gcWorker.tikvStore.GetRegionCache().InvalidateCachedRegion(regionID) + s.gcWorker.tikvStore.GetRegionCache().InvalidateCachedRegion(loc.Region) // mock the region cache miss error - return false, nil + return nil, nil } - return true, nil + return loc, nil }, } - _, err := gcutil.ResolveLocksForRange(gcContext(), s.gcWorker.uuid, mockLockResolver, safepointTS, []byte{0}, []byte{10}) + _, err := tikv.ResolveLocksForRange(gcContext(), s.gcWorker.uuid, mockLockResolver, safepointTS, []byte{0}, []byte{10}) require.NoError(t, err) require.Equal(t, 2, resolveCnt) - require.Equal(t, 1, scanCnt) + require.Equal(t, 2, scanCnt) } func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { @@ -1128,21 +1128,29 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { mockGCLockResolver := &mockGCWorkerLockResolver{ tikvStore: s.tikvStore, - scanLocks: func(key []byte, regionID uint64) []*txnlock.Lock { - if regionID == s.initRegion.regionID { - return []*txnlock.Lock{{Key: []byte("a")}, {Key: []byte("b")}} + scanLocks: func(key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { + // first time scan locks + if bytes.Equal(key, []byte("")) { + return []*txnlock.Lock{{Key: []byte("a")}, {Key: []byte("b")}}, + &tikv.KeyLocation{ + Region: tikv.NewRegionVerID(s.initRegion.regionID, 0, 0), + } } - if regionID == region2 { - return []*txnlock.Lock{{Key: []byte("o")}, {Key: []byte("p")}} + // second time scan locks + if bytes.Equal(key, []byte("m")) { + return []*txnlock.Lock{{Key: []byte("o")}, {Key: []byte("p")}}, + &tikv.KeyLocation{ + Region: tikv.NewRegionVerID(region2, 0, 0), + } } - return []*txnlock.Lock{} + return []*txnlock.Lock{}, nil }, } mockGCLockResolver.batchResolveLocks = func( locks []*txnlock.Lock, - regionID tikv.RegionVerID, - ) (ok bool, err error) { - if regionID.GetID() == s.initRegion.regionID && *firstAccessRef { + loc *tikv.KeyLocation, + ) (*tikv.KeyLocation, error) { + if loc.Region.GetID() == s.initRegion.regionID && *firstAccessRef { *firstAccessRef = false // merge region2 into region1 and return EpochNotMatch error. mCluster := s.cluster.(*testutils.MockCluster) @@ -1150,12 +1158,12 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { regionMeta, _ := mCluster.GetRegion(s.initRegion.regionID) _, err := s.tikvStore.GetRegionCache().OnRegionEpochNotMatch( tikv.NewNoopBackoff(context.Background()), - &tikv.RPCContext{Region: regionID, Store: &tikv.Store{}}, + &tikv.RPCContext{Region: loc.Region, Store: &tikv.Store{}}, []*metapb.Region{regionMeta}) require.NoError(t, err) // also let region1 contains all 4 locks - mockGCLockResolver.scanLocks = func(key []byte, regionID uint64) []*txnlock.Lock { - if regionID == s.initRegion.regionID { + mockGCLockResolver.scanLocks = func(key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { + if bytes.Equal(key, []byte("")) { locks := []*txnlock.Lock{ {Key: []byte("a")}, {Key: []byte("b")}, @@ -1164,20 +1172,20 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { } for i, lock := range locks { if bytes.Compare(key, lock.Key) <= 0 { - return locks[i:] + return locks[i:], &tikv.KeyLocation{Region: tikv.NewRegionVerID(s.initRegion.regionID, 0, 0)} } } } - return []*txnlock.Lock{} + return []*txnlock.Lock{}, nil } - return false, nil + return nil, nil } for _, lock := range locks { resolvedLock = append(resolvedLock, lock.Key) } - return true, nil + return loc, nil } - _, err := gcutil.ResolveLocksForRange(gcContext(), s.gcWorker.uuid, mockGCLockResolver, 1, []byte(""), []byte("z")) + _, err := tikv.ResolveLocksForRange(gcContext(), s.gcWorker.uuid, mockGCLockResolver, 1, []byte(""), []byte("z")) require.NoError(t, err) require.Len(t, resolvedLock, 4) expects := [][]byte{[]byte("a"), []byte("b"), []byte("o"), []byte("p")} From 8dc9ce33954cbb096b11d5306880db3efa58cd84 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Thu, 17 Aug 2023 11:28:18 +0800 Subject: [PATCH 18/45] remove useless code --- util/gcutil/gcutil.go | 145 ------------------------------------------ 1 file changed, 145 deletions(-) diff --git a/util/gcutil/gcutil.go b/util/gcutil/gcutil.go index 3b481ef8d4ade..b1c0675a4994d 100644 --- a/util/gcutil/gcutil.go +++ b/util/gcutil/gcutil.go @@ -15,27 +15,18 @@ package gcutil import ( - "bytes" "context" "github.com/pingcap/errors" - "github.com/pingcap/failpoint" - "github.com/pingcap/kvproto/pkg/kvrpcpb" "github.com/pingcap/tidb/kv" - "github.com/pingcap/tidb/metrics" "github.com/pingcap/tidb/parser/model" "github.com/pingcap/tidb/sessionctx" "github.com/pingcap/tidb/sessionctx/variable" - "github.com/pingcap/tidb/util/logutil" "github.com/pingcap/tidb/util/sqlexec" - tikverr "github.com/tikv/client-go/v2/error" "github.com/tikv/client-go/v2/oracle" "github.com/tikv/client-go/v2/tikv" - "github.com/tikv/client-go/v2/tikvrpc" - "github.com/tikv/client-go/v2/txnkv/rangetask" "github.com/tikv/client-go/v2/txnkv/txnlock" "github.com/tikv/client-go/v2/util" - "go.uber.org/zap" ) const ( @@ -121,139 +112,3 @@ func GetGCSafePoint(sctx sessionctx.Context) (uint64, error) { ts := oracle.GoTimeToTS(safePointTime) return ts, nil } - -// ResolveLocksForRange is used for GCWorker and log backup advancer. -func ResolveLocksForRange( - ctx context.Context, - uuid string, - category string, - lockResolver GCLockResolver, - maxVersion uint64, - startKey []byte, - endKey []byte, -) (rangetask.TaskStat, error) { - // for scan lock request, we must return all locks even if they are generated - // by the same transaction. because gc worker need to make sure all locks have been - // cleaned. - req := tikvrpc.NewRequest(tikvrpc.CmdScanLock, &kvrpcpb.ScanLockRequest{ - MaxVersion: maxVersion, - Limit: GCScanLockLimit, - }, kvrpcpb.Context{ - RequestSource: util.RequestSourceFromCtx(ctx), - }) - - failpoint.Inject("lowScanLockLimit", func() { - req.ScanLock().Limit = 3 - }) - - var stat rangetask.TaskStat - key := startKey - bo := tikv.NewGcResolveLockMaxBackoffer(ctx) - failpoint.Inject("setGcResolveMaxBackoff", func(v failpoint.Value) { - sleep := v.(int) - // cooperate with github.com/tikv/client-go/v2/locate/invalidCacheAndRetry - //nolint: SA1029 - ctx = context.WithValue(ctx, "injectedBackoff", struct{}{}) //nolint - bo = tikv.NewBackofferWithVars(ctx, sleep, nil) - }) -retryScanAndResolve: - for { - select { - case <-ctx.Done(): - return stat, errors.New("[gc worker] gc job canceled") - default: - } - - req.ScanLock().StartKey = key - loc, err := lockResolver.GetStore().GetRegionCache().LocateKey(bo, key) - if err != nil { - return stat, errors.Trace(err) - } - req.ScanLock().EndKey = loc.EndKey - resp, err := lockResolver.GetStore().SendReq(bo, req, loc.Region, tikv.ReadTimeoutMedium) - logutil.Logger(ctx).Info("lockResolver resp", zap.Any("resp", resp)) - if err != nil { - return stat, errors.Trace(err) - } - regionErr, err := resp.GetRegionError() - if err != nil { - return stat, errors.Trace(err) - } - if regionErr != nil { - err = bo.Backoff(tikv.BoRegionMiss(), errors.New(regionErr.String())) - if err != nil { - return stat, errors.Trace(err) - } - continue - } - if resp.Resp == nil { - return stat, errors.Trace(tikverr.ErrBodyMissing) - } - locksResp := resp.Resp.(*kvrpcpb.ScanLockResponse) - if locksResp.GetError() != nil { - return stat, errors.Errorf("unexpected scanlock error: %s", locksResp) - } - locksInfo := locksResp.GetLocks() - locks := make([]*txnlock.Lock, 0, len(locksInfo)) - for _, li := range locksInfo { - locks = append(locks, txnlock.NewLock(li)) - } - locks = append(locks, lockResolver.ScanLocks(key, loc.Region.GetID())...) - locForResolve := loc - for { - ok, err1 := lockResolver.ResolveLocks(bo, locks, locForResolve.Region) - if err1 != nil { - return stat, errors.Trace(err1) - } - if !ok { - err = bo.Backoff(tikv.BoTxnLock(), errors.Errorf("remain locks: %d", len(locks))) - if err != nil { - return stat, errors.Trace(err) - } - stillInSame, refreshedLoc, err := tryRelocateLocksRegion(bo, lockResolver, locks) - if err != nil { - return stat, errors.Trace(err) - } - if stillInSame { - locForResolve = refreshedLoc - continue - } - continue retryScanAndResolve - } - break - } - if len(locks) < GCScanLockLimit { - stat.CompletedRegions++ - key = loc.EndKey - } else { - logutil.Logger(ctx).Info("region has more than limit locks", zap.String("category", category), - zap.String("uuid", uuid), - zap.Uint64("region", locForResolve.Region.GetID()), - zap.Int("scan lock limit", GCScanLockLimit)) - metrics.GCRegionTooManyLocksCounter.Inc() - key = locks[len(locks)-1].Key - } - - if len(key) == 0 || (len(endKey) != 0 && bytes.Compare(key, endKey) >= 0) { - break - } - bo = tikv.NewGcResolveLockMaxBackoffer(ctx) - failpoint.Inject("setGcResolveMaxBackoff", func(v failpoint.Value) { - sleep := v.(int) - bo = tikv.NewBackofferWithVars(ctx, sleep, nil) - }) - } - return stat, nil -} - -func tryRelocateLocksRegion(bo *tikv.Backoffer, lockResolver GCLockResolver, locks []*txnlock.Lock) (stillInSameRegion bool, refreshedLoc *tikv.KeyLocation, err error) { - if len(locks) == 0 { - return - } - refreshedLoc, err = lockResolver.GetStore().GetRegionCache().LocateKey(bo, locks[0].Key) - if err != nil { - return - } - stillInSameRegion = refreshedLoc.Contains(locks[len(locks)-1].Key) - return -} From 29e65d95f4b0c889c3f3dcb4beab592bbd44f54e Mon Sep 17 00:00:00 2001 From: 3pointer Date: Thu, 17 Aug 2023 11:54:07 +0800 Subject: [PATCH 19/45] update test --- br/pkg/streamhelper/advancer_test.go | 11 ++++++----- br/pkg/streamhelper/basic_lib_for_test.go | 20 +++++++++----------- go.mod | 2 +- go.sum | 4 ++-- store/gcworker/gc_worker.go | 20 +------------------- util/gcutil/gcutil.go | 23 ----------------------- 6 files changed, 19 insertions(+), 61 deletions(-) diff --git a/br/pkg/streamhelper/advancer_test.go b/br/pkg/streamhelper/advancer_test.go index 4b64d0b1d1ac1..07b8b695736be 100644 --- a/br/pkg/streamhelper/advancer_test.go +++ b/br/pkg/streamhelper/advancer_test.go @@ -313,17 +313,18 @@ func TestResolveLock(t *testing.T) { TxnID: minCheckpoint + 1, }, } - env.scanLocks = func(key []byte, regionID uint64) []*txnlock.Lock { - return allLocks - + env.scanLocks = func(key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { + return allLocks, &tikv.KeyLocation{ + Region: tikv.NewRegionVerID(1, 0, 0), + } } // ensure resolve locks triggered and collect all locks from scan locks resolveLockCnt := 0 resolveLockCntRef := &resolveLockCnt - env.resolveLocks = func(locks []*txnlock.Lock, regionID tikv.RegionVerID) (ok bool, err error) { + env.resolveLocks = func(locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { *resolveLockCntRef += 1 require.ElementsMatch(t, locks, allLocks) - return true, nil + return loc, nil } adv := streamhelper.NewCheckpointAdvancer(env) // make lastCheckpoint stuck at 123 diff --git a/br/pkg/streamhelper/basic_lib_for_test.go b/br/pkg/streamhelper/basic_lib_for_test.go index ef7eab388f9a0..b7b598217db6b 100644 --- a/br/pkg/streamhelper/basic_lib_for_test.go +++ b/br/pkg/streamhelper/basic_lib_for_test.go @@ -588,8 +588,8 @@ type testEnv struct { ranges []kv.KeyRange taskCh chan<- streamhelper.TaskEvent - scanLocks func(key []byte, regionID uint64) []*txnlock.Lock - resolveLocks func(locks []*txnlock.Lock, regionID tikv.RegionVerID) (ok bool, err error) + scanLocks func([]byte) ([]*txnlock.Lock, *tikv.KeyLocation) + resolveLocks func([]*txnlock.Lock, *tikv.KeyLocation) (*tikv.KeyLocation, error) mu sync.Mutex } @@ -645,22 +645,20 @@ func (t *testEnv) unregisterTask() { } } -func (t *testEnv) ResolveLocks( - bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) { - if len(locks) == 0 { - return true, nil - } +func (t *testEnv) ScanLocks(ctx context.Context, key []byte, maxVersion uint64) ([]*txnlock.Lock, *tikv.KeyLocation, error) { + locks, loc := t.scanLocks(key) + return locks, loc, nil - return t.resolveLocks(locks, loc) } -func (t *testEnv) ScanLocks(key []byte, regionID uint64) []*txnlock.Lock { - return t.scanLocks(key, regionID) +func (t *testEnv) ResolveLocks(ctx context.Context, locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { + return t.resolveLocks(locks, loc) } func (t *testEnv) GetStore() tikv.Storage { + return nil // only used for GetRegionCache once in resolve lock - return &mockTiKVStore{regionCache: tikv.NewRegionCache(&mockPDClient{})} + //return &mockTiKVStore{regionCache: tikv.NewRegionCache(&mockPDClient{})} } type mockKVStore struct { diff --git a/go.mod b/go.mod index 6f670e9c68fd5..87c2593770123 100644 --- a/go.mod +++ b/go.mod @@ -304,5 +304,5 @@ replace ( github.com/dgrijalva/jwt-go => github.com/form3tech-oss/jwt-go v3.2.6-0.20210809144907-32ab6a8243d7+incompatible github.com/go-ldap/ldap/v3 => github.com/YangKeao/ldap/v3 v3.4.5-0.20230421065457-369a3bab1117 github.com/pingcap/tidb/parser => ./parser - github.com/tikv/client-go/v2 => github.com/3pointer/client-go/v2 v2.0.1-0.20230816091421-4125f3e49bda + github.com/tikv/client-go/v2 => github.com/3pointer/client-go/v2 v2.0.1-0.20230817033745-505e606d43bc ) diff --git a/go.sum b/go.sum index 24fe3b033503f..d5adc799335bd 100644 --- a/go.sum +++ b/go.sum @@ -47,8 +47,8 @@ cloud.google.com/go/storage v1.30.1 h1:uOdMxAs8HExqBlnLtnQyP0YkvbiDpdGShGKtx6U/o cloud.google.com/go/storage v1.30.1/go.mod h1:NfxhC0UJE1aXSx7CIIbCf7y9HKT7BiccwkR7+P7gN8E= contrib.go.opencensus.io/exporter/ocagent v0.4.12/go.mod h1:450APlNTSR6FrvC3CTRqYosuDstRB9un7SOx2k/9ckA= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= -github.com/3pointer/client-go/v2 v2.0.1-0.20230816091421-4125f3e49bda h1:EY4NdDiFOPNHe5NYbDTTreLjm9G2swaWvbPjtPWJzto= -github.com/3pointer/client-go/v2 v2.0.1-0.20230816091421-4125f3e49bda/go.mod h1:J17iHkj8buCLDF7lgKJLX5jq5aGozrbpa7+Ln6g8Xjc= +github.com/3pointer/client-go/v2 v2.0.1-0.20230817033745-505e606d43bc h1:y4KdIgx8v7yA2GH3Rzs4foBQB0u61+IuDyvXC+dU8Wg= +github.com/3pointer/client-go/v2 v2.0.1-0.20230817033745-505e606d43bc/go.mod h1:J17iHkj8buCLDF7lgKJLX5jq5aGozrbpa7+Ln6g8Xjc= github.com/AndreasBriese/bbloom v0.0.0-20190306092124-e2d15f34fcf9/go.mod h1:bOvUY6CB00SOBii9/FifXqc0awNKxLFCL/+pkDPuyl8= github.com/Azure/azure-sdk-for-go v23.2.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.6.0 h1:8kDqDngH+DmVBiCtIjCFTGa7MBnsIOkF9IccInFEbjk= diff --git a/store/gcworker/gc_worker.go b/store/gcworker/gc_worker.go index ee17a89980a13..39e7adf45e8d1 100644 --- a/store/gcworker/gc_worker.go +++ b/store/gcworker/gc_worker.go @@ -48,7 +48,6 @@ import ( "github.com/pingcap/tidb/tablecodec" "github.com/pingcap/tidb/util/codec" "github.com/pingcap/tidb/util/dbterror" - "github.com/pingcap/tidb/util/gcutil" "github.com/pingcap/tidb/util/logutil" tikverr "github.com/tikv/client-go/v2/error" tikvstore "github.com/tikv/client-go/v2/kv" @@ -63,23 +62,6 @@ import ( "golang.org/x/exp/slices" ) -type GCWorkerLockResolver struct { - TiKvStore tikv.Storage -} - -func (w *GCWorkerLockResolver) ResolveLocks(bo *tikv.Backoffer, locks []*txnlock.Lock, loc tikv.RegionVerID) (bool, error) { - // Resolve locks without check txn status. it's safe because it use safepoint to scan locks. - return w.TiKvStore.GetLockResolver().BatchResolveLocks(bo, locks, loc) -} - -func (w *GCWorkerLockResolver) ScanLocks(key []byte, regionID uint64) []*txnlock.Lock { - return nil -} - -func (w *GCWorkerLockResolver) GetStore() tikv.Storage { - return w.TiKvStore -} - // GCWorker periodically triggers GC process on tikv server. type GCWorker struct { uuid string @@ -2249,7 +2231,7 @@ func newMergeLockScanner(safePoint uint64, client tikv.Client, stores map[uint64 safePoint: safePoint, client: client, stores: stores, - scanLockLimit: gcutil.GCScanLockLimit, + scanLockLimit: tikv.GCScanLockLimit, } failpoint.Inject("lowPhysicalScanLockLimit", func() { scanner.scanLockLimit = 3 diff --git a/util/gcutil/gcutil.go b/util/gcutil/gcutil.go index b1c0675a4994d..9474cace52378 100644 --- a/util/gcutil/gcutil.go +++ b/util/gcutil/gcutil.go @@ -24,36 +24,13 @@ import ( "github.com/pingcap/tidb/sessionctx/variable" "github.com/pingcap/tidb/util/sqlexec" "github.com/tikv/client-go/v2/oracle" - "github.com/tikv/client-go/v2/tikv" - "github.com/tikv/client-go/v2/txnkv/txnlock" "github.com/tikv/client-go/v2/util" ) const ( selectVariableValueSQL = `SELECT HIGH_PRIORITY variable_value FROM mysql.tidb WHERE variable_name=%?` - - // GCScanLockLimit We don't want gc to sweep out the cached info belong to other processes, like coprocessor. - GCScanLockLimit = txnlock.ResolvedCacheSize / 2 ) -// GCLockResolver is used for GCWorker and log backup advancer to resolve locks. -// #Note: Put it here to avoid cycle import -type GCLockResolver interface { - // ResolveLocks tries to resolve expired locks. - // 1. For GCWorker it will scan locks for all regions before *safepoint*, - // and force remove locks. rollback the txn, no matter the lock is expired of not. - // 2. For log backup advancer, it will scan all locks for a small range. - // and it will check status of the txn. resolve the locks if txn is expired, Or do nothing. - ResolveLocks(*tikv.Backoffer, []*txnlock.Lock, tikv.RegionVerID) (bool, error) - - // ScanLocks only used for mock test. - ScanLocks([]byte, uint64) []*txnlock.Lock - // We need to get tikvStore to build rangerunner. - // TODO: the most code is in client.go and the store is only used to locate end keys of a region. - // maybe we can move GCLockResolver into client.go. - GetStore() tikv.Storage -} - // CheckGCEnable is use to check whether GC is enable. func CheckGCEnable(ctx sessionctx.Context) (enable bool, err error) { val, err := ctx.GetSessionVars().GlobalVarsAccessor.GetGlobalSysVar(variable.TiDBGCEnable) From ba1a4d18e245eb870dba3f07ea4127e7d6a611bd Mon Sep 17 00:00:00 2001 From: 3pointer Date: Thu, 17 Aug 2023 14:06:16 +0800 Subject: [PATCH 20/45] fix test --- br/pkg/streamhelper/advancer.go | 5 +++++ br/pkg/streamhelper/advancer_daemon.go | 5 +++++ br/pkg/streamhelper/advancer_test.go | 9 ++++++++- br/pkg/streamhelper/basic_lib_for_test.go | 3 +-- 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/br/pkg/streamhelper/advancer.go b/br/pkg/streamhelper/advancer.go index adc0652e9d671..34d75a0f49d5e 100644 --- a/br/pkg/streamhelper/advancer.go +++ b/br/pkg/streamhelper/advancer.go @@ -237,6 +237,11 @@ func (c *CheckpointAdvancer) WithCheckpoints(f func(*spans.ValueSortedFull)) { f(c.checkpoints) } +// only used for test +func (c *CheckpointAdvancer) NewCheckpoints(cps *spans.ValueSortedFull) { + c.checkpoints = cps +} + func (c *CheckpointAdvancer) CalculateGlobalCheckpointLight(ctx context.Context, threshold time.Duration) (spans.Valued, error) { var targets []spans.Valued diff --git a/br/pkg/streamhelper/advancer_daemon.go b/br/pkg/streamhelper/advancer_daemon.go index c30dcba4d5ab7..fe9b1dda56cfd 100644 --- a/br/pkg/streamhelper/advancer_daemon.go +++ b/br/pkg/streamhelper/advancer_daemon.go @@ -5,6 +5,7 @@ package streamhelper import ( "context" "math" + "sync" "time" "github.com/google/uuid" @@ -82,9 +83,12 @@ func (c *CheckpointAdvancer) OnBecomeOwner(ctx context.Context) { return tikv.ResolveLocksForRange(ctx, "log backup advancer", c.env, math.MaxUint64, r.StartKey, r.EndKey) } workerPool := utils.NewWorkerPool(uint(config.DefaultMaxConcurrencyAdvance), "advancer resolve locks") + var wg sync.WaitGroup for _, r := range targets { targetRange := r + wg.Add(1) workerPool.Apply(func() { + defer wg.Done() // Run resolve lock on the whole TiKV cluster. // it will use startKey/endKey to scan region in PD. // but regionCache already has a codecPDClient. so just use decode key here. @@ -100,6 +104,7 @@ func (c *CheckpointAdvancer) OnBecomeOwner(ctx context.Context) { } }) } + wg.Wait() log.Info("finish resolve locks for checkpoint", zap.String("category", "advancer"), zap.String("uuid", "log backup advancer"), logutil.Key("StartKey", c.lastCheckpoint.StartKey), diff --git a/br/pkg/streamhelper/advancer_test.go b/br/pkg/streamhelper/advancer_test.go index 07b8b695736be..8e0d0573806bc 100644 --- a/br/pkg/streamhelper/advancer_test.go +++ b/br/pkg/streamhelper/advancer_test.go @@ -335,6 +335,14 @@ func TestResolveLock(t *testing.T) { }, Value: 123, })) + adv.NewCheckpoints( + spans.Sorted(spans.NewFullWith([]kv.KeyRange{ + { + StartKey: kv.Key([]byte("1")), + EndKey: kv.Key([]byte("2")), + }, + }, 0)), + ) adv.OnBecomeOwner(ctx) coll := streamhelper.NewClusterCollector(ctx, env) err := adv.GetCheckpointInRange(ctx, []byte{}, []byte{}, coll) @@ -347,5 +355,4 @@ func TestResolveLock(t *testing.T) { require.NoError(t, err) require.Len(t, r.FailureSubRanges, 0) require.Equal(t, r.Checkpoint, minCheckpoint, "%d %d", r.Checkpoint, minCheckpoint) - } diff --git a/br/pkg/streamhelper/basic_lib_for_test.go b/br/pkg/streamhelper/basic_lib_for_test.go index b7b598217db6b..aceeac07c16dd 100644 --- a/br/pkg/streamhelper/basic_lib_for_test.go +++ b/br/pkg/streamhelper/basic_lib_for_test.go @@ -656,9 +656,8 @@ func (t *testEnv) ResolveLocks(ctx context.Context, locks []*txnlock.Lock, loc * } func (t *testEnv) GetStore() tikv.Storage { - return nil // only used for GetRegionCache once in resolve lock - //return &mockTiKVStore{regionCache: tikv.NewRegionCache(&mockPDClient{})} + return &mockTiKVStore{regionCache: tikv.NewRegionCache(&mockPDClient{})} } type mockKVStore struct { From c1628f7ce5141273b3d1275e7efca93732df1e87 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Mon, 21 Aug 2023 10:50:59 +0800 Subject: [PATCH 21/45] update check time --- br/pkg/streamhelper/advancer.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/br/pkg/streamhelper/advancer.go b/br/pkg/streamhelper/advancer.go index 34d75a0f49d5e..7c9bd34c75fa0 100644 --- a/br/pkg/streamhelper/advancer.go +++ b/br/pkg/streamhelper/advancer.go @@ -108,14 +108,14 @@ func (c checkpoint) equal(o checkpoint) bool { bytes.Equal(c.EndKey, o.EndKey) && c.TS == o.TS } -// if a checkpoint stay in a time too long(1 min) +// if a checkpoint stay in a time too long(3 min) // we should try to resolve lock for the range -// to keep the RPO in a short value. +// to keep the RPO in 5 min. func (c checkpoint) needResolveLocks() bool { failpoint.Inject("NeedResolveLocks", func(val failpoint.Value) { failpoint.Return(val.(bool)) }) - return time.Since(c.resolveLockTime) > time.Minute + return time.Since(c.resolveLockTime) > 3*time.Minute } // NewCheckpointAdvancer creates a checkpoint advancer with the env. From e58f0fadc9d150cd97a590c269f55095b2277277 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Mon, 28 Aug 2023 11:30:34 +0800 Subject: [PATCH 22/45] adapt new lockResolver --- br/pkg/streamhelper/advancer_daemon.go | 2 +- br/pkg/streamhelper/advancer_env.go | 16 +++++++--------- go.mod | 4 ++-- go.sum | 13 ++++--------- store/gcworker/gc_worker.go | 14 ++++++++++---- store/gcworker/gc_worker_test.go | 8 ++++++-- 6 files changed, 30 insertions(+), 27 deletions(-) diff --git a/br/pkg/streamhelper/advancer_daemon.go b/br/pkg/streamhelper/advancer_daemon.go index fe9b1dda56cfd..4ad442c234628 100644 --- a/br/pkg/streamhelper/advancer_daemon.go +++ b/br/pkg/streamhelper/advancer_daemon.go @@ -80,7 +80,7 @@ func (c *CheckpointAdvancer) OnBecomeOwner(ctx context.Context) { }) handler := func(ctx context.Context, r tikvstore.KeyRange) (rangetask.TaskStat, error) { // we will scan all locks and try to resolve them by check txn status. - return tikv.ResolveLocksForRange(ctx, "log backup advancer", c.env, math.MaxUint64, r.StartKey, r.EndKey) + return tikv.ResolveLocksForRange(ctx, c.env, math.MaxUint64, r.StartKey, r.EndKey, tikv.NewGcResolveLockMaxBackoffer, tikv.GCScanLockLimit) } workerPool := utils.NewWorkerPool(uint(config.DefaultMaxConcurrencyAdvance), "advancer resolve locks") var wg sync.WaitGroup diff --git a/br/pkg/streamhelper/advancer_env.go b/br/pkg/streamhelper/advancer_env.go index d2fde5fe6a23e..4648085825f6f 100644 --- a/br/pkg/streamhelper/advancer_env.go +++ b/br/pkg/streamhelper/advancer_env.go @@ -32,7 +32,7 @@ type Env interface { // StreamMeta connects to the metadata service (normally PD). StreamMeta // GCLockResolver try to resolve locks when region checkpoint stopped. - tikv.GCLockResolver + tikv.RegionLockResolver } // PDRegionScanner is a simple wrapper over PD @@ -147,25 +147,23 @@ type StreamMeta interface { ClearV3GlobalCheckpointForTask(ctx context.Context, taskName string) error } -var _ tikv.GCLockResolver = &AdvancerLockResolver{} +var _ tikv.RegionLockResolver = &AdvancerLockResolver{} type AdvancerLockResolver struct { store tikv.Storage - *tikv.BaseLockResolver + *tikv.BaseRegionLockResolver } func newAdvancerLockResolver(store tikv.Storage) *AdvancerLockResolver { return &AdvancerLockResolver{ - store: store, - BaseLockResolver: tikv.NewBaseLockResolver(store), + store: store, + BaseRegionLockResolver: tikv.NewRegionLockResolver("log backup advancer", store), } } -// ResolveLocks tries to resolve expired locks with this method. +// ResolveLocksInOneRegion tries to resolve expired locks with this method. // It will check status of the txn. Resolve the lock if txn is expired, Or do nothing. -func (l *AdvancerLockResolver) ResolveLocks(ctx context.Context, locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { - // renew backoffer - bo := tikv.NewGcResolveLockMaxBackoffer(ctx) +func (l *AdvancerLockResolver) ResolveLocksInOneRegion(bo *tikv.Backoffer, locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { _, err := l.GetStore().GetLockResolver().ResolveLocks(bo, 0, locks) if err != nil { return nil, err diff --git a/go.mod b/go.mod index b9e6ae1d08572..694973dec5a55 100644 --- a/go.mod +++ b/go.mod @@ -78,7 +78,7 @@ require ( github.com/pingcap/errors v0.11.5-0.20221009092201-b66cddb77c32 github.com/pingcap/failpoint v0.0.0-20220801062533-2eaa32854a6c github.com/pingcap/fn v1.0.0 - github.com/pingcap/kvproto v0.0.0-20230728080053-8a9db88bc88a + github.com/pingcap/kvproto v0.0.0-20230818065851-7b612d935bf9 github.com/pingcap/log v1.1.1-0.20230317032135-a0d097d16e22 github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 github.com/pingcap/tidb/parser v0.0.0-20211011031125-9b13dc409c5e @@ -304,5 +304,5 @@ replace ( github.com/dgrijalva/jwt-go => github.com/form3tech-oss/jwt-go v3.2.6-0.20210809144907-32ab6a8243d7+incompatible github.com/go-ldap/ldap/v3 => github.com/YangKeao/ldap/v3 v3.4.5-0.20230421065457-369a3bab1117 github.com/pingcap/tidb/parser => ./parser - github.com/tikv/client-go/v2 => github.com/3pointer/client-go/v2 v2.0.1-0.20230817033745-505e606d43bc + github.com/tikv/client-go/v2 => github.com/3pointer/client-go/v2 v2.0.1-0.20230825093018-1cd747c7575f ) diff --git a/go.sum b/go.sum index 0dd37fa5d41dd..ebd374090ca3b 100644 --- a/go.sum +++ b/go.sum @@ -49,8 +49,8 @@ cloud.google.com/go/storage v1.30.1 h1:uOdMxAs8HExqBlnLtnQyP0YkvbiDpdGShGKtx6U/o cloud.google.com/go/storage v1.30.1/go.mod h1:NfxhC0UJE1aXSx7CIIbCf7y9HKT7BiccwkR7+P7gN8E= contrib.go.opencensus.io/exporter/ocagent v0.4.12/go.mod h1:450APlNTSR6FrvC3CTRqYosuDstRB9un7SOx2k/9ckA= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= -github.com/3pointer/client-go/v2 v2.0.1-0.20230817033745-505e606d43bc h1:y4KdIgx8v7yA2GH3Rzs4foBQB0u61+IuDyvXC+dU8Wg= -github.com/3pointer/client-go/v2 v2.0.1-0.20230817033745-505e606d43bc/go.mod h1:J17iHkj8buCLDF7lgKJLX5jq5aGozrbpa7+Ln6g8Xjc= +github.com/3pointer/client-go/v2 v2.0.1-0.20230825093018-1cd747c7575f h1:3XzIm4xkVbquFvt8ju4so9fvbi41i6+75dre7FJhUT4= +github.com/3pointer/client-go/v2 v2.0.1-0.20230825093018-1cd747c7575f/go.mod h1:2cndkbSuokkQ6TvYdcUbSfd0IYXbQx9HfdPEc+r0ezg= github.com/AndreasBriese/bbloom v0.0.0-20190306092124-e2d15f34fcf9/go.mod h1:bOvUY6CB00SOBii9/FifXqc0awNKxLFCL/+pkDPuyl8= github.com/Azure/azure-sdk-for-go v23.2.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.6.0 h1:8kDqDngH+DmVBiCtIjCFTGa7MBnsIOkF9IccInFEbjk= @@ -822,8 +822,8 @@ github.com/pingcap/fn v1.0.0/go.mod h1:u9WZ1ZiOD1RpNhcI42RucFh/lBuzTu6rw88a+oF2Z github.com/pingcap/goleveldb v0.0.0-20191226122134-f82aafb29989 h1:surzm05a8C9dN8dIUmo4Be2+pMRb6f55i+UIYrluu2E= github.com/pingcap/goleveldb v0.0.0-20191226122134-f82aafb29989/go.mod h1:O17XtbryoCJhkKGbT62+L2OlrniwqiGLSqrmdHCMzZw= github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w= -github.com/pingcap/kvproto v0.0.0-20230728080053-8a9db88bc88a h1:5gSwd337GRaT1E0O3y10jb2ZDzv+h30pjCpRcCC5Phg= -github.com/pingcap/kvproto v0.0.0-20230728080053-8a9db88bc88a/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= +github.com/pingcap/kvproto v0.0.0-20230818065851-7b612d935bf9 h1:VDoZ18CAXoTUNTCxfl4BjQSD5rJQri8QlH8nu0ZuHeg= +github.com/pingcap/kvproto v0.0.0-20230818065851-7b612d935bf9/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuRurdGxZXBz0At+9avep+ub7U1AGYLIMM= github.com/pingcap/log v1.1.0/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pingcap/log v1.1.1-0.20230317032135-a0d097d16e22 h1:2SOzvGvE8beiC1Y4g9Onkvu6UmuBBOeWRGQEjJaT/JY= @@ -1006,11 +1006,6 @@ github.com/tiancaiamao/appdash v0.0.0-20181126055449-889f96f722a2 h1:mbAskLJ0oJf github.com/tiancaiamao/appdash v0.0.0-20181126055449-889f96f722a2/go.mod h1:2PfKggNGDuadAa0LElHrByyrz4JPZ9fFx6Gs7nx7ZZU= github.com/tiancaiamao/gp v0.0.0-20221230034425-4025bc8a4d4a h1:J/YdBZ46WKpXsxsW93SG+q0F8KI+yFrcIDT4c/RNoc4= github.com/tiancaiamao/gp v0.0.0-20221230034425-4025bc8a4d4a/go.mod h1:h4xBhSNtOeEosLJ4P7JyKXX7Cabg7AVkWCK5gV2vOrM= -<<<<<<< HEAD -======= -github.com/tikv/client-go/v2 v2.0.8-0.20230811033710-8a214402da13 h1:oTAPyrDR5UFVhg4SYmHNQ1gHQrwQfBjGGK/zKbz8VcA= -github.com/tikv/client-go/v2 v2.0.8-0.20230811033710-8a214402da13/go.mod h1:J17iHkj8buCLDF7lgKJLX5jq5aGozrbpa7+Ln6g8Xjc= ->>>>>>> master github.com/tikv/pd/client v0.0.0-20230728033905-31343e006842 h1:TwjBJvRx/DJbgMt7Vk5cFO7tG1DZnxR+22S2VmaNGRw= github.com/tikv/pd/client v0.0.0-20230728033905-31343e006842/go.mod h1:VJwM+qMcQxvGgyu9C6wU7fhjLaALs+odsOvpUMmnhHo= github.com/timakin/bodyclose v0.0.0-20230421092635-574207250966 h1:quvGphlmUVU+nhpFa4gg4yJyTRJ13reZMDHrKwYw53M= diff --git a/store/gcworker/gc_worker.go b/store/gcworker/gc_worker.go index b5c52a8d171ce..0b50f774785c5 100644 --- a/store/gcworker/gc_worker.go +++ b/store/gcworker/gc_worker.go @@ -73,7 +73,7 @@ type GCWorker struct { lastFinish time.Time cancel context.CancelFunc done chan error - lockResolver tikv.GCLockResolver + lockResolver tikv.RegionLockResolver } // NewGCWorker creates a GCWorker instance. @@ -90,15 +90,17 @@ func NewGCWorker(store kv.Storage, pdClient pd.Client) (*GCWorker, error) { if !ok { return nil, errors.New("GC should run against TiKV storage") } + uuid := strconv.FormatUint(ver.Ver, 16) + resolverIdentifier := fmt.Sprintf("gc-worker-%s", uuid) worker := &GCWorker{ - uuid: strconv.FormatUint(ver.Ver, 16), + uuid: uuid, desc: fmt.Sprintf("host:%s, pid:%d, start at %s", hostName, os.Getpid(), time.Now()), store: store, tikvStore: tikvStore, pdClient: pdClient, gcIsRunning: false, lastFinish: time.Now(), - lockResolver: tikv.NewBaseLockResolver(tikvStore), + lockResolver: tikv.NewRegionLockResolver(resolverIdentifier, tikvStore), done: make(chan error), } variable.RegisterStatistics(worker) @@ -1180,7 +1182,11 @@ func (w *GCWorker) legacyResolveLocks( startTime := time.Now() handler := func(ctx context.Context, r tikvstore.KeyRange) (rangetask.TaskStat, error) { - return tikv.ResolveLocksForRange(ctx, w.uuid, w.lockResolver, safePoint, r.StartKey, r.EndKey) + scanLimit := uint32(tikv.GCScanLockLimit) + failpoint.Inject("lowScanLockLimit", func() { + scanLimit = 3 + }) + return tikv.ResolveLocksForRange(ctx, w.lockResolver, safePoint, r.StartKey, r.EndKey, tikv.NewGcResolveLockMaxBackoffer, scanLimit) } runner := rangetask.NewRangeTaskRunner("resolve-locks-runner", w.tikvStore, concurrency, handler) diff --git a/store/gcworker/gc_worker_test.go b/store/gcworker/gc_worker_test.go index d1dac0bb063e1..103dffc8728b4 100644 --- a/store/gcworker/gc_worker_test.go +++ b/store/gcworker/gc_worker_test.go @@ -55,13 +55,13 @@ type mockGCWorkerLockResolver struct { batchResolveLocks func([]*txnlock.Lock, *tikv.KeyLocation) (*tikv.KeyLocation, error) } -func (l *mockGCWorkerLockResolver) ScanLocks(ctx context.Context, key []byte, maxVersion uint64) ([]*txnlock.Lock, *tikv.KeyLocation, error) { +func (l *mockGCWorkerLockResolver) ScanLocksInOneRegion(ctx context.Context, key []byte, maxVersion uint64) ([]*txnlock.Lock, *tikv.KeyLocation, error) { locks, loc := l.scanLocks(key) return locks, loc, nil } -func (l *mockGCWorkerLockResolver) ResolveLocks(ctx context.Context, locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { +func (l *mockGCWorkerLockResolver) ResolveLocksInOneRegion(ctx context.Context, locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { return l.batchResolveLocks(locks, loc) } @@ -69,6 +69,10 @@ func (l *mockGCWorkerLockResolver) GetStore() tikv.Storage { return l.tikvStore } +func (l *mockGCWorkerLockResolver) Identifier() string { + return "gc worker test" +} + type mockGCWorkerClient struct { tikv.Client unsafeDestroyRangeHandler handler From f22ece2c96b654a3452da90a67aa1ca71805a941 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Mon, 28 Aug 2023 11:54:29 +0800 Subject: [PATCH 23/45] make bazel_prepare --- DEPS.bzl | 12 ++++++------ br/pkg/streamhelper/BUILD.bazel | 9 ++++++--- store/gcworker/BUILD.bazel | 2 -- util/gcutil/BUILD.bazel | 10 ---------- 4 files changed, 12 insertions(+), 21 deletions(-) diff --git a/DEPS.bzl b/DEPS.bzl index 3f12473f2f7c2..5384bf731e296 100644 --- a/DEPS.bzl +++ b/DEPS.bzl @@ -6924,13 +6924,13 @@ def go_deps(): name = "com_github_tikv_client_go_v2", build_file_proto_mode = "disable_global", importpath = "github.com/tikv/client-go/v2", - sha256 = "608e5c393dcf7fa07a7a360333816dc479b05bad6ad489a4643c9a096e47f5d9", - strip_prefix = "github.com/tikv/client-go/v2@v2.0.8-0.20230811033710-8a214402da13", + sha256 = "0ad9b5fbfd1807fccc0ec29245b499903d58c57e0e313880012537ff7f059974", + strip_prefix = "github.com/3pointer/client-go/v2@v2.0.1-0.20230825093018-1cd747c7575f", urls = [ - "http://bazel-cache.pingcap.net:8080/gomod/github.com/tikv/client-go/v2/com_github_tikv_client_go_v2-v2.0.8-0.20230811033710-8a214402da13.zip", - "http://ats.apps.svc/gomod/github.com/tikv/client-go/v2/com_github_tikv_client_go_v2-v2.0.8-0.20230811033710-8a214402da13.zip", - "https://cache.hawkingrei.com/gomod/github.com/tikv/client-go/v2/com_github_tikv_client_go_v2-v2.0.8-0.20230811033710-8a214402da13.zip", - "https://storage.googleapis.com/pingcapmirror/gomod/github.com/tikv/client-go/v2/com_github_tikv_client_go_v2-v2.0.8-0.20230811033710-8a214402da13.zip", + "http://bazel-cache.pingcap.net:8080/gomod/github.com/3pointer/client-go/v2/com_github_3pointer_client_go_v2-v2.0.1-0.20230825093018-1cd747c7575f.zip", + "http://ats.apps.svc/gomod/github.com/3pointer/client-go/v2/com_github_3pointer_client_go_v2-v2.0.1-0.20230825093018-1cd747c7575f.zip", + "https://cache.hawkingrei.com/gomod/github.com/3pointer/client-go/v2/com_github_3pointer_client_go_v2-v2.0.1-0.20230825093018-1cd747c7575f.zip", + "https://storage.googleapis.com/pingcapmirror/gomod/github.com/3pointer/client-go/v2/com_github_3pointer_client_go_v2-v2.0.1-0.20230825093018-1cd747c7575f.zip", ], ) go_repository( diff --git a/br/pkg/streamhelper/BUILD.bazel b/br/pkg/streamhelper/BUILD.bazel index bd14d9d43e567..76ba1c9210018 100644 --- a/br/pkg/streamhelper/BUILD.bazel +++ b/br/pkg/streamhelper/BUILD.bazel @@ -29,7 +29,6 @@ go_library( "//owner", "//util/codec", "//util/engine", - "//util/gcutil", "//util/mathutil", "@com_github_gogo_protobuf//proto", "@com_github_golang_protobuf//proto", @@ -43,7 +42,6 @@ go_library( "@com_github_tikv_client_go_v2//kv", "@com_github_tikv_client_go_v2//oracle", "@com_github_tikv_client_go_v2//tikv", - "@com_github_tikv_client_go_v2//tikvrpc", "@com_github_tikv_client_go_v2//txnkv/rangetask", "@com_github_tikv_client_go_v2//txnkv/txnlock", "@com_github_tikv_pd_client//:client", @@ -70,7 +68,7 @@ go_test( ], flaky = True, race = "on", - shard_count = 18, + shard_count = 19, deps = [ ":streamhelper", "//br/pkg/errors", @@ -87,11 +85,16 @@ go_test( "@com_github_pingcap_failpoint//:failpoint", "@com_github_pingcap_kvproto//pkg/brpb", "@com_github_pingcap_kvproto//pkg/errorpb", + "@com_github_pingcap_kvproto//pkg/kvrpcpb", "@com_github_pingcap_kvproto//pkg/logbackuppb", "@com_github_pingcap_kvproto//pkg/metapb", "@com_github_pingcap_log//:log", "@com_github_stretchr_testify//require", "@com_github_tikv_client_go_v2//kv", + "@com_github_tikv_client_go_v2//tikv", + "@com_github_tikv_client_go_v2//tikvrpc", + "@com_github_tikv_client_go_v2//txnkv/txnlock", + "@com_github_tikv_pd_client//:client", "@io_etcd_go_etcd_client_v3//:client", "@io_etcd_go_etcd_server_v3//embed", "@io_etcd_go_etcd_server_v3//mvcc", diff --git a/store/gcworker/BUILD.bazel b/store/gcworker/BUILD.bazel index 8fbbb81c7ce2f..98a5caffa3453 100644 --- a/store/gcworker/BUILD.bazel +++ b/store/gcworker/BUILD.bazel @@ -21,7 +21,6 @@ go_library( "//tablecodec", "//util/codec", "//util/dbterror", - "//util/gcutil", "//util/logutil", "@com_github_pingcap_errors//:errors", "@com_github_pingcap_failpoint//:failpoint", @@ -63,7 +62,6 @@ go_test( "//store/mockstore", "//testkit/testmain", "//testkit/testsetup", - "//util/gcutil", "@com_github_pingcap_errors//:errors", "@com_github_pingcap_failpoint//:failpoint", "@com_github_pingcap_kvproto//pkg/errorpb", diff --git a/util/gcutil/BUILD.bazel b/util/gcutil/BUILD.bazel index e8d1cba0722a4..fc6c882078726 100644 --- a/util/gcutil/BUILD.bazel +++ b/util/gcutil/BUILD.bazel @@ -7,22 +7,12 @@ go_library( visibility = ["//visibility:public"], deps = [ "//kv", - "//metrics", "//parser/model", "//sessionctx", "//sessionctx/variable", - "//util/logutil", "//util/sqlexec", "@com_github_pingcap_errors//:errors", - "@com_github_pingcap_failpoint//:failpoint", - "@com_github_pingcap_kvproto//pkg/kvrpcpb", - "@com_github_tikv_client_go_v2//error", "@com_github_tikv_client_go_v2//oracle", - "@com_github_tikv_client_go_v2//tikv", - "@com_github_tikv_client_go_v2//tikvrpc", - "@com_github_tikv_client_go_v2//txnkv/rangetask", - "@com_github_tikv_client_go_v2//txnkv/txnlock", "@com_github_tikv_client_go_v2//util", - "@org_uber_go_zap//:zap", ], ) From 3a0db5898e2c5b9e92a388f8ec039788fdf625c3 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 29 Aug 2023 13:49:08 +0800 Subject: [PATCH 24/45] update go mod --- go.mod | 3 +-- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index 18ae0698c1fa9..10bb1b571c7b5 100644 --- a/go.mod +++ b/go.mod @@ -98,7 +98,7 @@ require ( github.com/stretchr/testify v1.8.4 github.com/tdakkota/asciicheck v0.2.0 github.com/tiancaiamao/appdash v0.0.0-20181126055449-889f96f722a2 - github.com/tikv/client-go/v2 v2.0.8-0.20230811033710-8a214402da13 + github.com/tikv/client-go/v2 v2.0.8-0.20230829054323-a8860a98018e github.com/tikv/pd/client v0.0.0-20230728033905-31343e006842 github.com/timakin/bodyclose v0.0.0-20230421092635-574207250966 github.com/twmb/murmur3 v1.1.6 @@ -304,5 +304,4 @@ replace ( github.com/dgrijalva/jwt-go => github.com/form3tech-oss/jwt-go v3.2.6-0.20210809144907-32ab6a8243d7+incompatible github.com/go-ldap/ldap/v3 => github.com/YangKeao/ldap/v3 v3.4.5-0.20230421065457-369a3bab1117 github.com/pingcap/tidb/parser => ./parser - github.com/tikv/client-go/v2 => github.com/3pointer/client-go/v2 v2.0.1-0.20230825093018-1cd747c7575f ) diff --git a/go.sum b/go.sum index 96b8ef9602c62..6403924d14f18 100644 --- a/go.sum +++ b/go.sum @@ -49,8 +49,6 @@ cloud.google.com/go/storage v1.30.1 h1:uOdMxAs8HExqBlnLtnQyP0YkvbiDpdGShGKtx6U/o cloud.google.com/go/storage v1.30.1/go.mod h1:NfxhC0UJE1aXSx7CIIbCf7y9HKT7BiccwkR7+P7gN8E= contrib.go.opencensus.io/exporter/ocagent v0.4.12/go.mod h1:450APlNTSR6FrvC3CTRqYosuDstRB9un7SOx2k/9ckA= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= -github.com/3pointer/client-go/v2 v2.0.1-0.20230825093018-1cd747c7575f h1:3XzIm4xkVbquFvt8ju4so9fvbi41i6+75dre7FJhUT4= -github.com/3pointer/client-go/v2 v2.0.1-0.20230825093018-1cd747c7575f/go.mod h1:2cndkbSuokkQ6TvYdcUbSfd0IYXbQx9HfdPEc+r0ezg= github.com/AndreasBriese/bbloom v0.0.0-20190306092124-e2d15f34fcf9/go.mod h1:bOvUY6CB00SOBii9/FifXqc0awNKxLFCL/+pkDPuyl8= github.com/Azure/azure-sdk-for-go v23.2.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.6.0 h1:8kDqDngH+DmVBiCtIjCFTGa7MBnsIOkF9IccInFEbjk= @@ -1004,6 +1002,8 @@ github.com/tiancaiamao/appdash v0.0.0-20181126055449-889f96f722a2 h1:mbAskLJ0oJf github.com/tiancaiamao/appdash v0.0.0-20181126055449-889f96f722a2/go.mod h1:2PfKggNGDuadAa0LElHrByyrz4JPZ9fFx6Gs7nx7ZZU= github.com/tiancaiamao/gp v0.0.0-20221230034425-4025bc8a4d4a h1:J/YdBZ46WKpXsxsW93SG+q0F8KI+yFrcIDT4c/RNoc4= github.com/tiancaiamao/gp v0.0.0-20221230034425-4025bc8a4d4a/go.mod h1:h4xBhSNtOeEosLJ4P7JyKXX7Cabg7AVkWCK5gV2vOrM= +github.com/tikv/client-go/v2 v2.0.8-0.20230829054323-a8860a98018e h1:2/UEklnqFNj5Pc8gh6Z5+IR/TlBt6L1EASwug0jELoU= +github.com/tikv/client-go/v2 v2.0.8-0.20230829054323-a8860a98018e/go.mod h1:2cndkbSuokkQ6TvYdcUbSfd0IYXbQx9HfdPEc+r0ezg= github.com/tikv/pd/client v0.0.0-20230728033905-31343e006842 h1:TwjBJvRx/DJbgMt7Vk5cFO7tG1DZnxR+22S2VmaNGRw= github.com/tikv/pd/client v0.0.0-20230728033905-31343e006842/go.mod h1:VJwM+qMcQxvGgyu9C6wU7fhjLaALs+odsOvpUMmnhHo= github.com/timakin/bodyclose v0.0.0-20230421092635-574207250966 h1:quvGphlmUVU+nhpFa4gg4yJyTRJ13reZMDHrKwYw53M= From 9e6fcead110e3bb1151948d5cffa9f1ece027065 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 29 Aug 2023 14:02:21 +0800 Subject: [PATCH 25/45] update --- DEPS.bzl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/DEPS.bzl b/DEPS.bzl index 5384bf731e296..127d20dc47313 100644 --- a/DEPS.bzl +++ b/DEPS.bzl @@ -6924,13 +6924,13 @@ def go_deps(): name = "com_github_tikv_client_go_v2", build_file_proto_mode = "disable_global", importpath = "github.com/tikv/client-go/v2", - sha256 = "0ad9b5fbfd1807fccc0ec29245b499903d58c57e0e313880012537ff7f059974", - strip_prefix = "github.com/3pointer/client-go/v2@v2.0.1-0.20230825093018-1cd747c7575f", + sha256 = "544ea0d93fb8aed8f20f7b821e81100e7204ffcfd8465e35360ad55823f13106", + strip_prefix = "github.com/tikv/client-go/v2@v2.0.8-0.20230829054323-a8860a98018e", urls = [ - "http://bazel-cache.pingcap.net:8080/gomod/github.com/3pointer/client-go/v2/com_github_3pointer_client_go_v2-v2.0.1-0.20230825093018-1cd747c7575f.zip", - "http://ats.apps.svc/gomod/github.com/3pointer/client-go/v2/com_github_3pointer_client_go_v2-v2.0.1-0.20230825093018-1cd747c7575f.zip", - "https://cache.hawkingrei.com/gomod/github.com/3pointer/client-go/v2/com_github_3pointer_client_go_v2-v2.0.1-0.20230825093018-1cd747c7575f.zip", - "https://storage.googleapis.com/pingcapmirror/gomod/github.com/3pointer/client-go/v2/com_github_3pointer_client_go_v2-v2.0.1-0.20230825093018-1cd747c7575f.zip", + "http://bazel-cache.pingcap.net:8080/gomod/github.com/tikv/client-go/v2/com_github_tikv_client_go_v2-v2.0.8-0.20230829054323-a8860a98018e.zip", + "http://ats.apps.svc/gomod/github.com/tikv/client-go/v2/com_github_tikv_client_go_v2-v2.0.8-0.20230829054323-a8860a98018e.zip", + "https://cache.hawkingrei.com/gomod/github.com/tikv/client-go/v2/com_github_tikv_client_go_v2-v2.0.8-0.20230829054323-a8860a98018e.zip", + "https://storage.googleapis.com/pingcapmirror/gomod/github.com/tikv/client-go/v2/com_github_tikv_client_go_v2-v2.0.8-0.20230829054323-a8860a98018e.zip", ], ) go_repository( From 0e4c9d3816a30d90501023cab536c89281c9316d Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 29 Aug 2023 15:34:13 +0800 Subject: [PATCH 26/45] fix bazel build --- br/pkg/streamhelper/advancer_daemon.go | 5 +++-- br/pkg/streamhelper/advancer_env.go | 3 ++- br/pkg/streamhelper/basic_lib_for_test.go | 13 ++++++++----- store/gcworker/gc_worker_test.go | 11 +++++------ 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/br/pkg/streamhelper/advancer_daemon.go b/br/pkg/streamhelper/advancer_daemon.go index 4ad442c234628..59ef6c0caa2e1 100644 --- a/br/pkg/streamhelper/advancer_daemon.go +++ b/br/pkg/streamhelper/advancer_daemon.go @@ -32,7 +32,7 @@ const ( func resolveLockTickTime() time.Duration { failpoint.Inject("ResolveLockTickTime", func(val failpoint.Value) { t := time.Duration(val.(int)) - failpoint.Return(t * time.Second) + failpoint.Return(t) }) return 5 * time.Second } @@ -80,7 +80,8 @@ func (c *CheckpointAdvancer) OnBecomeOwner(ctx context.Context) { }) handler := func(ctx context.Context, r tikvstore.KeyRange) (rangetask.TaskStat, error) { // we will scan all locks and try to resolve them by check txn status. - return tikv.ResolveLocksForRange(ctx, c.env, math.MaxUint64, r.StartKey, r.EndKey, tikv.NewGcResolveLockMaxBackoffer, tikv.GCScanLockLimit) + return tikv.ResolveLocksForRange( + ctx, c.env, math.MaxUint64, r.StartKey, r.EndKey, tikv.NewGcResolveLockMaxBackoffer, tikv.GCScanLockLimit) } workerPool := utils.NewWorkerPool(uint(config.DefaultMaxConcurrencyAdvance), "advancer resolve locks") var wg sync.WaitGroup diff --git a/br/pkg/streamhelper/advancer_env.go b/br/pkg/streamhelper/advancer_env.go index 4648085825f6f..6b739869fc47d 100644 --- a/br/pkg/streamhelper/advancer_env.go +++ b/br/pkg/streamhelper/advancer_env.go @@ -163,7 +163,8 @@ func newAdvancerLockResolver(store tikv.Storage) *AdvancerLockResolver { // ResolveLocksInOneRegion tries to resolve expired locks with this method. // It will check status of the txn. Resolve the lock if txn is expired, Or do nothing. -func (l *AdvancerLockResolver) ResolveLocksInOneRegion(bo *tikv.Backoffer, locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { +func (l *AdvancerLockResolver) ResolveLocksInOneRegion( + bo *tikv.Backoffer, locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { _, err := l.GetStore().GetLockResolver().ResolveLocks(bo, 0, locks) if err != nil { return nil, err diff --git a/br/pkg/streamhelper/basic_lib_for_test.go b/br/pkg/streamhelper/basic_lib_for_test.go index aceeac07c16dd..b2171f527a1b0 100644 --- a/br/pkg/streamhelper/basic_lib_for_test.go +++ b/br/pkg/streamhelper/basic_lib_for_test.go @@ -645,16 +645,19 @@ func (t *testEnv) unregisterTask() { } } -func (t *testEnv) ScanLocks(ctx context.Context, key []byte, maxVersion uint64) ([]*txnlock.Lock, *tikv.KeyLocation, error) { +func (t *testEnv) ScanLocksInOneRegion(bo *tikv.Backoffer, key []byte, maxVersion uint64, limit uint32) ([]*txnlock.Lock, *tikv.KeyLocation, error) { locks, loc := t.scanLocks(key) return locks, loc, nil - } -func (t *testEnv) ResolveLocks(ctx context.Context, locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { +func (t *testEnv) ResolveLocksInOneRegion(bo *tikv.Backoffer, locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { return t.resolveLocks(locks, loc) } +func (t *testEnv) Identifier() string { + return "advance test" +} + func (t *testEnv) GetStore() tikv.Storage { // only used for GetRegionCache once in resolve lock return &mockTiKVStore{regionCache: tikv.NewRegionCache(&mockPDClient{})} @@ -694,14 +697,14 @@ func (p *mockPDClient) ScanRegions(ctx context.Context, key, endKey []byte, limi }, nil } -func (c *mockPDClient) GetStore(_ context.Context, storeID uint64) (*metapb.Store, error) { +func (p *mockPDClient) GetStore(_ context.Context, storeID uint64) (*metapb.Store, error) { return &metapb.Store{ Id: storeID, Address: fmt.Sprintf("127.0.0.%d", storeID), }, nil } -func (c *mockPDClient) GetRegion(ctx context.Context, key []byte, opts ...pd.GetRegionOption) (*pd.Region, error) { +func (p *mockPDClient) GetRegion(ctx context.Context, key []byte, opts ...pd.GetRegionOption) (*pd.Region, error) { return newMockRegion(1, []byte("1"), []byte("3")), nil } diff --git a/store/gcworker/gc_worker_test.go b/store/gcworker/gc_worker_test.go index 103dffc8728b4..d581b1e8558ba 100644 --- a/store/gcworker/gc_worker_test.go +++ b/store/gcworker/gc_worker_test.go @@ -55,13 +55,12 @@ type mockGCWorkerLockResolver struct { batchResolveLocks func([]*txnlock.Lock, *tikv.KeyLocation) (*tikv.KeyLocation, error) } -func (l *mockGCWorkerLockResolver) ScanLocksInOneRegion(ctx context.Context, key []byte, maxVersion uint64) ([]*txnlock.Lock, *tikv.KeyLocation, error) { +func (l *mockGCWorkerLockResolver) ScanLocksInOneRegion(bo *tikv.Backoffer, key []byte, maxVersion uint64, limit uint32) ([]*txnlock.Lock, *tikv.KeyLocation, error) { locks, loc := l.scanLocks(key) return locks, loc, nil - } -func (l *mockGCWorkerLockResolver) ResolveLocksInOneRegion(ctx context.Context, locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { +func (l *mockGCWorkerLockResolver) ResolveLocksInOneRegion(bo *tikv.Backoffer, locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { return l.batchResolveLocks(locks, loc) } @@ -1039,7 +1038,7 @@ func TestResolveLockRangeInfine(t *testing.T) { require.NoError(t, failpoint.Disable("tikvclient/invalidCacheAndRetry")) }() - _, err := tikv.ResolveLocksForRange(gcContext(), s.gcWorker.uuid, s.gcWorker.lockResolver, 1, []byte{0}, []byte{1}) + _, err := tikv.ResolveLocksForRange(gcContext(), s.gcWorker.lockResolver, 1, []byte{0}, []byte{1}, tikv.NewNoopBackoff, 10) require.Error(t, err) } @@ -1103,7 +1102,7 @@ func TestResolveLockRangeMeetRegionCacheMiss(t *testing.T) { return loc, nil }, } - _, err := tikv.ResolveLocksForRange(gcContext(), s.gcWorker.uuid, mockLockResolver, safepointTS, []byte{0}, []byte{10}) + _, err := tikv.ResolveLocksForRange(gcContext(), mockLockResolver, safepointTS, []byte{0}, []byte{10}, tikv.NewNoopBackoff, 10) require.NoError(t, err) require.Equal(t, 2, resolveCnt) require.Equal(t, 2, scanCnt) @@ -1189,7 +1188,7 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { } return loc, nil } - _, err := tikv.ResolveLocksForRange(gcContext(), s.gcWorker.uuid, mockGCLockResolver, 1, []byte(""), []byte("z")) + _, err := tikv.ResolveLocksForRange(gcContext(), mockGCLockResolver, 1, []byte(""), []byte("z"), tikv.NewNoopBackoff, 10) require.NoError(t, err) require.Len(t, resolvedLock, 4) expects := [][]byte{[]byte("a"), []byte("b"), []byte("o"), []byte("p")} From b7c2912364a531f64455f76322fbfac5577f1464 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 29 Aug 2023 16:51:05 +0800 Subject: [PATCH 27/45] fix race test --- br/pkg/streamhelper/advancer.go | 22 +++++++++++++--------- br/pkg/streamhelper/advancer_daemon.go | 4 ++-- br/pkg/streamhelper/advancer_test.go | 8 ++++---- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/br/pkg/streamhelper/advancer.go b/br/pkg/streamhelper/advancer.go index 7c9bd34c75fa0..fe18178779eda 100644 --- a/br/pkg/streamhelper/advancer.go +++ b/br/pkg/streamhelper/advancer.go @@ -62,7 +62,7 @@ type CheckpointAdvancer struct { // the cached last checkpoint. // if no progress, this cache can help us don't to send useless requests. - lastCheckpoint checkpoint + lastCheckpoint *checkpoint checkpoints *spans.ValueSortedFull checkpointsMu sync.Mutex @@ -83,15 +83,15 @@ type checkpoint struct { resolveLockTime time.Time } -func newCheckpointWithTS(ts uint64) checkpoint { - return checkpoint{ +func newCheckpointWithTS(ts uint64) *checkpoint { + return &checkpoint{ TS: ts, resolveLockTime: time.Now(), } } -func NewCheckpointWithSpan(s spans.Valued) checkpoint { - return checkpoint{ +func NewCheckpointWithSpan(s spans.Valued) *checkpoint { + return &checkpoint{ StartKey: s.Key.StartKey, EndKey: s.Key.EndKey, TS: s.Value, @@ -99,11 +99,15 @@ func NewCheckpointWithSpan(s spans.Valued) checkpoint { } } -func (c checkpoint) safeTS() uint64 { +func (c *checkpoint) UpdateResolveLockTime(t time.Time) { + c.resolveLockTime = t +} + +func (c *checkpoint) safeTS() uint64 { return c.TS - 1 } -func (c checkpoint) equal(o checkpoint) bool { +func (c *checkpoint) equal(o *checkpoint) bool { return bytes.Equal(c.StartKey, o.StartKey) && bytes.Equal(c.EndKey, o.EndKey) && c.TS == o.TS } @@ -111,7 +115,7 @@ func (c checkpoint) equal(o checkpoint) bool { // if a checkpoint stay in a time too long(3 min) // we should try to resolve lock for the range // to keep the RPO in 5 min. -func (c checkpoint) needResolveLocks() bool { +func (c *checkpoint) needResolveLocks() bool { failpoint.Inject("NeedResolveLocks", func(val failpoint.Value) { failpoint.Return(val.(bool)) }) @@ -142,7 +146,7 @@ func (c *CheckpointAdvancer) UpdateConfigWith(f func(*config.Config)) { } // only used for test -func (c *CheckpointAdvancer) UpdateLastCheckpoint(p checkpoint) { +func (c *CheckpointAdvancer) UpdateLastCheckpoint(p *checkpoint) { c.lastCheckpoint = p } diff --git a/br/pkg/streamhelper/advancer_daemon.go b/br/pkg/streamhelper/advancer_daemon.go index 59ef6c0caa2e1..8f1a73c32bfed 100644 --- a/br/pkg/streamhelper/advancer_daemon.go +++ b/br/pkg/streamhelper/advancer_daemon.go @@ -72,7 +72,7 @@ func (c *CheckpointAdvancer) OnBecomeOwner(ctx context.Context) { var targets []spans.Valued c.WithCheckpoints(func(vsf *spans.ValueSortedFull) { // when get locks here. assume these locks are not belong to same txn, - // but startTS close to 1 minute. try resolve these locks at one time + // but these locks' start ts are close to 1 minute. try resolve these locks at one time vsf.TraverseValuesLessThan(tsoAfter(c.lastCheckpoint.TS, time.Minute), func(v spans.Valued) bool { targets = append(targets, v) return true @@ -111,7 +111,7 @@ func (c *CheckpointAdvancer) OnBecomeOwner(ctx context.Context) { logutil.Key("StartKey", c.lastCheckpoint.StartKey), logutil.Key("EndKey", c.lastCheckpoint.EndKey), zap.Int("targets", len(targets))) - //c.lastCheckpoint.resolveLockTime = time.Now() + c.lastCheckpoint.UpdateResolveLockTime(time.Now()) } } } diff --git a/br/pkg/streamhelper/advancer_test.go b/br/pkg/streamhelper/advancer_test.go index 8e0d0573806bc..b115c892a201c 100644 --- a/br/pkg/streamhelper/advancer_test.go +++ b/br/pkg/streamhelper/advancer_test.go @@ -20,6 +20,7 @@ import ( "github.com/stretchr/testify/require" "github.com/tikv/client-go/v2/tikv" "github.com/tikv/client-go/v2/txnkv/txnlock" + "go.uber.org/atomic" "go.uber.org/zap/zapcore" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" @@ -319,10 +320,9 @@ func TestResolveLock(t *testing.T) { } } // ensure resolve locks triggered and collect all locks from scan locks - resolveLockCnt := 0 - resolveLockCntRef := &resolveLockCnt + resolveLockRef := atomic.NewBool(false) env.resolveLocks = func(locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { - *resolveLockCntRef += 1 + resolveLockRef.Store(true) require.ElementsMatch(t, locks, allLocks) return loc, nil } @@ -347,7 +347,7 @@ func TestResolveLock(t *testing.T) { coll := streamhelper.NewClusterCollector(ctx, env) err := adv.GetCheckpointInRange(ctx, []byte{}, []byte{}, coll) - require.Eventually(t, func() bool { return *resolveLockCntRef > 0 }, + require.Eventually(t, func() bool { return resolveLockRef.Load() }, 8*time.Second, 50*time.Microsecond) require.NoError(t, err) From 38c95e4a98be91392fb4ca4e7cffed81a70b9cbb Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 29 Aug 2023 17:00:18 +0800 Subject: [PATCH 28/45] fix bazel --- br/pkg/streamhelper/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/br/pkg/streamhelper/BUILD.bazel b/br/pkg/streamhelper/BUILD.bazel index 76ba1c9210018..cb8442cb618dc 100644 --- a/br/pkg/streamhelper/BUILD.bazel +++ b/br/pkg/streamhelper/BUILD.bazel @@ -102,6 +102,7 @@ go_test( "@org_golang_google_grpc//codes", "@org_golang_google_grpc//metadata", "@org_golang_google_grpc//status", + "@org_uber_go_atomic//:atomic", "@org_uber_go_zap//:zap", "@org_uber_go_zap//zapcore", ], From 012bdecbafadbe6155837dd50fe9ad4cfcc65654 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 29 Aug 2023 18:08:50 +0800 Subject: [PATCH 29/45] fix unit test --- store/gcworker/gc_worker_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/store/gcworker/gc_worker_test.go b/store/gcworker/gc_worker_test.go index d581b1e8558ba..3c04b23a6e24a 100644 --- a/store/gcworker/gc_worker_test.go +++ b/store/gcworker/gc_worker_test.go @@ -2059,9 +2059,9 @@ func TestSkipGCAndOnlyResolveLock(t *testing.T) { err = s.gcWorker.leaderTick(ctx) require.NoError(t, err) - // check the lock has been resolved. + // check the lock has not been resolved. err = txn.Commit(ctx) - require.Error(t, err) + require.NoError(t, err) // check gc is skipped last, err := s.gcWorker.loadTime(gcLastRunTimeKey) From d32ed0e7b6ceb39aa62c88b3e59196c20c46fb8a Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 29 Aug 2023 18:29:06 +0800 Subject: [PATCH 30/45] fix unit test --- store/gcworker/gc_worker_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/store/gcworker/gc_worker_test.go b/store/gcworker/gc_worker_test.go index 3c04b23a6e24a..4e9a295edcff1 100644 --- a/store/gcworker/gc_worker_test.go +++ b/store/gcworker/gc_worker_test.go @@ -2008,7 +2008,7 @@ func TestGCWithPendingTxn2(t *testing.T) { require.NoError(t, err) err = txn.Commit(ctx) - require.Error(t, err) + require.NoError(t, err) err = txn2.Commit(ctx) require.NoError(t, err) } From 0d69d7bba969c50b1ec45d2427701a4f37a58cc0 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 30 Aug 2023 11:11:24 +0800 Subject: [PATCH 31/45] fix unit test --- store/gcworker/gc_worker_test.go | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/store/gcworker/gc_worker_test.go b/store/gcworker/gc_worker_test.go index 4e9a295edcff1..2a79b4c2dfd86 100644 --- a/store/gcworker/gc_worker_test.go +++ b/store/gcworker/gc_worker_test.go @@ -1038,7 +1038,20 @@ func TestResolveLockRangeInfine(t *testing.T) { require.NoError(t, failpoint.Disable("tikvclient/invalidCacheAndRetry")) }() - _, err := tikv.ResolveLocksForRange(gcContext(), s.gcWorker.lockResolver, 1, []byte{0}, []byte{1}, tikv.NewNoopBackoff, 10) + mockLockResolver := &mockGCWorkerLockResolver{ + tikvStore: s.tikvStore, + scanLocks: func(key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { + return []*txnlock.Lock{}, &tikv.KeyLocation{} + }, + batchResolveLocks: func( + locks []*txnlock.Lock, + loc *tikv.KeyLocation, + ) (*tikv.KeyLocation, error) { + // mock error to test backoff + return nil, errors.New("mock error") + }, + } + _, err := tikv.ResolveLocksForRange(gcContext(), mockLockResolver, 1, []byte{0}, []byte{1}, tikv.NewNoopBackoff, 10) require.Error(t, err) } From 22ddfc5a63928b4236a8e13c10b8bf1f7ef1fca6 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 30 Aug 2023 13:21:47 +0800 Subject: [PATCH 32/45] udpate --- store/gcworker/gc_worker.go | 19 +++++++++++-------- store/gcworker/gc_worker_test.go | 30 ++++++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/store/gcworker/gc_worker.go b/store/gcworker/gc_worker.go index 0b50f774785c5..723ff5b8f9f58 100644 --- a/store/gcworker/gc_worker.go +++ b/store/gcworker/gc_worker.go @@ -2033,11 +2033,13 @@ func getGCRules(ids []int64, rules map[string]*label.Rule) []string { } // RunGCJob sends GC command to KV. It is exported for kv api, do not use it with GCWorker at the same time. -func RunGCJob(ctx context.Context, s tikv.Storage, pd pd.Client, safePoint uint64, identifier string, concurrency int) error { +// only use for test +func RunGCJob(ctx context.Context, lockResolver tikv.RegionLockResolver, s tikv.Storage, pd pd.Client, safePoint uint64, identifier string, concurrency int) error { gcWorker := &GCWorker{ - tikvStore: s, - uuid: identifier, - pdClient: pd, + tikvStore: s, + uuid: identifier, + pdClient: pd, + lockResolver: lockResolver, } if concurrency <= 0 { @@ -2070,11 +2072,12 @@ func RunGCJob(ctx context.Context, s tikv.Storage, pd pd.Client, safePoint uint6 // RunDistributedGCJob notifies TiKVs to do GC. It is exported for kv api, do not use it with GCWorker at the same time. // This function may not finish immediately because it may take some time to do resolveLocks. // Param concurrency specifies the concurrency of resolveLocks phase. -func RunDistributedGCJob(ctx context.Context, s tikv.Storage, pd pd.Client, safePoint uint64, identifier string, concurrency int) error { +func RunDistributedGCJob(ctx context.Context, lockResolver tikv.RegionLockResolver, s tikv.Storage, pd pd.Client, safePoint uint64, identifier string, concurrency int) error { gcWorker := &GCWorker{ - tikvStore: s, - uuid: identifier, - pdClient: pd, + tikvStore: s, + uuid: identifier, + pdClient: pd, + lockResolver: lockResolver, } safePoint, err := gcWorker.setGCWorkerServiceSafePoint(ctx, safePoint) diff --git a/store/gcworker/gc_worker_test.go b/store/gcworker/gc_worker_test.go index 2a79b4c2dfd86..749f6759e40c8 100644 --- a/store/gcworker/gc_worker_test.go +++ b/store/gcworker/gc_worker_test.go @@ -1295,12 +1295,25 @@ func TestSetServiceSafePoint(t *testing.T) { func TestRunGCJobAPI(t *testing.T) { s := createGCWorkerSuite(t) + mockLockResolver := &mockGCWorkerLockResolver{ + tikvStore: s.tikvStore, + scanLocks: func(key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { + return []*txnlock.Lock{}, &tikv.KeyLocation{} + }, + batchResolveLocks: func( + locks []*txnlock.Lock, + loc *tikv.KeyLocation, + ) (*tikv.KeyLocation, error) { + // no locks + return loc, nil + }, + } gcSafePointCacheInterval = 0 p := s.createGCProbe(t, "k1") safePoint := s.mustAllocTs(t) - err := RunGCJob(gcContext(), s.tikvStore, s.pdClient, safePoint, "mock", 1) + err := RunGCJob(gcContext(), mockLockResolver, s.tikvStore, s.pdClient, safePoint, "mock", 1) require.NoError(t, err) s.checkCollected(t, p) etcdSafePoint := s.loadEtcdSafePoint(t) @@ -1312,9 +1325,22 @@ func TestRunDistGCJobAPI(t *testing.T) { s := createGCWorkerSuite(t) gcSafePointCacheInterval = 0 + mockLockResolver := &mockGCWorkerLockResolver{ + tikvStore: s.tikvStore, + scanLocks: func(key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { + return []*txnlock.Lock{}, &tikv.KeyLocation{} + }, + batchResolveLocks: func( + locks []*txnlock.Lock, + loc *tikv.KeyLocation, + ) (*tikv.KeyLocation, error) { + // no locks + return loc, nil + }, + } safePoint := s.mustAllocTs(t) - err := RunDistributedGCJob(gcContext(), s.tikvStore, s.pdClient, safePoint, "mock", 1) + err := RunDistributedGCJob(gcContext(), mockLockResolver, s.tikvStore, s.pdClient, safePoint, "mock", 1) require.NoError(t, err) pdSafePoint := s.mustGetSafePointFromPd(t) require.Equal(t, safePoint, pdSafePoint) From 88e8092dcb108f3f759fac9e5a3958bba310bc65 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 30 Aug 2023 17:20:27 +0800 Subject: [PATCH 33/45] fix test --- br/pkg/streamhelper/advancer_daemon.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/br/pkg/streamhelper/advancer_daemon.go b/br/pkg/streamhelper/advancer_daemon.go index 8f1a73c32bfed..d85031dbeaa57 100644 --- a/br/pkg/streamhelper/advancer_daemon.go +++ b/br/pkg/streamhelper/advancer_daemon.go @@ -68,7 +68,7 @@ func (c *CheckpointAdvancer) OnBecomeOwner(ctx context.Context) { { // lastCheckpoint is not increased too long enough. // assume the cluster has expired locks for whatever reasons. - if c.lastCheckpoint.needResolveLocks() { + if c.lastCheckpoint != nil && c.lastCheckpoint.needResolveLocks() { var targets []spans.Valued c.WithCheckpoints(func(vsf *spans.ValueSortedFull) { // when get locks here. assume these locks are not belong to same txn, From 75fb8fd022cee81b7e312ee58a3e031ef89f4102 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Fri, 1 Sep 2023 11:38:01 +0800 Subject: [PATCH 34/45] fix gc job test --- store/gcworker/gc_worker.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/store/gcworker/gc_worker.go b/store/gcworker/gc_worker.go index 723ff5b8f9f58..a37debe2c6d2b 100644 --- a/store/gcworker/gc_worker.go +++ b/store/gcworker/gc_worker.go @@ -2109,9 +2109,10 @@ func RunDistributedGCJob(ctx context.Context, lockResolver tikv.RegionLockResolv // It is exported only for test, do not use it in the production environment. func RunResolveLocks(ctx context.Context, s tikv.Storage, pd pd.Client, safePoint uint64, identifier string, concurrency int, usePhysical bool) (bool, error) { gcWorker := &GCWorker{ - tikvStore: s, - uuid: identifier, - pdClient: pd, + tikvStore: s, + uuid: identifier, + pdClient: pd, + lockResolver: tikv.NewRegionLockResolver("test-resolver", s), } return gcWorker.resolveLocks(ctx, safePoint, concurrency, usePhysical) } From 8d94a94d8cb684bd450821a5eb4a96eaaa25a25d Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 6 Sep 2023 13:57:50 +0800 Subject: [PATCH 35/45] address comments --- br/pkg/streamhelper/advancer.go | 86 ++++++++++++++++++++++---- br/pkg/streamhelper/advancer_daemon.go | 72 --------------------- br/pkg/streamhelper/advancer_env.go | 2 +- 3 files changed, 75 insertions(+), 85 deletions(-) diff --git a/br/pkg/streamhelper/advancer.go b/br/pkg/streamhelper/advancer.go index fe18178779eda..0d28830dacb40 100644 --- a/br/pkg/streamhelper/advancer.go +++ b/br/pkg/streamhelper/advancer.go @@ -5,6 +5,7 @@ package streamhelper import ( "bytes" "context" + "math" "strings" "sync" "time" @@ -19,7 +20,10 @@ import ( "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/kv" "github.com/pingcap/tidb/metrics" + tikvstore "github.com/tikv/client-go/v2/kv" "github.com/tikv/client-go/v2/oracle" + "github.com/tikv/client-go/v2/tikv" + "github.com/tikv/client-go/v2/txnkv/rangetask" "go.uber.org/multierr" "go.uber.org/zap" "golang.org/x/sync/errgroup" @@ -62,7 +66,8 @@ type CheckpointAdvancer struct { // the cached last checkpoint. // if no progress, this cache can help us don't to send useless requests. - lastCheckpoint *checkpoint + lastCheckpoint *checkpoint + lastCheckpointMu sync.Mutex checkpoints *spans.ValueSortedFull checkpointsMu sync.Mutex @@ -74,6 +79,7 @@ type CheckpointAdvancer struct { // checkpoint represents the TS with specific range. // it's only used in advancer.go. type checkpoint struct { + sync.Mutex StartKey []byte EndKey []byte TS uint64 @@ -99,10 +105,6 @@ func NewCheckpointWithSpan(s spans.Valued) *checkpoint { } } -func (c *checkpoint) UpdateResolveLockTime(t time.Time) { - c.resolveLockTime = t -} - func (c *checkpoint) safeTS() uint64 { return c.TS - 1 } @@ -145,9 +147,11 @@ func (c *CheckpointAdvancer) UpdateConfigWith(f func(*config.Config)) { c.UpdateConfig(cfg) } -// only used for test +// UpdateLastCheckpoint modify the checkpoint in ticking. func (c *CheckpointAdvancer) UpdateLastCheckpoint(p *checkpoint) { + c.lastCheckpointMu.Lock() c.lastCheckpoint = p + c.lastCheckpointMu.Unlock() } // Config returns the current config. @@ -402,7 +406,7 @@ func (c *CheckpointAdvancer) setCheckpoint(ctx context.Context, s spans.Valued) if cp.equal(c.lastCheckpoint) { return false } - c.lastCheckpoint = cp + c.UpdateLastCheckpoint(cp) metrics.LastCheckpoint.WithLabelValues(c.task.GetName()).Set(float64(c.lastCheckpoint.TS)) return true } @@ -496,6 +500,25 @@ func (c *CheckpointAdvancer) importantTick(ctx context.Context) error { } func (c *CheckpointAdvancer) optionalTick(cx context.Context) error { + // lastCheckpoint is not increased too long enough. + // assume the cluster has expired locks for whatever reasons. + var targets []spans.Valued + if c.lastCheckpoint != nil && c.lastCheckpoint.needResolveLocks() { + c.WithCheckpoints(func(vsf *spans.ValueSortedFull) { + // when get locks here. assume these locks are not belong to same txn, + // but these locks' start ts are close to 1 minute. try resolve these locks at one time + vsf.TraverseValuesLessThan(tsoAfter(c.lastCheckpoint.TS, time.Minute), func(v spans.Valued) bool { + targets = append(targets, v) + return true + }) + }) + if len(targets) != 0 { + log.Info("Advancer starts to resolve locks", zap.Int("targets", len(targets))) + // use new context here to avoid timeout + ctx := context.Background() + c.asyncResolveLocksForRanges(ctx, targets) + } + } threshold := c.Config().GetDefaultStartPollThreshold() if err := c.subscribeTick(cx); err != nil { log.Warn("Subscriber meet error, would polling the checkpoint.", zap.String("category", "log backup advancer"), @@ -503,13 +526,9 @@ func (c *CheckpointAdvancer) optionalTick(cx context.Context) error { threshold = c.Config().GetSubscriberErrorStartPollThreshold() } - err := c.advanceCheckpointBy(cx, func(cx context.Context) (spans.Valued, error) { + return c.advanceCheckpointBy(cx, func(cx context.Context) (spans.Valued, error) { return c.CalculateGlobalCheckpointLight(cx, threshold) }) - if err != nil { - return err - } - return nil } func (c *CheckpointAdvancer) tick(ctx context.Context) error { @@ -538,3 +557,46 @@ func (c *CheckpointAdvancer) tick(ctx context.Context) error { return errs } + +func (c *CheckpointAdvancer) asyncResolveLocksForRanges(ctx context.Context, targets []spans.Valued) { + // run in another goroutine + // do not block main tick here + go func() { + handler := func(ctx context.Context, r tikvstore.KeyRange) (rangetask.TaskStat, error) { + // we will scan all locks and try to resolve them by check txn status. + return tikv.ResolveLocksForRange( + ctx, c.env, math.MaxUint64, r.StartKey, r.EndKey, tikv.NewGcResolveLockMaxBackoffer, tikv.GCScanLockLimit) + } + workerPool := utils.NewWorkerPool(uint(config.DefaultMaxConcurrencyAdvance), "advancer resolve locks") + var wg sync.WaitGroup + for _, r := range targets { + targetRange := r + wg.Add(1) + workerPool.Apply(func() { + defer wg.Done() + // Run resolve lock on the whole TiKV cluster. + // it will use startKey/endKey to scan region in PD. + // but regionCache already has a codecPDClient. so just use decode key here. + // and it almost only include one region here. so set concurrency to 1. + runner := rangetask.NewRangeTaskRunner("advancer-resolve-locks-runner", + c.env.GetStore(), 1, handler) + err := runner.RunOnRange(ctx, targetRange.Key.StartKey, targetRange.Key.EndKey) + if err != nil { + // wait for next tick + log.Warn("resolve locks failed, wait for next tick", zap.String("category", "advancer"), + zap.String("uuid", "log backup advancer"), + zap.Error(err)) + } + }) + } + wg.Wait() + log.Info("finish resolve locks for checkpoint", zap.String("category", "advancer"), + zap.String("uuid", "log backup advancer"), + logutil.Key("StartKey", c.lastCheckpoint.StartKey), + logutil.Key("EndKey", c.lastCheckpoint.EndKey), + zap.Int("targets", len(targets))) + c.lastCheckpointMu.Lock() + c.lastCheckpoint.resolveLockTime = time.Now() + c.lastCheckpointMu.Unlock() + }() +} diff --git a/br/pkg/streamhelper/advancer_daemon.go b/br/pkg/streamhelper/advancer_daemon.go index d85031dbeaa57..fa2769d789c9b 100644 --- a/br/pkg/streamhelper/advancer_daemon.go +++ b/br/pkg/streamhelper/advancer_daemon.go @@ -4,24 +4,14 @@ package streamhelper import ( "context" - "math" - "sync" "time" "github.com/google/uuid" "github.com/pingcap/failpoint" - "github.com/pingcap/log" - "github.com/pingcap/tidb/br/pkg/logutil" - "github.com/pingcap/tidb/br/pkg/streamhelper/config" - "github.com/pingcap/tidb/br/pkg/streamhelper/spans" "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/metrics" "github.com/pingcap/tidb/owner" - tikvstore "github.com/tikv/client-go/v2/kv" - "github.com/tikv/client-go/v2/tikv" - "github.com/tikv/client-go/v2/txnkv/rangetask" clientv3 "go.etcd.io/etcd/client/v3" - "go.uber.org/zap" ) const ( @@ -55,68 +45,6 @@ func (c *CheckpointAdvancer) OnStart(ctx context.Context) { func (c *CheckpointAdvancer) OnBecomeOwner(ctx context.Context) { metrics.AdvancerOwner.Set(1.0) c.spawnSubscriptionHandler(ctx) - go func() { - t := resolveLockTickTime() - tick := time.NewTicker(t) - defer tick.Stop() - for { - select { - case <-ctx.Done(): - // no longger be an owner. return it. - return - case <-tick.C: - { - // lastCheckpoint is not increased too long enough. - // assume the cluster has expired locks for whatever reasons. - if c.lastCheckpoint != nil && c.lastCheckpoint.needResolveLocks() { - var targets []spans.Valued - c.WithCheckpoints(func(vsf *spans.ValueSortedFull) { - // when get locks here. assume these locks are not belong to same txn, - // but these locks' start ts are close to 1 minute. try resolve these locks at one time - vsf.TraverseValuesLessThan(tsoAfter(c.lastCheckpoint.TS, time.Minute), func(v spans.Valued) bool { - targets = append(targets, v) - return true - }) - }) - handler := func(ctx context.Context, r tikvstore.KeyRange) (rangetask.TaskStat, error) { - // we will scan all locks and try to resolve them by check txn status. - return tikv.ResolveLocksForRange( - ctx, c.env, math.MaxUint64, r.StartKey, r.EndKey, tikv.NewGcResolveLockMaxBackoffer, tikv.GCScanLockLimit) - } - workerPool := utils.NewWorkerPool(uint(config.DefaultMaxConcurrencyAdvance), "advancer resolve locks") - var wg sync.WaitGroup - for _, r := range targets { - targetRange := r - wg.Add(1) - workerPool.Apply(func() { - defer wg.Done() - // Run resolve lock on the whole TiKV cluster. - // it will use startKey/endKey to scan region in PD. - // but regionCache already has a codecPDClient. so just use decode key here. - // and it almost only include one region here. so set concurrency to 1. - runner := rangetask.NewRangeTaskRunner("advancer-resolve-locks-runner", - c.env.GetStore(), 1, handler) - err := runner.RunOnRange(ctx, targetRange.Key.StartKey, targetRange.Key.EndKey) - if err != nil { - // wait for next tick - log.Error("resolve locks failed", zap.String("category", "advancer"), - zap.String("uuid", "log backup advancer"), - zap.Error(err)) - } - }) - } - wg.Wait() - log.Info("finish resolve locks for checkpoint", zap.String("category", "advancer"), - zap.String("uuid", "log backup advancer"), - logutil.Key("StartKey", c.lastCheckpoint.StartKey), - logutil.Key("EndKey", c.lastCheckpoint.EndKey), - zap.Int("targets", len(targets))) - c.lastCheckpoint.UpdateResolveLockTime(time.Now()) - } - } - } - } - }() go func() { <-ctx.Done() c.onStop() diff --git a/br/pkg/streamhelper/advancer_env.go b/br/pkg/streamhelper/advancer_env.go index 6b739869fc47d..24faad706f8ee 100644 --- a/br/pkg/streamhelper/advancer_env.go +++ b/br/pkg/streamhelper/advancer_env.go @@ -174,5 +174,5 @@ func (l *AdvancerLockResolver) ResolveLocksInOneRegion( // If we don't implement GetStore here, it won't complie. func (l *AdvancerLockResolver) GetStore() tikv.Storage { - return l.store + return l.BaseRegionLockResolver.GetStore() } From 980229569585e8fe7e340554a0641328dff06806 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 6 Sep 2023 15:19:52 +0800 Subject: [PATCH 36/45] address comment --- br/pkg/streamhelper/advancer_daemon.go | 10 --------- br/pkg/streamhelper/advancer_test.go | 16 ++++++-------- br/pkg/streamhelper/basic_lib_for_test.go | 27 +++++++++++++++++++---- 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/br/pkg/streamhelper/advancer_daemon.go b/br/pkg/streamhelper/advancer_daemon.go index fa2769d789c9b..4e3b68eb3fbf5 100644 --- a/br/pkg/streamhelper/advancer_daemon.go +++ b/br/pkg/streamhelper/advancer_daemon.go @@ -4,10 +4,8 @@ package streamhelper import ( "context" - "time" "github.com/google/uuid" - "github.com/pingcap/failpoint" "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/metrics" "github.com/pingcap/tidb/owner" @@ -19,14 +17,6 @@ const ( ownerPath = "/tidb/br-stream/owner" ) -func resolveLockTickTime() time.Duration { - failpoint.Inject("ResolveLockTickTime", func(val failpoint.Value) { - t := time.Duration(val.(int)) - failpoint.Return(t) - }) - return 5 * time.Second -} - // OnTick advances the inner logic clock for the advancer. // It's synchronous: this would only return after the events triggered by the clock has all been done. // It's generally panic-free, you may not need to trying recover a panic here. diff --git a/br/pkg/streamhelper/advancer_test.go b/br/pkg/streamhelper/advancer_test.go index b115c892a201c..fabf220b9ff3b 100644 --- a/br/pkg/streamhelper/advancer_test.go +++ b/br/pkg/streamhelper/advancer_test.go @@ -292,16 +292,16 @@ func TestResolveLock(t *testing.T) { } }() require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/br/pkg/streamhelper/NeedResolveLocks", `return(true)`)) - require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/br/pkg/streamhelper/ResolveLockTickTime", `return(1)`)) defer func() { require.NoError(t, failpoint.Disable("github.com/pingcap/tidb/br/pkg/streamhelper/NeedResolveLocks")) - require.NoError(t, failpoint.Disable("github.com/pingcap/tidb/br/pkg/streamhelper/ResolveLockTickTime")) }() c.splitAndScatter("01", "02", "022", "023", "033", "04", "043") ctx := context.Background() minCheckpoint := c.advanceCheckpoints() env := &testEnv{fakeCluster: c, testCtx: t} + + lockRegion := c.findRegionByKey([]byte("01")) allLocks := []*txnlock.Lock{ { Key: []byte{1}, @@ -314,11 +314,8 @@ func TestResolveLock(t *testing.T) { TxnID: minCheckpoint + 1, }, } - env.scanLocks = func(key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { - return allLocks, &tikv.KeyLocation{ - Region: tikv.NewRegionVerID(1, 0, 0), - } - } + c.LockRegion(lockRegion, allLocks) + // ensure resolve locks triggered and collect all locks from scan locks resolveLockRef := atomic.NewBool(false) env.resolveLocks = func(locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { @@ -343,13 +340,14 @@ func TestResolveLock(t *testing.T) { }, }, 0)), ) - adv.OnBecomeOwner(ctx) + adv.StartTaskListener(ctx) + require.Eventually(t, func() bool { return adv.OnTick(ctx) == nil }, + time.Second, 50*time.Millisecond) coll := streamhelper.NewClusterCollector(ctx, env) err := adv.GetCheckpointInRange(ctx, []byte{}, []byte{}, coll) require.Eventually(t, func() bool { return resolveLockRef.Load() }, 8*time.Second, 50*time.Microsecond) - require.NoError(t, err) r, err := coll.Finish(ctx) require.NoError(t, err) diff --git a/br/pkg/streamhelper/basic_lib_for_test.go b/br/pkg/streamhelper/basic_lib_for_test.go index b2171f527a1b0..866dc8bb4c9b1 100644 --- a/br/pkg/streamhelper/basic_lib_for_test.go +++ b/br/pkg/streamhelper/basic_lib_for_test.go @@ -77,6 +77,8 @@ type region struct { checkpoint atomic.Uint64 fsim flushSimulator + + locks []*txnlock.Lock } type fakeStore struct { @@ -331,6 +333,11 @@ func (f *fakeCluster) findRegionById(rid uint64) *region { return nil } +func (f *fakeCluster) LockRegion(r *region, locks []*txnlock.Lock) *region { + r.locks = locks + return r +} + func (f *fakeCluster) findRegionByKey(key []byte) *region { for _, r := range f.regions { if bytes.Compare(key, r.rng.StartKey) >= 0 && (len(r.rng.EndKey) == 0 || bytes.Compare(key, r.rng.EndKey) < 0) { @@ -588,7 +595,6 @@ type testEnv struct { ranges []kv.KeyRange taskCh chan<- streamhelper.TaskEvent - scanLocks func([]byte) ([]*txnlock.Lock, *tikv.KeyLocation) resolveLocks func([]*txnlock.Lock, *tikv.KeyLocation) (*tikv.KeyLocation, error) mu sync.Mutex @@ -646,12 +652,25 @@ func (t *testEnv) unregisterTask() { } func (t *testEnv) ScanLocksInOneRegion(bo *tikv.Backoffer, key []byte, maxVersion uint64, limit uint32) ([]*txnlock.Lock, *tikv.KeyLocation, error) { - locks, loc := t.scanLocks(key) - return locks, loc, nil + for _, r := range t.regions { + if len(r.locks) != 0 { + return r.locks, &tikv.KeyLocation{ + Region: tikv.NewRegionVerID(r.id, 0, 0), + }, nil + } + } + return nil, nil, nil } func (t *testEnv) ResolveLocksInOneRegion(bo *tikv.Backoffer, locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { - return t.resolveLocks(locks, loc) + for _, r := range t.regions { + if loc != nil && loc.Region.GetID() == r.id { + // reset locks + r.locks = nil + return t.resolveLocks(locks, loc) + } + } + return nil, nil } func (t *testEnv) Identifier() string { From ce3726f441f9ed5ff41f85338882ee66a0574cbd Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 6 Sep 2023 17:03:43 +0800 Subject: [PATCH 37/45] address comment --- br/pkg/streamhelper/advancer_env.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/br/pkg/streamhelper/advancer_env.go b/br/pkg/streamhelper/advancer_env.go index 24faad706f8ee..7707783c495f1 100644 --- a/br/pkg/streamhelper/advancer_env.go +++ b/br/pkg/streamhelper/advancer_env.go @@ -150,13 +150,11 @@ type StreamMeta interface { var _ tikv.RegionLockResolver = &AdvancerLockResolver{} type AdvancerLockResolver struct { - store tikv.Storage *tikv.BaseRegionLockResolver } func newAdvancerLockResolver(store tikv.Storage) *AdvancerLockResolver { return &AdvancerLockResolver{ - store: store, BaseRegionLockResolver: tikv.NewRegionLockResolver("log backup advancer", store), } } From e5817096d1378ba73449ab20bfac7063149aef25 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 6 Sep 2023 17:42:27 +0800 Subject: [PATCH 38/45] address comment --- br/pkg/streamhelper/basic_lib_for_test.go | 25 +++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/br/pkg/streamhelper/basic_lib_for_test.go b/br/pkg/streamhelper/basic_lib_for_test.go index 866dc8bb4c9b1..3ee3393535626 100644 --- a/br/pkg/streamhelper/basic_lib_for_test.go +++ b/br/pkg/streamhelper/basic_lib_for_test.go @@ -598,6 +598,7 @@ type testEnv struct { resolveLocks func([]*txnlock.Lock, *tikv.KeyLocation) (*tikv.KeyLocation, error) mu sync.Mutex + pd.Client } func (t *testEnv) Begin(ctx context.Context, ch chan<- streamhelper.TaskEvent) error { @@ -679,7 +680,7 @@ func (t *testEnv) Identifier() string { func (t *testEnv) GetStore() tikv.Storage { // only used for GetRegionCache once in resolve lock - return &mockTiKVStore{regionCache: tikv.NewRegionCache(&mockPDClient{})} + return &mockTiKVStore{regionCache: tikv.NewRegionCache(&mockPDClient{fakeRegions: t.regions})} } type mockKVStore struct { @@ -708,12 +709,24 @@ func (s *mockTiKVStore) SendReq(bo *tikv.Backoffer, req *tikvrpc.Request, region type mockPDClient struct { pd.Client + fakeRegions []*region } func (p *mockPDClient) ScanRegions(ctx context.Context, key, endKey []byte, limit int) ([]*pd.Region, error) { - return []*pd.Region{ - newMockRegion(1, []byte("1"), []byte("3")), - }, nil + sort.Slice(p.fakeRegions, func(i, j int) bool { + return bytes.Compare(p.fakeRegions[i].rng.StartKey, p.fakeRegions[j].rng.StartKey) < 0 + }) + + result := make([]*pd.Region, 0, len(p.fakeRegions)) + for _, region := range p.fakeRegions { + if spans.Overlaps(kv.KeyRange{StartKey: key, EndKey: endKey}, region.rng) && len(result) < limit { + regionInfo := newMockRegion(region.id, region.rng.StartKey, region.rng.EndKey) + result = append(result, regionInfo) + } else if bytes.Compare(region.rng.StartKey, key) > 0 { + break + } + } + return result, nil } func (p *mockPDClient) GetStore(_ context.Context, storeID uint64) (*metapb.Store, error) { @@ -723,10 +736,6 @@ func (p *mockPDClient) GetStore(_ context.Context, storeID uint64) (*metapb.Stor }, nil } -func (p *mockPDClient) GetRegion(ctx context.Context, key []byte, opts ...pd.GetRegionOption) (*pd.Region, error) { - return newMockRegion(1, []byte("1"), []byte("3")), nil -} - func newMockRegion(regionID uint64, startKey []byte, endKey []byte) *pd.Region { leader := &metapb.Peer{ Id: regionID, From d69e65225fe332b3d2a3806b432d1c7703ebf800 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Thu, 7 Sep 2023 10:41:01 +0800 Subject: [PATCH 39/45] address comment --- br/pkg/streamhelper/advancer.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/br/pkg/streamhelper/advancer.go b/br/pkg/streamhelper/advancer.go index 0d28830dacb40..816cc14431f53 100644 --- a/br/pkg/streamhelper/advancer.go +++ b/br/pkg/streamhelper/advancer.go @@ -8,6 +8,7 @@ import ( "math" "strings" "sync" + "sync/atomic" "time" "github.com/pingcap/errors" @@ -68,6 +69,7 @@ type CheckpointAdvancer struct { // if no progress, this cache can help us don't to send useless requests. lastCheckpoint *checkpoint lastCheckpointMu sync.Mutex + inResolvingLock atomic.Bool checkpoints *spans.ValueSortedFull checkpointsMu sync.Mutex @@ -503,7 +505,8 @@ func (c *CheckpointAdvancer) optionalTick(cx context.Context) error { // lastCheckpoint is not increased too long enough. // assume the cluster has expired locks for whatever reasons. var targets []spans.Valued - if c.lastCheckpoint != nil && c.lastCheckpoint.needResolveLocks() { + if c.lastCheckpoint != nil && c.lastCheckpoint.needResolveLocks() && !c.inResolvingLock.Load() { + c.inResolvingLock.Store(true) c.WithCheckpoints(func(vsf *spans.ValueSortedFull) { // when get locks here. assume these locks are not belong to same txn, // but these locks' start ts are close to 1 minute. try resolve these locks at one time @@ -598,5 +601,6 @@ func (c *CheckpointAdvancer) asyncResolveLocksForRanges(ctx context.Context, tar c.lastCheckpointMu.Lock() c.lastCheckpoint.resolveLockTime = time.Now() c.lastCheckpointMu.Unlock() + c.inResolvingLock.Store(false) }() } From a545c594ef6210b5f01230ffb7e81d37fc15ccbb Mon Sep 17 00:00:00 2001 From: 3pointer Date: Thu, 7 Sep 2023 15:16:00 +0800 Subject: [PATCH 40/45] address comment --- store/gcworker/gc_worker.go | 68 ++++++++++++++++---------------- store/gcworker/gc_worker_test.go | 5 ++- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/store/gcworker/gc_worker.go b/store/gcworker/gc_worker.go index a37debe2c6d2b..0be2347e76675 100644 --- a/store/gcworker/gc_worker.go +++ b/store/gcworker/gc_worker.go @@ -64,16 +64,16 @@ import ( // GCWorker periodically triggers GC process on tikv server. type GCWorker struct { - uuid string - desc string - store kv.Storage - tikvStore tikv.Storage - pdClient pd.Client - gcIsRunning bool - lastFinish time.Time - cancel context.CancelFunc - done chan error - lockResolver tikv.RegionLockResolver + uuid string + desc string + store kv.Storage + tikvStore tikv.Storage + pdClient pd.Client + gcIsRunning bool + lastFinish time.Time + cancel context.CancelFunc + done chan error + regionLockResolver tikv.RegionLockResolver } // NewGCWorker creates a GCWorker instance. @@ -93,15 +93,15 @@ func NewGCWorker(store kv.Storage, pdClient pd.Client) (*GCWorker, error) { uuid := strconv.FormatUint(ver.Ver, 16) resolverIdentifier := fmt.Sprintf("gc-worker-%s", uuid) worker := &GCWorker{ - uuid: uuid, - desc: fmt.Sprintf("host:%s, pid:%d, start at %s", hostName, os.Getpid(), time.Now()), - store: store, - tikvStore: tikvStore, - pdClient: pdClient, - gcIsRunning: false, - lastFinish: time.Now(), - lockResolver: tikv.NewRegionLockResolver(resolverIdentifier, tikvStore), - done: make(chan error), + uuid: uuid, + desc: fmt.Sprintf("host:%s, pid:%d, start at %s", hostName, os.Getpid(), time.Now()), + store: store, + tikvStore: tikvStore, + pdClient: pdClient, + gcIsRunning: false, + lastFinish: time.Now(), + regionLockResolver: tikv.NewRegionLockResolver(resolverIdentifier, tikvStore), + done: make(chan error), } variable.RegisterStatistics(worker) return worker, nil @@ -1186,7 +1186,7 @@ func (w *GCWorker) legacyResolveLocks( failpoint.Inject("lowScanLockLimit", func() { scanLimit = 3 }) - return tikv.ResolveLocksForRange(ctx, w.lockResolver, safePoint, r.StartKey, r.EndKey, tikv.NewGcResolveLockMaxBackoffer, scanLimit) + return tikv.ResolveLocksForRange(ctx, w.regionLockResolver, safePoint, r.StartKey, r.EndKey, tikv.NewGcResolveLockMaxBackoffer, scanLimit) } runner := rangetask.NewRangeTaskRunner("resolve-locks-runner", w.tikvStore, concurrency, handler) @@ -2034,12 +2034,12 @@ func getGCRules(ids []int64, rules map[string]*label.Rule) []string { // RunGCJob sends GC command to KV. It is exported for kv api, do not use it with GCWorker at the same time. // only use for test -func RunGCJob(ctx context.Context, lockResolver tikv.RegionLockResolver, s tikv.Storage, pd pd.Client, safePoint uint64, identifier string, concurrency int) error { +func RunGCJob(ctx context.Context, regionLockResolver tikv.RegionLockResolver, s tikv.Storage, pd pd.Client, safePoint uint64, identifier string, concurrency int) error { gcWorker := &GCWorker{ - tikvStore: s, - uuid: identifier, - pdClient: pd, - lockResolver: lockResolver, + tikvStore: s, + uuid: identifier, + pdClient: pd, + regionLockResolver: regionLockResolver, } if concurrency <= 0 { @@ -2072,12 +2072,12 @@ func RunGCJob(ctx context.Context, lockResolver tikv.RegionLockResolver, s tikv. // RunDistributedGCJob notifies TiKVs to do GC. It is exported for kv api, do not use it with GCWorker at the same time. // This function may not finish immediately because it may take some time to do resolveLocks. // Param concurrency specifies the concurrency of resolveLocks phase. -func RunDistributedGCJob(ctx context.Context, lockResolver tikv.RegionLockResolver, s tikv.Storage, pd pd.Client, safePoint uint64, identifier string, concurrency int) error { +func RunDistributedGCJob(ctx context.Context, regionLockResolver tikv.RegionLockResolver, s tikv.Storage, pd pd.Client, safePoint uint64, identifier string, concurrency int) error { gcWorker := &GCWorker{ - tikvStore: s, - uuid: identifier, - pdClient: pd, - lockResolver: lockResolver, + tikvStore: s, + uuid: identifier, + pdClient: pd, + regionLockResolver: regionLockResolver, } safePoint, err := gcWorker.setGCWorkerServiceSafePoint(ctx, safePoint) @@ -2109,10 +2109,10 @@ func RunDistributedGCJob(ctx context.Context, lockResolver tikv.RegionLockResolv // It is exported only for test, do not use it in the production environment. func RunResolveLocks(ctx context.Context, s tikv.Storage, pd pd.Client, safePoint uint64, identifier string, concurrency int, usePhysical bool) (bool, error) { gcWorker := &GCWorker{ - tikvStore: s, - uuid: identifier, - pdClient: pd, - lockResolver: tikv.NewRegionLockResolver("test-resolver", s), + tikvStore: s, + uuid: identifier, + pdClient: pd, + regionLockResolver: tikv.NewRegionLockResolver("test-resolver", s), } return gcWorker.resolveLocks(ctx, safePoint, concurrency, usePhysical) } diff --git a/store/gcworker/gc_worker_test.go b/store/gcworker/gc_worker_test.go index 749f6759e40c8..723ba722c9f33 100644 --- a/store/gcworker/gc_worker_test.go +++ b/store/gcworker/gc_worker_test.go @@ -1146,14 +1146,15 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { tikvStore: s.tikvStore, scanLocks: func(key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { // first time scan locks - if bytes.Equal(key, []byte("")) { + region, _, _ := s.cluster.GetRegionByKey(key) + if region.GetId() == s.initRegion.regionID { return []*txnlock.Lock{{Key: []byte("a")}, {Key: []byte("b")}}, &tikv.KeyLocation{ Region: tikv.NewRegionVerID(s.initRegion.regionID, 0, 0), } } // second time scan locks - if bytes.Equal(key, []byte("m")) { + if region.GetId() == region2 { return []*txnlock.Lock{{Key: []byte("o")}, {Key: []byte("p")}}, &tikv.KeyLocation{ Region: tikv.NewRegionVerID(region2, 0, 0), From c41accc6697154342da4374aa139784ab51c5433 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Thu, 7 Sep 2023 17:53:45 +0800 Subject: [PATCH 41/45] address comment --- store/gcworker/gc_worker_test.go | 46 +++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/store/gcworker/gc_worker_test.go b/store/gcworker/gc_worker_test.go index 723ba722c9f33..78664b8e36297 100644 --- a/store/gcworker/gc_worker_test.go +++ b/store/gcworker/gc_worker_test.go @@ -50,18 +50,29 @@ import ( ) type mockGCWorkerLockResolver struct { + tikv.RegionLockResolver tikvStore tikv.Storage - scanLocks func([]byte) ([]*txnlock.Lock, *tikv.KeyLocation) + scanLocks func([]*txnlock.Lock, []byte) ([]*txnlock.Lock, *tikv.KeyLocation) batchResolveLocks func([]*txnlock.Lock, *tikv.KeyLocation) (*tikv.KeyLocation, error) } func (l *mockGCWorkerLockResolver) ScanLocksInOneRegion(bo *tikv.Backoffer, key []byte, maxVersion uint64, limit uint32) ([]*txnlock.Lock, *tikv.KeyLocation, error) { - locks, loc := l.scanLocks(key) - return locks, loc, nil + locks, loc, err := l.RegionLockResolver.ScanLocksInOneRegion(bo, key, maxVersion, limit) + if err != nil { + return nil, nil, err + } + mockLocks, mockLoc := l.scanLocks(locks, key) + if loc.Region.GetID() != mockLoc.Region.GetID() { + panic("detect different location for region lock resolve tests") + } + return mockLocks, loc, nil } func (l *mockGCWorkerLockResolver) ResolveLocksInOneRegion(bo *tikv.Backoffer, locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { - return l.batchResolveLocks(locks, loc) + if l.batchResolveLocks != nil { + return l.batchResolveLocks(locks, loc) + } + return l.RegionLockResolver.ResolveLocksInOneRegion(bo, locks, loc) } func (l *mockGCWorkerLockResolver) GetStore() tikv.Storage { @@ -1039,8 +1050,9 @@ func TestResolveLockRangeInfine(t *testing.T) { }() mockLockResolver := &mockGCWorkerLockResolver{ - tikvStore: s.tikvStore, - scanLocks: func(key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { + RegionLockResolver: tikv.NewRegionLockResolver("test", s.tikvStore), + tikvStore: s.tikvStore, + scanLocks: func(_ []*txnlock.Lock, key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { return []*txnlock.Lock{}, &tikv.KeyLocation{} }, batchResolveLocks: func( @@ -1095,8 +1107,9 @@ func TestResolveLockRangeMeetRegionCacheMiss(t *testing.T) { } mockLockResolver := &mockGCWorkerLockResolver{ - tikvStore: s.tikvStore, - scanLocks: func(key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { + RegionLockResolver: tikv.NewRegionLockResolver("test", s.tikvStore), + tikvStore: s.tikvStore, + scanLocks: func(_ []*txnlock.Lock, key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { *scanCntRef++ return allLocks, &tikv.KeyLocation{ Region: tikv.NewRegionVerID(s.initRegion.regionID, 0, 0), @@ -1143,8 +1156,9 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { s.cluster.Split(s.initRegion.regionID, region2, []byte("m"), newPeers, newPeers[0]) mockGCLockResolver := &mockGCWorkerLockResolver{ - tikvStore: s.tikvStore, - scanLocks: func(key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { + RegionLockResolver: tikv.NewRegionLockResolver("test", s.tikvStore), + tikvStore: s.tikvStore, + scanLocks: func(_ []*txnlock.Lock, key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { // first time scan locks region, _, _ := s.cluster.GetRegionByKey(key) if region.GetId() == s.initRegion.regionID { @@ -1179,7 +1193,7 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { []*metapb.Region{regionMeta}) require.NoError(t, err) // also let region1 contains all 4 locks - mockGCLockResolver.scanLocks = func(key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { + mockGCLockResolver.scanLocks = func(_ []*txnlock.Lock, key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { if bytes.Equal(key, []byte("")) { locks := []*txnlock.Lock{ {Key: []byte("a")}, @@ -1297,8 +1311,9 @@ func TestSetServiceSafePoint(t *testing.T) { func TestRunGCJobAPI(t *testing.T) { s := createGCWorkerSuite(t) mockLockResolver := &mockGCWorkerLockResolver{ - tikvStore: s.tikvStore, - scanLocks: func(key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { + RegionLockResolver: tikv.NewRegionLockResolver("test", s.tikvStore), + tikvStore: s.tikvStore, + scanLocks: func(_ []*txnlock.Lock, key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { return []*txnlock.Lock{}, &tikv.KeyLocation{} }, batchResolveLocks: func( @@ -1327,8 +1342,9 @@ func TestRunDistGCJobAPI(t *testing.T) { gcSafePointCacheInterval = 0 mockLockResolver := &mockGCWorkerLockResolver{ - tikvStore: s.tikvStore, - scanLocks: func(key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { + RegionLockResolver: tikv.NewRegionLockResolver("test", s.tikvStore), + tikvStore: s.tikvStore, + scanLocks: func(_ []*txnlock.Lock, key []byte) ([]*txnlock.Lock, *tikv.KeyLocation) { return []*txnlock.Lock{}, &tikv.KeyLocation{} }, batchResolveLocks: func( From ba6c52a6aaf8de5b2cf83d4bad5eec8d34cb67dd Mon Sep 17 00:00:00 2001 From: 3pointer Date: Thu, 7 Sep 2023 20:28:25 +0800 Subject: [PATCH 42/45] fix ut --- store/gcworker/gc_worker_test.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/store/gcworker/gc_worker_test.go b/store/gcworker/gc_worker_test.go index 78664b8e36297..559ef4cbc2951 100644 --- a/store/gcworker/gc_worker_test.go +++ b/store/gcworker/gc_worker_test.go @@ -61,11 +61,14 @@ func (l *mockGCWorkerLockResolver) ScanLocksInOneRegion(bo *tikv.Backoffer, key if err != nil { return nil, nil, err } - mockLocks, mockLoc := l.scanLocks(locks, key) - if loc.Region.GetID() != mockLoc.Region.GetID() { - panic("detect different location for region lock resolve tests") + if l.scanLocks != nil { + mockLocks, mockLoc := l.scanLocks(locks, key) + // append locks from mock function + locks = append(locks, mockLocks...) + // use location from mock function + loc = mockLoc } - return mockLocks, loc, nil + return locks, loc, nil } func (l *mockGCWorkerLockResolver) ResolveLocksInOneRegion(bo *tikv.Backoffer, locks []*txnlock.Lock, loc *tikv.KeyLocation) (*tikv.KeyLocation, error) { From 78ace348decc495461d5bf0999668f5bc40cf692 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Fri, 8 Sep 2023 12:16:34 +0800 Subject: [PATCH 43/45] fix unstable test --- store/gcworker/gc_worker_test.go | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/store/gcworker/gc_worker_test.go b/store/gcworker/gc_worker_test.go index 559ef4cbc2951..e51628520c8a2 100644 --- a/store/gcworker/gc_worker_test.go +++ b/store/gcworker/gc_worker_test.go @@ -1167,14 +1167,22 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { if region.GetId() == s.initRegion.regionID { return []*txnlock.Lock{{Key: []byte("a")}, {Key: []byte("b")}}, &tikv.KeyLocation{ - Region: tikv.NewRegionVerID(s.initRegion.regionID, 0, 0), + Region: tikv.NewRegionVerID( + region.GetId(), + region.GetRegionEpoch().ConfVer, + region.GetRegionEpoch().Version, + ), } } // second time scan locks if region.GetId() == region2 { return []*txnlock.Lock{{Key: []byte("o")}, {Key: []byte("p")}}, &tikv.KeyLocation{ - Region: tikv.NewRegionVerID(region2, 0, 0), + Region: tikv.NewRegionVerID( + region.GetId(), + region.GetRegionEpoch().ConfVer, + region.GetRegionEpoch().Version, + ), } } return []*txnlock.Lock{}, nil @@ -1206,7 +1214,10 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { } for i, lock := range locks { if bytes.Compare(key, lock.Key) <= 0 { - return locks[i:], &tikv.KeyLocation{Region: tikv.NewRegionVerID(s.initRegion.regionID, 0, 0)} + return locks[i:], &tikv.KeyLocation{Region: tikv.NewRegionVerID( + regionMeta.GetId(), + regionMeta.GetRegionEpoch().ConfVer, + regionMeta.GetRegionEpoch().Version)} } } } @@ -1219,7 +1230,7 @@ func TestResolveLockRangeMeetRegionEnlargeCausedByRegionMerge(t *testing.T) { } return loc, nil } - _, err := tikv.ResolveLocksForRange(gcContext(), mockGCLockResolver, 1, []byte(""), []byte("z"), tikv.NewNoopBackoff, 10) + _, err := tikv.ResolveLocksForRange(gcContext(), mockGCLockResolver, 1, []byte(""), []byte("z"), tikv.NewGcResolveLockMaxBackoffer, 10) require.NoError(t, err) require.Len(t, resolvedLock, 4) expects := [][]byte{[]byte("a"), []byte("b"), []byte("o"), []byte("p")} From be0c535a1cc23c91c406cdf3795b85f5a64d1f79 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Mon, 11 Sep 2023 15:26:03 +0800 Subject: [PATCH 44/45] address comment --- br/pkg/streamhelper/advancer.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/br/pkg/streamhelper/advancer.go b/br/pkg/streamhelper/advancer.go index 816cc14431f53..f9744d186e35d 100644 --- a/br/pkg/streamhelper/advancer.go +++ b/br/pkg/streamhelper/advancer.go @@ -505,8 +505,7 @@ func (c *CheckpointAdvancer) optionalTick(cx context.Context) error { // lastCheckpoint is not increased too long enough. // assume the cluster has expired locks for whatever reasons. var targets []spans.Valued - if c.lastCheckpoint != nil && c.lastCheckpoint.needResolveLocks() && !c.inResolvingLock.Load() { - c.inResolvingLock.Store(true) + if c.lastCheckpoint != nil && c.lastCheckpoint.needResolveLocks() && c.inResolvingLock.CompareAndSwap(false, true) { c.WithCheckpoints(func(vsf *spans.ValueSortedFull) { // when get locks here. assume these locks are not belong to same txn, // but these locks' start ts are close to 1 minute. try resolve these locks at one time From 5ae183015b6b395cba9c0ebf6924327878c4e821 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 12 Sep 2023 14:45:05 +0800 Subject: [PATCH 45/45] address comments --- br/pkg/streamhelper/advancer.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/br/pkg/streamhelper/advancer.go b/br/pkg/streamhelper/advancer.go index f9744d186e35d..7f6f12eeea6e8 100644 --- a/br/pkg/streamhelper/advancer.go +++ b/br/pkg/streamhelper/advancer.go @@ -81,7 +81,6 @@ type CheckpointAdvancer struct { // checkpoint represents the TS with specific range. // it's only used in advancer.go. type checkpoint struct { - sync.Mutex StartKey []byte EndKey []byte TS uint64 @@ -520,6 +519,7 @@ func (c *CheckpointAdvancer) optionalTick(cx context.Context) error { ctx := context.Background() c.asyncResolveLocksForRanges(ctx, targets) } + c.inResolvingLock.Store(false) } threshold := c.Config().GetDefaultStartPollThreshold() if err := c.subscribeTick(cx); err != nil { @@ -600,6 +600,5 @@ func (c *CheckpointAdvancer) asyncResolveLocksForRanges(ctx context.Context, tar c.lastCheckpointMu.Lock() c.lastCheckpoint.resolveLockTime = time.Now() c.lastCheckpointMu.Unlock() - c.inResolvingLock.Store(false) }() }