From c7c58bdb0ce7ece393a97bf35c07c892a173d455 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 23 Dec 2021 15:53:46 +0800 Subject: [PATCH 01/47] change interface --- dm/cmd/dm-syncer/main.go | 2 +- dm/dm/unit/unit.go | 3 ++- dm/dm/worker/subtask.go | 6 +++--- dm/dm/worker/subtask_test.go | 2 +- dm/dumpling/dumpling.go | 2 +- dm/dumpling/dumpling_test.go | 2 +- dm/loader/lightning.go | 2 +- dm/loader/loader.go | 2 +- dm/syncer/syncer.go | 2 +- dm/syncer/syncer_test.go | 4 ++-- 10 files changed, 14 insertions(+), 13 deletions(-) diff --git a/dm/cmd/dm-syncer/main.go b/dm/cmd/dm-syncer/main.go index 0f89de19696..41928a55683 100644 --- a/dm/cmd/dm-syncer/main.go +++ b/dm/cmd/dm-syncer/main.go @@ -102,7 +102,7 @@ func main() { } // 5. close the syncer - sync.Close() + sync.Close(true) log.L().Info("dm-syncer exit") // 6. flush log diff --git a/dm/dm/unit/unit.go b/dm/dm/unit/unit.go index 4451a652e86..1d9b32f4977 100644 --- a/dm/dm/unit/unit.go +++ b/dm/dm/unit/unit.go @@ -47,7 +47,8 @@ type Unit interface { Process(ctx context.Context, pr chan pb.ProcessResult) // Close shuts down the process and closes the unit, after that can not call Process to resume // The implementation should not block for a long time. - Close() + // if gracefulStop is true, unit is allowed to wait for its internal logic to complete, otherwise it should be closed immediately + Close(gracefulStop bool) // Pause does some cleanups and the unit can be resumed later. The caller will make sure Process has returned. // The implementation should not block for a long time. Pause() diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index 842c2f2c265..f67a8edb406 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -157,7 +157,7 @@ func (st *SubTask) initUnits(relay relay.Process) error { var needCloseUnits []unit.Unit defer func() { for _, u := range needCloseUnits { - u.Close() + u.Close(true) } st.initialized.Store(initializeUnitSuccess) @@ -311,7 +311,7 @@ func (st *SubTask) fetchResultAndUpdateStage(pr chan pb.ProcessResult) { switch stage { case pb.Stage_Finished: - cu.Close() + cu.Close(true) nu := st.getNextUnit() if nu == nil { // Now, when finished, it only stops the process @@ -380,7 +380,7 @@ func (st *SubTask) closeUnits() { for i := cui; i < len(st.units); i++ { u := st.units[i] st.l.Info("closing unit process", zap.Stringer("unit", cu.Type())) - u.Close() + u.Close(true) } } diff --git a/dm/dm/worker/subtask_test.go b/dm/dm/worker/subtask_test.go index 8fb5006f03a..0568197e7c1 100644 --- a/dm/dm/worker/subtask_test.go +++ b/dm/dm/worker/subtask_test.go @@ -116,7 +116,7 @@ func (m *MockUnit) Process(ctx context.Context, pr chan pb.ProcessResult) { } } -func (m *MockUnit) Close() {} +func (m *MockUnit) Close(gracefulStop bool) {} func (m MockUnit) Pause() {} diff --git a/dm/dumpling/dumpling.go b/dm/dumpling/dumpling.go index ed4c0fe8b4b..cf1695230a9 100644 --- a/dm/dumpling/dumpling.go +++ b/dm/dumpling/dumpling.go @@ -165,7 +165,7 @@ func (m *Dumpling) Process(ctx context.Context, pr chan pb.ProcessResult) { } // Close implements Unit.Close. 
-func (m *Dumpling) Close() { +func (m *Dumpling) Close(gracefulStop bool) { if m.closed.Load() { return } diff --git a/dm/dumpling/dumpling_test.go b/dm/dumpling/dumpling_test.go index 04deaef0429..9308ccb68ea 100644 --- a/dm/dumpling/dumpling_test.go +++ b/dm/dumpling/dumpling_test.go @@ -147,7 +147,7 @@ func (t *testDumplingSuite) TestCallStatus(c *C) { dumpling, _ := export.NewDumper(ctx, dumpConf) m.core = dumpling - m.Close() + m.Close(true) s = m.Status(nil).(*pb.DumpStatus) c.Assert(s.CompletedTables, Equals, float64(0)) c.Assert(s.FinishedBytes, Equals, float64(0)) diff --git a/dm/loader/lightning.go b/dm/loader/lightning.go index 66b41fdf347..075a9eacec6 100644 --- a/dm/loader/lightning.go +++ b/dm/loader/lightning.go @@ -279,7 +279,7 @@ func (l *LightningLoader) IsFreshTask(ctx context.Context) (bool, error) { } // Close does graceful shutdown. -func (l *LightningLoader) Close() { +func (l *LightningLoader) Close(gracefulStop bool) { l.Pause() l.checkPointList.Close() l.closed.Store(true) diff --git a/dm/loader/loader.go b/dm/loader/loader.go index 42eaaa20d4d..1f6475f77f1 100644 --- a/dm/loader/loader.go +++ b/dm/loader/loader.go @@ -787,7 +787,7 @@ func (l *Loader) loadFinishedSize() { } // Close does graceful shutdown. -func (l *Loader) Close() { +func (l *Loader) Close(gracefulStop bool) { l.Lock() defer l.Unlock() if l.isClosed() { diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 6f0c730aee8..ff0ea9f86c8 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -3160,7 +3160,7 @@ func (s *Syncer) isClosed() bool { } // Close closes syncer. -func (s *Syncer) Close() { +func (s *Syncer) Close(gracefulStop bool) { s.Lock() defer s.Unlock() diff --git a/dm/syncer/syncer_test.go b/dm/syncer/syncer_test.go index b487080b170..26472219497 100644 --- a/dm/syncer/syncer_test.go +++ b/dm/syncer/syncer_test.go @@ -1015,7 +1015,7 @@ func (s *testSyncerSuite) TestRun(c *C) { testJobs.RUnlock() cancel() - syncer.Close() + syncer.Close(true) c.Assert(syncer.isClosed(), IsTrue) if err := mock.ExpectationsWereMet(); err != nil { @@ -1191,7 +1191,7 @@ func (s *testSyncerSuite) TestExitSafeModeByConfig(c *C) { testJobs.Unlock() cancel() - syncer.Close() + syncer.Close(true) c.Assert(syncer.isClosed(), IsTrue) if err := mock.ExpectationsWereMet(); err != nil { From b2b8a7731673c5a85a670565178036816ab7deeb Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 23 Dec 2021 16:07:47 +0800 Subject: [PATCH 02/47] rename --- dm/dm/unit/unit.go | 4 ++-- dm/dm/worker/subtask_test.go | 2 +- dm/dumpling/dumpling.go | 2 +- dm/loader/lightning.go | 2 +- dm/loader/loader.go | 2 +- dm/syncer/syncer.go | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dm/dm/unit/unit.go b/dm/dm/unit/unit.go index 1d9b32f4977..58d59aa102f 100644 --- a/dm/dm/unit/unit.go +++ b/dm/dm/unit/unit.go @@ -47,8 +47,8 @@ type Unit interface { Process(ctx context.Context, pr chan pb.ProcessResult) // Close shuts down the process and closes the unit, after that can not call Process to resume // The implementation should not block for a long time. - // if gracefulStop is true, unit is allowed to wait for its internal logic to complete, otherwise it should be closed immediately - Close(gracefulStop bool) + // if graceful is true, unit is allowed to wait for its internal logic to complete, otherwise it should be closed immediately + Close(graceful bool) // Pause does some cleanups and the unit can be resumed later. The caller will make sure Process has returned. 
// The implementation should not block for a long time. Pause() diff --git a/dm/dm/worker/subtask_test.go b/dm/dm/worker/subtask_test.go index 0568197e7c1..13f2ef5c38f 100644 --- a/dm/dm/worker/subtask_test.go +++ b/dm/dm/worker/subtask_test.go @@ -116,7 +116,7 @@ func (m *MockUnit) Process(ctx context.Context, pr chan pb.ProcessResult) { } } -func (m *MockUnit) Close(gracefulStop bool) {} +func (m *MockUnit) Close(graceful bool) {} func (m MockUnit) Pause() {} diff --git a/dm/dumpling/dumpling.go b/dm/dumpling/dumpling.go index cf1695230a9..d64bbe5bb99 100644 --- a/dm/dumpling/dumpling.go +++ b/dm/dumpling/dumpling.go @@ -165,7 +165,7 @@ func (m *Dumpling) Process(ctx context.Context, pr chan pb.ProcessResult) { } // Close implements Unit.Close. -func (m *Dumpling) Close(gracefulStop bool) { +func (m *Dumpling) Close(graceful bool) { if m.closed.Load() { return } diff --git a/dm/loader/lightning.go b/dm/loader/lightning.go index 075a9eacec6..1fbb0356643 100644 --- a/dm/loader/lightning.go +++ b/dm/loader/lightning.go @@ -279,7 +279,7 @@ func (l *LightningLoader) IsFreshTask(ctx context.Context) (bool, error) { } // Close does graceful shutdown. -func (l *LightningLoader) Close(gracefulStop bool) { +func (l *LightningLoader) Close(graceful bool) { l.Pause() l.checkPointList.Close() l.closed.Store(true) diff --git a/dm/loader/loader.go b/dm/loader/loader.go index 1f6475f77f1..ce2a5782e01 100644 --- a/dm/loader/loader.go +++ b/dm/loader/loader.go @@ -787,7 +787,7 @@ func (l *Loader) loadFinishedSize() { } // Close does graceful shutdown. -func (l *Loader) Close(gracefulStop bool) { +func (l *Loader) Close(graceful bool) { l.Lock() defer l.Unlock() if l.isClosed() { diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index ff0ea9f86c8..fd227df49a1 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -3160,7 +3160,7 @@ func (s *Syncer) isClosed() bool { } // Close closes syncer. -func (s *Syncer) Close(gracefulStop bool) { +func (s *Syncer) Close(graceful bool) { s.Lock() defer s.Unlock() From 273cdbb2743c7f89c469963d3512a8bb4105c943 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 23 Dec 2021 16:14:16 +0800 Subject: [PATCH 03/47] adjust close --- dm/dm/worker/server.go | 4 ++-- dm/dm/worker/source_worker.go | 12 ++++++------ dm/dm/worker/source_worker_test.go | 14 +++++++------- dm/dm/worker/subtask.go | 10 +++++----- dm/dm/worker/subtask_holder.go | 6 +++--- dm/dm/worker/subtask_test.go | 2 +- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index da2952c6271..44ffb5ad367 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -452,7 +452,7 @@ func (s *Server) doClose() { } // close worker and wait for return if w := s.getWorker(false); w != nil { - w.Close() + w.Close(true) } s.closed.Store(true) } @@ -531,7 +531,7 @@ func (s *Server) stopWorker(sourceID string, needLock bool) error { s.UpdateKeepAliveTTL(s.cfg.KeepAliveTTL) s.setWorker(nil, false) s.setSourceStatus("", nil, false) - w.Close() + w.Close(true) return nil } diff --git a/dm/dm/worker/source_worker.go b/dm/dm/worker/source_worker.go index 607c5f530eb..3efefd45f58 100644 --- a/dm/dm/worker/source_worker.go +++ b/dm/dm/worker/source_worker.go @@ -115,7 +115,7 @@ func NewSourceWorker( if err != nil { // when err != nil, `w` will become nil in this func, so we pass `w` in defer. // release resources, NOTE: we need to refactor New/Init/Start/Close for components later. 
w2.cancel() - w2.subTaskHolder.closeAllSubTasks() + w2.subTaskHolder.closeAllSubTasks(true) } }(w) @@ -196,7 +196,7 @@ func (w *SourceWorker) Start() { } // Close stops working and releases resources. -func (w *SourceWorker) Close() { +func (w *SourceWorker) Close(graceful bool) { if w.closed.Load() { w.l.Warn("already closed") return @@ -212,7 +212,7 @@ func (w *SourceWorker) Close() { defer w.Unlock() // close all sub tasks - w.subTaskHolder.closeAllSubTasks() + w.subTaskHolder.closeAllSubTasks(graceful) if w.relayHolder != nil { w.relayHolder.Close() @@ -464,7 +464,7 @@ func (w *SourceWorker) EnableHandleSubtasks() error { // "for range" of a map will use same value address, so we'd better not pass value address to other function clone := subTaskCfg if err2 := w.StartSubTask(&clone, expectStage.Expect, false); err2 != nil { - w.subTaskHolder.closeAllSubTasks() + w.subTaskHolder.closeAllSubTasks(true) return err2 } } @@ -497,7 +497,7 @@ func (w *SourceWorker) DisableHandleSubtasks() { defer w.Unlock() // close all sub tasks - w.subTaskHolder.closeAllSubTasks() + w.subTaskHolder.closeAllSubTasks(true) w.l.Info("handling subtask enabled") } @@ -604,7 +604,7 @@ func (w *SourceWorker) OperateSubTask(name string, op pb.TaskOp) error { switch op { case pb.TaskOp_Stop: w.l.Info("stop sub task", zap.String("task", name)) - st.Close() + st.Close(true) w.subTaskHolder.removeSubTask(name) case pb.TaskOp_Pause: w.l.Info("pause sub task", zap.String("task", name)) diff --git a/dm/dm/worker/source_worker_test.go b/dm/dm/worker/source_worker_test.go index 5dd29bce6c9..574fd7f4020 100644 --- a/dm/dm/worker/source_worker_test.go +++ b/dm/dm/worker/source_worker_test.go @@ -89,10 +89,10 @@ func (t *testServer) testWorker(c *C) { c.Assert(w.GetUnitAndSourceStatusJSON("", nil), HasLen, emptyWorkerStatusInfoJSONLength) // close twice - w.Close() + w.Close(true) c.Assert(w.closed.Load(), IsTrue) c.Assert(w.subTaskHolder.getAllSubTasks(), HasLen, 0) - w.Close() + w.Close(true) c.Assert(w.closed.Load(), IsTrue) c.Assert(w.subTaskHolder.getAllSubTasks(), HasLen, 0) c.Assert(w.closed.Load(), IsTrue) @@ -294,7 +294,7 @@ func (t *testWorkerFunctionalities) TestWorkerFunctionalities(c *C) { // start worker w, err := NewSourceWorker(sourceCfg, etcdCli, "", "") c.Assert(err, IsNil) - defer w.Close() + defer w.Close(true) go func() { w.Start() }() @@ -467,7 +467,7 @@ func (t *testWorkerEtcdCompact) TestWatchSubtaskStageEtcdCompact(c *C) { c.Assert(err, IsNil) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - defer w.Close() + defer w.Close(true) go func() { w.Start() }() @@ -530,7 +530,7 @@ func (t *testWorkerEtcdCompact) TestWatchSubtaskStageEtcdCompact(c *C) { c.Assert(status[0].Stage, Equals, pb.Stage_Running) cancel1() wg.Wait() - w.subTaskHolder.closeAllSubTasks() + w.subTaskHolder.closeAllSubTasks(true) // step 5: restart observe and start from startRev, this subtask should be added ctx2, cancel2 := context.WithCancel(ctx) wg.Add(1) @@ -548,7 +548,7 @@ func (t *testWorkerEtcdCompact) TestWatchSubtaskStageEtcdCompact(c *C) { c.Assert(status, HasLen, 1) c.Assert(status[0].Name, Equals, subtaskCfg.Name) c.Assert(status[0].Stage, Equals, pb.Stage_Running) - w.Close() + w.Close(true) cancel2() wg.Wait() } @@ -586,7 +586,7 @@ func (t *testWorkerEtcdCompact) TestWatchRelayStageEtcdCompact(c *C) { c.Assert(err, IsNil) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - defer w.Close() + defer w.Close(true) go func() { c.Assert(w.EnableRelay(false), IsNil) w.Start() diff --git 
a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index f67a8edb406..676b1bf1bc3 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -358,7 +358,7 @@ func (st *SubTask) PrevUnit() unit.Unit { } // closeUnits closes all un-closed units (current unit and all the subsequent units). -func (st *SubTask) closeUnits() { +func (st *SubTask) closeUnits(graceful bool) { st.cancel() st.resultWg.Wait() @@ -379,8 +379,8 @@ func (st *SubTask) closeUnits() { for i := cui; i < len(st.units); i++ { u := st.units[i] - st.l.Info("closing unit process", zap.Stringer("unit", cu.Type())) - u.Close(true) + st.l.Info("closing unit process", zap.Stringer("unit", cu.Type()), zap.Bool("graceful", graceful)) + u.Close(graceful) } } @@ -477,14 +477,14 @@ func (st *SubTask) Result() *pb.ProcessResult { } // Close stops the sub task. -func (st *SubTask) Close() { +func (st *SubTask) Close(graceful bool) { st.l.Info("closing") if !st.setStageIfNotIn([]pb.Stage{pb.Stage_Stopped, pb.Stage_Stopping, pb.Stage_Finished}, pb.Stage_Stopping) { st.l.Info("subTask is already closed, no need to close") return } - st.closeUnits() // close all un-closed units + st.closeUnits(graceful) // close all un-closed units updateTaskMetric(st.cfg.Name, st.cfg.SourceID, pb.Stage_Stopped, st.workerName) } diff --git a/dm/dm/worker/subtask_holder.go b/dm/dm/worker/subtask_holder.go index 888b756f894..141590f4461 100644 --- a/dm/dm/worker/subtask_holder.go +++ b/dm/dm/worker/subtask_holder.go @@ -55,7 +55,7 @@ func (h *subTaskHolder) resetAllSubTasks(relay relay.Process) { defer h.mu.Unlock() for _, st := range h.subTasks { stage := st.Stage() - st.Close() + st.Close(true) // TODO: make a st.reset st.ctx, st.cancel = context.WithCancel(context.Background()) st.cfg.UseRelay = relay != nil @@ -64,11 +64,11 @@ func (h *subTaskHolder) resetAllSubTasks(relay relay.Process) { } // closeAllSubTasks closes all subtask instances. -func (h *subTaskHolder) closeAllSubTasks() { +func (h *subTaskHolder) closeAllSubTasks(graceful bool) { h.mu.Lock() defer h.mu.Unlock() for _, st := range h.subTasks { - st.Close() + st.Close(graceful) } h.subTasks = make(map[string]*SubTask) } diff --git a/dm/dm/worker/subtask_test.go b/dm/dm/worker/subtask_test.go index 13f2ef5c38f..52b7487e898 100644 --- a/dm/dm/worker/subtask_test.go +++ b/dm/dm/worker/subtask_test.go @@ -512,7 +512,7 @@ func (t *testSubTask) TestSubtaskFastQuit(c *C) { return st.Stage() == pb.Stage_Running }), IsTrue) // test Close - st.Close() + st.Close(true) select { case <-time.After(500 * time.Millisecond): c.Fatal("fail to stop subtask in 0.5s when stuck into unitTransWaitCondition") From c7f833c71d5ae2e7c5b19f5c610c9bdf02790b39 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 23 Dec 2021 16:55:21 +0800 Subject: [PATCH 04/47] graceful stop worker when lost ha --- dm/dm/worker/join.go | 3 ++- dm/dm/worker/server.go | 12 ++++++------ dm/dm/worker/server_test.go | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/dm/dm/worker/join.go b/dm/dm/worker/join.go index ece56ec1a4d..fd00a49095c 100644 --- a/dm/dm/worker/join.go +++ b/dm/dm/worker/join.go @@ -108,7 +108,8 @@ func (s *Server) KeepAlive() { failpoint.Label("bypass") // TODO: report the error. - err := s.stopWorker("", true) + // when lost keepalive, stop the worker with not graceful. this is to fix https://github.com/pingcap/tiflow/issues/3737 + err := s.stopWorker("", true, false) if err != nil { log.L().Error("fail to stop worker", zap.Error(err)) return // return if failed to stop the worker. 
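
The graceful flag passed to stopWorker here is threaded down through SourceWorker.Close and the subtask holder to each unit's Close(graceful). A minimal sketch of how a unit might honor that contract (the type, field names and the 10-second bound are illustrative only, not taken from these patches; the real syncer waits up to maxPauseOrStopWaitTime for the current transaction to end):

package example

import (
	"context"
	"sync"
	"time"
)

// sketchUnit illustrates the Close(graceful) contract: Close(false) cancels the
// Process loop immediately, Close(true) first gives in-flight work a bounded
// amount of time to drain.
type sketchUnit struct {
	mu     sync.Mutex
	closed bool
	cancel context.CancelFunc // cancels the unit's Process loop
	done   chan struct{}      // closed by Process once in-flight work has drained
}

func (u *sketchUnit) Close(graceful bool) {
	u.mu.Lock()
	if u.closed {
		u.mu.Unlock()
		return
	}
	u.closed = true
	u.mu.Unlock()

	if graceful {
		select {
		case <-u.done: // current transaction finished
		case <-time.After(10 * time.Second): // but never block for long
		}
	}
	u.cancel()
}

Close(false) skips the wait entirely, which is what the lost-keepalive path above relies on.
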
diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index 44ffb5ad367..ee257914759 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -325,7 +325,7 @@ func (s *Server) observeRelayConfig(ctx context.Context, rev int64) error { } return nil } - err = s.stopWorker("", false) + err = s.stopWorker("", false, true) if err != nil { log.L().Error("fail to stop worker", zap.Error(err)) return err // return if failed to stop the worker. @@ -414,7 +414,7 @@ func (s *Server) observeSourceBound(ctx context.Context, rev int64) error { } return nil } - err = s.stopWorker("", false) + err = s.stopWorker("", false, true) if err != nil { log.L().Error("fail to stop worker", zap.Error(err)) return err // return if failed to stop the worker. @@ -515,7 +515,7 @@ func (s *Server) setSourceStatus(source string, err error, needLock bool) { // if sourceID is set to "", worker will be closed directly // if sourceID is not "", we will check sourceID with w.cfg.SourceID. -func (s *Server) stopWorker(sourceID string, needLock bool) error { +func (s *Server) stopWorker(sourceID string, needLock, graceful bool) error { if needLock { s.Lock() defer s.Unlock() @@ -531,7 +531,7 @@ func (s *Server) stopWorker(sourceID string, needLock bool) error { s.UpdateKeepAliveTTL(s.cfg.KeepAliveTTL) s.setWorker(nil, false) s.setSourceStatus("", nil, false) - w.Close(true) + w.Close(graceful) return nil } @@ -676,7 +676,7 @@ func (s *Server) disableHandleSubtasks(source string) error { var err error if !w.relayEnabled.Load() { log.L().Info("relay is not enabled after disabling subtask, so stop worker") - err = s.stopWorker(source, false) + err = s.stopWorker(source, false, true) } return err } @@ -731,7 +731,7 @@ func (s *Server) disableRelay(source string) error { var err error if !w.subTaskEnabled.Load() { log.L().Info("subtask is not enabled after disabling relay, so stop worker") - err = s.stopWorker(source, false) + err = s.stopWorker(source, false, true) } return err } diff --git a/dm/dm/worker/server_test.go b/dm/dm/worker/server_test.go index f7c007983da..e9a578ddc82 100644 --- a/dm/dm/worker/server_test.go +++ b/dm/dm/worker/server_test.go @@ -426,7 +426,7 @@ func (t *testServer) TestWatchSourceBoundEtcdCompact(c *C) { c.Assert(cfg2, DeepEquals, sourceCfg) cancel1() wg.Wait() - c.Assert(s.stopWorker(sourceCfg.SourceID, true), IsNil) + c.Assert(s.stopWorker(sourceCfg.SourceID, true, true), IsNil) // step 5: start observeSourceBound from compacted revision again, should start worker ctx2, cancel2 := context.WithCancel(ctx) wg.Add(1) From bb57be67c5904191fe2e3c3bf82c952d42c8a883 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 24 Dec 2021 16:57:45 +0800 Subject: [PATCH 05/47] no graceful end in syncer --- dm/syncer/syncer.go | 73 ++++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index fd227df49a1..36e93eb5a05 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -157,6 +157,8 @@ type Syncer struct { waitXIDJob atomic.Int64 isTransactionEnd bool waitTransactionLock sync.Mutex + noWaitCtx context.Context // when this cancel, syncer will exit quickly no need to wait transaction end + noWaitCancel context.CancelFunc tableRouter *router.Table binlogFilter *bf.BinlogEvent @@ -273,6 +275,10 @@ func NewSyncer(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, relay rel } syncer.lastCheckpointFlushedTime = time.Time{} syncer.relay = relay + + noWaitCtx, noWaitCancel := 
context.WithCancel(context.Background()) + syncer.noWaitCtx = noWaitCtx + syncer.noWaitCancel = noWaitCancel return syncer } @@ -564,7 +570,9 @@ func (s *Syncer) reset() { s.waitXIDJob.Store(int64(noWait)) s.isTransactionEnd = true s.flushSeq = 0 - + noWaitCtx, noWaitCancel := context.WithCancel(context.Background()) + s.noWaitCtx = noWaitCtx + s.noWaitCancel = noWaitCancel switch s.cfg.ShardMode { case config.ShardPessimistic: // every time start to re-sync from resume, we reset status to make it like a fresh syncing @@ -1385,6 +1393,37 @@ func (s *Syncer) syncDML() { } } +func (s *Syncer) waitTransactionEndBeforeExit(ctx, runCtx context.Context, cancel context.CancelFunc) { + <-ctx.Done() + select { + case <-runCtx.Done(): + default: + log.L().Info("received subtask's done") + + s.waitTransactionLock.Lock() + if s.isTransactionEnd { + s.waitXIDJob.Store(int64(waitComplete)) + log.L().Info("the last job is transaction end, done directly") + cancel() + s.waitTransactionLock.Unlock() + return + } + s.waitXIDJob.Store(int64(waiting)) + s.waitTransactionLock.Unlock() + + select { + case <-runCtx.Done(): + log.L().Info("received syncer's done") + case <-s.noWaitCtx.Done(): + log.L().Info("received no wait ctx done") + cancel() + case <-time.After(maxPauseOrStopWaitTime): + log.L().Info("wait transaction end timeout") + cancel() + } + } +} + // Run starts running for sync, we should guarantee it can rerun when paused. func (s *Syncer) Run(ctx context.Context) (err error) { runCtx, runCancel := context.WithCancel(context.Background()) @@ -1397,33 +1436,8 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } }() - go func() { - <-ctx.Done() - select { - case <-runCtx.Done(): - default: - tctx.L().Info("received subtask's done") - - s.waitTransactionLock.Lock() - if s.isTransactionEnd { - s.waitXIDJob.Store(int64(waitComplete)) - tctx.L().Info("the last job is transaction end, done directly") - runCancel() - s.waitTransactionLock.Unlock() - return - } - s.waitXIDJob.Store(int64(waiting)) - s.waitTransactionLock.Unlock() - - select { - case <-runCtx.Done(): - tctx.L().Info("received syncer's done") - case <-time.After(maxPauseOrStopWaitTime): - tctx.L().Info("wait transaction end timeout") - runCancel() - } - } - }() + // start another goroutine to wait until transaction end. + go s.waitTransactionEndBeforeExit(ctx, runCtx, runCancel) // some initialization that can't be put in Syncer.Init fresh, err := s.IsFreshTask(runCtx) @@ -3168,6 +3182,9 @@ func (s *Syncer) Close(graceful bool) { return } + if !graceful { + s.noWaitCancel() + } s.stopSync() s.closeDBs() From 5acafed61046081bcb9bebcbd12309762b3beb04 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Wed, 29 Dec 2021 13:38:30 +0800 Subject: [PATCH 06/47] save work --- dm/dm/worker/subtask.go | 6 ++ dm/syncer/syncer.go | 55 +++++++++---------- .../conf/dm-worker1.toml | 1 + dm/tests/checkpoint_transaction/run.sh | 52 ++++++++++++++++++ 4 files changed, 84 insertions(+), 30 deletions(-) diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index 676b1bf1bc3..b85de02d5ea 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -359,6 +359,12 @@ func (st *SubTask) PrevUnit() unit.Unit { // closeUnits closes all un-closed units (current unit and all the subsequent units). 
func (st *SubTask) closeUnits(graceful bool) { + // when not graceful, we want to syncer to exit immediately, so we call u.Close(false) before call cancel + // Note that we only implement un graceful close for sync unit + if !graceful && st.CurrUnit().Type() == pb.UnitType_Sync { + st.l.Info("closing syncer without graceful", zap.String("task", st.cfg.Name)) + st.CurrUnit().Close(false) + } st.cancel() st.resultWg.Wait() diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 36e93eb5a05..16e93fbb94f 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -157,8 +157,8 @@ type Syncer struct { waitXIDJob atomic.Int64 isTransactionEnd bool waitTransactionLock sync.Mutex - noWaitCtx context.Context // when this cancel, syncer will exit quickly no need to wait transaction end - noWaitCancel context.CancelFunc + runCtx context.Context // this ctx is injected in `s.Run` and when this ctx cancelled, syncer will exit quickly and not wait transaction end + runCancel context.CancelFunc tableRouter *router.Table binlogFilter *bf.BinlogEvent @@ -275,10 +275,6 @@ func NewSyncer(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, relay rel } syncer.lastCheckpointFlushedTime = time.Time{} syncer.relay = relay - - noWaitCtx, noWaitCancel := context.WithCancel(context.Background()) - syncer.noWaitCtx = noWaitCtx - syncer.noWaitCancel = noWaitCancel return syncer } @@ -570,9 +566,6 @@ func (s *Syncer) reset() { s.waitXIDJob.Store(int64(noWait)) s.isTransactionEnd = true s.flushSeq = 0 - noWaitCtx, noWaitCancel := context.WithCancel(context.Background()) - s.noWaitCtx = noWaitCtx - s.noWaitCancel = noWaitCancel switch s.cfg.ShardMode { case config.ShardPessimistic: // every time start to re-sync from resume, we reset status to make it like a fresh syncing @@ -980,7 +973,7 @@ func (s *Syncer) addJob(job *job) error { s.isTransactionEnd = false failpoint.Inject("checkCheckpointInMiddleOfTransaction", func() { s.tctx.L().Info("receive dml job", zap.Any("dml job", job)) - time.Sleep(100 * time.Millisecond) + time.Sleep(500 * time.Millisecond) }) } @@ -1393,34 +1386,28 @@ func (s *Syncer) syncDML() { } } -func (s *Syncer) waitTransactionEndBeforeExit(ctx, runCtx context.Context, cancel context.CancelFunc) { - <-ctx.Done() +func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { select { - case <-runCtx.Done(): - default: - log.L().Info("received subtask's done") - + case <-ctx.Done(): // hijack the context to wait for the transaction to end. + log.L().Info("received subtask's done, try graceful stop") s.waitTransactionLock.Lock() if s.isTransactionEnd { s.waitXIDJob.Store(int64(waitComplete)) log.L().Info("the last job is transaction end, done directly") - cancel() + s.runCancel() s.waitTransactionLock.Unlock() return } s.waitXIDJob.Store(int64(waiting)) s.waitTransactionLock.Unlock() - select { - case <-runCtx.Done(): - log.L().Info("received syncer's done") - case <-s.noWaitCtx.Done(): - log.L().Info("received no wait ctx done") - cancel() + case <-s.runCtx.Done(): + log.L().Info("received run ctx done, exit now") case <-time.After(maxPauseOrStopWaitTime): - log.L().Info("wait transaction end timeout") - cancel() + log.L().Info("wait transaction end timeout, exit now") } + case <-s.runCtx.Done(): // when no graceful stop, run ctx will canceled first. 
+ log.L().Info("received ungraceful exit ctx, exit now") } } @@ -1428,6 +1415,10 @@ func (s *Syncer) waitTransactionEndBeforeExit(ctx, runCtx context.Context, cance func (s *Syncer) Run(ctx context.Context) (err error) { runCtx, runCancel := context.WithCancel(context.Background()) defer runCancel() + s.Lock() + s.runCtx = runCtx + s.runCancel = runCancel + s.Unlock() tctx := s.tctx.WithContext(runCtx) defer func() { @@ -1436,8 +1427,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } }() - // start another goroutine to wait until transaction end. - go s.waitTransactionEndBeforeExit(ctx, runCtx, runCancel) + go s.waitTransactionEndBeforeExit(ctx) // some initialization that can't be put in Syncer.Init fresh, err := s.IsFreshTask(runCtx) @@ -3183,15 +3173,20 @@ func (s *Syncer) Close(graceful bool) { } if !graceful { - s.noWaitCancel() + s.runCancel() + s.tctx.L().Warn("syncer is closing without graceful") } s.stopSync() s.closeDBs() s.checkpoint.Close() - if err := s.schemaTracker.Close(); err != nil { - s.tctx.L().Error("fail to close schema tracker", log.ShortError(err)) + if graceful { + // when close sycner without graceful, Close will be called before s.Process exit. + // if we close schematracker here worker will pending on `s.checkpoint.Rollback(s.schemaTracker)` forever + if err := s.schemaTracker.Close(); err != nil { + s.tctx.L().Error("fail to close schema tracker", log.ShortError(err)) + } } if s.sgk != nil { diff --git a/dm/tests/checkpoint_transaction/conf/dm-worker1.toml b/dm/tests/checkpoint_transaction/conf/dm-worker1.toml index 7a72ea72bf8..1a53a1dc788 100644 --- a/dm/tests/checkpoint_transaction/conf/dm-worker1.toml +++ b/dm/tests/checkpoint_transaction/conf/dm-worker1.toml @@ -1,2 +1,3 @@ name = "worker1" join = "127.0.0.1:8261" +keepalive-ttl = 1 diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 5771945384a..8562f2c1315 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -6,6 +6,35 @@ cur=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) source $cur/../_utils/test_prepare WORK_DIR=$TEST_DIR/$TEST_NAME +function check_worker_ungraceful_stop_with_retry() { + all_matched=false + + for ((k = 0; k < 10; k++)); do + sleep 1 + echo "start check_worker_ungraceful_stop_with_retry times: $k" + num=$(grep "closing syncer without graceful" $WORK_DIR/worker1/log/dm-worker.log | wc -l) + if [ $num -lt 1 ]; then + continue + fi + num=$(grep "syncer is closing" $WORK_DIR/worker1/log/dm-worker.log | wc -l) + if [ $num -lt 1 ]; then + continue + fi + num=$(grep "received ungraceful exit ctx, exit now" $WORK_DIR/worker1/log/dm-worker.log | wc -l) + if [ $num -lt 1 ]; then + continue + fi + echo "check_worker_ungraceful_stop_with_retry after retry: $k" + all_matched=true + if $all_matched; then + break + fi + done + if ! 
$all_matched; then + exit 1 + fi +} + function run() { export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/syncer/checkCheckpointInMiddleOfTransaction=return" @@ -28,6 +57,29 @@ function run() { # check diff check_sync_diff $WORK_DIR $cur/conf/diff_config.toml + # test ungraceful stop, worker will not wait transaction finish + run_sql_file $cur/data/db1.increment1.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 + sleep 2 + # kill dm-master 1 to make worker lost keepalive while a transaction is not finished + echo "kill dm-master1" + kill_dm_master + check_master_port_offline 1 + sleep 1 # wiat woker lost ha keepalive-ttl is 1 second + + # check dm-worker2 will exit quickly without waiting for the transaction to finish + check_worker_ungraceful_stop_with_retry + + # test data in tidb less than source + dataCountSource=$(mysql -uroot -h$MYSQL_HOST1 -P$MYSQL_PORT1 -p$MYSQL_PASSWORD1 -se "select count(1) from checkpoint_transaction.t1") + dataCountIntidb=$(mysql -uroot -h127.0.0.1 -P4000 -se "select count(1) from checkpoint_transaction.t1") + echo "afetr ungraceful exit data in source count: $dataCountSource data in tidb count: $dataCountIntidb" + [[ $dataCountIntidb -lt $dataCountSource ]] + + # start dm-master again task will be resume, and data will be synced + run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml + check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT + check_sync_diff $WORK_DIR $cur/conf/diff_config.toml + run_sql_file $cur/data/db1.increment1.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 # wait transaction start # you can see why sleep in https://github.com/pingcap/dm/pull/1928#issuecomment-895820239 From 0e68f2419194925da7dc8c27ca33404715678010 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 30 Dec 2021 16:39:10 +0800 Subject: [PATCH 07/47] fix comment --- dm/dm/worker/subtask.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index b85de02d5ea..c2003b4bbba 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -360,7 +360,7 @@ func (st *SubTask) PrevUnit() unit.Unit { // closeUnits closes all un-closed units (current unit and all the subsequent units). 
func (st *SubTask) closeUnits(graceful bool) { // when not graceful, we want to syncer to exit immediately, so we call u.Close(false) before call cancel - // Note that we only implement un graceful close for sync unit + // note that we only implement ungraceful close for sync unit if !graceful && st.CurrUnit().Type() == pb.UnitType_Sync { st.l.Info("closing syncer without graceful", zap.String("task", st.cfg.Name)) st.CurrUnit().Close(false) From 6081bce35ebc7cb8f6d7fe2613e0123493e93b22 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 4 Jan 2022 10:37:35 +0800 Subject: [PATCH 08/47] address comment and trigger test --- dm/cmd/dm-syncer/main.go | 2 +- dm/dm/unit/unit.go | 5 ++-- dm/dm/worker/join.go | 2 +- dm/dm/worker/server.go | 6 ++-- dm/dm/worker/source_worker.go | 18 +++++++----- dm/dm/worker/source_worker_test.go | 16 +++++------ dm/dm/worker/subtask.go | 39 +++++++++++++++++--------- dm/dm/worker/subtask_holder.go | 16 +++++++++-- dm/dm/worker/subtask_test.go | 6 ++-- dm/dumpling/dumpling.go | 8 +++++- dm/dumpling/dumpling_test.go | 2 +- dm/loader/lightning.go | 8 +++++- dm/loader/loader.go | 8 +++++- dm/syncer/syncer.go | 25 +++++++++++------ dm/syncer/syncer_test.go | 4 +-- dm/tests/checkpoint_transaction/run.sh | 7 +++-- 16 files changed, 114 insertions(+), 58 deletions(-) diff --git a/dm/cmd/dm-syncer/main.go b/dm/cmd/dm-syncer/main.go index 41928a55683..0f89de19696 100644 --- a/dm/cmd/dm-syncer/main.go +++ b/dm/cmd/dm-syncer/main.go @@ -102,7 +102,7 @@ func main() { } // 5. close the syncer - sync.Close(true) + sync.Close() log.L().Info("dm-syncer exit") // 6. flush log diff --git a/dm/dm/unit/unit.go b/dm/dm/unit/unit.go index 58d59aa102f..363d17bf2cc 100644 --- a/dm/dm/unit/unit.go +++ b/dm/dm/unit/unit.go @@ -47,8 +47,9 @@ type Unit interface { Process(ctx context.Context, pr chan pb.ProcessResult) // Close shuts down the process and closes the unit, after that can not call Process to resume // The implementation should not block for a long time. - // if graceful is true, unit is allowed to wait for its internal logic to complete, otherwise it should be closed immediately - Close(graceful bool) + Close() + // Kill shuts down the process and closes the unit without graceful. + Kill() // Pause does some cleanups and the unit can be resumed later. The caller will make sure Process has returned. // The implementation should not block for a long time. Pause() diff --git a/dm/dm/worker/join.go b/dm/dm/worker/join.go index fd00a49095c..e9d18ba5bbb 100644 --- a/dm/dm/worker/join.go +++ b/dm/dm/worker/join.go @@ -108,7 +108,7 @@ func (s *Server) KeepAlive() { failpoint.Label("bypass") // TODO: report the error. - // when lost keepalive, stop the worker with not graceful. this is to fix https://github.com/pingcap/tiflow/issues/3737 + // when lost keepalive, stop the worker without graceful. 
this is to fix https://github.com/pingcap/tiflow/issues/3737 err := s.stopWorker("", true, false) if err != nil { log.L().Error("fail to stop worker", zap.Error(err)) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index ee257914759..b3c778c2d21 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -450,9 +450,9 @@ func (s *Server) doClose() { if s.closed.Load() { return } - // close worker and wait for return + // stop worker and wait for return if w := s.getWorker(false); w != nil { - w.Close(true) + w.Stop(true) } s.closed.Store(true) } @@ -531,7 +531,7 @@ func (s *Server) stopWorker(sourceID string, needLock, graceful bool) error { s.UpdateKeepAliveTTL(s.cfg.KeepAliveTTL) s.setWorker(nil, false) s.setSourceStatus("", nil, false) - w.Close(graceful) + w.Stop(graceful) return nil } diff --git a/dm/dm/worker/source_worker.go b/dm/dm/worker/source_worker.go index 3efefd45f58..efcf8e04490 100644 --- a/dm/dm/worker/source_worker.go +++ b/dm/dm/worker/source_worker.go @@ -115,7 +115,7 @@ func NewSourceWorker( if err != nil { // when err != nil, `w` will become nil in this func, so we pass `w` in defer. // release resources, NOTE: we need to refactor New/Init/Start/Close for components later. w2.cancel() - w2.subTaskHolder.closeAllSubTasks(true) + w2.subTaskHolder.closeAllSubTasks() } }(w) @@ -196,7 +196,7 @@ func (w *SourceWorker) Start() { } // Close stops working and releases resources. -func (w *SourceWorker) Close(graceful bool) { +func (w *SourceWorker) Stop(graceful bool) { if w.closed.Load() { w.l.Warn("already closed") return @@ -211,8 +211,12 @@ func (w *SourceWorker) Close(graceful bool) { w.Lock() defer w.Unlock() - // close all sub tasks - w.subTaskHolder.closeAllSubTasks(graceful) + // close or kill all subtasks + if graceful { + w.subTaskHolder.closeAllSubTasks() + } else { + w.subTaskHolder.killAllSubTasks() + } if w.relayHolder != nil { w.relayHolder.Close() @@ -464,7 +468,7 @@ func (w *SourceWorker) EnableHandleSubtasks() error { // "for range" of a map will use same value address, so we'd better not pass value address to other function clone := subTaskCfg if err2 := w.StartSubTask(&clone, expectStage.Expect, false); err2 != nil { - w.subTaskHolder.closeAllSubTasks(true) + w.subTaskHolder.closeAllSubTasks() return err2 } } @@ -497,7 +501,7 @@ func (w *SourceWorker) DisableHandleSubtasks() { defer w.Unlock() // close all sub tasks - w.subTaskHolder.closeAllSubTasks(true) + w.subTaskHolder.closeAllSubTasks() w.l.Info("handling subtask enabled") } @@ -604,7 +608,7 @@ func (w *SourceWorker) OperateSubTask(name string, op pb.TaskOp) error { switch op { case pb.TaskOp_Stop: w.l.Info("stop sub task", zap.String("task", name)) - st.Close(true) + st.Close() w.subTaskHolder.removeSubTask(name) case pb.TaskOp_Pause: w.l.Info("pause sub task", zap.String("task", name)) diff --git a/dm/dm/worker/source_worker_test.go b/dm/dm/worker/source_worker_test.go index 574fd7f4020..770b7ec7983 100644 --- a/dm/dm/worker/source_worker_test.go +++ b/dm/dm/worker/source_worker_test.go @@ -88,11 +88,11 @@ func (t *testServer) testWorker(c *C) { c.Assert(err, IsNil) c.Assert(w.GetUnitAndSourceStatusJSON("", nil), HasLen, emptyWorkerStatusInfoJSONLength) - // close twice - w.Close(true) + // stop twice + w.Stop(true) c.Assert(w.closed.Load(), IsTrue) c.Assert(w.subTaskHolder.getAllSubTasks(), HasLen, 0) - w.Close(true) + w.Stop(true) c.Assert(w.closed.Load(), IsTrue) c.Assert(w.subTaskHolder.getAllSubTasks(), HasLen, 0) c.Assert(w.closed.Load(), IsTrue) @@ -294,7 +294,7 @@ 
func (t *testWorkerFunctionalities) TestWorkerFunctionalities(c *C) { // start worker w, err := NewSourceWorker(sourceCfg, etcdCli, "", "") c.Assert(err, IsNil) - defer w.Close(true) + defer w.Stop(true) go func() { w.Start() }() @@ -467,7 +467,7 @@ func (t *testWorkerEtcdCompact) TestWatchSubtaskStageEtcdCompact(c *C) { c.Assert(err, IsNil) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - defer w.Close(true) + defer w.Stop(true) go func() { w.Start() }() @@ -530,7 +530,7 @@ func (t *testWorkerEtcdCompact) TestWatchSubtaskStageEtcdCompact(c *C) { c.Assert(status[0].Stage, Equals, pb.Stage_Running) cancel1() wg.Wait() - w.subTaskHolder.closeAllSubTasks(true) + w.subTaskHolder.closeAllSubTasks() // step 5: restart observe and start from startRev, this subtask should be added ctx2, cancel2 := context.WithCancel(ctx) wg.Add(1) @@ -548,7 +548,7 @@ func (t *testWorkerEtcdCompact) TestWatchSubtaskStageEtcdCompact(c *C) { c.Assert(status, HasLen, 1) c.Assert(status[0].Name, Equals, subtaskCfg.Name) c.Assert(status[0].Stage, Equals, pb.Stage_Running) - w.Close(true) + w.Stop(true) cancel2() wg.Wait() } @@ -586,7 +586,7 @@ func (t *testWorkerEtcdCompact) TestWatchRelayStageEtcdCompact(c *C) { c.Assert(err, IsNil) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - defer w.Close(true) + defer w.Stop(true) go func() { c.Assert(w.EnableRelay(false), IsNil) w.Start() diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index c2003b4bbba..5d37f2ae025 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -157,7 +157,7 @@ func (st *SubTask) initUnits(relay relay.Process) error { var needCloseUnits []unit.Unit defer func() { for _, u := range needCloseUnits { - u.Close(true) + u.Close() } st.initialized.Store(initializeUnitSuccess) @@ -311,7 +311,7 @@ func (st *SubTask) fetchResultAndUpdateStage(pr chan pb.ProcessResult) { switch stage { case pb.Stage_Finished: - cu.Close(true) + cu.Close() nu := st.getNextUnit() if nu == nil { // Now, when finished, it only stops the process @@ -358,13 +358,7 @@ func (st *SubTask) PrevUnit() unit.Unit { } // closeUnits closes all un-closed units (current unit and all the subsequent units). -func (st *SubTask) closeUnits(graceful bool) { - // when not graceful, we want to syncer to exit immediately, so we call u.Close(false) before call cancel - // note that we only implement ungraceful close for sync unit - if !graceful && st.CurrUnit().Type() == pb.UnitType_Sync { - st.l.Info("closing syncer without graceful", zap.String("task", st.cfg.Name)) - st.CurrUnit().Close(false) - } +func (st *SubTask) closeUnits() { st.cancel() st.resultWg.Wait() @@ -385,8 +379,16 @@ func (st *SubTask) closeUnits(graceful bool) { for i := cui; i < len(st.units); i++ { u := st.units[i] - st.l.Info("closing unit process", zap.Stringer("unit", cu.Type()), zap.Bool("graceful", graceful)) - u.Close(graceful) + st.l.Info("closing unit process", zap.Stringer("unit", cu.Type())) + u.Close() + } +} + +func (st *SubTask) killCurrentUnit() { + // note that we only implement ungraceful stop for sync unit + if st.CurrUnit().Type() == pb.UnitType_Sync { + st.l.Info("kill syncer unit", zap.String("task", st.cfg.Name)) + st.CurrUnit().Kill() } } @@ -483,14 +485,25 @@ func (st *SubTask) Result() *pb.ProcessResult { } // Close stops the sub task. 
-func (st *SubTask) Close(graceful bool) { +func (st *SubTask) Close() { st.l.Info("closing") if !st.setStageIfNotIn([]pb.Stage{pb.Stage_Stopped, pb.Stage_Stopping, pb.Stage_Finished}, pb.Stage_Stopping) { st.l.Info("subTask is already closed, no need to close") return } + st.closeUnits() // close all un-closed units + updateTaskMetric(st.cfg.Name, st.cfg.SourceID, pb.Stage_Stopped, st.workerName) +} - st.closeUnits(graceful) // close all un-closed units +// kill kill running unit and stop the sub task. +func (st *SubTask) Kill() { + st.l.Info("killing") + if !st.setStageIfNotIn([]pb.Stage{pb.Stage_Stopped, pb.Stage_Stopping, pb.Stage_Finished}, pb.Stage_Stopping) { + st.l.Info("subTask is already closed, no need to close") + return + } + st.killCurrentUnit() + st.closeUnits() // close all un-closed units updateTaskMetric(st.cfg.Name, st.cfg.SourceID, pb.Stage_Stopped, st.workerName) } diff --git a/dm/dm/worker/subtask_holder.go b/dm/dm/worker/subtask_holder.go index 141590f4461..ebce2494671 100644 --- a/dm/dm/worker/subtask_holder.go +++ b/dm/dm/worker/subtask_holder.go @@ -55,7 +55,7 @@ func (h *subTaskHolder) resetAllSubTasks(relay relay.Process) { defer h.mu.Unlock() for _, st := range h.subTasks { stage := st.Stage() - st.Close(true) + st.Close() // TODO: make a st.reset st.ctx, st.cancel = context.WithCancel(context.Background()) st.cfg.UseRelay = relay != nil @@ -64,11 +64,21 @@ func (h *subTaskHolder) resetAllSubTasks(relay relay.Process) { } // closeAllSubTasks closes all subtask instances. -func (h *subTaskHolder) closeAllSubTasks(graceful bool) { +func (h *subTaskHolder) closeAllSubTasks() { h.mu.Lock() defer h.mu.Unlock() for _, st := range h.subTasks { - st.Close(graceful) + st.Close() + } + h.subTasks = make(map[string]*SubTask) +} + +// killAllSubTasks kill and stop all subtask instances. +func (h *subTaskHolder) killAllSubTasks() { + h.mu.Lock() + defer h.mu.Unlock() + for _, st := range h.subTasks { + st.Kill() } h.subTasks = make(map[string]*SubTask) } diff --git a/dm/dm/worker/subtask_test.go b/dm/dm/worker/subtask_test.go index 52b7487e898..5cdcbb747ef 100644 --- a/dm/dm/worker/subtask_test.go +++ b/dm/dm/worker/subtask_test.go @@ -116,7 +116,9 @@ func (m *MockUnit) Process(ctx context.Context, pr chan pb.ProcessResult) { } } -func (m *MockUnit) Close(graceful bool) {} +func (m *MockUnit) Close() {} + +func (m *MockUnit) Kill() {} func (m MockUnit) Pause() {} @@ -512,7 +514,7 @@ func (t *testSubTask) TestSubtaskFastQuit(c *C) { return st.Stage() == pb.Stage_Running }), IsTrue) // test Close - st.Close(true) + st.Close() select { case <-time.After(500 * time.Millisecond): c.Fatal("fail to stop subtask in 0.5s when stuck into unitTransWaitCondition") diff --git a/dm/dumpling/dumpling.go b/dm/dumpling/dumpling.go index d64bbe5bb99..ea14cbd2eb5 100644 --- a/dm/dumpling/dumpling.go +++ b/dm/dumpling/dumpling.go @@ -165,7 +165,7 @@ func (m *Dumpling) Process(ctx context.Context, pr chan pb.ProcessResult) { } // Close implements Unit.Close. -func (m *Dumpling) Close(graceful bool) { +func (m *Dumpling) Close() { if m.closed.Load() { return } @@ -175,6 +175,12 @@ func (m *Dumpling) Close(graceful bool) { m.closed.Store(true) } +// Kill implements Unit.Kill. +func (m *Dumpling) Kill() { + // TODO: implement kill + m.Close() +} + // Pause implements Unit.Pause. 
func (m *Dumpling) Pause() { if m.closed.Load() { diff --git a/dm/dumpling/dumpling_test.go b/dm/dumpling/dumpling_test.go index 9308ccb68ea..04deaef0429 100644 --- a/dm/dumpling/dumpling_test.go +++ b/dm/dumpling/dumpling_test.go @@ -147,7 +147,7 @@ func (t *testDumplingSuite) TestCallStatus(c *C) { dumpling, _ := export.NewDumper(ctx, dumpConf) m.core = dumpling - m.Close(true) + m.Close() s = m.Status(nil).(*pb.DumpStatus) c.Assert(s.CompletedTables, Equals, float64(0)) c.Assert(s.FinishedBytes, Equals, float64(0)) diff --git a/dm/loader/lightning.go b/dm/loader/lightning.go index 1fbb0356643..2cb7c11e99f 100644 --- a/dm/loader/lightning.go +++ b/dm/loader/lightning.go @@ -279,12 +279,18 @@ func (l *LightningLoader) IsFreshTask(ctx context.Context) (bool, error) { } // Close does graceful shutdown. -func (l *LightningLoader) Close(graceful bool) { +func (l *LightningLoader) Close() { l.Pause() l.checkPointList.Close() l.closed.Store(true) } +// Kill does ungraceful shutdown. +func (l *LightningLoader) Kill() { + // TODO: implement kill + l.Close() +} + // Pause pauses the process, and it can be resumed later // should cancel context from external. func (l *LightningLoader) Pause() { diff --git a/dm/loader/loader.go b/dm/loader/loader.go index ce2a5782e01..de3835ded64 100644 --- a/dm/loader/loader.go +++ b/dm/loader/loader.go @@ -787,7 +787,7 @@ func (l *Loader) loadFinishedSize() { } // Close does graceful shutdown. -func (l *Loader) Close(graceful bool) { +func (l *Loader) Close() { l.Lock() defer l.Unlock() if l.isClosed() { @@ -804,6 +804,12 @@ func (l *Loader) Close(graceful bool) { l.closed.Store(true) } +// Kill kill the loader without graceful. +func (l *Loader) Kill() { + // TODO: implement kill + l.Close() +} + // stopLoad stops loading, now it used by Close and Pause // maybe we can refine the workflow more clear. func (l *Loader) stopLoad() { diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 16e93fbb94f..61517cea76e 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -3163,8 +3163,7 @@ func (s *Syncer) isClosed() bool { return s.closed.Load() } -// Close closes syncer. -func (s *Syncer) Close(graceful bool) { +func (s *Syncer) close(closeTracker bool) { s.Lock() defer s.Unlock() @@ -3172,18 +3171,14 @@ func (s *Syncer) Close(graceful bool) { return } - if !graceful { - s.runCancel() - s.tctx.L().Warn("syncer is closing without graceful") - } s.stopSync() s.closeDBs() s.checkpoint.Close() - if graceful { - // when close sycner without graceful, Close will be called before s.Process exit. - // if we close schematracker here worker will pending on `s.checkpoint.Rollback(s.schemaTracker)` forever + // when close sycner without graceful, Close will be called before s.Process exit. + // if we close schematracker worker will pending on `s.checkpoint.Rollback(s.schemaTracker)` forever + if closeTracker { if err := s.schemaTracker.Close(); err != nil { s.tctx.L().Error("fail to close schema tracker", log.ShortError(err)) } @@ -3203,6 +3198,18 @@ func (s *Syncer) Close(graceful bool) { s.closed.Store(true) } +// Close closes syncer. +func (s *Syncer) Close() { + s.close(true) +} + +// Kill kill syncer without graceful. +func (s *Syncer) Kill() { + s.tctx.L().Warn("kill syncer without graceful") + s.runCancel() + s.close(false) +} + // stopSync stops syncing, now it used by Close and Pause // maybe we can refine the workflow more clear. 
func (s *Syncer) stopSync() { diff --git a/dm/syncer/syncer_test.go b/dm/syncer/syncer_test.go index 26472219497..b487080b170 100644 --- a/dm/syncer/syncer_test.go +++ b/dm/syncer/syncer_test.go @@ -1015,7 +1015,7 @@ func (s *testSyncerSuite) TestRun(c *C) { testJobs.RUnlock() cancel() - syncer.Close(true) + syncer.Close() c.Assert(syncer.isClosed(), IsTrue) if err := mock.ExpectationsWereMet(); err != nil { @@ -1191,7 +1191,7 @@ func (s *testSyncerSuite) TestExitSafeModeByConfig(c *C) { testJobs.Unlock() cancel() - syncer.Close(true) + syncer.Close() c.Assert(syncer.isClosed(), IsTrue) if err := mock.ExpectationsWereMet(); err != nil { diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 8562f2c1315..608cf0d0ef1 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -12,11 +12,12 @@ function check_worker_ungraceful_stop_with_retry() { for ((k = 0; k < 10; k++)); do sleep 1 echo "start check_worker_ungraceful_stop_with_retry times: $k" - num=$(grep "closing syncer without graceful" $WORK_DIR/worker1/log/dm-worker.log | wc -l) + + num=$(grep "kill syncer unit" $WORK_DIR/worker1/log/dm-worker.log | wc -l) if [ $num -lt 1 ]; then continue fi - num=$(grep "syncer is closing" $WORK_DIR/worker1/log/dm-worker.log | wc -l) + num=$(grep "kill syncer without graceful" $WORK_DIR/worker1/log/dm-worker.log | wc -l) if [ $num -lt 1 ]; then continue fi @@ -24,9 +25,9 @@ function check_worker_ungraceful_stop_with_retry() { if [ $num -lt 1 ]; then continue fi - echo "check_worker_ungraceful_stop_with_retry after retry: $k" all_matched=true if $all_matched; then + echo "check_worker_ungraceful_stop_with_retry after retry: $k" break fi done From 1ec3c56b762ba1af8f9ee6fb4d24373163bdf62a Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Wed, 12 Jan 2022 14:16:09 +0800 Subject: [PATCH 09/47] rollback checkpoint before close schematracker --- dm/syncer/syncer.go | 39 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 61517cea76e..5cbe215991c 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -694,14 +694,6 @@ func (s *Syncer) Process(ctx context.Context, pr chan pb.ProcessResult) { default: } - // try to rollback checkpoints, if they already flushed, no effect - prePos := s.checkpoint.GlobalPoint() - s.checkpoint.Rollback(s.schemaTracker) - currPos := s.checkpoint.GlobalPoint() - if binlog.CompareLocation(prePos, currPos, s.cfg.EnableGTID) != 0 { - s.tctx.L().Warn("something wrong with rollback global checkpoint", zap.Stringer("previous position", prePos), zap.Stringer("current position", currPos)) - } - pr <- pb.ProcessResult{ IsCanceled: isCanceled, Errors: errs, @@ -3163,7 +3155,8 @@ func (s *Syncer) isClosed() bool { return s.closed.Load() } -func (s *Syncer) close(closeTracker bool) { +// Close closes syncer. 
+func (s *Syncer) Close() { s.Lock() defer s.Unlock() @@ -3174,40 +3167,32 @@ func (s *Syncer) close(closeTracker bool) { s.stopSync() s.closeDBs() + // try to rollback checkpoints, if they already flushed, no effect, this operation should call before close schemaTracker + prePos := s.checkpoint.GlobalPoint() + s.checkpoint.Rollback(s.schemaTracker) + currPos := s.checkpoint.GlobalPoint() + if binlog.CompareLocation(prePos, currPos, s.cfg.EnableGTID) != 0 { + s.tctx.L().Warn("something wrong with rollback global checkpoint", zap.Stringer("previous position", prePos), zap.Stringer("current position", currPos)) + } s.checkpoint.Close() - - // when close sycner without graceful, Close will be called before s.Process exit. - // if we close schematracker worker will pending on `s.checkpoint.Rollback(s.schemaTracker)` forever - if closeTracker { - if err := s.schemaTracker.Close(); err != nil { - s.tctx.L().Error("fail to close schema tracker", log.ShortError(err)) - } + if err := s.schemaTracker.Close(); err != nil { + s.tctx.L().Error("fail to close schema tracker", log.ShortError(err)) } - if s.sgk != nil { s.sgk.Close() } - s.closeOnlineDDL() - // when closing syncer by `stop-task`, remove active relay log from hub s.removeActiveRelayLog() - metrics.RemoveLabelValuesWithTaskInMetrics(s.cfg.Name) - s.closed.Store(true) } -// Close closes syncer. -func (s *Syncer) Close() { - s.close(true) -} - // Kill kill syncer without graceful. func (s *Syncer) Kill() { s.tctx.L().Warn("kill syncer without graceful") s.runCancel() - s.close(false) + s.Close() } // stopSync stops syncing, now it used by Close and Pause From 63e87ca449878a74f98356a6c9ec2cc83fdf4629 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Wed, 12 Jan 2022 15:16:11 +0800 Subject: [PATCH 10/47] use runctx to cancel dml worker --- dm/syncer/dml_worker.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index 051bb140f05..cfa268e8804 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -14,6 +14,7 @@ package syncer import ( + "context" "time" "github.com/pingcap/errors" @@ -36,7 +37,7 @@ type DMLWorker struct { chanSize int multipleRows bool toDBConns []*dbconn.DBConn - tctx *tcontext.Context + runCtx context.Context logger log.Logger // for metrics @@ -75,7 +76,7 @@ func dmlWorkerWrap(inCh chan *job, syncer *Syncer) chan *job { fatalFunc: syncer.fatalFunc, lagFunc: syncer.updateReplicationJobTS, addCountFunc: syncer.addCount, - tctx: syncer.tctx, + runCtx: syncer.runCtx, toDBConns: syncer.toDBConns, inCh: inCh, flushCh: make(chan *job), @@ -236,9 +237,9 @@ func (w *DMLWorker) executeBatchJobs(queueID int, jobs []*job) { time.Sleep(time.Duration(t) * time.Second) }) // use background context to execute sqls as much as possible - ctx, cancel := w.tctx.WithTimeout(maxDMLExecutionDuration) + ctx, cancel := context.WithTimeout(w.runCtx, maxDMLExecutionDuration) defer cancel() - affect, err = db.ExecuteSQL(ctx, queries, args...) + affect, err = db.ExecuteSQL(tcontext.NewContext(ctx, w.logger), queries, args...) 
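// Presumably the point of deriving this timeout from w.runCtx (commit message:
// "use runctx to cancel dml worker") rather than from the unit's own tctx is that
// cancelling runCtx on an ungraceful stop also aborts SQL that is still executing,
// while maxDMLExecutionDuration continues to bound each batch.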
failpoint.Inject("SafeModeExit", func(val failpoint.Value) { if intVal, ok := val.(int); ok && intVal == 4 && len(jobs) > 0 { w.logger.Warn("fail to exec DML", zap.String("failpoint", "SafeModeExit")) From d743f1d2112fcc2a28261a1b0ab74f2066b309ba Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 14 Jan 2022 09:09:29 +0800 Subject: [PATCH 11/47] address comment --- dm/dm/worker/server.go | 2 +- dm/dm/worker/subtask.go | 3 +-- dm/syncer/syncer.go | 20 ++++++++++---------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index aae087216e8..3c5bca79001 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -459,8 +459,8 @@ func (s *Server) doClose() { // Close close the RPC server, this function can be called multiple times. func (s *Server) Close() { + s.doClose() // we should stop current sync first, other wise master may scheduler task on new worker while we are closing s.stopKeepAlive() - s.doClose() } // if needLock is false, we should make sure Server has been locked in caller. diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index c9f30d8de8a..bfefb863794 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -385,8 +385,7 @@ func (st *SubTask) closeUnits() { } func (st *SubTask) killCurrentUnit() { - // note that we only implement ungraceful stop for sync unit - if st.CurrUnit().Type() == pb.UnitType_Sync { + if st.CurrUnit() != nil && st.CurrUnit().Type() == pb.UnitType_Sync { st.l.Info("kill syncer unit", zap.String("task", st.cfg.Name)) st.CurrUnit().Kill() } diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 4f7439deafc..a3a1973beac 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -157,7 +157,7 @@ type Syncer struct { waitXIDJob atomic.Int64 isTransactionEnd bool waitTransactionLock sync.Mutex - runCtx context.Context // this ctx is injected in `s.Run` and when this ctx cancelled, syncer will exit quickly and not wait transaction end + runCtx context.Context // this (background)ctx is injected in `s.Run` and when this ctx cancelled, syncer will exit quickly and not wait transaction end runCancel context.CancelFunc tableRouter *router.Table @@ -3344,14 +3344,6 @@ func (s *Syncer) Close() { s.stopSync() s.closeDBs() - - // try to rollback checkpoints, if they already flushed, no effect, this operation should call before close schemaTracker - prePos := s.checkpoint.GlobalPoint() - s.checkpoint.Rollback(s.schemaTracker) - currPos := s.checkpoint.GlobalPoint() - if binlog.CompareLocation(prePos, currPos, s.cfg.EnableGTID) != 0 { - s.tctx.L().Warn("something wrong with rollback global checkpoint", zap.Stringer("previous position", prePos), zap.Stringer("current position", currPos)) - } s.checkpoint.Close() if err := s.schemaTracker.Close(); err != nil { s.tctx.L().Error("fail to close schema tracker", log.ShortError(err)) @@ -3373,7 +3365,7 @@ func (s *Syncer) Kill() { s.Close() } -// stopSync stops syncing, now it used by Close and Pause +// stopSync stops syncing and rollback checkpoint now it used by Close and Pause // maybe we can refine the workflow more clear. 
func (s *Syncer) stopSync() { if s.done != nil { @@ -3388,6 +3380,14 @@ func (s *Syncer) stopSync() { if s.streamerController != nil { s.streamerController.Close(s.tctx) } + + // try to rollback checkpoints, if they already flushed, no effect, this operation should call before close schemaTracker + prePos := s.checkpoint.GlobalPoint() + s.checkpoint.Rollback(s.schemaTracker) + currPos := s.checkpoint.GlobalPoint() + if binlog.CompareLocation(prePos, currPos, s.cfg.EnableGTID) != 0 { + s.tctx.L().Warn("something wrong with rollback global checkpoint", zap.Stringer("previous position", prePos), zap.Stringer("current position", currPos)) + } } func (s *Syncer) closeOnlineDDL() { From 9b9a5583568b7808db6aae99413cb00be2fe176c Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Mon, 17 Jan 2022 14:23:06 +0800 Subject: [PATCH 12/47] address some comments --- dm/dm/worker/subtask.go | 2 +- dm/syncer/syncer.go | 24 ++++++++++++------------ dm/tests/checkpoint_transaction/run.sh | 9 ++++++--- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index bfefb863794..e94ec61693b 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -385,7 +385,7 @@ func (st *SubTask) closeUnits() { } func (st *SubTask) killCurrentUnit() { - if st.CurrUnit() != nil && st.CurrUnit().Type() == pb.UnitType_Sync { + if st.CurrUnit() != nil { st.l.Info("kill syncer unit", zap.String("task", st.cfg.Name)) st.CurrUnit().Kill() } diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index a3a1973beac..c2b8753714f 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -1127,7 +1127,7 @@ func (s *Syncer) flushCheckPoints() error { snapshotInfo, exceptTables, shardMetaSQLs, shardMetaArgs := s.createCheckpointSnapshot(true) if snapshotInfo == nil { - log.L().Info("checkpoint has no change, skip sync flush checkpoint") + s.tctx.L().Info("checkpoint has no change, skip sync flush checkpoint") return nil } @@ -1163,7 +1163,7 @@ func (s *Syncer) flushCheckPointsAsync(asyncFlushJob *job) { snapshotInfo, exceptTables, shardMetaSQLs, shardMetaArgs := s.createCheckpointSnapshot(false) if snapshotInfo == nil { - log.L().Info("checkpoint has no change, skip async flush checkpoint", zap.Int64("job seq", asyncFlushJob.flushSeq)) + s.tctx.L().Info("checkpoint has no change, skip async flush checkpoint", zap.Int64("job seq", asyncFlushJob.flushSeq)) return } @@ -1411,11 +1411,11 @@ func (s *Syncer) syncDML() { func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { select { case <-ctx.Done(): // hijack the context to wait for the transaction to end. - log.L().Info("received subtask's done, try graceful stop") + s.tctx.L().Info("received subtask's done, try graceful stop") s.waitTransactionLock.Lock() if s.isTransactionEnd { s.waitXIDJob.Store(int64(waitComplete)) - log.L().Info("the last job is transaction end, done directly") + s.tctx.L().Info("the last job is transaction end, done directly") s.runCancel() s.waitTransactionLock.Unlock() return @@ -1424,12 +1424,12 @@ func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { s.waitTransactionLock.Unlock() select { case <-s.runCtx.Done(): - log.L().Info("received run ctx done, exit now") + s.tctx.L().Info("received run ctx done, exit now") case <-time.After(maxPauseOrStopWaitTime): - log.L().Info("wait transaction end timeout, exit now") + s.tctx.L().Info("wait transaction end timeout, exit now") } case <-s.runCtx.Done(): // when no graceful stop, run ctx will canceled first. 
- log.L().Info("received ungraceful exit ctx, exit now") + s.tctx.L().Info("received ungraceful exit ctx, exit now") } } @@ -1716,7 +1716,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { i++ } } - log.L().Info("discard event already consumed", zap.Int("count", n), + s.tctx.L().Info("discard event already consumed", zap.Int("count", n), zap.Any("cur_loc", currentLocation)) return nil } @@ -1801,7 +1801,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if err == nil { continue } - log.L().Warn("skip duplicate rows event failed", zap.Error(err)) + s.tctx.L().Warn("skip duplicate rows event failed", zap.Error(err)) } if err != nil { @@ -1817,7 +1817,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if err != nil { return err } - log.L().Info("reset replication binlog puller", zap.Any("pos", s.checkpoint.GlobalPoint())) + s.tctx.L().Info("reset replication binlog puller", zap.Any("pos", s.checkpoint.GlobalPoint())) if err = maybeSkipNRowsEvent(eventIndex); err != nil { return err } @@ -2465,7 +2465,7 @@ func (s *Syncer) handleQueryEvent(ev *replication.QueryEvent, ec eventContext, o qec.p, err = event.GetParserForStatusVars(ev.StatusVars) if err != nil { - log.L().Warn("found error when get sql_mode from binlog status_vars", zap.Error(err)) + s.tctx.L().Warn("found error when get sql_mode from binlog status_vars", zap.Error(err)) } stmt, err := parseOneStmt(qec) @@ -3076,7 +3076,7 @@ func (s *Syncer) trackOriginDDL(ev *replication.QueryEvent, ec eventContext) (ma } qec.p, err = event.GetParserForStatusVars(ev.StatusVars) if err != nil { - log.L().Warn("found error when get sql_mode from binlog status_vars", zap.Error(err)) + s.tctx.L().Warn("found error when get sql_mode from binlog status_vars", zap.Error(err)) } stmt, err := parseOneStmt(qec) if err != nil { diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 608cf0d0ef1..d9e740143dd 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -27,13 +27,16 @@ function check_worker_ungraceful_stop_with_retry() { fi all_matched=true if $all_matched; then - echo "check_worker_ungraceful_stop_with_retry after retry: $k" + echo "check_worker_ungraceful_stop_with_retry success after retry: $k" break fi done + if ! 
$all_matched; then + echo "check_worker_ungraceful_stop_with_retry failed after retry: $k" exit 1 fi + } function run() { @@ -65,9 +68,9 @@ function run() { echo "kill dm-master1" kill_dm_master check_master_port_offline 1 - sleep 1 # wiat woker lost ha keepalive-ttl is 1 second + sleep 1 # wait worker lost keep alive ttl is 1 second - # check dm-worker2 will exit quickly without waiting for the transaction to finish + # check dm-worker will exit quickly without waiting for the transaction to finish check_worker_ungraceful_stop_with_retry # test data in tidb less than source From 5d88e4bdd46c58fe9c7aa9b240dace35f5420b7d Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Mon, 17 Jan 2022 17:58:56 +0800 Subject: [PATCH 13/47] savework: refine syncer ctx --- dm/dm/worker/subtask.go | 2 +- dm/syncer/dml_worker.go | 9 +- dm/syncer/syncer.go | 199 +++++++++++++------------ dm/tests/checkpoint_transaction/run.sh | 2 +- 4 files changed, 112 insertions(+), 100 deletions(-) diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index e94ec61693b..8c17e345aea 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -386,7 +386,7 @@ func (st *SubTask) closeUnits() { func (st *SubTask) killCurrentUnit() { if st.CurrUnit() != nil { - st.l.Info("kill syncer unit", zap.String("task", st.cfg.Name)) + st.l.Info("kill syncer unit", zap.String("task", st.cfg.Name), zap.Stringer("unit", st.CurrUnit().Type())) st.CurrUnit().Kill() } } diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index cfa268e8804..edbcf4bd1f3 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -14,7 +14,6 @@ package syncer import ( - "context" "time" "github.com/pingcap/errors" @@ -37,7 +36,7 @@ type DMLWorker struct { chanSize int multipleRows bool toDBConns []*dbconn.DBConn - runCtx context.Context + runTCtx *tcontext.Context logger log.Logger // for metrics @@ -76,7 +75,7 @@ func dmlWorkerWrap(inCh chan *job, syncer *Syncer) chan *job { fatalFunc: syncer.fatalFunc, lagFunc: syncer.updateReplicationJobTS, addCountFunc: syncer.addCount, - runCtx: syncer.runCtx, + runTCtx: syncer.runTCtx, toDBConns: syncer.toDBConns, inCh: inCh, flushCh: make(chan *job), @@ -237,9 +236,9 @@ func (w *DMLWorker) executeBatchJobs(queueID int, jobs []*job) { time.Sleep(time.Duration(t) * time.Second) }) // use background context to execute sqls as much as possible - ctx, cancel := context.WithTimeout(w.runCtx, maxDMLExecutionDuration) + ctx, cancel := w.runTCtx.WithTimeout(maxDMLExecutionDuration) defer cancel() - affect, err = db.ExecuteSQL(tcontext.NewContext(ctx, w.logger), queries, args...) + affect, err = db.ExecuteSQL(ctx, queries, args...) failpoint.Inject("SafeModeExit", func(val failpoint.Value) { if intVal, ok := val.(int); ok && intVal == 4 && len(jobs) > 0 { w.logger.Warn("fail to exec DML", zap.String("failpoint", "SafeModeExit")) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index c2b8753714f..d89f800761f 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -123,7 +123,14 @@ const ( type Syncer struct { sync.RWMutex - tctx *tcontext.Context + tctx *tcontext.Context // this ctx only used for logger. + + // this ctx is a background ctx and was initialized in s.Run, it is used for some background tasks in S.Run + // when this ctx cancelled, syncer will shuts down all background running jobs and not wait transaction end + runCtx context.Context + // this is used for some background tasks S.Run that need logger it share same lifecycle with runCtx. 
+ runTCtx *tcontext.Context // this ctx only used for logger. + runCancel context.CancelFunc cfg *config.SubTaskConfig syncCfg replication.BinlogSyncerConfig @@ -157,8 +164,6 @@ type Syncer struct { waitXIDJob atomic.Int64 isTransactionEnd bool waitTransactionLock sync.Mutex - runCtx context.Context // this (background)ctx is injected in `s.Run` and when this ctx cancelled, syncer will exit quickly and not wait transaction end - runCancel context.CancelFunc tableRouter *router.Table binlogFilter *bf.BinlogEvent @@ -1409,14 +1414,14 @@ func (s *Syncer) syncDML() { } func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { + defer s.runCancel() select { - case <-ctx.Done(): // hijack the context to wait for the transaction to end. + case <-ctx.Done(): // hijack the root context from s.Run to wait for the transaction to end. s.tctx.L().Info("received subtask's done, try graceful stop") s.waitTransactionLock.Lock() if s.isTransactionEnd { s.waitXIDJob.Store(int64(waitComplete)) s.tctx.L().Info("the last job is transaction end, done directly") - s.runCancel() s.waitTransactionLock.Unlock() return } @@ -1433,15 +1438,58 @@ func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { } } +func (s *Syncer) updateTSOffsetCronJob(ctx context.Context) { + defer s.wg.Done() + // temporarily hard code there. if this metrics works well add this to config file. + ticker := time.NewTicker(time.Minute * 10) + defer ticker.Stop() + for { + select { + case <-ticker.C: + if utErr := s.updateTSOffset(ctx); utErr != nil { + s.tctx.L().Error("get server unix ts err", zap.Error(utErr)) + } + case <-ctx.Done(): + return + } + } +} + +func (s *Syncer) updateLagCronJob(ctx context.Context) { + defer s.wg.Done() + // temporarily hard code there. if this metrics works well add this to config file. + ticker := time.NewTicker(time.Millisecond * 100) + defer ticker.Stop() + for { + select { + case <-ticker.C: + s.updateReplicationLagMetric() + case <-ctx.Done(): + return + } + } +} + +func (s *Syncer) updateTSOffset(ctx context.Context) error { + t1 := time.Now() + ts, tsErr := s.fromDB.GetServerUnixTS(ctx) + rtt := time.Since(t1).Seconds() + if tsErr == nil { + s.tsOffset.Store(time.Now().Unix() - ts - int64(rtt/2)) + } + return tsErr +} + // Run starts running for sync, we should guarantee it can rerun when paused. 
func (s *Syncer) Run(ctx context.Context) (err error) { runCtx, runCancel := context.WithCancel(context.Background()) + runTCtx := tcontext.NewContext(runCtx, s.tctx.L()) defer runCancel() s.Lock() s.runCtx = runCtx + s.runTCtx = runTCtx s.runCancel = runCancel s.Unlock() - tctx := s.tctx.WithContext(runCtx) defer func() { if s.done != nil { @@ -1452,7 +1500,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { go s.waitTransactionEndBeforeExit(ctx) // some initialization that can't be put in Syncer.Init - fresh, err := s.IsFreshTask(runCtx) + fresh, err := s.IsFreshTask(ctx) if err != nil { return err } else if fresh { @@ -1467,7 +1515,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.wg.Add(1) go func() { defer s.wg.Done() - s.checkpointFlushWorker.Run(s.tctx) + s.checkpointFlushWorker.Run(runTCtx) }() var ( @@ -1475,7 +1523,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { delLoadTask bool cleanDumpFile = s.cfg.CleanDumpFile ) - flushCheckpoint, err = s.adjustGlobalPointGTID(tctx) + flushCheckpoint, err = s.adjustGlobalPointGTID(s.runTCtx) if err != nil { return err } @@ -1484,7 +1532,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { flushCheckpoint = true err = s.loadTableStructureFromDump(ctx) if err != nil { - tctx.L().Warn("error happened when load table structure from dump files", zap.Error(err)) + s.tctx.L().Warn("error happened when load table structure from dump files", zap.Error(err)) cleanDumpFile = false } } else { @@ -1493,59 +1541,35 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if flushCheckpoint { if err = s.flushCheckPoints(); err != nil { - tctx.L().Warn("fail to flush checkpoints when starting task", zap.Error(err)) + s.tctx.L().Warn("fail to flush checkpoints when starting task", zap.Error(err)) return err } } if delLoadTask { if err = s.delLoadTask(); err != nil { - tctx.L().Warn("error when del load task in etcd", zap.Error(err)) + s.tctx.L().Warn("error when del load task in etcd", zap.Error(err)) } } if cleanDumpFile { - tctx.L().Info("try to remove all dump files") + s.tctx.L().Info("try to remove all dump files") if err = os.RemoveAll(s.cfg.Dir); err != nil { - tctx.L().Warn("error when remove loaded dump folder", zap.String("data folder", s.cfg.Dir), zap.Error(err)) + s.tctx.L().Warn("error when remove loaded dump folder", zap.String("data folder", s.cfg.Dir), zap.Error(err)) } } failpoint.Inject("AdjustGTIDExit", func() { - tctx.L().Warn("exit triggered", zap.String("failpoint", "AdjustGTIDExit")) - s.streamerController.Close(tctx) + s.tctx.L().Warn("exit triggered", zap.String("failpoint", "AdjustGTIDExit")) + s.streamerController.Close(s.tctx) utils.OsExit(1) }) - updateTSOffset := func() error { - t1 := time.Now() - ts, tsErr := s.fromDB.GetServerUnixTS(runCtx) - rtt := time.Since(t1).Seconds() - if tsErr == nil { - s.tsOffset.Store(time.Now().Unix() - ts - int64(rtt/2)) - } - return tsErr - } - // before sync run, we get the tsoffset from upstream first - if utErr := updateTSOffset(); utErr != nil { + // before sync run, we get the ts offset from upstream first + if utErr := s.updateTSOffset(ctx); utErr != nil { return utErr } // start background task to get/update current ts offset between dm and upstream s.wg.Add(1) - go func() { - defer s.wg.Done() - // temporarily hard code there. if this metrics works well add this to config file. 
- updateTicker := time.NewTicker(time.Minute * 10) - defer updateTicker.Stop() - for { - select { - case <-updateTicker.C: - if utErr := updateTSOffset(); utErr != nil { - s.tctx.L().Error("get server unix ts err", zap.Error(utErr)) - } - case <-runCtx.Done(): - return - } - } - }() + go s.updateTSOffsetCronJob(s.runCtx) // startLocation is the start location for current received event // currentLocation is the end location for current received event (End_log_pos in `show binlog events` for mysql) @@ -1560,11 +1584,11 @@ func (s *Syncer) Run(ctx context.Context) (err error) { currentGTID string ) - tctx.L().Info("replicate binlog from checkpoint", zap.Stringer("checkpoint", lastLocation)) + s.tctx.L().Info("replicate binlog from checkpoint", zap.Stringer("checkpoint", lastLocation)) if s.streamerController.IsClosed() { s.locations.reset(lastLocation) - err = s.streamerController.Start(tctx, lastLocation) + err = s.streamerController.Start(runTCtx, lastLocation) if err != nil { return terror.Annotate(err, "fail to restart streamer controller") } @@ -1574,22 +1598,11 @@ func (s *Syncer) Run(ctx context.Context) (err error) { go s.syncDML() s.wg.Add(1) - go s.syncDDL(tctx, adminQueueName, s.ddlDBConn, s.ddlJobCh) + go s.syncDDL(s.runTCtx, adminQueueName, s.ddlDBConn, s.ddlJobCh) + // start background task to update replication lag metric s.wg.Add(1) - go func() { - defer s.wg.Done() - updateLagTicker := time.NewTicker(time.Millisecond * 100) - defer updateLagTicker.Stop() - for { - select { - case <-updateLagTicker.C: - s.updateReplicationLagMetric() - case <-runCtx.Done(): - return - } - } - }() + go s.updateLagCronJob(s.runCtx) // syncing progress with sharding DDL group // 1. use the global streamer to sync regular binlog events @@ -1611,10 +1624,10 @@ func (s *Syncer) Run(ctx context.Context) (err error) { defer func() { if err1 := recover(); err1 != nil { failpoint.Inject("ExitAfterSaveOnlineDDL", func() { - tctx.L().Info("force panic") + s.tctx.L().Info("force panic") panic("ExitAfterSaveOnlineDDL") }) - tctx.L().Error("panic log", zap.Reflect("error message", err1), zap.Stack("stack")) + s.tctx.L().Error("panic log", zap.Reflect("error message", err1), zap.Stack("stack")) err = terror.ErrSyncerUnitPanic.Generate(err1) } @@ -1631,13 +1644,13 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // flush all jobs before exit if err2 = s.flushJobs(); err2 != nil { - tctx.L().Warn("failed to flush jobs when exit task", zap.Error(err2)) + s.tctx.L().Warn("failed to flush jobs when exit task", zap.Error(err2)) } // if any execute error, flush safemode exit point if err2 = s.execError.Load(); err2 != nil && (terror.ErrDBExecuteFailed.Equal(err2) || terror.ErrDBUnExpect.Equal(err2)) { if err2 = s.checkpoint.FlushSafeModeExitPoint(s.tctx); err2 != nil { - tctx.L().Warn("failed to flush safe mode checkpoints when exit task", zap.Error(err2)) + s.tctx.L().Warn("failed to flush safe mode checkpoints when exit task", zap.Error(err2)) } } }() @@ -1656,7 +1669,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // if we start syncer at an early position, database must bear a period of inconsistent state, // it's eventual consistency. 
s.safeMode = sm.NewSafeMode() - s.enableSafeModeInitializationPhase(tctx) + s.enableSafeModeInitializationPhase(s.runTCtx) closeShardingResync := func() error { if shardingReSync == nil { @@ -1680,7 +1693,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.isReplacingOrInjectingErr = currentLocation.Suffix != 0 s.locations.reset(currentLocation) - err3 := s.streamerController.RedirectStreamer(tctx, currentLocation) + err3 := s.streamerController.RedirectStreamer(s.tctx, currentLocation) if err3 != nil { return err3 } @@ -1695,7 +1708,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } for i := 0; i < n; { - e, err1 := s.getEvent(tctx, currentLocation) + e, err1 := s.getEvent(s.runTCtx, currentLocation) if err1 != nil { return err } @@ -1746,13 +1759,13 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // if suffix>0, we are replacing error s.isReplacingOrInjectingErr = currentLocation.Suffix != 0 s.locations.reset(shardingReSync.currLocation) - err = s.streamerController.RedirectStreamer(tctx, shardingReSync.currLocation) + err = s.streamerController.RedirectStreamer(s.runTCtx, shardingReSync.currLocation) if err != nil { return err } failpoint.Inject("ReSyncExit", func() { - tctx.L().Warn("exit triggered", zap.String("failpoint", "ReSyncExit")) + s.tctx.L().Warn("exit triggered", zap.String("failpoint", "ReSyncExit")) utils.OsExit(1) }) } @@ -1760,7 +1773,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { var e *replication.BinlogEvent startTime := time.Now() - e, err = s.getEvent(tctx, currentLocation) + e, err = s.getEvent(s.tctx, currentLocation) s.tctx.L().Debug("location refactor", zap.Stringer("current", currentLocation), zap.Stringer("start", startLocation), @@ -1782,21 +1795,21 @@ func (s *Syncer) Run(ctx context.Context) (err error) { }) switch { case err == context.Canceled: - tctx.L().Info("binlog replication main routine quit(context canceled)!", zap.Stringer("last location", lastLocation)) + s.tctx.L().Info("binlog replication main routine quit(context canceled)!", zap.Stringer("last location", lastLocation)) return nil case err == context.DeadlineExceeded: - tctx.L().Info("deadline exceeded when fetching binlog event") + s.tctx.L().Info("deadline exceeded when fetching binlog event") continue case isDuplicateServerIDError(err): // if the server id is already used, need to use a new server id - tctx.L().Info("server id is already used by another slave, will change to a new server id and get event again") - err1 := s.streamerController.UpdateServerIDAndResetReplication(tctx, lastLocation) + s.tctx.L().Info("server id is already used by another slave, will change to a new server id and get event again") + err1 := s.streamerController.UpdateServerIDAndResetReplication(s.tctx, lastLocation) if err1 != nil { return err1 } continue case err == relay.ErrorMaybeDuplicateEvent: - tctx.L().Warn("read binlog met a truncated file, will skip events that has been consumed") + s.tctx.L().Warn("read binlog met a truncated file, will skip events that has been consumed") err = maybeSkipNRowsEvent(eventIndex) if err == nil { continue @@ -1805,7 +1818,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } if err != nil { - tctx.L().Error("fail to fetch binlog", log.ShortError(err)) + s.tctx.L().Error("fail to fetch binlog", log.ShortError(err)) if isConnectionRefusedError(err) { return err @@ -1813,7 +1826,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if s.streamerController.CanRetry(err) { // GlobalPoint is the last finished 
transaction location - err = s.streamerController.ResetReplicationSyncer(tctx, s.checkpoint.GlobalPoint()) + err = s.streamerController.ResetReplicationSyncer(s.tctx, s.checkpoint.GlobalPoint()) if err != nil { return err } @@ -1827,7 +1840,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // try to re-sync in gtid mode if tryReSync && s.cfg.EnableGTID && utils.IsErrBinlogPurged(err) && s.cfg.AutoFixGTID { time.Sleep(retryTimeout) - err = s.reSyncBinlog(*tctx, lastLocation) + err = s.reSyncBinlog(*s.tctx, lastLocation) if err != nil { return err } @@ -1840,7 +1853,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { failpoint.Inject("IgnoreSomeTypeEvent", func(val failpoint.Value) { if e.Header.EventType.String() == val.(string) { - tctx.L().Debug("IgnoreSomeTypeEvent", zap.Reflect("event", e)) + s.tctx.L().Debug("IgnoreSomeTypeEvent", zap.Reflect("event", e)) failpoint.Continue() } }) @@ -1854,7 +1867,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { metrics.BinlogPosGauge.WithLabelValues("syncer", s.cfg.Name, s.cfg.SourceID).Set(float64(e.Header.LogPos)) index, err := binlog.GetFilenameIndex(lastLocation.Position.Name) if err != nil { - tctx.L().Warn("fail to get index number of binlog file, may because only specify GTID and hasn't saved according binlog position", log.ShortError(err)) + s.tctx.L().Warn("fail to get index number of binlog file, may because only specify GTID and hasn't saved according binlog position", log.ShortError(err)) } else { metrics.BinlogFileGauge.WithLabelValues("syncer", s.cfg.Name, s.cfg.SourceID).Set(float64(index)) } @@ -1863,7 +1876,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { failpoint.Inject("ProcessBinlogSlowDown", nil) - tctx.L().Debug("receive binlog event", zap.Reflect("header", e.Header)) + s.tctx.L().Debug("receive binlog event", zap.Reflect("header", e.Header)) // TODO: support all event // we calculate startLocation and endLocation(currentLocation) for Query event here @@ -1906,7 +1919,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { currentLocation = startLocation } else if op == pb.ErrorOp_Skip { ec := eventContext{ - tctx: tctx, + tctx: s.tctx, header: e.Header, startLocation: &startLocation, currentLocation: ¤tLocation, @@ -1915,7 +1928,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { var sourceTbls map[string]map[string]struct{} sourceTbls, err = s.trackOriginDDL(ev, ec) if err != nil { - tctx.L().Warn("failed to track query when handle-error skip", zap.Error(err), zap.ByteString("sql", ev.Query)) + s.tctx.L().Warn("failed to track query when handle-error skip", zap.Error(err), zap.ByteString("sql", ev.Query)) } s.saveGlobalPoint(currentLocation) @@ -1929,9 +1942,9 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } err = s.flushJobs() if err != nil { - tctx.L().Warn("failed to flush jobs when handle-error skip", zap.Error(err)) + s.tctx.L().Warn("failed to flush jobs when handle-error skip", zap.Error(err)) } else { - tctx.L().Info("flush jobs when handle-error skip") + s.tctx.L().Info("flush jobs when handle-error skip") } } // skip the current event @@ -1948,9 +1961,9 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.errOperatorHolder.SetHasAllInjected(startLocation) // reset event as startLocation, avoid to be marked in checkpoint currentLocation.Position.Pos = startLocation.Position.Pos - err = s.streamerController.RedirectStreamer(tctx, startLocation) + err = s.streamerController.RedirectStreamer(s.runTCtx, startLocation) } else { - err = 
s.streamerController.RedirectStreamer(tctx, currentLocation) + err = s.streamerController.RedirectStreamer(s.runTCtx, currentLocation) } if err != nil { return err @@ -1970,17 +1983,17 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // 2. push forward and replicate some sqls after safeModeExitPoint to downstream // 3. quit because of network error, fail to flush global checkpoint and new safeModeExitPoint to downstream // 4. restart again, quit safe mode at safeModeExitPoint, but some sqls after this location have already been replicated to the downstream - if err = s.checkpoint.FlushSafeModeExitPoint(s.tctx); err != nil { + if err = s.checkpoint.FlushSafeModeExitPoint(s.runTCtx); err != nil { return err } - if err = s.safeMode.Add(tctx, -1); err != nil { + if err = s.safeMode.Add(s.tctx, -1); err != nil { return err } } } ec := eventContext{ - tctx: tctx, + tctx: s.tctx, header: e.Header, startLocation: &startLocation, currentLocation: ¤tLocation, @@ -2021,7 +2034,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // only need compare binlog position? lastLocation = shardingReSync.currLocation if binlog.CompareLocation(shardingReSync.currLocation, shardingReSync.latestLocation, s.cfg.EnableGTID) >= 0 { - tctx.L().Info("re-replicate shard group was completed", zap.String("event", "XID"), zap.Stringer("re-shard", shardingReSync)) + s.tctx.L().Info("re-replicate shard group was completed", zap.String("event", "XID"), zap.Stringer("re-shard", shardingReSync)) err = closeShardingResync() if err != nil { return terror.Annotatef(err, "shard group current location %s", shardingReSync.currLocation) @@ -2036,7 +2049,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { return terror.Annotatef(err, "fail to record GTID %v", ev.GSet) } - tctx.L().Debug("", zap.String("event", "XID"), zap.Stringer("last location", lastLocation), log.WrapStringerField("location", currentLocation)) + s.tctx.L().Debug("", zap.String("event", "XID"), zap.Stringer("last location", lastLocation), log.WrapStringerField("location", currentLocation)) lastLocation.Position.Pos = e.Header.LogPos // update lastPos err = lastLocation.SetGTID(ev.GSet) if err != nil { @@ -2049,7 +2062,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if e.Header.EventType == replication.HEARTBEAT_EVENT { // flush checkpoint even if there are no real binlog events if s.checkpoint.CheckGlobalPoint() { - tctx.L().Info("meet heartbeat event and then flush jobs") + s.tctx.L().Info("meet heartbeat event and then flush jobs") err2 = s.flushJobs() } } @@ -3365,7 +3378,7 @@ func (s *Syncer) Kill() { s.Close() } -// stopSync stops syncing and rollback checkpoint now it used by Close and Pause +// stopSync stops syncing and rollback checkpoint now it used by Close,kILL and Pause // maybe we can refine the workflow more clear. func (s *Syncer) stopSync() { if s.done != nil { diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index d9e740143dd..5b434a38136 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -33,7 +33,7 @@ function check_worker_ungraceful_stop_with_retry() { done if ! 
$all_matched; then - echo "check_worker_ungraceful_stop_with_retry failed after retry: $k" + echo "check_worker_ungraceful_stop_with_retry failed after retry" exit 1 fi From 5ca927684fee131e3b36719b53a003847aec2d52 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Wed, 19 Jan 2022 11:10:13 +0800 Subject: [PATCH 14/47] refine syncer.RUN --- dm/syncer/syncer.go | 65 +++++++++---------- .../conf/dm-worker1.toml | 4 +- dm/tests/checkpoint_transaction/run.sh | 44 ++++++------- 3 files changed, 55 insertions(+), 58 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index d89f800761f..be9b16d0393 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -125,10 +125,10 @@ type Syncer struct { tctx *tcontext.Context // this ctx only used for logger. - // this ctx is a background ctx and was initialized in s.Run, it is used for some background tasks in S.Run + // this ctx is a background ctx and was initialized in s.Run, it is used for some background tasks in s.Run // when this ctx cancelled, syncer will shuts down all background running jobs and not wait transaction end runCtx context.Context - // this is used for some background tasks S.Run that need logger it share same lifecycle with runCtx. + // this is used for some background tasks s.Run that need logger it share same lifecycle with runCtx. runTCtx *tcontext.Context // this ctx only used for logger. runCancel context.CancelFunc @@ -196,8 +196,6 @@ type Syncer struct { filteredUpdate atomic.Int64 filteredDelete atomic.Int64 - done chan struct{} - checkpoint CheckPoint checkpointFlushWorker *checkpointFlushWorker onlineDDL onlineddl.OnlinePlugin @@ -261,7 +259,6 @@ func NewSyncer(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, relay rel syncer.binlogSizeCount.Store(0) syncer.lastCount.Store(0) syncer.count.Store(0) - syncer.done = nil syncer.handleJobFunc = syncer.handleJob syncer.cli = etcdClient @@ -303,7 +300,7 @@ func (s *Syncer) closeJobChans() { } close(s.dmlJobCh) close(s.ddlJobCh) - close(s.checkpointFlushWorker.input) + s.checkpointFlushWorker.Close() s.jobsClosed.Store(true) } @@ -643,7 +640,6 @@ func (s *Syncer) Process(ctx context.Context, pr chan pb.ProcessResult) { s.Unlock() return } - s.done = make(chan struct{}) s.Unlock() runFatalChan := make(chan *pb.ProcessError, s.cfg.WorkerCount+1) @@ -1414,27 +1410,29 @@ func (s *Syncer) syncDML() { } func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { - defer s.runCancel() select { case <-ctx.Done(): // hijack the root context from s.Run to wait for the transaction to end. s.tctx.L().Info("received subtask's done, try graceful stop") s.waitTransactionLock.Lock() if s.isTransactionEnd { s.waitXIDJob.Store(int64(waitComplete)) - s.tctx.L().Info("the last job is transaction end, done directly") s.waitTransactionLock.Unlock() + s.tctx.L().Info("the last job is transaction end, done directly") + s.runCancel() return } s.waitXIDJob.Store(int64(waiting)) s.waitTransactionLock.Unlock() select { case <-s.runCtx.Done(): - s.tctx.L().Info("received run ctx done, exit now") + s.tctx.L().Info("run ctx done by waitComplete") case <-time.After(maxPauseOrStopWaitTime): s.tctx.L().Info("wait transaction end timeout, exit now") + s.runCancel() } case <-s.runCtx.Done(): // when no graceful stop, run ctx will canceled first. 
s.tctx.L().Info("received ungraceful exit ctx, exit now") + s.runCancel() } } @@ -1484,21 +1482,12 @@ func (s *Syncer) updateTSOffset(ctx context.Context) error { func (s *Syncer) Run(ctx context.Context) (err error) { runCtx, runCancel := context.WithCancel(context.Background()) runTCtx := tcontext.NewContext(runCtx, s.tctx.L()) - defer runCancel() s.Lock() s.runCtx = runCtx s.runTCtx = runTCtx s.runCancel = runCancel s.Unlock() - defer func() { - if s.done != nil { - close(s.done) - } - }() - - go s.waitTransactionEndBeforeExit(ctx) - // some initialization that can't be put in Syncer.Init fresh, err := s.IsFreshTask(ctx) if err != nil { @@ -1511,13 +1500,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } } - // start flush checkpoints worker. - s.wg.Add(1) - go func() { - defer s.wg.Done() - s.checkpointFlushWorker.Run(runTCtx) - }() - var ( flushCheckpoint bool delLoadTask bool @@ -1538,6 +1520,12 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } else { cleanDumpFile = false } + // start flush checkpoints worker. this worker mast start before s.flushCheckPoints() + s.wg.Add(1) + go func() { + defer s.wg.Done() + s.checkpointFlushWorker.Run(s.runTCtx) + }() if flushCheckpoint { if err = s.flushCheckPoints(); err != nil { @@ -1567,9 +1555,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if utErr := s.updateTSOffset(ctx); utErr != nil { return utErr } - // start background task to get/update current ts offset between dm and upstream - s.wg.Add(1) - go s.updateTSOffsetCronJob(s.runCtx) // startLocation is the start location for current received event // currentLocation is the end location for current received event (End_log_pos in `show binlog events` for mysql) @@ -1588,19 +1573,25 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if s.streamerController.IsClosed() { s.locations.reset(lastLocation) - err = s.streamerController.Start(runTCtx, lastLocation) + err = s.streamerController.Start(s.runTCtx, lastLocation) if err != nil { return terror.Annotate(err, "fail to restart streamer controller") } } + // 只有这个能 hook 最外层的 ctx, 通过这个函数来控制 s.Run 的退出 + go s.waitTransactionEndBeforeExit(ctx) + + // all pre check passed, we can start all background jobs, they all use s.runCancel to cancel + s.wg.Add(1) + go s.updateTSOffsetCronJob(s.runCtx) + s.wg.Add(1) go s.syncDML() s.wg.Add(1) go s.syncDDL(s.runTCtx, adminQueueName, s.ddlDBConn, s.ddlJobCh) - // start background task to update replication lag metric s.wg.Add(1) go s.updateLagCronJob(s.runCtx) @@ -1773,7 +1764,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { var e *replication.BinlogEvent startTime := time.Now() - e, err = s.getEvent(s.tctx, currentLocation) + e, err = s.getEvent(s.runTCtx, currentLocation) s.tctx.L().Debug("location refactor", zap.Stringer("current", currentLocation), zap.Stringer("start", startLocation), @@ -2078,6 +2069,8 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } } if waitXIDStatus(s.waitXIDJob.Load()) == waitComplete { + // already wait until XID event, we can stop sync now + s.runCancel() return nil } } @@ -3355,6 +3348,11 @@ func (s *Syncer) Close() { return } + // wait for s.Run() to finish + if s.runCtx != nil { + <-s.runCtx.Done() + } + s.stopSync() s.closeDBs() s.checkpoint.Close() @@ -3381,9 +3379,6 @@ func (s *Syncer) Kill() { // stopSync stops syncing and rollback checkpoint now it used by Close,kILL and Pause // maybe we can refine the workflow more clear. 
func (s *Syncer) stopSync() { - if s.done != nil { - <-s.done // wait Run to return - } s.closeJobChans() s.wg.Wait() // wait job workers to return diff --git a/dm/tests/checkpoint_transaction/conf/dm-worker1.toml b/dm/tests/checkpoint_transaction/conf/dm-worker1.toml index 1a53a1dc788..9b8ecaf991c 100644 --- a/dm/tests/checkpoint_transaction/conf/dm-worker1.toml +++ b/dm/tests/checkpoint_transaction/conf/dm-worker1.toml @@ -1,3 +1,5 @@ -name = "worker1" join = "127.0.0.1:8261" keepalive-ttl = 1 +log-level = "info" +name = "worker1" +worker-addr = "127.0.0.1:8262" diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 5b434a38136..15ff859c229 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -61,28 +61,28 @@ function run() { # check diff check_sync_diff $WORK_DIR $cur/conf/diff_config.toml - # test ungraceful stop, worker will not wait transaction finish - run_sql_file $cur/data/db1.increment1.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 - sleep 2 - # kill dm-master 1 to make worker lost keepalive while a transaction is not finished - echo "kill dm-master1" - kill_dm_master - check_master_port_offline 1 - sleep 1 # wait worker lost keep alive ttl is 1 second - - # check dm-worker will exit quickly without waiting for the transaction to finish - check_worker_ungraceful_stop_with_retry - - # test data in tidb less than source - dataCountSource=$(mysql -uroot -h$MYSQL_HOST1 -P$MYSQL_PORT1 -p$MYSQL_PASSWORD1 -se "select count(1) from checkpoint_transaction.t1") - dataCountIntidb=$(mysql -uroot -h127.0.0.1 -P4000 -se "select count(1) from checkpoint_transaction.t1") - echo "afetr ungraceful exit data in source count: $dataCountSource data in tidb count: $dataCountIntidb" - [[ $dataCountIntidb -lt $dataCountSource ]] - - # start dm-master again task will be resume, and data will be synced - run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml - check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT - check_sync_diff $WORK_DIR $cur/conf/diff_config.toml + # # test ungraceful stop, worker will not wait transaction finish + # run_sql_file $cur/data/db1.increment1.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 + # sleep 2 + # # kill dm-master 1 to make worker lost keep alive while a transaction is not finished + # echo "kill dm-master1" + # kill_dm_master + # check_master_port_offline 1 + # sleep 1 # wait worker lost keep alive ttl is 1 second + + # # check dm-worker will exit quickly without waiting for the transaction to finish + # check_worker_ungraceful_stop_with_retry + + # # test data in tidb less than source + # dataCountSource=$(mysql -uroot -h$MYSQL_HOST1 -P$MYSQL_PORT1 -p$MYSQL_PASSWORD1 -se "select count(1) from checkpoint_transaction.t1") + # dataCountInTiDB=$(mysql -uroot -h127.0.0.1 -P4000 -se "select count(1) from checkpoint_transaction.t1") + # echo "after ungraceful exit data in source count: $dataCountSource data in tidb count: $dataCountInTiDB" + # [[ $dataCountInTiDB -lt $dataCountSource ]] + + # # start dm-master again task will be resume, and data will be synced + # run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml + # check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT + # check_sync_diff $WORK_DIR $cur/conf/diff_config.toml run_sql_file $cur/data/db1.increment1.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 # wait transaction start From 1302a758f4b251f049b3fb1451eb025ab6d8b1ba Mon Sep 17 00:00:00 2001 From: 
ehco1996 Date: Wed, 19 Jan 2022 12:11:12 +0800 Subject: [PATCH 15/47] revert test --- dm/syncer/syncer.go | 16 +++++---- dm/tests/checkpoint_transaction/run.sh | 45 +++++++++++++------------- 2 files changed, 32 insertions(+), 29 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index a2e6d681ed2..61fdca8adeb 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -131,6 +131,7 @@ type Syncer struct { // this is used for some background tasks s.Run that need logger it share same lifecycle with runCtx. runTCtx *tcontext.Context // this ctx only used for logger. runCancel context.CancelFunc + rundone chan struct{} // this will be closed when syncer.Run exit cfg *config.SubTaskConfig syncCfg replication.BinlogSyncerConfig @@ -280,6 +281,8 @@ func NewSyncer(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, relay rel syncer.lastCheckpointFlushedTime = time.Time{} syncer.relay = relay syncer.locations = &locationRecorder{} + + syncer.rundone = nil return syncer } @@ -956,7 +959,7 @@ func (s *Syncer) addJob(job *job) { s.dmlJobCh <- job failpoint.Inject("checkCheckpointInMiddleOfTransaction", func() { s.tctx.L().Info("receive dml job", zap.Any("dml job", job)) - time.Sleep(500 * time.Millisecond) + time.Sleep(100 * time.Millisecond) }) case gc: s.dmlJobCh <- job @@ -1491,6 +1494,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.runCtx = runCtx s.runTCtx = runTCtx s.runCancel = runCancel + s.rundone = make(chan struct{}) s.Unlock() // some initialization that can't be put in Syncer.Init @@ -1584,9 +1588,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } } - // 只有这个能 hook 最外层的 ctx, 通过这个函数来控制 s.Run 的退出 - go s.waitTransactionEndBeforeExit(ctx) - // all pre check passed, we can start all background jobs, they all use s.runCancel to cancel s.wg.Add(1) go s.updateTSOffsetCronJob(s.runCtx) @@ -1617,7 +1618,9 @@ func (s *Syncer) Run(ctx context.Context) (err error) { traceSource = fmt.Sprintf("%s.syncer.%s", s.cfg.SourceID, s.cfg.Name) ) + go s.waitTransactionEndBeforeExit(ctx) defer func() { + // before if err1 := recover(); err1 != nil { failpoint.Inject("ExitAfterSaveOnlineDDL", func() { s.tctx.L().Info("force panic") @@ -1649,6 +1652,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.tctx.L().Warn("failed to flush safe mode checkpoints when exit task", zap.Error(err2)) } } + close(s.rundone) }() now := time.Now() @@ -3353,8 +3357,8 @@ func (s *Syncer) Close() { } // wait for s.Run() to finish - if s.runCtx != nil { - <-s.runCtx.Done() + if s.rundone != nil { + <-s.rundone } s.stopSync() diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 15ff859c229..7c41c905995 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -61,29 +61,28 @@ function run() { # check diff check_sync_diff $WORK_DIR $cur/conf/diff_config.toml - # # test ungraceful stop, worker will not wait transaction finish - # run_sql_file $cur/data/db1.increment1.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 - # sleep 2 - # # kill dm-master 1 to make worker lost keep alive while a transaction is not finished - # echo "kill dm-master1" - # kill_dm_master - # check_master_port_offline 1 - # sleep 1 # wait worker lost keep alive ttl is 1 second - - # # check dm-worker will exit quickly without waiting for the transaction to finish - # check_worker_ungraceful_stop_with_retry - - # # test data in tidb less than source - # dataCountSource=$(mysql -uroot -h$MYSQL_HOST1 -P$MYSQL_PORT1 
-p$MYSQL_PASSWORD1 -se "select count(1) from checkpoint_transaction.t1") - # dataCountInTiDB=$(mysql -uroot -h127.0.0.1 -P4000 -se "select count(1) from checkpoint_transaction.t1") - # echo "after ungraceful exit data in source count: $dataCountSource data in tidb count: $dataCountInTiDB" - # [[ $dataCountInTiDB -lt $dataCountSource ]] - - # # start dm-master again task will be resume, and data will be synced - # run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml - # check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT - # check_sync_diff $WORK_DIR $cur/conf/diff_config.toml - + # test ungraceful stop, worker will not wait transaction finish + run_sql_file $cur/data/db1.increment1.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 + sleep 2 + # kill dm-master 1 to make worker lost keep alive while a transaction is not finished + echo "kill dm-master1" + kill_dm_master + check_master_port_offline 1 + sleep 1 # wait worker lost keep alive ttl is 1 second + + # check dm-worker will exit quickly without waiting for the transaction to finish + check_worker_ungraceful_stop_with_retry + + # test data in tidb less than source + dataCountSource=$(mysql -uroot -h$MYSQL_HOST1 -P$MYSQL_PORT1 -p$MYSQL_PASSWORD1 -se "select count(1) from checkpoint_transaction.t1") + dataCountInTiDB=$(mysql -uroot -h127.0.0.1 -P4000 -se "select count(1) from checkpoint_transaction.t1") + echo "after ungraceful exit data in source count: $dataCountSource data in tidb count: $dataCountInTiDB" + [[ $dataCountInTiDB -lt $dataCountSource ]] + + # start dm-master again task will be resume, and data will be synced + run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml + check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT + check_sync_diff $WORK_DIR $cur/conf/diff_config.toml run_sql_file $cur/data/db1.increment1.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 # wait transaction start # you can see why sleep in https://github.com/pingcap/dm/pull/1928#issuecomment-895820239 From a813189d24ce9fb0395a28dbc1fdfdc5e86167e7 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Wed, 19 Jan 2022 17:28:44 +0800 Subject: [PATCH 16/47] make test steable --- dm/syncer/syncer.go | 7 ++++++- dm/tests/checkpoint_transaction/run.sh | 23 +++++++++++++++++++---- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 61fdca8adeb..7a3cbd888b7 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -959,7 +959,7 @@ func (s *Syncer) addJob(job *job) { s.dmlJobCh <- job failpoint.Inject("checkCheckpointInMiddleOfTransaction", func() { s.tctx.L().Info("receive dml job", zap.Any("dml job", job)) - time.Sleep(100 * time.Millisecond) + time.Sleep(500 * time.Millisecond) }) case gc: s.dmlJobCh <- job @@ -1418,6 +1418,11 @@ func (s *Syncer) syncDML() { } func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { + failpoint.Inject("checkCheckpointInMiddleOfTransaction", func() { + s.tctx.L().Info("incr maxPauseOrStopWaitTime time ") + maxPauseOrStopWaitTime = time.Minute * 10 + }) + select { case <-ctx.Done(): // hijack the root context from s.Run to wait for the transaction to end. 
s.tctx.L().Info("received subtask's done, try graceful stop") diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 7c41c905995..7198d385a7e 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -77,7 +77,12 @@ function run() { dataCountSource=$(mysql -uroot -h$MYSQL_HOST1 -P$MYSQL_PORT1 -p$MYSQL_PASSWORD1 -se "select count(1) from checkpoint_transaction.t1") dataCountInTiDB=$(mysql -uroot -h127.0.0.1 -P4000 -se "select count(1) from checkpoint_transaction.t1") echo "after ungraceful exit data in source count: $dataCountSource data in tidb count: $dataCountInTiDB" - [[ $dataCountInTiDB -lt $dataCountSource ]] + if [ "$dataCountInTiDB" -lt "$dataCountSource" ]; then + echo "ungraceful stop test success" + else + echo "ungraceful stop test failed" + exit 1 + fi # start dm-master again task will be resume, and data will be synced run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml @@ -96,8 +101,13 @@ function run() { "\"stage\": \"Paused\"" 1 # check the point is the middle of checkpoint num=$(grep "not receive xid job yet" $WORK_DIR/worker1/log/dm-worker.log | wc -l) - [[ $num -gt 0 ]] - sed -e '/not receive xid job yet/d' $WORK_DIR/worker1/log/dm-worker.log >$WORK_DIR/worker1/log/dm-worker.log + + if [ "$num" -gt 0 ]; then + echo "graceful pause test success" + else + echo "graceful pause test failed" + exit 1 + fi echo "start check pause diff" check_sync_diff $WORK_DIR $cur/conf/diff_config.toml @@ -120,7 +130,12 @@ function run() { "\"result\": true" 2 # check the point is the middle of checkpoint num=$(grep "not receive xid job yet" $WORK_DIR/worker1/log/dm-worker.log | wc -l) - [[ $num -gt 0 ]] + if [ "$num" -gt 0 ]; then + echo "graceful stop test success" + else + echo "graceful stop test failed" + exit 1 + fi echo "start check stop diff" check_sync_diff $WORK_DIR $cur/conf/diff_config.toml From 922faa08dfd5c7a38e52d8f157056c6eea4c642b Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 20 Jan 2022 09:03:48 +0800 Subject: [PATCH 17/47] fix ha test lib --- dm/syncer/syncer.go | 1 - dm/tests/_utils/ha_cases_lib.sh | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 7a3cbd888b7..a3404ebebf4 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -1625,7 +1625,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { go s.waitTransactionEndBeforeExit(ctx) defer func() { - // before if err1 := recover(); err1 != nil { failpoint.Inject("ExitAfterSaveOnlineDDL", func() { s.tctx.L().Info("force panic") diff --git a/dm/tests/_utils/ha_cases_lib.sh b/dm/tests/_utils/ha_cases_lib.sh index d9fb774a90b..6a0a2896ed8 100644 --- a/dm/tests/_utils/ha_cases_lib.sh +++ b/dm/tests/_utils/ha_cases_lib.sh @@ -145,6 +145,7 @@ function start_multi_tasks_cluster() { } function cleanup() { + cleanup_process $* cleanup_data $ha_test cleanup_data $ha_test2 echo "clean source table" @@ -154,7 +155,6 @@ function cleanup() { $(mysql -h127.0.0.1 -p123456 -P${i} -uroot -e "drop database if exists ha_test2;") sleep 1 done - cleanup_process $* } function isolate_master() { From 9ffaf25ea69e660da5d25a3aafc1d886099533ca Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 20 Jan 2022 09:29:12 +0800 Subject: [PATCH 18/47] fix lazy_init_tracker --- dm/tests/start_task/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dm/tests/start_task/run.sh b/dm/tests/start_task/run.sh index 90e51cb24f7..5f67c350b00 100644 --- 
a/dm/tests/start_task/run.sh +++ b/dm/tests/start_task/run.sh @@ -58,8 +58,8 @@ function lazy_init_tracker() { check_log_contains $WORK_DIR/worker1/log/dm-worker.log 'lazy init table info.*t50' 1 check_log_not_contains $WORK_DIR/worker1/log/dm-worker.log 'lazy init table info.*t51' - cleanup_data start_task cleanup_process + cleanup_data start_task } function run() { From 9c2ea48d45a521f4ff2a4e37b461bb9f5f478089 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 20 Jan 2022 10:07:37 +0800 Subject: [PATCH 19/47] fix clean up data --- dm/tests/all_mode/run.sh | 8 ++++---- dm/tests/expression_filter/run.sh | 2 +- dm/tests/full_mode/run.sh | 6 +++--- dm/tests/load_interrupt/run.sh | 2 +- dm/tests/safe_mode/run.sh | 8 ++++---- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/dm/tests/all_mode/run.sh b/dm/tests/all_mode/run.sh index 043511d9f43..61dbcd8a18f 100755 --- a/dm/tests/all_mode/run.sh +++ b/dm/tests/all_mode/run.sh @@ -142,8 +142,8 @@ function test_query_timeout() { run_sql_tidb 'SHOW PROCESSLIST;' check_rows_equal 1 - cleanup_data all_mode cleanup_process + cleanup_data all_mode export GO_FAILPOINTS='' echo "[$(date)] <<<<<< finish test_query_timeout >>>>>>" @@ -208,8 +208,8 @@ function test_stop_task_before_checkpoint() { "stop-task test" \ "\"result\": true" 3 - cleanup_data all_mode cleanup_process + cleanup_data all_mode export GO_FAILPOINTS='' echo "[$(date)] <<<<<< finish test_stop_task_before_checkpoint >>>>>>" @@ -268,8 +268,8 @@ function test_fail_job_between_event() { "\"result\": true" 3 check_sync_diff $WORK_DIR $cur/conf/diff_config.toml - cleanup_data all_mode cleanup_process + cleanup_data all_mode export GO_FAILPOINTS='' echo "[$(date)] <<<<<< finish test_fail_job_between_event >>>>>>" @@ -315,8 +315,8 @@ function test_expression_filter() { "query-status test" \ "\"result\": true" 3 - cleanup_data all_mode cleanup_process + cleanup_data all_mode echo "[$(date)] <<<<<< finish test_expression_filter >>>>>>" } diff --git a/dm/tests/expression_filter/run.sh b/dm/tests/expression_filter/run.sh index 01b559ccba9..faa7d1d43db 100755 --- a/dm/tests/expression_filter/run.sh +++ b/dm/tests/expression_filter/run.sh @@ -53,8 +53,8 @@ function complex_behaviour() { update_num=$(grep -o '"number of filtered update"=[0-9]\+' $WORK_DIR/worker1/log/dm-worker.log | grep -o '[0-9]\+' | awk '{n += $1}; END{print n}') [ $update_num -eq 3 ] - cleanup_data expr_filter cleanup_process $* + cleanup_data expr_filter } function run() { diff --git a/dm/tests/full_mode/run.sh b/dm/tests/full_mode/run.sh index 80fd8975c52..b74e7599a03 100755 --- a/dm/tests/full_mode/run.sh +++ b/dm/tests/full_mode/run.sh @@ -55,8 +55,8 @@ function fail_acquire_global_lock() { "\"stage\": \"Paused\"" 2 \ "you need (at least one of) the RELOAD privilege(s) for this operation" 2 - cleanup_data full_mode cleanup_process $* + cleanup_data full_mode } function escape_schema() { @@ -113,8 +113,8 @@ function escape_schema() { check_metric $WORKER1_PORT 'dumpling_dump_finished_tables' 3 0 3 check_metric $WORKER2_PORT 'dumpling_dump_finished_tables' 3 0 3 - cleanup_data full/mode cleanup_process $* + cleanup_data full/mode } function empty_data() { @@ -138,8 +138,8 @@ function empty_data() { check_log_contains $WORK_DIR/worker1/log/dm-worker.log "progress=\"100.00 %\"" check_log_contains $WORK_DIR/worker2/log/dm-worker.log "progress=\"100.00 %\"" - cleanup_data full_mode cleanup_process $* + cleanup_data full_mode } function run() { diff --git a/dm/tests/load_interrupt/run.sh b/dm/tests/load_interrupt/run.sh 
index 79f8c79c44c..e0e96aebb9c 100755 --- a/dm/tests/load_interrupt/run.sh +++ b/dm/tests/load_interrupt/run.sh @@ -60,8 +60,8 @@ function test_save_checkpoint_failed() { ls $WORK_DIR/worker1/dumped_data.test echo "test_save_checkpoint_failed SUCCESS!" - cleanup_data load_interrupt cleanup_process $* + cleanup_data load_interrupt } function run() { diff --git a/dm/tests/safe_mode/run.sh b/dm/tests/safe_mode/run.sh index 644967cd306..603c855a2c7 100755 --- a/dm/tests/safe_mode/run.sh +++ b/dm/tests/safe_mode/run.sh @@ -55,8 +55,8 @@ function consistency_none() { check_log_contain_with_retry "\[\"enable safe-mode for safe mode exit point, will exit at\"\] \[task=test\] \[unit=\"binlog replication\"\] \[location=\"position: ($name2, $pos2), gtid-set: $gtid2\"\]" $WORK_DIR/worker2/log/dm-worker.log run_sql_source2 "SET @@GLOBAL.SQL_MODE='ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION'" - cleanup_data safe_mode_target cleanup_process $* + cleanup_data safe_mode_target } function check_exit_safe_binlog() { @@ -182,14 +182,14 @@ function safe_mode_recover() { echo "finish running run safe mode recover case $i" ((i += 1)) - cleanup_data safe_mode_target cleanup_process $* + cleanup_data safe_mode_target done } function run() { - consistency_none - safe_mode_recover + # consistency_none + # safe_mode_recover run_sql_file $cur/data/db1.prepare.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 check_contains 'Query OK, 2 rows affected' From 937cdcc34f41ec178615b5e670e1c85f8305d13b Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 20 Jan 2022 11:20:11 +0800 Subject: [PATCH 20/47] fix event ctx --- dm/syncer/syncer.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index a3404ebebf4..4625422c135 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -1997,7 +1997,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } ec := eventContext{ - tctx: s.tctx, + tctx: s.runTCtx, header: e.Header, startLocation: &startLocation, currentLocation: ¤tLocation, From 57ba6161b6809bc95b1162f9fb34e604ee5fc255 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 20 Jan 2022 14:20:15 +0800 Subject: [PATCH 21/47] fix syncer.done --- dm/dm/worker/server.go | 7 +++---- dm/dm/worker/subtask.go | 1 + dm/syncer/syncer.go | 7 +++++-- dm/tests/safe_mode/run.sh | 4 ++-- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index 3c5bca79001..c9f1b33d406 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -441,19 +441,18 @@ func (s *Server) observeSourceBound(ctx context.Context, rev int64) error { } func (s *Server) doClose() { - s.cancel() - // close server in advance, stop receiving source bound and relay bound - s.wg.Wait() - s.Lock() defer s.Unlock() + if s.closed.Load() { return } // stop worker and wait for return + s.cancel() if w := s.getWorker(false); w != nil { w.Stop(true) } + s.wg.Wait() s.closed.Store(true) } diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index 8c17e345aea..3fb0d61b66b 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -381,6 +381,7 @@ func (st *SubTask) closeUnits() { u := st.units[i] st.l.Info("closing unit process", zap.Stringer("unit", cu.Type())) u.Close() + st.l.Info("closing unit done", zap.Stringer("unit", cu.Type())) } } diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 4625422c135..50573a0254f 100644 --- a/dm/syncer/syncer.go +++ 
b/dm/syncer/syncer.go @@ -1501,6 +1501,10 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.runCancel = runCancel s.rundone = make(chan struct{}) s.Unlock() + defer func() { + // close rundone to notify other goroutine that syncer.Run is exited + close(s.rundone) + }() // some initialization that can't be put in Syncer.Init fresh, err := s.IsFreshTask(ctx) @@ -1656,7 +1660,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.tctx.L().Warn("failed to flush safe mode checkpoints when exit task", zap.Error(err2)) } } - close(s.rundone) }() now := time.Now() @@ -3398,7 +3401,7 @@ func (s *Syncer) stopSync() { // when resuming, re-create s.syncer if s.streamerController != nil { - s.streamerController.Close(s.tctx) + s.streamerController.Close(s.runTCtx) } // try to rollback checkpoints, if they already flushed, no effect, this operation should call before close schemaTracker diff --git a/dm/tests/safe_mode/run.sh b/dm/tests/safe_mode/run.sh index 603c855a2c7..ed1d8ff74ff 100755 --- a/dm/tests/safe_mode/run.sh +++ b/dm/tests/safe_mode/run.sh @@ -188,8 +188,8 @@ function safe_mode_recover() { } function run() { - # consistency_none - # safe_mode_recover + consistency_none + safe_mode_recover run_sql_file $cur/data/db1.prepare.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 check_contains 'Query OK, 2 rows affected' From adb7901798d14ef7298522d98424bdfd3e575c60 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 20 Jan 2022 16:48:32 +0800 Subject: [PATCH 22/47] fix kill --- dm/tests/new_relay/run.sh | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/dm/tests/new_relay/run.sh b/dm/tests/new_relay/run.sh index f9beb3a4c7f..c6fa3fb6199 100755 --- a/dm/tests/new_relay/run.sh +++ b/dm/tests/new_relay/run.sh @@ -26,7 +26,9 @@ function test_cant_dail_upstream() { "start-relay -s $SOURCE_ID1 worker1" \ "\"result\": true" 2 - kill_dm_worker + echo "kill dm-worker1" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/pkg/conn/failDBPing=return()" run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml @@ -254,8 +256,9 @@ function run() { check_metric $WORKER1_PORT 'dm_relay_space{type="available"}' 5 0 9223372036854775807 # subtask is preferred to scheduled to another relay worker - pkill -hup -f dm-worker1.toml 2>/dev/null || true - wait_pattern_exit dm-worker1.toml + echo "kill dm-worker1" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 # worker1 is down, worker2 has running relay and sync unit run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "query-status -s $SOURCE_ID1" \ @@ -296,11 +299,12 @@ function run() { [ "$new_relay_log_count_1" -eq 1 ] [ "$new_relay_log_count_2" -eq 1 ] - pkill -hup -f dm-worker1.toml 2>/dev/null || true - wait_pattern_exit dm-worker1.toml - pkill -hup -f dm-worker2.toml 2>/dev/null || true - wait_pattern_exit dm-worker2.toml - + echo "kill dm-worker2" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 + echo "kill dm-worker2" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 # if all relay workers are offline, relay-not-enabled worker should continue to sync run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "query-status -s $SOURCE_ID1" \ From a5acf28b0efe443f7aeb9eba564f0f8758d7de88 Mon Sep 17 00:00:00 2001 From: 
ehco1996 Date: Thu, 20 Jan 2022 17:18:21 +0800 Subject: [PATCH 23/47] addresss comment --- dm/dm/worker/subtask.go | 2 +- dm/tests/checkpoint_transaction/run.sh | 4 ++-- dm/tests/new_relay/run.sh | 12 +++++++++--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index 3fb0d61b66b..99a9d079624 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -387,7 +387,7 @@ func (st *SubTask) closeUnits() { func (st *SubTask) killCurrentUnit() { if st.CurrUnit() != nil { - st.l.Info("kill syncer unit", zap.String("task", st.cfg.Name), zap.Stringer("unit", st.CurrUnit().Type())) + st.l.Info("kill unit", zap.String("task", st.cfg.Name), zap.Stringer("unit", st.CurrUnit().Type())) st.CurrUnit().Kill() } } diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 7198d385a7e..79c6d46ca4d 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -13,11 +13,11 @@ function check_worker_ungraceful_stop_with_retry() { sleep 1 echo "start check_worker_ungraceful_stop_with_retry times: $k" - num=$(grep "kill syncer unit" $WORK_DIR/worker1/log/dm-worker.log | wc -l) + num=$(grep "kill unit" $WORK_DIR/worker1/log/dm-worker.log | wc -l) if [ $num -lt 1 ]; then continue fi - num=$(grep "kill syncer without graceful" $WORK_DIR/worker1/log/dm-worker.log | wc -l) + num=$(grep "kill without graceful" $WORK_DIR/worker1/log/dm-worker.log | wc -l) if [ $num -lt 1 ]; then continue fi diff --git a/dm/tests/new_relay/run.sh b/dm/tests/new_relay/run.sh index c6fa3fb6199..0c856719fc5 100755 --- a/dm/tests/new_relay/run.sh +++ b/dm/tests/new_relay/run.sh @@ -62,7 +62,9 @@ function test_cant_dail_downstream() { "\"result\": true" 2 dmctl_start_task_standalone $cur/conf/dm-task.yaml "--remove-meta" - kill_dm_worker + echo "kill dm-worker1" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 # kill tidb pkill -hup tidb-server 2>/dev/null || true wait_process_exit tidb-server @@ -130,8 +132,12 @@ function test_restart_relay_status() { "list-member -n worker3" \ "relay" 1 - kill_dm_worker - kill_dm_master + echo "kill dm-worker1" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 + echo "kill dm-master" + ps aux | grep dm-master | awk '{print $2}' | xargs kill || true + check_master_port_offline 1 run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT From d9f42f67410d4141e1da9227cb77dbaed52cdddf Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 20 Jan 2022 18:08:02 +0800 Subject: [PATCH 24/47] add log for debug --- dm/tests/new_relay/run.sh | 169 ++++++++++++++++++++------------------ 1 file changed, 87 insertions(+), 82 deletions(-) diff --git a/dm/tests/new_relay/run.sh b/dm/tests/new_relay/run.sh index 0c856719fc5..6eec999f53b 100755 --- a/dm/tests/new_relay/run.sh +++ b/dm/tests/new_relay/run.sh @@ -10,86 +10,9 @@ SQL_RESULT_FILE="$TEST_DIR/sql_res.$TEST_NAME.txt" API_VERSION="v1alpha1" -function test_cant_dail_upstream() { - cleanup_data $TEST_NAME - cleanup_process - - run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml - check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT - run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml - check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT - - cp 
$cur/conf/source1.yaml $WORK_DIR/source1.yaml - dmctl_operate_source create $WORK_DIR/source1.yaml $SOURCE_ID1 - - run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ - "start-relay -s $SOURCE_ID1 worker1" \ - "\"result\": true" 2 - - echo "kill dm-worker1" - ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true - check_port_offline $WORKER1_PORT 20 - - export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/pkg/conn/failDBPing=return()" - run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml - check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT - - # make sure DM-worker doesn't exit - sleep 2 - run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ - "query-status -s $SOURCE_ID1" \ - "injected error" 1 - - export GO_FAILPOINTS="" - cleanup_process - cleanup_data $TEST_NAME -} - -function test_cant_dail_downstream() { - cleanup_data $TEST_NAME - cleanup_process - - run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml - check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT - run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml - check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT - - cp $cur/conf/source1.yaml $WORK_DIR/source1.yaml - dmctl_operate_source create $WORK_DIR/source1.yaml $SOURCE_ID1 - - run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ - "start-relay -s $SOURCE_ID1 worker1" \ - "\"result\": true" 2 - dmctl_start_task_standalone $cur/conf/dm-task.yaml "--remove-meta" - - echo "kill dm-worker1" - ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true - check_port_offline $WORKER1_PORT 20 - # kill tidb - pkill -hup tidb-server 2>/dev/null || true - wait_process_exit tidb-server - - run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml - check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT - - # make sure DM-worker doesn't exit - sleep 2 - run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ - "query-status -s $SOURCE_ID1" \ - "\"relayCatchUpMaster\": true" 1 \ - "dial tcp 127.0.0.1:4000: connect: connection refused" 1 - - # restart tidb - run_tidb_server 4000 $TIDB_PASSWORD - sleep 2 - - cleanup_process - cleanup_data $TEST_NAME -} - function test_restart_relay_status() { - cleanup_data $TEST_NAME cleanup_process + cleanup_data $TEST_NAME run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT @@ -165,11 +88,92 @@ function test_restart_relay_status() { "list-member --worker" \ "relay" 1 \ "bound" 2 + + echo "》》》》》》》》》》》》》》》》》test test_restart_relay_status passed" } -function test_kill_dump_connection() { +function test_cant_dail_upstream() { + cleanup_process cleanup_data $TEST_NAME + + run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml + check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT + run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml + check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT + + cp $cur/conf/source1.yaml $WORK_DIR/source1.yaml + dmctl_operate_source create $WORK_DIR/source1.yaml $SOURCE_ID1 + + run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ + "start-relay -s $SOURCE_ID1 worker1" \ + "\"result\": true" 2 + + echo "kill dm-worker1" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 + + export 
GO_FAILPOINTS="github.com/pingcap/tiflow/dm/pkg/conn/failDBPing=return()" + run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml + check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT + + # make sure DM-worker doesn't exit + sleep 2 + run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ + "query-status -s $SOURCE_ID1" \ + "injected error" 1 + + export GO_FAILPOINTS="" + cleanup_data $TEST_NAME + cleanup_process + echo "》》》》》》》》》》》》》》》》》test test_cant_dail_upstream passed" +} + +function test_cant_dail_downstream() { + cleanup_process + cleanup_data $TEST_NAME + + run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml + check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT + run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml + check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT + + cp $cur/conf/source1.yaml $WORK_DIR/source1.yaml + dmctl_operate_source create $WORK_DIR/source1.yaml $SOURCE_ID1 + + run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ + "start-relay -s $SOURCE_ID1 worker1" \ + "\"result\": true" 2 + dmctl_start_task_standalone $cur/conf/dm-task.yaml "--remove-meta" + + echo "kill dm-worker1" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 + # kill tidb + pkill -hup tidb-server 2>/dev/null || true + wait_process_exit tidb-server + + run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml + check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT + + # make sure DM-worker doesn't exit + sleep 2 + run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ + "query-status -s $SOURCE_ID1" \ + "\"relayCatchUpMaster\": true" 1 \ + "dial tcp 127.0.0.1:4000: connect: connection refused" 1 + + # restart tidb + run_tidb_server 4000 $TIDB_PASSWORD + sleep 2 + + cleanup_data $TEST_NAME + cleanup_process + echo "》》》》》》》》》》》》》》》》》test test_cant_dail_downstream passed" +} + +function test_kill_dump_connection() { cleanup_process + cleanup_data $TEST_NAME run_sql_file $cur/data/db1.prepare.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 check_contains 'Query OK, 2 rows affected' @@ -201,15 +205,17 @@ function test_kill_dump_connection() { run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "query-status -s $SOURCE_ID1" \ "\"relayCatchUpMaster\": true" 1 - - cleanup_process cleanup_data $TEST_NAME + cleanup_process + echo "》》》》》》》》》》》》》》》》》test test_kill_dump_connection passed" + } function run() { test_restart_relay_status test_cant_dail_downstream test_cant_dail_upstream + test_kill_dump_connection export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/relay/ReportRelayLogSpaceInBackground=return(1)" @@ -363,7 +369,6 @@ function run() { check_sync_diff $WORK_DIR $cur/conf/diff_config.toml - test_kill_dump_connection } cleanup_data $TEST_NAME From 31727e0ba93ecf640feeeeb4d9c9885200393498 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 20 Jan 2022 19:22:53 +0800 Subject: [PATCH 25/47] fix test case --- dm/dm/worker/server.go | 3 +- dm/syncer/syncer.go | 10 +++---- dm/tests/checkpoint_transaction/run.sh | 2 +- dm/tests/new_relay/run.sh | 41 ++++++++++++-------------- 4 files changed, 27 insertions(+), 29 deletions(-) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index c9f1b33d406..25d49e85773 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -448,10 +448,10 @@ func (s *Server) doClose() { return } // stop worker and wait for return - s.cancel() 
if w := s.getWorker(false); w != nil { w.Stop(true) } + s.cancel() s.wg.Wait() s.closed.Store(true) } @@ -460,6 +460,7 @@ func (s *Server) doClose() { func (s *Server) Close() { s.doClose() // we should stop current sync first, other wise master may scheduler task on new worker while we are closing s.stopKeepAlive() + s.etcdClient.Close() } // if needLock is false, we should make sure Server has been locked in caller. diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 50573a0254f..56dd95ed709 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -281,8 +281,6 @@ func NewSyncer(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, relay rel syncer.lastCheckpointFlushedTime = time.Time{} syncer.relay = relay syncer.locations = &locationRecorder{} - - syncer.rundone = nil return syncer } @@ -1504,6 +1502,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { defer func() { // close rundone to notify other goroutine that syncer.Run is exited close(s.rundone) + runCancel() }() // some initialization that can't be put in Syncer.Init @@ -1517,6 +1516,8 @@ func (s *Syncer) Run(ctx context.Context) (err error) { return err } } + // we should start this goroutine as soon as possible, because it's the only goroutine that cancel syncer.Run + go s.waitTransactionEndBeforeExit(ctx) var ( flushCheckpoint bool @@ -1627,7 +1628,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { traceSource = fmt.Sprintf("%s.syncer.%s", s.cfg.SourceID, s.cfg.Name) ) - go s.waitTransactionEndBeforeExit(ctx) defer func() { if err1 := recover(); err1 != nil { failpoint.Inject("ExitAfterSaveOnlineDDL", func() { @@ -1847,7 +1847,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // try to re-sync in gtid mode if tryReSync && s.cfg.EnableGTID && utils.IsErrBinlogPurged(err) && s.cfg.AutoFixGTID { time.Sleep(retryTimeout) - err = s.reSyncBinlog(*s.tctx, lastLocation) + err = s.reSyncBinlog(*s.runTCtx, lastLocation) if err != nil { return err } @@ -1993,7 +1993,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if err = s.checkpoint.FlushSafeModeExitPoint(s.runTCtx); err != nil { return err } - if err = s.safeMode.Add(s.tctx, -1); err != nil { + if err = s.safeMode.Add(s.runTCtx, -1); err != nil { return err } } diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 79c6d46ca4d..61eaee06efd 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -17,7 +17,7 @@ function check_worker_ungraceful_stop_with_retry() { if [ $num -lt 1 ]; then continue fi - num=$(grep "kill without graceful" $WORK_DIR/worker1/log/dm-worker.log | wc -l) + num=$(grep "kill syncer without graceful" $WORK_DIR/worker1/log/dm-worker.log | wc -l) if [ $num -lt 1 ]; then continue fi diff --git a/dm/tests/new_relay/run.sh b/dm/tests/new_relay/run.sh index 6eec999f53b..095136b8d27 100755 --- a/dm/tests/new_relay/run.sh +++ b/dm/tests/new_relay/run.sh @@ -13,6 +13,7 @@ API_VERSION="v1alpha1" function test_restart_relay_status() { cleanup_process cleanup_data $TEST_NAME + export GO_FAILPOINTS="" run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT @@ -95,6 +96,7 @@ function test_restart_relay_status() { function test_cant_dail_upstream() { cleanup_process cleanup_data $TEST_NAME + export GO_FAILPOINTS="" run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT 
@@ -122,15 +124,13 @@ function test_cant_dail_upstream() { "query-status -s $SOURCE_ID1" \ "injected error" 1 - export GO_FAILPOINTS="" - cleanup_data $TEST_NAME - cleanup_process echo "》》》》》》》》》》》》》》》》》test test_cant_dail_upstream passed" } function test_cant_dail_downstream() { cleanup_process cleanup_data $TEST_NAME + export GO_FAILPOINTS="" run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT @@ -155,8 +155,6 @@ function test_cant_dail_downstream() { run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT - # make sure DM-worker doesn't exit - sleep 2 run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "query-status -s $SOURCE_ID1" \ "\"relayCatchUpMaster\": true" 1 \ @@ -166,8 +164,6 @@ function test_cant_dail_downstream() { run_tidb_server 4000 $TIDB_PASSWORD sleep 2 - cleanup_data $TEST_NAME - cleanup_process echo "》》》》》》》》》》》》》》》》》test test_cant_dail_downstream passed" } @@ -205,18 +201,12 @@ function test_kill_dump_connection() { run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "query-status -s $SOURCE_ID1" \ "\"relayCatchUpMaster\": true" 1 - cleanup_data $TEST_NAME - cleanup_process echo "》》》》》》》》》》》》》》》》》test test_kill_dump_connection passed" - } -function run() { - test_restart_relay_status - test_cant_dail_downstream - test_cant_dail_upstream - test_kill_dump_connection - +function test_relay_operations() { + cleanup_process + cleanup_data $TEST_NAME export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/relay/ReportRelayLogSpaceInBackground=return(1)" run_sql_file $cur/data/db1.prepare.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 @@ -248,7 +238,7 @@ function run() { "\"worker\": \"worker1\"" 1 \ "\"worker\": \"worker2\"" 1 - # worker1 and worker2 has one realy job and worker3 have none. + # worker1 and worker2 has one relay job and worker3 have none. check_metric $WORKER1_PORT "dm_relay_binlog_file{node=\"relay\"}" 3 0 2 check_metric $WORKER1_PORT "dm_relay_exit_with_error_count" 3 -1 1 check_metric $WORKER2_PORT "dm_relay_binlog_file{node=\"relay\"}" 3 0 2 @@ -260,7 +250,7 @@ function run() { run_sql_file $cur/data/db1.increment.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 - # relay task tranfer to worker1 with no error. + # relay task transfer to worker1 with no error. 
check_metric $WORKER1_PORT "dm_relay_data_corruption" 3 -1 1 check_metric $WORKER1_PORT "dm_relay_read_error_count" 3 -1 1 check_metric $WORKER1_PORT "dm_relay_write_error_count" 3 -1 1 @@ -311,11 +301,11 @@ function run() { [ "$new_relay_log_count_1" -eq 1 ] [ "$new_relay_log_count_2" -eq 1 ] - echo "kill dm-worker2" + echo "kill dm-worker1" ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true check_port_offline $WORKER1_PORT 20 echo "kill dm-worker2" - ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + ps aux | grep dm-worker2 | awk '{print $2}' | xargs kill || true check_port_offline $WORKER1_PORT 20 # if all relay workers are offline, relay-not-enabled worker should continue to sync run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ @@ -338,8 +328,7 @@ function run() { # destroy cluster cleanup_process $* - rm -rf $WORK_DIR - mkdir $WORK_DIR + cleanup_data $TEST_NAME # insert new data run_sql_file $cur/data/db1.increment5.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 @@ -368,7 +357,15 @@ function run() { "\"result\": true" 2 check_sync_diff $WORK_DIR $cur/conf/diff_config.toml + echo "》》》》》》》》》》》》》》》》》test test_relay_operations passed" +} +function run() { + test_relay_operations + test_cant_dail_upstream + test_restart_relay_status + test_cant_dail_downstream + test_kill_dump_connection } cleanup_data $TEST_NAME From 22a7e72d3926ffbc460d643501d7b77870a219f5 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 21 Jan 2022 08:54:43 +0800 Subject: [PATCH 26/47] fix comments --- dm/syncer/syncer.go | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 56dd95ed709..3b8e8870c8d 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -1506,7 +1506,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { }() // some initialization that can't be put in Syncer.Init - fresh, err := s.IsFreshTask(ctx) + fresh, err := s.IsFreshTask(s.runCtx) if err != nil { return err } else if fresh { @@ -1543,7 +1543,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.wg.Add(1) go func() { defer s.wg.Done() - s.checkpointFlushWorker.Run(s.runTCtx) + s.checkpointFlushWorker.Run(s.tctx) }() if flushCheckpoint { @@ -2085,8 +2085,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } } if waitXIDStatus(s.waitXIDJob.Load()) == waitComplete { - // already wait until XID event, we can stop sync now - s.runCancel() + // already wait until XID event, we can stop sync now, s.runcancel will be called in defer func return nil } } @@ -3362,12 +3361,6 @@ func (s *Syncer) Close() { if s.isClosed() { return } - - // wait for s.Run() to finish - if s.rundone != nil { - <-s.rundone - } - s.stopSync() s.closeDBs() s.checkpoint.Close() @@ -3394,6 +3387,10 @@ func (s *Syncer) Kill() { // stopSync stops syncing and rollback checkpoint now it used by Close,kILL and Pause // maybe we can refine the workflow more clear. 
func (s *Syncer) stopSync() { + // wait for s.Run() to finish + if s.rundone != nil { + <-s.rundone + } s.closeJobChans() s.wg.Wait() // wait job workers to return From e01e2ea0a1c5bd70f1c15bab7c0721d786220c6f Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 21 Jan 2022 09:57:20 +0800 Subject: [PATCH 27/47] rename func about sourceWorker --- dm/dm/worker/hub_test.go | 2 +- dm/dm/worker/join.go | 2 +- dm/dm/worker/server.go | 46 +++++----- dm/dm/worker/server_test.go | 30 +++---- dm/dm/worker/source_worker_test.go | 8 +- dm/syncer/syncer.go | 133 +++++++++++++---------------- dm/syncer/syncer_test.go | 4 +- 7 files changed, 106 insertions(+), 119 deletions(-) diff --git a/dm/dm/worker/hub_test.go b/dm/dm/worker/hub_test.go index d76697b1e6a..ca20fd437c3 100644 --- a/dm/dm/worker/hub_test.go +++ b/dm/dm/worker/hub_test.go @@ -20,5 +20,5 @@ import ( func (t *testServer) testConidtionHub(c *C, s *Server) { // test condition hub c.Assert(GetConditionHub(), NotNil) - c.Assert(GetConditionHub().w, DeepEquals, s.getWorker(true)) + c.Assert(GetConditionHub().w, DeepEquals, s.getSourceWorker(true)) } diff --git a/dm/dm/worker/join.go b/dm/dm/worker/join.go index e9d18ba5bbb..a50c47d44fe 100644 --- a/dm/dm/worker/join.go +++ b/dm/dm/worker/join.go @@ -109,7 +109,7 @@ func (s *Server) KeepAlive() { // TODO: report the error. // when lost keepalive, stop the worker without graceful. this is to fix https://github.com/pingcap/tiflow/issues/3737 - err := s.stopWorker("", true, false) + err := s.stopSourceWorker("", true, false) if err != nil { log.L().Error("fail to stop worker", zap.Error(err)) return // return if failed to stop the worker. diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index 25d49e85773..ff7590fd3b9 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -300,7 +300,7 @@ func (s *Server) observeRelayConfig(ctx context.Context, rev int64) error { } rev = rev1 if relaySource == nil { - if w := s.getWorker(true); w != nil && w.startedRelayBySourceCfg { + if w := s.getSourceWorker(true); w != nil && w.startedRelayBySourceCfg { break } log.L().Info("didn't found relay config after etcd retryable error. Will stop relay now") @@ -314,7 +314,7 @@ func (s *Server) observeRelayConfig(ctx context.Context, rev int64) error { s.Lock() defer s.Unlock() - if w := s.getWorker(false); w != nil && w.cfg.SourceID == relaySource.SourceID { + if w := s.getSourceWorker(false); w != nil && w.cfg.SourceID == relaySource.SourceID { // we may face both relay config and subtask bound changed in a compaction error, so here // we check if observeSourceBound has started a worker // TODO: add a test for this situation @@ -325,7 +325,7 @@ func (s *Server) observeRelayConfig(ctx context.Context, rev int64) error { } return nil } - err = s.stopWorker("", false, true) + err = s.stopSourceWorker("", false, true) if err != nil { log.L().Error("fail to stop worker", zap.Error(err)) return err // return if failed to stop the worker. 
@@ -403,7 +403,7 @@ func (s *Server) observeSourceBound(ctx context.Context, rev int64) error { s.Lock() defer s.Unlock() - if w := s.getWorker(false); w != nil && w.cfg.SourceID == bound.Source { + if w := s.getSourceWorker(false); w != nil && w.cfg.SourceID == bound.Source { // we may face both relay config and subtask bound changed in a compaction error, so here // we check if observeRelayConfig has started a worker // TODO: add a test for this situation @@ -414,7 +414,7 @@ func (s *Server) observeSourceBound(ctx context.Context, rev int64) error { } return nil } - err = s.stopWorker("", false, true) + err = s.stopSourceWorker("", false, true) if err != nil { log.L().Error("fail to stop worker", zap.Error(err)) return err // return if failed to stop the worker. @@ -447,8 +447,8 @@ func (s *Server) doClose() { if s.closed.Load() { return } - // stop worker and wait for return - if w := s.getWorker(false); w != nil { + // stop worker and wait for return(we already lock the whole Sever, so no need use lock to get source worker) + if w := s.getSourceWorker(false); w != nil { w.Stop(true) } s.cancel() @@ -464,7 +464,7 @@ func (s *Server) Close() { } // if needLock is false, we should make sure Server has been locked in caller. -func (s *Server) getWorker(needLock bool) *SourceWorker { +func (s *Server) getSourceWorker(needLock bool) *SourceWorker { if needLock { s.Lock() defer s.Unlock() @@ -497,7 +497,7 @@ func (s *Server) setSourceStatus(source string, err error, needLock bool) { defer s.Unlock() } // now setSourceStatus will be concurrently called. skip setting a source status if worker has been closed - if s.getWorker(false) == nil && source != "" { + if s.getSourceWorker(false) == nil && source != "" { return } s.sourceStatus = pb.SourceStatus{ @@ -515,23 +515,23 @@ func (s *Server) setSourceStatus(source string, err error, needLock bool) { // if sourceID is set to "", worker will be closed directly // if sourceID is not "", we will check sourceID with w.cfg.SourceID. 
-func (s *Server) stopWorker(sourceID string, needLock, graceful bool) error { +func (s *Server) stopSourceWorker(sourceID string, needLock, graceful bool) error { if needLock { s.Lock() defer s.Unlock() } - w := s.getWorker(false) - if w == nil { + sw := s.getSourceWorker(false) + if sw == nil { log.L().Warn("worker has not been started, no need to stop", zap.String("source", sourceID)) return nil // no need to stop because not started yet } - if sourceID != "" && w.cfg.SourceID != sourceID { + if sourceID != "" && sw.cfg.SourceID != sourceID { return terror.ErrWorkerSourceNotMatch } s.UpdateKeepAliveTTL(s.cfg.KeepAliveTTL) s.setWorker(nil, false) s.setSourceStatus("", nil, false) - w.Stop(graceful) + sw.Stop(graceful) return nil } @@ -659,7 +659,7 @@ func (s *Server) enableHandleSubtasks(sourceCfg *config.SourceConfig, needLock b func (s *Server) disableHandleSubtasks(source string) error { s.Lock() defer s.Unlock() - w := s.getWorker(false) + w := s.getSourceWorker(false) if w == nil { log.L().Warn("worker has already stopped before DisableHandleSubtasks", zap.String("source", source)) return nil @@ -676,7 +676,7 @@ func (s *Server) disableHandleSubtasks(source string) error { var err error if !w.relayEnabled.Load() { log.L().Info("relay is not enabled after disabling subtask, so stop worker") - err = s.stopWorker(source, false, true) + err = s.stopSourceWorker(source, false, true) } return err } @@ -721,7 +721,7 @@ func (s *Server) enableRelay(sourceCfg *config.SourceConfig, needLock bool) erro func (s *Server) disableRelay(source string) error { s.Lock() defer s.Unlock() - w := s.getWorker(false) + w := s.getSourceWorker(false) if w == nil { log.L().Warn("worker has already stopped before DisableRelay", zap.Any("relaySource", source)) return nil @@ -731,7 +731,7 @@ func (s *Server) disableRelay(source string) error { var err error if !w.subTaskEnabled.Load() { log.L().Info("subtask is not enabled after disabling relay, so stop worker") - err = s.stopWorker(source, false, true) + err = s.stopSourceWorker(source, false, true) } return err } @@ -747,7 +747,7 @@ func (s *Server) QueryStatus(ctx context.Context, req *pb.QueryStatusRequest) (* SourceStatus: &sourceStatus, } - w := s.getWorker(true) + w := s.getSourceWorker(true) if w == nil { log.L().Warn("fail to call QueryStatus, because no mysql source is being handled in the worker") resp.Result = false @@ -769,7 +769,7 @@ func (s *Server) QueryStatus(ctx context.Context, req *pb.QueryStatusRequest) (* // PurgeRelay implements WorkerServer.PurgeRelay. 
func (s *Server) PurgeRelay(ctx context.Context, req *pb.PurgeRelayRequest) (*pb.CommonWorkerResponse, error) { log.L().Info("", zap.String("request", "PurgeRelay"), zap.Stringer("payload", req)) - w := s.getWorker(true) + w := s.getSourceWorker(true) if w == nil { log.L().Warn("fail to call StartSubTask, because no mysql source is being handled in the worker") return makeCommonWorkerResponse(terror.ErrWorkerNoStart.Generate()), nil @@ -786,7 +786,7 @@ func (s *Server) PurgeRelay(ctx context.Context, req *pb.PurgeRelayRequest) (*pb func (s *Server) OperateSchema(ctx context.Context, req *pb.OperateWorkerSchemaRequest) (*pb.CommonWorkerResponse, error) { log.L().Info("", zap.String("request", "OperateSchema"), zap.Stringer("payload", req)) - w := s.getWorker(true) + w := s.getSourceWorker(true) if w == nil { log.L().Warn("fail to call OperateSchema, because no mysql source is being handled in the worker") return makeCommonWorkerResponse(terror.ErrWorkerNoStart.Generate()), nil @@ -813,7 +813,7 @@ func (s *Server) getOrStartWorker(cfg *config.SourceConfig, needLock bool) (*Sou defer s.Unlock() } - if w := s.getWorker(false); w != nil { + if w := s.getSourceWorker(false); w != nil { if w.cfg.SourceID == cfg.SourceID { log.L().Info("mysql source is being handled", zap.String("sourceID", s.worker.cfg.SourceID)) return w, nil @@ -904,7 +904,7 @@ func getMinLocForSubTask(ctx context.Context, subTaskCfg config.SubTaskConfig) ( func (s *Server) HandleError(ctx context.Context, req *pb.HandleWorkerErrorRequest) (*pb.CommonWorkerResponse, error) { log.L().Info("", zap.String("request", "HandleError"), zap.Stringer("payload", req)) - w := s.getWorker(true) + w := s.getSourceWorker(true) if w == nil { log.L().Warn("fail to call HandleError, because no mysql source is being handled in the worker") return makeCommonWorkerResponse(terror.ErrWorkerNoStart.Generate()), nil diff --git a/dm/dm/worker/server_test.go b/dm/dm/worker/server_test.go index e9a578ddc82..fdedd1a77da 100644 --- a/dm/dm/worker/server_test.go +++ b/dm/dm/worker/server_test.go @@ -209,7 +209,7 @@ func (t *testServer) TestServer(c *C) { _, err = ha.DeleteSubTaskStage(s.etcdClient, ha.NewSubTaskStage(pb.Stage_Stopped, sourceCfg.SourceID, subtaskCfg.Name)) c.Assert(err, IsNil) c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool { - return s.getWorker(true).subTaskHolder.findSubTask(subtaskCfg.Name) == nil + return s.getSourceWorker(true).subTaskHolder.findSubTask(subtaskCfg.Name) == nil }), IsTrue) dupServer := NewServer(cfg) @@ -338,13 +338,13 @@ func (t *testServer) TestHandleSourceBoundAfterError(c *C) { _, err = ha.PutSourceCfg(etcdCli, sourceCfg) c.Assert(err, IsNil) c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool { - return s.getWorker(true) != nil + return s.getSourceWorker(true) != nil }), IsTrue) _, err = ha.DeleteSourceBound(etcdCli, s.cfg.Name) c.Assert(err, IsNil) c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool { - return s.getWorker(true) == nil + return s.getSourceWorker(true) == nil }), IsTrue) } @@ -414,19 +414,19 @@ func (t *testServer) TestWatchSourceBoundEtcdCompact(c *C) { }() // step 4.1: should stop the running worker, source bound has been deleted, should stop this worker c.Assert(utils.WaitSomething(20, 100*time.Millisecond, func() bool { - return s.getWorker(true) == nil + return s.getSourceWorker(true) == nil }), IsTrue) // step 4.2: put a new source bound, source should be started _, err = ha.PutSourceBound(etcdCli, sourceBound) c.Assert(err, IsNil) 
c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool { - return s.getWorker(true) != nil + return s.getSourceWorker(true) != nil }), IsTrue) - cfg2 := s.getWorker(true).cfg + cfg2 := s.getSourceWorker(true).cfg c.Assert(cfg2, DeepEquals, sourceCfg) cancel1() wg.Wait() - c.Assert(s.stopWorker(sourceCfg.SourceID, true, true), IsNil) + c.Assert(s.stopSourceWorker(sourceCfg.SourceID, true, true), IsNil) // step 5: start observeSourceBound from compacted revision again, should start worker ctx2, cancel2 := context.WithCancel(ctx) wg.Add(1) @@ -435,9 +435,9 @@ func (t *testServer) TestWatchSourceBoundEtcdCompact(c *C) { c.Assert(s.observeSourceBound(ctx2, startRev), IsNil) }() c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool { - return s.getWorker(true) != nil + return s.getSourceWorker(true) != nil }), IsTrue) - cfg2 = s.getWorker(true).cfg + cfg2 = s.getSourceWorker(true).cfg c.Assert(cfg2, DeepEquals, sourceCfg) cancel2() wg.Wait() @@ -481,13 +481,13 @@ func (t *testServer) testOperateWorker(c *C, s *Server, dir string, start bool) c.Assert(err, IsNil) // worker should be started and without error c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool { - w := s.getWorker(true) + w := s.getSourceWorker(true) return w != nil && !w.closed.Load() }), IsTrue) c.Assert(s.getSourceStatus(true).Result, IsNil) } else { // worker should be started before stopped - w := s.getWorker(true) + w := s.getSourceWorker(true) c.Assert(w, NotNil) c.Assert(w.closed.Load(), IsFalse) _, err := ha.DeleteRelayConfig(s.etcdClient, w.name) @@ -496,7 +496,7 @@ func (t *testServer) testOperateWorker(c *C, s *Server, dir string, start bool) c.Assert(err, IsNil) // worker should be closed and without error c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool { - currentWorker := s.getWorker(true) + currentWorker := s.getSourceWorker(true) return currentWorker == nil && w.closed.Load() }), IsTrue) c.Assert(s.getSourceStatus(true).Result, IsNil) @@ -507,7 +507,7 @@ func (t *testServer) testRetryConnectMaster(c *C, s *Server, etcd *embed.Etcd, d etcd.Close() time.Sleep(6 * time.Second) // When worker server fail to keepalive with etcd, server should close its worker - c.Assert(s.getWorker(true), IsNil) + c.Assert(s.getSourceWorker(true), IsNil) c.Assert(s.getSourceStatus(true).Result, IsNil) ETCD, err := createMockETCD(dir, "http://"+hostName) c.Assert(err, IsNil) @@ -552,9 +552,9 @@ func (t *testServer) testSubTaskRecover(c *C, s *Server, dir string) { func (t *testServer) testStopWorkerWhenLostConnect(c *C, s *Server, etcd *embed.Etcd) { etcd.Close() c.Assert(utils.WaitSomething(int(defaultKeepAliveTTL+3), time.Second, func() bool { - return s.getWorker(true) == nil + return s.getSourceWorker(true) == nil }), IsTrue) - c.Assert(s.getWorker(true), IsNil) + c.Assert(s.getSourceWorker(true), IsNil) } func (t *testServer) TestGetMinLocInAllSubTasks(c *C) { diff --git a/dm/dm/worker/source_worker_test.go b/dm/dm/worker/source_worker_test.go index 770b7ec7983..4768e24dd4d 100644 --- a/dm/dm/worker/source_worker_test.go +++ b/dm/dm/worker/source_worker_test.go @@ -197,11 +197,11 @@ func (t *testServer2) TestTaskAutoResume(c *C) { c.Assert(err, IsNil) subtaskCfg.Mode = "full" subtaskCfg.Timezone = "UTC" - c.Assert(s.getWorker(true).StartSubTask(&subtaskCfg, pb.Stage_Running, true), IsNil) + c.Assert(s.getSourceWorker(true).StartSubTask(&subtaskCfg, pb.Stage_Running, true), IsNil) // check task in paused state c.Assert(utils.WaitSomething(100, 100*time.Millisecond, func() 
bool { - subtaskStatus, _, _ := s.getWorker(true).QueryStatus(context.Background(), taskName) + subtaskStatus, _, _ := s.getSourceWorker(true).QueryStatus(context.Background(), taskName) for _, st := range subtaskStatus { if st.Name == taskName && st.Stage == pb.Stage_Paused { return true @@ -212,7 +212,7 @@ func (t *testServer2) TestTaskAutoResume(c *C) { //nolint:errcheck failpoint.Disable("github.com/pingcap/tiflow/dm/dumpling/dumpUnitProcessWithError") - rtsc, ok := s.getWorker(true).taskStatusChecker.(*realTaskStatusChecker) + rtsc, ok := s.getSourceWorker(true).taskStatusChecker.(*realTaskStatusChecker) c.Assert(ok, IsTrue) defer func() { // close multiple time @@ -222,7 +222,7 @@ func (t *testServer2) TestTaskAutoResume(c *C) { // check task will be auto resumed c.Assert(utils.WaitSomething(10, 100*time.Millisecond, func() bool { - sts, _, _ := s.getWorker(true).QueryStatus(context.Background(), taskName) + sts, _, _ := s.getSourceWorker(true).QueryStatus(context.Background(), taskName) for _, st := range sts { if st.Name == taskName && st.Stage == pb.Stage_Running { return true diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 3b8e8870c8d..7e53aa7c8c3 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -131,7 +131,7 @@ type Syncer struct { // this is used for some background tasks s.Run that need logger it share same lifecycle with runCtx. runTCtx *tcontext.Context // this ctx only used for logger. runCancel context.CancelFunc - rundone chan struct{} // this will be closed when syncer.Run exit + runWg sync.WaitGroup // control all goroutines in S.Run cfg *config.SubTaskConfig syncCfg replication.BinlogSyncerConfig @@ -144,7 +144,6 @@ type Syncer struct { binlogType BinlogType streamerController *StreamerController - wg sync.WaitGroup // counts goroutines jobWg sync.WaitGroup // counts ddl/flush/asyncFlush job in-flight in s.dmlJobCh and s.ddlJobCh schemaTracker *schema.Tracker @@ -285,11 +284,9 @@ func NewSyncer(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, relay rel } func (s *Syncer) newJobChans() { - s.closeJobChans() chanSize := calculateChanSize(s.cfg.QueueSize, s.cfg.WorkerCount, s.cfg.Compact) s.dmlJobCh = make(chan *job, chanSize) s.ddlJobCh = make(chan *job, s.cfg.QueueSize) - s.checkpointFlushWorker.input = make(chan *checkpointFlushTask, 16) s.jobsClosed.Store(false) } @@ -301,7 +298,6 @@ func (s *Syncer) closeJobChans() { } close(s.dmlJobCh) close(s.ddlJobCh) - s.checkpointFlushWorker.Close() s.jobsClosed.Store(true) } @@ -436,13 +432,6 @@ func (s *Syncer) Init(ctx context.Context) (err error) { } } } - s.checkpointFlushWorker = &checkpointFlushWorker{ - input: nil, // will be created in s.reset() - cp: s.checkpoint, - execError: &s.execError, - afterFlushFn: s.afterFlushCheckpoint, - addCountFunc: s.addCount, - } // when Init syncer, set active relay log info err = s.setInitActiveRelayLog(ctx) @@ -679,8 +668,6 @@ func (s *Syncer) Process(ctx context.Context, pr chan pb.ProcessResult) { // cancel goroutines created in s.Run cancel() } - s.closeJobChans() // Run returned, all jobs sent, we can close s.jobs - s.wg.Wait() // wait for sync goroutine to return close(runFatalChan) // Run returned, all potential fatal sent to s.runFatalChan wg.Wait() // wait for receive all fatal from s.runFatalChan @@ -1251,8 +1238,8 @@ func (s *Syncer) logAndClearFilteredStatistics() { } // DDL synced one by one, so we only need to process one DDL at a time. 
-func (s *Syncer) syncDDL(tctx *tcontext.Context, queueBucket string, db *dbconn.DBConn, ddlJobChan chan *job) { - defer s.wg.Done() +func (s *Syncer) syncDDL(queueBucket string, db *dbconn.DBConn, ddlJobChan chan *job) { + defer s.runWg.Done() var err error for { @@ -1276,12 +1263,12 @@ func (s *Syncer) syncDDL(tctx *tcontext.Context, queueBucket string, db *dbconn. shardPessimistOp = s.pessimist.PendingOperation() if shardPessimistOp != nil && !shardPessimistOp.Exec { ignore = true - tctx.L().Info("ignore shard DDLs in pessimistic shard mode", zap.Strings("ddls", ddlJob.ddls)) + s.tctx.L().Info("ignore shard DDLs in pessimistic shard mode", zap.Strings("ddls", ddlJob.ddls)) } case config.ShardOptimistic: if len(ddlJob.ddls) == 0 { ignore = true - tctx.L().Info("ignore shard DDLs in optimistic mode", zap.Stringer("info", s.optimist.PendingInfo())) + s.tctx.L().Info("ignore shard DDLs in optimistic mode", zap.Stringer("info", s.optimist.PendingInfo())) } } @@ -1293,9 +1280,9 @@ func (s *Syncer) syncDDL(tctx *tcontext.Context, queueBucket string, db *dbconn. if !ignore { var affected int - affected, err = db.ExecuteSQLWithIgnore(tctx, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) + affected, err = db.ExecuteSQLWithIgnore(s.runTCtx, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) if err != nil { - err = s.handleSpecialDDLError(tctx, err, ddlJob.ddls, affected, db) + err = s.handleSpecialDDLError(s.runTCtx, err, ddlJob.ddls, affected, db) err = terror.WithScope(err, terror.ScopeDownstream) } } @@ -1329,7 +1316,7 @@ func (s *Syncer) syncDDL(tctx *tcontext.Context, queueBucket string, db *dbconn. switch { case shardInfo == nil: // no need to do the shard DDL handle for `CREATE DATABASE/TABLE` now. - tctx.L().Warn("skip shard DDL handle in pessimistic shard mode", zap.Strings("ddl", ddlJob.ddls)) + s.tctx.L().Warn("skip shard DDL handle in pessimistic shard mode", zap.Strings("ddl", ddlJob.ddls)) case shardPessimistOp == nil: err = terror.ErrWorkerDDLLockOpNotFound.Generate(shardInfo) default: @@ -1342,7 +1329,7 @@ func (s *Syncer) syncDDL(tctx *tcontext.Context, queueBucket string, db *dbconn. // no need to do the shard DDL handle for `DROP DATABASE/TABLE` now. // but for `CREATE DATABASE` and `ALTER DATABASE` we execute it to the downstream directly without `shardInfo`. if ignore { // actually ignored. - tctx.L().Warn("skip shard DDL handle in optimistic shard mode", zap.Strings("ddl", ddlJob.ddls)) + s.tctx.L().Warn("skip shard DDL handle in optimistic shard mode", zap.Strings("ddl", ddlJob.ddls)) } case s.optimist.PendingOperation() == nil: err = terror.ErrWorkerDDLLockOpNotFound.Generate(shardInfo) @@ -1401,7 +1388,7 @@ func (s *Syncer) fatalFunc(job *job, err error) { // DML synced with causality. 
func (s *Syncer) syncDML() { - defer s.wg.Done() + defer s.runWg.Done() dmlJobCh := s.dmlJobCh if s.cfg.Compact { @@ -1415,7 +1402,8 @@ func (s *Syncer) syncDML() { } } -func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { +func (s *Syncer) waitBeforeRunExit(ctx context.Context) { + defer s.runWg.Done() failpoint.Inject("checkCheckpointInMiddleOfTransaction", func() { s.tctx.L().Info("incr maxPauseOrStopWaitTime time ") maxPauseOrStopWaitTime = time.Minute * 10 @@ -1436,7 +1424,7 @@ func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { s.waitTransactionLock.Unlock() select { case <-s.runCtx.Done(): - s.tctx.L().Info("run ctx done by waitComplete") + s.tctx.L().Info("syncer run exit so runCtx done") case <-time.After(maxPauseOrStopWaitTime): s.tctx.L().Info("wait transaction end timeout, exit now") s.runCancel() @@ -1448,7 +1436,7 @@ func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { } func (s *Syncer) updateTSOffsetCronJob(ctx context.Context) { - defer s.wg.Done() + defer s.runWg.Done() // temporarily hard code there. if this metrics works well add this to config file. ticker := time.NewTicker(time.Minute * 10) defer ticker.Stop() @@ -1465,7 +1453,7 @@ func (s *Syncer) updateTSOffsetCronJob(ctx context.Context) { } func (s *Syncer) updateLagCronJob(ctx context.Context) { - defer s.wg.Done() + defer s.runWg.Done() // temporarily hard code there. if this metrics works well add this to config file. ticker := time.NewTicker(time.Millisecond * 100) defer ticker.Stop() @@ -1491,20 +1479,37 @@ func (s *Syncer) updateTSOffset(ctx context.Context) error { // Run starts running for sync, we should guarantee it can rerun when paused. func (s *Syncer) Run(ctx context.Context) (err error) { - runCtx, runCancel := context.WithCancel(context.Background()) - runTCtx := tcontext.NewContext(runCtx, s.tctx.L()) s.Lock() - s.runCtx = runCtx - s.runTCtx = runTCtx - s.runCancel = runCancel - s.rundone = make(chan struct{}) + s.runCtx, s.runCancel = context.WithCancel(context.Background()) + s.runTCtx = tcontext.NewContext(s.runCtx, s.tctx.L()) + s.checkpointFlushWorker = &checkpointFlushWorker{ + input: make(chan *checkpointFlushTask, 16), + cp: s.checkpoint, + execError: &s.execError, + afterFlushFn: s.afterFlushCheckpoint, + addCountFunc: s.addCount, + } s.Unlock() defer func() { - // close rundone to notify other goroutine that syncer.Run is exited - close(s.rundone) - runCancel() + s.tctx.L().Info("in .... 
syncer run exit...........................................") + s.runCancel() + s.closeJobChans() + s.checkpointFlushWorker.Close() + s.runWg.Wait() // wait for all syncer.Run goroutine exit + s.tctx.L().Info("syncer run exit...........................................") + }() + + // we should start this goroutine as soon as possible, because it's the only goroutine that cancel syncer.Run + s.runWg.Add(1) + go func() { + s.waitBeforeRunExit(ctx) }() + // before sync run, we get the ts offset from upstream first + if utErr := s.updateTSOffset(ctx); utErr != nil { + return utErr + } + // some initialization that can't be put in Syncer.Init fresh, err := s.IsFreshTask(s.runCtx) if err != nil { @@ -1516,8 +1521,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { return err } } - // we should start this goroutine as soon as possible, because it's the only goroutine that cancel syncer.Run - go s.waitTransactionEndBeforeExit(ctx) var ( flushCheckpoint bool @@ -1539,13 +1542,23 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } else { cleanDumpFile = false } + + s.runWg.Add(1) + // start syncDML worker first because if there is no dml worker checkpointFlushWorker will blocked forever + go s.syncDML() + s.runWg.Add(1) // start flush checkpoints worker. this worker mast start before s.flushCheckPoints() - s.wg.Add(1) go func() { - defer s.wg.Done() + defer s.runWg.Done() + // also need to use a ctx different s.runCtx, checkpointFlushWorker worker will closed in the first defer s.checkpointFlushWorker.Run(s.tctx) }() - + s.runWg.Add(1) + go s.syncDDL(adminQueueName, s.ddlDBConn, s.ddlJobCh) + s.runWg.Add(1) + go s.updateLagCronJob(s.runCtx) + s.runWg.Add(1) + go s.updateTSOffsetCronJob(s.runCtx) if flushCheckpoint { if err = s.flushCheckPoints(); err != nil { s.tctx.L().Warn("fail to flush checkpoints when starting task", zap.Error(err)) @@ -1570,11 +1583,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { utils.OsExit(1) }) - // before sync run, we get the ts offset from upstream first - if utErr := s.updateTSOffset(ctx); utErr != nil { - return utErr - } - // startLocation is the start location for current received event // currentLocation is the end location for current received event (End_log_pos in `show binlog events` for mysql) // lastLocation is the end location for last received (ROTATE / QUERY / XID) event @@ -1597,20 +1605,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { return terror.Annotate(err, "fail to restart streamer controller") } } - - // all pre check passed, we can start all background jobs, they all use s.runCancel to cancel - s.wg.Add(1) - go s.updateTSOffsetCronJob(s.runCtx) - - s.wg.Add(1) - go s.syncDML() - - s.wg.Add(1) - go s.syncDDL(s.runTCtx, adminQueueName, s.ddlDBConn, s.ddlJobCh) - - s.wg.Add(1) - go s.updateLagCronJob(s.runCtx) - // syncing progress with sharding DDL group // 1. use the global streamer to sync regular binlog events // 2. 
sharding DDL synced for some sharding groups @@ -1628,6 +1622,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { traceSource = fmt.Sprintf("%s.syncer.%s", s.cfg.SourceID, s.cfg.Name) ) + // this is second defer func in syncer.Run so in this time checkpointFlushWorker are still running defer func() { if err1 := recover(); err1 != nil { failpoint.Inject("ExitAfterSaveOnlineDDL", func() { @@ -3361,7 +3356,7 @@ func (s *Syncer) Close() { if s.isClosed() { return } - s.stopSync() + s.stopStreamAndRollbackCheckpoint() s.closeDBs() s.checkpoint.Close() if err := s.schemaTracker.Close(); err != nil { @@ -3384,21 +3379,13 @@ func (s *Syncer) Kill() { s.Close() } -// stopSync stops syncing and rollback checkpoint now it used by Close,kILL and Pause -// maybe we can refine the workflow more clear. -func (s *Syncer) stopSync() { - // wait for s.Run() to finish - if s.rundone != nil { - <-s.rundone - } - s.closeJobChans() - s.wg.Wait() // wait job workers to return - +// stopStreamAndRollbackCheckpoint stop stream and rollback checkpoint now it used by Close() and Pause(). +func (s *Syncer) stopStreamAndRollbackCheckpoint() { // before re-write workflow for s.syncer, simply close it // when resuming, re-create s.syncer if s.streamerController != nil { - s.streamerController.Close(s.runTCtx) + s.streamerController.Close(s.tctx) } // try to rollback checkpoints, if they already flushed, no effect, this operation should call before close schemaTracker @@ -3423,7 +3410,7 @@ func (s *Syncer) Pause() { s.tctx.L().Warn("try to pause, but already closed") return } - s.stopSync() + s.stopStreamAndRollbackCheckpoint() } // Resume resumes the paused process. diff --git a/dm/syncer/syncer_test.go b/dm/syncer/syncer_test.go index c4347e18ea6..6dac7315648 100644 --- a/dm/syncer/syncer_test.go +++ b/dm/syncer/syncer_test.go @@ -741,10 +741,10 @@ func (s *testSyncerSuite) TestcheckpointID(c *C) { func (s *testSyncerSuite) TestRun(c *C) { // 1. run syncer with column mapping - // 2. execute some sqls which will trigger casuality + // 2. execute some sqls which will trigger causality // 3. check the generated jobs // 4. update config, add route rules, and update syncer - // 5. execute somes sqls and then check jobs generated + // 5. execute some sqls and then check jobs generated db, mock, err := sqlmock.New() c.Assert(err, IsNil) From 06d4363ee09f63e9f69d69c02236c0bf12c5d0d1 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 21 Jan 2022 14:19:29 +0800 Subject: [PATCH 28/47] fix comments --- dm/syncer/dml_worker.go | 2 +- dm/syncer/syncer.go | 22 ++++++++++++---------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index edbcf4bd1f3..7dfcccc2254 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -75,7 +75,7 @@ func dmlWorkerWrap(inCh chan *job, syncer *Syncer) chan *job { fatalFunc: syncer.fatalFunc, lagFunc: syncer.updateReplicationJobTS, addCountFunc: syncer.addCount, - runTCtx: syncer.runTCtx, + runTCtx: syncer.runSyncTCtx, toDBConns: syncer.toDBConns, inCh: inCh, flushCh: make(chan *job), diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 5520cdefe99..fa80e0afca8 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -126,12 +126,14 @@ type Syncer struct { tctx *tcontext.Context // this ctx only used for logger. 
// this ctx is a background ctx and was initialized in s.Run, it is used for some background tasks in s.Run - // when this ctx cancelled, syncer will shuts down all background running jobs and not wait transaction end + // when this ctx cancelled, syncer will shutdown all background running jobs and not wait transaction end runCtx context.Context // this is used for some background tasks s.Run that need logger it share same lifecycle with runCtx. - runTCtx *tcontext.Context // this ctx only used for logger. - runCancel context.CancelFunc - runWg sync.WaitGroup // control all goroutines in S.Run + runTCtx *tcontext.Context + runCancel context.CancelFunc + runSyncTCtx *tcontext.Context // this ctx only used for syncDML and syncDDL and only cancelled when ungraceful stop + runSyncCancel context.CancelFunc + runWg sync.WaitGroup // control all goroutines started in S.Run cfg *config.SubTaskConfig syncCfg replication.BinlogSyncerConfig @@ -1284,9 +1286,9 @@ func (s *Syncer) syncDDL(queueBucket string, db *dbconn.DBConn, ddlJobChan chan if !ignore { var affected int - affected, err = db.ExecuteSQLWithIgnore(s.runTCtx, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) + affected, err = db.ExecuteSQLWithIgnore(s.runSyncTCtx, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) if err != nil { - err = s.handleSpecialDDLError(s.runTCtx, err, ddlJob.ddls, affected, db) + err = s.handleSpecialDDLError(s.runSyncTCtx, err, ddlJob.ddls, affected, db) err = terror.WithScope(err, terror.ScopeDownstream) } } @@ -1435,7 +1437,6 @@ func (s *Syncer) waitBeforeRunExit(ctx context.Context) { } case <-s.runCtx.Done(): // when no graceful stop, run ctx will canceled first. s.tctx.L().Info("received ungraceful exit ctx, exit now") - s.runCancel() } } @@ -1486,6 +1487,8 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.Lock() s.runCtx, s.runCancel = context.WithCancel(context.Background()) s.runTCtx = tcontext.NewContext(s.runCtx, s.tctx.L()) + syncCtx, syncCancel := context.WithCancel(context.Background()) + s.runSyncTCtx, s.runSyncCancel = tcontext.NewContext(syncCtx, s.tctx.L()), syncCancel s.checkpointFlushWorker = &checkpointFlushWorker{ input: make(chan *checkpointFlushTask, 16), cp: s.checkpoint, @@ -1495,12 +1498,10 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } s.Unlock() defer func() { - s.tctx.L().Info("in .... syncer run exit...........................................") s.runCancel() s.closeJobChans() s.checkpointFlushWorker.Close() - s.runWg.Wait() // wait for all syncer.Run goroutine exit - s.tctx.L().Info("syncer run exit...........................................") + s.runWg.Wait() }() // we should start this goroutine as soon as possible, because it's the only goroutine that cancel syncer.Run @@ -3382,6 +3383,7 @@ func (s *Syncer) Close() { func (s *Syncer) Kill() { s.tctx.L().Warn("kill syncer without graceful") s.runCancel() + s.runSyncCancel() s.Close() } From 7a39654024bcd27cd9179faaba68603a09c52a8b Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 25 Jan 2022 10:52:16 +0800 Subject: [PATCH 29/47] merge one ctx --- dm/dm/worker/server.go | 10 ++++---- dm/syncer/syncer.go | 56 +++++++++++++++++++++--------------------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index ff7590fd3b9..2b3a0e77c3e 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -458,7 +458,7 @@ func (s *Server) doClose() { // Close close the RPC server, this function can be called multiple times. 
func (s *Server) Close() { - s.doClose() // we should stop current sync first, other wise master may scheduler task on new worker while we are closing + s.doClose() // we should stop current sync first, otherwise master may schedule task on new worker while we are closing s.stopKeepAlive() s.etcdClient.Close() } @@ -520,18 +520,18 @@ func (s *Server) stopSourceWorker(sourceID string, needLock, graceful bool) erro s.Lock() defer s.Unlock() } - sw := s.getSourceWorker(false) - if sw == nil { + w := s.getSourceWorker(false) + if w == nil { log.L().Warn("worker has not been started, no need to stop", zap.String("source", sourceID)) return nil // no need to stop because not started yet } - if sourceID != "" && sw.cfg.SourceID != sourceID { + if sourceID != "" && w.cfg.SourceID != sourceID { return terror.ErrWorkerSourceNotMatch } s.UpdateKeepAliveTTL(s.cfg.KeepAliveTTL) s.setWorker(nil, false) s.setSourceStatus("", nil, false) - sw.Stop(graceful) + w.Stop(graceful) return nil } diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index fa80e0afca8..26eed10567a 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -125,15 +125,15 @@ type Syncer struct { tctx *tcontext.Context // this ctx only used for logger. - // this ctx is a background ctx and was initialized in s.Run, it is used for some background tasks in s.Run + // this ctx derives from a background ctx and was initialized in s.Run, it is used for some background tasks in s.Run // when this ctx cancelled, syncer will shutdown all background running jobs and not wait transaction end - runCtx context.Context - // this is used for some background tasks s.Run that need logger it share same lifecycle with runCtx. - runTCtx *tcontext.Context - runCancel context.CancelFunc - runSyncTCtx *tcontext.Context // this ctx only used for syncDML and syncDDL and only cancelled when ungraceful stop + runCtx *tcontext.Context + runCancel context.CancelFunc + // this ctx only used for syncDML and syncDDL and only cancelled when ungraceful stop + runSyncTCtx *tcontext.Context runSyncCancel context.CancelFunc - runWg sync.WaitGroup // control all goroutines started in S.Run + // control all goroutines that started in S.Run + runWg sync.WaitGroup cfg *config.SubTaskConfig syncCfg replication.BinlogSyncerConfig @@ -1429,13 +1429,13 @@ func (s *Syncer) waitBeforeRunExit(ctx context.Context) { s.waitXIDJob.Store(int64(waiting)) s.waitTransactionLock.Unlock() select { - case <-s.runCtx.Done(): + case <-s.runCtx.Ctx.Done(): s.tctx.L().Info("syncer run exit so runCtx done") case <-time.After(maxPauseOrStopWaitTime): s.tctx.L().Info("wait transaction end timeout, exit now") s.runCancel() } - case <-s.runCtx.Done(): // when no graceful stop, run ctx will canceled first. + case <-s.runCtx.Ctx.Done(): // when no graceful stop, run ctx will canceled first. s.tctx.L().Info("received ungraceful exit ctx, exit now") } } @@ -1485,8 +1485,8 @@ func (s *Syncer) updateTSOffset(ctx context.Context) error { // Run starts running for sync, we should guarantee it can rerun when paused. 
func (s *Syncer) Run(ctx context.Context) (err error) { s.Lock() - s.runCtx, s.runCancel = context.WithCancel(context.Background()) - s.runTCtx = tcontext.NewContext(s.runCtx, s.tctx.L()) + runCtx, runCancel := context.WithCancel(context.Background()) + s.runCtx, s.runCancel = tcontext.NewContext(runCtx, s.tctx.L()), runCancel syncCtx, syncCancel := context.WithCancel(context.Background()) s.runSyncTCtx, s.runSyncCancel = tcontext.NewContext(syncCtx, s.tctx.L()), syncCancel s.checkpointFlushWorker = &checkpointFlushWorker{ @@ -1516,7 +1516,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } // some initialization that can't be put in Syncer.Init - fresh, err := s.IsFreshTask(s.runCtx) + fresh, err := s.IsFreshTask(s.runCtx.Ctx) if err != nil { return err } else if fresh { @@ -1532,7 +1532,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { delLoadTask bool cleanDumpFile = s.cfg.CleanDumpFile ) - flushCheckpoint, err = s.adjustGlobalPointGTID(s.runTCtx) + flushCheckpoint, err = s.adjustGlobalPointGTID(s.runCtx) if err != nil { return err } @@ -1552,18 +1552,18 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // start syncDML worker first because if there is no dml worker checkpointFlushWorker will blocked forever go s.syncDML() s.runWg.Add(1) - // start flush checkpoints worker. this worker mast start before s.flushCheckPoints() + // start flush checkpoints worker. this worker must start before s.flushCheckPoints() go func() { defer s.runWg.Done() - // also need to use a ctx different s.runCtx, checkpointFlushWorker worker will closed in the first defer + // also need to use a ctx different s.runCtx, checkpointFlushWorker worker will be closed in the first defer s.checkpointFlushWorker.Run(s.tctx) }() s.runWg.Add(1) go s.syncDDL(adminQueueName, s.ddlDBConn, s.ddlJobCh) s.runWg.Add(1) - go s.updateLagCronJob(s.runCtx) + go s.updateLagCronJob(s.runCtx.Ctx) s.runWg.Add(1) - go s.updateTSOffsetCronJob(s.runCtx) + go s.updateTSOffsetCronJob(s.runCtx.Ctx) if flushCheckpoint { if err = s.flushCheckPoints(); err != nil { s.tctx.L().Warn("fail to flush checkpoints when starting task", zap.Error(err)) @@ -1605,7 +1605,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if s.streamerController.IsClosed() { s.locations.reset(lastLocation) - err = s.streamerController.Start(s.runTCtx, lastLocation) + err = s.streamerController.Start(s.runCtx, lastLocation) if err != nil { return terror.Annotate(err, "fail to restart streamer controller") } @@ -1676,7 +1676,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // if we start syncer at an early position, database must bear a period of inconsistent state, // it's eventual consistency. 
s.safeMode = sm.NewSafeMode() - s.enableSafeModeInitializationPhase(s.runTCtx) + s.enableSafeModeInitializationPhase(s.runCtx) closeShardingResync := func() error { if shardingReSync == nil { @@ -1715,7 +1715,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } for i := 0; i < n; { - e, err1 := s.getEvent(s.runTCtx, currentLocation) + e, err1 := s.getEvent(s.runCtx, currentLocation) if err1 != nil { return err } @@ -1766,7 +1766,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // if suffix>0, we are replacing error s.isReplacingOrInjectingErr = currentLocation.Suffix != 0 s.locations.reset(shardingReSync.currLocation) - err = s.streamerController.RedirectStreamer(s.runTCtx, shardingReSync.currLocation) + err = s.streamerController.RedirectStreamer(s.runCtx, shardingReSync.currLocation) if err != nil { return err } @@ -1780,7 +1780,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { var e *replication.BinlogEvent startTime := time.Now() - e, err = s.getEvent(s.runTCtx, currentLocation) + e, err = s.getEvent(s.runCtx, currentLocation) s.tctx.L().Debug("location refactor", zap.Stringer("current", currentLocation), zap.Stringer("start", startLocation), @@ -1847,7 +1847,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // try to re-sync in gtid mode if tryReSync && s.cfg.EnableGTID && utils.IsErrBinlogPurged(err) && s.cfg.AutoFixGTID { time.Sleep(retryTimeout) - err = s.reSyncBinlog(*s.runTCtx, lastLocation) + err = s.reSyncBinlog(*s.runCtx, lastLocation) if err != nil { return err } @@ -1968,9 +1968,9 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.errOperatorHolder.SetHasAllInjected(startLocation) // reset event as startLocation, avoid to be marked in checkpoint currentLocation.Position.Pos = startLocation.Position.Pos - err = s.streamerController.RedirectStreamer(s.runTCtx, startLocation) + err = s.streamerController.RedirectStreamer(s.runCtx, startLocation) } else { - err = s.streamerController.RedirectStreamer(s.runTCtx, currentLocation) + err = s.streamerController.RedirectStreamer(s.runCtx, currentLocation) } if err != nil { return err @@ -1990,17 +1990,17 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // 2. push forward and replicate some sqls after safeModeExitPoint to downstream // 3. quit because of network error, fail to flush global checkpoint and new safeModeExitPoint to downstream // 4. 
restart again, quit safe mode at safeModeExitPoint, but some sqls after this location have already been replicated to the downstream - if err = s.checkpoint.FlushSafeModeExitPoint(s.runTCtx); err != nil { + if err = s.checkpoint.FlushSafeModeExitPoint(s.runCtx); err != nil { return err } - if err = s.safeMode.Add(s.runTCtx, -1); err != nil { + if err = s.safeMode.Add(s.runCtx, -1); err != nil { return err } } } ec := eventContext{ - tctx: s.runTCtx, + tctx: s.runCtx, header: e.Header, startLocation: &startLocation, currentLocation: ¤tLocation, From a73aa0be1dccbcde85300a3c4a6e3a6a71e216a9 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 25 Jan 2022 11:37:33 +0800 Subject: [PATCH 30/47] rename ctx --- dm/syncer/dml_worker.go | 6 +++--- dm/syncer/syncer.go | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index 7dfcccc2254..877a4c64878 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -36,7 +36,7 @@ type DMLWorker struct { chanSize int multipleRows bool toDBConns []*dbconn.DBConn - runTCtx *tcontext.Context + syncCtx *tcontext.Context logger log.Logger // for metrics @@ -75,7 +75,7 @@ func dmlWorkerWrap(inCh chan *job, syncer *Syncer) chan *job { fatalFunc: syncer.fatalFunc, lagFunc: syncer.updateReplicationJobTS, addCountFunc: syncer.addCount, - runTCtx: syncer.runSyncTCtx, + syncCtx: syncer.syncTCtx, // this ctx can be used to cancel all the workers toDBConns: syncer.toDBConns, inCh: inCh, flushCh: make(chan *job), @@ -236,7 +236,7 @@ func (w *DMLWorker) executeBatchJobs(queueID int, jobs []*job) { time.Sleep(time.Duration(t) * time.Second) }) // use background context to execute sqls as much as possible - ctx, cancel := w.runTCtx.WithTimeout(maxDMLExecutionDuration) + ctx, cancel := w.syncCtx.WithTimeout(maxDMLExecutionDuration) defer cancel() affect, err = db.ExecuteSQL(ctx, queries, args...) 
failpoint.Inject("SafeModeExit", func(val failpoint.Value) { diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 26eed10567a..a8fb03ea005 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -130,8 +130,8 @@ type Syncer struct { runCtx *tcontext.Context runCancel context.CancelFunc // this ctx only used for syncDML and syncDDL and only cancelled when ungraceful stop - runSyncTCtx *tcontext.Context - runSyncCancel context.CancelFunc + syncTCtx *tcontext.Context + syncCancel context.CancelFunc // control all goroutines that started in S.Run runWg sync.WaitGroup @@ -1286,9 +1286,9 @@ func (s *Syncer) syncDDL(queueBucket string, db *dbconn.DBConn, ddlJobChan chan if !ignore { var affected int - affected, err = db.ExecuteSQLWithIgnore(s.runSyncTCtx, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) + affected, err = db.ExecuteSQLWithIgnore(s.syncTCtx, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) if err != nil { - err = s.handleSpecialDDLError(s.runSyncTCtx, err, ddlJob.ddls, affected, db) + err = s.handleSpecialDDLError(s.syncTCtx, err, ddlJob.ddls, affected, db) err = terror.WithScope(err, terror.ScopeDownstream) } } @@ -1488,7 +1488,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { runCtx, runCancel := context.WithCancel(context.Background()) s.runCtx, s.runCancel = tcontext.NewContext(runCtx, s.tctx.L()), runCancel syncCtx, syncCancel := context.WithCancel(context.Background()) - s.runSyncTCtx, s.runSyncCancel = tcontext.NewContext(syncCtx, s.tctx.L()), syncCancel + s.syncTCtx, s.syncCancel = tcontext.NewContext(syncCtx, s.tctx.L()), syncCancel s.checkpointFlushWorker = &checkpointFlushWorker{ input: make(chan *checkpointFlushTask, 16), cp: s.checkpoint, @@ -3383,7 +3383,7 @@ func (s *Syncer) Close() { func (s *Syncer) Kill() { s.tctx.L().Warn("kill syncer without graceful") s.runCancel() - s.runSyncCancel() + s.syncCancel() s.Close() } From 705c228dd22049a3499c4171ff5ddbbf01f12b10 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Wed, 26 Jan 2022 14:33:00 +0800 Subject: [PATCH 31/47] address comments --- dm/syncer/dml_worker.go | 2 +- dm/syncer/syncer.go | 21 ++++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index 877a4c64878..4bc942b3db6 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -75,7 +75,7 @@ func dmlWorkerWrap(inCh chan *job, syncer *Syncer) chan *job { fatalFunc: syncer.fatalFunc, lagFunc: syncer.updateReplicationJobTS, addCountFunc: syncer.addCount, - syncCtx: syncer.syncTCtx, // this ctx can be used to cancel all the workers + syncCtx: syncer.syncCtx, // this ctx can be used to cancel all the workers toDBConns: syncer.toDBConns, inCh: inCh, flushCh: make(chan *job), diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index a8fb03ea005..22bef2ab4b2 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -126,11 +126,11 @@ type Syncer struct { tctx *tcontext.Context // this ctx only used for logger. // this ctx derives from a background ctx and was initialized in s.Run, it is used for some background tasks in s.Run - // when this ctx cancelled, syncer will shutdown all background running jobs and not wait transaction end + // when this ctx cancelled, syncer will shutdown all background running jobs (except the syncDML and syncDDL) and not wait transaction end. 
runCtx *tcontext.Context runCancel context.CancelFunc - // this ctx only used for syncDML and syncDDL and only cancelled when ungraceful stop - syncTCtx *tcontext.Context + // this ctx only used for syncDML and syncDDL and only cancelled when ungraceful stop. + syncCtx *tcontext.Context syncCancel context.CancelFunc // control all goroutines that started in S.Run runWg sync.WaitGroup @@ -1286,9 +1286,9 @@ func (s *Syncer) syncDDL(queueBucket string, db *dbconn.DBConn, ddlJobChan chan if !ignore { var affected int - affected, err = db.ExecuteSQLWithIgnore(s.syncTCtx, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) + affected, err = db.ExecuteSQLWithIgnore(s.syncCtx, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) if err != nil { - err = s.handleSpecialDDLError(s.syncTCtx, err, ddlJob.ddls, affected, db) + err = s.handleSpecialDDLError(s.syncCtx, err, ddlJob.ddls, affected, db) err = terror.WithScope(err, terror.ScopeDownstream) } } @@ -1488,7 +1488,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { runCtx, runCancel := context.WithCancel(context.Background()) s.runCtx, s.runCancel = tcontext.NewContext(runCtx, s.tctx.L()), runCancel syncCtx, syncCancel := context.WithCancel(context.Background()) - s.syncTCtx, s.syncCancel = tcontext.NewContext(syncCtx, s.tctx.L()), syncCancel + s.syncCtx, s.syncCancel = tcontext.NewContext(syncCtx, s.tctx.L()), syncCancel s.checkpointFlushWorker = &checkpointFlushWorker{ input: make(chan *checkpointFlushTask, 16), cp: s.checkpoint, @@ -1549,7 +1549,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } s.runWg.Add(1) - // start syncDML worker first because if there is no dml worker checkpointFlushWorker will blocked forever go s.syncDML() s.runWg.Add(1) // start flush checkpoints worker. this worker must start before s.flushCheckPoints() @@ -3363,7 +3362,7 @@ func (s *Syncer) Close() { if s.isClosed() { return } - s.stopStreamAndRollbackCheckpoint() + s.stopSync() s.closeDBs() s.checkpoint.Close() if err := s.schemaTracker.Close(); err != nil { @@ -3387,8 +3386,8 @@ func (s *Syncer) Kill() { s.Close() } -// stopStreamAndRollbackCheckpoint stop stream and rollback checkpoint now it used by Close() and Pause(). -func (s *Syncer) stopStreamAndRollbackCheckpoint() { +// stopSync stops stream and rollbacks checkpoint now it used by Close() and Pause(). +func (s *Syncer) stopSync() { // before re-write workflow for s.syncer, simply close it // when resuming, re-create s.syncer @@ -3418,7 +3417,7 @@ func (s *Syncer) Pause() { s.tctx.L().Warn("try to pause, but already closed") return } - s.stopStreamAndRollbackCheckpoint() + s.stopSync() } // Resume resumes the paused process. 
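Note on the two-context pattern settled by the patches above: runCtx guards the background goroutines started in Run and is cancelled on any stop, while syncCtx guards syncDML/syncDDL and is cancelled only on an ungraceful stop, so a graceful stop can let in-flight statements finish. A minimal, self-contained sketch of that pattern follows; every name in it is illustrative and none of it is code from this series.

package main

import (
	"context"
	"fmt"
	"time"
)

// twoCtxWorker mimics the split: bgCtx covers background helpers and is always
// cancelled on stop; execCtx covers statement execution and is cancelled only
// when the stop is ungraceful. Illustrative sketch only.
type twoCtxWorker struct {
	bgCtx      context.Context
	bgCancel   context.CancelFunc
	execCtx    context.Context
	execCancel context.CancelFunc
}

func newTwoCtxWorker() *twoCtxWorker {
	bgCtx, bgCancel := context.WithCancel(context.Background())
	execCtx, execCancel := context.WithCancel(context.Background())
	return &twoCtxWorker{bgCtx: bgCtx, bgCancel: bgCancel, execCtx: execCtx, execCancel: execCancel}
}

// stop always cancels the background context, and cancels the execution
// context only for an ungraceful stop.
func (w *twoCtxWorker) stop(graceful bool) {
	w.bgCancel()
	if !graceful {
		w.execCancel()
	}
}

func main() {
	w := newTwoCtxWorker()
	done := make(chan struct{})
	go func() {
		defer close(done)
		select {
		case <-w.execCtx.Done():
			fmt.Println("statement aborted: ungraceful stop")
		case <-time.After(50 * time.Millisecond):
			fmt.Println("statement finished: graceful stop")
		}
	}()
	w.stop(true) // graceful: execCtx stays alive, the in-flight statement can finish
	<-done
}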
From e881437b628ee2de07d95be1e6bd6d327b224d75 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 28 Jan 2022 10:04:10 +0800 Subject: [PATCH 32/47] fix shell && revert some logic of stop worker --- dm/dm/worker/server.go | 3 ++- .../checkpoint_transaction/conf/dm-worker1.toml | 1 - dm/tests/checkpoint_transaction/run.sh | 16 ++++------------ dm/tests/new_relay/run.sh | 10 +++++----- 4 files changed, 11 insertions(+), 19 deletions(-) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index 2b3a0e77c3e..d3e2c9e9b38 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -447,11 +447,12 @@ func (s *Server) doClose() { if s.closed.Load() { return } + // stop server in advance, stop receiving source bound and relay bound + s.cancel() // stop worker and wait for return(we already lock the whole Sever, so no need use lock to get source worker) if w := s.getSourceWorker(false); w != nil { w.Stop(true) } - s.cancel() s.wg.Wait() s.closed.Store(true) } diff --git a/dm/tests/checkpoint_transaction/conf/dm-worker1.toml b/dm/tests/checkpoint_transaction/conf/dm-worker1.toml index 9b8ecaf991c..3d99321d632 100644 --- a/dm/tests/checkpoint_transaction/conf/dm-worker1.toml +++ b/dm/tests/checkpoint_transaction/conf/dm-worker1.toml @@ -1,5 +1,4 @@ join = "127.0.0.1:8261" keepalive-ttl = 1 -log-level = "info" name = "worker1" worker-addr = "127.0.0.1:8262" diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 61eaee06efd..7a47c4165f7 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -7,8 +7,6 @@ source $cur/../_utils/test_prepare WORK_DIR=$TEST_DIR/$TEST_NAME function check_worker_ungraceful_stop_with_retry() { - all_matched=false - for ((k = 0; k < 10; k++)); do sleep 1 echo "start check_worker_ungraceful_stop_with_retry times: $k" @@ -25,18 +23,12 @@ function check_worker_ungraceful_stop_with_retry() { if [ $num -lt 1 ]; then continue fi - all_matched=true - if $all_matched; then - echo "check_worker_ungraceful_stop_with_retry success after retry: $k" - break - fi + echo "check_worker_ungraceful_stop_with_retry success after retry: $k" + return 0 done - if ! 
$all_matched; then - echo "check_worker_ungraceful_stop_with_retry failed after retry" - exit 1 - fi - + echo "check_worker_ungraceful_stop_with_retry failed after retry" + exit 1 } function run() { diff --git a/dm/tests/new_relay/run.sh b/dm/tests/new_relay/run.sh index 095136b8d27..512819f3504 100755 --- a/dm/tests/new_relay/run.sh +++ b/dm/tests/new_relay/run.sh @@ -90,7 +90,7 @@ function test_restart_relay_status() { "relay" 1 \ "bound" 2 - echo "》》》》》》》》》》》》》》》》》test test_restart_relay_status passed" + echo ">>>>>>>>>>>>>>>>>>>>>>>>>>test test_restart_relay_status passed" } function test_cant_dail_upstream() { @@ -124,7 +124,7 @@ function test_cant_dail_upstream() { "query-status -s $SOURCE_ID1" \ "injected error" 1 - echo "》》》》》》》》》》》》》》》》》test test_cant_dail_upstream passed" + echo ">>>>>>>>>>>>>>>>>>>>>>>>>>test test_cant_dail_upstream passed" } function test_cant_dail_downstream() { @@ -164,7 +164,7 @@ function test_cant_dail_downstream() { run_tidb_server 4000 $TIDB_PASSWORD sleep 2 - echo "》》》》》》》》》》》》》》》》》test test_cant_dail_downstream passed" + echo ">>>>>>>>>>>>>>>>>>>>>>>>>>test test_cant_dail_downstream passed" } function test_kill_dump_connection() { @@ -201,7 +201,7 @@ function test_kill_dump_connection() { run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "query-status -s $SOURCE_ID1" \ "\"relayCatchUpMaster\": true" 1 - echo "》》》》》》》》》》》》》》》》》test test_kill_dump_connection passed" + echo ">>>>>>>>>>>>>>>>>>>>>>>>>>test test_kill_dump_connection passed" } function test_relay_operations() { @@ -357,7 +357,7 @@ function test_relay_operations() { "\"result\": true" 2 check_sync_diff $WORK_DIR $cur/conf/diff_config.toml - echo "》》》》》》》》》》》》》》》》》test test_relay_operations passed" + echo ">>>>>>>>>>>>>>>>>>>>>>>>>>test test_relay_operations passed" } function run() { From a06c5e82cf6d7fc3e19e4daac5bd09c6685b66d5 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 28 Jan 2022 13:36:08 +0800 Subject: [PATCH 33/47] add ui for waitBeforeRunExit --- dm/syncer/syncer_test.go | 113 ++++++++++++++++++++++++++++----------- 1 file changed, 83 insertions(+), 30 deletions(-) diff --git a/dm/syncer/syncer_test.go b/dm/syncer/syncer_test.go index fdafa31b09a..0b4f36e3f53 100644 --- a/dm/syncer/syncer_test.go +++ b/dm/syncer/syncer_test.go @@ -41,6 +41,7 @@ import ( "github.com/pingcap/tiflow/dm/pkg/utils" "github.com/pingcap/tiflow/dm/syncer/dbconn" "github.com/pingcap/tiflow/pkg/errorutil" + "github.com/stretchr/testify/require" sqlmock "github.com/DATA-DOG/go-sqlmock" "github.com/go-mysql-org/go-mysql/mysql" @@ -102,12 +103,16 @@ type testSyncerSuite struct { } type MockStreamer struct { - events []*replication.BinlogEvent - idx uint32 + events []*replication.BinlogEvent + idx uint32 + pending bool } func (m *MockStreamer) GetEvent(ctx context.Context) (*replication.BinlogEvent, error) { if int(m.idx) >= len(m.events) { + if m.pending { + <-ctx.Done() + } return nil, context.Canceled } e := m.events[m.idx] @@ -121,7 +126,7 @@ type MockStreamProducer struct { func (mp *MockStreamProducer) generateStreamer(location binlog.Location) (reader.Streamer, error) { if location.Position.Pos == 4 { - return &MockStreamer{mp.events, 0}, nil + return &MockStreamer{mp.events, 0, false}, nil } bytesLen := 0 idx := uint32(0) @@ -132,32 +137,11 @@ func (mp *MockStreamProducer) generateStreamer(location binlog.Location) (reader break } } - return &MockStreamer{mp.events, idx}, nil + return &MockStreamer{mp.events, idx, false}, nil } func (s *testSyncerSuite) SetUpSuite(c *C) { - 
loaderDir, err := os.MkdirTemp("", "loader") - c.Assert(err, IsNil) - loaderCfg := config.LoaderConfig{ - Dir: loaderDir, - } - s.cfg = &config.SubTaskConfig{ - From: config.GetDBConfigForTest(), - To: config.GetDBConfigForTest(), - ServerID: 101, - MetaSchema: "test", - Name: "syncer_ut", - ShadowTableRules: []string{config.DefaultShadowTableRules}, - TrashTableRules: []string{config.DefaultTrashTableRules}, - Mode: config.ModeIncrement, - Flavor: "mysql", - LoaderConfig: loaderCfg, - } - s.cfg.Experimental.AsyncCheckpointFlush = true - s.cfg.From.Adjust() - s.cfg.To.Adjust() - - s.cfg.UseRelay = false + s.cfg = genDefaultSubTaskConfig4Test() s.resetEventsGenerator(c) c.Assert(log.InitLogger(&log.Config{}), IsNil) } @@ -236,7 +220,7 @@ func (s *testSyncerSuite) TearDownSuite(c *C) { os.RemoveAll(s.cfg.Dir) } -func (s *testSyncerSuite) mockGetServerUnixTS(mock sqlmock.Sqlmock) { +func mockGetServerUnixTS(mock sqlmock.Sqlmock) { ts := time.Now().Unix() rows := sqlmock.NewRows([]string{"UNIX_TIMESTAMP()"}).AddRow(strconv.FormatInt(ts, 10)) mock.ExpectQuery("SELECT UNIX_TIMESTAMP()").WillReturnRows(rows) @@ -748,7 +732,7 @@ func (s *testSyncerSuite) TestRun(c *C) { db, mock, err := sqlmock.New() c.Assert(err, IsNil) - s.mockGetServerUnixTS(mock) + mockGetServerUnixTS(mock) dbConn, err := db.Conn(context.Background()) c.Assert(err, IsNil) checkPointDB, checkPointMock, err := sqlmock.New() @@ -802,7 +786,7 @@ func (s *testSyncerSuite) TestRun(c *C) { mock.ExpectQuery("SHOW CREATE TABLE " + "`test_1`.`t_1`").WillReturnRows( sqlmock.NewRows([]string{"Table", "Create Table"}). AddRow("t_1", "create table t_1(id int primary key, name varchar(24), KEY `index1` (`name`))")) - s.mockGetServerUnixTS(mock) + mockGetServerUnixTS(mock) mock.ExpectQuery("SHOW CREATE TABLE " + "`test_1`.`t_2`").WillReturnRows( sqlmock.NewRows([]string{"Table", "Create Table"}). 
AddRow("t_2", "create table t_2(id int primary key, name varchar(24))")) @@ -1040,7 +1024,7 @@ func (s *testSyncerSuite) TestRun(c *C) { func (s *testSyncerSuite) TestExitSafeModeByConfig(c *C) { db, mock, err := sqlmock.New() c.Assert(err, IsNil) - s.mockGetServerUnixTS(mock) + mockGetServerUnixTS(mock) dbConn, err := db.Conn(context.Background()) c.Assert(err, IsNil) @@ -1703,3 +1687,72 @@ func (s *testSyncerSuite) TestExecuteSQLSWithIgnore(c *C) { c.Assert(mock.ExpectationsWereMet(), IsNil) } + +func genDefaultSubTaskConfig4Test() *config.SubTaskConfig { + loaderDir, err := os.MkdirTemp("", "loader") + if err != nil { + panic(err) // no happen + } + + loaderCfg := config.LoaderConfig{ + Dir: loaderDir, + } + cfg := &config.SubTaskConfig{ + From: config.GetDBConfigForTest(), + To: config.GetDBConfigForTest(), + ServerID: 101, + MetaSchema: "test", + Name: "syncer_ut", + ShadowTableRules: []string{config.DefaultShadowTableRules}, + TrashTableRules: []string{config.DefaultTrashTableRules}, + Mode: config.ModeIncrement, + Flavor: "mysql", + LoaderConfig: loaderCfg, + UseRelay: false, + } + cfg.Experimental.AsyncCheckpointFlush = true + cfg.From.Adjust() + cfg.To.Adjust() + return cfg +} + +func TestWaitBeforeRunExit(t *testing.T) { + ctx := context.Background() + cfg := genDefaultSubTaskConfig4Test() + cfg.WorkerCount = 0 + syncer := NewSyncer(cfg, nil, nil) + + db, mock, err := sqlmock.New() + require.NoError(t, err) + mockGetServerUnixTS(mock) + + syncer.fromDB = &dbconn.UpStreamConn{BaseDB: conn.NewBaseDB(db)} + syncer.reset() + require.NoError(t, syncer.genRouter()) + + mockStreamerProducer := &MockStreamProducer{} + mockStreamer, err := mockStreamerProducer.generateStreamer(binlog.NewLocation("")) + require.NoError(t, err) + // let getEvent pending until ctx.Done() + mockStreamer.(*MockStreamer).pending = true + syncer.streamerController = &StreamerController{ + streamerProducer: mockStreamerProducer, streamer: mockStreamer, closed: false, + } + + wg := &sync.WaitGroup{} + errCh := make(chan error, 1) + wg.Add(1) + go func() { + defer wg.Done() + errCh <- syncer.Run(ctx) + }() + // wait s.Run start + time.Sleep(time.Second) + // s.Run will not exit unit we call cancel + require.Equal(t, 0, len(errCh)) + require.NotNil(t, syncer.runCtx) + require.NotNil(t, syncer.runCancel) + syncer.runCancel() //this will make s.Run exit + wg.Wait() + require.Nil(t, <-errCh) +} From 1e51bec3be4a8460970648afd7c163ee516a77a4 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 8 Feb 2022 13:23:38 +0800 Subject: [PATCH 34/47] address comment --- dm/dm/worker/server.go | 4 +++- dm/syncer/syncer.go | 2 +- dm/syncer/syncer_test.go | 15 ++++++++++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index d3e2c9e9b38..9e25e203d32 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -461,7 +461,9 @@ func (s *Server) doClose() { func (s *Server) Close() { s.doClose() // we should stop current sync first, otherwise master may schedule task on new worker while we are closing s.stopKeepAlive() - s.etcdClient.Close() + if s.etcdClient != nil { + s.etcdClient.Close() + } } // if needLock is false, we should make sure Server has been locked in caller. diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 22bef2ab4b2..acf1c25d2fa 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -1554,7 +1554,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // start flush checkpoints worker. 
this worker must start before s.flushCheckPoints() go func() { defer s.runWg.Done() - // also need to use a ctx different s.runCtx, checkpointFlushWorker worker will be closed in the first defer + // also need to use a ctx different from s.runCtx, checkpointFlushWorker worker will be closed in the first defer s.checkpointFlushWorker.Run(s.tctx) }() s.runWg.Add(1) diff --git a/dm/syncer/syncer_test.go b/dm/syncer/syncer_test.go index 0b4f36e3f53..d70e1ea4f14 100644 --- a/dm/syncer/syncer_test.go +++ b/dm/syncer/syncer_test.go @@ -1748,11 +1748,24 @@ func TestWaitBeforeRunExit(t *testing.T) { }() // wait s.Run start time.Sleep(time.Second) - // s.Run will not exit unit we call cancel + + // test s.Run will not exit unit caller cancel ctx or call s.runCancel require.Equal(t, 0, len(errCh)) require.NotNil(t, syncer.runCtx) require.NotNil(t, syncer.runCancel) syncer.runCancel() //this will make s.Run exit wg.Wait() require.Nil(t, <-errCh) + + // test syncer wait time not more than maxPauseOrStopWaitTime + oldMaxPauseOrStopWaitTime := maxPauseOrStopWaitTime + maxPauseOrStopWaitTime = time.Second + ctx2, cancel := context.WithCancel(context.Background()) + cancel() + runCtx, runCancel := context.WithCancel(context.Background()) + syncer.runCtx, syncer.runCancel = tcontext.NewContext(runCtx, syncer.tctx.L()), runCancel + syncer.runWg.Add(1) + syncer.waitBeforeRunExit(ctx2) + require.Equal(t, context.Canceled, syncer.runCtx.Ctx.Err()) + maxPauseOrStopWaitTime = oldMaxPauseOrStopWaitTime } From 63a235e8ef60eeb3e9bf8196afcd5abefd2ff44f Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 8 Feb 2022 13:29:54 +0800 Subject: [PATCH 35/47] fix lint --- dm/syncer/syncer.go | 2 +- dm/syncer/syncer_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index acf1c25d2fa..56f796ac7ae 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -1554,7 +1554,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // start flush checkpoints worker. this worker must start before s.flushCheckPoints() go func() { defer s.runWg.Done() - // also need to use a ctx different from s.runCtx, checkpointFlushWorker worker will be closed in the first defer + // also need to use a different ctx. checkpointFlushWorker worker will be closed in the first defer s.checkpointFlushWorker.Run(s.tctx) }() s.runWg.Add(1) diff --git a/dm/syncer/syncer_test.go b/dm/syncer/syncer_test.go index d70e1ea4f14..c0d4343eb54 100644 --- a/dm/syncer/syncer_test.go +++ b/dm/syncer/syncer_test.go @@ -1753,7 +1753,7 @@ func TestWaitBeforeRunExit(t *testing.T) { require.Equal(t, 0, len(errCh)) require.NotNil(t, syncer.runCtx) require.NotNil(t, syncer.runCancel) - syncer.runCancel() //this will make s.Run exit + syncer.runCancel() // this will make s.Run exit wg.Wait() require.Nil(t, <-errCh) From fb5f21fd8072154df2fa15d2302cab7af7e44287 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 8 Feb 2022 13:42:24 +0800 Subject: [PATCH 36/47] ungraceful exit the dml pipeline --- dm/syncer/causality.go | 52 +++++++++++++++++++------------------ dm/syncer/dml_worker.go | 57 ++++++++++++++++++++++------------------- 2 files changed, 58 insertions(+), 51 deletions(-) diff --git a/dm/syncer/causality.go b/dm/syncer/causality.go index ab5eb8e8699..016f455f250 100644 --- a/dm/syncer/causality.go +++ b/dm/syncer/causality.go @@ -66,33 +66,35 @@ func causalityWrap(inCh chan *job, syncer *Syncer) chan *job { // run receives dml jobs and send causality jobs by adding causality key. 
// When meet conflict, sends a conflict job. func (c *causality) run() { - for j := range c.inCh { - metrics.QueueSizeGauge.WithLabelValues(c.task, "causality_input", c.source).Set(float64(len(c.inCh))) - - startTime := time.Now() - - switch j.tp { - case flush, asyncFlush: - c.relation.rotate(j.flushSeq) - case gc: - // gc is only used on inner-causality logic - c.relation.gc(j.flushSeq) - continue - default: - keys := j.dml.identifyKeys(c.sessCtx) - - // detectConflict before add - if c.detectConflict(keys) { - c.logger.Debug("meet causality key, will generate a conflict job to flush all sqls", zap.Strings("keys", keys)) - c.outCh <- newConflictJob(c.workerCount) - c.relation.clear() + for { + select { + case j, ok := <-c.inCh: + if !ok { + return } - j.dml.key = c.add(keys) - c.logger.Debug("key for keys", zap.String("key", j.dml.key), zap.Strings("keys", keys)) + metrics.QueueSizeGauge.WithLabelValues(c.task, "causality_input", c.source).Set(float64(len(c.inCh))) + startTime := time.Now() + switch j.tp { + case flush, asyncFlush: + c.relation.rotate(j.flushSeq) + case gc: + // gc is only used on inner-causality logic + c.relation.gc(j.flushSeq) + continue + default: + keys := j.dml.identifyKeys(c.sessCtx) + // detectConflict before add + if c.detectConflict(keys) { + c.logger.Debug("meet causality key, will generate a conflict job to flush all sqls", zap.Strings("keys", keys)) + c.outCh <- newConflictJob(c.workerCount) + c.relation.clear() + } + j.dml.key = c.add(keys) + c.logger.Debug("key for keys", zap.String("key", j.dml.key), zap.Strings("keys", keys)) + } + metrics.ConflictDetectDurationHistogram.WithLabelValues(c.task, c.source).Observe(time.Since(startTime).Seconds()) + c.outCh <- j } - metrics.ConflictDetectDurationHistogram.WithLabelValues(c.task, c.source).Observe(time.Since(startTime).Seconds()) - - c.outCh <- j } } diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index 4bc942b3db6..87ed4174596 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -112,32 +112,37 @@ func (w *DMLWorker) run() { for i := 0; i < w.workerCount; i++ { queueBucketMapping[i] = queueBucketName(i) } - - for j := range w.inCh { - metrics.QueueSizeGauge.WithLabelValues(w.task, "dml_worker_input", w.source).Set(float64(len(w.inCh))) - switch j.tp { - case flush: - w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) - w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) - j.flushWg.Wait() - w.addCountFunc(true, adminQueueName, j.tp, 1, j.targetTable) - w.flushCh <- j - case asyncFlush: - w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) - w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) - w.flushCh <- j - case conflict: - w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) - w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) - j.flushWg.Wait() - w.addCountFunc(true, adminQueueName, j.tp, 1, j.targetTable) - default: - queueBucket := int(utils.GenHashKey(j.dml.key)) % w.workerCount - w.addCountFunc(false, queueBucketMapping[queueBucket], j.tp, 1, j.targetTable) - startTime := time.Now() - w.logger.Debug("queue for key", zap.Int("queue", queueBucket), zap.String("key", j.dml.key)) - jobChs[queueBucket] <- j - metrics.AddJobDurationHistogram.WithLabelValues(j.tp.String(), w.task, queueBucketMapping[queueBucket], w.source).Observe(time.Since(startTime).Seconds()) + for { + select { + case j, ok := <-w.inCh: + if !ok { + return + } + metrics.QueueSizeGauge.WithLabelValues(w.task, "dml_worker_input", w.source).Set(float64(len(w.inCh))) + 
switch j.tp { + case flush: + w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) + w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) + j.flushWg.Wait() + w.addCountFunc(true, adminQueueName, j.tp, 1, j.targetTable) + w.flushCh <- j + case asyncFlush: + w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) + w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) + w.flushCh <- j + case conflict: + w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) + w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) + j.flushWg.Wait() + w.addCountFunc(true, adminQueueName, j.tp, 1, j.targetTable) + default: + queueBucket := int(utils.GenHashKey(j.dml.key)) % w.workerCount + w.addCountFunc(false, queueBucketMapping[queueBucket], j.tp, 1, j.targetTable) + startTime := time.Now() + w.logger.Debug("queue for key", zap.Int("queue", queueBucket), zap.String("key", j.dml.key)) + jobChs[queueBucket] <- j + metrics.AddJobDurationHistogram.WithLabelValues(j.tp.String(), w.task, queueBucketMapping[queueBucket], w.source).Observe(time.Since(startTime).Seconds()) + } } } } From b851b623e02165a84aaaa0696bf5ba0971c3ef55 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 8 Feb 2022 14:57:31 +0800 Subject: [PATCH 37/47] fix merge master --- dm/syncer/syncer.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 040ce4b9c65..4f484204156 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -1539,7 +1539,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { cleanDumpFile = false } if s.cfg.ShardMode == config.ShardOptimistic { - s.flushOptimisticTableInfos(tctx) + s.flushOptimisticTableInfos(s.runCtx) } } @@ -1967,7 +1967,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.locations.reset(currentLocation) if !s.errOperatorHolder.IsInject(startLocation) { // replace operator need redirect to currentLocation - if err = s.streamerController.RedirectStreamer(tctx, currentLocation); err != nil { + if err = s.streamerController.RedirectStreamer(s.runCtx, currentLocation); err != nil { return err } } From 6e9d1ad02cfe4318a0ee1b20bfb2fb596474639c Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 8 Feb 2022 15:25:55 +0800 Subject: [PATCH 38/47] fix datarace --- dm/syncer/syncer_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dm/syncer/syncer_test.go b/dm/syncer/syncer_test.go index c0d4343eb54..3026949c09d 100644 --- a/dm/syncer/syncer_test.go +++ b/dm/syncer/syncer_test.go @@ -1751,8 +1751,10 @@ func TestWaitBeforeRunExit(t *testing.T) { // test s.Run will not exit unit caller cancel ctx or call s.runCancel require.Equal(t, 0, len(errCh)) + syncer.Lock() require.NotNil(t, syncer.runCtx) require.NotNil(t, syncer.runCancel) + syncer.Unlock() syncer.runCancel() // this will make s.Run exit wg.Wait() require.Nil(t, <-errCh) From 7aca258d8646f9edded417f0418942b8090de4aa Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 8 Feb 2022 16:27:12 +0800 Subject: [PATCH 39/47] add syncer ctx to sync pipeline --- dm/syncer/causality.go | 5 +++++ dm/syncer/compactor.go | 5 +++++ dm/syncer/dml_worker.go | 2 ++ 3 files changed, 12 insertions(+) diff --git a/dm/syncer/causality.go b/dm/syncer/causality.go index 016f455f250..159b7fa7a8d 100644 --- a/dm/syncer/causality.go +++ b/dm/syncer/causality.go @@ -20,6 +20,7 @@ import ( "go.uber.org/zap" "github.com/pingcap/tidb/sessionctx" + tcontext "github.com/pingcap/tiflow/dm/pkg/context" "github.com/pingcap/tiflow/dm/pkg/log" 
"github.com/pingcap/tiflow/dm/syncer/metrics" ) @@ -34,6 +35,7 @@ type causality struct { outCh chan *job inCh chan *job logger log.Logger + syncCtx *tcontext.Context sessCtx sessionctx.Context workerCount int @@ -51,6 +53,7 @@ func causalityWrap(inCh chan *job, syncer *Syncer) chan *job { logger: syncer.tctx.Logger.WithFields(zap.String("component", "causality")), inCh: inCh, outCh: make(chan *job, syncer.cfg.QueueSize), + syncCtx: syncer.syncCtx, // this ctx can be used to cancel all the workers sessCtx: syncer.sessCtx, workerCount: syncer.cfg.WorkerCount, } @@ -68,6 +71,8 @@ func causalityWrap(inCh chan *job, syncer *Syncer) chan *job { func (c *causality) run() { for { select { + case <-c.syncCtx.Ctx.Done(): + break case j, ok := <-c.inCh: if !ok { return diff --git a/dm/syncer/compactor.go b/dm/syncer/compactor.go index 62c907f037d..28ea593c70d 100644 --- a/dm/syncer/compactor.go +++ b/dm/syncer/compactor.go @@ -22,6 +22,7 @@ import ( "github.com/pingcap/tidb-tools/pkg/filter" "go.uber.org/zap" + tcontext "github.com/pingcap/tiflow/dm/pkg/context" "github.com/pingcap/tiflow/dm/pkg/log" "github.com/pingcap/tiflow/dm/syncer/metrics" ) @@ -33,6 +34,7 @@ type compactor struct { bufferSize int logger log.Logger safeMode bool + syncCtx *tcontext.Context keyMap map[string]map[string]int // table -> key(pk or (uk + not null)) -> index in buffer buffer []*job @@ -58,6 +60,7 @@ func compactorWrap(inCh chan *job, syncer *Syncer) chan *job { task: syncer.cfg.Name, source: syncer.cfg.SourceID, addCountFunc: syncer.addCount, + syncCtx: syncer.syncCtx, // this ctx can be used to cancel all the workers } go func() { compactor.run() @@ -70,6 +73,8 @@ func compactorWrap(inCh chan *job, syncer *Syncer) chan *job { func (c *compactor) run() { for { select { + case <-c.syncCtx.Ctx.Done(): + break case j, ok := <-c.inCh: if !ok { return diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index 1dbe7111fe8..a866e94f7bd 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -114,6 +114,8 @@ func (w *DMLWorker) run() { } for { select { + case <-w.syncCtx.Ctx.Done(): + break case j, ok := <-w.inCh: if !ok { return From 0f5e2207e52cde9fdcc279e6d1cdda94d4c9c503 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 8 Feb 2022 16:50:42 +0800 Subject: [PATCH 40/47] fix ut panic --- dm/syncer/causality_test.go | 2 ++ dm/syncer/compactor_test.go | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/dm/syncer/causality_test.go b/dm/syncer/causality_test.go index 1109b8c1b80..1ebfe16a3d3 100644 --- a/dm/syncer/causality_test.go +++ b/dm/syncer/causality_test.go @@ -95,6 +95,7 @@ func (s *testSyncerSuite) TestCasuality(c *C) { Name: "task", SourceID: "source", }, + syncCtx: tcontext.Background(), tctx: tcontext.Background().WithLogger(log.L()), sessCtx: utils.NewSessionCtx(map[string]string{"time_zone": "UTC"}), } @@ -166,6 +167,7 @@ func (s *testSyncerSuite) TestCasualityWithPrefixIndex(c *C) { Name: "task", SourceID: "source", }, + syncCtx: tcontext.Background(), tctx: tcontext.Background().WithLogger(log.L()), sessCtx: utils.NewSessionCtx(map[string]string{"time_zone": "UTC"}), } diff --git a/dm/syncer/compactor_test.go b/dm/syncer/compactor_test.go index 506f9581a7f..ccf1b3bcc3a 100644 --- a/dm/syncer/compactor_test.go +++ b/dm/syncer/compactor_test.go @@ -251,7 +251,8 @@ func (s *testSyncerSuite) TestCompactorSafeMode(c *C) { inCh := make(chan *job, 100) syncer := &Syncer{ - tctx: tcontext.NewContext(context.Background(), log.L()), + tctx: 
tcontext.NewContext(context.Background(), log.L()), + syncCtx: tcontext.Background(), cfg: &config.SubTaskConfig{ Name: "task", SourceID: "source", From cd7903c91064827949f61e1f604716d52c3ae908 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 10 Feb 2022 14:29:22 +0800 Subject: [PATCH 41/47] address comments --- dm/dm/worker/subtask.go | 4 +++- dm/syncer/causality.go | 2 +- dm/syncer/compactor.go | 2 +- dm/syncer/dml_worker.go | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index c6ed4f7f261..1a1ee47c206 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -394,8 +394,10 @@ func (st *SubTask) closeUnits() { func (st *SubTask) killCurrentUnit() { if st.CurrUnit() != nil { - st.l.Info("kill unit", zap.String("task", st.cfg.Name), zap.Stringer("unit", st.CurrUnit().Type())) + ut := st.CurrUnit().Type() + st.l.Info("kill unit", zap.String("task", st.cfg.Name), zap.Stringer("unit", ut)) st.CurrUnit().Kill() + st.l.Info("kill unit done", zap.String("task", st.cfg.Name), zap.Stringer("unit", ut)) } } diff --git a/dm/syncer/causality.go b/dm/syncer/causality.go index 159b7fa7a8d..5b4d53621fd 100644 --- a/dm/syncer/causality.go +++ b/dm/syncer/causality.go @@ -72,7 +72,7 @@ func (c *causality) run() { for { select { case <-c.syncCtx.Ctx.Done(): - break + return case j, ok := <-c.inCh: if !ok { return diff --git a/dm/syncer/compactor.go b/dm/syncer/compactor.go index 28ea593c70d..a3aa03576c1 100644 --- a/dm/syncer/compactor.go +++ b/dm/syncer/compactor.go @@ -74,7 +74,7 @@ func (c *compactor) run() { for { select { case <-c.syncCtx.Ctx.Done(): - break + return case j, ok := <-c.inCh: if !ok { return diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index a866e94f7bd..cbe9919d584 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -115,7 +115,7 @@ func (w *DMLWorker) run() { for { select { case <-w.syncCtx.Ctx.Done(): - break + return case j, ok := <-w.inCh: if !ok { return From 05ca26babf53aa0de37cd296a27cf31ffd966e25 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 11 Feb 2022 16:03:34 +0800 Subject: [PATCH 42/47] revert ungraceful close job pipeline --- dm/dm/worker/server.go | 2 +- dm/dm/worker/subtask.go | 2 +- dm/syncer/causality.go | 57 ++++++++++++++++-------------------- dm/syncer/causality_test.go | 2 -- dm/syncer/compactor.go | 5 ---- dm/syncer/compactor_test.go | 3 +- dm/syncer/dml_worker.go | 58 ++++++++++++++++--------------------- dm/syncer/syncer.go | 22 +++++++------- 8 files changed, 63 insertions(+), 88 deletions(-) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index 9e25e203d32..6cfcbd481bc 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -449,11 +449,11 @@ func (s *Server) doClose() { } // stop server in advance, stop receiving source bound and relay bound s.cancel() + s.wg.Wait() // stop worker and wait for return(we already lock the whole Sever, so no need use lock to get source worker) if w := s.getSourceWorker(false); w != nil { w.Stop(true) } - s.wg.Wait() s.closed.Store(true) } diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index 1a1ee47c206..7c35acd47f3 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -504,7 +504,7 @@ func (st *SubTask) Close() { updateTaskMetric(st.cfg.Name, st.cfg.SourceID, pb.Stage_Stopped, st.workerName) } -// kill kill running unit and stop the sub task. +// Kill kill running unit and stop the sub task. 
func (st *SubTask) Kill() { st.l.Info("killing") if !st.setStageIfNotIn([]pb.Stage{pb.Stage_Stopped, pb.Stage_Stopping, pb.Stage_Finished}, pb.Stage_Stopping) { diff --git a/dm/syncer/causality.go b/dm/syncer/causality.go index 5b4d53621fd..ab5eb8e8699 100644 --- a/dm/syncer/causality.go +++ b/dm/syncer/causality.go @@ -20,7 +20,6 @@ import ( "go.uber.org/zap" "github.com/pingcap/tidb/sessionctx" - tcontext "github.com/pingcap/tiflow/dm/pkg/context" "github.com/pingcap/tiflow/dm/pkg/log" "github.com/pingcap/tiflow/dm/syncer/metrics" ) @@ -35,7 +34,6 @@ type causality struct { outCh chan *job inCh chan *job logger log.Logger - syncCtx *tcontext.Context sessCtx sessionctx.Context workerCount int @@ -53,7 +51,6 @@ func causalityWrap(inCh chan *job, syncer *Syncer) chan *job { logger: syncer.tctx.Logger.WithFields(zap.String("component", "causality")), inCh: inCh, outCh: make(chan *job, syncer.cfg.QueueSize), - syncCtx: syncer.syncCtx, // this ctx can be used to cancel all the workers sessCtx: syncer.sessCtx, workerCount: syncer.cfg.WorkerCount, } @@ -69,37 +66,33 @@ func causalityWrap(inCh chan *job, syncer *Syncer) chan *job { // run receives dml jobs and send causality jobs by adding causality key. // When meet conflict, sends a conflict job. func (c *causality) run() { - for { - select { - case <-c.syncCtx.Ctx.Done(): - return - case j, ok := <-c.inCh: - if !ok { - return + for j := range c.inCh { + metrics.QueueSizeGauge.WithLabelValues(c.task, "causality_input", c.source).Set(float64(len(c.inCh))) + + startTime := time.Now() + + switch j.tp { + case flush, asyncFlush: + c.relation.rotate(j.flushSeq) + case gc: + // gc is only used on inner-causality logic + c.relation.gc(j.flushSeq) + continue + default: + keys := j.dml.identifyKeys(c.sessCtx) + + // detectConflict before add + if c.detectConflict(keys) { + c.logger.Debug("meet causality key, will generate a conflict job to flush all sqls", zap.Strings("keys", keys)) + c.outCh <- newConflictJob(c.workerCount) + c.relation.clear() } - metrics.QueueSizeGauge.WithLabelValues(c.task, "causality_input", c.source).Set(float64(len(c.inCh))) - startTime := time.Now() - switch j.tp { - case flush, asyncFlush: - c.relation.rotate(j.flushSeq) - case gc: - // gc is only used on inner-causality logic - c.relation.gc(j.flushSeq) - continue - default: - keys := j.dml.identifyKeys(c.sessCtx) - // detectConflict before add - if c.detectConflict(keys) { - c.logger.Debug("meet causality key, will generate a conflict job to flush all sqls", zap.Strings("keys", keys)) - c.outCh <- newConflictJob(c.workerCount) - c.relation.clear() - } - j.dml.key = c.add(keys) - c.logger.Debug("key for keys", zap.String("key", j.dml.key), zap.Strings("keys", keys)) - } - metrics.ConflictDetectDurationHistogram.WithLabelValues(c.task, c.source).Observe(time.Since(startTime).Seconds()) - c.outCh <- j + j.dml.key = c.add(keys) + c.logger.Debug("key for keys", zap.String("key", j.dml.key), zap.Strings("keys", keys)) } + metrics.ConflictDetectDurationHistogram.WithLabelValues(c.task, c.source).Observe(time.Since(startTime).Seconds()) + + c.outCh <- j } } diff --git a/dm/syncer/causality_test.go b/dm/syncer/causality_test.go index 1ebfe16a3d3..1109b8c1b80 100644 --- a/dm/syncer/causality_test.go +++ b/dm/syncer/causality_test.go @@ -95,7 +95,6 @@ func (s *testSyncerSuite) TestCasuality(c *C) { Name: "task", SourceID: "source", }, - syncCtx: tcontext.Background(), tctx: tcontext.Background().WithLogger(log.L()), sessCtx: 
utils.NewSessionCtx(map[string]string{"time_zone": "UTC"}), } @@ -167,7 +166,6 @@ func (s *testSyncerSuite) TestCasualityWithPrefixIndex(c *C) { Name: "task", SourceID: "source", }, - syncCtx: tcontext.Background(), tctx: tcontext.Background().WithLogger(log.L()), sessCtx: utils.NewSessionCtx(map[string]string{"time_zone": "UTC"}), } diff --git a/dm/syncer/compactor.go b/dm/syncer/compactor.go index a3aa03576c1..62c907f037d 100644 --- a/dm/syncer/compactor.go +++ b/dm/syncer/compactor.go @@ -22,7 +22,6 @@ import ( "github.com/pingcap/tidb-tools/pkg/filter" "go.uber.org/zap" - tcontext "github.com/pingcap/tiflow/dm/pkg/context" "github.com/pingcap/tiflow/dm/pkg/log" "github.com/pingcap/tiflow/dm/syncer/metrics" ) @@ -34,7 +33,6 @@ type compactor struct { bufferSize int logger log.Logger safeMode bool - syncCtx *tcontext.Context keyMap map[string]map[string]int // table -> key(pk or (uk + not null)) -> index in buffer buffer []*job @@ -60,7 +58,6 @@ func compactorWrap(inCh chan *job, syncer *Syncer) chan *job { task: syncer.cfg.Name, source: syncer.cfg.SourceID, addCountFunc: syncer.addCount, - syncCtx: syncer.syncCtx, // this ctx can be used to cancel all the workers } go func() { compactor.run() @@ -73,8 +70,6 @@ func compactorWrap(inCh chan *job, syncer *Syncer) chan *job { func (c *compactor) run() { for { select { - case <-c.syncCtx.Ctx.Done(): - return case j, ok := <-c.inCh: if !ok { return diff --git a/dm/syncer/compactor_test.go b/dm/syncer/compactor_test.go index ccf1b3bcc3a..506f9581a7f 100644 --- a/dm/syncer/compactor_test.go +++ b/dm/syncer/compactor_test.go @@ -251,8 +251,7 @@ func (s *testSyncerSuite) TestCompactorSafeMode(c *C) { inCh := make(chan *job, 100) syncer := &Syncer{ - tctx: tcontext.NewContext(context.Background(), log.L()), - syncCtx: tcontext.Background(), + tctx: tcontext.NewContext(context.Background(), log.L()), cfg: &config.SubTaskConfig{ Name: "task", SourceID: "source", diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index cbe9919d584..71802fbc09a 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -112,39 +112,31 @@ func (w *DMLWorker) run() { for i := 0; i < w.workerCount; i++ { queueBucketMapping[i] = queueBucketName(i) } - for { - select { - case <-w.syncCtx.Ctx.Done(): - return - case j, ok := <-w.inCh: - if !ok { - return - } - metrics.QueueSizeGauge.WithLabelValues(w.task, "dml_worker_input", w.source).Set(float64(len(w.inCh))) - switch j.tp { - case flush: - w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) - w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) - j.flushWg.Wait() - w.addCountFunc(true, adminQueueName, j.tp, 1, j.targetTable) - w.flushCh <- j - case asyncFlush: - w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) - w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) - w.flushCh <- j - case conflict: - w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) - w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) - j.flushWg.Wait() - w.addCountFunc(true, adminQueueName, j.tp, 1, j.targetTable) - default: - queueBucket := int(utils.GenHashKey(j.dml.key)) % w.workerCount - w.addCountFunc(false, queueBucketMapping[queueBucket], j.tp, 1, j.targetTable) - startTime := time.Now() - w.logger.Debug("queue for key", zap.Int("queue", queueBucket), zap.String("key", j.dml.key)) - jobChs[queueBucket] <- j - metrics.AddJobDurationHistogram.WithLabelValues(j.tp.String(), w.task, queueBucketMapping[queueBucket], w.source).Observe(time.Since(startTime).Seconds()) - } + for j 
:= range w.inCh { + metrics.QueueSizeGauge.WithLabelValues(w.task, "dml_worker_input", w.source).Set(float64(len(w.inCh))) + switch j.tp { + case flush: + w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) + w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) + j.flushWg.Wait() + w.addCountFunc(true, adminQueueName, j.tp, 1, j.targetTable) + w.flushCh <- j + case asyncFlush: + w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) + w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) + w.flushCh <- j + case conflict: + w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) + w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) + j.flushWg.Wait() + w.addCountFunc(true, adminQueueName, j.tp, 1, j.targetTable) + default: + queueBucket := int(utils.GenHashKey(j.dml.key)) % w.workerCount + w.addCountFunc(false, queueBucketMapping[queueBucket], j.tp, 1, j.targetTable) + startTime := time.Now() + w.logger.Debug("queue for key", zap.Int("queue", queueBucket), zap.String("key", j.dml.key)) + jobChs[queueBucket] <- j + metrics.AddJobDurationHistogram.WithLabelValues(j.tp.String(), w.task, queueBucketMapping[queueBucket], w.source).Observe(time.Since(startTime).Seconds()) } } } diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index e3e7c8e845b..df6dd9f2b8d 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -289,6 +289,13 @@ func (s *Syncer) newJobChans() { chanSize := calculateChanSize(s.cfg.QueueSize, s.cfg.WorkerCount, s.cfg.Compact) s.dmlJobCh = make(chan *job, chanSize) s.ddlJobCh = make(chan *job, s.cfg.QueueSize) + s.checkpointFlushWorker = &checkpointFlushWorker{ + input: make(chan *checkpointFlushTask, 16), + cp: s.checkpoint, + execError: &s.execError, + afterFlushFn: s.afterFlushCheckpoint, + addCountFunc: s.addCount, + } s.jobsClosed.Store(false) } @@ -300,6 +307,7 @@ func (s *Syncer) closeJobChans() { } close(s.dmlJobCh) close(s.ddlJobCh) + s.checkpointFlushWorker.Close() s.jobsClosed.Store(true) } @@ -441,7 +449,6 @@ func (s *Syncer) Init(ctx context.Context) (err error) { return err } rollbackHolder.Add(fr.FuncRollback{Name: "remove-active-realylog", Fn: s.removeActiveRelayLog}) - s.reset() return nil } @@ -1431,6 +1438,7 @@ func (s *Syncer) waitBeforeRunExit(ctx context.Context) { case <-s.runCtx.Ctx.Done(): s.tctx.L().Info("syncer run exit so runCtx done") case <-time.After(maxPauseOrStopWaitTime): + // TODO: maxPauseOrStopWaitTime should also count the time of waiting waitTransactionLock s.tctx.L().Info("wait transaction end timeout, exit now") s.runCancel() } @@ -1483,23 +1491,13 @@ func (s *Syncer) updateTSOffset(ctx context.Context) error { // Run starts running for sync, we should guarantee it can rerun when paused. 
func (s *Syncer) Run(ctx context.Context) (err error) { - s.Lock() runCtx, runCancel := context.WithCancel(context.Background()) s.runCtx, s.runCancel = tcontext.NewContext(runCtx, s.tctx.L()), runCancel syncCtx, syncCancel := context.WithCancel(context.Background()) s.syncCtx, s.syncCancel = tcontext.NewContext(syncCtx, s.tctx.L()), syncCancel - s.checkpointFlushWorker = &checkpointFlushWorker{ - input: make(chan *checkpointFlushTask, 16), - cp: s.checkpoint, - execError: &s.execError, - afterFlushFn: s.afterFlushCheckpoint, - addCountFunc: s.addCount, - } - s.Unlock() defer func() { s.runCancel() s.closeJobChans() - s.checkpointFlushWorker.Close() s.runWg.Wait() }() @@ -1555,7 +1553,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.runWg.Add(1) go s.syncDML() s.runWg.Add(1) - // start flush checkpoints worker. this worker must start before s.flushCheckPoints() go func() { defer s.runWg.Done() // also need to use a different ctx. checkpointFlushWorker worker will be closed in the first defer @@ -3377,6 +3374,7 @@ func (s *Syncer) Close() { // when closing syncer by `stop-task`, remove active relay log from hub s.removeActiveRelayLog() metrics.RemoveLabelValuesWithTaskInMetrics(s.cfg.Name) + s.runWg.Wait() s.closed.Store(true) } From 4fcad7d90b7c44c3a0aaf6c7f99b9fcf569db759 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 11 Feb 2022 17:21:52 +0800 Subject: [PATCH 43/47] fix datarace --- dm/syncer/syncer_test.go | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/dm/syncer/syncer_test.go b/dm/syncer/syncer_test.go index 3026949c09d..eca7152e770 100644 --- a/dm/syncer/syncer_test.go +++ b/dm/syncer/syncer_test.go @@ -1717,7 +1717,7 @@ func genDefaultSubTaskConfig4Test() *config.SubTaskConfig { } func TestWaitBeforeRunExit(t *testing.T) { - ctx := context.Background() + ctx, cancel := context.WithCancel(context.Background()) cfg := genDefaultSubTaskConfig4Test() cfg.WorkerCount = 0 syncer := NewSyncer(cfg, nil, nil) @@ -1746,18 +1746,15 @@ func TestWaitBeforeRunExit(t *testing.T) { defer wg.Done() errCh <- syncer.Run(ctx) }() - // wait s.Run start - time.Sleep(time.Second) + time.Sleep(time.Second) // wait s.Run start // test s.Run will not exit unit caller cancel ctx or call s.runCancel + cancel() // this will make s.Run exit + wg.Wait() + require.Nil(t, <-errCh) require.Equal(t, 0, len(errCh)) - syncer.Lock() require.NotNil(t, syncer.runCtx) require.NotNil(t, syncer.runCancel) - syncer.Unlock() - syncer.runCancel() // this will make s.Run exit - wg.Wait() - require.Nil(t, <-errCh) // test syncer wait time not more than maxPauseOrStopWaitTime oldMaxPauseOrStopWaitTime := maxPauseOrStopWaitTime From d6484a68354578ec62968c13a63aa385efe8eae4 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Mon, 14 Feb 2022 15:24:05 +0800 Subject: [PATCH 44/47] move init checkpoint worker to init && revert a kill --- dm/syncer/syncer.go | 16 ++++++++-------- dm/tests/new_relay/run.sh | 8 ++------ 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 58050b1c4ed..5392c7ac044 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -290,13 +290,6 @@ func (s *Syncer) newJobChans() { chanSize := calculateChanSize(s.cfg.QueueSize, s.cfg.WorkerCount, s.cfg.Compact) s.dmlJobCh = make(chan *job, chanSize) s.ddlJobCh = make(chan *job, s.cfg.QueueSize) - s.checkpointFlushWorker = &checkpointFlushWorker{ - input: make(chan *checkpointFlushTask, 16), - cp: s.checkpoint, - execError: &s.execError, - afterFlushFn: 
s.afterFlushCheckpoint, - updateJobMetricsFn: s.updateJobMetrics, - } s.jobsClosed.Store(false) } @@ -308,7 +301,6 @@ func (s *Syncer) closeJobChans() { } close(s.dmlJobCh) close(s.ddlJobCh) - s.checkpointFlushWorker.Close() s.jobsClosed.Store(true) } @@ -567,6 +559,13 @@ func (s *Syncer) reset() { } // create new job chans s.newJobChans() + s.checkpointFlushWorker = &checkpointFlushWorker{ + input: make(chan *checkpointFlushTask, 16), + cp: s.checkpoint, + execError: &s.execError, + afterFlushFn: s.afterFlushCheckpoint, + updateJobMetricsFn: s.updateJobMetrics, + } s.execError.Store(nil) s.setErrLocation(nil, nil, false) @@ -1514,6 +1513,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { defer func() { s.runCancel() s.closeJobChans() + s.checkpointFlushWorker.Close() s.runWg.Wait() }() diff --git a/dm/tests/new_relay/run.sh b/dm/tests/new_relay/run.sh index 512819f3504..717bb3e210c 100755 --- a/dm/tests/new_relay/run.sh +++ b/dm/tests/new_relay/run.sh @@ -56,12 +56,8 @@ function test_restart_relay_status() { "list-member -n worker3" \ "relay" 1 - echo "kill dm-worker1" - ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true - check_port_offline $WORKER1_PORT 20 - echo "kill dm-master" - ps aux | grep dm-master | awk '{print $2}' | xargs kill || true - check_master_port_offline 1 + kill_dm_worker + kill_dm_master run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT From 76180db7ea6f8b62a47929c730a7c02425d3e3c8 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Mon, 14 Feb 2022 15:42:56 +0800 Subject: [PATCH 45/47] fix check graceful stop --- dm/tests/checkpoint_transaction/run.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 7a47c4165f7..e3834469245 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -112,6 +112,16 @@ function run() { "query-status test" \ "\"stage\": \"Running\"" 1 + echo "kill dm-worker1" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 + rm -rf $WORK_DIR/worker1 + run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml + check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT + run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \ + "query-status test" \ + "\"stage\": \"Running\"" 1 + run_sql_file $cur/data/db1.increment2.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 # wait transaction start # you can see why sleep in https://github.com/pingcap/dm/pull/1928#issuecomment-895820239 From 4809d42ff1a186b02cc08c6bf028c5a0fc310413 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 15 Feb 2022 17:08:13 +0800 Subject: [PATCH 46/47] address comments --- dm/syncer/syncer.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index d768ac8f0a6..754fd23d19b 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -1527,6 +1527,8 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.closeJobChans() s.checkpointFlushWorker.Close() s.runWg.Wait() + // s.syncCancel won't be called when normal exit, this call just to follow the best practice of use context. 
+ s.syncCancel() }() // we should start this goroutine as soon as possible, because it's the only goroutine that cancel syncer.Run From f7df28a9d4172661c8f98070c79a4d57d3268801 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Wed, 16 Feb 2022 11:50:06 +0800 Subject: [PATCH 47/47] remove sleep --- dm/tests/new_relay/run.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dm/tests/new_relay/run.sh b/dm/tests/new_relay/run.sh index 717bb3e210c..b359bd0c606 100755 --- a/dm/tests/new_relay/run.sh +++ b/dm/tests/new_relay/run.sh @@ -115,11 +115,9 @@ function test_cant_dail_upstream() { check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT # make sure DM-worker doesn't exit - sleep 2 run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "query-status -s $SOURCE_ID1" \ "injected error" 1 - echo ">>>>>>>>>>>>>>>>>>>>>>>>>>test test_cant_dail_upstream passed" } @@ -158,8 +156,6 @@ function test_cant_dail_downstream() { # restart tidb run_tidb_server 4000 $TIDB_PASSWORD - sleep 2 - echo ">>>>>>>>>>>>>>>>>>>>>>>>>>test test_cant_dail_downstream passed" }
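The net effect on pause/stop, exercised by TestWaitBeforeRunExit above, is a bounded graceful wait: the syncer waits for the running transaction to end, but never longer than maxPauseOrStopWaitTime before cancelling the run context. A stand-alone sketch of that bounded wait, under illustrative names that are not code from this series:

package main

import (
	"fmt"
	"time"
)

// boundedWait waits for the current transaction to end, but gives up after
// maxWait and forces the stop instead. Illustrative sketch only.
func boundedWait(txnDone <-chan struct{}, maxWait time.Duration, forceStop func()) {
	select {
	case <-txnDone:
		fmt.Println("transaction ended in time, graceful exit")
	case <-time.After(maxWait):
		fmt.Println("wait timed out, forcing exit")
		forceStop()
	}
}

func main() {
	txnDone := make(chan struct{})
	go func() {
		time.Sleep(10 * time.Millisecond) // pretend the transaction finishes quickly
		close(txnDone)
	}()
	boundedWait(txnDone, 100*time.Millisecond, func() {})
}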