From c7c58bdb0ce7ece393a97bf35c07c892a173d455 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 23 Dec 2021 15:53:46 +0800 Subject: [PATCH 01/47] change interface --- dm/cmd/dm-syncer/main.go | 2 +- dm/dm/unit/unit.go | 3 ++- dm/dm/worker/subtask.go | 6 +++--- dm/dm/worker/subtask_test.go | 2 +- dm/dumpling/dumpling.go | 2 +- dm/dumpling/dumpling_test.go | 2 +- dm/loader/lightning.go | 2 +- dm/loader/loader.go | 2 +- dm/syncer/syncer.go | 2 +- dm/syncer/syncer_test.go | 4 ++-- 10 files changed, 14 insertions(+), 13 deletions(-) diff --git a/dm/cmd/dm-syncer/main.go b/dm/cmd/dm-syncer/main.go index 0f89de19696..41928a55683 100644 --- a/dm/cmd/dm-syncer/main.go +++ b/dm/cmd/dm-syncer/main.go @@ -102,7 +102,7 @@ func main() { } // 5. close the syncer - sync.Close() + sync.Close(true) log.L().Info("dm-syncer exit") // 6. flush log diff --git a/dm/dm/unit/unit.go b/dm/dm/unit/unit.go index 4451a652e86..1d9b32f4977 100644 --- a/dm/dm/unit/unit.go +++ b/dm/dm/unit/unit.go @@ -47,7 +47,8 @@ type Unit interface { Process(ctx context.Context, pr chan pb.ProcessResult) // Close shuts down the process and closes the unit, after that can not call Process to resume // The implementation should not block for a long time. - Close() + // if gracefulStop is true, unit is allowed to wait for its internal logic to complete, otherwise it should be closed immediately + Close(gracefulStop bool) // Pause does some cleanups and the unit can be resumed later. The caller will make sure Process has returned. // The implementation should not block for a long time. Pause() diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index 842c2f2c265..f67a8edb406 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -157,7 +157,7 @@ func (st *SubTask) initUnits(relay relay.Process) error { var needCloseUnits []unit.Unit defer func() { for _, u := range needCloseUnits { - u.Close() + u.Close(true) } st.initialized.Store(initializeUnitSuccess) @@ -311,7 +311,7 @@ func (st *SubTask) fetchResultAndUpdateStage(pr chan pb.ProcessResult) { switch stage { case pb.Stage_Finished: - cu.Close() + cu.Close(true) nu := st.getNextUnit() if nu == nil { // Now, when finished, it only stops the process @@ -380,7 +380,7 @@ func (st *SubTask) closeUnits() { for i := cui; i < len(st.units); i++ { u := st.units[i] st.l.Info("closing unit process", zap.Stringer("unit", cu.Type())) - u.Close() + u.Close(true) } } diff --git a/dm/dm/worker/subtask_test.go b/dm/dm/worker/subtask_test.go index 8fb5006f03a..0568197e7c1 100644 --- a/dm/dm/worker/subtask_test.go +++ b/dm/dm/worker/subtask_test.go @@ -116,7 +116,7 @@ func (m *MockUnit) Process(ctx context.Context, pr chan pb.ProcessResult) { } } -func (m *MockUnit) Close() {} +func (m *MockUnit) Close(gracefulStop bool) {} func (m MockUnit) Pause() {} diff --git a/dm/dumpling/dumpling.go b/dm/dumpling/dumpling.go index ed4c0fe8b4b..cf1695230a9 100644 --- a/dm/dumpling/dumpling.go +++ b/dm/dumpling/dumpling.go @@ -165,7 +165,7 @@ func (m *Dumpling) Process(ctx context.Context, pr chan pb.ProcessResult) { } // Close implements Unit.Close. 
-func (m *Dumpling) Close() { +func (m *Dumpling) Close(gracefulStop bool) { if m.closed.Load() { return } diff --git a/dm/dumpling/dumpling_test.go b/dm/dumpling/dumpling_test.go index 04deaef0429..9308ccb68ea 100644 --- a/dm/dumpling/dumpling_test.go +++ b/dm/dumpling/dumpling_test.go @@ -147,7 +147,7 @@ func (t *testDumplingSuite) TestCallStatus(c *C) { dumpling, _ := export.NewDumper(ctx, dumpConf) m.core = dumpling - m.Close() + m.Close(true) s = m.Status(nil).(*pb.DumpStatus) c.Assert(s.CompletedTables, Equals, float64(0)) c.Assert(s.FinishedBytes, Equals, float64(0)) diff --git a/dm/loader/lightning.go b/dm/loader/lightning.go index 66b41fdf347..075a9eacec6 100644 --- a/dm/loader/lightning.go +++ b/dm/loader/lightning.go @@ -279,7 +279,7 @@ func (l *LightningLoader) IsFreshTask(ctx context.Context) (bool, error) { } // Close does graceful shutdown. -func (l *LightningLoader) Close() { +func (l *LightningLoader) Close(gracefulStop bool) { l.Pause() l.checkPointList.Close() l.closed.Store(true) diff --git a/dm/loader/loader.go b/dm/loader/loader.go index 42eaaa20d4d..1f6475f77f1 100644 --- a/dm/loader/loader.go +++ b/dm/loader/loader.go @@ -787,7 +787,7 @@ func (l *Loader) loadFinishedSize() { } // Close does graceful shutdown. -func (l *Loader) Close() { +func (l *Loader) Close(gracefulStop bool) { l.Lock() defer l.Unlock() if l.isClosed() { diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 6f0c730aee8..ff0ea9f86c8 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -3160,7 +3160,7 @@ func (s *Syncer) isClosed() bool { } // Close closes syncer. -func (s *Syncer) Close() { +func (s *Syncer) Close(gracefulStop bool) { s.Lock() defer s.Unlock() diff --git a/dm/syncer/syncer_test.go b/dm/syncer/syncer_test.go index b487080b170..26472219497 100644 --- a/dm/syncer/syncer_test.go +++ b/dm/syncer/syncer_test.go @@ -1015,7 +1015,7 @@ func (s *testSyncerSuite) TestRun(c *C) { testJobs.RUnlock() cancel() - syncer.Close() + syncer.Close(true) c.Assert(syncer.isClosed(), IsTrue) if err := mock.ExpectationsWereMet(); err != nil { @@ -1191,7 +1191,7 @@ func (s *testSyncerSuite) TestExitSafeModeByConfig(c *C) { testJobs.Unlock() cancel() - syncer.Close() + syncer.Close(true) c.Assert(syncer.isClosed(), IsTrue) if err := mock.ExpectationsWereMet(); err != nil { From b2b8a7731673c5a85a670565178036816ab7deeb Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 23 Dec 2021 16:07:47 +0800 Subject: [PATCH 02/47] rename --- dm/dm/unit/unit.go | 4 ++-- dm/dm/worker/subtask_test.go | 2 +- dm/dumpling/dumpling.go | 2 +- dm/loader/lightning.go | 2 +- dm/loader/loader.go | 2 +- dm/syncer/syncer.go | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dm/dm/unit/unit.go b/dm/dm/unit/unit.go index 1d9b32f4977..58d59aa102f 100644 --- a/dm/dm/unit/unit.go +++ b/dm/dm/unit/unit.go @@ -47,8 +47,8 @@ type Unit interface { Process(ctx context.Context, pr chan pb.ProcessResult) // Close shuts down the process and closes the unit, after that can not call Process to resume // The implementation should not block for a long time. - // if gracefulStop is true, unit is allowed to wait for its internal logic to complete, otherwise it should be closed immediately - Close(gracefulStop bool) + // if graceful is true, unit is allowed to wait for its internal logic to complete, otherwise it should be closed immediately + Close(graceful bool) // Pause does some cleanups and the unit can be resumed later. The caller will make sure Process has returned. 
// The implementation should not block for a long time. Pause() diff --git a/dm/dm/worker/subtask_test.go b/dm/dm/worker/subtask_test.go index 0568197e7c1..13f2ef5c38f 100644 --- a/dm/dm/worker/subtask_test.go +++ b/dm/dm/worker/subtask_test.go @@ -116,7 +116,7 @@ func (m *MockUnit) Process(ctx context.Context, pr chan pb.ProcessResult) { } } -func (m *MockUnit) Close(gracefulStop bool) {} +func (m *MockUnit) Close(graceful bool) {} func (m MockUnit) Pause() {} diff --git a/dm/dumpling/dumpling.go b/dm/dumpling/dumpling.go index cf1695230a9..d64bbe5bb99 100644 --- a/dm/dumpling/dumpling.go +++ b/dm/dumpling/dumpling.go @@ -165,7 +165,7 @@ func (m *Dumpling) Process(ctx context.Context, pr chan pb.ProcessResult) { } // Close implements Unit.Close. -func (m *Dumpling) Close(gracefulStop bool) { +func (m *Dumpling) Close(graceful bool) { if m.closed.Load() { return } diff --git a/dm/loader/lightning.go b/dm/loader/lightning.go index 075a9eacec6..1fbb0356643 100644 --- a/dm/loader/lightning.go +++ b/dm/loader/lightning.go @@ -279,7 +279,7 @@ func (l *LightningLoader) IsFreshTask(ctx context.Context) (bool, error) { } // Close does graceful shutdown. -func (l *LightningLoader) Close(gracefulStop bool) { +func (l *LightningLoader) Close(graceful bool) { l.Pause() l.checkPointList.Close() l.closed.Store(true) diff --git a/dm/loader/loader.go b/dm/loader/loader.go index 1f6475f77f1..ce2a5782e01 100644 --- a/dm/loader/loader.go +++ b/dm/loader/loader.go @@ -787,7 +787,7 @@ func (l *Loader) loadFinishedSize() { } // Close does graceful shutdown. -func (l *Loader) Close(gracefulStop bool) { +func (l *Loader) Close(graceful bool) { l.Lock() defer l.Unlock() if l.isClosed() { diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index ff0ea9f86c8..fd227df49a1 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -3160,7 +3160,7 @@ func (s *Syncer) isClosed() bool { } // Close closes syncer. -func (s *Syncer) Close(gracefulStop bool) { +func (s *Syncer) Close(graceful bool) { s.Lock() defer s.Unlock() From 273cdbb2743c7f89c469963d3512a8bb4105c943 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 23 Dec 2021 16:14:16 +0800 Subject: [PATCH 03/47] adjust close --- dm/dm/worker/server.go | 4 ++-- dm/dm/worker/source_worker.go | 12 ++++++------ dm/dm/worker/source_worker_test.go | 14 +++++++------- dm/dm/worker/subtask.go | 10 +++++----- dm/dm/worker/subtask_holder.go | 6 +++--- dm/dm/worker/subtask_test.go | 2 +- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index da2952c6271..44ffb5ad367 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -452,7 +452,7 @@ func (s *Server) doClose() { } // close worker and wait for return if w := s.getWorker(false); w != nil { - w.Close() + w.Close(true) } s.closed.Store(true) } @@ -531,7 +531,7 @@ func (s *Server) stopWorker(sourceID string, needLock bool) error { s.UpdateKeepAliveTTL(s.cfg.KeepAliveTTL) s.setWorker(nil, false) s.setSourceStatus("", nil, false) - w.Close() + w.Close(true) return nil } diff --git a/dm/dm/worker/source_worker.go b/dm/dm/worker/source_worker.go index 607c5f530eb..3efefd45f58 100644 --- a/dm/dm/worker/source_worker.go +++ b/dm/dm/worker/source_worker.go @@ -115,7 +115,7 @@ func NewSourceWorker( if err != nil { // when err != nil, `w` will become nil in this func, so we pass `w` in defer. // release resources, NOTE: we need to refactor New/Init/Start/Close for components later. 
w2.cancel() - w2.subTaskHolder.closeAllSubTasks() + w2.subTaskHolder.closeAllSubTasks(true) } }(w) @@ -196,7 +196,7 @@ func (w *SourceWorker) Start() { } // Close stops working and releases resources. -func (w *SourceWorker) Close() { +func (w *SourceWorker) Close(graceful bool) { if w.closed.Load() { w.l.Warn("already closed") return @@ -212,7 +212,7 @@ func (w *SourceWorker) Close() { defer w.Unlock() // close all sub tasks - w.subTaskHolder.closeAllSubTasks() + w.subTaskHolder.closeAllSubTasks(graceful) if w.relayHolder != nil { w.relayHolder.Close() @@ -464,7 +464,7 @@ func (w *SourceWorker) EnableHandleSubtasks() error { // "for range" of a map will use same value address, so we'd better not pass value address to other function clone := subTaskCfg if err2 := w.StartSubTask(&clone, expectStage.Expect, false); err2 != nil { - w.subTaskHolder.closeAllSubTasks() + w.subTaskHolder.closeAllSubTasks(true) return err2 } } @@ -497,7 +497,7 @@ func (w *SourceWorker) DisableHandleSubtasks() { defer w.Unlock() // close all sub tasks - w.subTaskHolder.closeAllSubTasks() + w.subTaskHolder.closeAllSubTasks(true) w.l.Info("handling subtask enabled") } @@ -604,7 +604,7 @@ func (w *SourceWorker) OperateSubTask(name string, op pb.TaskOp) error { switch op { case pb.TaskOp_Stop: w.l.Info("stop sub task", zap.String("task", name)) - st.Close() + st.Close(true) w.subTaskHolder.removeSubTask(name) case pb.TaskOp_Pause: w.l.Info("pause sub task", zap.String("task", name)) diff --git a/dm/dm/worker/source_worker_test.go b/dm/dm/worker/source_worker_test.go index 5dd29bce6c9..574fd7f4020 100644 --- a/dm/dm/worker/source_worker_test.go +++ b/dm/dm/worker/source_worker_test.go @@ -89,10 +89,10 @@ func (t *testServer) testWorker(c *C) { c.Assert(w.GetUnitAndSourceStatusJSON("", nil), HasLen, emptyWorkerStatusInfoJSONLength) // close twice - w.Close() + w.Close(true) c.Assert(w.closed.Load(), IsTrue) c.Assert(w.subTaskHolder.getAllSubTasks(), HasLen, 0) - w.Close() + w.Close(true) c.Assert(w.closed.Load(), IsTrue) c.Assert(w.subTaskHolder.getAllSubTasks(), HasLen, 0) c.Assert(w.closed.Load(), IsTrue) @@ -294,7 +294,7 @@ func (t *testWorkerFunctionalities) TestWorkerFunctionalities(c *C) { // start worker w, err := NewSourceWorker(sourceCfg, etcdCli, "", "") c.Assert(err, IsNil) - defer w.Close() + defer w.Close(true) go func() { w.Start() }() @@ -467,7 +467,7 @@ func (t *testWorkerEtcdCompact) TestWatchSubtaskStageEtcdCompact(c *C) { c.Assert(err, IsNil) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - defer w.Close() + defer w.Close(true) go func() { w.Start() }() @@ -530,7 +530,7 @@ func (t *testWorkerEtcdCompact) TestWatchSubtaskStageEtcdCompact(c *C) { c.Assert(status[0].Stage, Equals, pb.Stage_Running) cancel1() wg.Wait() - w.subTaskHolder.closeAllSubTasks() + w.subTaskHolder.closeAllSubTasks(true) // step 5: restart observe and start from startRev, this subtask should be added ctx2, cancel2 := context.WithCancel(ctx) wg.Add(1) @@ -548,7 +548,7 @@ func (t *testWorkerEtcdCompact) TestWatchSubtaskStageEtcdCompact(c *C) { c.Assert(status, HasLen, 1) c.Assert(status[0].Name, Equals, subtaskCfg.Name) c.Assert(status[0].Stage, Equals, pb.Stage_Running) - w.Close() + w.Close(true) cancel2() wg.Wait() } @@ -586,7 +586,7 @@ func (t *testWorkerEtcdCompact) TestWatchRelayStageEtcdCompact(c *C) { c.Assert(err, IsNil) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - defer w.Close() + defer w.Close(true) go func() { c.Assert(w.EnableRelay(false), IsNil) w.Start() diff --git 
a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index f67a8edb406..676b1bf1bc3 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -358,7 +358,7 @@ func (st *SubTask) PrevUnit() unit.Unit { } // closeUnits closes all un-closed units (current unit and all the subsequent units). -func (st *SubTask) closeUnits() { +func (st *SubTask) closeUnits(graceful bool) { st.cancel() st.resultWg.Wait() @@ -379,8 +379,8 @@ func (st *SubTask) closeUnits() { for i := cui; i < len(st.units); i++ { u := st.units[i] - st.l.Info("closing unit process", zap.Stringer("unit", cu.Type())) - u.Close(true) + st.l.Info("closing unit process", zap.Stringer("unit", cu.Type()), zap.Bool("graceful", graceful)) + u.Close(graceful) } } @@ -477,14 +477,14 @@ func (st *SubTask) Result() *pb.ProcessResult { } // Close stops the sub task. -func (st *SubTask) Close() { +func (st *SubTask) Close(graceful bool) { st.l.Info("closing") if !st.setStageIfNotIn([]pb.Stage{pb.Stage_Stopped, pb.Stage_Stopping, pb.Stage_Finished}, pb.Stage_Stopping) { st.l.Info("subTask is already closed, no need to close") return } - st.closeUnits() // close all un-closed units + st.closeUnits(graceful) // close all un-closed units updateTaskMetric(st.cfg.Name, st.cfg.SourceID, pb.Stage_Stopped, st.workerName) } diff --git a/dm/dm/worker/subtask_holder.go b/dm/dm/worker/subtask_holder.go index 888b756f894..141590f4461 100644 --- a/dm/dm/worker/subtask_holder.go +++ b/dm/dm/worker/subtask_holder.go @@ -55,7 +55,7 @@ func (h *subTaskHolder) resetAllSubTasks(relay relay.Process) { defer h.mu.Unlock() for _, st := range h.subTasks { stage := st.Stage() - st.Close() + st.Close(true) // TODO: make a st.reset st.ctx, st.cancel = context.WithCancel(context.Background()) st.cfg.UseRelay = relay != nil @@ -64,11 +64,11 @@ func (h *subTaskHolder) resetAllSubTasks(relay relay.Process) { } // closeAllSubTasks closes all subtask instances. -func (h *subTaskHolder) closeAllSubTasks() { +func (h *subTaskHolder) closeAllSubTasks(graceful bool) { h.mu.Lock() defer h.mu.Unlock() for _, st := range h.subTasks { - st.Close() + st.Close(graceful) } h.subTasks = make(map[string]*SubTask) } diff --git a/dm/dm/worker/subtask_test.go b/dm/dm/worker/subtask_test.go index 13f2ef5c38f..52b7487e898 100644 --- a/dm/dm/worker/subtask_test.go +++ b/dm/dm/worker/subtask_test.go @@ -512,7 +512,7 @@ func (t *testSubTask) TestSubtaskFastQuit(c *C) { return st.Stage() == pb.Stage_Running }), IsTrue) // test Close - st.Close() + st.Close(true) select { case <-time.After(500 * time.Millisecond): c.Fatal("fail to stop subtask in 0.5s when stuck into unitTransWaitCondition") From c7f833c71d5ae2e7c5b19f5c610c9bdf02790b39 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 23 Dec 2021 16:55:21 +0800 Subject: [PATCH 04/47] graceful stop worker when lost ha --- dm/dm/worker/join.go | 3 ++- dm/dm/worker/server.go | 12 ++++++------ dm/dm/worker/server_test.go | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/dm/dm/worker/join.go b/dm/dm/worker/join.go index ece56ec1a4d..fd00a49095c 100644 --- a/dm/dm/worker/join.go +++ b/dm/dm/worker/join.go @@ -108,7 +108,8 @@ func (s *Server) KeepAlive() { failpoint.Label("bypass") // TODO: report the error. - err := s.stopWorker("", true) + // when lost keepalive, stop the worker with not graceful. this is to fix https://github.com/pingcap/tiflow/issues/3737 + err := s.stopWorker("", true, false) if err != nil { log.L().Error("fail to stop worker", zap.Error(err)) return // return if failed to stop the worker. 
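
The graceful flag passed to stopWorker here is threaded down through SourceWorker.Close and the subtask holder to each unit's Close(graceful). A minimal sketch of how a unit might honor that contract (the type, field names and the 10-second bound are illustrative only, not taken from these patches; the real syncer waits up to maxPauseOrStopWaitTime for the current transaction to end):

package example

import (
	"context"
	"sync"
	"time"
)

// sketchUnit illustrates the Close(graceful) contract: Close(false) cancels the
// Process loop immediately, Close(true) first gives in-flight work a bounded
// amount of time to drain.
type sketchUnit struct {
	mu     sync.Mutex
	closed bool
	cancel context.CancelFunc // cancels the unit's Process loop
	done   chan struct{}      // closed by Process once in-flight work has drained
}

func (u *sketchUnit) Close(graceful bool) {
	u.mu.Lock()
	if u.closed {
		u.mu.Unlock()
		return
	}
	u.closed = true
	u.mu.Unlock()

	if graceful {
		select {
		case <-u.done: // current transaction finished
		case <-time.After(10 * time.Second): // but never block for long
		}
	}
	u.cancel()
}

Close(false) skips the wait entirely, which is what the lost-keepalive path above relies on.
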
diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index 44ffb5ad367..ee257914759 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -325,7 +325,7 @@ func (s *Server) observeRelayConfig(ctx context.Context, rev int64) error { } return nil } - err = s.stopWorker("", false) + err = s.stopWorker("", false, true) if err != nil { log.L().Error("fail to stop worker", zap.Error(err)) return err // return if failed to stop the worker. @@ -414,7 +414,7 @@ func (s *Server) observeSourceBound(ctx context.Context, rev int64) error { } return nil } - err = s.stopWorker("", false) + err = s.stopWorker("", false, true) if err != nil { log.L().Error("fail to stop worker", zap.Error(err)) return err // return if failed to stop the worker. @@ -515,7 +515,7 @@ func (s *Server) setSourceStatus(source string, err error, needLock bool) { // if sourceID is set to "", worker will be closed directly // if sourceID is not "", we will check sourceID with w.cfg.SourceID. -func (s *Server) stopWorker(sourceID string, needLock bool) error { +func (s *Server) stopWorker(sourceID string, needLock, graceful bool) error { if needLock { s.Lock() defer s.Unlock() @@ -531,7 +531,7 @@ func (s *Server) stopWorker(sourceID string, needLock bool) error { s.UpdateKeepAliveTTL(s.cfg.KeepAliveTTL) s.setWorker(nil, false) s.setSourceStatus("", nil, false) - w.Close(true) + w.Close(graceful) return nil } @@ -676,7 +676,7 @@ func (s *Server) disableHandleSubtasks(source string) error { var err error if !w.relayEnabled.Load() { log.L().Info("relay is not enabled after disabling subtask, so stop worker") - err = s.stopWorker(source, false) + err = s.stopWorker(source, false, true) } return err } @@ -731,7 +731,7 @@ func (s *Server) disableRelay(source string) error { var err error if !w.subTaskEnabled.Load() { log.L().Info("subtask is not enabled after disabling relay, so stop worker") - err = s.stopWorker(source, false) + err = s.stopWorker(source, false, true) } return err } diff --git a/dm/dm/worker/server_test.go b/dm/dm/worker/server_test.go index f7c007983da..e9a578ddc82 100644 --- a/dm/dm/worker/server_test.go +++ b/dm/dm/worker/server_test.go @@ -426,7 +426,7 @@ func (t *testServer) TestWatchSourceBoundEtcdCompact(c *C) { c.Assert(cfg2, DeepEquals, sourceCfg) cancel1() wg.Wait() - c.Assert(s.stopWorker(sourceCfg.SourceID, true), IsNil) + c.Assert(s.stopWorker(sourceCfg.SourceID, true, true), IsNil) // step 5: start observeSourceBound from compacted revision again, should start worker ctx2, cancel2 := context.WithCancel(ctx) wg.Add(1) From bb57be67c5904191fe2e3c3bf82c952d42c8a883 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 24 Dec 2021 16:57:45 +0800 Subject: [PATCH 05/47] no graceful end in syncer --- dm/syncer/syncer.go | 73 ++++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index fd227df49a1..36e93eb5a05 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -157,6 +157,8 @@ type Syncer struct { waitXIDJob atomic.Int64 isTransactionEnd bool waitTransactionLock sync.Mutex + noWaitCtx context.Context // when this cancel, syncer will exit quickly no need to wait transaction end + noWaitCancel context.CancelFunc tableRouter *router.Table binlogFilter *bf.BinlogEvent @@ -273,6 +275,10 @@ func NewSyncer(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, relay rel } syncer.lastCheckpointFlushedTime = time.Time{} syncer.relay = relay + + noWaitCtx, noWaitCancel := 
context.WithCancel(context.Background()) + syncer.noWaitCtx = noWaitCtx + syncer.noWaitCancel = noWaitCancel return syncer } @@ -564,7 +570,9 @@ func (s *Syncer) reset() { s.waitXIDJob.Store(int64(noWait)) s.isTransactionEnd = true s.flushSeq = 0 - + noWaitCtx, noWaitCancel := context.WithCancel(context.Background()) + s.noWaitCtx = noWaitCtx + s.noWaitCancel = noWaitCancel switch s.cfg.ShardMode { case config.ShardPessimistic: // every time start to re-sync from resume, we reset status to make it like a fresh syncing @@ -1385,6 +1393,37 @@ func (s *Syncer) syncDML() { } } +func (s *Syncer) waitTransactionEndBeforeExit(ctx, runCtx context.Context, cancel context.CancelFunc) { + <-ctx.Done() + select { + case <-runCtx.Done(): + default: + log.L().Info("received subtask's done") + + s.waitTransactionLock.Lock() + if s.isTransactionEnd { + s.waitXIDJob.Store(int64(waitComplete)) + log.L().Info("the last job is transaction end, done directly") + cancel() + s.waitTransactionLock.Unlock() + return + } + s.waitXIDJob.Store(int64(waiting)) + s.waitTransactionLock.Unlock() + + select { + case <-runCtx.Done(): + log.L().Info("received syncer's done") + case <-s.noWaitCtx.Done(): + log.L().Info("received no wait ctx done") + cancel() + case <-time.After(maxPauseOrStopWaitTime): + log.L().Info("wait transaction end timeout") + cancel() + } + } +} + // Run starts running for sync, we should guarantee it can rerun when paused. func (s *Syncer) Run(ctx context.Context) (err error) { runCtx, runCancel := context.WithCancel(context.Background()) @@ -1397,33 +1436,8 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } }() - go func() { - <-ctx.Done() - select { - case <-runCtx.Done(): - default: - tctx.L().Info("received subtask's done") - - s.waitTransactionLock.Lock() - if s.isTransactionEnd { - s.waitXIDJob.Store(int64(waitComplete)) - tctx.L().Info("the last job is transaction end, done directly") - runCancel() - s.waitTransactionLock.Unlock() - return - } - s.waitXIDJob.Store(int64(waiting)) - s.waitTransactionLock.Unlock() - - select { - case <-runCtx.Done(): - tctx.L().Info("received syncer's done") - case <-time.After(maxPauseOrStopWaitTime): - tctx.L().Info("wait transaction end timeout") - runCancel() - } - } - }() + // start another goroutine to wait until transaction end. + go s.waitTransactionEndBeforeExit(ctx, runCtx, runCancel) // some initialization that can't be put in Syncer.Init fresh, err := s.IsFreshTask(runCtx) @@ -3168,6 +3182,9 @@ func (s *Syncer) Close(graceful bool) { return } + if !graceful { + s.noWaitCancel() + } s.stopSync() s.closeDBs() From 5acafed61046081bcb9bebcbd12309762b3beb04 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Wed, 29 Dec 2021 13:38:30 +0800 Subject: [PATCH 06/47] save work --- dm/dm/worker/subtask.go | 6 ++ dm/syncer/syncer.go | 55 +++++++++---------- .../conf/dm-worker1.toml | 1 + dm/tests/checkpoint_transaction/run.sh | 52 ++++++++++++++++++ 4 files changed, 84 insertions(+), 30 deletions(-) diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index 676b1bf1bc3..b85de02d5ea 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -359,6 +359,12 @@ func (st *SubTask) PrevUnit() unit.Unit { // closeUnits closes all un-closed units (current unit and all the subsequent units). 
func (st *SubTask) closeUnits(graceful bool) { + // when not graceful, we want to syncer to exit immediately, so we call u.Close(false) before call cancel + // Note that we only implement un graceful close for sync unit + if !graceful && st.CurrUnit().Type() == pb.UnitType_Sync { + st.l.Info("closing syncer without graceful", zap.String("task", st.cfg.Name)) + st.CurrUnit().Close(false) + } st.cancel() st.resultWg.Wait() diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 36e93eb5a05..16e93fbb94f 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -157,8 +157,8 @@ type Syncer struct { waitXIDJob atomic.Int64 isTransactionEnd bool waitTransactionLock sync.Mutex - noWaitCtx context.Context // when this cancel, syncer will exit quickly no need to wait transaction end - noWaitCancel context.CancelFunc + runCtx context.Context // this ctx is injected in `s.Run` and when this ctx cancelled, syncer will exit quickly and not wait transaction end + runCancel context.CancelFunc tableRouter *router.Table binlogFilter *bf.BinlogEvent @@ -275,10 +275,6 @@ func NewSyncer(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, relay rel } syncer.lastCheckpointFlushedTime = time.Time{} syncer.relay = relay - - noWaitCtx, noWaitCancel := context.WithCancel(context.Background()) - syncer.noWaitCtx = noWaitCtx - syncer.noWaitCancel = noWaitCancel return syncer } @@ -570,9 +566,6 @@ func (s *Syncer) reset() { s.waitXIDJob.Store(int64(noWait)) s.isTransactionEnd = true s.flushSeq = 0 - noWaitCtx, noWaitCancel := context.WithCancel(context.Background()) - s.noWaitCtx = noWaitCtx - s.noWaitCancel = noWaitCancel switch s.cfg.ShardMode { case config.ShardPessimistic: // every time start to re-sync from resume, we reset status to make it like a fresh syncing @@ -980,7 +973,7 @@ func (s *Syncer) addJob(job *job) error { s.isTransactionEnd = false failpoint.Inject("checkCheckpointInMiddleOfTransaction", func() { s.tctx.L().Info("receive dml job", zap.Any("dml job", job)) - time.Sleep(100 * time.Millisecond) + time.Sleep(500 * time.Millisecond) }) } @@ -1393,34 +1386,28 @@ func (s *Syncer) syncDML() { } } -func (s *Syncer) waitTransactionEndBeforeExit(ctx, runCtx context.Context, cancel context.CancelFunc) { - <-ctx.Done() +func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { select { - case <-runCtx.Done(): - default: - log.L().Info("received subtask's done") - + case <-ctx.Done(): // hijack the context to wait for the transaction to end. + log.L().Info("received subtask's done, try graceful stop") s.waitTransactionLock.Lock() if s.isTransactionEnd { s.waitXIDJob.Store(int64(waitComplete)) log.L().Info("the last job is transaction end, done directly") - cancel() + s.runCancel() s.waitTransactionLock.Unlock() return } s.waitXIDJob.Store(int64(waiting)) s.waitTransactionLock.Unlock() - select { - case <-runCtx.Done(): - log.L().Info("received syncer's done") - case <-s.noWaitCtx.Done(): - log.L().Info("received no wait ctx done") - cancel() + case <-s.runCtx.Done(): + log.L().Info("received run ctx done, exit now") case <-time.After(maxPauseOrStopWaitTime): - log.L().Info("wait transaction end timeout") - cancel() + log.L().Info("wait transaction end timeout, exit now") } + case <-s.runCtx.Done(): // when no graceful stop, run ctx will canceled first. 
+ log.L().Info("received ungraceful exit ctx, exit now") } } @@ -1428,6 +1415,10 @@ func (s *Syncer) waitTransactionEndBeforeExit(ctx, runCtx context.Context, cance func (s *Syncer) Run(ctx context.Context) (err error) { runCtx, runCancel := context.WithCancel(context.Background()) defer runCancel() + s.Lock() + s.runCtx = runCtx + s.runCancel = runCancel + s.Unlock() tctx := s.tctx.WithContext(runCtx) defer func() { @@ -1436,8 +1427,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } }() - // start another goroutine to wait until transaction end. - go s.waitTransactionEndBeforeExit(ctx, runCtx, runCancel) + go s.waitTransactionEndBeforeExit(ctx) // some initialization that can't be put in Syncer.Init fresh, err := s.IsFreshTask(runCtx) @@ -3183,15 +3173,20 @@ func (s *Syncer) Close(graceful bool) { } if !graceful { - s.noWaitCancel() + s.runCancel() + s.tctx.L().Warn("syncer is closing without graceful") } s.stopSync() s.closeDBs() s.checkpoint.Close() - if err := s.schemaTracker.Close(); err != nil { - s.tctx.L().Error("fail to close schema tracker", log.ShortError(err)) + if graceful { + // when close sycner without graceful, Close will be called before s.Process exit. + // if we close schematracker here worker will pending on `s.checkpoint.Rollback(s.schemaTracker)` forever + if err := s.schemaTracker.Close(); err != nil { + s.tctx.L().Error("fail to close schema tracker", log.ShortError(err)) + } } if s.sgk != nil { diff --git a/dm/tests/checkpoint_transaction/conf/dm-worker1.toml b/dm/tests/checkpoint_transaction/conf/dm-worker1.toml index 7a72ea72bf8..1a53a1dc788 100644 --- a/dm/tests/checkpoint_transaction/conf/dm-worker1.toml +++ b/dm/tests/checkpoint_transaction/conf/dm-worker1.toml @@ -1,2 +1,3 @@ name = "worker1" join = "127.0.0.1:8261" +keepalive-ttl = 1 diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 5771945384a..8562f2c1315 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -6,6 +6,35 @@ cur=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) source $cur/../_utils/test_prepare WORK_DIR=$TEST_DIR/$TEST_NAME +function check_worker_ungraceful_stop_with_retry() { + all_matched=false + + for ((k = 0; k < 10; k++)); do + sleep 1 + echo "start check_worker_ungraceful_stop_with_retry times: $k" + num=$(grep "closing syncer without graceful" $WORK_DIR/worker1/log/dm-worker.log | wc -l) + if [ $num -lt 1 ]; then + continue + fi + num=$(grep "syncer is closing" $WORK_DIR/worker1/log/dm-worker.log | wc -l) + if [ $num -lt 1 ]; then + continue + fi + num=$(grep "received ungraceful exit ctx, exit now" $WORK_DIR/worker1/log/dm-worker.log | wc -l) + if [ $num -lt 1 ]; then + continue + fi + echo "check_worker_ungraceful_stop_with_retry after retry: $k" + all_matched=true + if $all_matched; then + break + fi + done + if ! 
$all_matched; then + exit 1 + fi +} + function run() { export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/syncer/checkCheckpointInMiddleOfTransaction=return" @@ -28,6 +57,29 @@ function run() { # check diff check_sync_diff $WORK_DIR $cur/conf/diff_config.toml + # test ungraceful stop, worker will not wait transaction finish + run_sql_file $cur/data/db1.increment1.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 + sleep 2 + # kill dm-master 1 to make worker lost keepalive while a transaction is not finished + echo "kill dm-master1" + kill_dm_master + check_master_port_offline 1 + sleep 1 # wiat woker lost ha keepalive-ttl is 1 second + + # check dm-worker2 will exit quickly without waiting for the transaction to finish + check_worker_ungraceful_stop_with_retry + + # test data in tidb less than source + dataCountSource=$(mysql -uroot -h$MYSQL_HOST1 -P$MYSQL_PORT1 -p$MYSQL_PASSWORD1 -se "select count(1) from checkpoint_transaction.t1") + dataCountIntidb=$(mysql -uroot -h127.0.0.1 -P4000 -se "select count(1) from checkpoint_transaction.t1") + echo "afetr ungraceful exit data in source count: $dataCountSource data in tidb count: $dataCountIntidb" + [[ $dataCountIntidb -lt $dataCountSource ]] + + # start dm-master again task will be resume, and data will be synced + run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml + check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT + check_sync_diff $WORK_DIR $cur/conf/diff_config.toml + run_sql_file $cur/data/db1.increment1.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 # wait transaction start # you can see why sleep in https://github.com/pingcap/dm/pull/1928#issuecomment-895820239 From 0e68f2419194925da7dc8c27ca33404715678010 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 30 Dec 2021 16:39:10 +0800 Subject: [PATCH 07/47] fix comment --- dm/dm/worker/subtask.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index b85de02d5ea..c2003b4bbba 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -360,7 +360,7 @@ func (st *SubTask) PrevUnit() unit.Unit { // closeUnits closes all un-closed units (current unit and all the subsequent units). 
func (st *SubTask) closeUnits(graceful bool) { // when not graceful, we want to syncer to exit immediately, so we call u.Close(false) before call cancel - // Note that we only implement un graceful close for sync unit + // note that we only implement ungraceful close for sync unit if !graceful && st.CurrUnit().Type() == pb.UnitType_Sync { st.l.Info("closing syncer without graceful", zap.String("task", st.cfg.Name)) st.CurrUnit().Close(false) From 6081bce35ebc7cb8f6d7fe2613e0123493e93b22 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 4 Jan 2022 10:37:35 +0800 Subject: [PATCH 08/47] address comment and trigger test --- dm/cmd/dm-syncer/main.go | 2 +- dm/dm/unit/unit.go | 5 ++-- dm/dm/worker/join.go | 2 +- dm/dm/worker/server.go | 6 ++-- dm/dm/worker/source_worker.go | 18 +++++++----- dm/dm/worker/source_worker_test.go | 16 +++++------ dm/dm/worker/subtask.go | 39 +++++++++++++++++--------- dm/dm/worker/subtask_holder.go | 16 +++++++++-- dm/dm/worker/subtask_test.go | 6 ++-- dm/dumpling/dumpling.go | 8 +++++- dm/dumpling/dumpling_test.go | 2 +- dm/loader/lightning.go | 8 +++++- dm/loader/loader.go | 8 +++++- dm/syncer/syncer.go | 25 +++++++++++------ dm/syncer/syncer_test.go | 4 +-- dm/tests/checkpoint_transaction/run.sh | 7 +++-- 16 files changed, 114 insertions(+), 58 deletions(-) diff --git a/dm/cmd/dm-syncer/main.go b/dm/cmd/dm-syncer/main.go index 41928a55683..0f89de19696 100644 --- a/dm/cmd/dm-syncer/main.go +++ b/dm/cmd/dm-syncer/main.go @@ -102,7 +102,7 @@ func main() { } // 5. close the syncer - sync.Close(true) + sync.Close() log.L().Info("dm-syncer exit") // 6. flush log diff --git a/dm/dm/unit/unit.go b/dm/dm/unit/unit.go index 58d59aa102f..363d17bf2cc 100644 --- a/dm/dm/unit/unit.go +++ b/dm/dm/unit/unit.go @@ -47,8 +47,9 @@ type Unit interface { Process(ctx context.Context, pr chan pb.ProcessResult) // Close shuts down the process and closes the unit, after that can not call Process to resume // The implementation should not block for a long time. - // if graceful is true, unit is allowed to wait for its internal logic to complete, otherwise it should be closed immediately - Close(graceful bool) + Close() + // Kill shuts down the process and closes the unit without graceful. + Kill() // Pause does some cleanups and the unit can be resumed later. The caller will make sure Process has returned. // The implementation should not block for a long time. Pause() diff --git a/dm/dm/worker/join.go b/dm/dm/worker/join.go index fd00a49095c..e9d18ba5bbb 100644 --- a/dm/dm/worker/join.go +++ b/dm/dm/worker/join.go @@ -108,7 +108,7 @@ func (s *Server) KeepAlive() { failpoint.Label("bypass") // TODO: report the error. - // when lost keepalive, stop the worker with not graceful. this is to fix https://github.com/pingcap/tiflow/issues/3737 + // when lost keepalive, stop the worker without graceful. 
this is to fix https://github.com/pingcap/tiflow/issues/3737 err := s.stopWorker("", true, false) if err != nil { log.L().Error("fail to stop worker", zap.Error(err)) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index ee257914759..b3c778c2d21 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -450,9 +450,9 @@ func (s *Server) doClose() { if s.closed.Load() { return } - // close worker and wait for return + // stop worker and wait for return if w := s.getWorker(false); w != nil { - w.Close(true) + w.Stop(true) } s.closed.Store(true) } @@ -531,7 +531,7 @@ func (s *Server) stopWorker(sourceID string, needLock, graceful bool) error { s.UpdateKeepAliveTTL(s.cfg.KeepAliveTTL) s.setWorker(nil, false) s.setSourceStatus("", nil, false) - w.Close(graceful) + w.Stop(graceful) return nil } diff --git a/dm/dm/worker/source_worker.go b/dm/dm/worker/source_worker.go index 3efefd45f58..efcf8e04490 100644 --- a/dm/dm/worker/source_worker.go +++ b/dm/dm/worker/source_worker.go @@ -115,7 +115,7 @@ func NewSourceWorker( if err != nil { // when err != nil, `w` will become nil in this func, so we pass `w` in defer. // release resources, NOTE: we need to refactor New/Init/Start/Close for components later. w2.cancel() - w2.subTaskHolder.closeAllSubTasks(true) + w2.subTaskHolder.closeAllSubTasks() } }(w) @@ -196,7 +196,7 @@ func (w *SourceWorker) Start() { } // Close stops working and releases resources. -func (w *SourceWorker) Close(graceful bool) { +func (w *SourceWorker) Stop(graceful bool) { if w.closed.Load() { w.l.Warn("already closed") return @@ -211,8 +211,12 @@ func (w *SourceWorker) Close(graceful bool) { w.Lock() defer w.Unlock() - // close all sub tasks - w.subTaskHolder.closeAllSubTasks(graceful) + // close or kill all subtasks + if graceful { + w.subTaskHolder.closeAllSubTasks() + } else { + w.subTaskHolder.killAllSubTasks() + } if w.relayHolder != nil { w.relayHolder.Close() @@ -464,7 +468,7 @@ func (w *SourceWorker) EnableHandleSubtasks() error { // "for range" of a map will use same value address, so we'd better not pass value address to other function clone := subTaskCfg if err2 := w.StartSubTask(&clone, expectStage.Expect, false); err2 != nil { - w.subTaskHolder.closeAllSubTasks(true) + w.subTaskHolder.closeAllSubTasks() return err2 } } @@ -497,7 +501,7 @@ func (w *SourceWorker) DisableHandleSubtasks() { defer w.Unlock() // close all sub tasks - w.subTaskHolder.closeAllSubTasks(true) + w.subTaskHolder.closeAllSubTasks() w.l.Info("handling subtask enabled") } @@ -604,7 +608,7 @@ func (w *SourceWorker) OperateSubTask(name string, op pb.TaskOp) error { switch op { case pb.TaskOp_Stop: w.l.Info("stop sub task", zap.String("task", name)) - st.Close(true) + st.Close() w.subTaskHolder.removeSubTask(name) case pb.TaskOp_Pause: w.l.Info("pause sub task", zap.String("task", name)) diff --git a/dm/dm/worker/source_worker_test.go b/dm/dm/worker/source_worker_test.go index 574fd7f4020..770b7ec7983 100644 --- a/dm/dm/worker/source_worker_test.go +++ b/dm/dm/worker/source_worker_test.go @@ -88,11 +88,11 @@ func (t *testServer) testWorker(c *C) { c.Assert(err, IsNil) c.Assert(w.GetUnitAndSourceStatusJSON("", nil), HasLen, emptyWorkerStatusInfoJSONLength) - // close twice - w.Close(true) + // stop twice + w.Stop(true) c.Assert(w.closed.Load(), IsTrue) c.Assert(w.subTaskHolder.getAllSubTasks(), HasLen, 0) - w.Close(true) + w.Stop(true) c.Assert(w.closed.Load(), IsTrue) c.Assert(w.subTaskHolder.getAllSubTasks(), HasLen, 0) c.Assert(w.closed.Load(), IsTrue) @@ -294,7 +294,7 @@ 
func (t *testWorkerFunctionalities) TestWorkerFunctionalities(c *C) { // start worker w, err := NewSourceWorker(sourceCfg, etcdCli, "", "") c.Assert(err, IsNil) - defer w.Close(true) + defer w.Stop(true) go func() { w.Start() }() @@ -467,7 +467,7 @@ func (t *testWorkerEtcdCompact) TestWatchSubtaskStageEtcdCompact(c *C) { c.Assert(err, IsNil) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - defer w.Close(true) + defer w.Stop(true) go func() { w.Start() }() @@ -530,7 +530,7 @@ func (t *testWorkerEtcdCompact) TestWatchSubtaskStageEtcdCompact(c *C) { c.Assert(status[0].Stage, Equals, pb.Stage_Running) cancel1() wg.Wait() - w.subTaskHolder.closeAllSubTasks(true) + w.subTaskHolder.closeAllSubTasks() // step 5: restart observe and start from startRev, this subtask should be added ctx2, cancel2 := context.WithCancel(ctx) wg.Add(1) @@ -548,7 +548,7 @@ func (t *testWorkerEtcdCompact) TestWatchSubtaskStageEtcdCompact(c *C) { c.Assert(status, HasLen, 1) c.Assert(status[0].Name, Equals, subtaskCfg.Name) c.Assert(status[0].Stage, Equals, pb.Stage_Running) - w.Close(true) + w.Stop(true) cancel2() wg.Wait() } @@ -586,7 +586,7 @@ func (t *testWorkerEtcdCompact) TestWatchRelayStageEtcdCompact(c *C) { c.Assert(err, IsNil) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - defer w.Close(true) + defer w.Stop(true) go func() { c.Assert(w.EnableRelay(false), IsNil) w.Start() diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index c2003b4bbba..5d37f2ae025 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -157,7 +157,7 @@ func (st *SubTask) initUnits(relay relay.Process) error { var needCloseUnits []unit.Unit defer func() { for _, u := range needCloseUnits { - u.Close(true) + u.Close() } st.initialized.Store(initializeUnitSuccess) @@ -311,7 +311,7 @@ func (st *SubTask) fetchResultAndUpdateStage(pr chan pb.ProcessResult) { switch stage { case pb.Stage_Finished: - cu.Close(true) + cu.Close() nu := st.getNextUnit() if nu == nil { // Now, when finished, it only stops the process @@ -358,13 +358,7 @@ func (st *SubTask) PrevUnit() unit.Unit { } // closeUnits closes all un-closed units (current unit and all the subsequent units). -func (st *SubTask) closeUnits(graceful bool) { - // when not graceful, we want to syncer to exit immediately, so we call u.Close(false) before call cancel - // note that we only implement ungraceful close for sync unit - if !graceful && st.CurrUnit().Type() == pb.UnitType_Sync { - st.l.Info("closing syncer without graceful", zap.String("task", st.cfg.Name)) - st.CurrUnit().Close(false) - } +func (st *SubTask) closeUnits() { st.cancel() st.resultWg.Wait() @@ -385,8 +379,16 @@ func (st *SubTask) closeUnits(graceful bool) { for i := cui; i < len(st.units); i++ { u := st.units[i] - st.l.Info("closing unit process", zap.Stringer("unit", cu.Type()), zap.Bool("graceful", graceful)) - u.Close(graceful) + st.l.Info("closing unit process", zap.Stringer("unit", cu.Type())) + u.Close() + } +} + +func (st *SubTask) killCurrentUnit() { + // note that we only implement ungraceful stop for sync unit + if st.CurrUnit().Type() == pb.UnitType_Sync { + st.l.Info("kill syncer unit", zap.String("task", st.cfg.Name)) + st.CurrUnit().Kill() } } @@ -483,14 +485,25 @@ func (st *SubTask) Result() *pb.ProcessResult { } // Close stops the sub task. 
-func (st *SubTask) Close(graceful bool) { +func (st *SubTask) Close() { st.l.Info("closing") if !st.setStageIfNotIn([]pb.Stage{pb.Stage_Stopped, pb.Stage_Stopping, pb.Stage_Finished}, pb.Stage_Stopping) { st.l.Info("subTask is already closed, no need to close") return } + st.closeUnits() // close all un-closed units + updateTaskMetric(st.cfg.Name, st.cfg.SourceID, pb.Stage_Stopped, st.workerName) +} - st.closeUnits(graceful) // close all un-closed units +// kill kill running unit and stop the sub task. +func (st *SubTask) Kill() { + st.l.Info("killing") + if !st.setStageIfNotIn([]pb.Stage{pb.Stage_Stopped, pb.Stage_Stopping, pb.Stage_Finished}, pb.Stage_Stopping) { + st.l.Info("subTask is already closed, no need to close") + return + } + st.killCurrentUnit() + st.closeUnits() // close all un-closed units updateTaskMetric(st.cfg.Name, st.cfg.SourceID, pb.Stage_Stopped, st.workerName) } diff --git a/dm/dm/worker/subtask_holder.go b/dm/dm/worker/subtask_holder.go index 141590f4461..ebce2494671 100644 --- a/dm/dm/worker/subtask_holder.go +++ b/dm/dm/worker/subtask_holder.go @@ -55,7 +55,7 @@ func (h *subTaskHolder) resetAllSubTasks(relay relay.Process) { defer h.mu.Unlock() for _, st := range h.subTasks { stage := st.Stage() - st.Close(true) + st.Close() // TODO: make a st.reset st.ctx, st.cancel = context.WithCancel(context.Background()) st.cfg.UseRelay = relay != nil @@ -64,11 +64,21 @@ func (h *subTaskHolder) resetAllSubTasks(relay relay.Process) { } // closeAllSubTasks closes all subtask instances. -func (h *subTaskHolder) closeAllSubTasks(graceful bool) { +func (h *subTaskHolder) closeAllSubTasks() { h.mu.Lock() defer h.mu.Unlock() for _, st := range h.subTasks { - st.Close(graceful) + st.Close() + } + h.subTasks = make(map[string]*SubTask) +} + +// killAllSubTasks kill and stop all subtask instances. +func (h *subTaskHolder) killAllSubTasks() { + h.mu.Lock() + defer h.mu.Unlock() + for _, st := range h.subTasks { + st.Kill() } h.subTasks = make(map[string]*SubTask) } diff --git a/dm/dm/worker/subtask_test.go b/dm/dm/worker/subtask_test.go index 52b7487e898..5cdcbb747ef 100644 --- a/dm/dm/worker/subtask_test.go +++ b/dm/dm/worker/subtask_test.go @@ -116,7 +116,9 @@ func (m *MockUnit) Process(ctx context.Context, pr chan pb.ProcessResult) { } } -func (m *MockUnit) Close(graceful bool) {} +func (m *MockUnit) Close() {} + +func (m *MockUnit) Kill() {} func (m MockUnit) Pause() {} @@ -512,7 +514,7 @@ func (t *testSubTask) TestSubtaskFastQuit(c *C) { return st.Stage() == pb.Stage_Running }), IsTrue) // test Close - st.Close(true) + st.Close() select { case <-time.After(500 * time.Millisecond): c.Fatal("fail to stop subtask in 0.5s when stuck into unitTransWaitCondition") diff --git a/dm/dumpling/dumpling.go b/dm/dumpling/dumpling.go index d64bbe5bb99..ea14cbd2eb5 100644 --- a/dm/dumpling/dumpling.go +++ b/dm/dumpling/dumpling.go @@ -165,7 +165,7 @@ func (m *Dumpling) Process(ctx context.Context, pr chan pb.ProcessResult) { } // Close implements Unit.Close. -func (m *Dumpling) Close(graceful bool) { +func (m *Dumpling) Close() { if m.closed.Load() { return } @@ -175,6 +175,12 @@ func (m *Dumpling) Close(graceful bool) { m.closed.Store(true) } +// Kill implements Unit.Kill. +func (m *Dumpling) Kill() { + // TODO: implement kill + m.Close() +} + // Pause implements Unit.Pause. 
func (m *Dumpling) Pause() { if m.closed.Load() { diff --git a/dm/dumpling/dumpling_test.go b/dm/dumpling/dumpling_test.go index 9308ccb68ea..04deaef0429 100644 --- a/dm/dumpling/dumpling_test.go +++ b/dm/dumpling/dumpling_test.go @@ -147,7 +147,7 @@ func (t *testDumplingSuite) TestCallStatus(c *C) { dumpling, _ := export.NewDumper(ctx, dumpConf) m.core = dumpling - m.Close(true) + m.Close() s = m.Status(nil).(*pb.DumpStatus) c.Assert(s.CompletedTables, Equals, float64(0)) c.Assert(s.FinishedBytes, Equals, float64(0)) diff --git a/dm/loader/lightning.go b/dm/loader/lightning.go index 1fbb0356643..2cb7c11e99f 100644 --- a/dm/loader/lightning.go +++ b/dm/loader/lightning.go @@ -279,12 +279,18 @@ func (l *LightningLoader) IsFreshTask(ctx context.Context) (bool, error) { } // Close does graceful shutdown. -func (l *LightningLoader) Close(graceful bool) { +func (l *LightningLoader) Close() { l.Pause() l.checkPointList.Close() l.closed.Store(true) } +// Kill does ungraceful shutdown. +func (l *LightningLoader) Kill() { + // TODO: implement kill + l.Close() +} + // Pause pauses the process, and it can be resumed later // should cancel context from external. func (l *LightningLoader) Pause() { diff --git a/dm/loader/loader.go b/dm/loader/loader.go index ce2a5782e01..de3835ded64 100644 --- a/dm/loader/loader.go +++ b/dm/loader/loader.go @@ -787,7 +787,7 @@ func (l *Loader) loadFinishedSize() { } // Close does graceful shutdown. -func (l *Loader) Close(graceful bool) { +func (l *Loader) Close() { l.Lock() defer l.Unlock() if l.isClosed() { @@ -804,6 +804,12 @@ func (l *Loader) Close(graceful bool) { l.closed.Store(true) } +// Kill kill the loader without graceful. +func (l *Loader) Kill() { + // TODO: implement kill + l.Close() +} + // stopLoad stops loading, now it used by Close and Pause // maybe we can refine the workflow more clear. func (l *Loader) stopLoad() { diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 16e93fbb94f..61517cea76e 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -3163,8 +3163,7 @@ func (s *Syncer) isClosed() bool { return s.closed.Load() } -// Close closes syncer. -func (s *Syncer) Close(graceful bool) { +func (s *Syncer) close(closeTracker bool) { s.Lock() defer s.Unlock() @@ -3172,18 +3171,14 @@ func (s *Syncer) Close(graceful bool) { return } - if !graceful { - s.runCancel() - s.tctx.L().Warn("syncer is closing without graceful") - } s.stopSync() s.closeDBs() s.checkpoint.Close() - if graceful { - // when close sycner without graceful, Close will be called before s.Process exit. - // if we close schematracker here worker will pending on `s.checkpoint.Rollback(s.schemaTracker)` forever + // when close sycner without graceful, Close will be called before s.Process exit. + // if we close schematracker worker will pending on `s.checkpoint.Rollback(s.schemaTracker)` forever + if closeTracker { if err := s.schemaTracker.Close(); err != nil { s.tctx.L().Error("fail to close schema tracker", log.ShortError(err)) } @@ -3203,6 +3198,18 @@ func (s *Syncer) Close(graceful bool) { s.closed.Store(true) } +// Close closes syncer. +func (s *Syncer) Close() { + s.close(true) +} + +// Kill kill syncer without graceful. +func (s *Syncer) Kill() { + s.tctx.L().Warn("kill syncer without graceful") + s.runCancel() + s.close(false) +} + // stopSync stops syncing, now it used by Close and Pause // maybe we can refine the workflow more clear. 
func (s *Syncer) stopSync() { diff --git a/dm/syncer/syncer_test.go b/dm/syncer/syncer_test.go index 26472219497..b487080b170 100644 --- a/dm/syncer/syncer_test.go +++ b/dm/syncer/syncer_test.go @@ -1015,7 +1015,7 @@ func (s *testSyncerSuite) TestRun(c *C) { testJobs.RUnlock() cancel() - syncer.Close(true) + syncer.Close() c.Assert(syncer.isClosed(), IsTrue) if err := mock.ExpectationsWereMet(); err != nil { @@ -1191,7 +1191,7 @@ func (s *testSyncerSuite) TestExitSafeModeByConfig(c *C) { testJobs.Unlock() cancel() - syncer.Close(true) + syncer.Close() c.Assert(syncer.isClosed(), IsTrue) if err := mock.ExpectationsWereMet(); err != nil { diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 8562f2c1315..608cf0d0ef1 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -12,11 +12,12 @@ function check_worker_ungraceful_stop_with_retry() { for ((k = 0; k < 10; k++)); do sleep 1 echo "start check_worker_ungraceful_stop_with_retry times: $k" - num=$(grep "closing syncer without graceful" $WORK_DIR/worker1/log/dm-worker.log | wc -l) + + num=$(grep "kill syncer unit" $WORK_DIR/worker1/log/dm-worker.log | wc -l) if [ $num -lt 1 ]; then continue fi - num=$(grep "syncer is closing" $WORK_DIR/worker1/log/dm-worker.log | wc -l) + num=$(grep "kill syncer without graceful" $WORK_DIR/worker1/log/dm-worker.log | wc -l) if [ $num -lt 1 ]; then continue fi @@ -24,9 +25,9 @@ function check_worker_ungraceful_stop_with_retry() { if [ $num -lt 1 ]; then continue fi - echo "check_worker_ungraceful_stop_with_retry after retry: $k" all_matched=true if $all_matched; then + echo "check_worker_ungraceful_stop_with_retry after retry: $k" break fi done From 1ec3c56b762ba1af8f9ee6fb4d24373163bdf62a Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Wed, 12 Jan 2022 14:16:09 +0800 Subject: [PATCH 09/47] rollback checkpoint before close schematracker --- dm/syncer/syncer.go | 39 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 61517cea76e..5cbe215991c 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -694,14 +694,6 @@ func (s *Syncer) Process(ctx context.Context, pr chan pb.ProcessResult) { default: } - // try to rollback checkpoints, if they already flushed, no effect - prePos := s.checkpoint.GlobalPoint() - s.checkpoint.Rollback(s.schemaTracker) - currPos := s.checkpoint.GlobalPoint() - if binlog.CompareLocation(prePos, currPos, s.cfg.EnableGTID) != 0 { - s.tctx.L().Warn("something wrong with rollback global checkpoint", zap.Stringer("previous position", prePos), zap.Stringer("current position", currPos)) - } - pr <- pb.ProcessResult{ IsCanceled: isCanceled, Errors: errs, @@ -3163,7 +3155,8 @@ func (s *Syncer) isClosed() bool { return s.closed.Load() } -func (s *Syncer) close(closeTracker bool) { +// Close closes syncer. 
+func (s *Syncer) Close() { s.Lock() defer s.Unlock() @@ -3174,40 +3167,32 @@ func (s *Syncer) close(closeTracker bool) { s.stopSync() s.closeDBs() + // try to rollback checkpoints, if they already flushed, no effect, this operation should call before close schemaTracker + prePos := s.checkpoint.GlobalPoint() + s.checkpoint.Rollback(s.schemaTracker) + currPos := s.checkpoint.GlobalPoint() + if binlog.CompareLocation(prePos, currPos, s.cfg.EnableGTID) != 0 { + s.tctx.L().Warn("something wrong with rollback global checkpoint", zap.Stringer("previous position", prePos), zap.Stringer("current position", currPos)) + } s.checkpoint.Close() - - // when close sycner without graceful, Close will be called before s.Process exit. - // if we close schematracker worker will pending on `s.checkpoint.Rollback(s.schemaTracker)` forever - if closeTracker { - if err := s.schemaTracker.Close(); err != nil { - s.tctx.L().Error("fail to close schema tracker", log.ShortError(err)) - } + if err := s.schemaTracker.Close(); err != nil { + s.tctx.L().Error("fail to close schema tracker", log.ShortError(err)) } - if s.sgk != nil { s.sgk.Close() } - s.closeOnlineDDL() - // when closing syncer by `stop-task`, remove active relay log from hub s.removeActiveRelayLog() - metrics.RemoveLabelValuesWithTaskInMetrics(s.cfg.Name) - s.closed.Store(true) } -// Close closes syncer. -func (s *Syncer) Close() { - s.close(true) -} - // Kill kill syncer without graceful. func (s *Syncer) Kill() { s.tctx.L().Warn("kill syncer without graceful") s.runCancel() - s.close(false) + s.Close() } // stopSync stops syncing, now it used by Close and Pause From 63e87ca449878a74f98356a6c9ec2cc83fdf4629 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Wed, 12 Jan 2022 15:16:11 +0800 Subject: [PATCH 10/47] use runctx to cancel dml worker --- dm/syncer/dml_worker.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index 051bb140f05..cfa268e8804 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -14,6 +14,7 @@ package syncer import ( + "context" "time" "github.com/pingcap/errors" @@ -36,7 +37,7 @@ type DMLWorker struct { chanSize int multipleRows bool toDBConns []*dbconn.DBConn - tctx *tcontext.Context + runCtx context.Context logger log.Logger // for metrics @@ -75,7 +76,7 @@ func dmlWorkerWrap(inCh chan *job, syncer *Syncer) chan *job { fatalFunc: syncer.fatalFunc, lagFunc: syncer.updateReplicationJobTS, addCountFunc: syncer.addCount, - tctx: syncer.tctx, + runCtx: syncer.runCtx, toDBConns: syncer.toDBConns, inCh: inCh, flushCh: make(chan *job), @@ -236,9 +237,9 @@ func (w *DMLWorker) executeBatchJobs(queueID int, jobs []*job) { time.Sleep(time.Duration(t) * time.Second) }) // use background context to execute sqls as much as possible - ctx, cancel := w.tctx.WithTimeout(maxDMLExecutionDuration) + ctx, cancel := context.WithTimeout(w.runCtx, maxDMLExecutionDuration) defer cancel() - affect, err = db.ExecuteSQL(ctx, queries, args...) + affect, err = db.ExecuteSQL(tcontext.NewContext(ctx, w.logger), queries, args...) 
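// Presumably the point of deriving this timeout from w.runCtx (commit message:
// "use runctx to cancel dml worker") rather than from the unit's own tctx is that
// cancelling runCtx on an ungraceful stop also aborts SQL that is still executing,
// while maxDMLExecutionDuration continues to bound each batch.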
failpoint.Inject("SafeModeExit", func(val failpoint.Value) { if intVal, ok := val.(int); ok && intVal == 4 && len(jobs) > 0 { w.logger.Warn("fail to exec DML", zap.String("failpoint", "SafeModeExit")) From d743f1d2112fcc2a28261a1b0ab74f2066b309ba Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 14 Jan 2022 09:09:29 +0800 Subject: [PATCH 11/47] address comment --- dm/dm/worker/server.go | 2 +- dm/dm/worker/subtask.go | 3 +-- dm/syncer/syncer.go | 20 ++++++++++---------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index aae087216e8..3c5bca79001 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -459,8 +459,8 @@ func (s *Server) doClose() { // Close close the RPC server, this function can be called multiple times. func (s *Server) Close() { + s.doClose() // we should stop current sync first, other wise master may scheduler task on new worker while we are closing s.stopKeepAlive() - s.doClose() } // if needLock is false, we should make sure Server has been locked in caller. diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index c9f30d8de8a..bfefb863794 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -385,8 +385,7 @@ func (st *SubTask) closeUnits() { } func (st *SubTask) killCurrentUnit() { - // note that we only implement ungraceful stop for sync unit - if st.CurrUnit().Type() == pb.UnitType_Sync { + if st.CurrUnit() != nil && st.CurrUnit().Type() == pb.UnitType_Sync { st.l.Info("kill syncer unit", zap.String("task", st.cfg.Name)) st.CurrUnit().Kill() } diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 4f7439deafc..a3a1973beac 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -157,7 +157,7 @@ type Syncer struct { waitXIDJob atomic.Int64 isTransactionEnd bool waitTransactionLock sync.Mutex - runCtx context.Context // this ctx is injected in `s.Run` and when this ctx cancelled, syncer will exit quickly and not wait transaction end + runCtx context.Context // this (background)ctx is injected in `s.Run` and when this ctx cancelled, syncer will exit quickly and not wait transaction end runCancel context.CancelFunc tableRouter *router.Table @@ -3344,14 +3344,6 @@ func (s *Syncer) Close() { s.stopSync() s.closeDBs() - - // try to rollback checkpoints, if they already flushed, no effect, this operation should call before close schemaTracker - prePos := s.checkpoint.GlobalPoint() - s.checkpoint.Rollback(s.schemaTracker) - currPos := s.checkpoint.GlobalPoint() - if binlog.CompareLocation(prePos, currPos, s.cfg.EnableGTID) != 0 { - s.tctx.L().Warn("something wrong with rollback global checkpoint", zap.Stringer("previous position", prePos), zap.Stringer("current position", currPos)) - } s.checkpoint.Close() if err := s.schemaTracker.Close(); err != nil { s.tctx.L().Error("fail to close schema tracker", log.ShortError(err)) @@ -3373,7 +3365,7 @@ func (s *Syncer) Kill() { s.Close() } -// stopSync stops syncing, now it used by Close and Pause +// stopSync stops syncing and rollback checkpoint now it used by Close and Pause // maybe we can refine the workflow more clear. 
func (s *Syncer) stopSync() { if s.done != nil { @@ -3388,6 +3380,14 @@ func (s *Syncer) stopSync() { if s.streamerController != nil { s.streamerController.Close(s.tctx) } + + // try to rollback checkpoints, if they already flushed, no effect, this operation should call before close schemaTracker + prePos := s.checkpoint.GlobalPoint() + s.checkpoint.Rollback(s.schemaTracker) + currPos := s.checkpoint.GlobalPoint() + if binlog.CompareLocation(prePos, currPos, s.cfg.EnableGTID) != 0 { + s.tctx.L().Warn("something wrong with rollback global checkpoint", zap.Stringer("previous position", prePos), zap.Stringer("current position", currPos)) + } } func (s *Syncer) closeOnlineDDL() { From 9b9a5583568b7808db6aae99413cb00be2fe176c Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Mon, 17 Jan 2022 14:23:06 +0800 Subject: [PATCH 12/47] address some comments --- dm/dm/worker/subtask.go | 2 +- dm/syncer/syncer.go | 24 ++++++++++++------------ dm/tests/checkpoint_transaction/run.sh | 9 ++++++--- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index bfefb863794..e94ec61693b 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -385,7 +385,7 @@ func (st *SubTask) closeUnits() { } func (st *SubTask) killCurrentUnit() { - if st.CurrUnit() != nil && st.CurrUnit().Type() == pb.UnitType_Sync { + if st.CurrUnit() != nil { st.l.Info("kill syncer unit", zap.String("task", st.cfg.Name)) st.CurrUnit().Kill() } diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index a3a1973beac..c2b8753714f 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -1127,7 +1127,7 @@ func (s *Syncer) flushCheckPoints() error { snapshotInfo, exceptTables, shardMetaSQLs, shardMetaArgs := s.createCheckpointSnapshot(true) if snapshotInfo == nil { - log.L().Info("checkpoint has no change, skip sync flush checkpoint") + s.tctx.L().Info("checkpoint has no change, skip sync flush checkpoint") return nil } @@ -1163,7 +1163,7 @@ func (s *Syncer) flushCheckPointsAsync(asyncFlushJob *job) { snapshotInfo, exceptTables, shardMetaSQLs, shardMetaArgs := s.createCheckpointSnapshot(false) if snapshotInfo == nil { - log.L().Info("checkpoint has no change, skip async flush checkpoint", zap.Int64("job seq", asyncFlushJob.flushSeq)) + s.tctx.L().Info("checkpoint has no change, skip async flush checkpoint", zap.Int64("job seq", asyncFlushJob.flushSeq)) return } @@ -1411,11 +1411,11 @@ func (s *Syncer) syncDML() { func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { select { case <-ctx.Done(): // hijack the context to wait for the transaction to end. - log.L().Info("received subtask's done, try graceful stop") + s.tctx.L().Info("received subtask's done, try graceful stop") s.waitTransactionLock.Lock() if s.isTransactionEnd { s.waitXIDJob.Store(int64(waitComplete)) - log.L().Info("the last job is transaction end, done directly") + s.tctx.L().Info("the last job is transaction end, done directly") s.runCancel() s.waitTransactionLock.Unlock() return @@ -1424,12 +1424,12 @@ func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { s.waitTransactionLock.Unlock() select { case <-s.runCtx.Done(): - log.L().Info("received run ctx done, exit now") + s.tctx.L().Info("received run ctx done, exit now") case <-time.After(maxPauseOrStopWaitTime): - log.L().Info("wait transaction end timeout, exit now") + s.tctx.L().Info("wait transaction end timeout, exit now") } case <-s.runCtx.Done(): // when no graceful stop, run ctx will canceled first. 
- log.L().Info("received ungraceful exit ctx, exit now") + s.tctx.L().Info("received ungraceful exit ctx, exit now") } } @@ -1716,7 +1716,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { i++ } } - log.L().Info("discard event already consumed", zap.Int("count", n), + s.tctx.L().Info("discard event already consumed", zap.Int("count", n), zap.Any("cur_loc", currentLocation)) return nil } @@ -1801,7 +1801,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if err == nil { continue } - log.L().Warn("skip duplicate rows event failed", zap.Error(err)) + s.tctx.L().Warn("skip duplicate rows event failed", zap.Error(err)) } if err != nil { @@ -1817,7 +1817,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if err != nil { return err } - log.L().Info("reset replication binlog puller", zap.Any("pos", s.checkpoint.GlobalPoint())) + s.tctx.L().Info("reset replication binlog puller", zap.Any("pos", s.checkpoint.GlobalPoint())) if err = maybeSkipNRowsEvent(eventIndex); err != nil { return err } @@ -2465,7 +2465,7 @@ func (s *Syncer) handleQueryEvent(ev *replication.QueryEvent, ec eventContext, o qec.p, err = event.GetParserForStatusVars(ev.StatusVars) if err != nil { - log.L().Warn("found error when get sql_mode from binlog status_vars", zap.Error(err)) + s.tctx.L().Warn("found error when get sql_mode from binlog status_vars", zap.Error(err)) } stmt, err := parseOneStmt(qec) @@ -3076,7 +3076,7 @@ func (s *Syncer) trackOriginDDL(ev *replication.QueryEvent, ec eventContext) (ma } qec.p, err = event.GetParserForStatusVars(ev.StatusVars) if err != nil { - log.L().Warn("found error when get sql_mode from binlog status_vars", zap.Error(err)) + s.tctx.L().Warn("found error when get sql_mode from binlog status_vars", zap.Error(err)) } stmt, err := parseOneStmt(qec) if err != nil { diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 608cf0d0ef1..d9e740143dd 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -27,13 +27,16 @@ function check_worker_ungraceful_stop_with_retry() { fi all_matched=true if $all_matched; then - echo "check_worker_ungraceful_stop_with_retry after retry: $k" + echo "check_worker_ungraceful_stop_with_retry success after retry: $k" break fi done + if ! 
$all_matched; then + echo "check_worker_ungraceful_stop_with_retry failed after retry: $k" exit 1 fi + } function run() { @@ -65,9 +68,9 @@ function run() { echo "kill dm-master1" kill_dm_master check_master_port_offline 1 - sleep 1 # wiat woker lost ha keepalive-ttl is 1 second + sleep 1 # wait worker lost keep alive ttl is 1 second - # check dm-worker2 will exit quickly without waiting for the transaction to finish + # check dm-worker will exit quickly without waiting for the transaction to finish check_worker_ungraceful_stop_with_retry # test data in tidb less than source From 5d88e4bdd46c58fe9c7aa9b240dace35f5420b7d Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Mon, 17 Jan 2022 17:58:56 +0800 Subject: [PATCH 13/47] savework: refine syncer ctx --- dm/dm/worker/subtask.go | 2 +- dm/syncer/dml_worker.go | 9 +- dm/syncer/syncer.go | 199 +++++++++++++------------ dm/tests/checkpoint_transaction/run.sh | 2 +- 4 files changed, 112 insertions(+), 100 deletions(-) diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index e94ec61693b..8c17e345aea 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -386,7 +386,7 @@ func (st *SubTask) closeUnits() { func (st *SubTask) killCurrentUnit() { if st.CurrUnit() != nil { - st.l.Info("kill syncer unit", zap.String("task", st.cfg.Name)) + st.l.Info("kill syncer unit", zap.String("task", st.cfg.Name), zap.Stringer("unit", st.CurrUnit().Type())) st.CurrUnit().Kill() } } diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index cfa268e8804..edbcf4bd1f3 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -14,7 +14,6 @@ package syncer import ( - "context" "time" "github.com/pingcap/errors" @@ -37,7 +36,7 @@ type DMLWorker struct { chanSize int multipleRows bool toDBConns []*dbconn.DBConn - runCtx context.Context + runTCtx *tcontext.Context logger log.Logger // for metrics @@ -76,7 +75,7 @@ func dmlWorkerWrap(inCh chan *job, syncer *Syncer) chan *job { fatalFunc: syncer.fatalFunc, lagFunc: syncer.updateReplicationJobTS, addCountFunc: syncer.addCount, - runCtx: syncer.runCtx, + runTCtx: syncer.runTCtx, toDBConns: syncer.toDBConns, inCh: inCh, flushCh: make(chan *job), @@ -237,9 +236,9 @@ func (w *DMLWorker) executeBatchJobs(queueID int, jobs []*job) { time.Sleep(time.Duration(t) * time.Second) }) // use background context to execute sqls as much as possible - ctx, cancel := context.WithTimeout(w.runCtx, maxDMLExecutionDuration) + ctx, cancel := w.runTCtx.WithTimeout(maxDMLExecutionDuration) defer cancel() - affect, err = db.ExecuteSQL(tcontext.NewContext(ctx, w.logger), queries, args...) + affect, err = db.ExecuteSQL(ctx, queries, args...) failpoint.Inject("SafeModeExit", func(val failpoint.Value) { if intVal, ok := val.(int); ok && intVal == 4 && len(jobs) > 0 { w.logger.Warn("fail to exec DML", zap.String("failpoint", "SafeModeExit")) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index c2b8753714f..d89f800761f 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -123,7 +123,14 @@ const ( type Syncer struct { sync.RWMutex - tctx *tcontext.Context + tctx *tcontext.Context // this ctx only used for logger. + + // this ctx is a background ctx and was initialized in s.Run, it is used for some background tasks in S.Run + // when this ctx cancelled, syncer will shuts down all background running jobs and not wait transaction end + runCtx context.Context + // this is used for some background tasks S.Run that need logger it share same lifecycle with runCtx. 
+ runTCtx *tcontext.Context // this ctx only used for logger. + runCancel context.CancelFunc cfg *config.SubTaskConfig syncCfg replication.BinlogSyncerConfig @@ -157,8 +164,6 @@ type Syncer struct { waitXIDJob atomic.Int64 isTransactionEnd bool waitTransactionLock sync.Mutex - runCtx context.Context // this (background)ctx is injected in `s.Run` and when this ctx cancelled, syncer will exit quickly and not wait transaction end - runCancel context.CancelFunc tableRouter *router.Table binlogFilter *bf.BinlogEvent @@ -1409,14 +1414,14 @@ func (s *Syncer) syncDML() { } func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { + defer s.runCancel() select { - case <-ctx.Done(): // hijack the context to wait for the transaction to end. + case <-ctx.Done(): // hijack the root context from s.Run to wait for the transaction to end. s.tctx.L().Info("received subtask's done, try graceful stop") s.waitTransactionLock.Lock() if s.isTransactionEnd { s.waitXIDJob.Store(int64(waitComplete)) s.tctx.L().Info("the last job is transaction end, done directly") - s.runCancel() s.waitTransactionLock.Unlock() return } @@ -1433,15 +1438,58 @@ func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { } } +func (s *Syncer) updateTSOffsetCronJob(ctx context.Context) { + defer s.wg.Done() + // temporarily hard code there. if this metrics works well add this to config file. + ticker := time.NewTicker(time.Minute * 10) + defer ticker.Stop() + for { + select { + case <-ticker.C: + if utErr := s.updateTSOffset(ctx); utErr != nil { + s.tctx.L().Error("get server unix ts err", zap.Error(utErr)) + } + case <-ctx.Done(): + return + } + } +} + +func (s *Syncer) updateLagCronJob(ctx context.Context) { + defer s.wg.Done() + // temporarily hard code there. if this metrics works well add this to config file. + ticker := time.NewTicker(time.Millisecond * 100) + defer ticker.Stop() + for { + select { + case <-ticker.C: + s.updateReplicationLagMetric() + case <-ctx.Done(): + return + } + } +} + +func (s *Syncer) updateTSOffset(ctx context.Context) error { + t1 := time.Now() + ts, tsErr := s.fromDB.GetServerUnixTS(ctx) + rtt := time.Since(t1).Seconds() + if tsErr == nil { + s.tsOffset.Store(time.Now().Unix() - ts - int64(rtt/2)) + } + return tsErr +} + // Run starts running for sync, we should guarantee it can rerun when paused. 
func (s *Syncer) Run(ctx context.Context) (err error) { runCtx, runCancel := context.WithCancel(context.Background()) + runTCtx := tcontext.NewContext(runCtx, s.tctx.L()) defer runCancel() s.Lock() s.runCtx = runCtx + s.runTCtx = runTCtx s.runCancel = runCancel s.Unlock() - tctx := s.tctx.WithContext(runCtx) defer func() { if s.done != nil { @@ -1452,7 +1500,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { go s.waitTransactionEndBeforeExit(ctx) // some initialization that can't be put in Syncer.Init - fresh, err := s.IsFreshTask(runCtx) + fresh, err := s.IsFreshTask(ctx) if err != nil { return err } else if fresh { @@ -1467,7 +1515,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.wg.Add(1) go func() { defer s.wg.Done() - s.checkpointFlushWorker.Run(s.tctx) + s.checkpointFlushWorker.Run(runTCtx) }() var ( @@ -1475,7 +1523,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { delLoadTask bool cleanDumpFile = s.cfg.CleanDumpFile ) - flushCheckpoint, err = s.adjustGlobalPointGTID(tctx) + flushCheckpoint, err = s.adjustGlobalPointGTID(s.runTCtx) if err != nil { return err } @@ -1484,7 +1532,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { flushCheckpoint = true err = s.loadTableStructureFromDump(ctx) if err != nil { - tctx.L().Warn("error happened when load table structure from dump files", zap.Error(err)) + s.tctx.L().Warn("error happened when load table structure from dump files", zap.Error(err)) cleanDumpFile = false } } else { @@ -1493,59 +1541,35 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if flushCheckpoint { if err = s.flushCheckPoints(); err != nil { - tctx.L().Warn("fail to flush checkpoints when starting task", zap.Error(err)) + s.tctx.L().Warn("fail to flush checkpoints when starting task", zap.Error(err)) return err } } if delLoadTask { if err = s.delLoadTask(); err != nil { - tctx.L().Warn("error when del load task in etcd", zap.Error(err)) + s.tctx.L().Warn("error when del load task in etcd", zap.Error(err)) } } if cleanDumpFile { - tctx.L().Info("try to remove all dump files") + s.tctx.L().Info("try to remove all dump files") if err = os.RemoveAll(s.cfg.Dir); err != nil { - tctx.L().Warn("error when remove loaded dump folder", zap.String("data folder", s.cfg.Dir), zap.Error(err)) + s.tctx.L().Warn("error when remove loaded dump folder", zap.String("data folder", s.cfg.Dir), zap.Error(err)) } } failpoint.Inject("AdjustGTIDExit", func() { - tctx.L().Warn("exit triggered", zap.String("failpoint", "AdjustGTIDExit")) - s.streamerController.Close(tctx) + s.tctx.L().Warn("exit triggered", zap.String("failpoint", "AdjustGTIDExit")) + s.streamerController.Close(s.tctx) utils.OsExit(1) }) - updateTSOffset := func() error { - t1 := time.Now() - ts, tsErr := s.fromDB.GetServerUnixTS(runCtx) - rtt := time.Since(t1).Seconds() - if tsErr == nil { - s.tsOffset.Store(time.Now().Unix() - ts - int64(rtt/2)) - } - return tsErr - } - // before sync run, we get the tsoffset from upstream first - if utErr := updateTSOffset(); utErr != nil { + // before sync run, we get the ts offset from upstream first + if utErr := s.updateTSOffset(ctx); utErr != nil { return utErr } // start background task to get/update current ts offset between dm and upstream s.wg.Add(1) - go func() { - defer s.wg.Done() - // temporarily hard code there. if this metrics works well add this to config file. 
- updateTicker := time.NewTicker(time.Minute * 10) - defer updateTicker.Stop() - for { - select { - case <-updateTicker.C: - if utErr := updateTSOffset(); utErr != nil { - s.tctx.L().Error("get server unix ts err", zap.Error(utErr)) - } - case <-runCtx.Done(): - return - } - } - }() + go s.updateTSOffsetCronJob(s.runCtx) // startLocation is the start location for current received event // currentLocation is the end location for current received event (End_log_pos in `show binlog events` for mysql) @@ -1560,11 +1584,11 @@ func (s *Syncer) Run(ctx context.Context) (err error) { currentGTID string ) - tctx.L().Info("replicate binlog from checkpoint", zap.Stringer("checkpoint", lastLocation)) + s.tctx.L().Info("replicate binlog from checkpoint", zap.Stringer("checkpoint", lastLocation)) if s.streamerController.IsClosed() { s.locations.reset(lastLocation) - err = s.streamerController.Start(tctx, lastLocation) + err = s.streamerController.Start(runTCtx, lastLocation) if err != nil { return terror.Annotate(err, "fail to restart streamer controller") } @@ -1574,22 +1598,11 @@ func (s *Syncer) Run(ctx context.Context) (err error) { go s.syncDML() s.wg.Add(1) - go s.syncDDL(tctx, adminQueueName, s.ddlDBConn, s.ddlJobCh) + go s.syncDDL(s.runTCtx, adminQueueName, s.ddlDBConn, s.ddlJobCh) + // start background task to update replication lag metric s.wg.Add(1) - go func() { - defer s.wg.Done() - updateLagTicker := time.NewTicker(time.Millisecond * 100) - defer updateLagTicker.Stop() - for { - select { - case <-updateLagTicker.C: - s.updateReplicationLagMetric() - case <-runCtx.Done(): - return - } - } - }() + go s.updateLagCronJob(s.runCtx) // syncing progress with sharding DDL group // 1. use the global streamer to sync regular binlog events @@ -1611,10 +1624,10 @@ func (s *Syncer) Run(ctx context.Context) (err error) { defer func() { if err1 := recover(); err1 != nil { failpoint.Inject("ExitAfterSaveOnlineDDL", func() { - tctx.L().Info("force panic") + s.tctx.L().Info("force panic") panic("ExitAfterSaveOnlineDDL") }) - tctx.L().Error("panic log", zap.Reflect("error message", err1), zap.Stack("stack")) + s.tctx.L().Error("panic log", zap.Reflect("error message", err1), zap.Stack("stack")) err = terror.ErrSyncerUnitPanic.Generate(err1) } @@ -1631,13 +1644,13 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // flush all jobs before exit if err2 = s.flushJobs(); err2 != nil { - tctx.L().Warn("failed to flush jobs when exit task", zap.Error(err2)) + s.tctx.L().Warn("failed to flush jobs when exit task", zap.Error(err2)) } // if any execute error, flush safemode exit point if err2 = s.execError.Load(); err2 != nil && (terror.ErrDBExecuteFailed.Equal(err2) || terror.ErrDBUnExpect.Equal(err2)) { if err2 = s.checkpoint.FlushSafeModeExitPoint(s.tctx); err2 != nil { - tctx.L().Warn("failed to flush safe mode checkpoints when exit task", zap.Error(err2)) + s.tctx.L().Warn("failed to flush safe mode checkpoints when exit task", zap.Error(err2)) } } }() @@ -1656,7 +1669,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // if we start syncer at an early position, database must bear a period of inconsistent state, // it's eventual consistency. 
s.safeMode = sm.NewSafeMode() - s.enableSafeModeInitializationPhase(tctx) + s.enableSafeModeInitializationPhase(s.runTCtx) closeShardingResync := func() error { if shardingReSync == nil { @@ -1680,7 +1693,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.isReplacingOrInjectingErr = currentLocation.Suffix != 0 s.locations.reset(currentLocation) - err3 := s.streamerController.RedirectStreamer(tctx, currentLocation) + err3 := s.streamerController.RedirectStreamer(s.tctx, currentLocation) if err3 != nil { return err3 } @@ -1695,7 +1708,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } for i := 0; i < n; { - e, err1 := s.getEvent(tctx, currentLocation) + e, err1 := s.getEvent(s.runTCtx, currentLocation) if err1 != nil { return err } @@ -1746,13 +1759,13 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // if suffix>0, we are replacing error s.isReplacingOrInjectingErr = currentLocation.Suffix != 0 s.locations.reset(shardingReSync.currLocation) - err = s.streamerController.RedirectStreamer(tctx, shardingReSync.currLocation) + err = s.streamerController.RedirectStreamer(s.runTCtx, shardingReSync.currLocation) if err != nil { return err } failpoint.Inject("ReSyncExit", func() { - tctx.L().Warn("exit triggered", zap.String("failpoint", "ReSyncExit")) + s.tctx.L().Warn("exit triggered", zap.String("failpoint", "ReSyncExit")) utils.OsExit(1) }) } @@ -1760,7 +1773,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { var e *replication.BinlogEvent startTime := time.Now() - e, err = s.getEvent(tctx, currentLocation) + e, err = s.getEvent(s.tctx, currentLocation) s.tctx.L().Debug("location refactor", zap.Stringer("current", currentLocation), zap.Stringer("start", startLocation), @@ -1782,21 +1795,21 @@ func (s *Syncer) Run(ctx context.Context) (err error) { }) switch { case err == context.Canceled: - tctx.L().Info("binlog replication main routine quit(context canceled)!", zap.Stringer("last location", lastLocation)) + s.tctx.L().Info("binlog replication main routine quit(context canceled)!", zap.Stringer("last location", lastLocation)) return nil case err == context.DeadlineExceeded: - tctx.L().Info("deadline exceeded when fetching binlog event") + s.tctx.L().Info("deadline exceeded when fetching binlog event") continue case isDuplicateServerIDError(err): // if the server id is already used, need to use a new server id - tctx.L().Info("server id is already used by another slave, will change to a new server id and get event again") - err1 := s.streamerController.UpdateServerIDAndResetReplication(tctx, lastLocation) + s.tctx.L().Info("server id is already used by another slave, will change to a new server id and get event again") + err1 := s.streamerController.UpdateServerIDAndResetReplication(s.tctx, lastLocation) if err1 != nil { return err1 } continue case err == relay.ErrorMaybeDuplicateEvent: - tctx.L().Warn("read binlog met a truncated file, will skip events that has been consumed") + s.tctx.L().Warn("read binlog met a truncated file, will skip events that has been consumed") err = maybeSkipNRowsEvent(eventIndex) if err == nil { continue @@ -1805,7 +1818,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } if err != nil { - tctx.L().Error("fail to fetch binlog", log.ShortError(err)) + s.tctx.L().Error("fail to fetch binlog", log.ShortError(err)) if isConnectionRefusedError(err) { return err @@ -1813,7 +1826,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if s.streamerController.CanRetry(err) { // GlobalPoint is the last finished 
transaction location - err = s.streamerController.ResetReplicationSyncer(tctx, s.checkpoint.GlobalPoint()) + err = s.streamerController.ResetReplicationSyncer(s.tctx, s.checkpoint.GlobalPoint()) if err != nil { return err } @@ -1827,7 +1840,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // try to re-sync in gtid mode if tryReSync && s.cfg.EnableGTID && utils.IsErrBinlogPurged(err) && s.cfg.AutoFixGTID { time.Sleep(retryTimeout) - err = s.reSyncBinlog(*tctx, lastLocation) + err = s.reSyncBinlog(*s.tctx, lastLocation) if err != nil { return err } @@ -1840,7 +1853,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { failpoint.Inject("IgnoreSomeTypeEvent", func(val failpoint.Value) { if e.Header.EventType.String() == val.(string) { - tctx.L().Debug("IgnoreSomeTypeEvent", zap.Reflect("event", e)) + s.tctx.L().Debug("IgnoreSomeTypeEvent", zap.Reflect("event", e)) failpoint.Continue() } }) @@ -1854,7 +1867,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { metrics.BinlogPosGauge.WithLabelValues("syncer", s.cfg.Name, s.cfg.SourceID).Set(float64(e.Header.LogPos)) index, err := binlog.GetFilenameIndex(lastLocation.Position.Name) if err != nil { - tctx.L().Warn("fail to get index number of binlog file, may because only specify GTID and hasn't saved according binlog position", log.ShortError(err)) + s.tctx.L().Warn("fail to get index number of binlog file, may because only specify GTID and hasn't saved according binlog position", log.ShortError(err)) } else { metrics.BinlogFileGauge.WithLabelValues("syncer", s.cfg.Name, s.cfg.SourceID).Set(float64(index)) } @@ -1863,7 +1876,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { failpoint.Inject("ProcessBinlogSlowDown", nil) - tctx.L().Debug("receive binlog event", zap.Reflect("header", e.Header)) + s.tctx.L().Debug("receive binlog event", zap.Reflect("header", e.Header)) // TODO: support all event // we calculate startLocation and endLocation(currentLocation) for Query event here @@ -1906,7 +1919,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { currentLocation = startLocation } else if op == pb.ErrorOp_Skip { ec := eventContext{ - tctx: tctx, + tctx: s.tctx, header: e.Header, startLocation: &startLocation, currentLocation: ¤tLocation, @@ -1915,7 +1928,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { var sourceTbls map[string]map[string]struct{} sourceTbls, err = s.trackOriginDDL(ev, ec) if err != nil { - tctx.L().Warn("failed to track query when handle-error skip", zap.Error(err), zap.ByteString("sql", ev.Query)) + s.tctx.L().Warn("failed to track query when handle-error skip", zap.Error(err), zap.ByteString("sql", ev.Query)) } s.saveGlobalPoint(currentLocation) @@ -1929,9 +1942,9 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } err = s.flushJobs() if err != nil { - tctx.L().Warn("failed to flush jobs when handle-error skip", zap.Error(err)) + s.tctx.L().Warn("failed to flush jobs when handle-error skip", zap.Error(err)) } else { - tctx.L().Info("flush jobs when handle-error skip") + s.tctx.L().Info("flush jobs when handle-error skip") } } // skip the current event @@ -1948,9 +1961,9 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.errOperatorHolder.SetHasAllInjected(startLocation) // reset event as startLocation, avoid to be marked in checkpoint currentLocation.Position.Pos = startLocation.Position.Pos - err = s.streamerController.RedirectStreamer(tctx, startLocation) + err = s.streamerController.RedirectStreamer(s.runTCtx, startLocation) } else { - err = 
s.streamerController.RedirectStreamer(tctx, currentLocation) + err = s.streamerController.RedirectStreamer(s.runTCtx, currentLocation) } if err != nil { return err @@ -1970,17 +1983,17 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // 2. push forward and replicate some sqls after safeModeExitPoint to downstream // 3. quit because of network error, fail to flush global checkpoint and new safeModeExitPoint to downstream // 4. restart again, quit safe mode at safeModeExitPoint, but some sqls after this location have already been replicated to the downstream - if err = s.checkpoint.FlushSafeModeExitPoint(s.tctx); err != nil { + if err = s.checkpoint.FlushSafeModeExitPoint(s.runTCtx); err != nil { return err } - if err = s.safeMode.Add(tctx, -1); err != nil { + if err = s.safeMode.Add(s.tctx, -1); err != nil { return err } } } ec := eventContext{ - tctx: tctx, + tctx: s.tctx, header: e.Header, startLocation: &startLocation, currentLocation: ¤tLocation, @@ -2021,7 +2034,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // only need compare binlog position? lastLocation = shardingReSync.currLocation if binlog.CompareLocation(shardingReSync.currLocation, shardingReSync.latestLocation, s.cfg.EnableGTID) >= 0 { - tctx.L().Info("re-replicate shard group was completed", zap.String("event", "XID"), zap.Stringer("re-shard", shardingReSync)) + s.tctx.L().Info("re-replicate shard group was completed", zap.String("event", "XID"), zap.Stringer("re-shard", shardingReSync)) err = closeShardingResync() if err != nil { return terror.Annotatef(err, "shard group current location %s", shardingReSync.currLocation) @@ -2036,7 +2049,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { return terror.Annotatef(err, "fail to record GTID %v", ev.GSet) } - tctx.L().Debug("", zap.String("event", "XID"), zap.Stringer("last location", lastLocation), log.WrapStringerField("location", currentLocation)) + s.tctx.L().Debug("", zap.String("event", "XID"), zap.Stringer("last location", lastLocation), log.WrapStringerField("location", currentLocation)) lastLocation.Position.Pos = e.Header.LogPos // update lastPos err = lastLocation.SetGTID(ev.GSet) if err != nil { @@ -2049,7 +2062,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if e.Header.EventType == replication.HEARTBEAT_EVENT { // flush checkpoint even if there are no real binlog events if s.checkpoint.CheckGlobalPoint() { - tctx.L().Info("meet heartbeat event and then flush jobs") + s.tctx.L().Info("meet heartbeat event and then flush jobs") err2 = s.flushJobs() } } @@ -3365,7 +3378,7 @@ func (s *Syncer) Kill() { s.Close() } -// stopSync stops syncing and rollback checkpoint now it used by Close and Pause +// stopSync stops syncing and rollback checkpoint now it used by Close,kILL and Pause // maybe we can refine the workflow more clear. func (s *Syncer) stopSync() { if s.done != nil { diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index d9e740143dd..5b434a38136 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -33,7 +33,7 @@ function check_worker_ungraceful_stop_with_retry() { done if ! 
$all_matched; then - echo "check_worker_ungraceful_stop_with_retry failed after retry: $k" + echo "check_worker_ungraceful_stop_with_retry failed after retry" exit 1 fi From 5ca927684fee131e3b36719b53a003847aec2d52 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Wed, 19 Jan 2022 11:10:13 +0800 Subject: [PATCH 14/47] refine syncer.RUN --- dm/syncer/syncer.go | 65 +++++++++---------- .../conf/dm-worker1.toml | 4 +- dm/tests/checkpoint_transaction/run.sh | 44 ++++++------- 3 files changed, 55 insertions(+), 58 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index d89f800761f..be9b16d0393 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -125,10 +125,10 @@ type Syncer struct { tctx *tcontext.Context // this ctx only used for logger. - // this ctx is a background ctx and was initialized in s.Run, it is used for some background tasks in S.Run + // this ctx is a background ctx and was initialized in s.Run, it is used for some background tasks in s.Run // when this ctx cancelled, syncer will shuts down all background running jobs and not wait transaction end runCtx context.Context - // this is used for some background tasks S.Run that need logger it share same lifecycle with runCtx. + // this is used for some background tasks s.Run that need logger it share same lifecycle with runCtx. runTCtx *tcontext.Context // this ctx only used for logger. runCancel context.CancelFunc @@ -196,8 +196,6 @@ type Syncer struct { filteredUpdate atomic.Int64 filteredDelete atomic.Int64 - done chan struct{} - checkpoint CheckPoint checkpointFlushWorker *checkpointFlushWorker onlineDDL onlineddl.OnlinePlugin @@ -261,7 +259,6 @@ func NewSyncer(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, relay rel syncer.binlogSizeCount.Store(0) syncer.lastCount.Store(0) syncer.count.Store(0) - syncer.done = nil syncer.handleJobFunc = syncer.handleJob syncer.cli = etcdClient @@ -303,7 +300,7 @@ func (s *Syncer) closeJobChans() { } close(s.dmlJobCh) close(s.ddlJobCh) - close(s.checkpointFlushWorker.input) + s.checkpointFlushWorker.Close() s.jobsClosed.Store(true) } @@ -643,7 +640,6 @@ func (s *Syncer) Process(ctx context.Context, pr chan pb.ProcessResult) { s.Unlock() return } - s.done = make(chan struct{}) s.Unlock() runFatalChan := make(chan *pb.ProcessError, s.cfg.WorkerCount+1) @@ -1414,27 +1410,29 @@ func (s *Syncer) syncDML() { } func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { - defer s.runCancel() select { case <-ctx.Done(): // hijack the root context from s.Run to wait for the transaction to end. s.tctx.L().Info("received subtask's done, try graceful stop") s.waitTransactionLock.Lock() if s.isTransactionEnd { s.waitXIDJob.Store(int64(waitComplete)) - s.tctx.L().Info("the last job is transaction end, done directly") s.waitTransactionLock.Unlock() + s.tctx.L().Info("the last job is transaction end, done directly") + s.runCancel() return } s.waitXIDJob.Store(int64(waiting)) s.waitTransactionLock.Unlock() select { case <-s.runCtx.Done(): - s.tctx.L().Info("received run ctx done, exit now") + s.tctx.L().Info("run ctx done by waitComplete") case <-time.After(maxPauseOrStopWaitTime): s.tctx.L().Info("wait transaction end timeout, exit now") + s.runCancel() } case <-s.runCtx.Done(): // when no graceful stop, run ctx will canceled first. 
s.tctx.L().Info("received ungraceful exit ctx, exit now") + s.runCancel() } } @@ -1484,21 +1482,12 @@ func (s *Syncer) updateTSOffset(ctx context.Context) error { func (s *Syncer) Run(ctx context.Context) (err error) { runCtx, runCancel := context.WithCancel(context.Background()) runTCtx := tcontext.NewContext(runCtx, s.tctx.L()) - defer runCancel() s.Lock() s.runCtx = runCtx s.runTCtx = runTCtx s.runCancel = runCancel s.Unlock() - defer func() { - if s.done != nil { - close(s.done) - } - }() - - go s.waitTransactionEndBeforeExit(ctx) - // some initialization that can't be put in Syncer.Init fresh, err := s.IsFreshTask(ctx) if err != nil { @@ -1511,13 +1500,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } } - // start flush checkpoints worker. - s.wg.Add(1) - go func() { - defer s.wg.Done() - s.checkpointFlushWorker.Run(runTCtx) - }() - var ( flushCheckpoint bool delLoadTask bool @@ -1538,6 +1520,12 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } else { cleanDumpFile = false } + // start flush checkpoints worker. this worker mast start before s.flushCheckPoints() + s.wg.Add(1) + go func() { + defer s.wg.Done() + s.checkpointFlushWorker.Run(s.runTCtx) + }() if flushCheckpoint { if err = s.flushCheckPoints(); err != nil { @@ -1567,9 +1555,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if utErr := s.updateTSOffset(ctx); utErr != nil { return utErr } - // start background task to get/update current ts offset between dm and upstream - s.wg.Add(1) - go s.updateTSOffsetCronJob(s.runCtx) // startLocation is the start location for current received event // currentLocation is the end location for current received event (End_log_pos in `show binlog events` for mysql) @@ -1588,19 +1573,25 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if s.streamerController.IsClosed() { s.locations.reset(lastLocation) - err = s.streamerController.Start(runTCtx, lastLocation) + err = s.streamerController.Start(s.runTCtx, lastLocation) if err != nil { return terror.Annotate(err, "fail to restart streamer controller") } } + // 只有这个能 hook 最外层的 ctx, 通过这个函数来控制 s.Run 的退出 + go s.waitTransactionEndBeforeExit(ctx) + + // all pre check passed, we can start all background jobs, they all use s.runCancel to cancel + s.wg.Add(1) + go s.updateTSOffsetCronJob(s.runCtx) + s.wg.Add(1) go s.syncDML() s.wg.Add(1) go s.syncDDL(s.runTCtx, adminQueueName, s.ddlDBConn, s.ddlJobCh) - // start background task to update replication lag metric s.wg.Add(1) go s.updateLagCronJob(s.runCtx) @@ -1773,7 +1764,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { var e *replication.BinlogEvent startTime := time.Now() - e, err = s.getEvent(s.tctx, currentLocation) + e, err = s.getEvent(s.runTCtx, currentLocation) s.tctx.L().Debug("location refactor", zap.Stringer("current", currentLocation), zap.Stringer("start", startLocation), @@ -2078,6 +2069,8 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } } if waitXIDStatus(s.waitXIDJob.Load()) == waitComplete { + // already wait until XID event, we can stop sync now + s.runCancel() return nil } } @@ -3355,6 +3348,11 @@ func (s *Syncer) Close() { return } + // wait for s.Run() to finish + if s.runCtx != nil { + <-s.runCtx.Done() + } + s.stopSync() s.closeDBs() s.checkpoint.Close() @@ -3381,9 +3379,6 @@ func (s *Syncer) Kill() { // stopSync stops syncing and rollback checkpoint now it used by Close,kILL and Pause // maybe we can refine the workflow more clear. 
func (s *Syncer) stopSync() { - if s.done != nil { - <-s.done // wait Run to return - } s.closeJobChans() s.wg.Wait() // wait job workers to return diff --git a/dm/tests/checkpoint_transaction/conf/dm-worker1.toml b/dm/tests/checkpoint_transaction/conf/dm-worker1.toml index 1a53a1dc788..9b8ecaf991c 100644 --- a/dm/tests/checkpoint_transaction/conf/dm-worker1.toml +++ b/dm/tests/checkpoint_transaction/conf/dm-worker1.toml @@ -1,3 +1,5 @@ -name = "worker1" join = "127.0.0.1:8261" keepalive-ttl = 1 +log-level = "info" +name = "worker1" +worker-addr = "127.0.0.1:8262" diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 5b434a38136..15ff859c229 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -61,28 +61,28 @@ function run() { # check diff check_sync_diff $WORK_DIR $cur/conf/diff_config.toml - # test ungraceful stop, worker will not wait transaction finish - run_sql_file $cur/data/db1.increment1.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 - sleep 2 - # kill dm-master 1 to make worker lost keepalive while a transaction is not finished - echo "kill dm-master1" - kill_dm_master - check_master_port_offline 1 - sleep 1 # wait worker lost keep alive ttl is 1 second - - # check dm-worker will exit quickly without waiting for the transaction to finish - check_worker_ungraceful_stop_with_retry - - # test data in tidb less than source - dataCountSource=$(mysql -uroot -h$MYSQL_HOST1 -P$MYSQL_PORT1 -p$MYSQL_PASSWORD1 -se "select count(1) from checkpoint_transaction.t1") - dataCountIntidb=$(mysql -uroot -h127.0.0.1 -P4000 -se "select count(1) from checkpoint_transaction.t1") - echo "afetr ungraceful exit data in source count: $dataCountSource data in tidb count: $dataCountIntidb" - [[ $dataCountIntidb -lt $dataCountSource ]] - - # start dm-master again task will be resume, and data will be synced - run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml - check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT - check_sync_diff $WORK_DIR $cur/conf/diff_config.toml + # # test ungraceful stop, worker will not wait transaction finish + # run_sql_file $cur/data/db1.increment1.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 + # sleep 2 + # # kill dm-master 1 to make worker lost keep alive while a transaction is not finished + # echo "kill dm-master1" + # kill_dm_master + # check_master_port_offline 1 + # sleep 1 # wait worker lost keep alive ttl is 1 second + + # # check dm-worker will exit quickly without waiting for the transaction to finish + # check_worker_ungraceful_stop_with_retry + + # # test data in tidb less than source + # dataCountSource=$(mysql -uroot -h$MYSQL_HOST1 -P$MYSQL_PORT1 -p$MYSQL_PASSWORD1 -se "select count(1) from checkpoint_transaction.t1") + # dataCountInTiDB=$(mysql -uroot -h127.0.0.1 -P4000 -se "select count(1) from checkpoint_transaction.t1") + # echo "after ungraceful exit data in source count: $dataCountSource data in tidb count: $dataCountInTiDB" + # [[ $dataCountInTiDB -lt $dataCountSource ]] + + # # start dm-master again task will be resume, and data will be synced + # run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml + # check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT + # check_sync_diff $WORK_DIR $cur/conf/diff_config.toml run_sql_file $cur/data/db1.increment1.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 # wait transaction start From 1302a758f4b251f049b3fb1451eb025ab6d8b1ba Mon Sep 17 00:00:00 2001 From: 
ehco1996 Date: Wed, 19 Jan 2022 12:11:12 +0800 Subject: [PATCH 15/47] revert test --- dm/syncer/syncer.go | 16 +++++---- dm/tests/checkpoint_transaction/run.sh | 45 +++++++++++++------------- 2 files changed, 32 insertions(+), 29 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index a2e6d681ed2..61fdca8adeb 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -131,6 +131,7 @@ type Syncer struct { // this is used for some background tasks s.Run that need logger it share same lifecycle with runCtx. runTCtx *tcontext.Context // this ctx only used for logger. runCancel context.CancelFunc + rundone chan struct{} // this will be closed when syncer.Run exit cfg *config.SubTaskConfig syncCfg replication.BinlogSyncerConfig @@ -280,6 +281,8 @@ func NewSyncer(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, relay rel syncer.lastCheckpointFlushedTime = time.Time{} syncer.relay = relay syncer.locations = &locationRecorder{} + + syncer.rundone = nil return syncer } @@ -956,7 +959,7 @@ func (s *Syncer) addJob(job *job) { s.dmlJobCh <- job failpoint.Inject("checkCheckpointInMiddleOfTransaction", func() { s.tctx.L().Info("receive dml job", zap.Any("dml job", job)) - time.Sleep(500 * time.Millisecond) + time.Sleep(100 * time.Millisecond) }) case gc: s.dmlJobCh <- job @@ -1491,6 +1494,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.runCtx = runCtx s.runTCtx = runTCtx s.runCancel = runCancel + s.rundone = make(chan struct{}) s.Unlock() // some initialization that can't be put in Syncer.Init @@ -1584,9 +1588,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } } - // 只有这个能 hook 最外层的 ctx, 通过这个函数来控制 s.Run 的退出 - go s.waitTransactionEndBeforeExit(ctx) - // all pre check passed, we can start all background jobs, they all use s.runCancel to cancel s.wg.Add(1) go s.updateTSOffsetCronJob(s.runCtx) @@ -1617,7 +1618,9 @@ func (s *Syncer) Run(ctx context.Context) (err error) { traceSource = fmt.Sprintf("%s.syncer.%s", s.cfg.SourceID, s.cfg.Name) ) + go s.waitTransactionEndBeforeExit(ctx) defer func() { + // before if err1 := recover(); err1 != nil { failpoint.Inject("ExitAfterSaveOnlineDDL", func() { s.tctx.L().Info("force panic") @@ -1649,6 +1652,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.tctx.L().Warn("failed to flush safe mode checkpoints when exit task", zap.Error(err2)) } } + close(s.rundone) }() now := time.Now() @@ -3353,8 +3357,8 @@ func (s *Syncer) Close() { } // wait for s.Run() to finish - if s.runCtx != nil { - <-s.runCtx.Done() + if s.rundone != nil { + <-s.rundone } s.stopSync() diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 15ff859c229..7c41c905995 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -61,29 +61,28 @@ function run() { # check diff check_sync_diff $WORK_DIR $cur/conf/diff_config.toml - # # test ungraceful stop, worker will not wait transaction finish - # run_sql_file $cur/data/db1.increment1.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 - # sleep 2 - # # kill dm-master 1 to make worker lost keep alive while a transaction is not finished - # echo "kill dm-master1" - # kill_dm_master - # check_master_port_offline 1 - # sleep 1 # wait worker lost keep alive ttl is 1 second - - # # check dm-worker will exit quickly without waiting for the transaction to finish - # check_worker_ungraceful_stop_with_retry - - # # test data in tidb less than source - # dataCountSource=$(mysql -uroot -h$MYSQL_HOST1 -P$MYSQL_PORT1 
-p$MYSQL_PASSWORD1 -se "select count(1) from checkpoint_transaction.t1") - # dataCountInTiDB=$(mysql -uroot -h127.0.0.1 -P4000 -se "select count(1) from checkpoint_transaction.t1") - # echo "after ungraceful exit data in source count: $dataCountSource data in tidb count: $dataCountInTiDB" - # [[ $dataCountInTiDB -lt $dataCountSource ]] - - # # start dm-master again task will be resume, and data will be synced - # run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml - # check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT - # check_sync_diff $WORK_DIR $cur/conf/diff_config.toml - + # test ungraceful stop, worker will not wait transaction finish + run_sql_file $cur/data/db1.increment1.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 + sleep 2 + # kill dm-master 1 to make worker lost keep alive while a transaction is not finished + echo "kill dm-master1" + kill_dm_master + check_master_port_offline 1 + sleep 1 # wait worker lost keep alive ttl is 1 second + + # check dm-worker will exit quickly without waiting for the transaction to finish + check_worker_ungraceful_stop_with_retry + + # test data in tidb less than source + dataCountSource=$(mysql -uroot -h$MYSQL_HOST1 -P$MYSQL_PORT1 -p$MYSQL_PASSWORD1 -se "select count(1) from checkpoint_transaction.t1") + dataCountInTiDB=$(mysql -uroot -h127.0.0.1 -P4000 -se "select count(1) from checkpoint_transaction.t1") + echo "after ungraceful exit data in source count: $dataCountSource data in tidb count: $dataCountInTiDB" + [[ $dataCountInTiDB -lt $dataCountSource ]] + + # start dm-master again task will be resume, and data will be synced + run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml + check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT + check_sync_diff $WORK_DIR $cur/conf/diff_config.toml run_sql_file $cur/data/db1.increment1.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 # wait transaction start # you can see why sleep in https://github.com/pingcap/dm/pull/1928#issuecomment-895820239 From a813189d24ce9fb0395a28dbc1fdfdc5e86167e7 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Wed, 19 Jan 2022 17:28:44 +0800 Subject: [PATCH 16/47] make test steable --- dm/syncer/syncer.go | 7 ++++++- dm/tests/checkpoint_transaction/run.sh | 23 +++++++++++++++++++---- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 61fdca8adeb..7a3cbd888b7 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -959,7 +959,7 @@ func (s *Syncer) addJob(job *job) { s.dmlJobCh <- job failpoint.Inject("checkCheckpointInMiddleOfTransaction", func() { s.tctx.L().Info("receive dml job", zap.Any("dml job", job)) - time.Sleep(100 * time.Millisecond) + time.Sleep(500 * time.Millisecond) }) case gc: s.dmlJobCh <- job @@ -1418,6 +1418,11 @@ func (s *Syncer) syncDML() { } func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { + failpoint.Inject("checkCheckpointInMiddleOfTransaction", func() { + s.tctx.L().Info("incr maxPauseOrStopWaitTime time ") + maxPauseOrStopWaitTime = time.Minute * 10 + }) + select { case <-ctx.Done(): // hijack the root context from s.Run to wait for the transaction to end. 
s.tctx.L().Info("received subtask's done, try graceful stop") diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 7c41c905995..7198d385a7e 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -77,7 +77,12 @@ function run() { dataCountSource=$(mysql -uroot -h$MYSQL_HOST1 -P$MYSQL_PORT1 -p$MYSQL_PASSWORD1 -se "select count(1) from checkpoint_transaction.t1") dataCountInTiDB=$(mysql -uroot -h127.0.0.1 -P4000 -se "select count(1) from checkpoint_transaction.t1") echo "after ungraceful exit data in source count: $dataCountSource data in tidb count: $dataCountInTiDB" - [[ $dataCountInTiDB -lt $dataCountSource ]] + if [ "$dataCountInTiDB" -lt "$dataCountSource" ]; then + echo "ungraceful stop test success" + else + echo "ungraceful stop test failed" + exit 1 + fi # start dm-master again task will be resume, and data will be synced run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml @@ -96,8 +101,13 @@ function run() { "\"stage\": \"Paused\"" 1 # check the point is the middle of checkpoint num=$(grep "not receive xid job yet" $WORK_DIR/worker1/log/dm-worker.log | wc -l) - [[ $num -gt 0 ]] - sed -e '/not receive xid job yet/d' $WORK_DIR/worker1/log/dm-worker.log >$WORK_DIR/worker1/log/dm-worker.log + + if [ "$num" -gt 0 ]; then + echo "graceful pause test success" + else + echo "graceful pause test failed" + exit 1 + fi echo "start check pause diff" check_sync_diff $WORK_DIR $cur/conf/diff_config.toml @@ -120,7 +130,12 @@ function run() { "\"result\": true" 2 # check the point is the middle of checkpoint num=$(grep "not receive xid job yet" $WORK_DIR/worker1/log/dm-worker.log | wc -l) - [[ $num -gt 0 ]] + if [ "$num" -gt 0 ]; then + echo "graceful stop test success" + else + echo "graceful stop test failed" + exit 1 + fi echo "start check stop diff" check_sync_diff $WORK_DIR $cur/conf/diff_config.toml From 922faa08dfd5c7a38e52d8f157056c6eea4c642b Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 20 Jan 2022 09:03:48 +0800 Subject: [PATCH 17/47] fix ha test lib --- dm/syncer/syncer.go | 1 - dm/tests/_utils/ha_cases_lib.sh | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 7a3cbd888b7..a3404ebebf4 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -1625,7 +1625,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { go s.waitTransactionEndBeforeExit(ctx) defer func() { - // before if err1 := recover(); err1 != nil { failpoint.Inject("ExitAfterSaveOnlineDDL", func() { s.tctx.L().Info("force panic") diff --git a/dm/tests/_utils/ha_cases_lib.sh b/dm/tests/_utils/ha_cases_lib.sh index d9fb774a90b..6a0a2896ed8 100644 --- a/dm/tests/_utils/ha_cases_lib.sh +++ b/dm/tests/_utils/ha_cases_lib.sh @@ -145,6 +145,7 @@ function start_multi_tasks_cluster() { } function cleanup() { + cleanup_process $* cleanup_data $ha_test cleanup_data $ha_test2 echo "clean source table" @@ -154,7 +155,6 @@ function cleanup() { $(mysql -h127.0.0.1 -p123456 -P${i} -uroot -e "drop database if exists ha_test2;") sleep 1 done - cleanup_process $* } function isolate_master() { From 9ffaf25ea69e660da5d25a3aafc1d886099533ca Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 20 Jan 2022 09:29:12 +0800 Subject: [PATCH 18/47] fix lazy_init_tracker --- dm/tests/start_task/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dm/tests/start_task/run.sh b/dm/tests/start_task/run.sh index 90e51cb24f7..5f67c350b00 100644 --- 
a/dm/tests/start_task/run.sh +++ b/dm/tests/start_task/run.sh @@ -58,8 +58,8 @@ function lazy_init_tracker() { check_log_contains $WORK_DIR/worker1/log/dm-worker.log 'lazy init table info.*t50' 1 check_log_not_contains $WORK_DIR/worker1/log/dm-worker.log 'lazy init table info.*t51' - cleanup_data start_task cleanup_process + cleanup_data start_task } function run() { From 9c2ea48d45a521f4ff2a4e37b461bb9f5f478089 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 20 Jan 2022 10:07:37 +0800 Subject: [PATCH 19/47] fix clean up data --- dm/tests/all_mode/run.sh | 8 ++++---- dm/tests/expression_filter/run.sh | 2 +- dm/tests/full_mode/run.sh | 6 +++--- dm/tests/load_interrupt/run.sh | 2 +- dm/tests/safe_mode/run.sh | 8 ++++---- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/dm/tests/all_mode/run.sh b/dm/tests/all_mode/run.sh index 043511d9f43..61dbcd8a18f 100755 --- a/dm/tests/all_mode/run.sh +++ b/dm/tests/all_mode/run.sh @@ -142,8 +142,8 @@ function test_query_timeout() { run_sql_tidb 'SHOW PROCESSLIST;' check_rows_equal 1 - cleanup_data all_mode cleanup_process + cleanup_data all_mode export GO_FAILPOINTS='' echo "[$(date)] <<<<<< finish test_query_timeout >>>>>>" @@ -208,8 +208,8 @@ function test_stop_task_before_checkpoint() { "stop-task test" \ "\"result\": true" 3 - cleanup_data all_mode cleanup_process + cleanup_data all_mode export GO_FAILPOINTS='' echo "[$(date)] <<<<<< finish test_stop_task_before_checkpoint >>>>>>" @@ -268,8 +268,8 @@ function test_fail_job_between_event() { "\"result\": true" 3 check_sync_diff $WORK_DIR $cur/conf/diff_config.toml - cleanup_data all_mode cleanup_process + cleanup_data all_mode export GO_FAILPOINTS='' echo "[$(date)] <<<<<< finish test_fail_job_between_event >>>>>>" @@ -315,8 +315,8 @@ function test_expression_filter() { "query-status test" \ "\"result\": true" 3 - cleanup_data all_mode cleanup_process + cleanup_data all_mode echo "[$(date)] <<<<<< finish test_expression_filter >>>>>>" } diff --git a/dm/tests/expression_filter/run.sh b/dm/tests/expression_filter/run.sh index 01b559ccba9..faa7d1d43db 100755 --- a/dm/tests/expression_filter/run.sh +++ b/dm/tests/expression_filter/run.sh @@ -53,8 +53,8 @@ function complex_behaviour() { update_num=$(grep -o '"number of filtered update"=[0-9]\+' $WORK_DIR/worker1/log/dm-worker.log | grep -o '[0-9]\+' | awk '{n += $1}; END{print n}') [ $update_num -eq 3 ] - cleanup_data expr_filter cleanup_process $* + cleanup_data expr_filter } function run() { diff --git a/dm/tests/full_mode/run.sh b/dm/tests/full_mode/run.sh index 80fd8975c52..b74e7599a03 100755 --- a/dm/tests/full_mode/run.sh +++ b/dm/tests/full_mode/run.sh @@ -55,8 +55,8 @@ function fail_acquire_global_lock() { "\"stage\": \"Paused\"" 2 \ "you need (at least one of) the RELOAD privilege(s) for this operation" 2 - cleanup_data full_mode cleanup_process $* + cleanup_data full_mode } function escape_schema() { @@ -113,8 +113,8 @@ function escape_schema() { check_metric $WORKER1_PORT 'dumpling_dump_finished_tables' 3 0 3 check_metric $WORKER2_PORT 'dumpling_dump_finished_tables' 3 0 3 - cleanup_data full/mode cleanup_process $* + cleanup_data full/mode } function empty_data() { @@ -138,8 +138,8 @@ function empty_data() { check_log_contains $WORK_DIR/worker1/log/dm-worker.log "progress=\"100.00 %\"" check_log_contains $WORK_DIR/worker2/log/dm-worker.log "progress=\"100.00 %\"" - cleanup_data full_mode cleanup_process $* + cleanup_data full_mode } function run() { diff --git a/dm/tests/load_interrupt/run.sh b/dm/tests/load_interrupt/run.sh 
index 79f8c79c44c..e0e96aebb9c 100755 --- a/dm/tests/load_interrupt/run.sh +++ b/dm/tests/load_interrupt/run.sh @@ -60,8 +60,8 @@ function test_save_checkpoint_failed() { ls $WORK_DIR/worker1/dumped_data.test echo "test_save_checkpoint_failed SUCCESS!" - cleanup_data load_interrupt cleanup_process $* + cleanup_data load_interrupt } function run() { diff --git a/dm/tests/safe_mode/run.sh b/dm/tests/safe_mode/run.sh index 644967cd306..603c855a2c7 100755 --- a/dm/tests/safe_mode/run.sh +++ b/dm/tests/safe_mode/run.sh @@ -55,8 +55,8 @@ function consistency_none() { check_log_contain_with_retry "\[\"enable safe-mode for safe mode exit point, will exit at\"\] \[task=test\] \[unit=\"binlog replication\"\] \[location=\"position: ($name2, $pos2), gtid-set: $gtid2\"\]" $WORK_DIR/worker2/log/dm-worker.log run_sql_source2 "SET @@GLOBAL.SQL_MODE='ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION'" - cleanup_data safe_mode_target cleanup_process $* + cleanup_data safe_mode_target } function check_exit_safe_binlog() { @@ -182,14 +182,14 @@ function safe_mode_recover() { echo "finish running run safe mode recover case $i" ((i += 1)) - cleanup_data safe_mode_target cleanup_process $* + cleanup_data safe_mode_target done } function run() { - consistency_none - safe_mode_recover + # consistency_none + # safe_mode_recover run_sql_file $cur/data/db1.prepare.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 check_contains 'Query OK, 2 rows affected' From 937cdcc34f41ec178615b5e670e1c85f8305d13b Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 20 Jan 2022 11:20:11 +0800 Subject: [PATCH 20/47] fix event ctx --- dm/syncer/syncer.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index a3404ebebf4..4625422c135 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -1997,7 +1997,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } ec := eventContext{ - tctx: s.tctx, + tctx: s.runTCtx, header: e.Header, startLocation: &startLocation, currentLocation: ¤tLocation, From 57ba6161b6809bc95b1162f9fb34e604ee5fc255 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 20 Jan 2022 14:20:15 +0800 Subject: [PATCH 21/47] fix syncer.done --- dm/dm/worker/server.go | 7 +++---- dm/dm/worker/subtask.go | 1 + dm/syncer/syncer.go | 7 +++++-- dm/tests/safe_mode/run.sh | 4 ++-- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index 3c5bca79001..c9f1b33d406 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -441,19 +441,18 @@ func (s *Server) observeSourceBound(ctx context.Context, rev int64) error { } func (s *Server) doClose() { - s.cancel() - // close server in advance, stop receiving source bound and relay bound - s.wg.Wait() - s.Lock() defer s.Unlock() + if s.closed.Load() { return } // stop worker and wait for return + s.cancel() if w := s.getWorker(false); w != nil { w.Stop(true) } + s.wg.Wait() s.closed.Store(true) } diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index 8c17e345aea..3fb0d61b66b 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -381,6 +381,7 @@ func (st *SubTask) closeUnits() { u := st.units[i] st.l.Info("closing unit process", zap.Stringer("unit", cu.Type())) u.Close() + st.l.Info("closing unit done", zap.Stringer("unit", cu.Type())) } } diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 4625422c135..50573a0254f 100644 --- a/dm/syncer/syncer.go +++ 
b/dm/syncer/syncer.go @@ -1501,6 +1501,10 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.runCancel = runCancel s.rundone = make(chan struct{}) s.Unlock() + defer func() { + // close rundone to notify other goroutine that syncer.Run is exited + close(s.rundone) + }() // some initialization that can't be put in Syncer.Init fresh, err := s.IsFreshTask(ctx) @@ -1656,7 +1660,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.tctx.L().Warn("failed to flush safe mode checkpoints when exit task", zap.Error(err2)) } } - close(s.rundone) }() now := time.Now() @@ -3398,7 +3401,7 @@ func (s *Syncer) stopSync() { // when resuming, re-create s.syncer if s.streamerController != nil { - s.streamerController.Close(s.tctx) + s.streamerController.Close(s.runTCtx) } // try to rollback checkpoints, if they already flushed, no effect, this operation should call before close schemaTracker diff --git a/dm/tests/safe_mode/run.sh b/dm/tests/safe_mode/run.sh index 603c855a2c7..ed1d8ff74ff 100755 --- a/dm/tests/safe_mode/run.sh +++ b/dm/tests/safe_mode/run.sh @@ -188,8 +188,8 @@ function safe_mode_recover() { } function run() { - # consistency_none - # safe_mode_recover + consistency_none + safe_mode_recover run_sql_file $cur/data/db1.prepare.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 check_contains 'Query OK, 2 rows affected' From adb7901798d14ef7298522d98424bdfd3e575c60 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 20 Jan 2022 16:48:32 +0800 Subject: [PATCH 22/47] fix kill --- dm/tests/new_relay/run.sh | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/dm/tests/new_relay/run.sh b/dm/tests/new_relay/run.sh index f9beb3a4c7f..c6fa3fb6199 100755 --- a/dm/tests/new_relay/run.sh +++ b/dm/tests/new_relay/run.sh @@ -26,7 +26,9 @@ function test_cant_dail_upstream() { "start-relay -s $SOURCE_ID1 worker1" \ "\"result\": true" 2 - kill_dm_worker + echo "kill dm-worker1" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/pkg/conn/failDBPing=return()" run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml @@ -254,8 +256,9 @@ function run() { check_metric $WORKER1_PORT 'dm_relay_space{type="available"}' 5 0 9223372036854775807 # subtask is preferred to scheduled to another relay worker - pkill -hup -f dm-worker1.toml 2>/dev/null || true - wait_pattern_exit dm-worker1.toml + echo "kill dm-worker1" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 # worker1 is down, worker2 has running relay and sync unit run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "query-status -s $SOURCE_ID1" \ @@ -296,11 +299,12 @@ function run() { [ "$new_relay_log_count_1" -eq 1 ] [ "$new_relay_log_count_2" -eq 1 ] - pkill -hup -f dm-worker1.toml 2>/dev/null || true - wait_pattern_exit dm-worker1.toml - pkill -hup -f dm-worker2.toml 2>/dev/null || true - wait_pattern_exit dm-worker2.toml - + echo "kill dm-worker2" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 + echo "kill dm-worker2" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 # if all relay workers are offline, relay-not-enabled worker should continue to sync run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "query-status -s $SOURCE_ID1" \ From a5acf28b0efe443f7aeb9eba564f0f8758d7de88 Mon Sep 17 00:00:00 2001 From: 
ehco1996 Date: Thu, 20 Jan 2022 17:18:21 +0800 Subject: [PATCH 23/47] addresss comment --- dm/dm/worker/subtask.go | 2 +- dm/tests/checkpoint_transaction/run.sh | 4 ++-- dm/tests/new_relay/run.sh | 12 +++++++++--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index 3fb0d61b66b..99a9d079624 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -387,7 +387,7 @@ func (st *SubTask) closeUnits() { func (st *SubTask) killCurrentUnit() { if st.CurrUnit() != nil { - st.l.Info("kill syncer unit", zap.String("task", st.cfg.Name), zap.Stringer("unit", st.CurrUnit().Type())) + st.l.Info("kill unit", zap.String("task", st.cfg.Name), zap.Stringer("unit", st.CurrUnit().Type())) st.CurrUnit().Kill() } } diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 7198d385a7e..79c6d46ca4d 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -13,11 +13,11 @@ function check_worker_ungraceful_stop_with_retry() { sleep 1 echo "start check_worker_ungraceful_stop_with_retry times: $k" - num=$(grep "kill syncer unit" $WORK_DIR/worker1/log/dm-worker.log | wc -l) + num=$(grep "kill unit" $WORK_DIR/worker1/log/dm-worker.log | wc -l) if [ $num -lt 1 ]; then continue fi - num=$(grep "kill syncer without graceful" $WORK_DIR/worker1/log/dm-worker.log | wc -l) + num=$(grep "kill without graceful" $WORK_DIR/worker1/log/dm-worker.log | wc -l) if [ $num -lt 1 ]; then continue fi diff --git a/dm/tests/new_relay/run.sh b/dm/tests/new_relay/run.sh index c6fa3fb6199..0c856719fc5 100755 --- a/dm/tests/new_relay/run.sh +++ b/dm/tests/new_relay/run.sh @@ -62,7 +62,9 @@ function test_cant_dail_downstream() { "\"result\": true" 2 dmctl_start_task_standalone $cur/conf/dm-task.yaml "--remove-meta" - kill_dm_worker + echo "kill dm-worker1" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 # kill tidb pkill -hup tidb-server 2>/dev/null || true wait_process_exit tidb-server @@ -130,8 +132,12 @@ function test_restart_relay_status() { "list-member -n worker3" \ "relay" 1 - kill_dm_worker - kill_dm_master + echo "kill dm-worker1" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 + echo "kill dm-master" + ps aux | grep dm-master | awk '{print $2}' | xargs kill || true + check_master_port_offline 1 run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT From d9f42f67410d4141e1da9227cb77dbaed52cdddf Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 20 Jan 2022 18:08:02 +0800 Subject: [PATCH 24/47] add log for debug --- dm/tests/new_relay/run.sh | 169 ++++++++++++++++++++------------------ 1 file changed, 87 insertions(+), 82 deletions(-) diff --git a/dm/tests/new_relay/run.sh b/dm/tests/new_relay/run.sh index 0c856719fc5..6eec999f53b 100755 --- a/dm/tests/new_relay/run.sh +++ b/dm/tests/new_relay/run.sh @@ -10,86 +10,9 @@ SQL_RESULT_FILE="$TEST_DIR/sql_res.$TEST_NAME.txt" API_VERSION="v1alpha1" -function test_cant_dail_upstream() { - cleanup_data $TEST_NAME - cleanup_process - - run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml - check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT - run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml - check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT - - cp 
$cur/conf/source1.yaml $WORK_DIR/source1.yaml - dmctl_operate_source create $WORK_DIR/source1.yaml $SOURCE_ID1 - - run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ - "start-relay -s $SOURCE_ID1 worker1" \ - "\"result\": true" 2 - - echo "kill dm-worker1" - ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true - check_port_offline $WORKER1_PORT 20 - - export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/pkg/conn/failDBPing=return()" - run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml - check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT - - # make sure DM-worker doesn't exit - sleep 2 - run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ - "query-status -s $SOURCE_ID1" \ - "injected error" 1 - - export GO_FAILPOINTS="" - cleanup_process - cleanup_data $TEST_NAME -} - -function test_cant_dail_downstream() { - cleanup_data $TEST_NAME - cleanup_process - - run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml - check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT - run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml - check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT - - cp $cur/conf/source1.yaml $WORK_DIR/source1.yaml - dmctl_operate_source create $WORK_DIR/source1.yaml $SOURCE_ID1 - - run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ - "start-relay -s $SOURCE_ID1 worker1" \ - "\"result\": true" 2 - dmctl_start_task_standalone $cur/conf/dm-task.yaml "--remove-meta" - - echo "kill dm-worker1" - ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true - check_port_offline $WORKER1_PORT 20 - # kill tidb - pkill -hup tidb-server 2>/dev/null || true - wait_process_exit tidb-server - - run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml - check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT - - # make sure DM-worker doesn't exit - sleep 2 - run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ - "query-status -s $SOURCE_ID1" \ - "\"relayCatchUpMaster\": true" 1 \ - "dial tcp 127.0.0.1:4000: connect: connection refused" 1 - - # restart tidb - run_tidb_server 4000 $TIDB_PASSWORD - sleep 2 - - cleanup_process - cleanup_data $TEST_NAME -} - function test_restart_relay_status() { - cleanup_data $TEST_NAME cleanup_process + cleanup_data $TEST_NAME run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT @@ -165,11 +88,92 @@ function test_restart_relay_status() { "list-member --worker" \ "relay" 1 \ "bound" 2 + + echo "》》》》》》》》》》》》》》》》》test test_restart_relay_status passed" } -function test_kill_dump_connection() { +function test_cant_dail_upstream() { + cleanup_process cleanup_data $TEST_NAME + + run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml + check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT + run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml + check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT + + cp $cur/conf/source1.yaml $WORK_DIR/source1.yaml + dmctl_operate_source create $WORK_DIR/source1.yaml $SOURCE_ID1 + + run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ + "start-relay -s $SOURCE_ID1 worker1" \ + "\"result\": true" 2 + + echo "kill dm-worker1" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 + + export 
GO_FAILPOINTS="github.com/pingcap/tiflow/dm/pkg/conn/failDBPing=return()" + run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml + check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT + + # make sure DM-worker doesn't exit + sleep 2 + run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ + "query-status -s $SOURCE_ID1" \ + "injected error" 1 + + export GO_FAILPOINTS="" + cleanup_data $TEST_NAME + cleanup_process + echo "》》》》》》》》》》》》》》》》》test test_cant_dail_upstream passed" +} + +function test_cant_dail_downstream() { + cleanup_process + cleanup_data $TEST_NAME + + run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml + check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT + run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml + check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT + + cp $cur/conf/source1.yaml $WORK_DIR/source1.yaml + dmctl_operate_source create $WORK_DIR/source1.yaml $SOURCE_ID1 + + run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ + "start-relay -s $SOURCE_ID1 worker1" \ + "\"result\": true" 2 + dmctl_start_task_standalone $cur/conf/dm-task.yaml "--remove-meta" + + echo "kill dm-worker1" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 + # kill tidb + pkill -hup tidb-server 2>/dev/null || true + wait_process_exit tidb-server + + run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml + check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT + + # make sure DM-worker doesn't exit + sleep 2 + run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ + "query-status -s $SOURCE_ID1" \ + "\"relayCatchUpMaster\": true" 1 \ + "dial tcp 127.0.0.1:4000: connect: connection refused" 1 + + # restart tidb + run_tidb_server 4000 $TIDB_PASSWORD + sleep 2 + + cleanup_data $TEST_NAME + cleanup_process + echo "》》》》》》》》》》》》》》》》》test test_cant_dail_downstream passed" +} + +function test_kill_dump_connection() { cleanup_process + cleanup_data $TEST_NAME run_sql_file $cur/data/db1.prepare.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 check_contains 'Query OK, 2 rows affected' @@ -201,15 +205,17 @@ function test_kill_dump_connection() { run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "query-status -s $SOURCE_ID1" \ "\"relayCatchUpMaster\": true" 1 - - cleanup_process cleanup_data $TEST_NAME + cleanup_process + echo "》》》》》》》》》》》》》》》》》test test_kill_dump_connection passed" + } function run() { test_restart_relay_status test_cant_dail_downstream test_cant_dail_upstream + test_kill_dump_connection export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/relay/ReportRelayLogSpaceInBackground=return(1)" @@ -363,7 +369,6 @@ function run() { check_sync_diff $WORK_DIR $cur/conf/diff_config.toml - test_kill_dump_connection } cleanup_data $TEST_NAME From 31727e0ba93ecf640feeeeb4d9c9885200393498 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 20 Jan 2022 19:22:53 +0800 Subject: [PATCH 25/47] fix test case --- dm/dm/worker/server.go | 3 +- dm/syncer/syncer.go | 10 +++---- dm/tests/checkpoint_transaction/run.sh | 2 +- dm/tests/new_relay/run.sh | 41 ++++++++++++-------------- 4 files changed, 27 insertions(+), 29 deletions(-) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index c9f1b33d406..25d49e85773 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -448,10 +448,10 @@ func (s *Server) doClose() { return } // stop worker and wait for return - s.cancel() 
if w := s.getWorker(false); w != nil { w.Stop(true) } + s.cancel() s.wg.Wait() s.closed.Store(true) } @@ -460,6 +460,7 @@ func (s *Server) doClose() { func (s *Server) Close() { s.doClose() // we should stop current sync first, other wise master may scheduler task on new worker while we are closing s.stopKeepAlive() + s.etcdClient.Close() } // if needLock is false, we should make sure Server has been locked in caller. diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 50573a0254f..56dd95ed709 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -281,8 +281,6 @@ func NewSyncer(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, relay rel syncer.lastCheckpointFlushedTime = time.Time{} syncer.relay = relay syncer.locations = &locationRecorder{} - - syncer.rundone = nil return syncer } @@ -1504,6 +1502,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { defer func() { // close rundone to notify other goroutine that syncer.Run is exited close(s.rundone) + runCancel() }() // some initialization that can't be put in Syncer.Init @@ -1517,6 +1516,8 @@ func (s *Syncer) Run(ctx context.Context) (err error) { return err } } + // we should start this goroutine as soon as possible, because it's the only goroutine that cancel syncer.Run + go s.waitTransactionEndBeforeExit(ctx) var ( flushCheckpoint bool @@ -1627,7 +1628,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { traceSource = fmt.Sprintf("%s.syncer.%s", s.cfg.SourceID, s.cfg.Name) ) - go s.waitTransactionEndBeforeExit(ctx) defer func() { if err1 := recover(); err1 != nil { failpoint.Inject("ExitAfterSaveOnlineDDL", func() { @@ -1847,7 +1847,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // try to re-sync in gtid mode if tryReSync && s.cfg.EnableGTID && utils.IsErrBinlogPurged(err) && s.cfg.AutoFixGTID { time.Sleep(retryTimeout) - err = s.reSyncBinlog(*s.tctx, lastLocation) + err = s.reSyncBinlog(*s.runTCtx, lastLocation) if err != nil { return err } @@ -1993,7 +1993,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if err = s.checkpoint.FlushSafeModeExitPoint(s.runTCtx); err != nil { return err } - if err = s.safeMode.Add(s.tctx, -1); err != nil { + if err = s.safeMode.Add(s.runTCtx, -1); err != nil { return err } } diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 79c6d46ca4d..61eaee06efd 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -17,7 +17,7 @@ function check_worker_ungraceful_stop_with_retry() { if [ $num -lt 1 ]; then continue fi - num=$(grep "kill without graceful" $WORK_DIR/worker1/log/dm-worker.log | wc -l) + num=$(grep "kill syncer without graceful" $WORK_DIR/worker1/log/dm-worker.log | wc -l) if [ $num -lt 1 ]; then continue fi diff --git a/dm/tests/new_relay/run.sh b/dm/tests/new_relay/run.sh index 6eec999f53b..095136b8d27 100755 --- a/dm/tests/new_relay/run.sh +++ b/dm/tests/new_relay/run.sh @@ -13,6 +13,7 @@ API_VERSION="v1alpha1" function test_restart_relay_status() { cleanup_process cleanup_data $TEST_NAME + export GO_FAILPOINTS="" run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT @@ -95,6 +96,7 @@ function test_restart_relay_status() { function test_cant_dail_upstream() { cleanup_process cleanup_data $TEST_NAME + export GO_FAILPOINTS="" run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT 
@@ -122,15 +124,13 @@ function test_cant_dail_upstream() { "query-status -s $SOURCE_ID1" \ "injected error" 1 - export GO_FAILPOINTS="" - cleanup_data $TEST_NAME - cleanup_process echo "》》》》》》》》》》》》》》》》》test test_cant_dail_upstream passed" } function test_cant_dail_downstream() { cleanup_process cleanup_data $TEST_NAME + export GO_FAILPOINTS="" run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT @@ -155,8 +155,6 @@ function test_cant_dail_downstream() { run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT - # make sure DM-worker doesn't exit - sleep 2 run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "query-status -s $SOURCE_ID1" \ "\"relayCatchUpMaster\": true" 1 \ @@ -166,8 +164,6 @@ function test_cant_dail_downstream() { run_tidb_server 4000 $TIDB_PASSWORD sleep 2 - cleanup_data $TEST_NAME - cleanup_process echo "》》》》》》》》》》》》》》》》》test test_cant_dail_downstream passed" } @@ -205,18 +201,12 @@ function test_kill_dump_connection() { run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "query-status -s $SOURCE_ID1" \ "\"relayCatchUpMaster\": true" 1 - cleanup_data $TEST_NAME - cleanup_process echo "》》》》》》》》》》》》》》》》》test test_kill_dump_connection passed" - } -function run() { - test_restart_relay_status - test_cant_dail_downstream - test_cant_dail_upstream - test_kill_dump_connection - +function test_relay_operations() { + cleanup_process + cleanup_data $TEST_NAME export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/relay/ReportRelayLogSpaceInBackground=return(1)" run_sql_file $cur/data/db1.prepare.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 @@ -248,7 +238,7 @@ function run() { "\"worker\": \"worker1\"" 1 \ "\"worker\": \"worker2\"" 1 - # worker1 and worker2 has one realy job and worker3 have none. + # worker1 and worker2 has one relay job and worker3 have none. check_metric $WORKER1_PORT "dm_relay_binlog_file{node=\"relay\"}" 3 0 2 check_metric $WORKER1_PORT "dm_relay_exit_with_error_count" 3 -1 1 check_metric $WORKER2_PORT "dm_relay_binlog_file{node=\"relay\"}" 3 0 2 @@ -260,7 +250,7 @@ function run() { run_sql_file $cur/data/db1.increment.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 - # relay task tranfer to worker1 with no error. + # relay task transfer to worker1 with no error. 
check_metric $WORKER1_PORT "dm_relay_data_corruption" 3 -1 1 check_metric $WORKER1_PORT "dm_relay_read_error_count" 3 -1 1 check_metric $WORKER1_PORT "dm_relay_write_error_count" 3 -1 1 @@ -311,11 +301,11 @@ function run() { [ "$new_relay_log_count_1" -eq 1 ] [ "$new_relay_log_count_2" -eq 1 ] - echo "kill dm-worker2" + echo "kill dm-worker1" ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true check_port_offline $WORKER1_PORT 20 echo "kill dm-worker2" - ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + ps aux | grep dm-worker2 | awk '{print $2}' | xargs kill || true check_port_offline $WORKER1_PORT 20 # if all relay workers are offline, relay-not-enabled worker should continue to sync run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ @@ -338,8 +328,7 @@ function run() { # destroy cluster cleanup_process $* - rm -rf $WORK_DIR - mkdir $WORK_DIR + cleanup_data $TEST_NAME # insert new data run_sql_file $cur/data/db1.increment5.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 @@ -368,7 +357,15 @@ function run() { "\"result\": true" 2 check_sync_diff $WORK_DIR $cur/conf/diff_config.toml + echo "》》》》》》》》》》》》》》》》》test test_relay_operations passed" +} +function run() { + test_relay_operations + test_cant_dail_upstream + test_restart_relay_status + test_cant_dail_downstream + test_kill_dump_connection } cleanup_data $TEST_NAME From 22a7e72d3926ffbc460d643501d7b77870a219f5 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 21 Jan 2022 08:54:43 +0800 Subject: [PATCH 26/47] fix comments --- dm/syncer/syncer.go | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 56dd95ed709..3b8e8870c8d 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -1506,7 +1506,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { }() // some initialization that can't be put in Syncer.Init - fresh, err := s.IsFreshTask(ctx) + fresh, err := s.IsFreshTask(s.runCtx) if err != nil { return err } else if fresh { @@ -1543,7 +1543,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.wg.Add(1) go func() { defer s.wg.Done() - s.checkpointFlushWorker.Run(s.runTCtx) + s.checkpointFlushWorker.Run(s.tctx) }() if flushCheckpoint { @@ -2085,8 +2085,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } } if waitXIDStatus(s.waitXIDJob.Load()) == waitComplete { - // already wait until XID event, we can stop sync now - s.runCancel() + // already wait until XID event, we can stop sync now, s.runcancel will be called in defer func return nil } } @@ -3362,12 +3361,6 @@ func (s *Syncer) Close() { if s.isClosed() { return } - - // wait for s.Run() to finish - if s.rundone != nil { - <-s.rundone - } - s.stopSync() s.closeDBs() s.checkpoint.Close() @@ -3394,6 +3387,10 @@ func (s *Syncer) Kill() { // stopSync stops syncing and rollback checkpoint now it used by Close,kILL and Pause // maybe we can refine the workflow more clear. 
func (s *Syncer) stopSync() { + // wait for s.Run() to finish + if s.rundone != nil { + <-s.rundone + } s.closeJobChans() s.wg.Wait() // wait job workers to return From e01e2ea0a1c5bd70f1c15bab7c0721d786220c6f Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 21 Jan 2022 09:57:20 +0800 Subject: [PATCH 27/47] rename func about sourceWorker --- dm/dm/worker/hub_test.go | 2 +- dm/dm/worker/join.go | 2 +- dm/dm/worker/server.go | 46 +++++----- dm/dm/worker/server_test.go | 30 +++---- dm/dm/worker/source_worker_test.go | 8 +- dm/syncer/syncer.go | 133 +++++++++++++---------------- dm/syncer/syncer_test.go | 4 +- 7 files changed, 106 insertions(+), 119 deletions(-) diff --git a/dm/dm/worker/hub_test.go b/dm/dm/worker/hub_test.go index d76697b1e6a..ca20fd437c3 100644 --- a/dm/dm/worker/hub_test.go +++ b/dm/dm/worker/hub_test.go @@ -20,5 +20,5 @@ import ( func (t *testServer) testConidtionHub(c *C, s *Server) { // test condition hub c.Assert(GetConditionHub(), NotNil) - c.Assert(GetConditionHub().w, DeepEquals, s.getWorker(true)) + c.Assert(GetConditionHub().w, DeepEquals, s.getSourceWorker(true)) } diff --git a/dm/dm/worker/join.go b/dm/dm/worker/join.go index e9d18ba5bbb..a50c47d44fe 100644 --- a/dm/dm/worker/join.go +++ b/dm/dm/worker/join.go @@ -109,7 +109,7 @@ func (s *Server) KeepAlive() { // TODO: report the error. // when lost keepalive, stop the worker without graceful. this is to fix https://github.com/pingcap/tiflow/issues/3737 - err := s.stopWorker("", true, false) + err := s.stopSourceWorker("", true, false) if err != nil { log.L().Error("fail to stop worker", zap.Error(err)) return // return if failed to stop the worker. diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index 25d49e85773..ff7590fd3b9 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -300,7 +300,7 @@ func (s *Server) observeRelayConfig(ctx context.Context, rev int64) error { } rev = rev1 if relaySource == nil { - if w := s.getWorker(true); w != nil && w.startedRelayBySourceCfg { + if w := s.getSourceWorker(true); w != nil && w.startedRelayBySourceCfg { break } log.L().Info("didn't found relay config after etcd retryable error. Will stop relay now") @@ -314,7 +314,7 @@ func (s *Server) observeRelayConfig(ctx context.Context, rev int64) error { s.Lock() defer s.Unlock() - if w := s.getWorker(false); w != nil && w.cfg.SourceID == relaySource.SourceID { + if w := s.getSourceWorker(false); w != nil && w.cfg.SourceID == relaySource.SourceID { // we may face both relay config and subtask bound changed in a compaction error, so here // we check if observeSourceBound has started a worker // TODO: add a test for this situation @@ -325,7 +325,7 @@ func (s *Server) observeRelayConfig(ctx context.Context, rev int64) error { } return nil } - err = s.stopWorker("", false, true) + err = s.stopSourceWorker("", false, true) if err != nil { log.L().Error("fail to stop worker", zap.Error(err)) return err // return if failed to stop the worker. 
@@ -403,7 +403,7 @@ func (s *Server) observeSourceBound(ctx context.Context, rev int64) error { s.Lock() defer s.Unlock() - if w := s.getWorker(false); w != nil && w.cfg.SourceID == bound.Source { + if w := s.getSourceWorker(false); w != nil && w.cfg.SourceID == bound.Source { // we may face both relay config and subtask bound changed in a compaction error, so here // we check if observeRelayConfig has started a worker // TODO: add a test for this situation @@ -414,7 +414,7 @@ func (s *Server) observeSourceBound(ctx context.Context, rev int64) error { } return nil } - err = s.stopWorker("", false, true) + err = s.stopSourceWorker("", false, true) if err != nil { log.L().Error("fail to stop worker", zap.Error(err)) return err // return if failed to stop the worker. @@ -447,8 +447,8 @@ func (s *Server) doClose() { if s.closed.Load() { return } - // stop worker and wait for return - if w := s.getWorker(false); w != nil { + // stop worker and wait for return(we already lock the whole Sever, so no need use lock to get source worker) + if w := s.getSourceWorker(false); w != nil { w.Stop(true) } s.cancel() @@ -464,7 +464,7 @@ func (s *Server) Close() { } // if needLock is false, we should make sure Server has been locked in caller. -func (s *Server) getWorker(needLock bool) *SourceWorker { +func (s *Server) getSourceWorker(needLock bool) *SourceWorker { if needLock { s.Lock() defer s.Unlock() @@ -497,7 +497,7 @@ func (s *Server) setSourceStatus(source string, err error, needLock bool) { defer s.Unlock() } // now setSourceStatus will be concurrently called. skip setting a source status if worker has been closed - if s.getWorker(false) == nil && source != "" { + if s.getSourceWorker(false) == nil && source != "" { return } s.sourceStatus = pb.SourceStatus{ @@ -515,23 +515,23 @@ func (s *Server) setSourceStatus(source string, err error, needLock bool) { // if sourceID is set to "", worker will be closed directly // if sourceID is not "", we will check sourceID with w.cfg.SourceID. 
-func (s *Server) stopWorker(sourceID string, needLock, graceful bool) error { +func (s *Server) stopSourceWorker(sourceID string, needLock, graceful bool) error { if needLock { s.Lock() defer s.Unlock() } - w := s.getWorker(false) - if w == nil { + sw := s.getSourceWorker(false) + if sw == nil { log.L().Warn("worker has not been started, no need to stop", zap.String("source", sourceID)) return nil // no need to stop because not started yet } - if sourceID != "" && w.cfg.SourceID != sourceID { + if sourceID != "" && sw.cfg.SourceID != sourceID { return terror.ErrWorkerSourceNotMatch } s.UpdateKeepAliveTTL(s.cfg.KeepAliveTTL) s.setWorker(nil, false) s.setSourceStatus("", nil, false) - w.Stop(graceful) + sw.Stop(graceful) return nil } @@ -659,7 +659,7 @@ func (s *Server) enableHandleSubtasks(sourceCfg *config.SourceConfig, needLock b func (s *Server) disableHandleSubtasks(source string) error { s.Lock() defer s.Unlock() - w := s.getWorker(false) + w := s.getSourceWorker(false) if w == nil { log.L().Warn("worker has already stopped before DisableHandleSubtasks", zap.String("source", source)) return nil @@ -676,7 +676,7 @@ func (s *Server) disableHandleSubtasks(source string) error { var err error if !w.relayEnabled.Load() { log.L().Info("relay is not enabled after disabling subtask, so stop worker") - err = s.stopWorker(source, false, true) + err = s.stopSourceWorker(source, false, true) } return err } @@ -721,7 +721,7 @@ func (s *Server) enableRelay(sourceCfg *config.SourceConfig, needLock bool) erro func (s *Server) disableRelay(source string) error { s.Lock() defer s.Unlock() - w := s.getWorker(false) + w := s.getSourceWorker(false) if w == nil { log.L().Warn("worker has already stopped before DisableRelay", zap.Any("relaySource", source)) return nil @@ -731,7 +731,7 @@ func (s *Server) disableRelay(source string) error { var err error if !w.subTaskEnabled.Load() { log.L().Info("subtask is not enabled after disabling relay, so stop worker") - err = s.stopWorker(source, false, true) + err = s.stopSourceWorker(source, false, true) } return err } @@ -747,7 +747,7 @@ func (s *Server) QueryStatus(ctx context.Context, req *pb.QueryStatusRequest) (* SourceStatus: &sourceStatus, } - w := s.getWorker(true) + w := s.getSourceWorker(true) if w == nil { log.L().Warn("fail to call QueryStatus, because no mysql source is being handled in the worker") resp.Result = false @@ -769,7 +769,7 @@ func (s *Server) QueryStatus(ctx context.Context, req *pb.QueryStatusRequest) (* // PurgeRelay implements WorkerServer.PurgeRelay. 
func (s *Server) PurgeRelay(ctx context.Context, req *pb.PurgeRelayRequest) (*pb.CommonWorkerResponse, error) { log.L().Info("", zap.String("request", "PurgeRelay"), zap.Stringer("payload", req)) - w := s.getWorker(true) + w := s.getSourceWorker(true) if w == nil { log.L().Warn("fail to call StartSubTask, because no mysql source is being handled in the worker") return makeCommonWorkerResponse(terror.ErrWorkerNoStart.Generate()), nil @@ -786,7 +786,7 @@ func (s *Server) PurgeRelay(ctx context.Context, req *pb.PurgeRelayRequest) (*pb func (s *Server) OperateSchema(ctx context.Context, req *pb.OperateWorkerSchemaRequest) (*pb.CommonWorkerResponse, error) { log.L().Info("", zap.String("request", "OperateSchema"), zap.Stringer("payload", req)) - w := s.getWorker(true) + w := s.getSourceWorker(true) if w == nil { log.L().Warn("fail to call OperateSchema, because no mysql source is being handled in the worker") return makeCommonWorkerResponse(terror.ErrWorkerNoStart.Generate()), nil @@ -813,7 +813,7 @@ func (s *Server) getOrStartWorker(cfg *config.SourceConfig, needLock bool) (*Sou defer s.Unlock() } - if w := s.getWorker(false); w != nil { + if w := s.getSourceWorker(false); w != nil { if w.cfg.SourceID == cfg.SourceID { log.L().Info("mysql source is being handled", zap.String("sourceID", s.worker.cfg.SourceID)) return w, nil @@ -904,7 +904,7 @@ func getMinLocForSubTask(ctx context.Context, subTaskCfg config.SubTaskConfig) ( func (s *Server) HandleError(ctx context.Context, req *pb.HandleWorkerErrorRequest) (*pb.CommonWorkerResponse, error) { log.L().Info("", zap.String("request", "HandleError"), zap.Stringer("payload", req)) - w := s.getWorker(true) + w := s.getSourceWorker(true) if w == nil { log.L().Warn("fail to call HandleError, because no mysql source is being handled in the worker") return makeCommonWorkerResponse(terror.ErrWorkerNoStart.Generate()), nil diff --git a/dm/dm/worker/server_test.go b/dm/dm/worker/server_test.go index e9a578ddc82..fdedd1a77da 100644 --- a/dm/dm/worker/server_test.go +++ b/dm/dm/worker/server_test.go @@ -209,7 +209,7 @@ func (t *testServer) TestServer(c *C) { _, err = ha.DeleteSubTaskStage(s.etcdClient, ha.NewSubTaskStage(pb.Stage_Stopped, sourceCfg.SourceID, subtaskCfg.Name)) c.Assert(err, IsNil) c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool { - return s.getWorker(true).subTaskHolder.findSubTask(subtaskCfg.Name) == nil + return s.getSourceWorker(true).subTaskHolder.findSubTask(subtaskCfg.Name) == nil }), IsTrue) dupServer := NewServer(cfg) @@ -338,13 +338,13 @@ func (t *testServer) TestHandleSourceBoundAfterError(c *C) { _, err = ha.PutSourceCfg(etcdCli, sourceCfg) c.Assert(err, IsNil) c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool { - return s.getWorker(true) != nil + return s.getSourceWorker(true) != nil }), IsTrue) _, err = ha.DeleteSourceBound(etcdCli, s.cfg.Name) c.Assert(err, IsNil) c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool { - return s.getWorker(true) == nil + return s.getSourceWorker(true) == nil }), IsTrue) } @@ -414,19 +414,19 @@ func (t *testServer) TestWatchSourceBoundEtcdCompact(c *C) { }() // step 4.1: should stop the running worker, source bound has been deleted, should stop this worker c.Assert(utils.WaitSomething(20, 100*time.Millisecond, func() bool { - return s.getWorker(true) == nil + return s.getSourceWorker(true) == nil }), IsTrue) // step 4.2: put a new source bound, source should be started _, err = ha.PutSourceBound(etcdCli, sourceBound) c.Assert(err, IsNil) 
c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool { - return s.getWorker(true) != nil + return s.getSourceWorker(true) != nil }), IsTrue) - cfg2 := s.getWorker(true).cfg + cfg2 := s.getSourceWorker(true).cfg c.Assert(cfg2, DeepEquals, sourceCfg) cancel1() wg.Wait() - c.Assert(s.stopWorker(sourceCfg.SourceID, true, true), IsNil) + c.Assert(s.stopSourceWorker(sourceCfg.SourceID, true, true), IsNil) // step 5: start observeSourceBound from compacted revision again, should start worker ctx2, cancel2 := context.WithCancel(ctx) wg.Add(1) @@ -435,9 +435,9 @@ func (t *testServer) TestWatchSourceBoundEtcdCompact(c *C) { c.Assert(s.observeSourceBound(ctx2, startRev), IsNil) }() c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool { - return s.getWorker(true) != nil + return s.getSourceWorker(true) != nil }), IsTrue) - cfg2 = s.getWorker(true).cfg + cfg2 = s.getSourceWorker(true).cfg c.Assert(cfg2, DeepEquals, sourceCfg) cancel2() wg.Wait() @@ -481,13 +481,13 @@ func (t *testServer) testOperateWorker(c *C, s *Server, dir string, start bool) c.Assert(err, IsNil) // worker should be started and without error c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool { - w := s.getWorker(true) + w := s.getSourceWorker(true) return w != nil && !w.closed.Load() }), IsTrue) c.Assert(s.getSourceStatus(true).Result, IsNil) } else { // worker should be started before stopped - w := s.getWorker(true) + w := s.getSourceWorker(true) c.Assert(w, NotNil) c.Assert(w.closed.Load(), IsFalse) _, err := ha.DeleteRelayConfig(s.etcdClient, w.name) @@ -496,7 +496,7 @@ func (t *testServer) testOperateWorker(c *C, s *Server, dir string, start bool) c.Assert(err, IsNil) // worker should be closed and without error c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool { - currentWorker := s.getWorker(true) + currentWorker := s.getSourceWorker(true) return currentWorker == nil && w.closed.Load() }), IsTrue) c.Assert(s.getSourceStatus(true).Result, IsNil) @@ -507,7 +507,7 @@ func (t *testServer) testRetryConnectMaster(c *C, s *Server, etcd *embed.Etcd, d etcd.Close() time.Sleep(6 * time.Second) // When worker server fail to keepalive with etcd, server should close its worker - c.Assert(s.getWorker(true), IsNil) + c.Assert(s.getSourceWorker(true), IsNil) c.Assert(s.getSourceStatus(true).Result, IsNil) ETCD, err := createMockETCD(dir, "http://"+hostName) c.Assert(err, IsNil) @@ -552,9 +552,9 @@ func (t *testServer) testSubTaskRecover(c *C, s *Server, dir string) { func (t *testServer) testStopWorkerWhenLostConnect(c *C, s *Server, etcd *embed.Etcd) { etcd.Close() c.Assert(utils.WaitSomething(int(defaultKeepAliveTTL+3), time.Second, func() bool { - return s.getWorker(true) == nil + return s.getSourceWorker(true) == nil }), IsTrue) - c.Assert(s.getWorker(true), IsNil) + c.Assert(s.getSourceWorker(true), IsNil) } func (t *testServer) TestGetMinLocInAllSubTasks(c *C) { diff --git a/dm/dm/worker/source_worker_test.go b/dm/dm/worker/source_worker_test.go index 770b7ec7983..4768e24dd4d 100644 --- a/dm/dm/worker/source_worker_test.go +++ b/dm/dm/worker/source_worker_test.go @@ -197,11 +197,11 @@ func (t *testServer2) TestTaskAutoResume(c *C) { c.Assert(err, IsNil) subtaskCfg.Mode = "full" subtaskCfg.Timezone = "UTC" - c.Assert(s.getWorker(true).StartSubTask(&subtaskCfg, pb.Stage_Running, true), IsNil) + c.Assert(s.getSourceWorker(true).StartSubTask(&subtaskCfg, pb.Stage_Running, true), IsNil) // check task in paused state c.Assert(utils.WaitSomething(100, 100*time.Millisecond, func() 
bool { - subtaskStatus, _, _ := s.getWorker(true).QueryStatus(context.Background(), taskName) + subtaskStatus, _, _ := s.getSourceWorker(true).QueryStatus(context.Background(), taskName) for _, st := range subtaskStatus { if st.Name == taskName && st.Stage == pb.Stage_Paused { return true @@ -212,7 +212,7 @@ func (t *testServer2) TestTaskAutoResume(c *C) { //nolint:errcheck failpoint.Disable("github.com/pingcap/tiflow/dm/dumpling/dumpUnitProcessWithError") - rtsc, ok := s.getWorker(true).taskStatusChecker.(*realTaskStatusChecker) + rtsc, ok := s.getSourceWorker(true).taskStatusChecker.(*realTaskStatusChecker) c.Assert(ok, IsTrue) defer func() { // close multiple time @@ -222,7 +222,7 @@ func (t *testServer2) TestTaskAutoResume(c *C) { // check task will be auto resumed c.Assert(utils.WaitSomething(10, 100*time.Millisecond, func() bool { - sts, _, _ := s.getWorker(true).QueryStatus(context.Background(), taskName) + sts, _, _ := s.getSourceWorker(true).QueryStatus(context.Background(), taskName) for _, st := range sts { if st.Name == taskName && st.Stage == pb.Stage_Running { return true diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 3b8e8870c8d..7e53aa7c8c3 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -131,7 +131,7 @@ type Syncer struct { // this is used for some background tasks s.Run that need logger it share same lifecycle with runCtx. runTCtx *tcontext.Context // this ctx only used for logger. runCancel context.CancelFunc - rundone chan struct{} // this will be closed when syncer.Run exit + runWg sync.WaitGroup // control all goroutines in S.Run cfg *config.SubTaskConfig syncCfg replication.BinlogSyncerConfig @@ -144,7 +144,6 @@ type Syncer struct { binlogType BinlogType streamerController *StreamerController - wg sync.WaitGroup // counts goroutines jobWg sync.WaitGroup // counts ddl/flush/asyncFlush job in-flight in s.dmlJobCh and s.ddlJobCh schemaTracker *schema.Tracker @@ -285,11 +284,9 @@ func NewSyncer(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, relay rel } func (s *Syncer) newJobChans() { - s.closeJobChans() chanSize := calculateChanSize(s.cfg.QueueSize, s.cfg.WorkerCount, s.cfg.Compact) s.dmlJobCh = make(chan *job, chanSize) s.ddlJobCh = make(chan *job, s.cfg.QueueSize) - s.checkpointFlushWorker.input = make(chan *checkpointFlushTask, 16) s.jobsClosed.Store(false) } @@ -301,7 +298,6 @@ func (s *Syncer) closeJobChans() { } close(s.dmlJobCh) close(s.ddlJobCh) - s.checkpointFlushWorker.Close() s.jobsClosed.Store(true) } @@ -436,13 +432,6 @@ func (s *Syncer) Init(ctx context.Context) (err error) { } } } - s.checkpointFlushWorker = &checkpointFlushWorker{ - input: nil, // will be created in s.reset() - cp: s.checkpoint, - execError: &s.execError, - afterFlushFn: s.afterFlushCheckpoint, - addCountFunc: s.addCount, - } // when Init syncer, set active relay log info err = s.setInitActiveRelayLog(ctx) @@ -679,8 +668,6 @@ func (s *Syncer) Process(ctx context.Context, pr chan pb.ProcessResult) { // cancel goroutines created in s.Run cancel() } - s.closeJobChans() // Run returned, all jobs sent, we can close s.jobs - s.wg.Wait() // wait for sync goroutine to return close(runFatalChan) // Run returned, all potential fatal sent to s.runFatalChan wg.Wait() // wait for receive all fatal from s.runFatalChan @@ -1251,8 +1238,8 @@ func (s *Syncer) logAndClearFilteredStatistics() { } // DDL synced one by one, so we only need to process one DDL at a time. 
-func (s *Syncer) syncDDL(tctx *tcontext.Context, queueBucket string, db *dbconn.DBConn, ddlJobChan chan *job) { - defer s.wg.Done() +func (s *Syncer) syncDDL(queueBucket string, db *dbconn.DBConn, ddlJobChan chan *job) { + defer s.runWg.Done() var err error for { @@ -1276,12 +1263,12 @@ func (s *Syncer) syncDDL(tctx *tcontext.Context, queueBucket string, db *dbconn. shardPessimistOp = s.pessimist.PendingOperation() if shardPessimistOp != nil && !shardPessimistOp.Exec { ignore = true - tctx.L().Info("ignore shard DDLs in pessimistic shard mode", zap.Strings("ddls", ddlJob.ddls)) + s.tctx.L().Info("ignore shard DDLs in pessimistic shard mode", zap.Strings("ddls", ddlJob.ddls)) } case config.ShardOptimistic: if len(ddlJob.ddls) == 0 { ignore = true - tctx.L().Info("ignore shard DDLs in optimistic mode", zap.Stringer("info", s.optimist.PendingInfo())) + s.tctx.L().Info("ignore shard DDLs in optimistic mode", zap.Stringer("info", s.optimist.PendingInfo())) } } @@ -1293,9 +1280,9 @@ func (s *Syncer) syncDDL(tctx *tcontext.Context, queueBucket string, db *dbconn. if !ignore { var affected int - affected, err = db.ExecuteSQLWithIgnore(tctx, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) + affected, err = db.ExecuteSQLWithIgnore(s.runTCtx, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) if err != nil { - err = s.handleSpecialDDLError(tctx, err, ddlJob.ddls, affected, db) + err = s.handleSpecialDDLError(s.runTCtx, err, ddlJob.ddls, affected, db) err = terror.WithScope(err, terror.ScopeDownstream) } } @@ -1329,7 +1316,7 @@ func (s *Syncer) syncDDL(tctx *tcontext.Context, queueBucket string, db *dbconn. switch { case shardInfo == nil: // no need to do the shard DDL handle for `CREATE DATABASE/TABLE` now. - tctx.L().Warn("skip shard DDL handle in pessimistic shard mode", zap.Strings("ddl", ddlJob.ddls)) + s.tctx.L().Warn("skip shard DDL handle in pessimistic shard mode", zap.Strings("ddl", ddlJob.ddls)) case shardPessimistOp == nil: err = terror.ErrWorkerDDLLockOpNotFound.Generate(shardInfo) default: @@ -1342,7 +1329,7 @@ func (s *Syncer) syncDDL(tctx *tcontext.Context, queueBucket string, db *dbconn. // no need to do the shard DDL handle for `DROP DATABASE/TABLE` now. // but for `CREATE DATABASE` and `ALTER DATABASE` we execute it to the downstream directly without `shardInfo`. if ignore { // actually ignored. - tctx.L().Warn("skip shard DDL handle in optimistic shard mode", zap.Strings("ddl", ddlJob.ddls)) + s.tctx.L().Warn("skip shard DDL handle in optimistic shard mode", zap.Strings("ddl", ddlJob.ddls)) } case s.optimist.PendingOperation() == nil: err = terror.ErrWorkerDDLLockOpNotFound.Generate(shardInfo) @@ -1401,7 +1388,7 @@ func (s *Syncer) fatalFunc(job *job, err error) { // DML synced with causality. 
func (s *Syncer) syncDML() { - defer s.wg.Done() + defer s.runWg.Done() dmlJobCh := s.dmlJobCh if s.cfg.Compact { @@ -1415,7 +1402,8 @@ func (s *Syncer) syncDML() { } } -func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { +func (s *Syncer) waitBeforeRunExit(ctx context.Context) { + defer s.runWg.Done() failpoint.Inject("checkCheckpointInMiddleOfTransaction", func() { s.tctx.L().Info("incr maxPauseOrStopWaitTime time ") maxPauseOrStopWaitTime = time.Minute * 10 @@ -1436,7 +1424,7 @@ func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { s.waitTransactionLock.Unlock() select { case <-s.runCtx.Done(): - s.tctx.L().Info("run ctx done by waitComplete") + s.tctx.L().Info("syncer run exit so runCtx done") case <-time.After(maxPauseOrStopWaitTime): s.tctx.L().Info("wait transaction end timeout, exit now") s.runCancel() @@ -1448,7 +1436,7 @@ func (s *Syncer) waitTransactionEndBeforeExit(ctx context.Context) { } func (s *Syncer) updateTSOffsetCronJob(ctx context.Context) { - defer s.wg.Done() + defer s.runWg.Done() // temporarily hard code there. if this metrics works well add this to config file. ticker := time.NewTicker(time.Minute * 10) defer ticker.Stop() @@ -1465,7 +1453,7 @@ func (s *Syncer) updateTSOffsetCronJob(ctx context.Context) { } func (s *Syncer) updateLagCronJob(ctx context.Context) { - defer s.wg.Done() + defer s.runWg.Done() // temporarily hard code there. if this metrics works well add this to config file. ticker := time.NewTicker(time.Millisecond * 100) defer ticker.Stop() @@ -1491,20 +1479,37 @@ func (s *Syncer) updateTSOffset(ctx context.Context) error { // Run starts running for sync, we should guarantee it can rerun when paused. func (s *Syncer) Run(ctx context.Context) (err error) { - runCtx, runCancel := context.WithCancel(context.Background()) - runTCtx := tcontext.NewContext(runCtx, s.tctx.L()) s.Lock() - s.runCtx = runCtx - s.runTCtx = runTCtx - s.runCancel = runCancel - s.rundone = make(chan struct{}) + s.runCtx, s.runCancel = context.WithCancel(context.Background()) + s.runTCtx = tcontext.NewContext(s.runCtx, s.tctx.L()) + s.checkpointFlushWorker = &checkpointFlushWorker{ + input: make(chan *checkpointFlushTask, 16), + cp: s.checkpoint, + execError: &s.execError, + afterFlushFn: s.afterFlushCheckpoint, + addCountFunc: s.addCount, + } s.Unlock() defer func() { - // close rundone to notify other goroutine that syncer.Run is exited - close(s.rundone) - runCancel() + s.tctx.L().Info("in .... 
syncer run exit...........................................") + s.runCancel() + s.closeJobChans() + s.checkpointFlushWorker.Close() + s.runWg.Wait() // wait for all syncer.Run goroutine exit + s.tctx.L().Info("syncer run exit...........................................") + }() + + // we should start this goroutine as soon as possible, because it's the only goroutine that cancel syncer.Run + s.runWg.Add(1) + go func() { + s.waitBeforeRunExit(ctx) }() + // before sync run, we get the ts offset from upstream first + if utErr := s.updateTSOffset(ctx); utErr != nil { + return utErr + } + // some initialization that can't be put in Syncer.Init fresh, err := s.IsFreshTask(s.runCtx) if err != nil { @@ -1516,8 +1521,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { return err } } - // we should start this goroutine as soon as possible, because it's the only goroutine that cancel syncer.Run - go s.waitTransactionEndBeforeExit(ctx) var ( flushCheckpoint bool @@ -1539,13 +1542,23 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } else { cleanDumpFile = false } + + s.runWg.Add(1) + // start syncDML worker first because if there is no dml worker checkpointFlushWorker will blocked forever + go s.syncDML() + s.runWg.Add(1) // start flush checkpoints worker. this worker mast start before s.flushCheckPoints() - s.wg.Add(1) go func() { - defer s.wg.Done() + defer s.runWg.Done() + // also need to use a ctx different s.runCtx, checkpointFlushWorker worker will closed in the first defer s.checkpointFlushWorker.Run(s.tctx) }() - + s.runWg.Add(1) + go s.syncDDL(adminQueueName, s.ddlDBConn, s.ddlJobCh) + s.runWg.Add(1) + go s.updateLagCronJob(s.runCtx) + s.runWg.Add(1) + go s.updateTSOffsetCronJob(s.runCtx) if flushCheckpoint { if err = s.flushCheckPoints(); err != nil { s.tctx.L().Warn("fail to flush checkpoints when starting task", zap.Error(err)) @@ -1570,11 +1583,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { utils.OsExit(1) }) - // before sync run, we get the ts offset from upstream first - if utErr := s.updateTSOffset(ctx); utErr != nil { - return utErr - } - // startLocation is the start location for current received event // currentLocation is the end location for current received event (End_log_pos in `show binlog events` for mysql) // lastLocation is the end location for last received (ROTATE / QUERY / XID) event @@ -1597,20 +1605,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { return terror.Annotate(err, "fail to restart streamer controller") } } - - // all pre check passed, we can start all background jobs, they all use s.runCancel to cancel - s.wg.Add(1) - go s.updateTSOffsetCronJob(s.runCtx) - - s.wg.Add(1) - go s.syncDML() - - s.wg.Add(1) - go s.syncDDL(s.runTCtx, adminQueueName, s.ddlDBConn, s.ddlJobCh) - - s.wg.Add(1) - go s.updateLagCronJob(s.runCtx) - // syncing progress with sharding DDL group // 1. use the global streamer to sync regular binlog events // 2. 
sharding DDL synced for some sharding groups @@ -1628,6 +1622,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { traceSource = fmt.Sprintf("%s.syncer.%s", s.cfg.SourceID, s.cfg.Name) ) + // this is second defer func in syncer.Run so in this time checkpointFlushWorker are still running defer func() { if err1 := recover(); err1 != nil { failpoint.Inject("ExitAfterSaveOnlineDDL", func() { @@ -3361,7 +3356,7 @@ func (s *Syncer) Close() { if s.isClosed() { return } - s.stopSync() + s.stopStreamAndRollbackCheckpoint() s.closeDBs() s.checkpoint.Close() if err := s.schemaTracker.Close(); err != nil { @@ -3384,21 +3379,13 @@ func (s *Syncer) Kill() { s.Close() } -// stopSync stops syncing and rollback checkpoint now it used by Close,kILL and Pause -// maybe we can refine the workflow more clear. -func (s *Syncer) stopSync() { - // wait for s.Run() to finish - if s.rundone != nil { - <-s.rundone - } - s.closeJobChans() - s.wg.Wait() // wait job workers to return - +// stopStreamAndRollbackCheckpoint stop stream and rollback checkpoint now it used by Close() and Pause(). +func (s *Syncer) stopStreamAndRollbackCheckpoint() { // before re-write workflow for s.syncer, simply close it // when resuming, re-create s.syncer if s.streamerController != nil { - s.streamerController.Close(s.runTCtx) + s.streamerController.Close(s.tctx) } // try to rollback checkpoints, if they already flushed, no effect, this operation should call before close schemaTracker @@ -3423,7 +3410,7 @@ func (s *Syncer) Pause() { s.tctx.L().Warn("try to pause, but already closed") return } - s.stopSync() + s.stopStreamAndRollbackCheckpoint() } // Resume resumes the paused process. diff --git a/dm/syncer/syncer_test.go b/dm/syncer/syncer_test.go index c4347e18ea6..6dac7315648 100644 --- a/dm/syncer/syncer_test.go +++ b/dm/syncer/syncer_test.go @@ -741,10 +741,10 @@ func (s *testSyncerSuite) TestcheckpointID(c *C) { func (s *testSyncerSuite) TestRun(c *C) { // 1. run syncer with column mapping - // 2. execute some sqls which will trigger casuality + // 2. execute some sqls which will trigger causality // 3. check the generated jobs // 4. update config, add route rules, and update syncer - // 5. execute somes sqls and then check jobs generated + // 5. execute some sqls and then check jobs generated db, mock, err := sqlmock.New() c.Assert(err, IsNil) From 06d4363ee09f63e9f69d69c02236c0bf12c5d0d1 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 21 Jan 2022 14:19:29 +0800 Subject: [PATCH 28/47] fix comments --- dm/syncer/dml_worker.go | 2 +- dm/syncer/syncer.go | 22 ++++++++++++---------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index edbcf4bd1f3..7dfcccc2254 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -75,7 +75,7 @@ func dmlWorkerWrap(inCh chan *job, syncer *Syncer) chan *job { fatalFunc: syncer.fatalFunc, lagFunc: syncer.updateReplicationJobTS, addCountFunc: syncer.addCount, - runTCtx: syncer.runTCtx, + runTCtx: syncer.runSyncTCtx, toDBConns: syncer.toDBConns, inCh: inCh, flushCh: make(chan *job), diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 5520cdefe99..fa80e0afca8 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -126,12 +126,14 @@ type Syncer struct { tctx *tcontext.Context // this ctx only used for logger. 
// this ctx is a background ctx and was initialized in s.Run, it is used for some background tasks in s.Run - // when this ctx cancelled, syncer will shuts down all background running jobs and not wait transaction end + // when this ctx cancelled, syncer will shutdown all background running jobs and not wait transaction end runCtx context.Context // this is used for some background tasks s.Run that need logger it share same lifecycle with runCtx. - runTCtx *tcontext.Context // this ctx only used for logger. - runCancel context.CancelFunc - runWg sync.WaitGroup // control all goroutines in S.Run + runTCtx *tcontext.Context + runCancel context.CancelFunc + runSyncTCtx *tcontext.Context // this ctx only used for syncDML and syncDDL and only cancelled when ungraceful stop + runSyncCancel context.CancelFunc + runWg sync.WaitGroup // control all goroutines started in S.Run cfg *config.SubTaskConfig syncCfg replication.BinlogSyncerConfig @@ -1284,9 +1286,9 @@ func (s *Syncer) syncDDL(queueBucket string, db *dbconn.DBConn, ddlJobChan chan if !ignore { var affected int - affected, err = db.ExecuteSQLWithIgnore(s.runTCtx, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) + affected, err = db.ExecuteSQLWithIgnore(s.runSyncTCtx, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) if err != nil { - err = s.handleSpecialDDLError(s.runTCtx, err, ddlJob.ddls, affected, db) + err = s.handleSpecialDDLError(s.runSyncTCtx, err, ddlJob.ddls, affected, db) err = terror.WithScope(err, terror.ScopeDownstream) } } @@ -1435,7 +1437,6 @@ func (s *Syncer) waitBeforeRunExit(ctx context.Context) { } case <-s.runCtx.Done(): // when no graceful stop, run ctx will canceled first. s.tctx.L().Info("received ungraceful exit ctx, exit now") - s.runCancel() } } @@ -1486,6 +1487,8 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.Lock() s.runCtx, s.runCancel = context.WithCancel(context.Background()) s.runTCtx = tcontext.NewContext(s.runCtx, s.tctx.L()) + syncCtx, syncCancel := context.WithCancel(context.Background()) + s.runSyncTCtx, s.runSyncCancel = tcontext.NewContext(syncCtx, s.tctx.L()), syncCancel s.checkpointFlushWorker = &checkpointFlushWorker{ input: make(chan *checkpointFlushTask, 16), cp: s.checkpoint, @@ -1495,12 +1498,10 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } s.Unlock() defer func() { - s.tctx.L().Info("in .... syncer run exit...........................................") s.runCancel() s.closeJobChans() s.checkpointFlushWorker.Close() - s.runWg.Wait() // wait for all syncer.Run goroutine exit - s.tctx.L().Info("syncer run exit...........................................") + s.runWg.Wait() }() // we should start this goroutine as soon as possible, because it's the only goroutine that cancel syncer.Run @@ -3382,6 +3383,7 @@ func (s *Syncer) Close() { func (s *Syncer) Kill() { s.tctx.L().Warn("kill syncer without graceful") s.runCancel() + s.runSyncCancel() s.Close() } From 7a39654024bcd27cd9179faaba68603a09c52a8b Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 25 Jan 2022 10:52:16 +0800 Subject: [PATCH 29/47] merge one ctx --- dm/dm/worker/server.go | 10 ++++---- dm/syncer/syncer.go | 56 +++++++++++++++++++++--------------------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index ff7590fd3b9..2b3a0e77c3e 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -458,7 +458,7 @@ func (s *Server) doClose() { // Close close the RPC server, this function can be called multiple times. 
func (s *Server) Close() { - s.doClose() // we should stop current sync first, other wise master may scheduler task on new worker while we are closing + s.doClose() // we should stop current sync first, otherwise master may schedule task on new worker while we are closing s.stopKeepAlive() s.etcdClient.Close() } @@ -520,18 +520,18 @@ func (s *Server) stopSourceWorker(sourceID string, needLock, graceful bool) erro s.Lock() defer s.Unlock() } - sw := s.getSourceWorker(false) - if sw == nil { + w := s.getSourceWorker(false) + if w == nil { log.L().Warn("worker has not been started, no need to stop", zap.String("source", sourceID)) return nil // no need to stop because not started yet } - if sourceID != "" && sw.cfg.SourceID != sourceID { + if sourceID != "" && w.cfg.SourceID != sourceID { return terror.ErrWorkerSourceNotMatch } s.UpdateKeepAliveTTL(s.cfg.KeepAliveTTL) s.setWorker(nil, false) s.setSourceStatus("", nil, false) - sw.Stop(graceful) + w.Stop(graceful) return nil } diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index fa80e0afca8..26eed10567a 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -125,15 +125,15 @@ type Syncer struct { tctx *tcontext.Context // this ctx only used for logger. - // this ctx is a background ctx and was initialized in s.Run, it is used for some background tasks in s.Run + // this ctx derives from a background ctx and was initialized in s.Run, it is used for some background tasks in s.Run // when this ctx cancelled, syncer will shutdown all background running jobs and not wait transaction end - runCtx context.Context - // this is used for some background tasks s.Run that need logger it share same lifecycle with runCtx. - runTCtx *tcontext.Context - runCancel context.CancelFunc - runSyncTCtx *tcontext.Context // this ctx only used for syncDML and syncDDL and only cancelled when ungraceful stop + runCtx *tcontext.Context + runCancel context.CancelFunc + // this ctx only used for syncDML and syncDDL and only cancelled when ungraceful stop + runSyncTCtx *tcontext.Context runSyncCancel context.CancelFunc - runWg sync.WaitGroup // control all goroutines started in S.Run + // control all goroutines that started in S.Run + runWg sync.WaitGroup cfg *config.SubTaskConfig syncCfg replication.BinlogSyncerConfig @@ -1429,13 +1429,13 @@ func (s *Syncer) waitBeforeRunExit(ctx context.Context) { s.waitXIDJob.Store(int64(waiting)) s.waitTransactionLock.Unlock() select { - case <-s.runCtx.Done(): + case <-s.runCtx.Ctx.Done(): s.tctx.L().Info("syncer run exit so runCtx done") case <-time.After(maxPauseOrStopWaitTime): s.tctx.L().Info("wait transaction end timeout, exit now") s.runCancel() } - case <-s.runCtx.Done(): // when no graceful stop, run ctx will canceled first. + case <-s.runCtx.Ctx.Done(): // when no graceful stop, run ctx will canceled first. s.tctx.L().Info("received ungraceful exit ctx, exit now") } } @@ -1485,8 +1485,8 @@ func (s *Syncer) updateTSOffset(ctx context.Context) error { // Run starts running for sync, we should guarantee it can rerun when paused. 
func (s *Syncer) Run(ctx context.Context) (err error) { s.Lock() - s.runCtx, s.runCancel = context.WithCancel(context.Background()) - s.runTCtx = tcontext.NewContext(s.runCtx, s.tctx.L()) + runCtx, runCancel := context.WithCancel(context.Background()) + s.runCtx, s.runCancel = tcontext.NewContext(runCtx, s.tctx.L()), runCancel syncCtx, syncCancel := context.WithCancel(context.Background()) s.runSyncTCtx, s.runSyncCancel = tcontext.NewContext(syncCtx, s.tctx.L()), syncCancel s.checkpointFlushWorker = &checkpointFlushWorker{ @@ -1516,7 +1516,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } // some initialization that can't be put in Syncer.Init - fresh, err := s.IsFreshTask(s.runCtx) + fresh, err := s.IsFreshTask(s.runCtx.Ctx) if err != nil { return err } else if fresh { @@ -1532,7 +1532,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { delLoadTask bool cleanDumpFile = s.cfg.CleanDumpFile ) - flushCheckpoint, err = s.adjustGlobalPointGTID(s.runTCtx) + flushCheckpoint, err = s.adjustGlobalPointGTID(s.runCtx) if err != nil { return err } @@ -1552,18 +1552,18 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // start syncDML worker first because if there is no dml worker checkpointFlushWorker will blocked forever go s.syncDML() s.runWg.Add(1) - // start flush checkpoints worker. this worker mast start before s.flushCheckPoints() + // start flush checkpoints worker. this worker must start before s.flushCheckPoints() go func() { defer s.runWg.Done() - // also need to use a ctx different s.runCtx, checkpointFlushWorker worker will closed in the first defer + // also need to use a ctx different s.runCtx, checkpointFlushWorker worker will be closed in the first defer s.checkpointFlushWorker.Run(s.tctx) }() s.runWg.Add(1) go s.syncDDL(adminQueueName, s.ddlDBConn, s.ddlJobCh) s.runWg.Add(1) - go s.updateLagCronJob(s.runCtx) + go s.updateLagCronJob(s.runCtx.Ctx) s.runWg.Add(1) - go s.updateTSOffsetCronJob(s.runCtx) + go s.updateTSOffsetCronJob(s.runCtx.Ctx) if flushCheckpoint { if err = s.flushCheckPoints(); err != nil { s.tctx.L().Warn("fail to flush checkpoints when starting task", zap.Error(err)) @@ -1605,7 +1605,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { if s.streamerController.IsClosed() { s.locations.reset(lastLocation) - err = s.streamerController.Start(s.runTCtx, lastLocation) + err = s.streamerController.Start(s.runCtx, lastLocation) if err != nil { return terror.Annotate(err, "fail to restart streamer controller") } @@ -1676,7 +1676,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // if we start syncer at an early position, database must bear a period of inconsistent state, // it's eventual consistency. 
s.safeMode = sm.NewSafeMode() - s.enableSafeModeInitializationPhase(s.runTCtx) + s.enableSafeModeInitializationPhase(s.runCtx) closeShardingResync := func() error { if shardingReSync == nil { @@ -1715,7 +1715,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } for i := 0; i < n; { - e, err1 := s.getEvent(s.runTCtx, currentLocation) + e, err1 := s.getEvent(s.runCtx, currentLocation) if err1 != nil { return err } @@ -1766,7 +1766,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // if suffix>0, we are replacing error s.isReplacingOrInjectingErr = currentLocation.Suffix != 0 s.locations.reset(shardingReSync.currLocation) - err = s.streamerController.RedirectStreamer(s.runTCtx, shardingReSync.currLocation) + err = s.streamerController.RedirectStreamer(s.runCtx, shardingReSync.currLocation) if err != nil { return err } @@ -1780,7 +1780,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { var e *replication.BinlogEvent startTime := time.Now() - e, err = s.getEvent(s.runTCtx, currentLocation) + e, err = s.getEvent(s.runCtx, currentLocation) s.tctx.L().Debug("location refactor", zap.Stringer("current", currentLocation), zap.Stringer("start", startLocation), @@ -1847,7 +1847,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // try to re-sync in gtid mode if tryReSync && s.cfg.EnableGTID && utils.IsErrBinlogPurged(err) && s.cfg.AutoFixGTID { time.Sleep(retryTimeout) - err = s.reSyncBinlog(*s.runTCtx, lastLocation) + err = s.reSyncBinlog(*s.runCtx, lastLocation) if err != nil { return err } @@ -1968,9 +1968,9 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.errOperatorHolder.SetHasAllInjected(startLocation) // reset event as startLocation, avoid to be marked in checkpoint currentLocation.Position.Pos = startLocation.Position.Pos - err = s.streamerController.RedirectStreamer(s.runTCtx, startLocation) + err = s.streamerController.RedirectStreamer(s.runCtx, startLocation) } else { - err = s.streamerController.RedirectStreamer(s.runTCtx, currentLocation) + err = s.streamerController.RedirectStreamer(s.runCtx, currentLocation) } if err != nil { return err @@ -1990,17 +1990,17 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // 2. push forward and replicate some sqls after safeModeExitPoint to downstream // 3. quit because of network error, fail to flush global checkpoint and new safeModeExitPoint to downstream // 4. 
restart again, quit safe mode at safeModeExitPoint, but some sqls after this location have already been replicated to the downstream - if err = s.checkpoint.FlushSafeModeExitPoint(s.runTCtx); err != nil { + if err = s.checkpoint.FlushSafeModeExitPoint(s.runCtx); err != nil { return err } - if err = s.safeMode.Add(s.runTCtx, -1); err != nil { + if err = s.safeMode.Add(s.runCtx, -1); err != nil { return err } } } ec := eventContext{ - tctx: s.runTCtx, + tctx: s.runCtx, header: e.Header, startLocation: &startLocation, currentLocation: ¤tLocation, From a73aa0be1dccbcde85300a3c4a6e3a6a71e216a9 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 25 Jan 2022 11:37:33 +0800 Subject: [PATCH 30/47] rename ctx --- dm/syncer/dml_worker.go | 6 +++--- dm/syncer/syncer.go | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index 7dfcccc2254..877a4c64878 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -36,7 +36,7 @@ type DMLWorker struct { chanSize int multipleRows bool toDBConns []*dbconn.DBConn - runTCtx *tcontext.Context + syncCtx *tcontext.Context logger log.Logger // for metrics @@ -75,7 +75,7 @@ func dmlWorkerWrap(inCh chan *job, syncer *Syncer) chan *job { fatalFunc: syncer.fatalFunc, lagFunc: syncer.updateReplicationJobTS, addCountFunc: syncer.addCount, - runTCtx: syncer.runSyncTCtx, + syncCtx: syncer.syncTCtx, // this ctx can be used to cancel all the workers toDBConns: syncer.toDBConns, inCh: inCh, flushCh: make(chan *job), @@ -236,7 +236,7 @@ func (w *DMLWorker) executeBatchJobs(queueID int, jobs []*job) { time.Sleep(time.Duration(t) * time.Second) }) // use background context to execute sqls as much as possible - ctx, cancel := w.runTCtx.WithTimeout(maxDMLExecutionDuration) + ctx, cancel := w.syncCtx.WithTimeout(maxDMLExecutionDuration) defer cancel() affect, err = db.ExecuteSQL(ctx, queries, args...) 
failpoint.Inject("SafeModeExit", func(val failpoint.Value) { diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 26eed10567a..a8fb03ea005 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -130,8 +130,8 @@ type Syncer struct { runCtx *tcontext.Context runCancel context.CancelFunc // this ctx only used for syncDML and syncDDL and only cancelled when ungraceful stop - runSyncTCtx *tcontext.Context - runSyncCancel context.CancelFunc + syncTCtx *tcontext.Context + syncCancel context.CancelFunc // control all goroutines that started in S.Run runWg sync.WaitGroup @@ -1286,9 +1286,9 @@ func (s *Syncer) syncDDL(queueBucket string, db *dbconn.DBConn, ddlJobChan chan if !ignore { var affected int - affected, err = db.ExecuteSQLWithIgnore(s.runSyncTCtx, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) + affected, err = db.ExecuteSQLWithIgnore(s.syncTCtx, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) if err != nil { - err = s.handleSpecialDDLError(s.runSyncTCtx, err, ddlJob.ddls, affected, db) + err = s.handleSpecialDDLError(s.syncTCtx, err, ddlJob.ddls, affected, db) err = terror.WithScope(err, terror.ScopeDownstream) } } @@ -1488,7 +1488,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { runCtx, runCancel := context.WithCancel(context.Background()) s.runCtx, s.runCancel = tcontext.NewContext(runCtx, s.tctx.L()), runCancel syncCtx, syncCancel := context.WithCancel(context.Background()) - s.runSyncTCtx, s.runSyncCancel = tcontext.NewContext(syncCtx, s.tctx.L()), syncCancel + s.syncTCtx, s.syncCancel = tcontext.NewContext(syncCtx, s.tctx.L()), syncCancel s.checkpointFlushWorker = &checkpointFlushWorker{ input: make(chan *checkpointFlushTask, 16), cp: s.checkpoint, @@ -3383,7 +3383,7 @@ func (s *Syncer) Close() { func (s *Syncer) Kill() { s.tctx.L().Warn("kill syncer without graceful") s.runCancel() - s.runSyncCancel() + s.syncCancel() s.Close() } From 705c228dd22049a3499c4171ff5ddbbf01f12b10 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Wed, 26 Jan 2022 14:33:00 +0800 Subject: [PATCH 31/47] address comments --- dm/syncer/dml_worker.go | 2 +- dm/syncer/syncer.go | 21 ++++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index 877a4c64878..4bc942b3db6 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -75,7 +75,7 @@ func dmlWorkerWrap(inCh chan *job, syncer *Syncer) chan *job { fatalFunc: syncer.fatalFunc, lagFunc: syncer.updateReplicationJobTS, addCountFunc: syncer.addCount, - syncCtx: syncer.syncTCtx, // this ctx can be used to cancel all the workers + syncCtx: syncer.syncCtx, // this ctx can be used to cancel all the workers toDBConns: syncer.toDBConns, inCh: inCh, flushCh: make(chan *job), diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index a8fb03ea005..22bef2ab4b2 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -126,11 +126,11 @@ type Syncer struct { tctx *tcontext.Context // this ctx only used for logger. // this ctx derives from a background ctx and was initialized in s.Run, it is used for some background tasks in s.Run - // when this ctx cancelled, syncer will shutdown all background running jobs and not wait transaction end + // when this ctx cancelled, syncer will shutdown all background running jobs (except the syncDML and syncDDL) and not wait transaction end. 
runCtx *tcontext.Context runCancel context.CancelFunc - // this ctx only used for syncDML and syncDDL and only cancelled when ungraceful stop - syncTCtx *tcontext.Context + // this ctx only used for syncDML and syncDDL and only cancelled when ungraceful stop. + syncCtx *tcontext.Context syncCancel context.CancelFunc // control all goroutines that started in S.Run runWg sync.WaitGroup @@ -1286,9 +1286,9 @@ func (s *Syncer) syncDDL(queueBucket string, db *dbconn.DBConn, ddlJobChan chan if !ignore { var affected int - affected, err = db.ExecuteSQLWithIgnore(s.syncTCtx, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) + affected, err = db.ExecuteSQLWithIgnore(s.syncCtx, errorutil.IsIgnorableMySQLDDLError, ddlJob.ddls) if err != nil { - err = s.handleSpecialDDLError(s.syncTCtx, err, ddlJob.ddls, affected, db) + err = s.handleSpecialDDLError(s.syncCtx, err, ddlJob.ddls, affected, db) err = terror.WithScope(err, terror.ScopeDownstream) } } @@ -1488,7 +1488,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { runCtx, runCancel := context.WithCancel(context.Background()) s.runCtx, s.runCancel = tcontext.NewContext(runCtx, s.tctx.L()), runCancel syncCtx, syncCancel := context.WithCancel(context.Background()) - s.syncTCtx, s.syncCancel = tcontext.NewContext(syncCtx, s.tctx.L()), syncCancel + s.syncCtx, s.syncCancel = tcontext.NewContext(syncCtx, s.tctx.L()), syncCancel s.checkpointFlushWorker = &checkpointFlushWorker{ input: make(chan *checkpointFlushTask, 16), cp: s.checkpoint, @@ -1549,7 +1549,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { } s.runWg.Add(1) - // start syncDML worker first because if there is no dml worker checkpointFlushWorker will blocked forever go s.syncDML() s.runWg.Add(1) // start flush checkpoints worker. this worker must start before s.flushCheckPoints() @@ -3363,7 +3362,7 @@ func (s *Syncer) Close() { if s.isClosed() { return } - s.stopStreamAndRollbackCheckpoint() + s.stopSync() s.closeDBs() s.checkpoint.Close() if err := s.schemaTracker.Close(); err != nil { @@ -3387,8 +3386,8 @@ func (s *Syncer) Kill() { s.Close() } -// stopStreamAndRollbackCheckpoint stop stream and rollback checkpoint now it used by Close() and Pause(). -func (s *Syncer) stopStreamAndRollbackCheckpoint() { +// stopSync stops stream and rollbacks checkpoint now it used by Close() and Pause(). +func (s *Syncer) stopSync() { // before re-write workflow for s.syncer, simply close it // when resuming, re-create s.syncer @@ -3418,7 +3417,7 @@ func (s *Syncer) Pause() { s.tctx.L().Warn("try to pause, but already closed") return } - s.stopStreamAndRollbackCheckpoint() + s.stopSync() } // Resume resumes the paused process. 
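Note on the two-context pattern settled by the patches above: runCtx guards the background goroutines started in Run and is cancelled on any stop, while syncCtx guards syncDML/syncDDL and is cancelled only on an ungraceful stop, so a graceful stop can let in-flight statements finish. A minimal, self-contained sketch of that pattern follows; every name in it is illustrative and none of it is code from this series.

package main

import (
	"context"
	"fmt"
	"time"
)

// twoCtxWorker mimics the split: bgCtx covers background helpers and is always
// cancelled on stop; execCtx covers statement execution and is cancelled only
// when the stop is ungraceful. Illustrative sketch only.
type twoCtxWorker struct {
	bgCtx      context.Context
	bgCancel   context.CancelFunc
	execCtx    context.Context
	execCancel context.CancelFunc
}

func newTwoCtxWorker() *twoCtxWorker {
	bgCtx, bgCancel := context.WithCancel(context.Background())
	execCtx, execCancel := context.WithCancel(context.Background())
	return &twoCtxWorker{bgCtx: bgCtx, bgCancel: bgCancel, execCtx: execCtx, execCancel: execCancel}
}

// stop always cancels the background context, and cancels the execution
// context only for an ungraceful stop.
func (w *twoCtxWorker) stop(graceful bool) {
	w.bgCancel()
	if !graceful {
		w.execCancel()
	}
}

func main() {
	w := newTwoCtxWorker()
	done := make(chan struct{})
	go func() {
		defer close(done)
		select {
		case <-w.execCtx.Done():
			fmt.Println("statement aborted: ungraceful stop")
		case <-time.After(50 * time.Millisecond):
			fmt.Println("statement finished: graceful stop")
		}
	}()
	w.stop(true) // graceful: execCtx stays alive, the in-flight statement can finish
	<-done
}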
From e881437b628ee2de07d95be1e6bd6d327b224d75 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 28 Jan 2022 10:04:10 +0800 Subject: [PATCH 32/47] fix shell && revert some logic of stop worker --- dm/dm/worker/server.go | 3 ++- .../checkpoint_transaction/conf/dm-worker1.toml | 1 - dm/tests/checkpoint_transaction/run.sh | 16 ++++------------ dm/tests/new_relay/run.sh | 10 +++++----- 4 files changed, 11 insertions(+), 19 deletions(-) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index 2b3a0e77c3e..d3e2c9e9b38 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -447,11 +447,12 @@ func (s *Server) doClose() { if s.closed.Load() { return } + // stop server in advance, stop receiving source bound and relay bound + s.cancel() // stop worker and wait for return(we already lock the whole Sever, so no need use lock to get source worker) if w := s.getSourceWorker(false); w != nil { w.Stop(true) } - s.cancel() s.wg.Wait() s.closed.Store(true) } diff --git a/dm/tests/checkpoint_transaction/conf/dm-worker1.toml b/dm/tests/checkpoint_transaction/conf/dm-worker1.toml index 9b8ecaf991c..3d99321d632 100644 --- a/dm/tests/checkpoint_transaction/conf/dm-worker1.toml +++ b/dm/tests/checkpoint_transaction/conf/dm-worker1.toml @@ -1,5 +1,4 @@ join = "127.0.0.1:8261" keepalive-ttl = 1 -log-level = "info" name = "worker1" worker-addr = "127.0.0.1:8262" diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 61eaee06efd..7a47c4165f7 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -7,8 +7,6 @@ source $cur/../_utils/test_prepare WORK_DIR=$TEST_DIR/$TEST_NAME function check_worker_ungraceful_stop_with_retry() { - all_matched=false - for ((k = 0; k < 10; k++)); do sleep 1 echo "start check_worker_ungraceful_stop_with_retry times: $k" @@ -25,18 +23,12 @@ function check_worker_ungraceful_stop_with_retry() { if [ $num -lt 1 ]; then continue fi - all_matched=true - if $all_matched; then - echo "check_worker_ungraceful_stop_with_retry success after retry: $k" - break - fi + echo "check_worker_ungraceful_stop_with_retry success after retry: $k" + return 0 done - if ! 
$all_matched; then - echo "check_worker_ungraceful_stop_with_retry failed after retry" - exit 1 - fi - + echo "check_worker_ungraceful_stop_with_retry failed after retry" + exit 1 } function run() { diff --git a/dm/tests/new_relay/run.sh b/dm/tests/new_relay/run.sh index 095136b8d27..512819f3504 100755 --- a/dm/tests/new_relay/run.sh +++ b/dm/tests/new_relay/run.sh @@ -90,7 +90,7 @@ function test_restart_relay_status() { "relay" 1 \ "bound" 2 - echo "》》》》》》》》》》》》》》》》》test test_restart_relay_status passed" + echo ">>>>>>>>>>>>>>>>>>>>>>>>>>test test_restart_relay_status passed" } function test_cant_dail_upstream() { @@ -124,7 +124,7 @@ function test_cant_dail_upstream() { "query-status -s $SOURCE_ID1" \ "injected error" 1 - echo "》》》》》》》》》》》》》》》》》test test_cant_dail_upstream passed" + echo ">>>>>>>>>>>>>>>>>>>>>>>>>>test test_cant_dail_upstream passed" } function test_cant_dail_downstream() { @@ -164,7 +164,7 @@ function test_cant_dail_downstream() { run_tidb_server 4000 $TIDB_PASSWORD sleep 2 - echo "》》》》》》》》》》》》》》》》》test test_cant_dail_downstream passed" + echo ">>>>>>>>>>>>>>>>>>>>>>>>>>test test_cant_dail_downstream passed" } function test_kill_dump_connection() { @@ -201,7 +201,7 @@ function test_kill_dump_connection() { run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "query-status -s $SOURCE_ID1" \ "\"relayCatchUpMaster\": true" 1 - echo "》》》》》》》》》》》》》》》》》test test_kill_dump_connection passed" + echo ">>>>>>>>>>>>>>>>>>>>>>>>>>test test_kill_dump_connection passed" } function test_relay_operations() { @@ -357,7 +357,7 @@ function test_relay_operations() { "\"result\": true" 2 check_sync_diff $WORK_DIR $cur/conf/diff_config.toml - echo "》》》》》》》》》》》》》》》》》test test_relay_operations passed" + echo ">>>>>>>>>>>>>>>>>>>>>>>>>>test test_relay_operations passed" } function run() { From a06c5e82cf6d7fc3e19e4daac5bd09c6685b66d5 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 28 Jan 2022 13:36:08 +0800 Subject: [PATCH 33/47] add ui for waitBeforeRunExit --- dm/syncer/syncer_test.go | 113 ++++++++++++++++++++++++++++----------- 1 file changed, 83 insertions(+), 30 deletions(-) diff --git a/dm/syncer/syncer_test.go b/dm/syncer/syncer_test.go index fdafa31b09a..0b4f36e3f53 100644 --- a/dm/syncer/syncer_test.go +++ b/dm/syncer/syncer_test.go @@ -41,6 +41,7 @@ import ( "github.com/pingcap/tiflow/dm/pkg/utils" "github.com/pingcap/tiflow/dm/syncer/dbconn" "github.com/pingcap/tiflow/pkg/errorutil" + "github.com/stretchr/testify/require" sqlmock "github.com/DATA-DOG/go-sqlmock" "github.com/go-mysql-org/go-mysql/mysql" @@ -102,12 +103,16 @@ type testSyncerSuite struct { } type MockStreamer struct { - events []*replication.BinlogEvent - idx uint32 + events []*replication.BinlogEvent + idx uint32 + pending bool } func (m *MockStreamer) GetEvent(ctx context.Context) (*replication.BinlogEvent, error) { if int(m.idx) >= len(m.events) { + if m.pending { + <-ctx.Done() + } return nil, context.Canceled } e := m.events[m.idx] @@ -121,7 +126,7 @@ type MockStreamProducer struct { func (mp *MockStreamProducer) generateStreamer(location binlog.Location) (reader.Streamer, error) { if location.Position.Pos == 4 { - return &MockStreamer{mp.events, 0}, nil + return &MockStreamer{mp.events, 0, false}, nil } bytesLen := 0 idx := uint32(0) @@ -132,32 +137,11 @@ func (mp *MockStreamProducer) generateStreamer(location binlog.Location) (reader break } } - return &MockStreamer{mp.events, idx}, nil + return &MockStreamer{mp.events, idx, false}, nil } func (s *testSyncerSuite) SetUpSuite(c *C) { - 
loaderDir, err := os.MkdirTemp("", "loader") - c.Assert(err, IsNil) - loaderCfg := config.LoaderConfig{ - Dir: loaderDir, - } - s.cfg = &config.SubTaskConfig{ - From: config.GetDBConfigForTest(), - To: config.GetDBConfigForTest(), - ServerID: 101, - MetaSchema: "test", - Name: "syncer_ut", - ShadowTableRules: []string{config.DefaultShadowTableRules}, - TrashTableRules: []string{config.DefaultTrashTableRules}, - Mode: config.ModeIncrement, - Flavor: "mysql", - LoaderConfig: loaderCfg, - } - s.cfg.Experimental.AsyncCheckpointFlush = true - s.cfg.From.Adjust() - s.cfg.To.Adjust() - - s.cfg.UseRelay = false + s.cfg = genDefaultSubTaskConfig4Test() s.resetEventsGenerator(c) c.Assert(log.InitLogger(&log.Config{}), IsNil) } @@ -236,7 +220,7 @@ func (s *testSyncerSuite) TearDownSuite(c *C) { os.RemoveAll(s.cfg.Dir) } -func (s *testSyncerSuite) mockGetServerUnixTS(mock sqlmock.Sqlmock) { +func mockGetServerUnixTS(mock sqlmock.Sqlmock) { ts := time.Now().Unix() rows := sqlmock.NewRows([]string{"UNIX_TIMESTAMP()"}).AddRow(strconv.FormatInt(ts, 10)) mock.ExpectQuery("SELECT UNIX_TIMESTAMP()").WillReturnRows(rows) @@ -748,7 +732,7 @@ func (s *testSyncerSuite) TestRun(c *C) { db, mock, err := sqlmock.New() c.Assert(err, IsNil) - s.mockGetServerUnixTS(mock) + mockGetServerUnixTS(mock) dbConn, err := db.Conn(context.Background()) c.Assert(err, IsNil) checkPointDB, checkPointMock, err := sqlmock.New() @@ -802,7 +786,7 @@ func (s *testSyncerSuite) TestRun(c *C) { mock.ExpectQuery("SHOW CREATE TABLE " + "`test_1`.`t_1`").WillReturnRows( sqlmock.NewRows([]string{"Table", "Create Table"}). AddRow("t_1", "create table t_1(id int primary key, name varchar(24), KEY `index1` (`name`))")) - s.mockGetServerUnixTS(mock) + mockGetServerUnixTS(mock) mock.ExpectQuery("SHOW CREATE TABLE " + "`test_1`.`t_2`").WillReturnRows( sqlmock.NewRows([]string{"Table", "Create Table"}). 
AddRow("t_2", "create table t_2(id int primary key, name varchar(24))")) @@ -1040,7 +1024,7 @@ func (s *testSyncerSuite) TestRun(c *C) { func (s *testSyncerSuite) TestExitSafeModeByConfig(c *C) { db, mock, err := sqlmock.New() c.Assert(err, IsNil) - s.mockGetServerUnixTS(mock) + mockGetServerUnixTS(mock) dbConn, err := db.Conn(context.Background()) c.Assert(err, IsNil) @@ -1703,3 +1687,72 @@ func (s *testSyncerSuite) TestExecuteSQLSWithIgnore(c *C) { c.Assert(mock.ExpectationsWereMet(), IsNil) } + +func genDefaultSubTaskConfig4Test() *config.SubTaskConfig { + loaderDir, err := os.MkdirTemp("", "loader") + if err != nil { + panic(err) // no happen + } + + loaderCfg := config.LoaderConfig{ + Dir: loaderDir, + } + cfg := &config.SubTaskConfig{ + From: config.GetDBConfigForTest(), + To: config.GetDBConfigForTest(), + ServerID: 101, + MetaSchema: "test", + Name: "syncer_ut", + ShadowTableRules: []string{config.DefaultShadowTableRules}, + TrashTableRules: []string{config.DefaultTrashTableRules}, + Mode: config.ModeIncrement, + Flavor: "mysql", + LoaderConfig: loaderCfg, + UseRelay: false, + } + cfg.Experimental.AsyncCheckpointFlush = true + cfg.From.Adjust() + cfg.To.Adjust() + return cfg +} + +func TestWaitBeforeRunExit(t *testing.T) { + ctx := context.Background() + cfg := genDefaultSubTaskConfig4Test() + cfg.WorkerCount = 0 + syncer := NewSyncer(cfg, nil, nil) + + db, mock, err := sqlmock.New() + require.NoError(t, err) + mockGetServerUnixTS(mock) + + syncer.fromDB = &dbconn.UpStreamConn{BaseDB: conn.NewBaseDB(db)} + syncer.reset() + require.NoError(t, syncer.genRouter()) + + mockStreamerProducer := &MockStreamProducer{} + mockStreamer, err := mockStreamerProducer.generateStreamer(binlog.NewLocation("")) + require.NoError(t, err) + // let getEvent pending until ctx.Done() + mockStreamer.(*MockStreamer).pending = true + syncer.streamerController = &StreamerController{ + streamerProducer: mockStreamerProducer, streamer: mockStreamer, closed: false, + } + + wg := &sync.WaitGroup{} + errCh := make(chan error, 1) + wg.Add(1) + go func() { + defer wg.Done() + errCh <- syncer.Run(ctx) + }() + // wait s.Run start + time.Sleep(time.Second) + // s.Run will not exit unit we call cancel + require.Equal(t, 0, len(errCh)) + require.NotNil(t, syncer.runCtx) + require.NotNil(t, syncer.runCancel) + syncer.runCancel() //this will make s.Run exit + wg.Wait() + require.Nil(t, <-errCh) +} From 1e51bec3be4a8460970648afd7c163ee516a77a4 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 8 Feb 2022 13:23:38 +0800 Subject: [PATCH 34/47] address comment --- dm/dm/worker/server.go | 4 +++- dm/syncer/syncer.go | 2 +- dm/syncer/syncer_test.go | 15 ++++++++++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index d3e2c9e9b38..9e25e203d32 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -461,7 +461,9 @@ func (s *Server) doClose() { func (s *Server) Close() { s.doClose() // we should stop current sync first, otherwise master may schedule task on new worker while we are closing s.stopKeepAlive() - s.etcdClient.Close() + if s.etcdClient != nil { + s.etcdClient.Close() + } } // if needLock is false, we should make sure Server has been locked in caller. diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 22bef2ab4b2..acf1c25d2fa 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -1554,7 +1554,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // start flush checkpoints worker. 
this worker must start before s.flushCheckPoints() go func() { defer s.runWg.Done() - // also need to use a ctx different s.runCtx, checkpointFlushWorker worker will be closed in the first defer + // also need to use a ctx different from s.runCtx, checkpointFlushWorker worker will be closed in the first defer s.checkpointFlushWorker.Run(s.tctx) }() s.runWg.Add(1) diff --git a/dm/syncer/syncer_test.go b/dm/syncer/syncer_test.go index 0b4f36e3f53..d70e1ea4f14 100644 --- a/dm/syncer/syncer_test.go +++ b/dm/syncer/syncer_test.go @@ -1748,11 +1748,24 @@ func TestWaitBeforeRunExit(t *testing.T) { }() // wait s.Run start time.Sleep(time.Second) - // s.Run will not exit unit we call cancel + + // test s.Run will not exit unit caller cancel ctx or call s.runCancel require.Equal(t, 0, len(errCh)) require.NotNil(t, syncer.runCtx) require.NotNil(t, syncer.runCancel) syncer.runCancel() //this will make s.Run exit wg.Wait() require.Nil(t, <-errCh) + + // test syncer wait time not more than maxPauseOrStopWaitTime + oldMaxPauseOrStopWaitTime := maxPauseOrStopWaitTime + maxPauseOrStopWaitTime = time.Second + ctx2, cancel := context.WithCancel(context.Background()) + cancel() + runCtx, runCancel := context.WithCancel(context.Background()) + syncer.runCtx, syncer.runCancel = tcontext.NewContext(runCtx, syncer.tctx.L()), runCancel + syncer.runWg.Add(1) + syncer.waitBeforeRunExit(ctx2) + require.Equal(t, context.Canceled, syncer.runCtx.Ctx.Err()) + maxPauseOrStopWaitTime = oldMaxPauseOrStopWaitTime } From 63a235e8ef60eeb3e9bf8196afcd5abefd2ff44f Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 8 Feb 2022 13:29:54 +0800 Subject: [PATCH 35/47] fix lint --- dm/syncer/syncer.go | 2 +- dm/syncer/syncer_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index acf1c25d2fa..56f796ac7ae 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -1554,7 +1554,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { // start flush checkpoints worker. this worker must start before s.flushCheckPoints() go func() { defer s.runWg.Done() - // also need to use a ctx different from s.runCtx, checkpointFlushWorker worker will be closed in the first defer + // also need to use a different ctx. checkpointFlushWorker worker will be closed in the first defer s.checkpointFlushWorker.Run(s.tctx) }() s.runWg.Add(1) diff --git a/dm/syncer/syncer_test.go b/dm/syncer/syncer_test.go index d70e1ea4f14..c0d4343eb54 100644 --- a/dm/syncer/syncer_test.go +++ b/dm/syncer/syncer_test.go @@ -1753,7 +1753,7 @@ func TestWaitBeforeRunExit(t *testing.T) { require.Equal(t, 0, len(errCh)) require.NotNil(t, syncer.runCtx) require.NotNil(t, syncer.runCancel) - syncer.runCancel() //this will make s.Run exit + syncer.runCancel() // this will make s.Run exit wg.Wait() require.Nil(t, <-errCh) From fb5f21fd8072154df2fa15d2302cab7af7e44287 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 8 Feb 2022 13:42:24 +0800 Subject: [PATCH 36/47] ungraceful exit the dml pipeline --- dm/syncer/causality.go | 52 +++++++++++++++++++------------------ dm/syncer/dml_worker.go | 57 ++++++++++++++++++++++------------------- 2 files changed, 58 insertions(+), 51 deletions(-) diff --git a/dm/syncer/causality.go b/dm/syncer/causality.go index ab5eb8e8699..016f455f250 100644 --- a/dm/syncer/causality.go +++ b/dm/syncer/causality.go @@ -66,33 +66,35 @@ func causalityWrap(inCh chan *job, syncer *Syncer) chan *job { // run receives dml jobs and send causality jobs by adding causality key. 
// When meet conflict, sends a conflict job. func (c *causality) run() { - for j := range c.inCh { - metrics.QueueSizeGauge.WithLabelValues(c.task, "causality_input", c.source).Set(float64(len(c.inCh))) - - startTime := time.Now() - - switch j.tp { - case flush, asyncFlush: - c.relation.rotate(j.flushSeq) - case gc: - // gc is only used on inner-causality logic - c.relation.gc(j.flushSeq) - continue - default: - keys := j.dml.identifyKeys(c.sessCtx) - - // detectConflict before add - if c.detectConflict(keys) { - c.logger.Debug("meet causality key, will generate a conflict job to flush all sqls", zap.Strings("keys", keys)) - c.outCh <- newConflictJob(c.workerCount) - c.relation.clear() + for { + select { + case j, ok := <-c.inCh: + if !ok { + return } - j.dml.key = c.add(keys) - c.logger.Debug("key for keys", zap.String("key", j.dml.key), zap.Strings("keys", keys)) + metrics.QueueSizeGauge.WithLabelValues(c.task, "causality_input", c.source).Set(float64(len(c.inCh))) + startTime := time.Now() + switch j.tp { + case flush, asyncFlush: + c.relation.rotate(j.flushSeq) + case gc: + // gc is only used on inner-causality logic + c.relation.gc(j.flushSeq) + continue + default: + keys := j.dml.identifyKeys(c.sessCtx) + // detectConflict before add + if c.detectConflict(keys) { + c.logger.Debug("meet causality key, will generate a conflict job to flush all sqls", zap.Strings("keys", keys)) + c.outCh <- newConflictJob(c.workerCount) + c.relation.clear() + } + j.dml.key = c.add(keys) + c.logger.Debug("key for keys", zap.String("key", j.dml.key), zap.Strings("keys", keys)) + } + metrics.ConflictDetectDurationHistogram.WithLabelValues(c.task, c.source).Observe(time.Since(startTime).Seconds()) + c.outCh <- j } - metrics.ConflictDetectDurationHistogram.WithLabelValues(c.task, c.source).Observe(time.Since(startTime).Seconds()) - - c.outCh <- j } } diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index 4bc942b3db6..87ed4174596 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -112,32 +112,37 @@ func (w *DMLWorker) run() { for i := 0; i < w.workerCount; i++ { queueBucketMapping[i] = queueBucketName(i) } - - for j := range w.inCh { - metrics.QueueSizeGauge.WithLabelValues(w.task, "dml_worker_input", w.source).Set(float64(len(w.inCh))) - switch j.tp { - case flush: - w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) - w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) - j.flushWg.Wait() - w.addCountFunc(true, adminQueueName, j.tp, 1, j.targetTable) - w.flushCh <- j - case asyncFlush: - w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) - w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) - w.flushCh <- j - case conflict: - w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) - w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) - j.flushWg.Wait() - w.addCountFunc(true, adminQueueName, j.tp, 1, j.targetTable) - default: - queueBucket := int(utils.GenHashKey(j.dml.key)) % w.workerCount - w.addCountFunc(false, queueBucketMapping[queueBucket], j.tp, 1, j.targetTable) - startTime := time.Now() - w.logger.Debug("queue for key", zap.Int("queue", queueBucket), zap.String("key", j.dml.key)) - jobChs[queueBucket] <- j - metrics.AddJobDurationHistogram.WithLabelValues(j.tp.String(), w.task, queueBucketMapping[queueBucket], w.source).Observe(time.Since(startTime).Seconds()) + for { + select { + case j, ok := <-w.inCh: + if !ok { + return + } + metrics.QueueSizeGauge.WithLabelValues(w.task, "dml_worker_input", w.source).Set(float64(len(w.inCh))) + 
switch j.tp { + case flush: + w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) + w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) + j.flushWg.Wait() + w.addCountFunc(true, adminQueueName, j.tp, 1, j.targetTable) + w.flushCh <- j + case asyncFlush: + w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) + w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) + w.flushCh <- j + case conflict: + w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) + w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) + j.flushWg.Wait() + w.addCountFunc(true, adminQueueName, j.tp, 1, j.targetTable) + default: + queueBucket := int(utils.GenHashKey(j.dml.key)) % w.workerCount + w.addCountFunc(false, queueBucketMapping[queueBucket], j.tp, 1, j.targetTable) + startTime := time.Now() + w.logger.Debug("queue for key", zap.Int("queue", queueBucket), zap.String("key", j.dml.key)) + jobChs[queueBucket] <- j + metrics.AddJobDurationHistogram.WithLabelValues(j.tp.String(), w.task, queueBucketMapping[queueBucket], w.source).Observe(time.Since(startTime).Seconds()) + } } } } From b851b623e02165a84aaaa0696bf5ba0971c3ef55 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 8 Feb 2022 14:57:31 +0800 Subject: [PATCH 37/47] fix merge master --- dm/syncer/syncer.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 040ce4b9c65..4f484204156 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -1539,7 +1539,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { cleanDumpFile = false } if s.cfg.ShardMode == config.ShardOptimistic { - s.flushOptimisticTableInfos(tctx) + s.flushOptimisticTableInfos(s.runCtx) } } @@ -1967,7 +1967,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.locations.reset(currentLocation) if !s.errOperatorHolder.IsInject(startLocation) { // replace operator need redirect to currentLocation - if err = s.streamerController.RedirectStreamer(tctx, currentLocation); err != nil { + if err = s.streamerController.RedirectStreamer(s.runCtx, currentLocation); err != nil { return err } } From 6e9d1ad02cfe4318a0ee1b20bfb2fb596474639c Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 8 Feb 2022 15:25:55 +0800 Subject: [PATCH 38/47] fix datarace --- dm/syncer/syncer_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dm/syncer/syncer_test.go b/dm/syncer/syncer_test.go index c0d4343eb54..3026949c09d 100644 --- a/dm/syncer/syncer_test.go +++ b/dm/syncer/syncer_test.go @@ -1751,8 +1751,10 @@ func TestWaitBeforeRunExit(t *testing.T) { // test s.Run will not exit unit caller cancel ctx or call s.runCancel require.Equal(t, 0, len(errCh)) + syncer.Lock() require.NotNil(t, syncer.runCtx) require.NotNil(t, syncer.runCancel) + syncer.Unlock() syncer.runCancel() // this will make s.Run exit wg.Wait() require.Nil(t, <-errCh) From 7aca258d8646f9edded417f0418942b8090de4aa Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 8 Feb 2022 16:27:12 +0800 Subject: [PATCH 39/47] add syncer ctx to sync pipeline --- dm/syncer/causality.go | 5 +++++ dm/syncer/compactor.go | 5 +++++ dm/syncer/dml_worker.go | 2 ++ 3 files changed, 12 insertions(+) diff --git a/dm/syncer/causality.go b/dm/syncer/causality.go index 016f455f250..159b7fa7a8d 100644 --- a/dm/syncer/causality.go +++ b/dm/syncer/causality.go @@ -20,6 +20,7 @@ import ( "go.uber.org/zap" "github.com/pingcap/tidb/sessionctx" + tcontext "github.com/pingcap/tiflow/dm/pkg/context" "github.com/pingcap/tiflow/dm/pkg/log" 
"github.com/pingcap/tiflow/dm/syncer/metrics" ) @@ -34,6 +35,7 @@ type causality struct { outCh chan *job inCh chan *job logger log.Logger + syncCtx *tcontext.Context sessCtx sessionctx.Context workerCount int @@ -51,6 +53,7 @@ func causalityWrap(inCh chan *job, syncer *Syncer) chan *job { logger: syncer.tctx.Logger.WithFields(zap.String("component", "causality")), inCh: inCh, outCh: make(chan *job, syncer.cfg.QueueSize), + syncCtx: syncer.syncCtx, // this ctx can be used to cancel all the workers sessCtx: syncer.sessCtx, workerCount: syncer.cfg.WorkerCount, } @@ -68,6 +71,8 @@ func causalityWrap(inCh chan *job, syncer *Syncer) chan *job { func (c *causality) run() { for { select { + case <-c.syncCtx.Ctx.Done(): + break case j, ok := <-c.inCh: if !ok { return diff --git a/dm/syncer/compactor.go b/dm/syncer/compactor.go index 62c907f037d..28ea593c70d 100644 --- a/dm/syncer/compactor.go +++ b/dm/syncer/compactor.go @@ -22,6 +22,7 @@ import ( "github.com/pingcap/tidb-tools/pkg/filter" "go.uber.org/zap" + tcontext "github.com/pingcap/tiflow/dm/pkg/context" "github.com/pingcap/tiflow/dm/pkg/log" "github.com/pingcap/tiflow/dm/syncer/metrics" ) @@ -33,6 +34,7 @@ type compactor struct { bufferSize int logger log.Logger safeMode bool + syncCtx *tcontext.Context keyMap map[string]map[string]int // table -> key(pk or (uk + not null)) -> index in buffer buffer []*job @@ -58,6 +60,7 @@ func compactorWrap(inCh chan *job, syncer *Syncer) chan *job { task: syncer.cfg.Name, source: syncer.cfg.SourceID, addCountFunc: syncer.addCount, + syncCtx: syncer.syncCtx, // this ctx can be used to cancel all the workers } go func() { compactor.run() @@ -70,6 +73,8 @@ func compactorWrap(inCh chan *job, syncer *Syncer) chan *job { func (c *compactor) run() { for { select { + case <-c.syncCtx.Ctx.Done(): + break case j, ok := <-c.inCh: if !ok { return diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index 1dbe7111fe8..a866e94f7bd 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -114,6 +114,8 @@ func (w *DMLWorker) run() { } for { select { + case <-w.syncCtx.Ctx.Done(): + break case j, ok := <-w.inCh: if !ok { return From 0f5e2207e52cde9fdcc279e6d1cdda94d4c9c503 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 8 Feb 2022 16:50:42 +0800 Subject: [PATCH 40/47] fix ut panic --- dm/syncer/causality_test.go | 2 ++ dm/syncer/compactor_test.go | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/dm/syncer/causality_test.go b/dm/syncer/causality_test.go index 1109b8c1b80..1ebfe16a3d3 100644 --- a/dm/syncer/causality_test.go +++ b/dm/syncer/causality_test.go @@ -95,6 +95,7 @@ func (s *testSyncerSuite) TestCasuality(c *C) { Name: "task", SourceID: "source", }, + syncCtx: tcontext.Background(), tctx: tcontext.Background().WithLogger(log.L()), sessCtx: utils.NewSessionCtx(map[string]string{"time_zone": "UTC"}), } @@ -166,6 +167,7 @@ func (s *testSyncerSuite) TestCasualityWithPrefixIndex(c *C) { Name: "task", SourceID: "source", }, + syncCtx: tcontext.Background(), tctx: tcontext.Background().WithLogger(log.L()), sessCtx: utils.NewSessionCtx(map[string]string{"time_zone": "UTC"}), } diff --git a/dm/syncer/compactor_test.go b/dm/syncer/compactor_test.go index 506f9581a7f..ccf1b3bcc3a 100644 --- a/dm/syncer/compactor_test.go +++ b/dm/syncer/compactor_test.go @@ -251,7 +251,8 @@ func (s *testSyncerSuite) TestCompactorSafeMode(c *C) { inCh := make(chan *job, 100) syncer := &Syncer{ - tctx: tcontext.NewContext(context.Background(), log.L()), + tctx: 
tcontext.NewContext(context.Background(), log.L()), + syncCtx: tcontext.Background(), cfg: &config.SubTaskConfig{ Name: "task", SourceID: "source", From cd7903c91064827949f61e1f604716d52c3ae908 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Thu, 10 Feb 2022 14:29:22 +0800 Subject: [PATCH 41/47] address comments --- dm/dm/worker/subtask.go | 4 +++- dm/syncer/causality.go | 2 +- dm/syncer/compactor.go | 2 +- dm/syncer/dml_worker.go | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index c6ed4f7f261..1a1ee47c206 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -394,8 +394,10 @@ func (st *SubTask) closeUnits() { func (st *SubTask) killCurrentUnit() { if st.CurrUnit() != nil { - st.l.Info("kill unit", zap.String("task", st.cfg.Name), zap.Stringer("unit", st.CurrUnit().Type())) + ut := st.CurrUnit().Type() + st.l.Info("kill unit", zap.String("task", st.cfg.Name), zap.Stringer("unit", ut)) st.CurrUnit().Kill() + st.l.Info("kill unit done", zap.String("task", st.cfg.Name), zap.Stringer("unit", ut)) } } diff --git a/dm/syncer/causality.go b/dm/syncer/causality.go index 159b7fa7a8d..5b4d53621fd 100644 --- a/dm/syncer/causality.go +++ b/dm/syncer/causality.go @@ -72,7 +72,7 @@ func (c *causality) run() { for { select { case <-c.syncCtx.Ctx.Done(): - break + return case j, ok := <-c.inCh: if !ok { return diff --git a/dm/syncer/compactor.go b/dm/syncer/compactor.go index 28ea593c70d..a3aa03576c1 100644 --- a/dm/syncer/compactor.go +++ b/dm/syncer/compactor.go @@ -74,7 +74,7 @@ func (c *compactor) run() { for { select { case <-c.syncCtx.Ctx.Done(): - break + return case j, ok := <-c.inCh: if !ok { return diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index a866e94f7bd..cbe9919d584 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -115,7 +115,7 @@ func (w *DMLWorker) run() { for { select { case <-w.syncCtx.Ctx.Done(): - break + return case j, ok := <-w.inCh: if !ok { return From 05ca26babf53aa0de37cd296a27cf31ffd966e25 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 11 Feb 2022 16:03:34 +0800 Subject: [PATCH 42/47] revert ungraceful close job pipeline --- dm/dm/worker/server.go | 2 +- dm/dm/worker/subtask.go | 2 +- dm/syncer/causality.go | 57 ++++++++++++++++-------------------- dm/syncer/causality_test.go | 2 -- dm/syncer/compactor.go | 5 ---- dm/syncer/compactor_test.go | 3 +- dm/syncer/dml_worker.go | 58 ++++++++++++++++--------------------- dm/syncer/syncer.go | 22 +++++++------- 8 files changed, 63 insertions(+), 88 deletions(-) diff --git a/dm/dm/worker/server.go b/dm/dm/worker/server.go index 9e25e203d32..6cfcbd481bc 100644 --- a/dm/dm/worker/server.go +++ b/dm/dm/worker/server.go @@ -449,11 +449,11 @@ func (s *Server) doClose() { } // stop server in advance, stop receiving source bound and relay bound s.cancel() + s.wg.Wait() // stop worker and wait for return(we already lock the whole Sever, so no need use lock to get source worker) if w := s.getSourceWorker(false); w != nil { w.Stop(true) } - s.wg.Wait() s.closed.Store(true) } diff --git a/dm/dm/worker/subtask.go b/dm/dm/worker/subtask.go index 1a1ee47c206..7c35acd47f3 100644 --- a/dm/dm/worker/subtask.go +++ b/dm/dm/worker/subtask.go @@ -504,7 +504,7 @@ func (st *SubTask) Close() { updateTaskMetric(st.cfg.Name, st.cfg.SourceID, pb.Stage_Stopped, st.workerName) } -// kill kill running unit and stop the sub task. +// Kill kill running unit and stop the sub task. 
func (st *SubTask) Kill() { st.l.Info("killing") if !st.setStageIfNotIn([]pb.Stage{pb.Stage_Stopped, pb.Stage_Stopping, pb.Stage_Finished}, pb.Stage_Stopping) { diff --git a/dm/syncer/causality.go b/dm/syncer/causality.go index 5b4d53621fd..ab5eb8e8699 100644 --- a/dm/syncer/causality.go +++ b/dm/syncer/causality.go @@ -20,7 +20,6 @@ import ( "go.uber.org/zap" "github.com/pingcap/tidb/sessionctx" - tcontext "github.com/pingcap/tiflow/dm/pkg/context" "github.com/pingcap/tiflow/dm/pkg/log" "github.com/pingcap/tiflow/dm/syncer/metrics" ) @@ -35,7 +34,6 @@ type causality struct { outCh chan *job inCh chan *job logger log.Logger - syncCtx *tcontext.Context sessCtx sessionctx.Context workerCount int @@ -53,7 +51,6 @@ func causalityWrap(inCh chan *job, syncer *Syncer) chan *job { logger: syncer.tctx.Logger.WithFields(zap.String("component", "causality")), inCh: inCh, outCh: make(chan *job, syncer.cfg.QueueSize), - syncCtx: syncer.syncCtx, // this ctx can be used to cancel all the workers sessCtx: syncer.sessCtx, workerCount: syncer.cfg.WorkerCount, } @@ -69,37 +66,33 @@ func causalityWrap(inCh chan *job, syncer *Syncer) chan *job { // run receives dml jobs and send causality jobs by adding causality key. // When meet conflict, sends a conflict job. func (c *causality) run() { - for { - select { - case <-c.syncCtx.Ctx.Done(): - return - case j, ok := <-c.inCh: - if !ok { - return + for j := range c.inCh { + metrics.QueueSizeGauge.WithLabelValues(c.task, "causality_input", c.source).Set(float64(len(c.inCh))) + + startTime := time.Now() + + switch j.tp { + case flush, asyncFlush: + c.relation.rotate(j.flushSeq) + case gc: + // gc is only used on inner-causality logic + c.relation.gc(j.flushSeq) + continue + default: + keys := j.dml.identifyKeys(c.sessCtx) + + // detectConflict before add + if c.detectConflict(keys) { + c.logger.Debug("meet causality key, will generate a conflict job to flush all sqls", zap.Strings("keys", keys)) + c.outCh <- newConflictJob(c.workerCount) + c.relation.clear() } - metrics.QueueSizeGauge.WithLabelValues(c.task, "causality_input", c.source).Set(float64(len(c.inCh))) - startTime := time.Now() - switch j.tp { - case flush, asyncFlush: - c.relation.rotate(j.flushSeq) - case gc: - // gc is only used on inner-causality logic - c.relation.gc(j.flushSeq) - continue - default: - keys := j.dml.identifyKeys(c.sessCtx) - // detectConflict before add - if c.detectConflict(keys) { - c.logger.Debug("meet causality key, will generate a conflict job to flush all sqls", zap.Strings("keys", keys)) - c.outCh <- newConflictJob(c.workerCount) - c.relation.clear() - } - j.dml.key = c.add(keys) - c.logger.Debug("key for keys", zap.String("key", j.dml.key), zap.Strings("keys", keys)) - } - metrics.ConflictDetectDurationHistogram.WithLabelValues(c.task, c.source).Observe(time.Since(startTime).Seconds()) - c.outCh <- j + j.dml.key = c.add(keys) + c.logger.Debug("key for keys", zap.String("key", j.dml.key), zap.Strings("keys", keys)) } + metrics.ConflictDetectDurationHistogram.WithLabelValues(c.task, c.source).Observe(time.Since(startTime).Seconds()) + + c.outCh <- j } } diff --git a/dm/syncer/causality_test.go b/dm/syncer/causality_test.go index 1ebfe16a3d3..1109b8c1b80 100644 --- a/dm/syncer/causality_test.go +++ b/dm/syncer/causality_test.go @@ -95,7 +95,6 @@ func (s *testSyncerSuite) TestCasuality(c *C) { Name: "task", SourceID: "source", }, - syncCtx: tcontext.Background(), tctx: tcontext.Background().WithLogger(log.L()), sessCtx: 
utils.NewSessionCtx(map[string]string{"time_zone": "UTC"}), } @@ -167,7 +166,6 @@ func (s *testSyncerSuite) TestCasualityWithPrefixIndex(c *C) { Name: "task", SourceID: "source", }, - syncCtx: tcontext.Background(), tctx: tcontext.Background().WithLogger(log.L()), sessCtx: utils.NewSessionCtx(map[string]string{"time_zone": "UTC"}), } diff --git a/dm/syncer/compactor.go b/dm/syncer/compactor.go index a3aa03576c1..62c907f037d 100644 --- a/dm/syncer/compactor.go +++ b/dm/syncer/compactor.go @@ -22,7 +22,6 @@ import ( "github.com/pingcap/tidb-tools/pkg/filter" "go.uber.org/zap" - tcontext "github.com/pingcap/tiflow/dm/pkg/context" "github.com/pingcap/tiflow/dm/pkg/log" "github.com/pingcap/tiflow/dm/syncer/metrics" ) @@ -34,7 +33,6 @@ type compactor struct { bufferSize int logger log.Logger safeMode bool - syncCtx *tcontext.Context keyMap map[string]map[string]int // table -> key(pk or (uk + not null)) -> index in buffer buffer []*job @@ -60,7 +58,6 @@ func compactorWrap(inCh chan *job, syncer *Syncer) chan *job { task: syncer.cfg.Name, source: syncer.cfg.SourceID, addCountFunc: syncer.addCount, - syncCtx: syncer.syncCtx, // this ctx can be used to cancel all the workers } go func() { compactor.run() @@ -73,8 +70,6 @@ func compactorWrap(inCh chan *job, syncer *Syncer) chan *job { func (c *compactor) run() { for { select { - case <-c.syncCtx.Ctx.Done(): - return case j, ok := <-c.inCh: if !ok { return diff --git a/dm/syncer/compactor_test.go b/dm/syncer/compactor_test.go index ccf1b3bcc3a..506f9581a7f 100644 --- a/dm/syncer/compactor_test.go +++ b/dm/syncer/compactor_test.go @@ -251,8 +251,7 @@ func (s *testSyncerSuite) TestCompactorSafeMode(c *C) { inCh := make(chan *job, 100) syncer := &Syncer{ - tctx: tcontext.NewContext(context.Background(), log.L()), - syncCtx: tcontext.Background(), + tctx: tcontext.NewContext(context.Background(), log.L()), cfg: &config.SubTaskConfig{ Name: "task", SourceID: "source", diff --git a/dm/syncer/dml_worker.go b/dm/syncer/dml_worker.go index cbe9919d584..71802fbc09a 100644 --- a/dm/syncer/dml_worker.go +++ b/dm/syncer/dml_worker.go @@ -112,39 +112,31 @@ func (w *DMLWorker) run() { for i := 0; i < w.workerCount; i++ { queueBucketMapping[i] = queueBucketName(i) } - for { - select { - case <-w.syncCtx.Ctx.Done(): - return - case j, ok := <-w.inCh: - if !ok { - return - } - metrics.QueueSizeGauge.WithLabelValues(w.task, "dml_worker_input", w.source).Set(float64(len(w.inCh))) - switch j.tp { - case flush: - w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) - w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) - j.flushWg.Wait() - w.addCountFunc(true, adminQueueName, j.tp, 1, j.targetTable) - w.flushCh <- j - case asyncFlush: - w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) - w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) - w.flushCh <- j - case conflict: - w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) - w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) - j.flushWg.Wait() - w.addCountFunc(true, adminQueueName, j.tp, 1, j.targetTable) - default: - queueBucket := int(utils.GenHashKey(j.dml.key)) % w.workerCount - w.addCountFunc(false, queueBucketMapping[queueBucket], j.tp, 1, j.targetTable) - startTime := time.Now() - w.logger.Debug("queue for key", zap.Int("queue", queueBucket), zap.String("key", j.dml.key)) - jobChs[queueBucket] <- j - metrics.AddJobDurationHistogram.WithLabelValues(j.tp.String(), w.task, queueBucketMapping[queueBucket], w.source).Observe(time.Since(startTime).Seconds()) - } + for j 
:= range w.inCh { + metrics.QueueSizeGauge.WithLabelValues(w.task, "dml_worker_input", w.source).Set(float64(len(w.inCh))) + switch j.tp { + case flush: + w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) + w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) + j.flushWg.Wait() + w.addCountFunc(true, adminQueueName, j.tp, 1, j.targetTable) + w.flushCh <- j + case asyncFlush: + w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) + w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) + w.flushCh <- j + case conflict: + w.addCountFunc(false, adminQueueName, j.tp, 1, j.targetTable) + w.sendJobToAllDmlQueue(j, jobChs, queueBucketMapping) + j.flushWg.Wait() + w.addCountFunc(true, adminQueueName, j.tp, 1, j.targetTable) + default: + queueBucket := int(utils.GenHashKey(j.dml.key)) % w.workerCount + w.addCountFunc(false, queueBucketMapping[queueBucket], j.tp, 1, j.targetTable) + startTime := time.Now() + w.logger.Debug("queue for key", zap.Int("queue", queueBucket), zap.String("key", j.dml.key)) + jobChs[queueBucket] <- j + metrics.AddJobDurationHistogram.WithLabelValues(j.tp.String(), w.task, queueBucketMapping[queueBucket], w.source).Observe(time.Since(startTime).Seconds()) } } } diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index e3e7c8e845b..df6dd9f2b8d 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -289,6 +289,13 @@ func (s *Syncer) newJobChans() { chanSize := calculateChanSize(s.cfg.QueueSize, s.cfg.WorkerCount, s.cfg.Compact) s.dmlJobCh = make(chan *job, chanSize) s.ddlJobCh = make(chan *job, s.cfg.QueueSize) + s.checkpointFlushWorker = &checkpointFlushWorker{ + input: make(chan *checkpointFlushTask, 16), + cp: s.checkpoint, + execError: &s.execError, + afterFlushFn: s.afterFlushCheckpoint, + addCountFunc: s.addCount, + } s.jobsClosed.Store(false) } @@ -300,6 +307,7 @@ func (s *Syncer) closeJobChans() { } close(s.dmlJobCh) close(s.ddlJobCh) + s.checkpointFlushWorker.Close() s.jobsClosed.Store(true) } @@ -441,7 +449,6 @@ func (s *Syncer) Init(ctx context.Context) (err error) { return err } rollbackHolder.Add(fr.FuncRollback{Name: "remove-active-realylog", Fn: s.removeActiveRelayLog}) - s.reset() return nil } @@ -1431,6 +1438,7 @@ func (s *Syncer) waitBeforeRunExit(ctx context.Context) { case <-s.runCtx.Ctx.Done(): s.tctx.L().Info("syncer run exit so runCtx done") case <-time.After(maxPauseOrStopWaitTime): + // TODO: maxPauseOrStopWaitTime should also count the time of waiting waitTransactionLock s.tctx.L().Info("wait transaction end timeout, exit now") s.runCancel() } @@ -1483,23 +1491,13 @@ func (s *Syncer) updateTSOffset(ctx context.Context) error { // Run starts running for sync, we should guarantee it can rerun when paused. 
func (s *Syncer) Run(ctx context.Context) (err error) { - s.Lock() runCtx, runCancel := context.WithCancel(context.Background()) s.runCtx, s.runCancel = tcontext.NewContext(runCtx, s.tctx.L()), runCancel syncCtx, syncCancel := context.WithCancel(context.Background()) s.syncCtx, s.syncCancel = tcontext.NewContext(syncCtx, s.tctx.L()), syncCancel - s.checkpointFlushWorker = &checkpointFlushWorker{ - input: make(chan *checkpointFlushTask, 16), - cp: s.checkpoint, - execError: &s.execError, - afterFlushFn: s.afterFlushCheckpoint, - addCountFunc: s.addCount, - } - s.Unlock() defer func() { s.runCancel() s.closeJobChans() - s.checkpointFlushWorker.Close() s.runWg.Wait() }() @@ -1555,7 +1553,6 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.runWg.Add(1) go s.syncDML() s.runWg.Add(1) - // start flush checkpoints worker. this worker must start before s.flushCheckPoints() go func() { defer s.runWg.Done() // also need to use a different ctx. checkpointFlushWorker worker will be closed in the first defer @@ -3377,6 +3374,7 @@ func (s *Syncer) Close() { // when closing syncer by `stop-task`, remove active relay log from hub s.removeActiveRelayLog() metrics.RemoveLabelValuesWithTaskInMetrics(s.cfg.Name) + s.runWg.Wait() s.closed.Store(true) } From 4fcad7d90b7c44c3a0aaf6c7f99b9fcf569db759 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Fri, 11 Feb 2022 17:21:52 +0800 Subject: [PATCH 43/47] fix datarace --- dm/syncer/syncer_test.go | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/dm/syncer/syncer_test.go b/dm/syncer/syncer_test.go index 3026949c09d..eca7152e770 100644 --- a/dm/syncer/syncer_test.go +++ b/dm/syncer/syncer_test.go @@ -1717,7 +1717,7 @@ func genDefaultSubTaskConfig4Test() *config.SubTaskConfig { } func TestWaitBeforeRunExit(t *testing.T) { - ctx := context.Background() + ctx, cancel := context.WithCancel(context.Background()) cfg := genDefaultSubTaskConfig4Test() cfg.WorkerCount = 0 syncer := NewSyncer(cfg, nil, nil) @@ -1746,18 +1746,15 @@ func TestWaitBeforeRunExit(t *testing.T) { defer wg.Done() errCh <- syncer.Run(ctx) }() - // wait s.Run start - time.Sleep(time.Second) + time.Sleep(time.Second) // wait s.Run start // test s.Run will not exit unit caller cancel ctx or call s.runCancel + cancel() // this will make s.Run exit + wg.Wait() + require.Nil(t, <-errCh) require.Equal(t, 0, len(errCh)) - syncer.Lock() require.NotNil(t, syncer.runCtx) require.NotNil(t, syncer.runCancel) - syncer.Unlock() - syncer.runCancel() // this will make s.Run exit - wg.Wait() - require.Nil(t, <-errCh) // test syncer wait time not more than maxPauseOrStopWaitTime oldMaxPauseOrStopWaitTime := maxPauseOrStopWaitTime From d6484a68354578ec62968c13a63aa385efe8eae4 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Mon, 14 Feb 2022 15:24:05 +0800 Subject: [PATCH 44/47] move init checkpoint worker to init && revert a kill --- dm/syncer/syncer.go | 16 ++++++++-------- dm/tests/new_relay/run.sh | 8 ++------ 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index 58050b1c4ed..5392c7ac044 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -290,13 +290,6 @@ func (s *Syncer) newJobChans() { chanSize := calculateChanSize(s.cfg.QueueSize, s.cfg.WorkerCount, s.cfg.Compact) s.dmlJobCh = make(chan *job, chanSize) s.ddlJobCh = make(chan *job, s.cfg.QueueSize) - s.checkpointFlushWorker = &checkpointFlushWorker{ - input: make(chan *checkpointFlushTask, 16), - cp: s.checkpoint, - execError: &s.execError, - afterFlushFn: 
s.afterFlushCheckpoint, - updateJobMetricsFn: s.updateJobMetrics, - } s.jobsClosed.Store(false) } @@ -308,7 +301,6 @@ func (s *Syncer) closeJobChans() { } close(s.dmlJobCh) close(s.ddlJobCh) - s.checkpointFlushWorker.Close() s.jobsClosed.Store(true) } @@ -567,6 +559,13 @@ func (s *Syncer) reset() { } // create new job chans s.newJobChans() + s.checkpointFlushWorker = &checkpointFlushWorker{ + input: make(chan *checkpointFlushTask, 16), + cp: s.checkpoint, + execError: &s.execError, + afterFlushFn: s.afterFlushCheckpoint, + updateJobMetricsFn: s.updateJobMetrics, + } s.execError.Store(nil) s.setErrLocation(nil, nil, false) @@ -1514,6 +1513,7 @@ func (s *Syncer) Run(ctx context.Context) (err error) { defer func() { s.runCancel() s.closeJobChans() + s.checkpointFlushWorker.Close() s.runWg.Wait() }() diff --git a/dm/tests/new_relay/run.sh b/dm/tests/new_relay/run.sh index 512819f3504..717bb3e210c 100755 --- a/dm/tests/new_relay/run.sh +++ b/dm/tests/new_relay/run.sh @@ -56,12 +56,8 @@ function test_restart_relay_status() { "list-member -n worker3" \ "relay" 1 - echo "kill dm-worker1" - ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true - check_port_offline $WORKER1_PORT 20 - echo "kill dm-master" - ps aux | grep dm-master | awk '{print $2}' | xargs kill || true - check_master_port_offline 1 + kill_dm_worker + kill_dm_master run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT From 76180db7ea6f8b62a47929c730a7c02425d3e3c8 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Mon, 14 Feb 2022 15:42:56 +0800 Subject: [PATCH 45/47] fix check graceful stop --- dm/tests/checkpoint_transaction/run.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dm/tests/checkpoint_transaction/run.sh b/dm/tests/checkpoint_transaction/run.sh index 7a47c4165f7..e3834469245 100755 --- a/dm/tests/checkpoint_transaction/run.sh +++ b/dm/tests/checkpoint_transaction/run.sh @@ -112,6 +112,16 @@ function run() { "query-status test" \ "\"stage\": \"Running\"" 1 + echo "kill dm-worker1" + ps aux | grep dm-worker1 | awk '{print $2}' | xargs kill || true + check_port_offline $WORKER1_PORT 20 + rm -rf $WORK_DIR/worker1 + run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml + check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT + run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \ + "query-status test" \ + "\"stage\": \"Running\"" 1 + run_sql_file $cur/data/db1.increment2.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 # wait transaction start # you can see why sleep in https://github.com/pingcap/dm/pull/1928#issuecomment-895820239 From 4809d42ff1a186b02cc08c6bf028c5a0fc310413 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Tue, 15 Feb 2022 17:08:13 +0800 Subject: [PATCH 46/47] address comments --- dm/syncer/syncer.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dm/syncer/syncer.go b/dm/syncer/syncer.go index d768ac8f0a6..754fd23d19b 100644 --- a/dm/syncer/syncer.go +++ b/dm/syncer/syncer.go @@ -1527,6 +1527,8 @@ func (s *Syncer) Run(ctx context.Context) (err error) { s.closeJobChans() s.checkpointFlushWorker.Close() s.runWg.Wait() + // s.syncCancel won't be called when normal exit, this call just to follow the best practice of use context. 
+ s.syncCancel() }() // we should start this goroutine as soon as possible, because it's the only goroutine that cancel syncer.Run From f7df28a9d4172661c8f98070c79a4d57d3268801 Mon Sep 17 00:00:00 2001 From: ehco1996 Date: Wed, 16 Feb 2022 11:50:06 +0800 Subject: [PATCH 47/47] remove sleep --- dm/tests/new_relay/run.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dm/tests/new_relay/run.sh b/dm/tests/new_relay/run.sh index 717bb3e210c..b359bd0c606 100755 --- a/dm/tests/new_relay/run.sh +++ b/dm/tests/new_relay/run.sh @@ -115,11 +115,9 @@ function test_cant_dail_upstream() { check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT # make sure DM-worker doesn't exit - sleep 2 run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "query-status -s $SOURCE_ID1" \ "injected error" 1 - echo ">>>>>>>>>>>>>>>>>>>>>>>>>>test test_cant_dail_upstream passed" } @@ -158,8 +156,6 @@ function test_cant_dail_downstream() { # restart tidb run_tidb_server 4000 $TIDB_PASSWORD - sleep 2 - echo ">>>>>>>>>>>>>>>>>>>>>>>>>>test test_cant_dail_downstream passed" }
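The net effect on pause/stop, exercised by TestWaitBeforeRunExit above, is a bounded graceful wait: the syncer waits for the running transaction to end, but never longer than maxPauseOrStopWaitTime before cancelling the run context. A stand-alone sketch of that bounded wait, under illustrative names that are not code from this series:

package main

import (
	"fmt"
	"time"
)

// boundedWait waits for the current transaction to end, but gives up after
// maxWait and forces the stop instead. Illustrative sketch only.
func boundedWait(txnDone <-chan struct{}, maxWait time.Duration, forceStop func()) {
	select {
	case <-txnDone:
		fmt.Println("transaction ended in time, graceful exit")
	case <-time.After(maxWait):
		fmt.Println("wait timed out, forcing exit")
		forceStop()
	}
}

func main() {
	txnDone := make(chan struct{})
	go func() {
		time.Sleep(10 * time.Millisecond) // pretend the transaction finishes quickly
		close(txnDone)
	}()
	boundedWait(txnDone, 100*time.Millisecond, func() {})
}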