From 8f6b50c40e2abefa4220bc2f69cc2f838d503aef Mon Sep 17 00:00:00 2001
From: guo-shaoge <shaoge1994@163.com>
Date: Thu, 9 Feb 2023 18:06:15 +0800
Subject: [PATCH] store/copr: add log for buildBatchCopTasksConsistentHash
 (#41101)
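
Log the built batch cop tasks and the tiflash_compute store list at
debug level, and print a warning with a timing breakdown (split key
ranges, fetch topology / get stores, retry count, cop task and batch
cop task counts) when buildBatchCopTasksConsistentHash or
buildBatchCopTasksConsistentHashForPD takes more than 500ms.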

close pingcap/tidb#41102
---
 store/copr/batch_coprocessor.go | 71 +++++++++++++++++++++++++++++----
 1 file changed, 64 insertions(+), 7 deletions(-)

diff --git a/store/copr/batch_coprocessor.go b/store/copr/batch_coprocessor.go
index 99b7a807d52e2..273b97a0c2a13 100644
--- a/store/copr/batch_coprocessor.go
+++ b/store/copr/batch_coprocessor.go
@@ -607,13 +607,16 @@ func buildBatchCopTasksConsistentHash(
 	rangesForEachPhysicalTable []*KeyRanges,
 	storeType kv.StoreType,
 	ttl time.Duration) (res []*batchCopTask, err error) {
+	start := time.Now()
 	const cmdType = tikvrpc.CmdBatchCop
 	cache := kvStore.GetRegionCache()
 	fetchTopoBo := backoff.NewBackofferWithVars(ctx, fetchTopoMaxBackoff, nil)
 
-	var retryNum int
-	var rangesLen int
-	var storesStr []string
+	var (
+		retryNum  int
+		rangesLen int
+		storesStr []string
+	)
 
 	tasks := make([]*copTask, 0)
 	regionIDs := make([]tikv.RegionVerID, 0)
@@ -635,7 +638,9 @@ func buildBatchCopTasksConsistentHash(
 			regionIDs = append(regionIDs, lo.Location.Region)
 		}
 	}
+	splitKeyElapsed := time.Since(start)
 
+	fetchTopoStart := time.Now()
 	for {
 		retryNum++
 		// todo: use AssureAndGetTopo() after SNS is done.
@@ -654,6 +659,7 @@ func buildBatchCopTasksConsistentHash(
 		}
 		break
 	}
+	fetchTopoElapsed := time.Since(fetchTopoStart)
 
 	rpcCtxs, err := getTiFlashComputeRPCContextByConsistentHash(regionIDs, storesStr)
 	if err != nil {
@@ -688,6 +694,24 @@ func buildBatchCopTasksConsistentHash(
 	}
 	logutil.BgLogger().Info("buildBatchCopTasksConsistentHash done", zap.Any("len(tasks)", len(taskMap)), zap.Any("len(tiflash_compute)", len(storesStr)))
 
+	if log.GetLevel() <= zap.DebugLevel {
+		debugTaskMap := make(map[string]string, len(taskMap))
+		for s, b := range taskMap {
+			debugTaskMap[s] = fmt.Sprintf("addr: %s; regionInfos: %v", b.storeAddr, b.regionInfos)
+		}
+		logutil.BgLogger().Debug("detailed info buildBatchCopTasksConsistentHash", zap.Any("taskMap", debugTaskMap), zap.Any("allStores", storesStr))
+	}
+
+	if elapsed := time.Since(start); elapsed > time.Millisecond*500 {
+		logutil.BgLogger().Warn("buildBatchCopTasksConsistentHash takes too much time",
+			zap.Duration("total elapsed", elapsed),
+			zap.Int("retryNum", retryNum),
+			zap.Duration("splitKeyElapsed", splitKeyElapsed),
+			zap.Duration("fetchTopoElapsed", fetchTopoElapsed),
+			zap.Int("range len", rangesLen),
+			zap.Int("copTaskNum", len(tasks)),
+			zap.Int("batchCopTaskNum", len(res)))
+	}
 	failpointCheckForConsistentHash(res)
 	return res, nil
 }
@@ -1185,15 +1209,23 @@ func buildBatchCopTasksConsistentHashForPD(bo *backoff.Backoffer,
 	storeType kv.StoreType,
 	ttl time.Duration) (res []*batchCopTask, err error) {
 	const cmdType = tikvrpc.CmdBatchCop
-	var retryNum int
+	var (
+		retryNum        int
+		rangesLen       int
+		copTaskNum      int
+		splitKeyElapsed time.Duration
+		getStoreElapsed time.Duration
+	)
 	cache := kvStore.GetRegionCache()
+	start := time.Now()
 
 	for {
 		retryNum++
-		var rangesLen int
+		rangesLen = 0
 		tasks := make([]*copTask, 0)
 		regionIDs := make([]tikv.RegionVerID, 0)
 
+		splitKeyStart := time.Now()
 		for i, ranges := range rangesForEachPhysicalTable {
 			rangesLen += ranges.Len()
 			locations, err := cache.SplitKeyRangesByLocations(bo, ranges, UnspecifiedLimit)
@@ -1211,7 +1243,9 @@ func buildBatchCopTasksConsistentHashForPD(bo *backoff.Backoffer,
 				regionIDs = append(regionIDs, lo.Location.Region)
 			}
 		}
+		splitKeyElapsed += time.Since(splitKeyStart)
 
+		getStoreStart := time.Now()
 		stores, err := cache.GetTiFlashComputeStores(bo.TiKVBackoffer())
 		if err != nil {
 			return nil, err
@@ -1220,13 +1254,14 @@ func buildBatchCopTasksConsistentHashForPD(bo *backoff.Backoffer,
 		if len(stores) == 0 {
 			return nil, errors.New("tiflash_compute node is unavailable")
 		}
+		getStoreElapsed = time.Since(getStoreStart)
 
 		rpcCtxs, err := cache.GetTiFlashComputeRPCContextByConsistentHash(bo.TiKVBackoffer(), regionIDs, stores)
 		if err != nil {
 			return nil, err
 		}
 		if rpcCtxs == nil {
-			logutil.BgLogger().Info("buildBatchCopTasksConsistentHash retry because rcpCtx is nil", zap.Int("retryNum", retryNum))
+			logutil.BgLogger().Info("buildBatchCopTasksConsistentHashForPD retry because rpcCtxs is nil", zap.Int("retryNum", retryNum))
 			err := bo.Backoff(tikv.BoTiFlashRPC(), errors.New("Cannot find region with TiFlash peer"))
 			if err != nil {
 				return nil, errors.Trace(err)
@@ -1236,6 +1271,7 @@ func buildBatchCopTasksConsistentHashForPD(bo *backoff.Backoffer,
 		if len(rpcCtxs) != len(tasks) {
 			return nil, errors.Errorf("length should be equal, len(rpcCtxs): %d, len(tasks): %d", len(rpcCtxs), len(tasks))
 		}
+		copTaskNum = len(tasks)
 		taskMap := make(map[string]*batchCopTask)
 		for i, rpcCtx := range rpcCtxs {
 			regionInfo := RegionInfo{
@@ -1259,10 +1295,31 @@ func buildBatchCopTasksConsistentHashForPD(bo *backoff.Backoffer,
 				res = append(res, batchTask)
 			}
 		}
-		logutil.BgLogger().Info("buildBatchCopTasksConsistentHash done", zap.Any("len(tasks)", len(taskMap)), zap.Any("len(tiflash_compute)", len(stores)))
+		logutil.BgLogger().Info("buildBatchCopTasksConsistentHashForPD done", zap.Any("len(tasks)", len(taskMap)), zap.Any("len(tiflash_compute)", len(stores)))
+		if log.GetLevel() <= zap.DebugLevel {
+			debugStores := make([]string, 0, len(stores))
+			for _, s := range stores {
+				debugStores = append(debugStores, s.GetAddr())
+			}
+			debugTaskMap := make(map[string]string, len(taskMap))
+			for s, b := range taskMap {
+				debugTaskMap[s] = fmt.Sprintf("addr: %s; regionInfos: %v", b.storeAddr, b.regionInfos)
+			}
+			logutil.BgLogger().Debug("detailed info buildBatchCopTasksConsistentHashForPD", zap.Any("taskMap", debugTaskMap), zap.Any("allStores", debugStores))
+		}
 		break
 	}
 
+	if elapsed := time.Since(start); elapsed > time.Millisecond*500 {
+		logutil.BgLogger().Warn("buildBatchCopTasksConsistentHashForPD takes too much time",
+			zap.Duration("total elapsed", elapsed),
+			zap.Int("retryNum", retryNum),
+			zap.Duration("splitKeyElapsed", splitKeyElapsed),
+			zap.Duration("getStoreElapsed", getStoreElapsed),
+			zap.Int("range len", rangesLen),
+			zap.Int("copTaskNum", copTaskNum),
+			zap.Int("batchCopTaskNum", len(res)))
+	}
 	failpointCheckForConsistentHash(res)
 	return res, nil
 }