From 9060b826acef95dc1535ffe2c91282a0722b2220 Mon Sep 17 00:00:00 2001 From: Zixiong Liu Date: Wed, 29 Dec 2021 22:57:50 +0800 Subject: [PATCH 1/5] pkg/p2p, metrics(cdc): add dashboard panels for Peer Messages (#4108) --- cdc/capture/capture.go | 7 + cdc/metrics.go | 2 + metrics/grafana/ticdc.json | 776 ++++++++++++++++++++++++++++++++- pkg/cmd/server/server_test.go | 8 +- pkg/config/config_test_data.go | 4 +- pkg/config/messages.go | 4 +- pkg/p2p/metrics.go | 24 +- pkg/p2p/server.go | 29 +- 8 files changed, 840 insertions(+), 14 deletions(-) diff --git a/cdc/capture/capture.go b/cdc/capture/capture.go index 1008c4b71fe..ed65f53ef2f 100644 --- a/cdc/capture/capture.go +++ b/cdc/capture/capture.go @@ -198,6 +198,13 @@ func (c *Capture) reset(ctx context.Context) error { c.grpcService.Reset(c.MessageServer) messageClientConfig := conf.Debug.Messages.ToMessageClientConfig() + + // Puts the advertise-addr of the local node to the client config. + // This is for metrics purpose only, so that the receiver knows which + // node the connections are from. 
+ advertiseAddr := conf.AdvertiseAddr + messageClientConfig.AdvertisedAddr = advertiseAddr + c.MessageRouter = p2p.NewMessageRouter(c.info.ID, conf.Security, messageClientConfig) } diff --git a/cdc/metrics.go b/cdc/metrics.go index 1a75a316a12..1c998521731 100644 --- a/cdc/metrics.go +++ b/cdc/metrics.go @@ -30,6 +30,7 @@ import ( "github.com/pingcap/tiflow/pkg/db" "github.com/pingcap/tiflow/pkg/etcd" "github.com/pingcap/tiflow/pkg/orchestrator" + "github.com/pingcap/tiflow/pkg/p2p" "github.com/prometheus/client_golang/prometheus" ) @@ -50,6 +51,7 @@ func init() { initServerMetrics(registry) actor.InitMetrics(registry) orchestrator.InitMetrics(registry) + p2p.InitMetrics(registry) // Sorter metrics sorter.InitMetrics(registry) memory.InitMetrics(registry) diff --git a/metrics/grafana/ticdc.json b/metrics/grafana/ticdc.json index 83e9388f2e5..60574864ae7 100644 --- a/metrics/grafana/ticdc.json +++ b/metrics/grafana/ticdc.json @@ -9863,6 +9863,778 @@ "title": "TiKV", "type": "row" }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 294, + "panels": [], + "title": "Peer Messages", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_CDC-CLUSTER1}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 4, + "x": 0, + "y": 8 + }, + "hiddenSeries": false, + "id": 321, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.7", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum without (from) 
(rate(ticdc_message_server_message_count{instance=~\"$capture\"}[30s]))", + "interval": "", + "legendFormat": "{{instance}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Message Receive Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_CDC-CLUSTER1}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 4, + "x": 4, + "y": 8 + }, + "hiddenSeries": false, + "id": 323, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.7", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum without (to) (rate(ticdc_message_client_message_count{instance=~\"$capture\"}[30s]))", + "interval": "", + "legendFormat": "{{instance}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Message Send Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": 
"graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_CDC-CLUSTER1}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 4, + "x": 8, + "y": 8 + }, + "hiddenSeries": false, + "id": 352, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.7", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(ticdc_message_server_message_batch_size_bucket[30s])) by (le))", + "interval": "", + "legendFormat": "p50", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.95, sum(rate(ticdc_message_server_message_batch_size_bucket[30s])) by (le))", + "hide": false, + "interval": "", + "legendFormat": "p90", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ticdc_message_server_message_batch_size_bucket[30s])) by (le))", + "hide": false, + "interval": "", + "legendFormat": "p99", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Message Batch Size", + 
"tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [], + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_CDC-CLUSTER1}", + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 8 + }, + "hiddenSeries": false, + "id": 354, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.7", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(ticdc_message_server_message_batch_bytes_bucket [30s])) by (le))", + "hide": false, + "interval": "", + "legendFormat": "p50", + "refId": "C" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.95, sum(rate(ticdc_message_server_message_batch_bytes_bucket [30s])) by (le))", + "interval": "", + "legendFormat": "p95", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ticdc_message_server_message_batch_bytes_bucket [30s])) by (le))", + "hide": false, + "interval": "", + "legendFormat": "p99", + "refId": "B" + } + ], + "thresholds": [], + 
"timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Receive message batch bytes percentiles", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_CDC-CLUSTER1}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 8 + }, + "hiddenSeries": false, + "id": 356, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.7", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(ticdc_message_server_message_bytes_bucket [30s])) by (le))", + "interval": "", + "legendFormat": "p50", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.95, sum(rate(ticdc_message_server_message_bytes_bucket [30s])) by (le))", + "hide": false, + "interval": "", + "legendFormat": "p95", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ticdc_message_server_message_bytes_bucket [30s])) by (le))", + "hide": false, + "interval": "", + "legendFormat": 
"p99", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Receive Message Bytes Percentile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "columns": [], + "datasource": "${DS_CDC-CLUSTER1}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "left", + "displayMode": "color-background", + "filterable": false + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + }, + { + "color": "dark-blue", + "value": 1 + }, + { + "color": "dark-red", + "value": 2 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "dest\\src" + }, + "properties": [ + { + "id": "custom.width", + "value": 137 + } + ] + } + ] + }, + "fontSize": "100%", + "gridPos": { + "h": 8, + "w": 20, + "x": 0, + "y": 15 + }, + "id": 350, + "options": { + "showHeader": true, + "sortBy": [] + }, + "pageSize": null, + "pluginVersion": "7.5.7", + "scroll": true, + "showHeader": true, + "sort": { + "col": 0, + "desc": true + }, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "date" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "/.*/", + "thresholds": [], + "type": "number", + "unit": "short" + } + ], + "targets": [ + { + "exemplar": true, + 
"expr": "ticdc_message_server_cur_stream_count", + "format": "time_series", + "instant": true, + "interval": "", + "legendFormat": "", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Stream Count Between Nodes", + "transform": "timeseries_to_columns", + "transformations": [ + { + "id": "labelsToFields", + "options": { + "valueLabel": "from" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "instance": false, + "job": true + }, + "indexByName": {}, + "renameByName": { + "instance": "dest\\src" + } + } + } + ], + "type": "table" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_CDC-CLUSTER1}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 4, + "x": 20, + "y": 15 + }, + "hiddenSeries": false, + "id": 358, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.7", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ticdc_message_server_cur_stream_count)", + "interval": "", + "legendFormat": "actual", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "2 * count(process_start_time_seconds{tidb_cluster=\"$tidb_cluster\", job=\"ticdc\"}) - 1", + "hide": false, + "interval": "", + "legendFormat": "expected", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total Stream Count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": 
null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "collapsed": true, "gridPos": { @@ -11011,5 +11783,5 @@ "timezone": "browser", "title": "Test-Cluster-TiCDC", "uid": "YiGL8hBZ1", - "version": 29 -} \ No newline at end of file + "version": 30 +} diff --git a/pkg/cmd/server/server_test.go b/pkg/cmd/server/server_test.go index 1a36d1cc03e..52507ae3228 100644 --- a/pkg/cmd/server/server_test.go +++ b/pkg/cmd/server/server_test.go @@ -198,8 +198,8 @@ func TestParseCfg(t *testing.T) { }, // We expect the default configuration here. Messages: &config.MessagesConfig{ - ClientMaxBatchInterval: config.TomlDuration(time.Millisecond * 100), - ClientMaxBatchSize: 8 * 1024, + ClientMaxBatchInterval: config.TomlDuration(time.Millisecond * 200), + ClientMaxBatchSize: 8 * 1024 * 1024, ClientMaxBatchCount: 128, ClientRetryRateLimit: 1.0, ServerMaxPendingMessageCount: 102400, @@ -474,8 +474,8 @@ cert-allowed-cn = ["dd","ee"] }, // We expect the default configuration here. 
Messages: &config.MessagesConfig{ - ClientMaxBatchInterval: config.TomlDuration(time.Millisecond * 100), - ClientMaxBatchSize: 8 * 1024, + ClientMaxBatchInterval: config.TomlDuration(time.Millisecond * 200), + ClientMaxBatchSize: 8 * 1024 * 1024, ClientMaxBatchCount: 128, ClientRetryRateLimit: 1.0, ServerMaxPendingMessageCount: 102400, diff --git a/pkg/config/config_test_data.go b/pkg/config/config_test_data.go index 68a2d04596b..f254ed076a8 100644 --- a/pkg/config/config_test_data.go +++ b/pkg/config/config_test_data.go @@ -128,8 +128,8 @@ const ( "cleanup-speed-limit": 10000 }, "messages": { - "client-max-batch-interval": 100000000, - "client-max-batch-size": 8192, + "client-max-batch-interval": 200000000, + "client-max-batch-size": 8388608, "client-max-batch-count": 128, "client-retry-rate-limit": 1, "server-max-pending-message-count": 102400, diff --git a/pkg/config/messages.go b/pkg/config/messages.go index 5450c25e9f8..541c074c42c 100644 --- a/pkg/config/messages.go +++ b/pkg/config/messages.go @@ -34,8 +34,8 @@ type MessagesConfig struct { // read only var defaultMessageConfig = &MessagesConfig{ - ClientMaxBatchInterval: TomlDuration(time.Millisecond * 100), - ClientMaxBatchSize: 8 * 1024, // 8MB + ClientMaxBatchInterval: TomlDuration(time.Millisecond * 200), + ClientMaxBatchSize: 8 * 1024 * 1024, // 8MB ClientMaxBatchCount: 128, ClientRetryRateLimit: 1.0, // Once per second ServerMaxPendingMessageCount: 102400, diff --git a/pkg/p2p/metrics.go b/pkg/p2p/metrics.go index 6a1cf2ce67e..d5a53824b54 100644 --- a/pkg/p2p/metrics.go +++ b/pkg/p2p/metrics.go @@ -39,8 +39,26 @@ var ( Namespace: "ticdc", Subsystem: "message_server", Name: "message_batch_size", - Help: "size of message batches received", - Buckets: prometheus.LinearBuckets(0, 5, 16), + Help: "size in number of messages of message batches received", + Buckets: prometheus.ExponentialBuckets(1, 2, 10), + }, []string{"from"}) + + // serverMessageBatchBytesHistogram records the wire sizes as reported by 
protobuf. + serverMessageBatchBytesHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "ticdc", + Subsystem: "message_server", + Name: "message_batch_bytes", + Help: "size in bytes of message batches received", + Buckets: prometheus.ExponentialBuckets(8.0, 2, 16), + }, []string{"from"}) + + // serverMessageBytesHistogram records the wire sizes as reported by protobuf. + serverMessageBytesHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "ticdc", + Subsystem: "message_server", + Name: "message_bytes", + Help: "size in bytes of messages received", + Buckets: prometheus.ExponentialBuckets(8.0, 2, 16), }, []string{"from"}) serverAckCount = prometheus.NewCounterVec(prometheus.CounterOpts{ @@ -89,6 +107,8 @@ func InitMetrics(registry *prometheus.Registry) { registry.MustRegister(serverStreamCount) registry.MustRegister(serverMessageCount) registry.MustRegister(serverMessageBatchHistogram) + registry.MustRegister(serverMessageBytesHistogram) + registry.MustRegister(serverMessageBatchBytesHistogram) registry.MustRegister(serverAckCount) registry.MustRegister(serverRepeatedMessageCount) registry.MustRegister(grpcClientMetrics) diff --git a/pkg/p2p/server.go b/pkg/p2p/server.go index 4fdbcfb25fc..a91f037301d 100755 --- a/pkg/p2p/server.go +++ b/pkg/p2p/server.go @@ -36,6 +36,10 @@ import ( "google.golang.org/grpc/status" ) +const ( + messageServerReportsIndividualMessageSize = true +) + // MessageServerConfig stores configurations for the MessageServer type MessageServerConfig struct { // The maximum number of entries to be cached for topics with no handler registered @@ -530,6 +534,7 @@ func (m *MessageServer) registerPeer( log.Info("peer connection received", zap.String("sender-id", streamMeta.SenderId), + zap.String("sender-advertise-addr", streamMeta.SenderAdvertisedAddr), zap.String("addr", clientIP), zap.Int64("epoch", streamMeta.Epoch)) @@ -697,6 +702,12 @@ func (m *MessageServer) receive(stream 
p2p.CDCPeerToPeer_SendMessageServer, stre metricsServerMessageBatchHistogram := serverMessageBatchHistogram.With(prometheus.Labels{ "from": streamHandle.GetStreamMeta().SenderAdvertisedAddr, }) + metricsServerMessageBatchBytesHistogram := serverMessageBatchBytesHistogram.With(prometheus.Labels{ + "from": streamHandle.GetStreamMeta().SenderAdvertisedAddr, + }) + metricsServerMessageBytesHistogram := serverMessageBytesHistogram.With(prometheus.Labels{ + "from": streamHandle.GetStreamMeta().SenderAdvertisedAddr, + }) for { failpoint.Inject("ServerInjectServerRestart", func() { @@ -714,10 +725,24 @@ func (m *MessageServer) receive(stream p2p.CDCPeerToPeer_SendMessageServer, stre batchSize := len(packet.GetEntries()) log.Debug("received packet", zap.String("streamHandle", streamHandle.GetStreamMeta().SenderId), zap.Int("num-entries", batchSize)) + + batchBytes := packet.Size() + metricsServerMessageBatchBytesHistogram.Observe(float64(batchBytes)) metricsServerMessageBatchHistogram.Observe(float64(batchSize)) + metricsServerMessageCount.Add(float64(batchSize)) + + entries := packet.GetEntries() + if batchSize > 0 { + if messageServerReportsIndividualMessageSize /* true for now */ { + // Note that this can be costly if the number of messages is huge. + // However, the current usage of this package in TiCDC should not + // cause any problem, as the messages are for metadata only. + for _, entry := range entries { + messageWireSize := entry.Size() + metricsServerMessageBytesHistogram.Observe(float64(messageWireSize)) + } + } - if len(packet.GetEntries()) > 0 { - metricsServerMessageCount.Inc() // See the comment above on why use scheduleTaskBlocking. 
if err := m.scheduleTaskBlocking(stream.Context(), taskOnMessageBatch{ streamMeta: streamHandle.GetStreamMeta(), From 2b88e04e96bf63671654aa977228b38b520437a6 Mon Sep 17 00:00:00 2001 From: Ehco Date: Thu, 30 Dec 2021 08:31:50 +0800 Subject: [PATCH 2/5] scheduler(dm): support auto pause/resume task when transfer source (#3889) close pingcap/tiflow#4127 --- dm/_utils/terror_gen/errors_release.txt | 3 +- dm/dm/master/openapi.go | 2 +- dm/dm/master/scheduler/scheduler.go | 157 ++++++++++++++++----- dm/dm/master/scheduler/scheduler_test.go | 73 +++++++--- dm/dm/master/scheduler/worker.go | 24 ++++ dm/dm/master/server.go | 2 +- dm/errors.toml | 8 +- dm/pkg/terror/error_list.go | 66 ++++----- dm/tests/ha/run.sh | 13 +- dm/tests/openapi/client/openapi_task_check | 6 +- dm/tests/openapi/run.sh | 2 +- 11 files changed, 250 insertions(+), 106 deletions(-) diff --git a/dm/_utils/terror_gen/errors_release.txt b/dm/_utils/terror_gen/errors_release.txt index afd95ce815a..5b66bbd6d6a 100644 --- a/dm/_utils/terror_gen/errors_release.txt +++ b/dm/_utils/terror_gen/errors_release.txt @@ -519,7 +519,7 @@ ErrSchedulerSubTaskStageInvalidUpdate,[code=46015:class=dm-master:scope=internal ErrSchedulerSubTaskOpTaskNotExist,[code=46016:class=dm-master:scope=internal:level=medium], "Message: subtasks with name %s need to be operate not exist, Workaround: Please use `query-status` command to see tasks." ErrSchedulerSubTaskOpSourceNotExist,[code=46017:class=dm-master:scope=internal:level=medium], "Message: sources %v need to be operate not exist" ErrSchedulerTaskNotExist,[code=46018:class=scheduler:scope=internal:level=medium], "Message: task with name %s not exist, Workaround: Please use `query-status` command to see tasks." -ErrSchedulerRequireNotRunning,[code=46019:class=scheduler:scope=internal:level=high], "Message: tasks %v on source %s should not be running, Workaround: Please use `pause-task [-s source ...] task` to pause them first." 
+ErrSchedulerRequireRunningTaskInSyncUnit,[code=46019:class=scheduler:scope=internal:level=high], "Message: running tasks %v to be transferred on source %s should in sync unit, Workaround: Please use `pause-task [-s source ...] task` to pause them first." ErrSchedulerRelayWorkersBusy,[code=46020:class=scheduler:scope=internal:level=high], "Message: these workers %s have started relay for sources %s respectively, Workaround: Please use `stop-relay` to stop them, or change your topology." ErrSchedulerRelayWorkersWrongBound,[code=46021:class=scheduler:scope=internal:level=high], "Message: these workers %s have bound for another sources %s respectively, Workaround: Please `start-relay` on free or same source workers." ErrSchedulerRelayWorkersWrongRelay,[code=46022:class=scheduler:scope=internal:level=high], "Message: these workers %s have started relay for another sources %s respectively, Workaround: Please correct sources in `stop-relay`." @@ -532,6 +532,7 @@ ErrSchedulerStartRelayOnSpecified,[code=46028:class=scheduler:scope=internal:lev ErrSchedulerStopRelayOnSpecified,[code=46029:class=scheduler:scope=internal:level=low], "Message: the source has `start-relay` with worker name for workers %v, so it can't `stop-relay` without worker name now, Workaround: Please specify worker names for `stop-relay`." ErrSchedulerStartRelayOnBound,[code=46030:class=scheduler:scope=internal:level=low], "Message: the source has `start-relay` automatically for bound worker, so it can't `start-relay` with worker name now, Workaround: Please stop relay by `stop-relay` without worker name first." ErrSchedulerStopRelayOnBound,[code=46031:class=scheduler:scope=internal:level=low], "Message: the source has `start-relay` automatically for bound worker, so it can't `stop-relay` with worker name now, Workaround: Please use `stop-relay` without worker name." 
+ErrSchedulerPauseTaskForTransferSource,[code=46032:class=scheduler:scope=internal:level=low], "Message: failed to auto pause tasks %s when transfer-source, Workaround: Please pause task by `dmctl pause-task`." ErrCtlGRPCCreateConn,[code=48001:class=dmctl:scope=internal:level=high], "Message: can not create grpc connection, Workaround: Please check your network connection." ErrCtlInvalidTLSCfg,[code=48002:class=dmctl:scope=internal:level=medium], "Message: invalid TLS config, Workaround: Please check the `ssl-ca`, `ssl-cert` and `ssl-key` config in command line." ErrCtlLoadTLSCfg,[code=48003:class=dmctl:scope=internal:level=high], "Message: can not load tls config, Workaround: Please ensure that the tls certificate is accessible on the node currently running dmctl." diff --git a/dm/dm/master/openapi.go b/dm/dm/master/openapi.go index 610265e0ea3..3fc03950243 100644 --- a/dm/dm/master/openapi.go +++ b/dm/dm/master/openapi.go @@ -400,7 +400,7 @@ func (s *Server) DMAPITransferSource(c *gin.Context, sourceName string) { _ = c.Error(err) return } - if err := s.scheduler.TransferSource(sourceName, req.WorkerName); err != nil { + if err := s.scheduler.TransferSource(c.Request.Context(), sourceName, req.WorkerName); err != nil { _ = c.Error(err) } } diff --git a/dm/dm/master/scheduler/scheduler.go b/dm/dm/master/scheduler/scheduler.go index 3e32006fb6a..b2062ac7ece 100644 --- a/dm/dm/master/scheduler/scheduler.go +++ b/dm/dm/master/scheduler/scheduler.go @@ -15,13 +15,14 @@ package scheduler import ( "context" - "errors" "sort" "sync" "time" + "github.com/pingcap/errors" "github.com/pingcap/failpoint" "go.etcd.io/etcd/clientv3" + "go.uber.org/atomic" "go.uber.org/zap" "github.com/pingcap/tiflow/dm/dm/config" @@ -35,6 +36,10 @@ import ( "github.com/pingcap/tiflow/dm/pkg/utils" ) +const ( + maxQueryWorkerRetryNum = 10 +) + // Scheduler schedules tasks for DM-worker instances, including: // - register/unregister DM-worker instances. 
// - observe the online/offline status of DM-worker instances. @@ -84,7 +89,7 @@ type Scheduler struct { logger log.Logger - started bool // whether the scheduler already started for work. + started atomic.Bool // whether the scheduler already started for work. cancel context.CancelFunc wg sync.WaitGroup @@ -207,7 +212,7 @@ func (s *Scheduler) Start(pCtx context.Context, etcdCli *clientv3.Client) (err e s.mu.Unlock() }() - if s.started { + if s.started.Load() { return terror.ErrSchedulerStarted.Generate() } @@ -273,7 +278,7 @@ func (s *Scheduler) Start(pCtx context.Context, etcdCli *clientv3.Client) (err e s.observeLoadTask(ctx, etcdCli, rev1) }(loadTaskRev) - s.started = true // started now + s.started.Store(true) // started now s.cancel = cancel s.logger.Info("the scheduler has started") return nil @@ -283,7 +288,7 @@ func (s *Scheduler) Start(pCtx context.Context, etcdCli *clientv3.Client) (err e func (s *Scheduler) Close() { s.mu.Lock() - if !s.started { + if !s.started.Load() { s.mu.Unlock() return } @@ -301,7 +306,7 @@ func (s *Scheduler) Close() { s.mu.Lock() defer s.mu.Unlock() - s.started = false // closed now. + s.started.Store(false) // closed now. 
s.logger.Info("the scheduler has closed") } @@ -318,7 +323,7 @@ func (s *Scheduler) AddSourceCfg(cfg *config.SourceConfig) error { s.mu.Lock() defer s.mu.Unlock() - if !s.started { + if !s.started.Load() { return terror.ErrSchedulerNotStarted.Generate() } @@ -348,7 +353,7 @@ func (s *Scheduler) UpdateSourceCfg(cfg *config.SourceConfig) error { s.mu.Lock() defer s.mu.Unlock() - if !s.started { + if !s.started.Load() { return terror.ErrSchedulerNotStarted.Generate() } @@ -392,7 +397,7 @@ func (s *Scheduler) RemoveSourceCfg(source string) error { s.mu.Lock() defer s.mu.Unlock() - if !s.started { + if !s.started.Load() { return terror.ErrSchedulerNotStarted.Generate() } @@ -620,26 +625,27 @@ func (s *Scheduler) transferWorkerAndSource(lworker, lsource, rworker, rsource s // TransferSource unbinds the `source` and binds it to a free or same-source-relay `worker`. // If fails halfway, the old worker should try recover. -func (s *Scheduler) TransferSource(source, worker string) error { - s.mu.Lock() - defer s.mu.Unlock() - - if !s.started { +func (s *Scheduler) TransferSource(ctx context.Context, source, worker string) error { + if !s.started.Load() { return terror.ErrSchedulerNotStarted.Generate() } - + s.mu.RLock() // 1. check existence or no need if _, ok := s.sourceCfgs[source]; !ok { + s.mu.RUnlock() return terror.ErrSchedulerSourceCfgNotExist.Generate(source) } w, ok := s.workers[worker] if !ok { + s.mu.RUnlock() return terror.ErrSchedulerWorkerNotExist.Generate(worker) } oldWorker, hasOldWorker := s.bounds[source] if hasOldWorker && oldWorker.BaseInfo().Name == worker { + s.mu.RUnlock() return nil } + s.mu.RUnlock() // 2. check new worker is free and not started relay for another source switch w.Stage() { @@ -660,7 +666,7 @@ func (s *Scheduler) TransferSource(source, worker string) error { return s.boundSourceToWorker(source, w) } - // 4. if there's old worker, make sure it's not running + // 4. 
check if old worker has running tasks var runningTasks []string s.expectSubTaskStages.Range(func(k, v interface{}) bool { task := k.(string) @@ -674,17 +680,42 @@ func (s *Scheduler) TransferSource(source, worker string) error { } return true }) - if len(runningTasks) > 0 { - return terror.ErrSchedulerRequireNotRunning.Generate(runningTasks, source) + // we only allow automatically transfer-source if all subtasks are in the sync phase. + resp, err := oldWorker.queryStatus(ctx) + if err != nil { + return terror.Annotatef(err, "failed to query worker: %s status err", oldWorker.baseInfo.Name) + } + for _, status := range resp.QueryStatus.GetSubTaskStatus() { + if status.GetUnit() != pb.UnitType_Sync { + return terror.ErrSchedulerRequireRunningTaskInSyncUnit.Generate(runningTasks, source) + } + } + // pause running tasks + if batchPauseErr := s.batchOperateTaskOnWorker(ctx, oldWorker, runningTasks, source, pb.Stage_Paused, true); batchPauseErr != nil { + return batchPauseErr + } + // we need resume tasks that we just paused, we use another goroutine to do this because if error happens + // just logging this message and let user handle it manually + defer func() { + go func() { + if err := s.batchOperateTaskOnWorker(context.Background(), w, runningTasks, source, pb.Stage_Running, false); err != nil { + s.logger.Warn( + "auto resume task failed", zap.Any("tasks", runningTasks), + zap.String("source", source), zap.String("worker", worker), zap.Error(err)) + } + }() + }() } // 5. 
replace the source bound failpoint.Inject("failToReplaceSourceBound", func(_ failpoint.Value) { failpoint.Return(errors.New("failToPutSourceBound")) }) + s.mu.Lock() _, err := ha.ReplaceSourceBound(s.etcdCli, source, oldWorker.BaseInfo().Name, worker) if err != nil { + s.mu.Unlock() return err } if err2 := oldWorker.Unbound(); err2 != nil { @@ -693,15 +724,75 @@ func (s *Scheduler) TransferSource(source, worker string) error { if err2 := s.updateStatusToBound(w, ha.NewSourceBound(source, worker)); err2 != nil { s.logger.DPanic("we have checked w.stage is free, so there should not be an error", zap.Error(err2)) } - - // 6. try bound the old worker + // 6. now this old worker is free, try bound source to it _, err = s.tryBoundForWorker(oldWorker) if err != nil { s.logger.Warn("in transfer source, error when try bound the old worker", zap.Error(err)) } + s.mu.Unlock() return nil } +// batchOperateTaskOnWorker batch operate tasks in one worker and use query-status to make sure all tasks are in expected stage if needWait=true. 
+func (s *Scheduler) batchOperateTaskOnWorker( + ctx context.Context, worker *Worker, tasks []string, source string, stage pb.Stage, needWait bool) error { + for _, taskName := range tasks { + if err := s.UpdateExpectSubTaskStage(stage, taskName, source); err != nil { + return err + } + } + if !needWait { + return nil + } + // wait all tasks are in expected stage before actually starting scheduling +WaitLoop: + for retry := 0; retry < maxQueryWorkerRetryNum; retry++ { + resp, err := worker.queryStatus(ctx) + if err != nil { + return terror.Annotatef(err, "failed to query worker: %s status", worker.baseInfo.Name) + } + + failpoint.Inject("batchOperateTaskOnWorkerMustRetry", func(v failpoint.Value) { + if retry < v.(int) { + resp.QueryStatus.SubTaskStatus[0].Stage = pb.Stage_InvalidStage + log.L().Info("batchOperateTaskOnWorkerMustRetry failpoint triggered", zap.Int("retry", retry)) + } else { + log.L().Info("batchOperateTaskOnWorkerMustRetry passed", zap.Int("retry", retry)) + } + }) + + for _, status := range resp.QueryStatus.GetSubTaskStatus() { + if status == nil { + // this should not happen when rpc logic in server side not changed + return errors.Errorf("expect a query-status with subtask status but got a nil, resp %v", resp) + } + if status.Stage != stage { + // NOTE: the defaultRPCTimeout is 10m, use 1s * retry times to increase the waiting time + sleepTime := time.Second * time.Duration(maxQueryWorkerRetryNum-retry) + s.logger.Info( + "waiting task", + zap.String("task", status.Name), + zap.Int("retry times", retry), + zap.Duration("sleep time", sleepTime), + zap.String("want stage", stage.String()), + zap.String("current stage", status.Stage.String()), + ) + failpoint.Inject("skipBatchOperateTaskOnWorkerSleep", func(_ failpoint.Value) { + failpoint.Continue("WaitLoop") + }) + select { + case <-ctx.Done(): + return terror.Annotatef(err, "failed to wait task on worker: %s because context is canceled", worker.baseInfo.Name) + case <-time.After(sleepTime): + 
continue WaitLoop + } + } + } + return nil // all task are in expected stage + } + return terror.ErrSchedulerPauseTaskForTransferSource.Generate(tasks) // failed to pause tasks, need user to handle it manually +} + // AcquireSubtaskLatch tries acquiring a latch for subtask name. func (s *Scheduler) AcquireSubtaskLatch(name string) (ReleaseFunc, error) { return s.subtaskLatch.tryAcquire(name) @@ -714,7 +805,7 @@ func (s *Scheduler) AddSubTasks(latched bool, cfgs ...config.SubTaskConfig) erro s.mu.RLock() defer s.mu.RUnlock() - if !s.started { + if !s.started.Load() { return terror.ErrSchedulerNotStarted.Generate() } @@ -811,7 +902,7 @@ func (s *Scheduler) AddSubTasks(latched bool, cfgs ...config.SubTaskConfig) erro // RemoveSubTasks removes the information of one or more subtasks for one task. func (s *Scheduler) RemoveSubTasks(task string, sources ...string) error { - if !s.started { + if !s.started.Load() { return terror.ErrSchedulerNotStarted.Generate() } @@ -954,7 +1045,7 @@ func (s *Scheduler) AddWorker(name, addr string) error { s.mu.Lock() defer s.mu.Unlock() - if !s.started { + if !s.started.Load() { return terror.ErrSchedulerNotStarted.Generate() } @@ -988,7 +1079,7 @@ func (s *Scheduler) RemoveWorker(name string) error { s.mu.Lock() defer s.mu.Unlock() - if !s.started { + if !s.started.Load() { return terror.ErrSchedulerNotStarted.Generate() } @@ -1013,7 +1104,7 @@ func (s *Scheduler) GetAllWorkers() ([]*Worker, error) { s.mu.RLock() defer s.mu.RUnlock() - if !s.started { + if !s.started.Load() { return nil, terror.ErrSchedulerNotStarted.Generate() } @@ -1068,7 +1159,7 @@ func (s *Scheduler) StartRelay(source string, workers []string) error { s.mu.Lock() defer s.mu.Unlock() - if !s.started { + if !s.started.Load() { return terror.ErrSchedulerNotStarted.Generate() } @@ -1182,7 +1273,7 @@ func (s *Scheduler) StopRelay(source string, workers []string) error { s.mu.Lock() defer s.mu.Unlock() - if !s.started { + if !s.started.Load() { return 
terror.ErrSchedulerNotStarted.Generate() } @@ -1280,7 +1371,7 @@ func (s *Scheduler) GetRelayWorkers(source string) ([]*Worker, error) { s.mu.RLock() defer s.mu.RUnlock() - if !s.started { + if !s.started.Load() { return nil, terror.ErrSchedulerNotStarted.Generate() } @@ -1312,7 +1403,7 @@ func (s *Scheduler) UpdateExpectRelayStage(newStage pb.Stage, sources ...string) s.mu.Lock() defer s.mu.Unlock() - if !s.started { + if !s.started.Load() { return terror.ErrSchedulerNotStarted.Generate() } @@ -1391,7 +1482,7 @@ func (s *Scheduler) GetExpectRelayStage(source string) ha.Stage { // because some user may want to update `{Running, Paused, ...}` to `{Running, Running, ...}`. // so, this should be also supported in DM-worker. func (s *Scheduler) UpdateExpectSubTaskStage(newStage pb.Stage, task string, sources ...string) error { - if !s.started { + if !s.started.Load() { return terror.ErrSchedulerNotStarted.Generate() } @@ -1483,9 +1574,7 @@ func (s *Scheduler) GetExpectSubTaskStage(task, source string) ha.Stage { // Started returns if the scheduler is started. func (s *Scheduler) Started() bool { - s.mu.RLock() - defer s.mu.RUnlock() - return s.started + return s.started.Load() } // recoverSourceCfgs recovers history source configs and expectant relay stages from etcd. 
@@ -2273,7 +2362,7 @@ func (s *Scheduler) RemoveLoadTask(task string) error { s.mu.Lock() defer s.mu.Unlock() - if !s.started { + if !s.started.Load() { return terror.ErrSchedulerNotStarted.Generate() } _, _, err := ha.DelLoadTaskByTask(s.etcdCli, task) diff --git a/dm/dm/master/scheduler/scheduler_test.go b/dm/dm/master/scheduler/scheduler_test.go index e2d13b0ea9d..cc22cbd43c6 100644 --- a/dm/dm/master/scheduler/scheduler_test.go +++ b/dm/dm/master/scheduler/scheduler_test.go @@ -902,7 +902,7 @@ func (t *testScheduler) TestWatchWorkerEventEtcdCompact(c *C) { defer cancel() // step 1: start an empty scheduler without listening the worker event - s.started = true + s.started.Store(true) s.cancel = cancel s.etcdCli = etcdTestCli @@ -1044,7 +1044,7 @@ func (t *testScheduler) TestLastBound(c *C) { worker4 := &Worker{baseInfo: ha.WorkerInfo{Name: workerName4}} // step 1: start an empty scheduler without listening the worker event - s.started = true + s.started.Store(true) s.etcdCli = etcdTestCli s.workers[workerName1] = worker1 s.workers[workerName2] = worker2 @@ -1114,7 +1114,7 @@ func (t *testScheduler) TestInvalidLastBound(c *C) { worker1 := &Worker{baseInfo: ha.WorkerInfo{Name: workerName1}} // step 1: start an empty scheduler without listening the worker event - s.started = true + s.started.Store(true) s.etcdCli = etcdTestCli s.workers[workerName1] = worker1 // sourceID2 doesn't have a source config and not in unbound @@ -1151,7 +1151,7 @@ func (t *testScheduler) TestTransferSource(c *C) { worker4 := &Worker{baseInfo: ha.WorkerInfo{Name: workerName4}} // step 1: start an empty scheduler - s.started = true + s.started.Store(true) s.etcdCli = etcdTestCli s.workers[workerName1] = worker1 s.workers[workerName2] = worker2 @@ -1171,38 +1171,39 @@ func (t *testScheduler) TestTransferSource(c *C) { worker3.ToFree() worker4.ToFree() + ctx := context.Background() // test invalid transfer: source not exists - c.Assert(s.TransferSource("not-exist", workerName3), NotNil) + 
c.Assert(s.TransferSource(ctx, "not-exist", workerName3), NotNil) // test valid transfer: source -> worker = bound -> free - c.Assert(s.TransferSource(sourceID1, workerName4), IsNil) + c.Assert(s.TransferSource(ctx, sourceID1, workerName4), IsNil) c.Assert(s.bounds[sourceID1], DeepEquals, worker4) c.Assert(worker1.Stage(), Equals, WorkerFree) // test valid transfer: source -> worker = unbound -> free s.sourceCfgs[sourceID3] = &config.SourceConfig{} s.unbounds[sourceID3] = struct{}{} - c.Assert(s.TransferSource(sourceID3, workerName3), IsNil) + c.Assert(s.TransferSource(ctx, sourceID3, workerName3), IsNil) c.Assert(s.bounds[sourceID3], DeepEquals, worker3) // test valid transfer: self - c.Assert(s.TransferSource(sourceID3, workerName3), IsNil) + c.Assert(s.TransferSource(ctx, sourceID3, workerName3), IsNil) c.Assert(s.bounds[sourceID3], DeepEquals, worker3) // test invalid transfer: source -> worker = bound -> bound - c.Assert(s.TransferSource(sourceID1, workerName3), NotNil) + c.Assert(s.TransferSource(ctx, sourceID1, workerName3), NotNil) c.Assert(s.bounds[sourceID1], DeepEquals, worker4) c.Assert(s.bounds[sourceID3], DeepEquals, worker3) // test invalid transfer: source -> worker = bound -> offline worker1.ToOffline() - c.Assert(s.TransferSource(sourceID1, workerName1), NotNil) + c.Assert(s.TransferSource(ctx, sourceID1, workerName1), NotNil) c.Assert(s.bounds[sourceID1], DeepEquals, worker4) // test invalid transfer: source -> worker = unbound -> bound s.sourceCfgs[sourceID4] = &config.SourceConfig{} s.unbounds[sourceID4] = struct{}{} - c.Assert(s.TransferSource(sourceID4, workerName3), NotNil) + c.Assert(s.TransferSource(ctx, sourceID4, workerName3), NotNil) c.Assert(s.bounds[sourceID3], DeepEquals, worker3) delete(s.unbounds, sourceID4) delete(s.sourceCfgs, sourceID4) @@ -1212,16 +1213,44 @@ func (t *testScheduler) TestTransferSource(c *C) { // test fail halfway won't left old worker unbound 
c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/dm/master/scheduler/failToReplaceSourceBound", `return()`), IsNil) - c.Assert(s.TransferSource(sourceID1, workerName1), NotNil) + c.Assert(s.TransferSource(ctx, sourceID1, workerName1), NotNil) c.Assert(s.bounds[sourceID1], DeepEquals, worker4) c.Assert(worker1.Stage(), Equals, WorkerFree) c.Assert(failpoint.Disable("github.com/pingcap/tiflow/dm/dm/master/scheduler/failToReplaceSourceBound"), IsNil) - // test can't transfer when there's any running task on the source + // set running tasks s.expectSubTaskStages.Store("test", map[string]ha.Stage{sourceID1: {Expect: pb.Stage_Running}}) - c.Assert(s.TransferSource(sourceID1, workerName1), NotNil) - c.Assert(s.bounds[sourceID1], DeepEquals, worker4) - c.Assert(worker1.Stage(), Equals, WorkerFree) + + // test can't transfer when running tasks not in sync unit + c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/dm/master/scheduler/operateWorkerQueryStatus", `return("notInSyncUnit")`), IsNil) + defer failpoint.Disable("github.com/pingcap/tiflow/dm/dm/master/scheduler/operateWorkerQueryStatus") //nolint:errcheck + c.Assert(terror.ErrSchedulerRequireRunningTaskInSyncUnit.Equal(s.TransferSource(ctx, sourceID1, workerName1)), IsTrue) + c.Assert(failpoint.Disable("github.com/pingcap/tiflow/dm/dm/master/scheduler/operateWorkerQueryStatus"), IsNil) + + // test can't transfer when query status met error + c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/dm/master/scheduler/operateWorkerQueryStatus", `return("error")`), IsNil) + c.Assert(s.TransferSource(ctx, sourceID1, workerName1), ErrorMatches, "failed to query worker.*") + c.Assert(failpoint.Disable("github.com/pingcap/tiflow/dm/dm/master/scheduler/operateWorkerQueryStatus"), IsNil) + + // test can transfer when all running task is in sync unit + c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/dm/master/scheduler/skipBatchOperateTaskOnWorkerSleep", `return()`), IsNil) + defer 
failpoint.Disable("github.com/pingcap/tiflow/dm/dm/master/scheduler/skipBatchOperateTaskOnWorkerSleep") //nolint:errcheck + c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/dm/master/scheduler/operateWorkerQueryStatus", `return("allTaskIsPaused")`), IsNil) + + // we only retry 10 times, open a failpoint to make need retry more than 10 times, so this transfer will fail + c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/dm/master/scheduler/batchOperateTaskOnWorkerMustRetry", `return(11)`), IsNil) + c.Assert(terror.ErrSchedulerPauseTaskForTransferSource.Equal(s.TransferSource(ctx, sourceID1, workerName1)), IsTrue) + c.Assert(failpoint.Disable("github.com/pingcap/tiflow/dm/dm/master/scheduler/batchOperateTaskOnWorkerMustRetry"), IsNil) + + // now we can transfer successfully after 2 times retry + s.expectSubTaskStages.Store("test", map[string]ha.Stage{sourceID1: {Expect: pb.Stage_Running}}) + c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/dm/master/scheduler/batchOperateTaskOnWorkerMustRetry", `return(2)`), IsNil) + c.Assert(s.TransferSource(ctx, sourceID1, workerName1), IsNil) + c.Assert(failpoint.Disable("github.com/pingcap/tiflow/dm/dm/master/scheduler/batchOperateTaskOnWorkerMustRetry"), IsNil) + c.Assert(s.bounds[sourceID1], DeepEquals, worker1) + c.Assert(worker1.Stage(), Equals, WorkerBound) + c.Assert(failpoint.Disable("github.com/pingcap/tiflow/dm/dm/master/scheduler/operateWorkerQueryStatus"), IsNil) + c.Assert(failpoint.Disable("github.com/pingcap/tiflow/dm/dm/master/scheduler/skipBatchOperateTaskOnWorkerSleep"), IsNil) } func (t *testScheduler) TestStartStopRelay(c *C) { @@ -1246,7 +1275,7 @@ func (t *testScheduler) TestStartStopRelay(c *C) { worker4 := &Worker{baseInfo: ha.WorkerInfo{Name: workerName4}} // step 1: start an empty scheduler - s.started = true + s.started.Store(true) s.etcdCli = etcdTestCli s.workers[workerName1] = worker1 s.workers[workerName2] = worker2 @@ -1368,7 +1397,7 @@ func (t *testScheduler) 
TestRelayWithWithoutWorker(c *C) { worker2 := &Worker{baseInfo: ha.WorkerInfo{Name: workerName2}} // step 1: start an empty scheduler - s.started = true + s.started.Store(true) s.etcdCli = etcdTestCli s.workers[workerName1] = worker1 s.workers[workerName2] = worker2 @@ -1489,7 +1518,7 @@ func (t *testScheduler) TestStartSourcesWithoutSourceConfigsInEtcd(c *C) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - s.started = true + s.started.Store(true) s.etcdCli = etcdTestCli // found source configs before bound s.sourceCfgs[sourceID1] = &config.SourceConfig{} @@ -1518,7 +1547,7 @@ func (t *testScheduler) TestStartSourcesWithoutSourceConfigsInEtcd(c *C) { c.Assert(err, IsNil) c.Assert(bounded, IsTrue) - s.started = false + s.started.Store(false) sbm, _, err := ha.GetSourceBound(etcdTestCli, "") c.Assert(err, IsNil) c.Assert(sbm, HasLen, 2) @@ -1561,7 +1590,7 @@ func (t *testScheduler) TestTransferWorkerAndSource(c *C) { worker4 := &Worker{baseInfo: ha.WorkerInfo{Name: workerName4}} // step 1: start an empty scheduler - s.started = true + s.started.Store(true) s.etcdCli = etcdTestCli s.workers[workerName1] = worker1 s.workers[workerName2] = worker2 @@ -1640,7 +1669,7 @@ func (t *testScheduler) TestWatchLoadTask(c *C) { ) // step 1: start an empty scheduler - s.started = true + s.started.Store(true) s.etcdCli = etcdTestCli worker1 := &Worker{baseInfo: ha.WorkerInfo{Name: workerName1}} diff --git a/dm/dm/master/scheduler/worker.go b/dm/dm/master/scheduler/worker.go index cbac851ef33..6544310fbcd 100644 --- a/dm/dm/master/scheduler/worker.go +++ b/dm/dm/master/scheduler/worker.go @@ -18,11 +18,14 @@ import ( "sync" "time" + "github.com/pingcap/errors" "go.uber.org/zap" + "github.com/pingcap/failpoint" "github.com/pingcap/tiflow/dm/dm/config" "github.com/pingcap/tiflow/dm/dm/master/metrics" "github.com/pingcap/tiflow/dm/dm/master/workerrpc" + "github.com/pingcap/tiflow/dm/dm/pb" "github.com/pingcap/tiflow/dm/pkg/ha" 
"github.com/pingcap/tiflow/dm/pkg/log" "github.com/pingcap/tiflow/dm/pkg/terror" @@ -249,6 +252,27 @@ func (w *Worker) reportMetrics() { metrics.ReportWorkerStage(w.baseInfo.Name, s) } +func (w *Worker) queryStatus(ctx context.Context) (*workerrpc.Response, error) { + rpcTimeOut := time.Second * 10 // we relay on ctx.Done() to cancel the rpc, so just set a very long timeout + req := &workerrpc.Request{Type: workerrpc.CmdQueryStatus, QueryStatus: &pb.QueryStatusRequest{}} + failpoint.Inject("operateWorkerQueryStatus", func(v failpoint.Value) { + resp := &workerrpc.Response{Type: workerrpc.CmdQueryStatus, QueryStatus: &pb.QueryStatusResponse{}} + switch v.(string) { + case "notInSyncUnit": + resp.QueryStatus.SubTaskStatus = append( + resp.QueryStatus.SubTaskStatus, &pb.SubTaskStatus{Unit: pb.UnitType_Dump}) + failpoint.Return(resp, nil) + case "allTaskIsPaused": + resp.QueryStatus.SubTaskStatus = append( + resp.QueryStatus.SubTaskStatus, &pb.SubTaskStatus{Stage: pb.Stage_Paused, Unit: pb.UnitType_Sync}) + failpoint.Return(resp, nil) + default: + failpoint.Return(nil, errors.New("query error")) + } + }) + return w.SendRequest(ctx, req, rpcTimeOut) +} + // NewMockWorker is used in tests. func NewMockWorker(cli workerrpc.Client) *Worker { return &Worker{cli: cli} diff --git a/dm/dm/master/server.go b/dm/dm/master/server.go index e63331498c3..a8fc484a984 100644 --- a/dm/dm/master/server.go +++ b/dm/dm/master/server.go @@ -2323,7 +2323,7 @@ func (s *Server) TransferSource(ctx context.Context, req *pb.TransferSourceReque return resp2, err2 } - err := s.scheduler.TransferSource(req.Source, req.Worker) + err := s.scheduler.TransferSource(ctx, req.Source, req.Worker) if err != nil { resp2.Msg = err.Error() // nolint:nilerr diff --git a/dm/errors.toml b/dm/errors.toml index e964a55155b..d5fb121ee03 100644 --- a/dm/errors.toml +++ b/dm/errors.toml @@ -3125,7 +3125,7 @@ workaround = "Please use `query-status` command to see tasks." 
tags = ["internal", "medium"] [error.DM-scheduler-46019] -message = "tasks %v on source %s should not be running" +message = "running tasks %v to be transferred on source %s should in sync unit" description = "" workaround = "Please use `pause-task [-s source ...] task` to pause them first." tags = ["internal", "high"] @@ -3202,6 +3202,12 @@ description = "" workaround = "Please use `stop-relay` without worker name." tags = ["internal", "low"] +[error.DM-scheduler-46032] +message = "failed to auto pause tasks %s when transfer-source" +description = "" +workaround = "Please pause task by `dmctl pause-task`." +tags = ["internal", "low"] + [error.DM-dmctl-48001] message = "can not create grpc connection" description = "" diff --git a/dm/pkg/terror/error_list.go b/dm/pkg/terror/error_list.go index 40ec97749ea..77e086235d9 100644 --- a/dm/pkg/terror/error_list.go +++ b/dm/pkg/terror/error_list.go @@ -636,7 +636,7 @@ const ( codeSchedulerSubTaskOpTaskNotExist codeSchedulerSubTaskOpSourceNotExist codeSchedulerTaskNotExist - codeSchedulerRequireNotRunning + codeSchedulerRequireRunningTaskInSyncUnit codeSchedulerRelayWorkersBusy codeSchedulerRelayWorkersBound codeSchedulerRelayWorkersWrongRelay @@ -649,6 +649,7 @@ const ( codeSchedulerStopRelayOnSpecified codeSchedulerStartRelayOnBound codeSchedulerStopRelayOnBound + codeSchedulerPauseTaskForTransferSource ) // dmctl error code. @@ -1266,37 +1267,38 @@ var ( "failed to fetch downstream table %v by show create table statement in schema tracker", "") // HA scheduler. 
- ErrSchedulerNotStarted = New(codeSchedulerNotStarted, ClassScheduler, ScopeInternal, LevelHigh, "the scheduler has not started", "") - ErrSchedulerStarted = New(codeSchedulerStarted, ClassScheduler, ScopeInternal, LevelMedium, "the scheduler has already started", "") - ErrSchedulerWorkerExist = New(codeSchedulerWorkerExist, ClassScheduler, ScopeInternal, LevelMedium, "dm-worker with name %s already exists", "") - ErrSchedulerWorkerNotExist = New(codeSchedulerWorkerNotExist, ClassScheduler, ScopeInternal, LevelMedium, "dm-worker with name %s not exists", "") - ErrSchedulerWorkerOnline = New(codeSchedulerWorkerOnline, ClassScheduler, ScopeInternal, LevelMedium, "dm-worker with name %s is still online", "Please shut it down first.") - ErrSchedulerWorkerInvalidTrans = New(codeSchedulerWorkerInvalidTrans, ClassScheduler, ScopeInternal, LevelMedium, "invalid stage transformation for dm-worker %s, from %s to %s", "") - ErrSchedulerSourceCfgExist = New(codeSchedulerSourceCfgExist, ClassScheduler, ScopeInternal, LevelMedium, "source config with ID %s already exists", "") - ErrSchedulerSourceCfgNotExist = New(codeSchedulerSourceCfgNotExist, ClassScheduler, ScopeInternal, LevelMedium, "source config with ID %s not exists", "") - ErrSchedulerSourcesUnbound = New(codeSchedulerSourcesUnbound, ClassDMMaster, ScopeInternal, LevelMedium, "sources %v have not bound", "") - ErrSchedulerSourceOpTaskExist = New(codeSchedulerSourceOpTaskExist, ClassDMMaster, ScopeInternal, LevelMedium, "source with name %s need to operate has existing tasks %v", "Please `stop-task` first.") - ErrSchedulerRelayStageInvalidUpdate = New(codeSchedulerRelayStageInvalidUpdate, ClassScheduler, ScopeInternal, LevelMedium, "invalid new expectant relay stage %s", "") - ErrSchedulerRelayStageSourceNotExist = New(codeSchedulerRelayStageSourceNotExist, ClassScheduler, ScopeInternal, LevelMedium, "sources %v need to update expectant relay stage not exist", "") - ErrSchedulerMultiTask = New(codeSchedulerMultiTask, 
ClassScheduler, ScopeInternal, LevelMedium, "the scheduler cannot perform multiple different tasks %v in one operation", "") - ErrSchedulerSubTaskExist = New(codeSchedulerSubTaskExist, ClassScheduler, ScopeInternal, LevelMedium, "subtasks with name %s for sources %v already exist", "Please use `query-status` command to see tasks.") - ErrSchedulerSubTaskStageInvalidUpdate = New(codeSchedulerSubTaskStageInvalidUpdate, ClassDMMaster, ScopeInternal, LevelMedium, "invalid new expectant subtask stage %s", "") - ErrSchedulerSubTaskOpTaskNotExist = New(codeSchedulerSubTaskOpTaskNotExist, ClassDMMaster, ScopeInternal, LevelMedium, "subtasks with name %s need to be operate not exist", "Please use `query-status` command to see tasks.") - ErrSchedulerSubTaskOpSourceNotExist = New(codeSchedulerSubTaskOpSourceNotExist, ClassDMMaster, ScopeInternal, LevelMedium, "sources %v need to be operate not exist", "") - ErrSchedulerTaskNotExist = New(codeSchedulerTaskNotExist, ClassScheduler, ScopeInternal, LevelMedium, "task with name %s not exist", "Please use `query-status` command to see tasks.") - ErrSchedulerRequireNotRunning = New(codeSchedulerRequireNotRunning, ClassScheduler, ScopeInternal, LevelHigh, "tasks %v on source %s should not be running", "Please use `pause-task [-s source ...] 
task` to pause them first.") - ErrSchedulerRelayWorkersBusy = New(codeSchedulerRelayWorkersBusy, ClassScheduler, ScopeInternal, LevelHigh, "these workers %s have started relay for sources %s respectively", "Please use `stop-relay` to stop them, or change your topology.") - ErrSchedulerRelayWorkersWrongBound = New(codeSchedulerRelayWorkersBound, ClassScheduler, ScopeInternal, LevelHigh, "these workers %s have bound for another sources %s respectively", "Please `start-relay` on free or same source workers.") - ErrSchedulerRelayWorkersWrongRelay = New(codeSchedulerRelayWorkersWrongRelay, ClassScheduler, ScopeInternal, LevelHigh, "these workers %s have started relay for another sources %s respectively", "Please correct sources in `stop-relay`.") - ErrSchedulerSourceOpRelayExist = New(codeSchedulerSourceOpRelayExist, ClassScheduler, ScopeInternal, LevelHigh, "source with name %s need to operate has existing relay workers %s", "Please `stop-relay` first.") - ErrSchedulerLatchInUse = New(codeSchedulerLatchInUse, ClassScheduler, ScopeInternal, LevelLow, "when %s, resource %s is in use by other client", "Please try again later") - ErrSchedulerSourceCfgUpdate = New(codeSchedulerSourceCfgUpdate, ClassScheduler, ScopeInternal, LevelLow, "source can only update relay-log related parts for now", "") - ErrSchedulerWrongWorkerInput = New(codeSchedulerWrongWorkerInput, ClassScheduler, ScopeInternal, LevelMedium, "require DM master to modify worker [%s] with source [%s], but currently the worker is bound to source [%s]", "") - ErrSchedulerBoundDiffWithStartedRelay = New(codeSchedulerCantTransferToRelayWorker, ClassScheduler, ScopeInternal, LevelMedium, "require DM worker [%s] to be bound to source [%s], but it has been started relay for source [%s]", "If you intend to bind the source with worker, you can stop-relay for current source.") - ErrSchedulerStartRelayOnSpecified = New(codeSchedulerStartRelayOnSpecified, ClassScheduler, ScopeInternal, LevelLow, "the source has `start-relay` 
with worker name for workers %v, so it can't `start-relay` without worker name now", "Please stop all relay workers first, or specify worker name for `start-relay`.") - ErrSchedulerStopRelayOnSpecified = New(codeSchedulerStopRelayOnSpecified, ClassScheduler, ScopeInternal, LevelLow, "the source has `start-relay` with worker name for workers %v, so it can't `stop-relay` without worker name now", "Please specify worker names for `stop-relay`.") - ErrSchedulerStartRelayOnBound = New(codeSchedulerStartRelayOnBound, ClassScheduler, ScopeInternal, LevelLow, "the source has `start-relay` automatically for bound worker, so it can't `start-relay` with worker name now", "Please stop relay by `stop-relay` without worker name first.") - ErrSchedulerStopRelayOnBound = New(codeSchedulerStopRelayOnBound, ClassScheduler, ScopeInternal, LevelLow, "the source has `start-relay` automatically for bound worker, so it can't `stop-relay` with worker name now", "Please use `stop-relay` without worker name.") + ErrSchedulerNotStarted = New(codeSchedulerNotStarted, ClassScheduler, ScopeInternal, LevelHigh, "the scheduler has not started", "") + ErrSchedulerStarted = New(codeSchedulerStarted, ClassScheduler, ScopeInternal, LevelMedium, "the scheduler has already started", "") + ErrSchedulerWorkerExist = New(codeSchedulerWorkerExist, ClassScheduler, ScopeInternal, LevelMedium, "dm-worker with name %s already exists", "") + ErrSchedulerWorkerNotExist = New(codeSchedulerWorkerNotExist, ClassScheduler, ScopeInternal, LevelMedium, "dm-worker with name %s not exists", "") + ErrSchedulerWorkerOnline = New(codeSchedulerWorkerOnline, ClassScheduler, ScopeInternal, LevelMedium, "dm-worker with name %s is still online", "Please shut it down first.") + ErrSchedulerWorkerInvalidTrans = New(codeSchedulerWorkerInvalidTrans, ClassScheduler, ScopeInternal, LevelMedium, "invalid stage transformation for dm-worker %s, from %s to %s", "") + ErrSchedulerSourceCfgExist = New(codeSchedulerSourceCfgExist, 
ClassScheduler, ScopeInternal, LevelMedium, "source config with ID %s already exists", "") + ErrSchedulerSourceCfgNotExist = New(codeSchedulerSourceCfgNotExist, ClassScheduler, ScopeInternal, LevelMedium, "source config with ID %s not exists", "") + ErrSchedulerSourcesUnbound = New(codeSchedulerSourcesUnbound, ClassDMMaster, ScopeInternal, LevelMedium, "sources %v have not bound", "") + ErrSchedulerSourceOpTaskExist = New(codeSchedulerSourceOpTaskExist, ClassDMMaster, ScopeInternal, LevelMedium, "source with name %s need to operate has existing tasks %v", "Please `stop-task` first.") + ErrSchedulerRelayStageInvalidUpdate = New(codeSchedulerRelayStageInvalidUpdate, ClassScheduler, ScopeInternal, LevelMedium, "invalid new expectant relay stage %s", "") + ErrSchedulerRelayStageSourceNotExist = New(codeSchedulerRelayStageSourceNotExist, ClassScheduler, ScopeInternal, LevelMedium, "sources %v need to update expectant relay stage not exist", "") + ErrSchedulerMultiTask = New(codeSchedulerMultiTask, ClassScheduler, ScopeInternal, LevelMedium, "the scheduler cannot perform multiple different tasks %v in one operation", "") + ErrSchedulerSubTaskExist = New(codeSchedulerSubTaskExist, ClassScheduler, ScopeInternal, LevelMedium, "subtasks with name %s for sources %v already exist", "Please use `query-status` command to see tasks.") + ErrSchedulerSubTaskStageInvalidUpdate = New(codeSchedulerSubTaskStageInvalidUpdate, ClassDMMaster, ScopeInternal, LevelMedium, "invalid new expectant subtask stage %s", "") + ErrSchedulerSubTaskOpTaskNotExist = New(codeSchedulerSubTaskOpTaskNotExist, ClassDMMaster, ScopeInternal, LevelMedium, "subtasks with name %s need to be operate not exist", "Please use `query-status` command to see tasks.") + ErrSchedulerSubTaskOpSourceNotExist = New(codeSchedulerSubTaskOpSourceNotExist, ClassDMMaster, ScopeInternal, LevelMedium, "sources %v need to be operate not exist", "") + ErrSchedulerTaskNotExist = New(codeSchedulerTaskNotExist, ClassScheduler, 
ScopeInternal, LevelMedium, "task with name %s not exist", "Please use `query-status` command to see tasks.") + ErrSchedulerRequireRunningTaskInSyncUnit = New(codeSchedulerRequireRunningTaskInSyncUnit, ClassScheduler, ScopeInternal, LevelHigh, "running tasks %v to be transferred on source %s should in sync unit", "Please use `pause-task [-s source ...] task` to pause them first.") + ErrSchedulerRelayWorkersBusy = New(codeSchedulerRelayWorkersBusy, ClassScheduler, ScopeInternal, LevelHigh, "these workers %s have started relay for sources %s respectively", "Please use `stop-relay` to stop them, or change your topology.") + ErrSchedulerRelayWorkersWrongBound = New(codeSchedulerRelayWorkersBound, ClassScheduler, ScopeInternal, LevelHigh, "these workers %s have bound for another sources %s respectively", "Please `start-relay` on free or same source workers.") + ErrSchedulerRelayWorkersWrongRelay = New(codeSchedulerRelayWorkersWrongRelay, ClassScheduler, ScopeInternal, LevelHigh, "these workers %s have started relay for another sources %s respectively", "Please correct sources in `stop-relay`.") + ErrSchedulerSourceOpRelayExist = New(codeSchedulerSourceOpRelayExist, ClassScheduler, ScopeInternal, LevelHigh, "source with name %s need to operate has existing relay workers %s", "Please `stop-relay` first.") + ErrSchedulerLatchInUse = New(codeSchedulerLatchInUse, ClassScheduler, ScopeInternal, LevelLow, "when %s, resource %s is in use by other client", "Please try again later") + ErrSchedulerSourceCfgUpdate = New(codeSchedulerSourceCfgUpdate, ClassScheduler, ScopeInternal, LevelLow, "source can only update relay-log related parts for now", "") + ErrSchedulerWrongWorkerInput = New(codeSchedulerWrongWorkerInput, ClassScheduler, ScopeInternal, LevelMedium, "require DM master to modify worker [%s] with source [%s], but currently the worker is bound to source [%s]", "") + ErrSchedulerBoundDiffWithStartedRelay = New(codeSchedulerCantTransferToRelayWorker, ClassScheduler, 
ScopeInternal, LevelMedium, "require DM worker [%s] to be bound to source [%s], but it has been started relay for source [%s]", "If you intend to bind the source with worker, you can stop-relay for current source.") + ErrSchedulerStartRelayOnSpecified = New(codeSchedulerStartRelayOnSpecified, ClassScheduler, ScopeInternal, LevelLow, "the source has `start-relay` with worker name for workers %v, so it can't `start-relay` without worker name now", "Please stop all relay workers first, or specify worker name for `start-relay`.") + ErrSchedulerStopRelayOnSpecified = New(codeSchedulerStopRelayOnSpecified, ClassScheduler, ScopeInternal, LevelLow, "the source has `start-relay` with worker name for workers %v, so it can't `stop-relay` without worker name now", "Please specify worker names for `stop-relay`.") + ErrSchedulerStartRelayOnBound = New(codeSchedulerStartRelayOnBound, ClassScheduler, ScopeInternal, LevelLow, "the source has `start-relay` automatically for bound worker, so it can't `start-relay` with worker name now", "Please stop relay by `stop-relay` without worker name first.") + ErrSchedulerStopRelayOnBound = New(codeSchedulerStopRelayOnBound, ClassScheduler, ScopeInternal, LevelLow, "the source has `start-relay` automatically for bound worker, so it can't `stop-relay` with worker name now", "Please use `stop-relay` without worker name.") + ErrSchedulerPauseTaskForTransferSource = New(codeSchedulerPauseTaskForTransferSource, ClassScheduler, ScopeInternal, LevelLow, "failed to auto pause tasks %s when transfer-source", "Please pause task by `dmctl pause-task`.") // dmctl. 
ErrCtlGRPCCreateConn = New(codeCtlGRPCCreateConn, ClassDMCtl, ScopeInternal, LevelHigh, "can not create grpc connection", "Please check your network connection.") diff --git a/dm/tests/ha/run.sh b/dm/tests/ha/run.sh index dc615b39bb4..ca776c05ed5 100755 --- a/dm/tests/ha/run.sh +++ b/dm/tests/ha/run.sh @@ -64,7 +64,6 @@ function run() { echo "use sync_diff_inspector to check increment data" check_sync_diff $WORK_DIR $cur/conf/diff_config.toml - sleep 2 echo "pause task before kill and restart dm-worker" run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \ @@ -106,21 +105,11 @@ function run() { # manually transfer a exist source to a newly started worker run_dm_worker $WORK_DIR/worker3 $WORKER3_PORT $cur/conf/dm-worker3.toml - - # pause task first check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER3_PORT + run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \ - "transfer-source $SOURCE_ID1 worker3" \ - "tasks \[test\] on source $SOURCE_ID1 should not be running" 1 - run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \ - "pause-task -s $SOURCE_ID1 test" \ - "\"result\": true" 2 - run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "transfer-source $SOURCE_ID1 worker3" \ "\"result\": true" 1 - run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \ - "resume-task -s $SOURCE_ID1 test" \ - "\"result\": true" 2 run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "list-member --name worker3" \ diff --git a/dm/tests/openapi/client/openapi_task_check b/dm/tests/openapi/client/openapi_task_check index ac763a8aa36..d6548dc94fe 100755 --- a/dm/tests/openapi/client/openapi_task_check +++ b/dm/tests/openapi/client/openapi_task_check @@ -179,7 +179,7 @@ def get_task_status_success(task_name, total): resp = requests.get(url=url) data = resp.json() assert resp.status_code == 200 - print("get_task_status_failed resp=", data) + print("get_task_status_success resp=", data) assert data["total"] == int(total) @@ -200,6 +200,8 @@ def pause_task_success(task_name, source_name): 
source_name, ], ) + if resp.status_code != 200: + print("pause_task_failed resp=", resp.json()) assert resp.status_code == 200 @@ -211,6 +213,8 @@ def resume_task_success(task_name, source_name): source_name, ], ) + if resp.status_code != 200: + print("resume_task_failed resp=", resp.json()) assert resp.status_code == 200 diff --git a/dm/tests/openapi/run.sh b/dm/tests/openapi/run.sh index 86b9ca48856..1f0a4eb3600 100644 --- a/dm/tests/openapi/run.sh +++ b/dm/tests/openapi/run.sh @@ -265,7 +265,7 @@ function test_noshard_task() { # get task status success openapi_task_check "get_task_status_success" "$task_name" 2 - # delte source with force + # delete source with force openapi_source_check "delete_source_with_force_success" "mysql-01" # after delete source-1, there is only one subtask status From be78431013772b41deab32ca9c1675b7d9cf1d80 Mon Sep 17 00:00:00 2001 From: docsir <73268456+docsir@users.noreply.github.com> Date: Thu, 30 Dec 2021 09:09:50 +0800 Subject: [PATCH 3/5] test(dm): add retry to avoid query-status test failed (#4144) close pingcap/tiflow#4134 --- dm/tests/many_tables/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dm/tests/many_tables/run.sh b/dm/tests/many_tables/run.sh index 2fb71f2e993..9f2a018bb29 100644 --- a/dm/tests/many_tables/run.sh +++ b/dm/tests/many_tables/run.sh @@ -42,7 +42,7 @@ function run() { dmctl_operate_source create $WORK_DIR/source1.yaml $SOURCE_ID1 dmctl_start_task_standalone - run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \ + run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \ "query-status test" \ "\"totalTables\": \"500\"" 1 \ "\"completedTables\"" 1 \ From 1d376cd565b45fe5fff7960bbe1a6fa30b1be3ec Mon Sep 17 00:00:00 2001 From: Ehco Date: Thu, 30 Dec 2021 12:01:50 +0800 Subject: [PATCH 4/5] openapi(dm): add openapi conifg in dm-master config file (#3836) ref pingcap/tiflow#3583 --- dm/dm/master/config.go | 9 +++++++- dm/dm/master/config_test.go | 25 ++++++++++++++++++++++ 
dm/dm/master/dm-master.toml | 3 +-- dm/dm/master/openapi_test.go | 6 +++--- dm/dm/master/server.go | 2 +- dm/tests/dmctl_basic/conf/get_master1.toml | 2 +- dm/tests/openapi/conf/dm-master1.toml | 2 -- dm/tests/openapi/conf/dm-master2.toml | 2 -- 8 files changed, 39 insertions(+), 12 deletions(-) diff --git a/dm/dm/master/config.go b/dm/dm/master/config.go index e595906a986..9dbac15a4b2 100644 --- a/dm/dm/master/config.go +++ b/dm/dm/master/config.go @@ -60,6 +60,7 @@ func NewConfig() *Config { fs.BoolVar(&cfg.printVersion, "V", false, "prints version and exit") fs.BoolVar(&cfg.printSampleConfig, "print-sample-config", false, "print sample config file of dm-worker") + fs.BoolVar(&cfg.OpenAPI, "openapi", false, "enable openapi") fs.StringVar(&cfg.ConfigFile, "config", "", "path to config file") fs.StringVar(&cfg.MasterAddr, "master-addr", "", "master API server and status addr") fs.StringVar(&cfg.AdvertiseAddr, "advertise-addr", "", `advertise address for client traffic (default "${master-addr}")`) @@ -91,7 +92,7 @@ func NewConfig() *Config { } type ExperimentalFeatures struct { - OpenAPI bool `toml:"openapi"` + OpenAPI bool `toml:"openapi,omitempty"` // OpenAPI is available in v5.4 as default. } // Config is the configuration for dm-master. @@ -128,6 +129,7 @@ type Config struct { AutoCompactionMode string `toml:"auto-compaction-mode" json:"auto-compaction-mode"` AutoCompactionRetention string `toml:"auto-compaction-retention" json:"auto-compaction-retention"` QuotaBackendBytes int64 `toml:"quota-backend-bytes" json:"quota-backend-bytes"` + OpenAPI bool `toml:"openapi" json:"openapi"` // directory path used to store source config files when upgrading from v1.0.x. // if this path set, DM-master leader will try to upgrade from v1.0.x to the current version. 
@@ -313,6 +315,11 @@ func (c *Config) adjust() error { c.QuotaBackendBytes = quotaBackendBytesLowerBound } + if c.ExperimentalFeatures.OpenAPI { + c.OpenAPI = true + c.ExperimentalFeatures.OpenAPI = false + log.L().Warn("openapi is a GA feature and removed from experimental features, so this configuration may have no affect in feature release, please set openapi=true in dm-master config file") + } return err } diff --git a/dm/dm/master/config_test.go b/dm/dm/master/config_test.go index a1486644dd7..7b1f1531e3e 100644 --- a/dm/dm/master/config_test.go +++ b/dm/dm/master/config_test.go @@ -115,6 +115,7 @@ func (t *testConfigSuite) TestConfig(c *check.C) { c.Assert(cfg.Join, check.Equals, "") c.Assert(cfg.String(), check.Matches, fmt.Sprintf("{.*master-addr\":\"%s\".*}", masterAddr)) c.Assert(cfg.ExperimentalFeatures.OpenAPI, check.Equals, false) + c.Assert(cfg.OpenAPI, check.Equals, false) } } } @@ -299,3 +300,27 @@ func (t *testConfigSuite) TestAdjustAddr(c *check.C) { c.Assert(cfg.adjust(), check.IsNil) c.Assert(cfg.AdvertiseAddr, check.Equals, cfg.MasterAddr) } + +func (t *testConfigSuite) TestAdjustOpenAPI(c *check.C) { + cfg := NewConfig() + c.Assert(cfg.configFromFile(defaultConfigFile), check.IsNil) + c.Assert(cfg.adjust(), check.IsNil) + + // test default value + c.Assert(cfg.OpenAPI, check.Equals, false) + c.Assert(cfg.ExperimentalFeatures.OpenAPI, check.Equals, false) + + // adjust openapi from experimental-features + cfg.ExperimentalFeatures.OpenAPI = true + c.Assert(cfg.adjust(), check.IsNil) + c.Assert(cfg.OpenAPI, check.Equals, true) + c.Assert(cfg.ExperimentalFeatures.OpenAPI, check.Equals, false) + + // test from flags + c.Assert(cfg.Parse([]string{"--openapi=false", "--master-addr=127.0.0.1:8261"}), check.IsNil) + c.Assert(cfg.adjust(), check.IsNil) + c.Assert(cfg.OpenAPI, check.Equals, false) + c.Assert(cfg.Parse([]string{"--openapi=true", "--master-addr=127.0.0.1:8261"}), check.IsNil) + c.Assert(cfg.adjust(), check.IsNil) + c.Assert(cfg.OpenAPI, 
check.Equals, true) +} diff --git a/dm/dm/master/dm-master.toml b/dm/dm/master/dm-master.toml index a078e070cce..20aeca72017 100644 --- a/dm/dm/master/dm-master.toml +++ b/dm/dm/master/dm-master.toml @@ -42,6 +42,5 @@ rpc-timeout = "30s" rpc-rate-burst = 40 rpc-rate-limit = 10.0 -# some experimental features -[experimental] +# openapi feature openapi = false diff --git a/dm/dm/master/openapi_test.go b/dm/dm/master/openapi_test.go index a139c43d9c0..e929927e7cf 100644 --- a/dm/dm/master/openapi_test.go +++ b/dm/dm/master/openapi_test.go @@ -88,7 +88,7 @@ func (t *openAPISuite) TestRedirectRequestToLeader(c *check.C) { cfg1.PeerUrls = tempurl.Alloc() cfg1.AdvertisePeerUrls = cfg1.PeerUrls cfg1.InitialCluster = fmt.Sprintf("%s=%s", cfg1.Name, cfg1.AdvertisePeerUrls) - cfg1.ExperimentalFeatures.OpenAPI = true + cfg1.OpenAPI = true s1 := NewServer(cfg1) c.Assert(s1.Start(ctx), check.IsNil) @@ -108,7 +108,7 @@ func (t *openAPISuite) TestRedirectRequestToLeader(c *check.C) { cfg2.PeerUrls = tempurl.Alloc() cfg2.AdvertisePeerUrls = cfg2.PeerUrls cfg2.Join = cfg1.MasterAddr // join to an existing cluster - cfg2.ExperimentalFeatures.OpenAPI = true + cfg2.OpenAPI = true s2 := NewServer(cfg2) c.Assert(s2.Start(ctx), check.IsNil) @@ -778,7 +778,7 @@ func setupServer(ctx context.Context, c *check.C) *Server { cfg1.AdvertisePeerUrls = cfg1.PeerUrls cfg1.AdvertiseAddr = cfg1.MasterAddr cfg1.InitialCluster = fmt.Sprintf("%s=%s", cfg1.Name, cfg1.AdvertisePeerUrls) - cfg1.ExperimentalFeatures.OpenAPI = true + cfg1.OpenAPI = true s1 := NewServer(cfg1) c.Assert(s1.Start(ctx), check.IsNil) diff --git a/dm/dm/master/server.go b/dm/dm/master/server.go index a8fc484a984..c70a0a55ef2 100644 --- a/dm/dm/master/server.go +++ b/dm/dm/master/server.go @@ -191,7 +191,7 @@ func (s *Server) Start(ctx context.Context) (err error) { "/status": getStatusHandle(), "/debug/": getDebugHandler(), } - if s.cfg.ExperimentalFeatures.OpenAPI { + if s.cfg.OpenAPI { if initOpenAPIErr := s.InitOpenAPIHandles(); 
initOpenAPIErr != nil { return terror.ErrOpenAPICommonError.Delegate(initOpenAPIErr) } diff --git a/dm/tests/dmctl_basic/conf/get_master1.toml b/dm/tests/dmctl_basic/conf/get_master1.toml index 2a6f1029254..cfd9657fc58 100644 --- a/dm/tests/dmctl_basic/conf/get_master1.toml +++ b/dm/tests/dmctl_basic/conf/get_master1.toml @@ -20,10 +20,10 @@ max-request-bytes = 1572864 auto-compaction-mode = "periodic" auto-compaction-retention = "1h" quota-backend-bytes = 2147483648 +openapi = false v1-sources-path = "" ssl-ca = "" ssl-cert = "" ssl-key = "" [experimental] - openapi = false diff --git a/dm/tests/openapi/conf/dm-master1.toml b/dm/tests/openapi/conf/dm-master1.toml index 3ba09ab9dbc..972548cc304 100644 --- a/dm/tests/openapi/conf/dm-master1.toml +++ b/dm/tests/openapi/conf/dm-master1.toml @@ -4,6 +4,4 @@ initial-cluster = "master1=http://127.0.0.1:8291" master-addr = ":8261" name = "master1" peer-urls = "127.0.0.1:8291" - -[experimental] openapi = true diff --git a/dm/tests/openapi/conf/dm-master2.toml b/dm/tests/openapi/conf/dm-master2.toml index 9b6333daa09..adb14af1142 100644 --- a/dm/tests/openapi/conf/dm-master2.toml +++ b/dm/tests/openapi/conf/dm-master2.toml @@ -4,6 +4,4 @@ join = "127.0.0.1:8261" master-addr = ":8361" name = "master2" peer-urls = "127.0.0.1:8292" - -[experimental] openapi = true From 7ea445a1d9a6d88dbec1e046fe48ac7ccdb763c9 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Thu, 30 Dec 2021 13:01:50 +0800 Subject: [PATCH 5/5] metrics(ticdc): add db metrics (#4148) ref pingcap/tiflow#3227 --- metrics/grafana/ticdc.json | 6565 ++++++++++++++++++++++-------------- 1 file changed, 4051 insertions(+), 2514 deletions(-) diff --git a/metrics/grafana/ticdc.json b/metrics/grafana/ticdc.json index 60574864ae7..b686c173e93 100644 --- a/metrics/grafana/ticdc.json +++ b/metrics/grafana/ticdc.json @@ -125,7 +125,7 @@ "gnetId": null, "graphTooltip": 1, "id": null, - "iteration": 1640447119260, + "iteration": 1640792097028, "links": [], "panels": [ { @@ -4710,7 
+4710,7 @@ "x": 0, "y": 3 }, - "id": 13, + "id": 269, "panels": [ { "aliasColors": {}, @@ -4718,87 +4718,48 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The number of established Eventfeed RPC between TiCDC and TiKV", "fill": 1, - "fillGradient": 0, "gridPos": { - "h": 7, + "h": 6, "w": 8, "x": 0, "y": 5 }, - "hiddenSeries": false, - "id": 15, + "id": 271, "legend": { - "alignAsTable": true, "avg": false, - "current": true, - "hideEmpty": false, + "current": false, "max": false, "min": false, - "rightSide": true, "show": true, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, - "links": [], "nullPointMode": "null", - "options": { - "dataLinks": [] - }, - "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/.*-rpc/", - "yaxis": 2 - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(ticdc_kvclient_event_feed_count{tidb_cluster=\"$tidb_cluster\"}) by (instance)", + "expr": "sum(ticdc_sorter_on_disk_data_size_gauge{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}) by (capture)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{instance}}", + "legendFormat": "{{capture}}", "refId": "A" - }, - { - "expr": "sum(grpc_client_started_total{tidb_cluster=\"$tidb_cluster\", grpc_method=\"EventFeed\"}) by (instance) - sum(grpc_client_handled_total{tidb_cluster=\"$tidb_cluster\", grpc_method=\"EventFeed\"}) by (instance)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{instance}}-rpc", - "refId": "B" - }, - { - "expr": "sum(grpc_client_started_total{tidb_cluster=\"$tidb_cluster\", grpc_method=\"EventFeed\"}) by (instance)", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{instance}}-rpc-started", - "refId": "C" - }, - { - "expr": 
"sum(grpc_client_handled_total{tidb_cluster=\"$tidb_cluster\", grpc_method=\"EventFeed\"}) by (instance)", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{instance}}-rpc-handled", - "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Eventfeed count", + "title": "On disk data size", "tooltip": { "shared": true, "sort": 0, @@ -4814,11 +4775,11 @@ }, "yaxes": [ { - "format": "short", + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -4841,38 +4802,38 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "decimals": null, - "description": "Percentiles of Eventfeed message size", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { - "h": 7, + "h": 6, "w": 8, "x": 8, "y": 5 }, "hiddenSeries": false, - "id": 17, + "id": 273, "legend": { - "alignAsTable": true, "avg": false, - "current": true, + "current": false, "max": false, "min": false, - "rightSide": true, "show": true, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, - "paceLength": 10, "percentage": false, + "pluginVersion": "7.5.7", "pointradius": 2, "points": false, "renderer": "flot", @@ -4882,25 +4843,18 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.999, sum(rate(ticdc_kvclient_event_size_bytes_bucket{tidb_cluster=\"$tidb_cluster\"}[1m])) by (le, instance, type))", + "expr": "sum(ticdc_sorter_in_memory_data_size_gauge{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}) by (capture)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{instance}}-{{type}}-p999", + "legendFormat": "{{capture}}", "refId": "A" - }, - { - "expr": "histogram_quantile(0.95, 
sum(rate(ticdc_kvclient_event_size_bytes_bucket{tidb_cluster=\"$tidb_cluster\"}[1m])) by (le, instance, type))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{instance}}-{{type}}-p95", - "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Event size percentile", + "title": "In-memory data size", "tooltip": { "shared": true, "sort": 0, @@ -4920,7 +4874,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -4939,30 +4893,26 @@ }, { "aliasColors": {}, - "bars": true, + "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The number of errors that interrupt Eventfeed RPC", + "description": "The count of files of each levels", "fill": 1, "fillGradient": 0, "gridPos": { - "h": 7, + "h": 6, "w": 8, "x": 16, "y": 5 }, "hiddenSeries": false, - "id": 28, + "id": 272, "legend": { - "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": true, - "hideZero": true, "max": true, "min": false, - "rightSide": true, "show": true, "total": false, "values": true @@ -4971,9 +4921,6 @@ "linewidth": 1, "links": [], "nullPointMode": "null", - "options": { - "dataLinks": [] - }, "paceLength": 10, "percentage": false, "pointradius": 2, @@ -4985,37 +4932,18 @@ "steppedLine": false, "targets": [ { - "expr": "sum(increase(ticdc_kvclient_event_feed_error_count{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (type)", + "expr": "sum(ticdc_db_level_count{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}) by (capture, level)", "format": "time_series", - "hide": false, - "interval": "1m", "intervalFactor": 1, - "legendFormat": "{{type}}", + "legendFormat": "{{capture}}-{{level}}", "refId": "A" - }, - { - "expr": "-sum(increase(pd_schedule_operators_count{tidb_cluster=\"$tidb_cluster\", event=\"create\", type=~\".*leader\"}[1m]))", - "format": "time_series", - "hide": false, - "interval": "1m", - 
"intervalFactor": 1, - "legendFormat": "transfer-leader", - "refId": "B" - }, - { - "expr": "-sum(increase(pd_schedule_operators_count{tidb_cluster=\"$tidb_cluster\", event=\"create\", type=~\".*(peer|region)\"}[1m]))", - "format": "time_series", - "interval": "1m", - "intervalFactor": 1, - "legendFormat": "move-region", - "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Eventfeed error/m", + "title": "Level files", "tooltip": { "shared": true, "sort": 0, @@ -5052,29 +4980,114 @@ "alignLevel": null } }, + { + "cards": { + "cardPadding": 0, + "cardRound": 0 + }, + "color": { + "cardColor": "#FF9830", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The time of sorter write", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 11 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 274, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "links": [], + "maxPerRow": 3, + "repeatDirection": "h", + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ticdc_sorter_db_write_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le)", + "format": "heatmap", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Write duration", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 1, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, 
+ "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The number of KV client received events from TiKV per seconds", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { - "h": 7, + "h": 6, "w": 8, - "x": 0, - "y": 12 + "x": 8, + "y": 11 }, "hiddenSeries": false, - "id": 29, + "id": 288, "legend": { "alignAsTable": true, - "avg": false, + "avg": true, "current": true, - "hideEmpty": true, - "hideZero": true, "max": true, "min": false, "rightSide": false, @@ -5087,31 +5100,59 @@ "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, - "paceLength": 10, "percentage": false, + "pluginVersion": "7.5.7", "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "alias": "/.*amplification/", + "yaxis": 2 + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(ticdc_kvclient_pull_event_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\"}[1m])) by (instance, type)", + "exemplar": true, + "expr": "sum(rate(ticdc_sorter_db_write_bytes_sum{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (capture)", "format": "time_series", + "hide": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{instance}}-{{type}}", + "legendFormat": "{{capture}}-sorter", "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(rate(ticdc_sorter_db_write_bytes_total{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (capture) / sum(rate(ticdc_sorter_db_write_bytes_sum{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (capture)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + 
"legendFormat": "{{capture}}-amplification", + "refId": "B" + }, + { + "exemplar": true, + "expr": "sum(rate(ticdc_sorter_db_write_bytes_total{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (capture)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{capture}}-disk", + "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "KV client receive events/s", + "title": "Write speed", "tooltip": { "shared": true, "sort": 0, @@ -5127,17 +5168,17 @@ }, "yaxes": [ { - "format": "short", + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { - "format": "short", + "format": "percentunit", "label": null, - "logBase": 1, + "logBase": 10, "max": null, "min": null, "show": true @@ -5154,21 +5195,22 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The number of events that puller outputs to sorter \n per second", + "description": "CPU usage of LevelDB sorter", "fill": 1, "fillGradient": 0, "gridPos": { - "h": 7, + "h": 6, "w": 8, - "x": 8, - "y": 12 + "x": 16, + "y": 11 }, "hiddenSeries": false, - "id": 5, + "id": 286, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, "max": true, "min": false, "rightSide": false, @@ -5188,24 +5230,36 @@ "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "alias": "/.*sorter-[0-9]+/", + "yaxis": 2 + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum (rate(ticdc_puller_txn_collect_event_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\"}[1m])) by (instance, type)", + "expr": "sum(rate(ticdc_actor_worker_cpu_seconds_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$capture\", name=~\"sorter|cleaner|compactor\"}[1m])) by (name, instance)", "format": "time_series", "intervalFactor": 1, 
- "legendFormat": "{{instance}}-{{type}}", + "legendFormat": "{{instance}}-{{name}}", "refId": "A" + }, + { + "expr": "sum(rate(ticdc_actor_worker_cpu_seconds_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$capture\", name=\"sorter\"}[1m])) by (name, id, instance)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{name}}-{{id}}", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Puller output events/s", + "title": "CPU usage", "tooltip": { "shared": true, "sort": 0, @@ -5221,7 +5275,7 @@ }, "yaxes": [ { - "format": "none", + "format": "percentunit", "label": null, "logBase": 1, "max": null, @@ -5229,10 +5283,10 @@ "show": true }, { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, - "max": null, + "max": "1.2", "min": null, "show": true } @@ -5242,28 +5296,114 @@ "alignLevel": null } }, + { + "cards": { + "cardPadding": 0, + "cardRound": 0 + }, + "color": { + "cardColor": "#FF9830", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The time of sorter write", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 17 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 277, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "links": [], + "maxPerRow": 3, + "repeatDirection": "h", + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ticdc_sorter_db_write_bytes_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le)", + "format": "heatmap", + "instant": false, + "interval": "", + 
"intervalFactor": 2, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Write bytes", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 1, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "bytes", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The number of events that are buffered in Processor's output channel and Mounter input channel", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { - "h": 7, + "h": 6, "w": 8, - "x": 16, - "y": 12 + "x": 8, + "y": 17 }, "hiddenSeries": false, - "id": 107, + "id": 279, "legend": { "alignAsTable": true, - "avg": false, + "avg": true, "current": true, - "hideEmpty": false, "max": true, "min": false, "rightSide": false, @@ -5276,10 +5416,10 @@ "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, - "paceLength": 10, "percentage": false, + "pluginVersion": "7.5.7", "pointradius": 2, "points": false, "renderer": "flot", @@ -5289,25 +5429,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum(ticdc_mounter_input_chan_size{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}) by (capture)", + "exemplar": true, + "expr": "sum(rate(ticdc_sorter_db_write_bytes_count{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (capture)", "format": "time_series", + "hide": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{capture}}-mounter input chan", + "legendFormat": "{{capture}}-sorter", "refId": "A" - }, - { - "expr": "-sum(ticdc_sink_buffer_chan_size{tidb_cluster=\"$tidb_cluster\", 
changefeed=~\"$changefeed\",capture=~\"$capture\"}) by (capture)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{capture}}-sink buffer chan", - "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Sink/Mounter buffer size", + "title": "Write OPS", "tooltip": { "shared": true, "sort": 0, @@ -5323,17 +5459,17 @@ }, "yaxes": [ { - "format": "short", + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { - "format": "short", + "format": "percentunit", "label": null, - "logBase": 1, + "logBase": 10, "max": null, "min": null, "show": true @@ -5344,107 +5480,36 @@ "alignLevel": null } }, - { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "Duration of sorting unsorted events", - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 19 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 99, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "links": [], - "reverseYBuckets": false, - "targets": [ - { - "expr": "sum(rate(ticdc_puller_entry_sorter_sort_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le)", - "format": "heatmap", - "instant": false, - "intervalFactor": 2, - "legendFormat": "{{le}}", - "refId": "A" - } - ], - "title": "Entry sorter sort duration", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - 
"format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "Percentiles of sorting events duration", + "description": "The count and duration of write delay", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 19 + "h": 6, + "w": 8, + "x": 16, + "y": 17 }, "hiddenSeries": false, - "id": 53, + "id": 275, "legend": { - "alignAsTable": true, + "alignAsTable": false, "avg": false, "current": true, - "hideEmpty": false, "max": true, "min": false, - "rightSide": true, + "rightSide": false, "show": true, "total": false, "values": true @@ -5454,31 +5519,41 @@ "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "paceLength": 10, "percentage": false, + "pluginVersion": "7.5.7", "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "alias": "/.*count.*/", + "yaxis": 2 + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.999, sum(rate(ticdc_puller_entry_sorter_sort_bucket{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}[1m])) by (le,capture))", + "exemplar": true, + "expr": "sum(rate(ticdc_db_write_delay_seconds{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (capture)", "format": "time_series", + "interval": "", "intervalFactor": 1, - "legendFormat": "{{capture}}-p999", + "legendFormat": "{{capture}}-duration", "refId": "A" }, { - "expr": "histogram_quantile(0.95, sum(rate(ticdc_puller_entry_sorter_sort_bucket{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}[1m])) by (le,capture))", + 
"exemplar": true, + "expr": "sum(rate(ticdc_db_write_delay_total{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (capture)", "format": "time_series", "hide": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{capture}}-p95", + "legendFormat": "{{capture}}-count", "refId": "B" } ], @@ -5486,7 +5561,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Entry sorter sort duration percentile", + "title": "Write delay", "tooltip": { "shared": true, "sort": 0, @@ -5502,9 +5577,9 @@ }, "yaxes": [ { - "format": "s", + "format": "dtdurations", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true @@ -5538,17 +5613,21 @@ }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "Duration of merging sorted events", - "gridPos": { - "h": 7, - "w": 12, + "description": "The time of sorter iterator read", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, "x": 0, - "y": 26 + "y": 23 }, "heatmap": {}, "hideZeroBuckets": true, "highlightCards": true, - "id": 105, + "id": 278, "legend": { "alignAsTable": true, "avg": false, @@ -5563,18 +5642,22 @@ "values": true }, "links": [], + "maxPerRow": 3, + "repeatDirection": "h", "reverseYBuckets": false, "targets": [ { - "expr": "sum(rate(ticdc_puller_entry_sorter_merge_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le)", + "exemplar": true, + "expr": "sum(rate(ticdc_sorter_db_iter_read_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\", call=\"first\"}[1m])) by (le)", "format": "heatmap", "instant": false, + "interval": "", "intervalFactor": 2, "legendFormat": "{{le}}", "refId": "A" } ], - "title": "Entry sorter merge duration", + "title": "Read duration - First", "tooltip": { "show": true, "showHistogram": true @@ -5600,107 +5683,88 @@ "yBucketSize": null }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + 
"cards": { + "cardPadding": 0, + "cardRound": 0 + }, + "color": { + "cardColor": "#FF9830", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "Percentiles of merging sorted events duration", - "fill": 1, - "fillGradient": 0, + "description": "The time of sorter iterator read", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 26 + "h": 6, + "w": 8, + "x": 8, + "y": 23 }, - "hiddenSeries": false, - "id": 106, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 282, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, "max": true, "min": false, "rightSide": true, "show": true, + "sort": "current", + "sortDesc": true, "total": false, "values": true }, - "lines": true, - "linewidth": 1, "links": [], - "nullPointMode": "null", - "options": { - "dataLinks": [] - }, - "paceLength": 10, - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "maxPerRow": 3, + "repeatDirection": "h", + "reverseYBuckets": false, "targets": [ { - "expr": "histogram_quantile(0.999, sum(rate(ticdc_puller_entry_sorter_merge_bucket{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}[1m])) by (le,capture))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{capture}}-p999", + "exemplar": true, + "expr": "sum(rate(ticdc_sorter_db_iter_read_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\", call=\"next\"}[1m])) by (le)", + "format": "heatmap", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", "refId": "A" - }, - { - "expr": "histogram_quantile(0.95, 
sum(rate(ticdc_puller_entry_sorter_merge_bucket{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}[1m])) by (le,capture))", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{capture}}-p95", - "refId": "B" } ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Entry sorter merge duration percentile", + "title": "Read duration - Next", "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" + "show": true, + "showHistogram": true }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, + "tooltipDecimals": 1, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "max": null, + "min": null, "show": true, - "values": [] + "splitFactor": null }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 2, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null }, { "cards": { @@ -5717,17 +5781,21 @@ }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "Duration of unmarshal events from kv to SQL row", + "description": "The time of sorter iterator read", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 33 + "h": 6, + "w": 8, + "x": 16, + "y": 23 }, "heatmap": {}, "hideZeroBuckets": true, "highlightCards": true, - "id": 101, + "id": 281, "legend": { "alignAsTable": true, "avg": false, @@ -5742,18 +5810,22 @@ "values": true }, "links": [], + "maxPerRow": 3, + "repeatDirection": "h", "reverseYBuckets": false, "targets": [ { - "expr": 
"max(rate(ticdc_mounter_unmarshal_and_mount_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le)", + "exemplar": true, + "expr": "sum(rate(ticdc_sorter_db_iter_read_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\", call=\"release\"}[1m])) by (le)", "format": "heatmap", "instant": false, + "interval": "", "intervalFactor": 2, "legendFormat": "{{le}}", "refId": "A" } ], - "title": "Mounter unmarshal duration", + "title": "Read duration - Release", "tooltip": { "show": true, "showHistogram": true @@ -5784,27 +5856,28 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "Percentiles of unmarshal events from kv to SQL row duration", - "fill": 0, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, "fillGradient": 0, "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 33 + "h": 6, + "w": 8, + "x": 0, + "y": 29 }, "hiddenSeries": false, - "id": 55, + "id": 280, "legend": { "alignAsTable": true, - "avg": false, + "avg": true, "current": true, "max": true, "min": false, - "rightSide": true, + "rightSide": false, "show": true, - "sort": "current", - "sortDesc": true, "total": false, "values": true }, @@ -5813,10 +5886,10 @@ "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, - "paceLength": 10, "percentage": false, + "pluginVersion": "7.5.7", "pointradius": 2, "points": false, "renderer": "flot", @@ -5826,31 +5899,24 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(ticdc_mounter_unmarshal_and_mount_bucket{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}[1m])) by (le, capture))", + "exemplar": true, + "expr": "sum(rate(ticdc_sorter_db_iter_read_duration_seconds_count{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\", call=\"first\"}[1m])) by (capture)", "format": "time_series", "hide": false, + "interval": "", "intervalFactor": 1, - "legendFormat": 
"{{capture}}-p99", + "legendFormat": "{{capture}}-sorter", "refId": "A" - }, - { - "expr": "histogram_quantile(0.999, sum(rate(ticdc_mounter_unmarshal_and_mount_bucket{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}[1m])) by (le, capture))", - "format": "time_series", - "hide": true, - "instant": false, - "intervalFactor": 1, - "legendFormat": "{{capture}}-p999", - "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Mounter unmarshal duration percentile", + "title": "Read OPS - First", "tooltip": { "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, "type": "graph", @@ -5863,17 +5929,17 @@ }, "yaxes": [ { - "format": "s", + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { - "format": "none", + "format": "percentunit", "label": null, - "logBase": 1, + "logBase": 10, "max": null, "min": null, "show": true @@ -5890,26 +5956,27 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The number of KV client dispatched event per second", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 40 + "h": 6, + "w": 8, + "x": 8, + "y": 29 }, "hiddenSeries": false, - "id": 31, + "id": 283, "legend": { "alignAsTable": true, - "avg": false, + "avg": true, "current": true, - "hideEmpty": true, - "hideZero": true, "max": true, "min": false, - "rightSide": true, + "rightSide": false, "show": true, "total": false, "values": true @@ -5919,43 +5986,34 @@ "links": [], "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, - "paceLength": 10, "percentage": false, + "pluginVersion": "7.5.7", "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/.*batch-resolved/", - "yaxis": 2 - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": 
false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(ticdc_kvclient_send_event_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\"}[1m])) by (capture, changefeed, type)", + "exemplar": true, + "expr": "sum(rate(ticdc_sorter_db_iter_read_duration_seconds_count{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\", call=\"next\"}[1m])) by (capture)", "format": "time_series", + "hide": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{capture}}-{{changefeed}}-{{type}}", + "legendFormat": "{{capture}}-sorter", "refId": "A" - }, - { - "expr": "sum(rate(ticdc_kvclient_batch_resolved_event_size_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}[1m])) by (capture, changefeed, table)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{capture}}-{{changefeed}}-batch-resolved", - "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "KV client dispatch events/s", + "title": "Read OPS - Next", "tooltip": { "shared": true, "sort": 0, @@ -5971,18 +6029,118 @@ }, "yaxes": [ { - "format": "short", + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { - "format": "short", + "format": "percentunit", + "label": null, + "logBase": 10, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 29 + }, + "hiddenSeries": false, + "id": 287, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + 
"nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.7", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ticdc_sorter_db_iter_read_duration_seconds_count{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\", call=\"release\"}[1m])) by (capture)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{capture}}-sorter", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Read OPS - Release", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", "label": null, "logBase": 1, "max": null, + "min": "0", + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 10, + "max": null, "min": null, "show": true } @@ -6007,17 +6165,21 @@ }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "The size of batch resolved ts message from TiKV", + "description": "The time of sorter compact", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 40 + "h": 6, + "w": 8, + "x": 0, + "y": 35 }, "heatmap": {}, "hideZeroBuckets": true, "highlightCards": true, - "id": 97, + "id": 285, "legend": { "alignAsTable": true, "avg": false, @@ -6032,18 +6194,22 @@ "values": true }, "links": [], + "maxPerRow": 3, + "repeatDirection": "h", "reverseYBuckets": false, "targets": [ { - "expr": "sum(rate(ticdc_kvclient_batch_resolved_event_size_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (le)", + "exemplar": true, + "expr": 
"sum(rate(ticdc_sorter_db_compact_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le)", "format": "heatmap", "instant": false, + "interval": "", "intervalFactor": 2, "legendFormat": "{{le}}", "refId": "A" } ], - "title": "KV client batch resolved size", + "title": "Compact duration", "tooltip": { "show": true, "showHistogram": true @@ -6057,7 +6223,7 @@ "xBucketSize": null, "yAxis": { "decimals": 1, - "format": "none", + "format": "s", "logBase": 1, "max": null, "min": null, @@ -6067,31 +6233,44 @@ "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null - }, + } + ], + "title": "DB", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 13, + "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The number of regions that are being scanned", + "description": "The number of established Eventfeed RPC between TiCDC and TiKV", "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, - "w": 12, + "w": 8, "x": 0, - "y": 47 + "y": 5 }, "hiddenSeries": false, - "id": 177, + "id": 15, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, + "hideEmpty": false, + "max": false, "min": false, "rightSide": true, "show": true, @@ -6110,24 +6289,52 @@ "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "alias": "/.*-rpc/", + "yaxis": 2 + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(ticdc_kvclient_region_token{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}) by (capture, changefeed, store)", + "expr": "sum(ticdc_kvclient_event_feed_count{tidb_cluster=\"$tidb_cluster\"}) by (instance)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{changefeed}}-{{capture}}-{{store}}", + 
"legendFormat": "{{instance}}", "refId": "A" + }, + { + "expr": "sum(grpc_client_started_total{tidb_cluster=\"$tidb_cluster\", grpc_method=\"EventFeed\"}) by (instance) - sum(grpc_client_handled_total{tidb_cluster=\"$tidb_cluster\", grpc_method=\"EventFeed\"}) by (instance)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}-rpc", + "refId": "B" + }, + { + "expr": "sum(grpc_client_started_total{tidb_cluster=\"$tidb_cluster\", grpc_method=\"EventFeed\"}) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}-rpc-started", + "refId": "C" + }, + { + "expr": "sum(grpc_client_handled_total{tidb_cluster=\"$tidb_cluster\", grpc_method=\"EventFeed\"}) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}-rpc-handled", + "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "KV client scanning regions", + "title": "Eventfeed count", "tooltip": { "shared": true, "sort": 0, @@ -6170,24 +6377,23 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "Active stream count of each gRPC connection", + "decimals": null, + "description": "Percentiles of Eventfeed message size", "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, - "w": 12, - "x": 12, - "y": 47 + "w": 8, + "x": 8, + "y": 5 }, "hiddenSeries": false, - "id": 188, + "id": 17, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, + "max": false, "min": false, "rightSide": true, "show": true, @@ -6212,18 +6418,25 @@ "steppedLine": false, "targets": [ { - "expr": "sum(ticdc_kvclient_grpc_stream_count{tidb_cluster=\"$tidb_cluster\"}) by (store)", + "expr": "histogram_quantile(0.999, sum(rate(ticdc_kvclient_event_size_bytes_bucket{tidb_cluster=\"$tidb_cluster\"}[1m])) by (le, instance, type))", "format": "time_series", 
"intervalFactor": 1, - "legendFormat": "{{store}}", + "legendFormat": "{{instance}}-{{type}}-p999", "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(ticdc_kvclient_event_size_bytes_bucket{tidb_cluster=\"$tidb_cluster\"}[1m])) by (le, instance, type))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{type}}-p95", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "KV client gRPC stream count", + "title": "Event size percentile", "tooltip": { "shared": true, "sort": 0, @@ -6239,7 +6452,7 @@ }, "yaxes": [ { - "format": "short", + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -6262,27 +6475,21 @@ }, { "aliasColors": {}, - "bars": false, + "bars": true, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The number of regions that have not connected to TiKV", - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, + "description": "The number of errors that interrupt Eventfeed RPC", "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, - "w": 12, - "x": 0, - "y": 54 + "w": 8, + "x": 16, + "y": 5 }, "hiddenSeries": false, - "id": 251, + "id": 28, "legend": { "alignAsTable": true, "avg": false, @@ -6301,7 +6508,7 @@ "links": [], "nullPointMode": "null", "options": { - "alertThreshold": true + "dataLinks": [] }, "paceLength": 10, "percentage": false, @@ -6314,29 +6521,46 @@ "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "sum(ticdc_kvclient_cached_region{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}) by (capture, changefeed, store)", + "expr": "sum(increase(ticdc_kvclient_event_feed_error_count{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (type)", "format": "time_series", - "interval": "", + "hide": false, + "interval": "1m", "intervalFactor": 1, - "legendFormat": "{{changefeed}}-{{capture}}-{{store}}", + "legendFormat": 
"{{type}}", "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "KV client cached regions", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", + }, + { + "expr": "-sum(increase(pd_schedule_operators_count{tidb_cluster=\"$tidb_cluster\", event=\"create\", type=~\".*leader\"}[1m]))", + "format": "time_series", + "hide": false, + "interval": "1m", + "intervalFactor": 1, + "legendFormat": "transfer-leader", + "refId": "B" + }, + { + "expr": "-sum(increase(pd_schedule_operators_count{tidb_cluster=\"$tidb_cluster\", event=\"create\", type=~\".*(peer|region)\"}[1m]))", + "format": "time_series", + "interval": "1m", + "intervalFactor": 1, + "legendFormat": "move-region", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Eventfeed error/m", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", "name": null, "show": true, "values": [] @@ -6370,23 +6594,17 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "Estimate the remaining time for a changefeed initialization (on a specific capture)", - "fieldConfig": { - "defaults": { - "unit": "s" - }, - "overrides": [] - }, + "description": "The number of KV client received events from TiKV per seconds", "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, - "w": 12, - "x": 12, - "y": 54 + "w": 8, + "x": 0, + "y": 12 }, "hiddenSeries": false, - "id": 252, + "id": 29, "legend": { "alignAsTable": true, "avg": false, @@ -6395,7 +6613,7 @@ "hideZero": true, "max": true, "min": false, - "rightSide": true, + "rightSide": false, "show": true, "total": false, "values": true @@ -6405,7 +6623,7 @@ "links": [], "nullPointMode": "null", "options": { - "alertThreshold": true + "dataLinks": 
[] }, "paceLength": 10, "percentage": false, @@ -6418,12 +6636,10 @@ "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "abs(sum(ticdc_kvclient_cached_region{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"} / deriv(ticdc_kvclient_cached_region{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}[1m])) by (capture, changefeed, store))", + "expr": "sum(rate(ticdc_kvclient_pull_event_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\"}[1m])) by (instance, type)", "format": "time_series", - "interval": "", "intervalFactor": 1, - "legendFormat": "{{changefeed}}-{{capture}}-{{store}}", + "legendFormat": "{{instance}}-{{type}}", "refId": "A" } ], @@ -6431,7 +6647,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Estimate remaining time for initialization", + "title": "KV client receive events/s", "tooltip": { "shared": true, "sort": 0, @@ -6447,9 +6663,9 @@ }, "yaxes": [ { - "format": "s", - "label": "", - "logBase": 2, + "format": "short", + "label": null, + "logBase": 1, "max": null, "min": null, "show": true @@ -6467,48 +6683,43 @@ "align": false, "alignLevel": null } - } - ], - "title": "Events", - "type": "row" - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 4 - }, - "id": 130, - "panels": [ + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", + "description": "The number of events that puller outputs to sorter \n per second", "fill": 1, + "fillGradient": 0, "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 5 + "h": 7, + "w": 8, + "x": 8, + "y": 12 }, - "id": 131, + "hiddenSeries": false, + "id": 5, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "max": true, "min": false, + "rightSide": false, "show": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, 
"links": [], "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, @@ -6519,10 +6730,10 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(ticdc_sorter_consume_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}[1m])) by (capture,changefeed)", + "expr": "sum (rate(ticdc_puller_txn_collect_event_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{capture}}-{{changefeed}}", + "legendFormat": "{{instance}}-{{type}}", "refId": "A" } ], @@ -6530,7 +6741,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Unified Sorter intake rate", + "title": "Puller output events/s", "tooltip": { "shared": true, "sort": 0, @@ -6546,7 +6757,7 @@ }, "yaxes": [ { - "format": "short", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -6573,27 +6784,37 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", + "description": "The number of events that are buffered in Processor's output channel and Mounter input channel", "fill": 1, + "fillGradient": 0, "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 5 + "h": 7, + "w": 8, + "x": 16, + "y": 12 }, - "id": 132, + "hiddenSeries": false, + "id": 107, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": false, + "max": true, "min": false, + "rightSide": false, "show": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, @@ -6604,18 +6825,25 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(ticdc_sorter_event_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", 
capture=~\"$capture\"}[1m])) by (capture,changefeed)", + "expr": "sum(ticdc_mounter_input_chan_size{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}) by (capture)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{capture}}-{{changefeed}}", + "legendFormat": "{{capture}}-mounter input chan", "refId": "A" + }, + { + "expr": "-sum(ticdc_sink_buffer_chan_size{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}) by (capture)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{capture}}-sink buffer chan", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Unified Sorter event output rate", + "title": "Sink/Mounter buffer size", "tooltip": { "shared": true, "sort": 0, @@ -6652,33 +6880,119 @@ "alignLevel": null } }, + { + "cards": { + "cardPadding": 0, + "cardRound": 0 + }, + "color": { + "cardColor": "#FF9830", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "Duration of sorting unsorted events", + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 19 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 99, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "links": [], + "reverseYBuckets": false, + "targets": [ + { + "expr": "sum(rate(ticdc_puller_entry_sorter_sort_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le)", + "format": "heatmap", + "instant": false, + "intervalFactor": 2, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Entry sorter sort duration", + "tooltip": { + "show": true, + "showHistogram": true + }, + 
"tooltipDecimals": 1, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", + "description": "Percentiles of sorting events duration", "fill": 1, + "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 0, - "y": 13 + "x": 12, + "y": 19 }, - "id": 133, + "hiddenSeries": false, + "id": 53, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": false, + "max": true, "min": false, + "rightSide": true, "show": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, @@ -6689,18 +7003,26 @@ "steppedLine": false, "targets": [ { - "expr": "sum(ticdc_sorter_on_disk_data_size_gauge{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}) by (capture)", + "expr": "histogram_quantile(0.999, sum(rate(ticdc_puller_entry_sorter_sort_bucket{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}[1m])) by (le,capture))", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{capture}}", + "legendFormat": "{{capture}}-p999", "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(ticdc_puller_entry_sorter_sort_bucket{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}[1m])) by (le,capture))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{capture}}-p95", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": 
null, - "title": "Unified Sorter on disk data size", + "title": "Entry sorter sort duration percentile", "tooltip": { "shared": true, "sort": 0, @@ -6716,11 +7038,11 @@ }, "yaxes": [ { - "format": "bytes", + "format": "s", "label": null, - "logBase": 1, + "logBase": 2, "max": null, - "min": "0", + "min": null, "show": true }, { @@ -6737,33 +7059,119 @@ "alignLevel": null } }, + { + "cards": { + "cardPadding": 0, + "cardRound": 0 + }, + "color": { + "cardColor": "#FF9830", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "Duration of merging sorted events", + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 26 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 105, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "links": [], + "reverseYBuckets": false, + "targets": [ + { + "expr": "sum(rate(ticdc_puller_entry_sorter_merge_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le)", + "format": "heatmap", + "instant": false, + "intervalFactor": 2, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Entry sorter merge duration", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 1, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", + "description": "Percentiles of merging sorted 
events duration", "fill": 1, + "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 13 + "y": 26 }, - "id": 134, + "hiddenSeries": false, + "id": 106, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": false, + "max": true, "min": false, + "rightSide": true, "show": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, @@ -6774,18 +7182,26 @@ "steppedLine": false, "targets": [ { - "expr": "sum(ticdc_sorter_in_memory_data_size_gauge{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}) by (capture)", + "expr": "histogram_quantile(0.999, sum(rate(ticdc_puller_entry_sorter_merge_bucket{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}[1m])) by (le,capture))", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{capture}}", + "legendFormat": "{{capture}}-p999", "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(ticdc_puller_entry_sorter_merge_bucket{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}[1m])) by (le,capture))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{capture}}-p95", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Unified Sorter in-memory data size", + "title": "Entry sorter merge duration percentile", "tooltip": { "shared": true, "sort": 0, @@ -6801,11 +7217,11 @@ }, "yaxes": [ { - "format": "bytes", + "format": "s", "label": null, - "logBase": 1, + "logBase": 2, "max": null, - "min": "0", + "min": null, "show": true }, { @@ -6824,51 +7240,61 @@ }, { "cards": { - "cardPadding": null, - "cardRound": null + "cardPadding": 0, + "cardRound": 0 }, "color": { - "cardColor": "#b4ff00", - 
"colorScale": "sqrt", - "colorScheme": "interpolateOranges", + "cardColor": "#FF9830", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", "exponent": 0.5, - "max": null, - "min": null, + "min": 0, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", + "description": "Duration of unmarshal events from kv to SQL row", "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 21 + "y": 33 }, "heatmap": {}, "hideZeroBuckets": true, "highlightCards": true, - "id": 135, + "id": 101, "legend": { - "show": false + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true }, "links": [], "reverseYBuckets": false, "targets": [ { - "expr": "sum(rate(ticdc_sorter_flush_count_histogram_bucket{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}[1m])) by (le)", + "expr": "max(rate(ticdc_mounter_unmarshal_and_mount_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le)", "format": "heatmap", - "intervalFactor": 1, + "instant": false, + "intervalFactor": 2, "legendFormat": "{{le}}", "refId": "A" } ], - "timeFrom": null, - "timeShift": null, - "title": "Unified Sorter flush sizes", + "title": "Mounter unmarshal duration", "tooltip": { "show": true, - "showHistogram": false + "showHistogram": true }, + "tooltipDecimals": 1, "type": "heatmap", "xAxis": { "show": true @@ -6876,153 +7302,91 @@ "xBucketNumber": null, "xBucketSize": null, "yAxis": { - "decimals": null, - "format": "none", + "decimals": 1, + "format": "s", "logBase": 1, "max": null, "min": null, "show": true, "splitFactor": null }, - "yBucketBound": "auto", + "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null }, { - "cards": { - "cardPadding": null, - "cardRound": null - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": 
"interpolateBlues", - "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, "datasource": "${DS_TEST-CLUSTER}", + "description": "Percentiles of unmarshal events from kv to SQL row duration", + "fill": 0, + "fillGradient": 0, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 21 + "y": 33 }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 136, + "hiddenSeries": false, + "id": 55, "legend": { - "show": false + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true }, + "lines": true, + "linewidth": 1, "links": [], - "reverseYBuckets": false, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "paceLength": 10, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, "targets": [ { - "expr": "sum(rate(ticdc_sorter_merge_count_histogram_bucket{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}[1m])) by (le)", - "format": "heatmap", + "expr": "histogram_quantile(0.99, sum(rate(ticdc_mounter_unmarshal_and_mount_bucket{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}[1m])) by (le, capture))", + "format": "time_series", + "hide": false, "intervalFactor": 1, - "legendFormat": "{{le}}", + "legendFormat": "{{capture}}-p99", "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Unified Sorter merge size", - "tooltip": { - "show": true, - "showHistogram": false - }, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": null, - "format": "none", - "logBase": 1, - "max": null, - "min": null, - "show": true, - 
"splitFactor": null - }, - "yBucketBound": "auto", - "yBucketNumber": null, - "yBucketSize": null - } - ], - "title": "Unified Sorter", - "type": "row" - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 5 - }, - "id": 266, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 16 - }, - "id": 289, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ + }, { - "expr": "etcd_debugging_mvcc_db_total_size_in_bytes{tidb_cluster=\"$tidb_cluster\", job=\"pd\"}", + "expr": "histogram_quantile(0.999, sum(rate(ticdc_mounter_unmarshal_and_mount_bucket{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}[1m])) by (le, capture))", "format": "time_series", - "hide": false, + "hide": true, + "instant": false, "intervalFactor": 1, - "legendFormat": "{{instance}}", - "refId": "A" + "legendFormat": "{{capture}}-p999", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Etcd MVCC DB total size", + "title": "Mounter unmarshal duration percentile", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -7035,7 +7399,7 @@ }, "yaxes": [ { - "format": "decbytes", + "format": "s", "label": null, "logBase": 1, "max": null, @@ -7043,12 +7407,12 @@ "show": true }, { - "format": "short", + "format": "none", "label": null, "logBase": 1, "max": null, "min": null, - 
"show": false + "show": true } ], "yaxis": { @@ -7062,21 +7426,23 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", + "description": "The number of KV client dispatched event per second", "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, "w": 12, - "x": 12, - "y": 16 + "x": 0, + "y": 40 }, "hiddenSeries": false, - "id": 114, + "id": 31, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, @@ -7096,38 +7462,36 @@ "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "alias": "/.*batch-resolved/", + "yaxis": 2 + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.999, sum(rate(ticdc_server_etcd_health_check_duration_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,instance))", + "expr": "sum(rate(ticdc_kvclient_send_event_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\"}[1m])) by (capture, changefeed, type)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "p999-{{instance}}", + "legendFormat": "{{capture}}-{{changefeed}}-{{type}}", "refId": "A" }, { - "expr": "histogram_quantile(0.99, sum(rate(ticdc_server_etcd_health_check_duration_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,instance))", + "expr": "sum(rate(ticdc_kvclient_batch_resolved_event_size_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}[1m])) by (capture, changefeed, table)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "p99-{{instance}}", + "legendFormat": "{{capture}}-{{changefeed}}-batch-resolved", "refId": "B" - }, - { - "expr": "histogram_quantile(0.95, sum(rate(ticdc_server_etcd_health_check_duration_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,instance))", - "format": "time_series", - 
"intervalFactor": 1, - "legendFormat": "p95-{{instance}}", - "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Etcd health check duration", + "title": "KV client dispatch events/s", "tooltip": { "shared": true, "sort": 0, @@ -7143,7 +7507,7 @@ }, "yaxes": [ { - "format": "s", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -7170,52 +7534,52 @@ "cardRound": 0 }, "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", + "cardColor": "#FF9830", + "colorScale": "linear", "colorScheme": "interpolateSpectral", "exponent": 0.5, - "max": null, - "min": 1, + "min": 0, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, + "description": "The size of batch resolved ts message from TiKV", "gridPos": { "h": 7, "w": 12, - "x": 0, - "y": 23 + "x": 12, + "y": 40 }, "heatmap": {}, "hideZeroBuckets": true, "highlightCards": true, - "id": 267, + "id": 97, "legend": { - "show": true + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true }, - "pluginVersion": "6.1.6", + "links": [], "reverseYBuckets": false, "targets": [ { - "exemplar": true, - "expr": "sum(rate(ticdc_etcd_worker_tick_reactor_duration_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le)", + "expr": "sum(rate(ticdc_kvclient_batch_resolved_event_size_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (le)", "format": "heatmap", "instant": false, - "interval": "", "intervalFactor": 2, "legendFormat": "{{le}}", - "queryType": "randomWalk", "refId": "A" } ], - "timeFrom": null, - "timeShift": null, - "title": "EtcdWorker exec etcd txn duration", + "title": "KV client batch resolved size", "tooltip": { "show": true, "showHistogram": true @@ 
-7229,7 +7593,7 @@ "xBucketSize": null, "yAxis": { "decimals": 1, - "format": "s", + "format": "none", "logBase": 1, "max": null, "min": null, @@ -7246,38 +7610,39 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, + "description": "The number of regions that are being scanned", "fill": 1, "fillGradient": 0, "gridPos": { "h": 7, "w": 12, - "x": 12, - "y": 23 + "x": 0, + "y": 47 }, "hiddenSeries": false, - "id": 264, + "id": 177, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, + "links": [], "nullPointMode": "null", "options": { - "alertThreshold": true + "dataLinks": [] }, + "paceLength": 10, "percentage": false, - "pluginVersion": "6.1.6", "pointradius": 2, "points": false, "renderer": "flot", @@ -7287,27 +7652,18 @@ "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(ticdc_etcd_worker_tick_reactor_duration_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,capture))", - "interval": "", - "legendFormat": "{{capture}}-95", - "queryType": "randomWalk", + "expr": "sum(ticdc_kvclient_region_token{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}) by (capture, changefeed, store)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{changefeed}}-{{capture}}-{{store}}", "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ticdc_etcd_worker_tick_reactor_duration_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,capture))", - "hide": false, - "interval": "", - "legendFormat": "{{capture}}-99", - "refId": "B" } ], "thresholds": [], "timeFrom": 
null, "timeRegions": [], "timeShift": null, - "title": "EtcdWorker tick reactor duration", + "title": "KV client scanning regions", "tooltip": { "shared": true, "sort": 0, @@ -7323,7 +7679,7 @@ }, "yaxes": [ { - "format": "s", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -7331,7 +7687,7 @@ "show": true }, { - "format": "none", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -7345,121 +7701,44 @@ } }, { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "max": null, - "min": 1, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, + "description": "Active stream count of each gRPC connection", + "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, "w": 12, - "x": 0, - "y": 30 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 256, - "legend": { - "show": true - }, - "pluginVersion": "6.1.6", - "reverseYBuckets": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ticdc_etcd_worker_etcd_txn_exec_duration_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le)", - "format": "heatmap", - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "EtcdWorker exec etcd txn duration", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": 
"upper", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "", - "fieldConfig": { - "defaults": { - "unit": "s" - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 30 + "x": 12, + "y": 47 }, "hiddenSeries": false, - "id": 258, + "id": 188, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, + "links": [], "nullPointMode": "null", "options": { - "alertThreshold": true + "dataLinks": [] }, + "paceLength": 10, "percentage": false, - "pluginVersion": "6.1.6", "pointradius": 2, "points": false, "renderer": "flot", @@ -7469,31 +7748,18 @@ "steppedLine": false, "targets": [ { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(ticdc_etcd_worker_etcd_txn_exec_duration_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,capture))", + "expr": "sum(ticdc_kvclient_grpc_stream_count{tidb_cluster=\"$tidb_cluster\"}) by (store)", "format": "time_series", - "interval": "", "intervalFactor": 1, - "legendFormat": "{{capture}}-p95", - "queryType": "randomWalk", + "legendFormat": "{{store}}", "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ticdc_etcd_worker_etcd_txn_exec_duration_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,capture))", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{capture}}-p99", - "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "EtcdWorker exec etcd txn duration percentile", + "title": "KV client gRPC stream 
count", "tooltip": { "shared": true, "sort": 0, @@ -7509,9 +7775,9 @@ }, "yaxes": [ { - "format": "s", + "format": "short", "label": null, - "logBase": 2, + "logBase": 1, "max": null, "min": null, "show": true @@ -7530,88 +7796,17 @@ "alignLevel": null } }, - { - "cards": { - "cardPadding": null, - "cardRound": null - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolatePurples", - "exponent": 0.5, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 37 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 254, - "legend": { - "show": true - }, - "pluginVersion": "6.1.6", - "reverseYBuckets": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ticdc_etcd_worker_etcd_txn_size_bytes_bucket{tidb_cluster=\"$tidb_cluster\"}[1m])) by (le)", - "format": "heatmap", - "instant": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{le}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "EtcdWorker txn size", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": null, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": null, - "format": "decbytes", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "auto", - "yBucketNumber": null, - "yBucketSize": null - }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "", + "description": "The number of regions that have not connected to TiKV", "fieldConfig": { - "defaults": {}, + "defaults": { + "links": [] + }, "overrides": [] }, "fill": 1, @@ -7619,28 +7814,33 @@ "gridPos": { "h": 7, "w": 12, - "x": 12, - 
"y": 37 + "x": 0, + "y": 54 }, "hiddenSeries": false, - "id": 260, + "id": 251, "legend": { + "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, "min": false, + "rightSide": true, "show": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, + "links": [], "nullPointMode": "null", "options": { "alertThreshold": true }, + "paceLength": 10, "percentage": false, - "pluginVersion": "6.1.6", "pointradius": 2, "points": false, "renderer": "flot", @@ -7651,26 +7851,19 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(ticdc_etcd_worker_etcd_txn_size_bytes_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,capture))", + "expr": "sum(ticdc_kvclient_cached_region{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}) by (capture, changefeed, store)", + "format": "time_series", "interval": "", - "legendFormat": "{{capture}}-p95", - "queryType": "randomWalk", + "intervalFactor": 1, + "legendFormat": "{{changefeed}}-{{capture}}-{{store}}", "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ticdc_etcd_worker_etcd_txn_size_bytes_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,capture))", - "hide": false, - "interval": "", - "legendFormat": "{{capture}}-p99", - "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "EtcdWorker txn size percentile", + "title": "KV client cached regions", "tooltip": { "shared": true, "sort": 0, @@ -7686,7 +7879,7 @@ }, "yaxes": [ { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -7713,18 +7906,23 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The time consumed of writing WAL into the persistent storage in .99", - "editable": true, - "error": 
false, + "description": "Estimate the remaining time for a changefeed initialization (on a specific capture)", + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, "fill": 1, - "grid": {}, + "fillGradient": 0, "gridPos": { "h": 7, "w": 12, - "x": 0, - "y": 44 + "x": 12, + "y": 54 }, - "id": 291, + "hiddenSeries": false, + "id": 252, "legend": { "alignAsTable": true, "avg": false, @@ -7735,7 +7933,6 @@ "min": false, "rightSide": true, "show": true, - "sideWidth": 300, "total": false, "values": true }, @@ -7743,9 +7940,12 @@ "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "paceLength": 10, "percentage": false, - "pointradius": 5, + "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], @@ -7754,25 +7954,24 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\"}[5m])) by (instance, le))", + "exemplar": true, + "expr": "abs(sum(ticdc_kvclient_cached_region{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"} / deriv(ticdc_kvclient_cached_region{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}[1m])) by (capture, changefeed, store))", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "metric": "", - "refId": "A", - "step": 4 + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{changefeed}}-{{capture}}-{{store}}", + "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Etcd 99% WAL fsync duration", + "title": "Estimate remaining time for initialization", "tooltip": { - "msResolution": false, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, "type": "graph", "xaxis": { @@ -7785,8 +7984,8 @@ "yaxes": [ { "format": "s", - "label": null, - "logBase": 1, + "label": "", + "logBase": 2, 
"max": null, "min": null, "show": true @@ -7804,44 +8003,50 @@ "align": false, "alignLevel": null } - }, + } + ], + "title": "Events", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 4 + }, + "id": 130, + "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The time consumed of handling etcd transactions in .99", - "editable": true, - "error": false, "fill": 1, - "grid": {}, "gridPos": { - "h": 7, + "h": 8, "w": 12, - "x": 12, - "y": 44 + "x": 0, + "y": 5 }, - "id": 290, + "id": 131, "legend": { - "alignAsTable": true, "avg": false, - "current": true, - "max": true, + "current": false, + "max": false, "min": false, - "rightSide": true, "show": true, - "sideWidth": 300, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", - "paceLength": 10, "percentage": false, - "pointradius": 5, + "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], @@ -7850,24 +8055,22 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(pd_txn_handle_txns_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\"}[5m])) by (instance, result, le))", + "expr": "sum(rate(ticdc_sorter_consume_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}[1m])) by (capture,changefeed)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{result}}", - "refId": "A", - "step": 4 + "intervalFactor": 1, + "legendFormat": "{{capture}}-{{changefeed}}", + "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Etcd 99% Handle transactions duration", + "title": "Unified Sorter intake rate", "tooltip": { - "msResolution": false, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, "type": "graph", "xaxis": { @@ -7879,7 
+8082,7 @@ }, "yaxes": [ { - "format": "s", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -7899,108 +8102,56 @@ "align": false, "alignLevel": null } - } - ], - "title": "Etcd", - "type": "row" - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 6 - }, - "id": 58, - "panels": [ + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "CPU usage of TiKV CDC component", "fill": 1, "gridPos": { - "h": 7, + "h": 8, "w": 12, - "x": 0, - "y": 7 + "x": 12, + "y": 5 }, - "id": 60, + "id": 132, "legend": { - "alignAsTable": true, "avg": false, - "current": true, - "max": true, + "current": false, + "max": false, "min": false, - "rightSide": true, "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", - "paceLength": 10, "percentage": false, - "pointradius": 5, + "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/.*workers/", - "transform": "negative-Y", - "yaxis": 2 - }, - { - "alias": "/.*tso/", - "fill": 0, - "pointradius": 1, - "points": true - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", name=~\"cdc_.*|cdc\"}[1m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-endpoint", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", name=~\"cdcwkr.*\"}[1m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-workers", - "refId": "B", - "step": 4 - }, - { - "expr": 
"sum(rate(tikv_thread_cpu_seconds_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", name=~\"tso\"}[1m])) by (instance)", + "expr": "sum(rate(ticdc_sorter_event_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}[1m])) by (capture,changefeed)", "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}}-tso", - "refId": "C", - "step": 4 + "intervalFactor": 1, + "legendFormat": "{{capture}}-{{changefeed}}", + "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "CDC CPU", + "title": "Unified Sorter event output rate", "tooltip": { "shared": true, "sort": 0, @@ -8016,7 +8167,7 @@ }, "yaxes": [ { - "format": "percentunit", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -8024,7 +8175,7 @@ "show": true }, { - "format": "percent", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -8039,80 +8190,54 @@ }, { "aliasColors": {}, - "bars": true, + "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "Outbound network traffic of TiKV CDC component", - "editable": true, - "error": false, - "fill": 0, - "grid": {}, + "fill": 1, "gridPos": { - "h": 7, + "h": 8, "w": 12, - "x": 12, - "y": 7 + "x": 0, + "y": 13 }, - "id": 74, + "id": 133, "legend": { - "alignAsTable": true, "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": false, - "max": true, + "current": false, + "max": false, "min": false, - "rightSide": true, "show": true, - "sideWidth": null, - "sort": "max", - "sortDesc": true, "total": false, - "values": true + "values": false }, - "lines": false, + "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", - "paceLength": 10, "percentage": false, - "pointradius": 5, + "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/.*resolved_ts/", - "bars": false, - "fill": 
1, - "lines": true, - "linewidth": 2, - "transform": "negative-Y", - "yaxis": 2 - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_cdc_grpc_message_sent_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[30s])) by (instance, type)", + "expr": "sum(ticdc_sorter_on_disk_data_size_gauge{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}) by (capture)", "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{type}}", - "refId": "A", - "step": 10 + "intervalFactor": 1, + "legendFormat": "{{capture}}", + "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "CDC network traffic", + "title": "Unified Sorter on disk data size", "tooltip": { - "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" @@ -8127,15 +8252,15 @@ }, "yaxes": [ { - "format": "Bps", + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -8145,7 +8270,7 @@ ], "yaxis": { "align": false, - "alignLevel": 0 + "alignLevel": null } }, { @@ -8154,38 +8279,29 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The count of different kinds of gRPC message", "fill": 1, "gridPos": { - "h": 7, + "h": 8, "w": 12, - "x": 0, - "y": 14 + "x": 12, + "y": 13 }, - "id": 147, + "id": 134, "legend": { - "alignAsTable": true, "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, + "current": false, + "max": false, "min": false, - "rightSide": true, "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, - "pointradius": 5, 
+ "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], @@ -8194,11 +8310,1824 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_grpc_msg_duration_seconds_count{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type!=\"kv_gc\"}[1m])) by (type)", + "expr": "sum(ticdc_sorter_in_memory_data_size_gauge{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}) by (capture)", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}}", - "metric": "tikv_grpc_msg_duration_seconds_bucket", + "intervalFactor": 1, + "legendFormat": "{{capture}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Unified Sorter in-memory data size", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 21 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 135, + "legend": { + "show": false + }, + "links": [], + "reverseYBuckets": false, + "targets": [ + { + "expr": "sum(rate(ticdc_sorter_flush_count_histogram_bucket{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}[1m])) by (le)", + "format": "heatmap", + 
"intervalFactor": 1, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Unified Sorter flush sizes", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": null, + "format": "none", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateBlues", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 21 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 136, + "legend": { + "show": false + }, + "links": [], + "reverseYBuckets": false, + "targets": [ + { + "expr": "sum(rate(ticdc_sorter_merge_count_histogram_bucket{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}[1m])) by (le)", + "format": "heatmap", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Unified Sorter merge size", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": null, + "format": "none", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + } + ], + "title": "Unified Sorter", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 266, + "panels": [ + { + "aliasColors": {}, + "bars": false, + 
"dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fill": 1, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 289, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "etcd_debugging_mvcc_db_total_size_in_bytes{tidb_cluster=\"$tidb_cluster\", job=\"pd\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Etcd MVCC DB total size", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 16 + }, + "hiddenSeries": false, + "id": 114, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": 
[], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "paceLength": 10, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.999, sum(rate(ticdc_server_etcd_health_check_duration_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,instance))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "p999-{{instance}}", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(ticdc_server_etcd_health_check_duration_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,instance))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "p99-{{instance}}", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(ticdc_server_etcd_health_check_duration_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,instance))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "p95-{{instance}}", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Etcd health check duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": 0, + "cardRound": 0 + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": 1, + "mode": "spectrum" + }, + 
"dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 23 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 267, + "legend": { + "show": true + }, + "pluginVersion": "6.1.6", + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ticdc_etcd_worker_tick_reactor_duration_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le)", + "format": "heatmap", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "EtcdWorker exec etcd txn duration", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 1, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 23 + }, + "hiddenSeries": false, + "id": 264, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "6.1.6", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + 
"targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.95, sum(rate(ticdc_etcd_worker_tick_reactor_duration_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,capture))", + "interval": "", + "legendFormat": "{{capture}}-95", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ticdc_etcd_worker_tick_reactor_duration_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,capture))", + "hide": false, + "interval": "", + "legendFormat": "{{capture}}-99", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "EtcdWorker tick reactor duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": 0, + "cardRound": 0 + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": 1, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 30 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 256, + "legend": { + "show": true + }, + "pluginVersion": "6.1.6", + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ticdc_etcd_worker_etcd_txn_exec_duration_bucket{tidb_cluster=\"$tidb_cluster\", 
capture=~\"$capture\"}[1m])) by (le)", + "format": "heatmap", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "EtcdWorker exec etcd txn duration", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 1, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 30 + }, + "hiddenSeries": false, + "id": 258, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "6.1.6", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.95, sum(rate(ticdc_etcd_worker_etcd_txn_exec_duration_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,capture))", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{capture}}-p95", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, 
sum(rate(ticdc_etcd_worker_etcd_txn_exec_duration_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,capture))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{capture}}-p99", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "EtcdWorker exec etcd txn duration percentile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 2, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolatePurples", + "exponent": 0.5, + "min": 0, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 37 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 254, + "legend": { + "show": true + }, + "pluginVersion": "6.1.6", + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ticdc_etcd_worker_etcd_txn_size_bytes_bucket{tidb_cluster=\"$tidb_cluster\"}[1m])) by (le)", + "format": "heatmap", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "EtcdWorker txn size", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": null, + "type": "heatmap", + "xAxis": { + 
"show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": null, + "format": "decbytes", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 37 + }, + "hiddenSeries": false, + "id": 260, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "6.1.6", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.95, sum(rate(ticdc_etcd_worker_etcd_txn_size_bytes_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,capture))", + "interval": "", + "legendFormat": "{{capture}}-p95", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ticdc_etcd_worker_etcd_txn_size_bytes_bucket{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}[1m])) by (le,capture))", + "hide": false, + "interval": "", + "legendFormat": "{{capture}}-p99", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "EtcdWorker txn size percentile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + 
}, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The time consumed of writing WAL into the persistent storage in .99", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 44 + }, + "id": 291, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "metric": "", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Etcd 99% WAL fsync duration", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + 
"min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The time consumed of handling etcd transactions in .99", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 44 + }, + "id": 290, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(pd_txn_handle_txns_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\"}[5m])) by (instance, result, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{result}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Etcd 99% Handle transactions duration", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "Etcd", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 6 + }, + "id": 58, 
+ "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "CPU usage of TiKV CDC component", + "fill": 1, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 60, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*workers/", + "transform": "negative-Y", + "yaxis": 2 + }, + { + "alias": "/.*tso/", + "fill": 0, + "pointradius": 1, + "points": true + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(tikv_thread_cpu_seconds_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", name=~\"cdc_.*|cdc\"}[1m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-endpoint", + "refId": "A", + "step": 4 + }, + { + "expr": "sum(rate(tikv_thread_cpu_seconds_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", name=~\"cdcwkr.*\"}[1m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-workers", + "refId": "B", + "step": 4 + }, + { + "expr": "sum(rate(tikv_thread_cpu_seconds_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", name=~\"tso\"}[1m])) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{instance}}-tso", + "refId": "C", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CDC CPU", + "tooltip": { + "shared": true, + "sort": 0, + 
"value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "Outbound network traffic of TiKV CDC component", + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 74, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*resolved_ts/", + "bars": false, + "fill": 1, + "lines": true, + "linewidth": 2, + "transform": "negative-Y", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(tikv_cdc_grpc_message_sent_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[30s])) by (instance, type)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{instance}}-{{type}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CDC network traffic", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + 
"value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "The count of different kinds of gRPC message", + "fill": 1, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 147, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(tikv_grpc_msg_duration_seconds_count{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type!=\"kv_gc\"}[1m])) by (type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{type}}", + "metric": "tikv_grpc_msg_duration_seconds_bucket", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "gRPC message count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + 
"max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "The memory usage per TiKV instance", + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 194, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*-cap-.*/", + "yaxis": 2 + }, + { + "alias": "/.*tikv.*/", + "pointradius": 1, + "points": true, + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(process_resident_memory_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", job=~\"tikv.*\"}) by (instance)", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "tikv-{{instance}}", + "refId": "A", + "step": 10 + }, + { + "expr": "avg(process_resident_memory_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", job=~\"cdc.*\"}) by (instance)", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "cdc-{{instance}}", + "refId": "B", + "step": 10 + }, + { + "expr": "(avg(process_resident_memory_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", job=~\"tikv.*\"}) by (instance)) - (avg(tikv_engine_block_cache_size_bytes{tidb_cluster=\"$tidb_cluster\", 
instance=~\"$tikv_instance\", db=\"kv\"}) by(instance))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "tikv-{{instance}}", + "refId": "C", + "step": 10 + }, + { + "expr": "sum(tikv_cdc_sink_memory_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "sink-{{instance}}", + "refId": "D", + "step": 10 + }, + { + "expr": "sum(tikv_cdc_old_value_cache_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "old-value-{{instance}}", + "refId": "E", + "step": 10 + }, + { + "expr": "sum(tikv_cdc_sink_memory_capacity{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "sink-cap-{{instance}}", + "refId": "F", + "step": 10 + }, + { + "expr": "sum(tikv_cdc_old_value_cache_memory_quota{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "old-value-cap-{{instance}}", + "refId": "G", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CDC memory", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": 
false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "The min resolved ts of each TiKV", + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 7, + "x": 0, + "y": 21 + }, + "id": 152, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*-ts/", + "lines": true, + "linewidth": 3, + "points": false, + "yaxis": 2 + }, + { + "alias": "/.*-lag/", + "bars": true, + "fill": 1 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "scalar(max(pd_cluster_tso{tidb_cluster=\"$tidb_cluster\"}))/1000 - avg(tikv_cdc_min_resolved_ts{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}/1000) by (instance) > 0", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 2, + "legendFormat": "{{instance}}-min-resolved-lag", + "refId": "A", + "step": 10 + }, + { + "expr": "max(pd_cluster_tso{tidb_cluster=\"$tidb_cluster\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "current-ts", + "refId": "B", + "step": 10 + }, + { + "expr": "avg(tikv_cdc_min_resolved_ts{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{instance}}-min-resolved-ts", + "refId": "C", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Min resolved ts", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" 
+ }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "The ID of the min resolved region of each TiKV", + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 5, + "x": 7, + "y": 21 + }, + "id": 153, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 1, + "points": true, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(tikv_cdc_min_resolved_ts_region{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{instance}}-min-resolved-region", "refId": "A", "step": 10 } @@ -8207,7 +10136,277 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "gRPC message count", + "title": "Min resolved Region", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, 
+ "max": null, + "min": "0", + "show": true + }, + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "", + "fill": 1, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 70, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99999, sum(rate(tikv_cdc_resolved_ts_gap_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (le, instance))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}-p9999", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Resolved ts lag duration percentile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 10, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": 0, + 
"cardRound": 0 + }, + "color": { + "cardColor": "#FF9830", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The time consumed to CDC incremental scan", + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 28 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 68, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "links": [], + "maxPerRow": 3, + "repeat": null, + "repeatDirection": "h", + "reverseYBuckets": false, + "targets": [ + { + "expr": "sum(rate(tikv_cdc_scan_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (le)", + "format": "heatmap", + "instant": false, + "intervalFactor": 2, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Initial scan duration", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 1, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 28 + }, + "id": 72, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + 
"values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.9999, sum(rate(tikv_cdc_scan_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (le, instance))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}-p9999", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Initial scan duration percentile", "tooltip": { "shared": true, "sort": 0, @@ -8223,7 +10422,7 @@ }, "yaxes": [ { - "format": "ops", + "format": "s", "label": null, "logBase": 1, "max": null, @@ -8236,7 +10435,7 @@ "logBase": 1, "max": null, "min": null, - "show": true + "show": false } ], "yaxis": { @@ -8251,28 +10450,27 @@ "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, - "description": "The memory usage per TiKV instance", - "editable": true, - "error": false, - "fill": 0, - "grid": {}, + "description": "The number of incremental scan task in different status.", + "fill": 1, "gridPos": { "h": 7, - "w": 12, - "x": 12, - "y": 14 + "w": 6, + "x": 18, + "y": 28 }, - "id": 194, + "id": 140, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": false, + "hideZero": true, "max": true, "min": false, - "rightSide": true, + "rightSide": false, "show": true, "sideWidth": null, - "sort": "max", + "sort": "current", "sortDesc": true, "total": false, "values": true @@ -8288,13 +10486,7 @@ "renderer": "flot", "seriesOverrides": [ { - "alias": "/.*-cap-.*/", - "yaxis": 2 - }, - { - "alias": "/.*tikv.*/", - "pointradius": 1, - "points": true, + "alias": "/.*ongoing/", "yaxis": 2 } ], @@ -8303,76 +10495,28 @@ "steppedLine": false, "targets": [ { - "expr": 
"avg(process_resident_memory_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", job=~\"tikv.*\"}) by (instance)", - "format": "time_series", - "hide": true, - "intervalFactor": 2, - "legendFormat": "tikv-{{instance}}", - "refId": "A", - "step": 10 - }, - { - "expr": "avg(process_resident_memory_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", job=~\"cdc.*\"}) by (instance)", - "format": "time_series", - "hide": true, - "intervalFactor": 2, - "legendFormat": "cdc-{{instance}}", - "refId": "B", - "step": 10 - }, - { - "expr": "(avg(process_resident_memory_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", job=~\"tikv.*\"}) by (instance)) - (avg(tikv_engine_block_cache_size_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", db=\"kv\"}) by(instance))", - "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "tikv-{{instance}}", - "refId": "C", - "step": 10 - }, - { - "expr": "sum(tikv_cdc_sink_memory_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", + "expr": "sum(tikv_cdc_scan_tasks{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type=\"ongoing\"}) by (type, instance)", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "sink-{{instance}}", - "refId": "D", - "step": 10 + "intervalFactor": 1, + "legendFormat": "{{instance}}-{{type}}", + "refId": "A" }, { - "expr": "sum(tikv_cdc_old_value_cache_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", + "expr": "sum(tikv_cdc_scan_tasks{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type=\"total\"}) by (instance) - sum(tikv_cdc_scan_tasks{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type=~\"abort|finish\"}) by (instance)", "format": "time_series", "hide": false, - "intervalFactor": 2, - "legendFormat": "old-value-{{instance}}", - "refId": "E", - "step": 10 - }, - { - "expr": 
"sum(tikv_cdc_sink_memory_capacity{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", - "format": "time_series", - "hide": true, - "intervalFactor": 2, - "legendFormat": "sink-cap-{{instance}}", - "refId": "F", - "step": 10 - }, - { - "expr": "sum(tikv_cdc_old_value_cache_memory_quota{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", - "format": "time_series", - "hide": true, - "intervalFactor": 2, - "legendFormat": "old-value-cap-{{instance}}", - "refId": "G", - "step": 10 + "intervalFactor": 1, + "legendFormat": "{{instance}}-pending", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "CDC memory", + "title": "Initial scan tasks status", "tooltip": { - "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" @@ -8387,15 +10531,15 @@ }, "yaxes": [ { - "format": "bytes", + "format": "none", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "format": "bytes", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -8415,25 +10559,25 @@ "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, - "description": "The min resolved ts of each TiKV", + "description": "The memory usage per TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, - "w": 7, + "w": 12, "x": 0, - "y": 21 + "y": 35 }, - "id": 152, + "id": 78, "legend": { "alignAsTable": true, "avg": false, "current": true, - "max": false, + "max": true, "min": false, - "rightSide": false, + "rightSide": true, "show": true, "sideWidth": null, "sort": null, @@ -8447,61 +10591,38 @@ "nullPointMode": "null", "paceLength": 10, "percentage": false, - "pointradius": 1, + "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/.*-ts/", - "lines": true, - "linewidth": 3, - "points": false, - "yaxis": 2 - }, - { - "alias": "/.*-lag/", - "bars": true, - 
"fill": 1 - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "scalar(max(pd_cluster_tso{tidb_cluster=\"$tidb_cluster\"}))/1000 - avg(tikv_cdc_min_resolved_ts{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}/1000) by (instance) > 0", + "expr": "avg(tikv_cdc_captured_region_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", "format": "time_series", "hide": false, - "instant": false, "intervalFactor": 2, - "legendFormat": "{{instance}}-min-resolved-lag", + "legendFormat": "tikv-{{instance}}-total", "refId": "A", "step": 10 }, { - "expr": "max(pd_cluster_tso{tidb_cluster=\"$tidb_cluster\"})", + "expr": "sum(tikv_cdc_region_resolve_status{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance, status)", "format": "time_series", "hide": false, "intervalFactor": 2, - "legendFormat": "current-ts", + "legendFormat": "tikv-{{instance}}-{{status}}", "refId": "B", "step": 10 - }, - { - "expr": "avg(tikv_cdc_min_resolved_ts{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", - "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}}-min-resolved-ts", - "refId": "C", - "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Min resolved ts", + "title": "Captured region count", "tooltip": { "msResolution": false, "shared": true, @@ -8518,7 +10639,7 @@ }, "yaxes": [ { - "format": "s", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -8526,7 +10647,7 @@ "show": true }, { - "format": "none", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -8546,40 +10667,42 @@ "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, - "description": "The ID of the min resolved region of each TiKV", + "description": "The speed of TiKV CDC incremental scan", "editable": true, "error": false, "fill": 0, "grid": {}, 
"gridPos": { "h": 7, - "w": 5, - "x": 7, - "y": 21 + "w": 6, + "x": 12, + "y": 35 }, - "id": 153, + "id": 76, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": false, "show": true, "sideWidth": null, - "sort": null, - "sortDesc": null, + "sort": "current", + "sortDesc": true, "total": false, "values": true }, - "lines": false, + "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, - "pointradius": 1, - "points": true, + "pointradius": 5, + "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, @@ -8587,11 +10710,11 @@ "steppedLine": false, "targets": [ { - "expr": "avg(tikv_cdc_min_resolved_ts_region{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", + "expr": "sum(rate(tikv_cdc_scan_bytes_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", job=\"tikv\"}[30s])) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, - "legendFormat": "{{instance}}-min-resolved-region", + "legendFormat": "tikv-{{instance}}", "refId": "A", "step": 10 } @@ -8600,7 +10723,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Min resolved Region", + "title": "CDC scan speed", "tooltip": { "msResolution": false, "shared": true, @@ -8617,7 +10740,7 @@ }, "yaxes": [ { - "format": "none", + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -8625,12 +10748,12 @@ "show": true }, { - "format": "none", + "format": "short", "label": null, "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { @@ -8640,37 +10763,40 @@ }, { "aliasColors": {}, - "bars": false, + "bars": true, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, - "description": "", - "fill": 1, + "description": "The total bytes of TiKV CDC incremental scan", + "editable": true, + "error": false, + 
"fill": 0, + "grid": {}, "gridPos": { "h": 7, - "w": 12, - "x": 12, - "y": 21 + "w": 6, + "x": 18, + "y": 35 }, - "id": 70, + "id": 139, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, + "hideEmpty": true, "hideZero": true, "max": true, "min": false, - "rightSide": true, + "rightSide": false, "show": true, "sideWidth": null, - "sort": null, - "sortDesc": null, + "sort": "current", + "sortDesc": true, "total": false, "values": true }, - "lines": true, + "lines": false, "linewidth": 1, "links": [], "nullPointMode": "null", @@ -8685,19 +10811,22 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99999, sum(rate(tikv_cdc_resolved_ts_gap_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (le, instance))", + "expr": "sum(tikv_cdc_scan_bytes_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", job=\"tikv\"}) by (instance)", "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{instance}}-p9999", - "refId": "A" + "hide": false, + "intervalFactor": 2, + "legendFormat": "tikv-{{instance}}", + "refId": "A", + "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Resolved ts lag duration percentile", + "title": "CDC total scan bytes", "tooltip": { + "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" @@ -8712,9 +10841,9 @@ }, "yaxes": [ { - "format": "s", + "format": "bytes", "label": null, - "logBase": 10, + "logBase": 1, "max": null, "min": null, "show": true @@ -8734,83 +10863,131 @@ } }, { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The time consumed 
to CDC incremental scan", + "decimals": 1, + "description": "", + "fill": 4, "gridPos": { "h": 7, "w": 12, "x": 0, - "y": 28 + "y": 42 }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 68, + "id": 143, "legend": { "alignAsTable": true, "avg": false, "current": true, - "max": true, - "min": false, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": true, "rightSide": true, "show": true, + "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, + "lines": true, + "linewidth": 1, "links": [], - "maxPerRow": 3, - "repeat": null, - "repeatDirection": "h", - "reverseYBuckets": false, + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 1, + "points": true, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/(access|miss).*/", + "fill": 0, + "points": false, + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_cdc_scan_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (le)", - "format": "heatmap", - "instant": false, - "intervalFactor": 2, - "legendFormat": "{{le}}", + "expr": "(sum(rate(tikv_cdc_old_value_cache_access{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance) - sum(rate(tikv_cdc_old_value_cache_miss{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance)) / sum(rate(tikv_cdc_old_value_cache_access{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "hit-rate-{{instance}}", "refId": "A" + }, + { + "expr": "-sum(rate(tikv_cdc_old_value_cache_access{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance)", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "legendFormat": "access-{{instance}}", + "refId": "B" + }, + 
{ + "expr": "-sum(rate(tikv_cdc_old_value_cache_miss{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "miss-{{instance}}", + "refId": "C" + }, + { + "expr": "-sum(rate(tikv_cdc_old_value_cache_miss_none{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "miss-none-{{instance}}", + "refId": "D" } ], - "title": "Initial scan duration", + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Old value cache hit", "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true + "shared": true, + "sort": 0, + "value_type": "individual" }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, "show": true, - "splitFactor": null + "values": [] }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": {}, @@ -8819,15 +10996,15 @@ "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, - "description": "", + "description": "The total number of cache entries in the old value cache.", "fill": 1, "gridPos": { "h": 7, - "w": 6, + "w": 12, "x": 12, - "y": 28 + "y": 42 }, - "id": 72, + "id": 145, "legend": { "alignAsTable": true, "avg": false, @@ -8836,7 +11013,7 @@ "hideZero": true, "max": true, "min": false, - "rightSide": false, + 
"rightSide": true, "show": true, "sideWidth": null, "sort": "current", @@ -8850,27 +11027,49 @@ "nullPointMode": "null", "paceLength": 10, "percentage": false, - "pointradius": 5, + "pointradius": 1, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "alias": "/.*len/", + "yaxis": 2 + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.9999, sum(rate(tikv_cdc_scan_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (le, instance))", + "expr": "sum(tikv_cdc_old_value_cache_length{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", "format": "time_series", + "hide": false, "intervalFactor": 1, - "legendFormat": "{{instance}}-p9999", + "legendFormat": "{{instance}}-len", "refId": "A" + }, + { + "expr": "sum(tikv_cdc_old_value_cache_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance) / sum(tikv_cdc_old_value_cache_length{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}-avg entry bytes", + "refId": "B" + }, + { + "expr": "sum(tikv_cdc_old_value_cache_memory_quota{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}-quota", + "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Initial scan duration percentile", + "title": "Old value cache size", "tooltip": { "shared": true, "sort": 0, @@ -8886,20 +11085,21 @@ }, "yaxes": [ { - "format": "s", + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { - "format": "short", + "decimals": null, + "format": "none", "label": null, "logBase": 1, "max": null, - "min": null, - "show": 
false + "min": "0", + "show": true } ], "yaxis": { @@ -8914,24 +11114,27 @@ "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, - "description": "The number of incremental scan task in different status.", - "fill": 1, + "description": "", + "editable": true, + "error": false, + "fill": 0, + "grid": {}, "gridPos": { "h": 7, - "w": 6, - "x": 18, - "y": 28 + "w": 12, + "x": 0, + "y": 49 }, - "id": 140, + "id": 141, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": false, + "hideEmpty": true, "hideZero": true, "max": true, "min": false, - "rightSide": false, + "rightSide": true, "show": true, "sideWidth": null, "sort": "current", @@ -8948,39 +11151,28 @@ "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/.*ongoing/", - "yaxis": 2 - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_cdc_scan_tasks{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type=\"ongoing\"}) by (type, instance)", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{instance}}-{{type}}", - "refId": "A" - }, - { - "expr": "sum(tikv_cdc_scan_tasks{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type=\"total\"}) by (instance) - sum(tikv_cdc_scan_tasks{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type=~\"abort|finish\"}) by (instance)", + "expr": "sum(rate(tikv_cdc_old_value_scan_details{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance, cf, tag)", "format": "time_series", "hide": false, - "intervalFactor": 1, - "legendFormat": "{{instance}}-pending", - "refId": "B" + "intervalFactor": 2, + "legendFormat": "{{instance}}-{{cf}}-{{tag}}", + "refId": "A", + "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Initial scan tasks status", + "title": "Old value seek operation", "tooltip": { + 
"msResolution": false, "shared": true, "sort": 0, "value_type": "individual" @@ -8995,15 +11187,15 @@ }, "yaxes": [ { - "format": "none", + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { - "format": "none", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -9016,6 +11208,84 @@ "alignLevel": null } }, + { + "cards": { + "cardPadding": 0, + "cardRound": 0 + }, + "color": { + "cardColor": "#FF9830", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The time consumed to get an old value (both from cache and from disk)", + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 49 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 146, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "links": [], + "maxPerRow": 3, + "repeatDirection": "h", + "reverseYBuckets": false, + "targets": [ + { + "expr": "sum(rate(tikv_cdc_old_value_duration_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (le)", + "format": "heatmap", + "instant": false, + "intervalFactor": 2, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Old value seek duration", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 1, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, { "aliasColors": {}, "bars": false, @@ -9023,29 +11293,28 @@ "dashes": false, 
"datasource": "${DS_TEST-CLUSTER}", "decimals": 1, - "description": "The memory usage per TiKV instance", - "editable": true, - "error": false, - "fill": 0, - "grid": {}, + "description": "", + "fill": 1, "gridPos": { "h": 7, - "w": 12, - "x": 0, - "y": 35 + "w": 6, + "x": 18, + "y": 49 }, - "id": 78, + "id": 142, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": false, + "hideZero": true, "max": true, "min": false, - "rightSide": true, + "rightSide": false, "show": true, "sideWidth": null, - "sort": null, - "sortDesc": null, + "sort": "current", + "sortDesc": true, "total": false, "values": true }, @@ -9064,31 +11333,33 @@ "steppedLine": false, "targets": [ { - "expr": "avg(tikv_cdc_captured_region_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", + "expr": "histogram_quantile(0.99, sum(rate(tikv_cdc_old_value_duration_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (le, instance, tag))", "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "tikv-{{instance}}-total", - "refId": "A", - "step": 10 + "intervalFactor": 1, + "legendFormat": "{{instance}}-99%-{{tag}}", + "refId": "A" }, { - "expr": "sum(tikv_cdc_region_resolve_status{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance, status)", + "expr": "histogram_quantile(0.95, sum(rate(tikv_cdc_old_value_duration_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (le, instance, tag))", "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "tikv-{{instance}}-{{status}}", - "refId": "B", - "step": 10 + "intervalFactor": 1, + "legendFormat": "{{instance}}-95%-{{tag}}", + "refId": "B" + }, + { + "expr": "sum(rate(tikv_cdc_old_value_duration_sum{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (le, instance, tag) / sum(rate(tikv_cdc_old_value_duration_count{tidb_cluster=\"$tidb_cluster\", 
instance=~\"$tikv_instance\"}[1m])) by (le, instance, tag)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}-avg-{{tag}}", + "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Captured region count", + "title": "Old value seek duration", "tooltip": { - "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" @@ -9103,11 +11374,11 @@ }, "yaxes": [ { - "format": "none", + "format": "s", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { @@ -9116,56 +11387,66 @@ "logBase": 1, "max": null, "min": null, - "show": true + "show": false } ], "yaxis": { "align": false, "alignLevel": null } - }, + } + ], + "title": "TiKV", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 294, + "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The speed of TiKV CDC incremental scan", - "editable": true, - "error": false, - "fill": 0, - "grid": {}, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, - "w": 6, - "x": 12, - "y": 35 + "w": 4, + "x": 0, + "y": 8 }, - "id": 76, + "hiddenSeries": false, + "id": 321, "legend": { - "alignAsTable": true, "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, + "current": false, + "max": false, "min": false, - "rightSide": false, "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, - "links": [], "nullPointMode": "null", - "paceLength": 10, + "options": { + "alertThreshold": true + }, "percentage": false, - "pointradius": 5, + "pluginVersion": "7.5.7", + "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], @@ -9174,22 
+11455,20 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_cdc_scan_bytes_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", job=\"tikv\"}[30s])) by (instance)", - "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "tikv-{{instance}}", - "refId": "A", - "step": 10 + "exemplar": true, + "expr": "sum without (from) (rate(ticdc_message_server_message_count{instance=~\"$capture\"}[30s]))", + "interval": "", + "legendFormat": "{{instance}}", + "queryType": "randomWalk", + "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "CDC scan speed", + "title": "Message Receive Rate", "tooltip": { - "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" @@ -9204,11 +11483,11 @@ }, "yaxes": [ { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { @@ -9227,46 +11506,42 @@ }, { "aliasColors": {}, - "bars": true, + "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The total bytes of TiKV CDC incremental scan", - "editable": true, - "error": false, - "fill": 0, - "grid": {}, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, "gridPos": { "h": 7, - "w": 6, - "x": 18, - "y": 35 + "w": 4, + "x": 4, + "y": 8 }, - "id": 139, + "hiddenSeries": false, + "id": 323, "legend": { - "alignAsTable": true, "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, + "current": false, + "max": false, "min": false, - "rightSide": false, "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, "total": false, - "values": true + "values": false }, - "lines": false, + "lines": true, "linewidth": 1, - "links": [], "nullPointMode": "null", - "paceLength": 10, + "options": { + "alertThreshold": true + }, "percentage": false, - 
"pointradius": 5, + "pluginVersion": "7.5.7", + "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], @@ -9275,22 +11550,20 @@ "steppedLine": false, "targets": [ { - "expr": "sum(tikv_cdc_scan_bytes_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", job=\"tikv\"}) by (instance)", - "format": "time_series", - "hide": false, - "intervalFactor": 2, - "legendFormat": "tikv-{{instance}}", - "refId": "A", - "step": 10 + "exemplar": true, + "expr": "sum without (to) (rate(ticdc_message_client_message_count{instance=~\"$capture\"}[30s]))", + "interval": "", + "legendFormat": "{{instance}}", + "queryType": "randomWalk", + "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "CDC total scan bytes", + "title": "Message Send Rate", "tooltip": { - "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" @@ -9305,7 +11578,7 @@ }, "yaxes": [ { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -9332,96 +11605,83 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", - "fill": 4, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, "gridPos": { "h": 7, - "w": 12, - "x": 0, - "y": 42 + "w": 4, + "x": 8, + "y": 8 }, - "id": 143, + "hiddenSeries": false, + "id": 352, "legend": { - "alignAsTable": true, "avg": false, - "current": true, - "hideEmpty": false, + "current": false, + "hideEmpty": true, "hideZero": true, "max": false, - "min": true, - "rightSide": true, + "min": false, "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, - "links": [], "nullPointMode": "null", - "paceLength": 10, + "options": { + "alertThreshold": true + }, "percentage": false, - "pointradius": 1, - "points": true, + "pluginVersion": "7.5.7", + "pointradius": 
2, + "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/(access|miss).*/", - "fill": 0, - "points": false, - "yaxis": 2 - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "(sum(rate(tikv_cdc_old_value_cache_access{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance) - sum(rate(tikv_cdc_old_value_cache_miss{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance)) / sum(rate(tikv_cdc_old_value_cache_access{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance)", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "hit-rate-{{instance}}", + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(ticdc_message_server_message_batch_size_bucket[30s])) by (le))", + "interval": "", + "legendFormat": "p50", + "queryType": "randomWalk", "refId": "A" }, { - "expr": "-sum(rate(tikv_cdc_old_value_cache_access{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance)", - "format": "time_series", - "hide": true, - "intervalFactor": 1, - "legendFormat": "access-{{instance}}", + "exemplar": true, + "expr": "histogram_quantile(0.95, sum(rate(ticdc_message_server_message_batch_size_bucket[30s])) by (le))", + "hide": false, + "interval": "", + "legendFormat": "p95", "refId": "B" }, { - "expr": "-sum(rate(tikv_cdc_old_value_cache_miss{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance)", - "format": "time_series", + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ticdc_message_server_message_batch_size_bucket[30s])) by (le))", "hide": false, - "intervalFactor": 1, - "legendFormat": "miss-{{instance}}", + "interval": "", + "legendFormat": "p99", "refId": "C" - }, - { - "expr": "-sum(rate(tikv_cdc_old_value_cache_miss_none{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance)", - 
"format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "miss-none-{{instance}}", - "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Old value cache hit", + "title": "Message Batch Size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, + "transformations": [], "type": "graph", "xaxis": { "buckets": null, @@ -9432,15 +11692,15 @@ }, "yaxes": [ { - "format": "percentunit", + "format": "short", "label": null, "logBase": 1, - "max": "1", - "min": "0", + "max": null, + "min": null, "show": true }, { - "format": "none", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -9459,81 +11719,77 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The total number of cache entries in the old value cache.", - "fill": 1, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, "gridPos": { "h": 7, - "w": 12, + "w": 6, "x": 12, - "y": 42 + "y": 8 }, - "id": 145, + "hiddenSeries": false, + "id": 354, "legend": { - "alignAsTable": true, "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": true, + "current": false, + "max": false, "min": false, - "rightSide": true, "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, - "links": [], "nullPointMode": "null", - "paceLength": 10, + "options": { + "alertThreshold": true + }, "percentage": false, - "pointradius": 1, + "pluginVersion": "7.5.7", + "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/.*len/", - "yaxis": 2 - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(tikv_cdc_old_value_cache_length{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by 
(instance)", - "format": "time_series", + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(ticdc_message_server_message_batch_bytes_bucket [30s])) by (le))", "hide": false, - "intervalFactor": 1, - "legendFormat": "{{instance}}-len", - "refId": "A" + "interval": "", + "legendFormat": "p50", + "refId": "C" }, { - "expr": "sum(tikv_cdc_old_value_cache_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance) / sum(tikv_cdc_old_value_cache_length{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{instance}}-avg entry bytes", - "refId": "B" + "exemplar": true, + "expr": "histogram_quantile(0.95, sum(rate(ticdc_message_server_message_batch_bytes_bucket [30s])) by (le))", + "interval": "", + "legendFormat": "p95", + "queryType": "randomWalk", + "refId": "A" }, { - "expr": "sum(tikv_cdc_old_value_cache_memory_quota{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance)", - "format": "time_series", + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ticdc_message_server_message_batch_bytes_bucket [30s])) by (le))", "hide": false, - "intervalFactor": 1, - "legendFormat": "{{instance}}-quota", - "refId": "C" + "interval": "", + "legendFormat": "p99", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Old value cache size", + "title": "Receive message batch bytes percentiles", "tooltip": { "shared": true, "sort": 0, @@ -9549,20 +11805,19 @@ }, "yaxes": [ { - "format": "bytes", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { - "decimals": null, - "format": "none", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true } ], @@ -9577,42 +11832,38 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - 
"decimals": 1, - "description": "", - "editable": true, - "error": false, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "fill": 0, - "grid": {}, + "fillGradient": 0, "gridPos": { "h": 7, - "w": 12, - "x": 0, - "y": 49 + "w": 6, + "x": 18, + "y": 8 }, - "id": 141, + "hiddenSeries": false, + "id": 356, "legend": { - "alignAsTable": true, "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, + "current": false, + "max": false, "min": false, - "rightSide": true, "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, - "links": [], "nullPointMode": "null", - "paceLength": 10, + "options": { + "alertThreshold": true + }, "percentage": false, - "pointradius": 5, + "pluginVersion": "7.5.7", + "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], @@ -9621,22 +11872,36 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_cdc_old_value_scan_details{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance, cf, tag)", - "format": "time_series", + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(ticdc_message_server_message_bytes_bucket [30s])) by (le))", + "interval": "", + "legendFormat": "p50", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.95, sum(rate(ticdc_message_server_message_bytes_bucket [30s])) by (le))", "hide": false, - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{cf}}-{{tag}}", - "refId": "A", - "step": 10 + "interval": "", + "legendFormat": "p95", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ticdc_message_server_message_bytes_bucket [30s])) by (le))", + "hide": false, + "interval": "", + "legendFormat": "p99", + "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Old value seek 
operation", + "title": "Receive Message Bytes Percentile", "tooltip": { - "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" @@ -9651,11 +11916,11 @@ }, "yaxes": [ { - "format": "ops", + "format": "short", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { @@ -9673,82 +11938,132 @@ } }, { - "cards": { - "cardPadding": 0, - "cardRound": 0 - }, - "color": { - "cardColor": "#FF9830", - "colorScale": "linear", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "min": 0, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", + "columns": [], "datasource": "${DS_TEST-CLUSTER}", - "description": "The time consumed to get an old value (both from cache and from disk)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "left", + "displayMode": "color-background", + "filterable": false + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + }, + { + "color": "dark-blue", + "value": 1 + }, + { + "color": "dark-red", + "value": 2 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "dest\\src" + }, + "properties": [ + { + "id": "custom.width", + "value": 137 + } + ] + } + ] + }, + "fontSize": "100%", "gridPos": { - "h": 7, - "w": 6, - "x": 12, - "y": 49 + "h": 8, + "w": 20, + "x": 0, + "y": 15 }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 146, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true + "id": 350, + "options": { + "showHeader": true, + "sortBy": [] }, - "links": [], - "maxPerRow": 3, - "repeatDirection": "h", - "reverseYBuckets": false, + "pageSize": null, + "pluginVersion": "7.5.7", + "scroll": true, + "showHeader": true, + 
"sort": { + "col": 0, + "desc": true + }, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "date" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "/.*/", + "thresholds": [], + "type": "number", + "unit": "short" + } + ], "targets": [ { - "expr": "sum(rate(tikv_cdc_old_value_duration_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (le)", - "format": "heatmap", - "instant": false, - "intervalFactor": 2, - "legendFormat": "{{le}}", + "exemplar": true, + "expr": "ticdc_message_server_cur_stream_count", + "format": "time_series", + "instant": true, + "interval": "", + "legendFormat": "", + "queryType": "randomWalk", "refId": "A" } ], - "title": "Old value seek duration", - "tooltip": { - "show": true, - "showHistogram": true - }, - "tooltipDecimals": 1, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 1, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null + "title": "Stream Count Between Nodes", + "transform": "timeseries_to_columns", + "transformations": [ + { + "id": "labelsToFields", + "options": { + "valueLabel": "from" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "instance": false, + "job": true + }, + "indexByName": {}, + "renameByName": { + "instance": "dest\\src" + } + } + } + ], + "type": "table" }, { "aliasColors": {}, @@ -9756,39 +12071,38 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "fill": 1, + "fillGradient": 0, "gridPos": { - "h": 7, - "w": 6, - "x": 18, - "y": 49 + 
"h": 8, + "w": 4, + "x": 20, + "y": 15 }, - "id": 142, + "hiddenSeries": false, + "id": 358, "legend": { - "alignAsTable": true, "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": true, + "current": false, + "max": false, "min": false, - "rightSide": false, "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, - "links": [], "nullPointMode": "null", - "paceLength": 10, + "options": { + "alertThreshold": true + }, "percentage": false, - "pointradius": 5, + "pluginVersion": "7.5.7", + "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], @@ -9797,32 +12111,27 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_cdc_old_value_duration_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (le, instance, tag))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{instance}}-99%-{{tag}}", + "exemplar": true, + "expr": "sum(ticdc_message_server_cur_stream_count)", + "interval": "", + "legendFormat": "actual", + "queryType": "randomWalk", "refId": "A" }, { - "expr": "histogram_quantile(0.95, sum(rate(tikv_cdc_old_value_duration_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (le, instance, tag))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{instance}}-95%-{{tag}}", + "exemplar": true, + "expr": "2 * count(process_start_time_seconds{tidb_cluster=\"$tidb_cluster\", job=\"ticdc\"}) - 1", + "hide": false, + "interval": "", + "legendFormat": "expected", "refId": "B" - }, - { - "expr": "sum(rate(tikv_cdc_old_value_duration_sum{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (le, instance, tag) / sum(rate(tikv_cdc_old_value_duration_count{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (le, instance, tag)", - "format": "time_series", - 
"intervalFactor": 1, - "legendFormat": "{{instance}}-avg-{{tag}}", - "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Old value seek duration", + "title": "Total Stream Count", "tooltip": { "shared": true, "sort": 0, @@ -9838,7 +12147,7 @@ }, "yaxes": [ { - "format": "s", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -9851,7 +12160,7 @@ "logBase": 1, "max": null, "min": null, - "show": false + "show": true } ], "yaxis": { @@ -9860,788 +12169,16 @@ } } ], - "title": "TiKV", - "type": "row" - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 7 - }, - "id": 294, - "panels": [], "title": "Peer Messages", "type": "row" }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_CDC-CLUSTER1}", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 4, - "x": 0, - "y": 8 - }, - "hiddenSeries": false, - "id": 321, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum without (from) (rate(ticdc_message_server_message_count{instance=~\"$capture\"}[30s]))", - "interval": "", - "legendFormat": "{{instance}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Message Receive Rate", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": 
"time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_CDC-CLUSTER1}", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 4, - "x": 4, - "y": 8 - }, - "hiddenSeries": false, - "id": 323, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum without (to) (rate(ticdc_message_client_message_count{instance=~\"$capture\"}[30s]))", - "interval": "", - "legendFormat": "{{instance}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Message Send Rate", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": 
false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_CDC-CLUSTER1}", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 4, - "x": 8, - "y": 8 - }, - "hiddenSeries": false, - "id": 352, - "legend": { - "avg": false, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ticdc_message_server_message_batch_size_bucket[30s])) by (le))", - "interval": "", - "legendFormat": "p50", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(ticdc_message_server_message_batch_size_bucket[30s])) by (le))", - "hide": false, - "interval": "", - "legendFormat": "p90", - "refId": "B" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ticdc_message_server_message_batch_size_bucket[30s])) by (le))", - "hide": false, - "interval": "", - "legendFormat": "p99", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Message Batch Size", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "transformations": [], - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true 
- } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_CDC-CLUSTER1}", - "fieldConfig": { - "defaults": { - "unit": "short" - }, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 6, - "x": 12, - "y": 8 - }, - "hiddenSeries": false, - "id": 354, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ticdc_message_server_message_batch_bytes_bucket [30s])) by (le))", - "hide": false, - "interval": "", - "legendFormat": "p50", - "refId": "C" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(ticdc_message_server_message_batch_bytes_bucket [30s])) by (le))", - "interval": "", - "legendFormat": "p95", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ticdc_message_server_message_batch_bytes_bucket [30s])) by (le))", - "hide": false, - "interval": "", - "legendFormat": "p99", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Receive message batch bytes percentiles", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": 
"short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_CDC-CLUSTER1}", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 6, - "x": 18, - "y": 8 - }, - "hiddenSeries": false, - "id": 356, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ticdc_message_server_message_bytes_bucket [30s])) by (le))", - "interval": "", - "legendFormat": "p50", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(ticdc_message_server_message_bytes_bucket [30s])) by (le))", - "hide": false, - "interval": "", - "legendFormat": "p95", - "refId": "B" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ticdc_message_server_message_bytes_bucket [30s])) by (le))", - "hide": false, - "interval": "", - "legendFormat": "p99", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Receive Message Bytes Percentile", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 
null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "columns": [], - "datasource": "${DS_CDC-CLUSTER1}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": "left", - "displayMode": "color-background", - "filterable": false - }, - "mappings": [], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgb(255, 255, 255)", - "value": null - }, - { - "color": "dark-blue", - "value": 1 - }, - { - "color": "dark-red", - "value": 2 - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "dest\\src" - }, - "properties": [ - { - "id": "custom.width", - "value": 137 - } - ] - } - ] - }, - "fontSize": "100%", - "gridPos": { - "h": 8, - "w": 20, - "x": 0, - "y": 15 - }, - "id": 350, - "options": { - "showHeader": true, - "sortBy": [] - }, - "pageSize": null, - "pluginVersion": "7.5.7", - "scroll": true, - "showHeader": true, - "sort": { - "col": 0, - "desc": true - }, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "date" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "decimals": 2, - "pattern": "/.*/", - "thresholds": [], - "type": "number", - "unit": "short" - } - ], - "targets": [ - { - "exemplar": true, - "expr": "ticdc_message_server_cur_stream_count", - "format": "time_series", - "instant": true, - "interval": "", - "legendFormat": "", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "Stream Count Between Nodes", - "transform": "timeseries_to_columns", - "transformations": [ - { - "id": "labelsToFields", - "options": { - "valueLabel": "from" - } - }, - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true, - 
"instance": false, - "job": true - }, - "indexByName": {}, - "renameByName": { - "instance": "dest\\src" - } - } - } - ], - "type": "table" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_CDC-CLUSTER1}", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 4, - "x": 20, - "y": 15 - }, - "hiddenSeries": false, - "id": 358, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ticdc_message_server_cur_stream_count)", - "interval": "", - "legendFormat": "actual", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "2 * count(process_start_time_seconds{tidb_cluster=\"$tidb_cluster\", job=\"ticdc\"}) - 1", - "hide": false, - "interval": "", - "legendFormat": "expected", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Total Stream Count", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 7 + "y": 8 }, 
"id": 155, "panels": [ @@ -11217,7 +12754,7 @@ "h": 1, "w": 24, "x": 0, - "y": 8 + "y": 9 }, "id": 187, "panels": [ @@ -11783,5 +13320,5 @@ "timezone": "browser", "title": "Test-Cluster-TiCDC", "uid": "YiGL8hBZ1", - "version": 30 -} + "version": 31 +} \ No newline at end of file