From 51cfefb9b5ab5bd39edf2da5349d84562d92fa5f Mon Sep 17 00:00:00 2001 From: Zheming Li Date: Thu, 3 Sep 2020 14:28:38 +0800 Subject: [PATCH] cherry pick #2881 to release-4.0 Signed-off-by: ti-srebot --- metrics/grafana/pd.json | 194 ++++++++++++++++++++++++++++++++++++++++ server/grpc_service.go | 16 ++++ server/metrics.go | 20 +++++ 3 files changed, 230 insertions(+) diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index 2bc93f41d1f..85bc032b8c7 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -8016,6 +8016,200 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The store heartbeat handle duration in .99", + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 47 + }, + "id": 1400, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(pd_scheduler_handle_store_heartbeat_duration_seconds_bucket{store=~\"$store\"}[1m])) by (address, store, le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{address}}-store-{{store}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "99% store heartbeat handle duration", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": 
"graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The region heartbeat handle duration in .99", + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 47 + }, + "id": 1401, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(pd_scheduler_handle_region_heartbeat_duration_seconds_bucket{store=~\"$store\"}[1m])) by (address, store, le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{address}}-store-{{store}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "99% region heartbeat handle duration", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + 
"min": "0", + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "repeat": null, diff --git a/server/grpc_service.go b/server/grpc_service.go index 036f92a5116..c380d9185ad 100644 --- a/server/grpc_service.go +++ b/server/grpc_service.go @@ -295,11 +295,23 @@ func (s *Server) StoreHeartbeat(ctx context.Context, request *pdpb.StoreHeartbea }, nil } + storeID := request.Stats.GetStoreId() + store := rc.GetStore(storeID) + if store == nil { + return nil, core.NewStoreNotFoundErr(storeID) + } + + storeAddress := store.GetAddress() + storeLabel := strconv.FormatUint(storeID, 10) + start := time.Now() + err := rc.HandleStoreHeartbeat(request.Stats) if err != nil { return nil, status.Errorf(codes.Unknown, err.Error()) } + storeHeartbeatHandleDuration.WithLabelValues(storeAddress, storeLabel).Observe(time.Since(start).Seconds()) + return &pdpb.StoreHeartbeatResponse{ Header: s.header(), ReplicationStatus: rc.GetReplicationMode().GetReplicationStatus(), @@ -401,12 +413,16 @@ func (s *Server) RegionHeartbeat(stream pdpb.PD_RegionHeartbeatServer) error { continue } + start := time.Now() + err = rc.HandleRegionHeartbeat(region) if err != nil { msg := err.Error() s.hbStreams.sendErr(pdpb.ErrorType_UNKNOWN, msg, request.GetLeader(), storeAddress, storeLabel) + continue } + regionHeartbeatHandleDuration.WithLabelValues(storeAddress, storeLabel).Observe(time.Since(start).Seconds()) regionHeartbeatCounter.WithLabelValues(storeAddress, storeLabel, "report", "ok").Inc() } } diff --git a/server/metrics.go b/server/metrics.go index bfbfc9935ca..a9335d73c72 100644 --- a/server/metrics.go +++ b/server/metrics.go @@ -65,6 +65,24 @@ var ( Help: "Bucketed histogram of processing time (s) of handled tso requests.", Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13), }) + + regionHeartbeatHandleDuration = prometheus.NewHistogramVec( + 
prometheus.HistogramOpts{ + Namespace: "pd", + Subsystem: "scheduler", + Name: "handle_region_heartbeat_duration_seconds", + Help: "Bucketed histogram of processing time (s) of handled region heartbeat requests.", + Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13), + }, []string{"address", "store"}) + + storeHeartbeatHandleDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: "pd", + Subsystem: "scheduler", + Name: "handle_store_heartbeat_duration_seconds", + Help: "Bucketed histogram of processing time (s) of handled store heartbeat requests.", + Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13), + }, []string{"address", "store"}) ) func init() { @@ -74,4 +92,6 @@ func init() { prometheus.MustRegister(metadataGauge) prometheus.MustRegister(etcdStateGauge) prometheus.MustRegister(tsoHandleDuration) + prometheus.MustRegister(regionHeartbeatHandleDuration) + prometheus.MustRegister(storeHeartbeatHandleDuration) }