Skip to content

Commit

Permalink
Documentation/op-guide: fix failed RPC rate, leader election metrics
Browse files Browse the repository at this point in the history
This fixes the failed RPC rate query, where we do not need
subtraction because we already query by the status code.
Also adds grpc_method to make it more specific. Most of the
time, the failure recovers within 10 seconds, which is our
Prometheus scrape interval, so the 'rate' query might not cover
that time window, showing as 0s, but it still shows up in the graph.

Signed-off-by: Gyu-Ho Lee <[email protected]>
  • Loading branch information
gyuho committed Jun 15, 2017
1 parent ee0c805 commit c642470
Showing 1 changed file with 30 additions and 27 deletions.
57 changes: 30 additions & 27 deletions Documentation/op-guide/grafana.json
Original file line number Diff line number Diff line change
Expand Up @@ -112,23 +112,26 @@
"renderer": "flot",
"seriesOverrides": [],
"span": 5,
"stack": false,
"stack": true,
"steppedLine": false,
"targets": [{
"expr": "sum(rate(grpc_server_started_total{grpc_type=\"unary\"} [1m]))",
"intervalFactor": 2,
"legendFormat": "{{instance}} RPC Rate",
"metric": "grpc_server_started_total",
"refId": "A",
"step": 2
"targets": [
{
"expr": "sum(rate(grpc_server_started_total{grpc_type=\"unary\"}[5m])) by (instance, grpc_method)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{grpc_method}} RPC Rate",
"metric": "grpc_server_started_total",
"refId": "A",
"step": 240
},
{
"expr": "sum(rate(grpc_server_started_total{grpc_type=\"unary\"} [1m])) - sum(rate(grpc_server_handled_total{grpc_type=\"unary\",grpc_code!=\"OK\"} [1m]))",
"intervalFactor": 2,
"legendFormat": "{{instance}} RPC Failed Rate",
"metric": "grpc_server_handled_total",
"refId": "B",
"step": 2
"expr": "rate(grpc_server_handled_total{grpc_type=\"unary\",grpc_code!=\"OK\"}[5m])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}} {{grpc_method}} RPC Failed Rate",
"metric": "grpc_server_handled_total",
"refId": "B",
"step": 240
}
],
"thresholds": [],
Expand Down Expand Up @@ -361,7 +364,7 @@
"stack": false,
"steppedLine": true,
"targets": [{
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket [5m])) by (instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le))",
"hide": false,
"intervalFactor": 2,
"legendFormat": "{{instance}} WAL fsync",
Expand All @@ -370,7 +373,7 @@
"step": 4
},
{
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket [5m])) by (instance, le))",
"expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le))",
"intervalFactor": 2,
"legendFormat": "{{instance}} DB fsync",
"metric": "etcd_disk_backend_commit_duration_seconds_bucket",
Expand Down Expand Up @@ -522,7 +525,7 @@
"stack": true,
"steppedLine": false,
"targets": [{
"expr": "rate(etcd_network_client_grpc_received_bytes_total [1m])",
"expr": "rate(etcd_network_client_grpc_received_bytes_total[5m])",
"intervalFactor": 2,
"legendFormat": "{{instance}} Client Traffic In",
"metric": "etcd_network_client_grpc_received_bytes_total",
Expand Down Expand Up @@ -595,7 +598,7 @@
"stack": true,
"steppedLine": false,
"targets": [{
"expr": "rate(etcd_network_client_grpc_sent_bytes_total [1m])",
"expr": "rate(etcd_network_client_grpc_sent_bytes_total[5m])",
"intervalFactor": 2,
"legendFormat": "{{instance}} Client Traffic Out",
"metric": "etcd_network_client_grpc_sent_bytes_total",
Expand Down Expand Up @@ -668,7 +671,7 @@
"stack": false,
"steppedLine": false,
"targets": [{
"expr": "sum(rate(etcd_network_peer_received_bytes_total [1m])) by (instance)",
"expr": "sum(rate(etcd_network_peer_received_bytes_total[5m])) by (instance)",
"intervalFactor": 2,
"legendFormat": "{{instance}} Peer Traffic In",
"metric": "etcd_network_peer_received_bytes_total",
Expand Down Expand Up @@ -742,7 +745,7 @@
"stack": false,
"steppedLine": false,
"targets": [{
"expr": "sum(rate(etcd_network_peer_sent_bytes_total [1m])) by (instance)",
"expr": "sum(rate(etcd_network_peer_sent_bytes_total[5m])) by (instance)",
"hide": false,
"interval": "",
"intervalFactor": 2,
Expand Down Expand Up @@ -822,7 +825,7 @@
"stack": false,
"steppedLine": false,
"targets": [{
"expr": "sum(rate(etcd_server_proposals_failed_total [1m]))",
"expr": "sum(rate(etcd_server_proposals_failed_total[5m]))",
"intervalFactor": 2,
"legendFormat": "Proposal Failure Rate",
"metric": "etcd_server_proposals_failed_total",
Expand All @@ -838,15 +841,15 @@
"step": 2
},
{
"expr": "sum(rate(etcd_server_proposals_committed_total [1m]))",
"expr": "sum(rate(etcd_server_proposals_committed_total[5m]))",
"intervalFactor": 2,
"legendFormat": "Proposal Commit Rate",
"metric": "etcd_server_proposals_committed_total",
"refId": "C",
"step": 2
},
{
"expr": "sum(rate(etcd_server_proposals_applied_total [1m]))",
"expr": "sum(rate(etcd_server_proposals_applied_total[5m]))",
"intervalFactor": 2,
"legendFormat": "Proposal Apply Rate",
"refId": "D",
Expand Down Expand Up @@ -922,17 +925,17 @@
"stack": false,
"steppedLine": false,
"targets": [{
"expr": "etcd_server_leader_changes_seen_total",
"expr": "changes(etcd_server_leader_changes_seen_total[1d])",
"intervalFactor": 2,
"legendFormat": "{{instance}} Leader Change Seen",
"legendFormat": "{{instance}} Total Leader Elections Per Day",
"metric": "etcd_server_leader_changes_seen_total",
"refId": "A",
"step": 2
}],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Rate Leader Elections",
"title": "Total Leader Elections Per Day",
"tooltip": {
"msResolution": false,
"shared": true,
Expand Down Expand Up @@ -1009,4 +1012,4 @@
"version": 215,
"links": [],
"gnetId": null
}
}

0 comments on commit c642470

Please sign in to comment.