Skip to content
This repository has been archived by the owner on Apr 2, 2024. It is now read-only.

Commit

Permalink
Collecting job stats for each maintenance job type and signal.
Browse files Browse the repository at this point in the history
- A test has been moved to the extension repository and
  another test was adjusted to not over-specify the number of jobs in a
  metrics test.
- Two new families of metrics were added to the dashboard
- Histograms are now supported in database-reported metrics
  • Loading branch information
sumerman committed Nov 9, 2022
1 parent d0bd0b1 commit c21884e
Show file tree
Hide file tree
Showing 8 changed files with 472 additions and 95 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ We use the following categories for changes:

### Added
- Alerts from the promscale monitoring mixin are now also grouped by the namespace label [#1714]
- Added a new family of metrics tracking database maintenance job durations and failures [#1745]

### Changed
- Reduced the verbosity of the logs emitted by the vacuum engine [#1715]
Expand Down
328 changes: 326 additions & 2 deletions docs/mixin/dashboards/promscale.json
Original file line number Diff line number Diff line change
Expand Up @@ -2455,6 +2455,330 @@
"title": "Longest running maintenance query",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": true,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 41
},
"id": 54,
"interval": "2m",
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"editorMode": "code",
"exemplar": false,
"expr": "histogram_quantile(0.99, max(rate(promscale_sql_database_worker_maintenance_job_metrics_compression_last_duration_seconds_bucket{namespace=~\"$namespace\"}[$__rate_interval])) by (le, job))",
"interval": "",
"legendFormat": "metrics-compression",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"editorMode": "code",
"exemplar": false,
"expr": "histogram_quantile(0.99, max(rate(promscale_sql_database_worker_maintenance_job_metrics_retention_last_duration_seconds_bucket{namespace=~\"$namespace\"}[$__rate_interval])) by (le, job))",
"hide": false,
"interval": "",
"legendFormat": "metrics-retention",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"editorMode": "code",
"exemplar": false,
"expr": "histogram_quantile(0.99, max(rate(promscale_sql_database_worker_maintenance_job_traces_retention_last_duration_seconds_bucket{namespace=~\"$namespace\"}[$__rate_interval])) by (le, job))",
"hide": false,
"interval": "",
"legendFormat": "traces-retention",
"range": true,
"refId": "C"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"editorMode": "code",
"exemplar": false,
"expr": "histogram_quantile(0.99, max(rate(promscale_sql_database_worker_maintenance_job_traces_compression_last_duration_seconds_bucket{namespace=~\"$namespace\"}[$__rate_interval])) by (le, job))",
"hide": false,
"interval": "",
"legendFormat": "traces-compression",
"range": true,
"refId": "D"
}
],
"title": "Duration of recent jobs",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": true,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "opm"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 41
},
"id": 57,
"interval": "2m",
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"editorMode": "code",
"exemplar": false,
"expr": "max by (job, instance)(rate(promscale_sql_database_worker_maintenance_job_metrics_compression_total_runs_count{namespace=~\"$namespace\"}[$__rate_interval])) * 60",
"interval": "",
"legendFormat": "metrics-compression-total",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"editorMode": "code",
"exemplar": false,
"expr": "max by (job, instance)(rate(promscale_sql_database_worker_maintenance_job_metrics_retention_total_runs_count{namespace=~\"$namespace\"}[$__rate_interval])) * 60",
"hide": false,
"interval": "",
"legendFormat": "metrics-retention-total",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"editorMode": "code",
"exemplar": false,
"expr": "max by (job, instance)(rate(promscale_sql_database_worker_maintenance_job_traces_retention_total_runs_count{namespace=~\"$namespace\"}[$__rate_interval])) * 60",
"hide": false,
"interval": "",
"legendFormat": "traces-retention-total",
"range": true,
"refId": "C"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"editorMode": "code",
"exemplar": false,
"expr": "max by (job, instance)(rate(promscale_sql_database_worker_maintenance_job_metrics_compression_failures_count{namespace=~\"$namespace\"}[$__rate_interval])) * 60",
"hide": false,
"interval": "",
"legendFormat": "metrics-compression-failures",
"range": true,
"refId": "D"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"editorMode": "code",
"exemplar": false,
"expr": "max by (job, instance)(rate(promscale_sql_database_worker_maintenance_job_metrics_retention_failures_count{namespace=~\"$namespace\"}[$__rate_interval])) * 60",
"hide": false,
"interval": "",
"legendFormat": "metrics-retention-failures",
"range": true,
"refId": "E"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"editorMode": "code",
"exemplar": false,
"expr": "max by (job, instance)(rate(promscale_sql_database_worker_maintenance_job_traces_retention_failures_count{namespace=~\"$namespace\"}[$__rate_interval])) * 60",
"hide": false,
"interval": "",
"legendFormat": "traces-retention-failures",
"range": true,
"refId": "F"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"editorMode": "code",
"exemplar": false,
"expr": "max by (job, instance)(rate(promscale_sql_database_worker_maintenance_job_traces_compression_failures_count{namespace=~\"$namespace\"}[$__rate_interval])) * 60",
"hide": false,
"interval": "",
"legendFormat": "traces-compression-failures",
"range": true,
"refId": "G"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"editorMode": "code",
"exemplar": false,
"expr": "max by (job, instance)(rate(promscale_sql_database_worker_maintenance_job_traces_compression_total_runs_count{namespace=~\"$namespace\"}[$__rate_interval])) * 60",
"hide": false,
"interval": "",
"legendFormat": "traces-compression-total",
"range": true,
"refId": "H"
}
],
"title": "Completion and failure rates",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
Expand Down Expand Up @@ -2514,7 +2838,7 @@
"h": 9,
"w": 12,
"x": 0,
"y": 49
"y": 42
},
"id": 50,
"interval": "2m",
Expand Down Expand Up @@ -2691,7 +3015,7 @@
"h": 9,
"w": 12,
"x": 12,
"y": 49
"y": 42
},
"id": 52,
"interval": "2m",
Expand Down
2 changes: 2 additions & 0 deletions pkg/pgmodel/metrics/database/database.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,8 @@ func updateMetric(m prometheus.Collector, value int64) {
n.Set(float64(value))
case prometheus.Counter:
n.Add(float64(value))
case prometheus.Histogram:
n.Observe(float64(value))
default:
panic(fmt.Sprintf("metric %s is of type %T", m, m))
}
Expand Down
Loading

0 comments on commit c21884e

Please sign in to comment.