Skip to content

Commit

Permalink
Dashboards: Add panels for ingest storage (#7670)
Browse files Browse the repository at this point in the history
* Add ingest storage panels to the Writes and Queries dashboards.

Signed-off-by: Peter Štibraný <[email protected]>
  • Loading branch information
pstibrany authored Mar 20, 2024
1 parent 13a383b commit 48a1a24
Show file tree
Hide file tree
Showing 6 changed files with 329 additions and 26 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
### Mixin

* [ENHANCEMENT] Alerts: allow configuring alerts range interval via `_config.base_alerts_range_interval_minutes`. #7591
* [ENHANCEMENT] Dashboards: Add panels for monitoring distributor and ingester when using ingest-storage. These panels are disabled by default, but can be enabled using the `show_ingest_storage_panels: true` config option. Similarly, the existing panels used when distributors and ingesters use gRPC for forwarding requests can be disabled by setting `show_grpc_ingestion_panels: false`. #7670

### Jsonnet

Expand Down
8 changes: 8 additions & 0 deletions operations/mimir-mixin/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,8 @@
'debug_pprof',
],

ingester_read_path_routes_regex: '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata',

// The default datasource used for dashboards.
dashboard_datasource: 'default',
datasource_regex: '',
Expand All @@ -675,5 +677,11 @@
// Disabled by default, because when -ingester.limit-inflight-requests-using-grpc-method-limiter and -distributor.limit-inflight-requests-using-grpc-method-limiter are
// not used (default), rejected requests are already counted as failures.
show_rejected_requests_on_writes_dashboard: false,

// Show panels that use queries for gRPC-based ingestion (distributor -> ingester)
show_grpc_ingestion_panels: true,

// Show panels that use queries for "ingest storage" ingestion (distributor -> Kafka, Kafka -> ingesters)
show_ingest_storage_panels: false,
},
}
49 changes: 26 additions & 23 deletions operations/mimir-mixin/dashboards/dashboard-utils.libsonnet
Original file line number Diff line number Diff line change
@@ -1,22 +1,25 @@
local utils = import 'mixin-utils/utils.libsonnet';

(import 'grafana-builder/grafana.libsonnet') {
local resourceRequestColor = '#FFC000',
local resourceLimitColor = '#E02F44',
local successColor = '#7EB26D',
local warningColor = '#EAB839',
local errorColor = '#E24D42',
_colors:: {
resourceRequest: '#FFC000',
resourceLimit: '#E02F44',
success: '#7EB26D',
clientError: '#EF843C',
warning: '#EAB839',
failed: '#E24D42', // "error" is reserved word in Jsonnet.
},

// Colors palette picked from Grafana UI, excluding red-ish colors which we want to keep reserved for errors / failures.
local nonErrorColorsPalette = ['#429D48', '#F1C731', '#2A66CF', '#9E44C1', '#FFAB57', '#C79424', '#84D586', '#A1C4FC', '#C788DE'],

local resourceRequestStyle = $.overrideFieldByName('request', [
$.overrideProperty('color', { mode: 'fixed', fixedColor: resourceRequestColor }),
$.overrideProperty('color', { mode: 'fixed', fixedColor: $._colors.resourceRequest }),
$.overrideProperty('custom.fillOpacity', 0),
$.overrideProperty('custom.lineStyle', { fill: 'dash' }),
]),
local resourceLimitStyle = $.overrideFieldByName('limit', [
$.overrideProperty('color', { mode: 'fixed', fixedColor: resourceLimitColor }),
$.overrideProperty('color', { mode: 'fixed', fixedColor: $._colors.resourceLimit }),
$.overrideProperty('custom.fillOpacity', 0),
$.overrideProperty('custom.lineStyle', { fill: 'dash' }),
]),
Expand Down Expand Up @@ -196,14 +199,14 @@ local utils = import 'mixin-utils/utils.libsonnet';
qpsPanel(selector, statusLabelName='status_code')::
super.qpsPanel(selector, statusLabelName) +
$.aliasColors({
'1xx': warningColor,
'2xx': successColor,
'1xx': $._colors.warning,
'2xx': $._colors.success,
'3xx': '#6ED0E0',
'4xx': '#EF843C',
'5xx': errorColor,
OK: successColor,
success: successColor,
'error': errorColor,
'5xx': $._colors.failed,
OK: $._colors.success,
success: $._colors.success,
'error': $._colors.failed,
cancel: '#A9A9A9',
}) + {
fieldConfig+: {
Expand Down Expand Up @@ -260,15 +263,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
// Set the failure color only if there's just 1 legend and it doesn't contain any placeholder.
$.aliasColors(
if (std.type(legends) == 'string' && std.length(std.findSubstr('{', legends[0])) == 0) then {
[legends]: errorColor,
[legends]: $._colors.failed,
} else {}
),

successFailurePanel(successMetric, failureMetric)::
$.queryPanel([successMetric, failureMetric], ['successful', 'failed']) +
$.aliasColors({
successful: successColor,
failed: errorColor,
successful: $._colors.success,
failed: $._colors.failed,
}),

// successFailureCustomPanel is like successFailurePanel() but allows to customize the legends
Expand All @@ -277,8 +280,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
successFailureCustomPanel(queries, legends)::
$.queryPanel(queries, legends) +
$.aliasColors({
[legends[0]]: successColor,
[legends[1]]: errorColor,
[legends[0]]: $._colors.success,
[legends[1]]: $._colors.failed,
}),

// Displays started, completed and failed rate.
Expand All @@ -288,8 +291,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.stack +
$.aliasColors({
started: '#34CCEB',
completed: successColor,
failed: errorColor,
completed: $._colors.success,
failed: $._colors.failed,
}),

resourceUtilizationAndLimitLegend(resourceName)::
Expand Down Expand Up @@ -993,9 +996,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
thresholds: {
mode: 'absolute',
steps: [
{ color: successColor, value: null },
{ color: warningColor, value: 0.01 }, // 1%
{ color: errorColor, value: 0.05 }, // 5%
{ color: $._colors.success, value: null },
{ color: $._colors.warning, value: 0.01 }, // 1%
{ color: $._colors.failed, value: 0.05 }, // 5%
],
},
},
Expand Down
151 changes: 151 additions & 0 deletions operations/mimir-mixin/dashboards/queries.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,157 @@ local filename = 'mimir-queries.json';
{ fieldConfig+: { defaults+: { unit: 'short' } } },
)
)
.addRowIf(
  // Row condition: the ingest-storage panels flag from the mixin config.
  $._config.show_ingest_storage_panels,

  // Values shared by every panel in this row.
  local ingesterMatcher = $.jobMatcher($._config.job_names.ingester);
  local readRoutesRegex = $._config.ingester_read_path_routes_regex;

  // Strong-consistency requests that did not fail: total rate minus failure rate.
  local succeededRateQuery = |||
    sum(rate(cortex_ingest_storage_strong_consistency_requests_total{%s}[$__rate_interval]))
    -
    sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval]))
  ||| % [ingesterMatcher, ingesterMatcher];

  // Strong-consistency requests that failed.
  local failedRateQuery = |||
    sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval]))
  ||| % [ingesterMatcher];

  // Same two series, each divided by the rate of all ingester read-path requests.
  local succeededRatioQuery = |||
    (
    sum(rate(cortex_ingest_storage_strong_consistency_requests_total{%s}[$__rate_interval]))
    -
    sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval]))
    )
    /
    sum(rate(cortex_request_duration_seconds_count{%s,route=~"%s"}[$__rate_interval]))
  ||| % [ingesterMatcher, ingesterMatcher, ingesterMatcher, readRoutesRegex];

  local failedRatioQuery = |||
    sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval]))
    /
    sum(rate(cortex_request_duration_seconds_count{%s,route=~"%s"}[$__rate_interval]))
  ||| % [ingesterMatcher, ingesterMatcher, readRoutesRegex];

  // One query per quantile label of the wait-duration metric, highest value across series.
  local waitLatencyQueries = [
    'max(max_over_time(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s,quantile="0.5"}[$__rate_interval]))' % [ingesterMatcher],
    'max(max_over_time(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s,quantile="0.99"}[$__rate_interval]))' % [ingesterMatcher],
    'max(max_over_time(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s,quantile="0.999"}[$__rate_interval]))' % [ingesterMatcher],
    'max(max_over_time(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s,quantile="1.0"}[$__rate_interval]))' % [ingesterMatcher],
  ];
  local waitLatencyLegends = ['50th percentile', '99th percentile', '99.9th percentile', '100th percentile'];

  // Legends and colours used by both successful/failed panels.
  local outcomeLegends = ['successful', 'failed'];
  local outcomeColors = $.aliasColors({ successful: $._colors.success, failed: $._colors.failed });

  ($.row('Ingester (ingest storage: strong consistency)'))
  .addPanel(
    $.timeseriesPanel('Requests with strong read consistency / sec') +
    $.panelDescription(
      'Requests with strong read consistency / sec',
      |||
        Shows rate of requests with strong read consistency, and rate of failed requests with strong read consistency.
      |||
    ) +
    $.queryPanel([succeededRateQuery, failedRateQuery], outcomeLegends) +
    { fieldConfig+: { defaults+: { unit: 'reqps' } } } +
    outcomeColors +
    $.stack,
  )
  .addPanel(
    $.timeseriesPanel('Requests with strong read consistency ratio') +
    $.panelDescription(
      'Requests with strong read consistency ratio',
      |||
        Ratio between requests with strong read consistency and all read requests on ingesters.
      |||
    ) +
    $.queryPanel([succeededRatioQuery, failedRatioQuery], outcomeLegends) +
    outcomeColors +
    { fieldConfig+: { defaults+: { unit: 'percentunit', min: 0, max: 1 } } } +
    $.stack
  )
  .addPanel(
    $.timeseriesPanel('Strong read consistency queries — wait latency') +
    $.panelDescription(
      'Strong read consistency queries — wait latency',
      |||
        How long does the request wait to guarantee strong read consistency.
      |||
    ) +
    $.queryPanel(waitLatencyQueries, waitLatencyLegends) +
    { fieldConfig+: { defaults+: { unit: 's' } } },
  )
)
.addRowIf(
  // Row condition: the ingest-storage panels flag from the mixin config.
  $._config.show_ingest_storage_panels,

  // Job matcher reused by every query in this row.
  local ingesterMatcher = $.jobMatcher($._config.job_names.ingester);

  // "Last produced offset" requests that did not fail: total rate minus failure rate.
  local succeededRateQuery = |||
    sum(rate(cortex_ingest_storage_reader_last_produced_offset_requests_total{%s}[$__rate_interval]))
    -
    sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval]))
  ||| % [ingesterMatcher, ingesterMatcher];

  // "Last produced offset" requests that failed.
  local failedRateQuery = |||
    sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval]))
  ||| % [ingesterMatcher];

  // One query per quantile label of the request-duration metric, highest value across series.
  local latencyQueries = [
    'max(max_over_time(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s,quantile="0.5"}[$__rate_interval]))' % [ingesterMatcher],
    'max(max_over_time(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s,quantile="0.99"}[$__rate_interval]))' % [ingesterMatcher],
    'max(max_over_time(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s,quantile="0.999"}[$__rate_interval]))' % [ingesterMatcher],
    'max(max_over_time(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s,quantile="1.0"}[$__rate_interval]))' % [ingesterMatcher],
  ];
  local latencyLegends = ['50th percentile', '99th percentile', '99.9th percentile', '100th percentile'];

  ($.row('Ingester (ingest storage: last produced offset)'))
  .addPanel(
    $.timeseriesPanel('Last produced offset requests / sec') +
    $.panelDescription(
      'Rate of requests to fetch last produced offset for partition',
      |||
        Shows rate of requests to fetch last produced offset for partition, and rate of failed requests.
      |||
    ) +
    $.queryPanel([succeededRateQuery, failedRateQuery], ['successful', 'failed']) +
    { fieldConfig+: { defaults+: { unit: 'reqps' } } } +
    $.aliasColors({ successful: $._colors.success, failed: $._colors.failed }) +
    $.stack,
  )
  .addPanel(
    $.timeseriesPanel('Last produced offset latency') +
    $.panelDescription(
      'Latency',
      |||
        How long does it take to fetch "last produced offset" of partition.
      |||
    ) +
    $.queryPanel(latencyQueries, latencyLegends) +
    { fieldConfig+: { defaults+: { unit: 's' } } },
  )
)
.addRow(
$.row('Querier')
.addPanel(
Expand Down
2 changes: 1 addition & 1 deletion operations/mimir-mixin/dashboards/reads.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ local filename = 'mimir-reads.json';
$.row('Ingester')
.addPanel(
$.timeseriesPanel('Requests / sec') +
$.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester))
$.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"%s"}' % [$.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex])
)
.addPanel(
$.timeseriesPanel('Latency') +
Expand Down
Loading

0 comments on commit 48a1a24

Please sign in to comment.