Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dashboard: Add panels for ingest storage to Writes and Queries dashboards #7670

Merged
merged 12 commits into from
Mar 20, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
### Mixin

* [ENHANCEMENT] Alerts: allow configuring alerts range interval via `_config.base_alerts_range_interval_minutes`. #7591
* [ENHANCEMENT] Dashboards: Add panels for monitoring distributor and ingester when using ingest-storage. These panels are disabled by default, but can be enabled using the `show_ingest_storage_panels: true` config option. Similarly, existing panels used when distributors and ingesters use gRPC for forwarding requests can be disabled by setting `show_grpc_ingestion_panels: false`. #7670

### Jsonnet

Expand Down
8 changes: 8 additions & 0 deletions operations/mimir-mixin/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,8 @@
'debug_pprof',
],

ingester_read_path_routes_regex: '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata',
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know this is what we use in the Reads dashboard, but I think it's missing several things (e.g. ActiveSeries, UserStats, ...).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will send a separate PR to fix this, since it's unrelated to this PR.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in #7676.


// The default datasource used for dashboards.
dashboard_datasource: 'default',
datasource_regex: '',
Expand All @@ -675,5 +677,11 @@
// Disabled by default, because when -ingester.limit-inflight-requests-using-grpc-method-limiter and -distributor.limit-inflight-requests-using-grpc-method-limiter are
// not used (default), then rejected requests are already counted as failures.
show_rejected_requests_on_writes_dashboard: false,

// Show panels that use queries for gRPC-based ingestion (distributor -> ingester)
show_grpc_ingestion_panels: true,

// Show panels that use queries for "ingest storage" ingestion (distributor -> Kafka, Kafka -> ingesters)
show_ingest_storage_panels: false,
},
}
49 changes: 26 additions & 23 deletions operations/mimir-mixin/dashboards/dashboard-utils.libsonnet
Original file line number Diff line number Diff line change
@@ -1,22 +1,25 @@
local utils = import 'mixin-utils/utils.libsonnet';

(import 'grafana-builder/grafana.libsonnet') {
local resourceRequestColor = '#FFC000',
local resourceLimitColor = '#E02F44',
local successColor = '#7EB26D',
local warningColor = '#EAB839',
local errorColor = '#E24D42',
_colors:: {
resourceRequest: '#FFC000',
resourceLimit: '#E02F44',
success: '#7EB26D',
clientError: '#EF843C',
warning: '#EAB839',
failed: '#E24D42', // "error" is reserved word in Jsonnet.
},

// Colors palette picked from Grafana UI, excluding red-ish colors which we want to keep reserved for errors / failures.
local nonErrorColorsPalette = ['#429D48', '#F1C731', '#2A66CF', '#9E44C1', '#FFAB57', '#C79424', '#84D586', '#A1C4FC', '#C788DE'],

local resourceRequestStyle = $.overrideFieldByName('request', [
$.overrideProperty('color', { mode: 'fixed', fixedColor: resourceRequestColor }),
$.overrideProperty('color', { mode: 'fixed', fixedColor: $._colors.resourceRequest }),
$.overrideProperty('custom.fillOpacity', 0),
$.overrideProperty('custom.lineStyle', { fill: 'dash' }),
]),
local resourceLimitStyle = $.overrideFieldByName('limit', [
$.overrideProperty('color', { mode: 'fixed', fixedColor: resourceLimitColor }),
$.overrideProperty('color', { mode: 'fixed', fixedColor: $._colors.resourceLimit }),
$.overrideProperty('custom.fillOpacity', 0),
$.overrideProperty('custom.lineStyle', { fill: 'dash' }),
]),
Expand Down Expand Up @@ -196,14 +199,14 @@ local utils = import 'mixin-utils/utils.libsonnet';
qpsPanel(selector, statusLabelName='status_code')::
super.qpsPanel(selector, statusLabelName) +
$.aliasColors({
'1xx': warningColor,
'2xx': successColor,
'1xx': $._colors.warning,
'2xx': $._colors.success,
'3xx': '#6ED0E0',
'4xx': '#EF843C',
'5xx': errorColor,
OK: successColor,
success: successColor,
'error': errorColor,
'5xx': $._colors.failed,
OK: $._colors.success,
success: $._colors.success,
'error': $._colors.failed,
cancel: '#A9A9A9',
}) + {
fieldConfig+: {
Expand Down Expand Up @@ -260,15 +263,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
// Set the failure color only if there's just 1 legend and it doesn't contain any placeholder.
$.aliasColors(
if (std.type(legends) == 'string' && std.length(std.findSubstr('{', legends[0])) == 0) then {
[legends]: errorColor,
[legends]: $._colors.failed,
} else {}
),

successFailurePanel(successMetric, failureMetric)::
$.queryPanel([successMetric, failureMetric], ['successful', 'failed']) +
$.aliasColors({
successful: successColor,
failed: errorColor,
successful: $._colors.success,
failed: $._colors.failed,
}),

// successFailureCustomPanel is like successFailurePanel() but allows to customize the legends
Expand All @@ -277,8 +280,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
successFailureCustomPanel(queries, legends)::
$.queryPanel(queries, legends) +
$.aliasColors({
[legends[0]]: successColor,
[legends[1]]: errorColor,
[legends[0]]: $._colors.success,
[legends[1]]: $._colors.failed,
}),

// Displays started, completed and failed rate.
Expand All @@ -288,8 +291,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.stack +
$.aliasColors({
started: '#34CCEB',
completed: successColor,
failed: errorColor,
completed: $._colors.success,
failed: $._colors.failed,
}),

resourceUtilizationAndLimitLegend(resourceName)::
Expand Down Expand Up @@ -993,9 +996,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
thresholds: {
mode: 'absolute',
steps: [
{ color: successColor, value: null },
{ color: warningColor, value: 0.01 }, // 1%
{ color: errorColor, value: 0.05 }, // 5%
{ color: $._colors.success, value: null },
{ color: $._colors.warning, value: 0.01 }, // 1%
{ color: $._colors.failed, value: 0.05 }, // 5%
],
},
},
Expand Down
151 changes: 151 additions & 0 deletions operations/mimir-mixin/dashboards/queries.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,157 @@ local filename = 'mimir-queries.json';
{ fieldConfig+: { defaults+: { unit: 'short' } } },
)
)
.addRowIf(
$._config.show_ingest_storage_panels,
($.row('Ingester (ingest storage: strong consistency)'))
.addPanel(
$.timeseriesPanel('Requests with strong consistency') +
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nit] WDYT about being more specific about the "strong consistency"?

Suggested change
$.timeseriesPanel('Requests with strong consistency') +
$.timeseriesPanel('Requests with strong read consistency / sec') +

If you agree, same comment applies in other places.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good, fixing.

$.panelDescription(
'Requests with strong consistency',
|||
Shows rate of requests with strong consistency, and rate of failed requests with strong consistency.
|||
) +
$.queryPanel(
[
|||
sum(rate(cortex_ingest_storage_strong_consistency_requests_total{%s}[$__rate_interval]))
-
sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval]))
||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
|||
sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval]))
||| % [$.jobMatcher($._config.job_names.ingester)],
],
[
'requests',
'failed',
],
) + {
fieldConfig+: {
defaults+: { unit: 'reqps' },
},
} + $.aliasColors({ failed: $._colors.failed }) + $.stack,
)
.addPanel(
$.timeseriesPanel('Requests with strong consistency ratio') +
$.panelDescription(
'Requests with strong consistency ratio',
|||
Ratio between requests with strong consistency and all read requests on ingesters.
|||
) +
$.queryPanel(
[
|||
(
sum(rate(cortex_ingest_storage_strong_consistency_requests_total{%s}[$__rate_interval]))
-
sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval]))
)
/
sum(rate(cortex_request_duration_seconds_count{%s,route=~"%s"}[$__rate_interval]))
||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex],
|||
sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval]))
/
sum(rate(cortex_request_duration_seconds_count{%s,route=~"%s"}[$__rate_interval]))
||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex],
],
['successful', 'failed'],
)
+ $.aliasColors({ failed: $._colors.failed, successful: $._colors.success })
+ { fieldConfig+: { defaults+: { unit: 'percentunit', min: 0, max: 1 } } }
+ $.stack
)
.addPanel(
$.timeseriesPanel('Strong consistency queries — wait latency') +
$.panelDescription(
'Strong consistency queries — wait latency',
|||
How long does the request wait to guarantee strong consistency.
|||
) +
$.queryPanel(
[
'max(max_over_time(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s,quantile="0.5"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)],
'max(max_over_time(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s,quantile="0.99"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)],
'max(max_over_time(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s,quantile="0.999"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)],
'max(max_over_time(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s,quantile="1.0"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)],
],
[
'50th percentile',
'99th percentile',
'99.9th percentile',
'100th percentile',
],
) + {
fieldConfig+: {
defaults+: { unit: 's' },
},
},
)
)
// Row: "Ingester (ingest storage: last produced offset)".
// The row is passed to addRowIf together with $._config.show_ingest_storage_panels,
// so it is expected to be rendered only when that config flag is enabled.
// It holds two panels: the request rate and the request latency for fetching the
// last produced offset of a partition, both scoped to the ingester jobs.
.addRowIf(
$._config.show_ingest_storage_panels,
($.row('Ingester (ingest storage: last produced offset)'))
.addPanel(
// Panel 1: rate of "last produced offset" requests, split into non-failed and failed.
$.timeseriesPanel('Last produced offset requests / sec') +
$.panelDescription(
'Rate of requests to fetch last produced offset for partition',
|||
Shows rate of requests to fetch last produced offset for partition, and rate of failed requests.
|||
) +
$.queryPanel(
// Query 1 is the total request rate minus the failure rate, i.e. the rate of
// requests that did not fail. Query 2 is the failure rate on its own.
// Both use the ingester job matcher.
[
|||
sum(rate(cortex_ingest_storage_reader_last_produced_offset_requests_total{%s}[$__rate_interval]))
-
sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval]))
||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
|||
sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval]))
||| % [$.jobMatcher($._config.job_names.ingester)],
],
// Legends, in query order. Note that 'requests' labels the non-failed rate
// (total minus failures), not the raw total.
[
'requests',
'failed',
],
) + {
fieldConfig+: {
// Values are displayed as requests per second.
defaults+: { unit: 'reqps' },
},
// The 'failed' series gets the shared failure color, and the panel is stacked;
// presumably stacking makes the two series add up to the total request rate.
} + $.aliasColors({ failed: $._colors.failed }) + $.stack,
)
.addPanel(
// Panel 2: latency of "last produced offset" requests at several quantiles.
$.timeseriesPanel('Last produced offset latency') +
$.panelDescription(
'Latency',
|||
How long does it take to fetch "last produced offset" of partition.
|||
) +
$.queryPanel(
// One query per value of the metric's `quantile` label (0.5, 0.99, 0.999, 1.0).
// Each takes max_over_time over $__rate_interval and then the max across all
// matching ingester series, so the panel shows the highest observed value.
[
'max(max_over_time(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s,quantile="0.5"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)],
'max(max_over_time(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s,quantile="0.99"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)],
'max(max_over_time(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s,quantile="0.999"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)],
'max(max_over_time(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s,quantile="1.0"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)],
],
// Legends, in the same order as the queries above.
[
'50th percentile',
'99th percentile',
'99.9th percentile',
'100th percentile',
],
) + {
fieldConfig+: {
// Values are displayed in seconds.
defaults+: { unit: 's' },
},
},
)
)
.addRow(
$.row('Querier')
.addPanel(
Expand Down
2 changes: 1 addition & 1 deletion operations/mimir-mixin/dashboards/reads.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ local filename = 'mimir-reads.json';
$.row('Ingester')
.addPanel(
$.timeseriesPanel('Requests / sec') +
$.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester))
$.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"%s"}' % [$.jobMatcher($._config.job_names.ingester), $._config.ingester_read_path_routes_regex])
)
.addPanel(
$.timeseriesPanel('Latency') +
Expand Down
Loading
Loading