Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dashboard: Add panels for ingest storage to Writes and Queries dashboards #7670

Merged
merged 12 commits into from
Mar 20, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
### Mixin

* [ENHANCEMENT] Alerts: allow configuring alerts range interval via `_config.base_alerts_range_interval_minutes`. #7591
* [ENHANCEMENT] Dashboards: Add panels for monitoring distributor and ingester when using ingest-storage. These panels are disabled by default, but can be enabled using the `show_ingest_storage_panels: true` config option. Similarly, existing panels used when distributors and ingesters use gRPC for forwarding requests can be disabled by setting `show_grpc_ingestion_panels: false`. #7670

### Jsonnet

Expand Down
6 changes: 6 additions & 0 deletions operations/mimir-mixin/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -675,5 +675,11 @@
// Disabled by default, because when -ingester.limit-inflight-requests-using-grpc-method-limiter and -distributor.limit-inflight-requests-using-grpc-method-limiter are
// not used (default), rejected requests are already counted as failures.
show_rejected_requests_on_writes_dashboard: false,

// Show panels that use queries for gRPC-based ingestion (distributor -> ingester)
show_grpc_ingestion_panels: true,

// Show panels that use queries for "ingest storage" ingestion (distributor -> Kafka, Kafka -> ingesters)
show_ingest_storage_panels: false,
},
}
259 changes: 257 additions & 2 deletions operations/mimir-mixin/dashboards/writes.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,39 @@ local filename = 'mimir-writes.json';
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor), $.queries.write_http_routes_regex], ''
)
)
// Distributor → Kafka write path: latency of the WriteSync operation.
// Only shown when the experimental "ingest storage" architecture panels are enabled.
.addPanelIf(
  $._config.show_ingest_storage_panels,
  $.timeseriesPanel('ingest storage: WriteSync latency') +
  $.panelDescription(
    'WriteSync latency',
    |||
      Latency of WriteSync operation used to store data into Kafka.
    |||
  ) +
  $.queryPanel(
    [
      // Wrap the summary quantiles in max_over_time() so the panel is robust to
      // scrape gaps, consistent with the other quantile-based panels on this
      // dashboard (strong consistency and last-produced-offset latency panels).
      'max(max_over_time(cortex_ingest_storage_writer_latency_seconds{%s,quantile="0.5"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.distributor)],
      'max(max_over_time(cortex_ingest_storage_writer_latency_seconds{%s,quantile="0.99"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.distributor)],
      'max(max_over_time(cortex_ingest_storage_writer_latency_seconds{%s,quantile="0.999"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.distributor)],
      'max(max_over_time(cortex_ingest_storage_writer_latency_seconds{%s,quantile="1.0"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.distributor)],
    ],
    [
      '50th percentile',
      '99th percentile',
      '99.9th percentile',
      '100th percentile',
    ],
  ) + {
    // Display values as seconds.
    fieldConfig+: {
      defaults+: { unit: 's' },
    },
  },
)
)
.addRowsIf(std.objectHasAll($._config.injectRows, 'postDistributor'), $._config.injectRows.postDistributor($))
.addRow(
$.row('Ingester')
.addRowIf(
$._config.show_grpc_ingestion_panels,
($.row('Ingester'))
.addPanel(
$.timeseriesPanel('Requests / sec') +
$.panelDescription(
Expand Down Expand Up @@ -206,6 +235,232 @@ local filename = 'mimir-writes.json';
)
)
)
// Ingester ← Kafka read path: fetching records from Kafka and processing them.
// Only shown when the experimental "ingest storage" architecture panels are enabled.
.addRowIf(
  $._config.show_ingest_storage_panels,
  ($.row('Ingester (ingest storage: fetching and processing records)'))
  .addPanel(
    $.timeseriesPanel('Responses / sec') +
    $.panelDescription(
      'Responses / sec',
      |||
        Rate of responses from Kafka brokers. Client can return multiple responses ("fetches") at once. Some of the responses may be failures.
      |||
    ) +
    $.queryPanel(
      [
        // Successful fetches = all fetches minus failed ones.
        |||
          sum (rate (cortex_ingest_storage_reader_fetches_total{%s}[$__rate_interval]))
          -
          sum (rate (cortex_ingest_storage_reader_fetch_errors_total{%s}[$__rate_interval]))
        ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
        'sum (rate (cortex_ingest_storage_reader_fetch_errors_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)],
      ],
      [
        'fetches',
        'failed',
      ],
    ) + $.aliasColors({ failed: '#FF0000' }) + $.stack,
  )
  .addPanel(
    // Replaces the previous "Records per fetch" panel, as agreed in review:
    // the rate of records / sec is more useful than the records-per-fetch average.
    $.timeseriesPanel('Kafka records / sec') +
    $.panelDescription(
      'Kafka records / sec',
      |||
        Rate of Kafka records (write requests) processed by ingesters. Some of the records may fail processing.
      |||
    ) +
    $.queryPanel(
      [
        // NOTE(review): assumes cortex_ingest_storage_reader_records_total and
        // cortex_ingest_storage_reader_records_failed_total metric names — confirm
        // against the ingest-storage reader metrics before merging.
        |||
          sum (rate (cortex_ingest_storage_reader_records_total{%s}[$__rate_interval]))
          -
          sum (rate (cortex_ingest_storage_reader_records_failed_total{%s}[$__rate_interval]))
        ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
        'sum (rate (cortex_ingest_storage_reader_records_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)],
      ],
      [
        'records',
        'failed',
      ],
    ) + $.aliasColors({ failed: '#FF0000' }) + $.stack,
  )
  .addPanel(
    $.timeseriesPanel('Processing Latency') +
    $.panelDescription(
      'Processing Latency',
      |||
        Time used to process a single record (write request). This time is spent by appending data to per-tenant TSDB.
      |||
    ) +
    $.queryPanel(
      [
        'max(cortex_ingest_storage_reader_processing_time_seconds{%s,quantile="0.99"})' % [$.jobMatcher($._config.job_names.ingester)],
        'max(cortex_ingest_storage_reader_processing_time_seconds{%s,quantile="0.999"})' % [$.jobMatcher($._config.job_names.ingester)],
        'max(cortex_ingest_storage_reader_processing_time_seconds{%s,quantile="0.5"})' % [$.jobMatcher($._config.job_names.ingester)],
        // Average = sum of observed durations / number of observations.
        |||
          sum(rate(cortex_ingest_storage_reader_processing_time_seconds_sum{%s}[$__rate_interval]))
          /
          sum(rate(cortex_ingest_storage_reader_processing_time_seconds_count{%s}[$__rate_interval]))
        ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
      ],
      [
        '99th percentile',
        '99.9th percentile',
        '50th percentile',
        'average',
      ],
    ) + {
      fieldConfig+: {
        defaults+: { unit: 's' },
      },
    },
  )
  .addPanel(
    $.timeseriesPanel('End-to-end latency') +
    $.panelDescription(
      'End-to-end latency',
      |||
        Time between writing request to Kafka by distributor and reading the record by ingester.
      |||
    ) +
    $.queryPanel(
      [
        'max(cortex_ingest_storage_reader_receive_delay_seconds{%s,quantile="0.99"})' % [$.jobMatcher($._config.job_names.ingester)],
        'max(cortex_ingest_storage_reader_receive_delay_seconds{%s,quantile="0.999"})' % [$.jobMatcher($._config.job_names.ingester)],
        'max(cortex_ingest_storage_reader_receive_delay_seconds{%s,quantile="0.5"})' % [$.jobMatcher($._config.job_names.ingester)],
      ],
      [
        '99th percentile',
        '99.9th percentile',
        '50th percentile',
      ],
    ) + {
      fieldConfig+: {
        defaults+: { unit: 's' },
      },
    },
  )
)
// Ingester row covering strong read-consistency enforcement (ingest storage).
.addRowIf(
  $._config.show_ingest_storage_panels,
  // Bind the ingester job matcher once instead of recomputing it per query.
  local ingesterMatcher = $.jobMatcher($._config.job_names.ingester);
  ($.row('Ingester (ingest storage: strong consistency)'))
  .addPanel(
    $.timeseriesPanel('Requests with strong consistency') +
    $.panelDescription(
      'Requests with strong consistency',
      |||
        Shows rate of requests with strong consistency, and rate of failed requests with strong consistency.
      |||
    ) +
    $.queryPanel(
      [
        // Successful requests = total requests minus failures.
        |||
          sum(rate(cortex_ingest_storage_strong_consistency_requests_total{%s}[$__rate_interval]))
          -
          sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval]))
        ||| % [ingesterMatcher, ingesterMatcher],
        |||
          sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval]))
        ||| % [ingesterMatcher],
      ],
      ['requests', 'failed'],
    )
    + { fieldConfig+: { defaults+: { unit: 'reqps' } } }
    + $.aliasColors({ failed: '#FF0000' })
    + $.stack,
  )
  .addPanel(
    $.timeseriesPanel('Strong consistency – wait latency') +
    $.panelDescription(
      'Strong consistency – Wait latency',
      |||
        How long does the request wait to guarantee strong consistency.
      |||
    ) +
    $.queryPanel(
      [
        'max(max_over_time(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s,quantile="0.99"}[$__rate_interval]))' % [ingesterMatcher],
        // Average wait = sum of observed durations / number of observations.
        |||
          sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds_sum{%s}[$__rate_interval]))
          /
          sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds_count{%s}[$__rate_interval]))
        ||| % [ingesterMatcher, ingesterMatcher],
      ],
      ['99th percentile', 'Average'],
    )
    + { fieldConfig+: { defaults+: { unit: 's' } } },
  )
)
// Ingester row covering lookups of the partition's last produced offset (ingest storage).
.addRowIf(
  $._config.show_ingest_storage_panels,
  // Bind the ingester job matcher once instead of recomputing it per query.
  local ingesterMatcher = $.jobMatcher($._config.job_names.ingester);
  ($.row('Ingester (ingest storage: last produced offset)'))
  .addPanel(
    $.timeseriesPanel('Requests to get last produced offset') +
    $.panelDescription(
      'Rate of requests to fetch last produced offset for partition',
      |||
        Shows rate of requests to fetch last produced offset for partition, and rate of failed requests.
      |||
    ) +
    $.queryPanel(
      [
        // Successful requests = total requests minus failures.
        |||
          sum(rate(cortex_ingest_storage_reader_last_produced_offset_requests_total{%s}[$__rate_interval]))
          -
          sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval]))
        ||| % [ingesterMatcher, ingesterMatcher],
        |||
          sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval]))
        ||| % [ingesterMatcher],
      ],
      ['requests', 'failed'],
    )
    + { fieldConfig+: { defaults+: { unit: 'reqps' } } }
    + $.aliasColors({ failed: '#FF0000' })
    + $.stack,
  )
  .addPanel(
    $.timeseriesPanel('Last produced offset – latency') +
    $.panelDescription(
      'Latency',
      |||
        How long does it take to fetch "last produced offset" of partition.
      |||
    ) +
    $.queryPanel(
      [
        'max(max_over_time(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s,quantile="0.99"}[$__rate_interval]))' % [ingesterMatcher],
        // Average latency = sum of observed durations / number of observations.
        |||
          sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds_sum{%s}[$__rate_interval]))
          /
          sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds_count{%s}[$__rate_interval]))
        ||| % [ingesterMatcher, ingesterMatcher],
      ],
      ['99th percentile', 'Average'],
    )
    + { fieldConfig+: { defaults+: { unit: 's' } } },
  )
)
.addRowIf(
$._config.gateway_enabled && $._config.autoscaling.gateway.enabled,
$.cpuAndMemoryBasedAutoScalingRow('Gateway'),
Expand Down
Loading