Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dashboard: Add panels for ingest storage to Writes and Queries dashboards #7670

Merged
merged 12 commits into from
Mar 20, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
### Mixin

* [ENHANCEMENT] Alerts: allow configuring alerts range interval via `_config.base_alerts_range_interval_minutes`. #7591
* [ENHANCEMENT] Dashboards: Add panels for monitoring distributor and ingester when using ingest-storage. These panels are disabled by default, but can be enabled using the `show_ingest_storage_panels: true` config option. Similarly, existing panels used when distributors and ingesters use gRPC for forwarding requests can be disabled by setting `show_grpc_ingestion_panels: false`. #7670

### Jsonnet

Expand Down
6 changes: 6 additions & 0 deletions operations/mimir-mixin/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -675,5 +675,11 @@
// Disabled by default, because when -ingester.limit-inflight-requests-using-grpc-method-limiter and -distributor.limit-inflight-requests-using-grpc-method-limiter are
// not used (default), rejected requests are already counted as failures.
show_rejected_requests_on_writes_dashboard: false,

// Show panels that use queries for gRPC-based ingestion (distributor -> ingester)
show_grpc_ingestion_panels: true,

// Show panels that use queries for "ingest storage" ingestion (distributor -> Kafka, Kafka -> ingesters)
show_ingest_storage_panels: false,
},
}
259 changes: 257 additions & 2 deletions operations/mimir-mixin/dashboards/writes.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,39 @@ local filename = 'mimir-writes.json';
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor), $.queries.write_http_routes_regex], ''
)
)
// Distributor → Kafka write path: latency of the WriteSync operation.
// Only shown when the experimental "ingest storage" architecture panels are enabled.
.addPanelIf(
  $._config.show_ingest_storage_panels,
  $.timeseriesPanel('ingest storage: WriteSync latency') +
  $.panelDescription(
    'WriteSync latency',
    |||
      Latency of WriteSync operation used to store data into Kafka.
    |||
  ) +
  $.queryPanel(
    [
      // Wrap the summary quantiles in max_over_time() so the panel is robust to
      // scrape gaps, consistent with the other quantile-based panels on this
      // dashboard (strong consistency and last-produced-offset latency panels).
      'max(max_over_time(cortex_ingest_storage_writer_latency_seconds{%s,quantile="0.5"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.distributor)],
      'max(max_over_time(cortex_ingest_storage_writer_latency_seconds{%s,quantile="0.99"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.distributor)],
      'max(max_over_time(cortex_ingest_storage_writer_latency_seconds{%s,quantile="0.999"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.distributor)],
      'max(max_over_time(cortex_ingest_storage_writer_latency_seconds{%s,quantile="1.0"}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.distributor)],
    ],
    [
      '50th percentile',
      '99th percentile',
      '99.9th percentile',
      '100th percentile',
    ],
  ) + {
    // Display values as seconds.
    fieldConfig+: {
      defaults+: { unit: 's' },
    },
  },
)
)
.addRowsIf(std.objectHasAll($._config.injectRows, 'postDistributor'), $._config.injectRows.postDistributor($))
.addRow(
$.row('Ingester')
.addRowIf(
$._config.show_grpc_ingestion_panels,
($.row('Ingester'))
.addPanel(
$.timeseriesPanel('Requests / sec') +
$.panelDescription(
Expand Down Expand Up @@ -206,6 +235,232 @@ local filename = 'mimir-writes.json';
)
)
)
// Ingester ← Kafka read path: fetching records from Kafka and processing them.
// Only shown when the experimental "ingest storage" architecture panels are enabled.
.addRowIf(
  $._config.show_ingest_storage_panels,
  ($.row('Ingester (ingest storage: fetching and processing records)'))
  .addPanel(
    $.timeseriesPanel('Responses / sec') +
    $.panelDescription(
      'Responses / sec',
      |||
        Rate of responses from Kafka brokers. Client can return multiple responses ("fetches") at once. Some of the responses may be failures.
      |||
    ) +
    $.queryPanel(
      [
        // Successful fetches = all fetches minus failed ones.
        |||
          sum (rate (cortex_ingest_storage_reader_fetches_total{%s}[$__rate_interval]))
          -
          sum (rate (cortex_ingest_storage_reader_fetch_errors_total{%s}[$__rate_interval]))
        ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
        'sum (rate (cortex_ingest_storage_reader_fetch_errors_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)],
      ],
      [
        'fetches',
        'failed',
      ],
    ) + $.aliasColors({ failed: '#FF0000' }) + $.stack,
  )
  .addPanel(
    // Replaces the previous "Records per fetch" panel, as agreed in review:
    // the rate of records / sec is more useful than the records-per-fetch average.
    $.timeseriesPanel('Kafka records / sec') +
    $.panelDescription(
      'Kafka records / sec',
      |||
        Rate of Kafka records (write requests) processed by ingesters. Some of the records may fail processing.
      |||
    ) +
    $.queryPanel(
      [
        // NOTE(review): assumes cortex_ingest_storage_reader_records_total and
        // cortex_ingest_storage_reader_records_failed_total metric names — confirm
        // against the ingest-storage reader metrics before merging.
        |||
          sum (rate (cortex_ingest_storage_reader_records_total{%s}[$__rate_interval]))
          -
          sum (rate (cortex_ingest_storage_reader_records_failed_total{%s}[$__rate_interval]))
        ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
        'sum (rate (cortex_ingest_storage_reader_records_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)],
      ],
      [
        'records',
        'failed',
      ],
    ) + $.aliasColors({ failed: '#FF0000' }) + $.stack,
  )
  .addPanel(
    $.timeseriesPanel('Processing Latency') +
    $.panelDescription(
      'Processing Latency',
      |||
        Time used to process a single record (write request). This time is spent by appending data to per-tenant TSDB.
      |||
    ) +
    $.queryPanel(
      [
        'max(cortex_ingest_storage_reader_processing_time_seconds{%s,quantile="0.99"})' % [$.jobMatcher($._config.job_names.ingester)],
        'max(cortex_ingest_storage_reader_processing_time_seconds{%s,quantile="0.999"})' % [$.jobMatcher($._config.job_names.ingester)],
        'max(cortex_ingest_storage_reader_processing_time_seconds{%s,quantile="0.5"})' % [$.jobMatcher($._config.job_names.ingester)],
        // Average = sum of observed durations / number of observations.
        |||
          sum(rate(cortex_ingest_storage_reader_processing_time_seconds_sum{%s}[$__rate_interval]))
          /
          sum(rate(cortex_ingest_storage_reader_processing_time_seconds_count{%s}[$__rate_interval]))
        ||| % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
      ],
      [
        '99th percentile',
        '99.9th percentile',
        '50th percentile',
        'average',
      ],
    ) + {
      fieldConfig+: {
        defaults+: { unit: 's' },
      },
    },
  )
  .addPanel(
    $.timeseriesPanel('End-to-end latency') +
    $.panelDescription(
      'End-to-end latency',
      |||
        Time between writing request to Kafka by distributor and reading the record by ingester.
      |||
    ) +
    $.queryPanel(
      [
        'max(cortex_ingest_storage_reader_receive_delay_seconds{%s,quantile="0.99"})' % [$.jobMatcher($._config.job_names.ingester)],
        'max(cortex_ingest_storage_reader_receive_delay_seconds{%s,quantile="0.999"})' % [$.jobMatcher($._config.job_names.ingester)],
        'max(cortex_ingest_storage_reader_receive_delay_seconds{%s,quantile="0.5"})' % [$.jobMatcher($._config.job_names.ingester)],
      ],
      [
        '99th percentile',
        '99.9th percentile',
        '50th percentile',
      ],
    ) + {
      fieldConfig+: {
        defaults+: { unit: 's' },
      },
    },
  )
)
// Ingester row covering strong read-consistency enforcement (ingest storage).
.addRowIf(
  $._config.show_ingest_storage_panels,
  // Bind the ingester job matcher once instead of recomputing it per query.
  local ingesterMatcher = $.jobMatcher($._config.job_names.ingester);
  ($.row('Ingester (ingest storage: strong consistency)'))
  .addPanel(
    $.timeseriesPanel('Requests with strong consistency') +
    $.panelDescription(
      'Requests with strong consistency',
      |||
        Shows rate of requests with strong consistency, and rate of failed requests with strong consistency.
      |||
    ) +
    $.queryPanel(
      [
        // Successful requests = total requests minus failures.
        |||
          sum(rate(cortex_ingest_storage_strong_consistency_requests_total{%s}[$__rate_interval]))
          -
          sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval]))
        ||| % [ingesterMatcher, ingesterMatcher],
        |||
          sum(rate(cortex_ingest_storage_strong_consistency_failures_total{%s}[$__rate_interval]))
        ||| % [ingesterMatcher],
      ],
      ['requests', 'failed'],
    )
    + { fieldConfig+: { defaults+: { unit: 'reqps' } } }
    + $.aliasColors({ failed: '#FF0000' })
    + $.stack,
  )
  .addPanel(
    $.timeseriesPanel('Strong consistency – wait latency') +
    $.panelDescription(
      'Strong consistency – Wait latency',
      |||
        How long does the request wait to guarantee strong consistency.
      |||
    ) +
    $.queryPanel(
      [
        'max(max_over_time(cortex_ingest_storage_strong_consistency_wait_duration_seconds{%s,quantile="0.99"}[$__rate_interval]))' % [ingesterMatcher],
        // Average wait = sum of observed durations / number of observations.
        |||
          sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds_sum{%s}[$__rate_interval]))
          /
          sum(rate(cortex_ingest_storage_strong_consistency_wait_duration_seconds_count{%s}[$__rate_interval]))
        ||| % [ingesterMatcher, ingesterMatcher],
      ],
      ['99th percentile', 'Average'],
    )
    + { fieldConfig+: { defaults+: { unit: 's' } } },
  )
)
// Ingester row covering lookups of the partition's last produced offset (ingest storage).
.addRowIf(
  $._config.show_ingest_storage_panels,
  // Bind the ingester job matcher once instead of recomputing it per query.
  local ingesterMatcher = $.jobMatcher($._config.job_names.ingester);
  ($.row('Ingester (ingest storage: last produced offset)'))
  .addPanel(
    $.timeseriesPanel('Requests to get last produced offset') +
    $.panelDescription(
      'Rate of requests to fetch last produced offset for partition',
      |||
        Shows rate of requests to fetch last produced offset for partition, and rate of failed requests.
      |||
    ) +
    $.queryPanel(
      [
        // Successful requests = total requests minus failures.
        |||
          sum(rate(cortex_ingest_storage_reader_last_produced_offset_requests_total{%s}[$__rate_interval]))
          -
          sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval]))
        ||| % [ingesterMatcher, ingesterMatcher],
        |||
          sum(rate(cortex_ingest_storage_reader_last_produced_offset_failures_total{%s}[$__rate_interval]))
        ||| % [ingesterMatcher],
      ],
      ['requests', 'failed'],
    )
    + { fieldConfig+: { defaults+: { unit: 'reqps' } } }
    + $.aliasColors({ failed: '#FF0000' })
    + $.stack,
  )
  .addPanel(
    $.timeseriesPanel('Last produced offset – latency') +
    $.panelDescription(
      'Latency',
      |||
        How long does it take to fetch "last produced offset" of partition.
      |||
    ) +
    $.queryPanel(
      [
        'max(max_over_time(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds{%s,quantile="0.99"}[$__rate_interval]))' % [ingesterMatcher],
        // Average latency = sum of observed durations / number of observations.
        |||
          sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds_sum{%s}[$__rate_interval]))
          /
          sum(rate(cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds_count{%s}[$__rate_interval]))
        ||| % [ingesterMatcher, ingesterMatcher],
      ],
      ['99th percentile', 'Average'],
    )
    + { fieldConfig+: { defaults+: { unit: 's' } } },
  )
)
.addRowIf(
$._config.gateway_enabled && $._config.autoscaling.gateway.enabled,
$.cpuAndMemoryBasedAutoScalingRow('Gateway'),
Expand Down
Loading