Support native histograms in panels

Provide utilities to generate native and classic queries over generic histogram metrics defined by a metric name and selectors. Allow defining panels that can switch between showing either. Native histograms are described at https://grafana.com/docs/mimir/latest/send/native-histograms/ Signed-off-by: György Krajcsovits <[email protected]>
grafana · Mar 14, 2024 · 56fe626 · 56fe626
1 parent 0f4cbd9
commit 56fe626
Show file tree

Hide file tree

Showing 2 changed files with 256 additions and 11 deletions.
diff --git a/grafana-builder/grafana.libsonnet b/grafana-builder/grafana.libsonnet
@@ -1,3 +1,5 @@
+local utils = import 'mixin-utils/utils.libsonnet';
+
 {
   dashboard(title, uid='', datasource='default', datasource_regex=''):: {
     // Stuff that isn't materialised.
@@ -70,6 +72,40 @@
       },
     },
 
+    // Appends a custom dashboard variable named latency_metrics to the templating list.
+    // It offers two choices, 'classic' (value '1', preselected) and 'native' (value '-1').
+    addShowNativeLatencyVariable():: self {
+      // Builds one entry of the variable's option list.
+      local choice(text, value, selected) = { selected: selected, text: text, value: value },
+      local nativeChoice = choice('native', '-1', false),
+      local classicChoice = choice('classic', '1', true),
+      templating+: {
+        list+: [{
+          type: 'custom',
+          name: 'latency_metrics',
+          label: 'Latency metrics',
+          description: 'Choose between showing latencies based on low precision classic or high precision native histogram metrics.',
+          query: 'native : -1,classic : 1',
+          options: [nativeChoice, classicChoice],
+          // The initially selected entry is the classic choice.
+          current: classicChoice,
+          hide: 0,
+          includeAll: false,
+          multi: false,
+          skipUrlSync: false,
+          useTags: false,
+        }],
+      },
+    },
+
     dashboardLinkUrl(title, url):: self {
       links+: [
         {
@@ -420,18 +456,20 @@
       },
     ],
 
+  // Hidden map from a request-status series name to the hex colour used to draw it.
+  httpStatusColors::
+    local green = '#7EB26D';
+    local red = '#E24D42';
+    {
+      // HTTP status classes.
+      '1xx': '#EAB839',
+      '2xx': green,
+      '3xx': '#6ED0E0',
+      '4xx': '#EF843C',
+      '5xx': red,
+      // Named outcomes; OK/success share the 2xx colour and error shares the 5xx colour.
+      OK: green,
+      success: green,
+      'error': red,
+      cancel: '#A9A9A9',
+    },
+
   qpsPanel(selector, statusLabelName='status_code'):: {
-    aliasColors: {
-      '1xx': '#EAB839',
-      '2xx': '#7EB26D',
-      '3xx': '#6ED0E0',
-      '4xx': '#EF843C',
-      '5xx': '#E24D42',
-      OK: '#7EB26D',
-      success: '#7EB26D',
-      'error': '#E24D42',
-      cancel: '#A9A9A9',
-    },
+    aliasColors: $.httpStatusColors,
     targets: [
       {
         expr:
@@ -448,6 +486,74 @@
     ],
   } + $.stack,
 
+  // Stacked requests-per-second timeseries panel with one series per request status.
+  // Unlike qpsPanel above, metricName must be the name of a histogram metric.
+  // Requires a dashboard variable named latency_metrics whose value is -1 (native) or 1 (classic).
+  qpsPanelNativeHistogram(title, metricName, selector, statusLabelName='status_code'):: $.timeseriesPanel(title) {
+    local countRate = utils.nativeClassicHistogramCountRate(metricName, selector),
+    // Sums the rate by a derived "status" label: three-character values starting with a digit
+    // become "<digit>xx", purely alphabetic values are copied unchanged. The final comparison
+    // against ($latency_metrics * <infinity>) drops every sample when the other histogram
+    // flavour is selected.
+    local statusExpr(rateQuery, infinity) =
+      |||
+        sum by (status) (
+          label_replace(label_replace(%(metricQuery)s,
+          "status", "${1}xx", "%(label)s", "([0-9]).."),
+          "status", "${1}", "%(label)s", "([a-zA-Z]+)"))
+          < ($latency_metrics * %(infinity)s)
+      ||| % {
+        infinity: infinity,
+        label: statusLabelName,
+        metricQuery: rateQuery,
+      },
+    local target(expr, refId) = {
+      expr: expr,
+      format: 'time_series',
+      legendFormat: '{{status}}',
+      refId: refId,
+    },
+    // Pins the series named `status` to its colour from httpStatusColors.
+    local fixedColor(status) = {
+      matcher: { id: 'byName', options: status },
+      properties: [
+        { id: 'color', value: { mode: 'fixed', fixedColor: $.httpStatusColors[status] } },
+      ],
+    },
+    fieldConfig+: {
+      defaults+: {
+        custom+: {
+          lineWidth: 0,
+          fillOpacity: 100,  // Get solid fill.
+          stacking: { mode: 'normal', group: 'A' },
+        },
+        unit: 'reqps',
+        min: 0,
+      },
+      overrides+: std.map(fixedColor, std.objectFieldsAll($.httpStatusColors)),
+    },
+    targets: [
+      // Native query: kept when latency_metrics is -1, since -1 * -Inf is +Inf.
+      target(statusExpr(countRate.native, '-Inf'), 'A'),
+      // Classic query: kept when latency_metrics is 1, since 1 * +Inf is +Inf.
+      target(statusExpr(countRate.classic, '+Inf'), 'A_classic'),
+    ],
+  } + $.stack,
+
   latencyPanel(metricName, selector, multiplier='1e3'):: {
     nullPointMode: 'null as zero',
     targets: [
@@ -473,6 +579,58 @@
     yaxes: $.yaxes('ms'),
   },
 
+  // Timeseries panel showing the 99th percentile, 50th percentile and average of a histogram metric.
+  // Each series is emitted twice, once as a native and once as a classic histogram query, wrapped
+  // so that the latency_metrics dashboard variable decides which of the two returns data.
+  // Requires a dashboard variable named latency_metrics whose value is -1 (native) or 1 (classic).
+  // multiplier is applied to every query; the panel unit is 'ms'
+  // (presumably the default 1e3 converts seconds to milliseconds; confirm against the metric's unit).
+  latencyPanelNativeHistogram(title, metricName, selector, multiplier='1e3'):: $.timeseriesPanel(title) {
+    local quantile(q) = utils.nativeClassicHistogramQuantile(q, metricName, selector, multiplier=multiplier),
+    local average = utils.nativeClassicHistogramAverageRate(metricName, selector, multiplier=multiplier),
+    local target(expr, legend, refId) = {
+      expr: expr,
+      format: 'time_series',
+      legendFormat: legend,
+      refId: refId,
+    },
+    // Native target first, then its classic twin under "<refId>_classic".
+    local series(query, legend, refId) = [
+      target(utils.showNativeHistogramQuery(query), legend, refId),
+      target(utils.showClassicHistogramQuery(query), legend, refId + '_classic'),
+    ],
+    nullPointMode: 'null as zero',
+    fieldConfig+: {
+      defaults+: {
+        custom+: {
+          fillOpacity: 10,
+        },
+        unit: 'ms',
+      },
+    },
+    targets:
+      series(quantile('0.99'), '99th percentile', 'A') +
+      series(quantile('0.50'), '50th percentile', 'B') +
+      series(average, 'Average', 'C'),
+    yaxes: $.yaxes('ms'),
+  },
+
   selector:: {
     eq(label, value):: { label: label, op: '=', value: value },
     neq(label, value):: { label: label, op: '!=', value: value },

diff --git a/mixin-utils/utils.libsonnet b/mixin-utils/utils.libsonnet
@@ -1,6 +1,93 @@
 local g = import 'grafana-builder/grafana.libsonnet';
 
 {
+  // nativeClassicHistogramQuantile builds histogram_quantile queries for the same metric in both
+  // flavours and returns them as { classic: <query>, native: <query> }.
+  // Metric name should be provided without _bucket suffix.
+  // sum_by lists extra grouping labels; the classic query always groups by 'le' as well.
+  // A non-empty multiplier is appended to both queries as ' * <multiplier>'.
+  nativeClassicHistogramQuantile(percentile, metric, selector, sum_by=[], rate_interval='$__rate_interval', multiplier='')::
+    local grouped = std.length(sum_by) > 0;
+    local byClause(labels) = ' by (%(lbls)s) ' % { lbls: std.join(',', labels) };
+    local scale = if multiplier == '' then '' else ' * %s' % multiplier;
+    // Shared template; only the series name and the grouping clause differ between flavours.
+    local quantileOver(series, grouping) =
+      'histogram_quantile(%(percentile)s, sum%(grouping)s(rate(%(series)s{%(selector)s}[%(rateInterval)s])))%(scale)s' % {
+        grouping: grouping,
+        percentile: percentile,
+        rateInterval: rate_interval,
+        scale: scale,
+        selector: selector,
+        series: series,
+      };
+    {
+      classic: quantileOver('%s_bucket' % [metric], if grouped then byClause(['le'] + sum_by) else ' by (le) '),
+      native: quantileOver(metric, if grouped then byClause(sum_by) else ' '),
+    },
+
+  // nativeClassicHistogramSumRate builds the rate of a histogram's sum in both flavours and
+  // returns them as { classic: <query>, native: <query> }.
+  // Metric name should be provided without _sum suffix.
+  nativeClassicHistogramSumRate(metric, selector, rate_interval='$__rate_interval')::
+    local rateOf(series) = 'rate(%(series)s{%(selector)s}[%(rateInterval)s])' % {
+      rateInterval: rate_interval,
+      selector: selector,
+      series: series,
+    };
+    {
+      // Classic histograms expose the sum as a separate <metric>_sum series.
+      classic: rateOf('%s_sum' % [metric]),
+      // Native histograms carry the sum inside the sample; histogram_sum extracts it.
+      native: 'histogram_sum(%s)' % [rateOf(metric)],
+    },
+
+
+  // nativeClassicHistogramCountRate builds the rate of a histogram's observation count in both
+  // flavours and returns them as { classic: <query>, native: <query> }.
+  // Metric name should be provided without _count suffix.
+  nativeClassicHistogramCountRate(metric, selector, rate_interval='$__rate_interval')::
+    local rateOf(series) = 'rate(%(series)s{%(selector)s}[%(rateInterval)s])' % {
+      rateInterval: rate_interval,
+      selector: selector,
+      series: series,
+    };
+    {
+      // Classic histograms expose the count as a separate <metric>_count series.
+      classic: rateOf('%s_count' % [metric]),
+      // Native histograms carry the count inside the sample; histogram_count extracts it.
+      native: 'histogram_count(%s)' % [rateOf(metric)],
+    },
+
+  // nativeClassicHistogramAverageRate builds the average observed value (summed sum rate divided
+  // by summed count rate) in both flavours and returns { classic: <query>, native: <query> }.
+  // A non-empty multiplier is prepended to both queries as '<multiplier> * '.
+  // Each query is two lines long and ends with a newline.
+  // TODO(krajorama) Switch to histogram_avg function for native histograms later.
+  nativeClassicHistogramAverageRate(metric, selector, rate_interval='$__rate_interval', multiplier='')::
+    local scale = if multiplier == '' then '' else '%s * ' % multiplier;
+    local sumRate = $.nativeClassicHistogramSumRate(metric, selector, rate_interval);
+    local countRate = $.nativeClassicHistogramCountRate(metric, selector, rate_interval);
+    // flavour is 'classic' or 'native' and selects the matching sub-queries.
+    local average(flavour) = '%(scale)ssum(%(sumQuery)s) /\nsum(%(countQuery)s)\n' % {
+      countQuery: countRate[flavour],
+      scale: scale,
+      sumQuery: sumRate[flavour],
+    };
+    {
+      classic: average('classic'),
+      native: average('native'),
+    },
+
+  // showClassicHistogramQuery takes a query map {classic: q, native: q} and returns its classic
+  // query compared against ($<dashboard_variable> * +Inf). The dashboard variable should be -1 or
+  // +1: with +1 the bound is +Inf and the classic series is shown, with -1 the bound is -Inf and
+  // the classic series is hidden.
+  showClassicHistogramQuery(query, dashboard_variable='latency_metrics')::
+    '%(expr)s < ($%(variable)s * +Inf)' % { expr: query.classic, variable: dashboard_variable },
+  // showNativeHistogramQuery takes a query map {classic: q, native: q} and returns its native
+  // query compared against ($<dashboard_variable> * -Inf). The dashboard variable should be -1 or
+  // +1: with -1 the bound is +Inf and the native series is shown, with +1 the bound is -Inf and
+  // the native series is hidden.
+  showNativeHistogramQuery(query, dashboard_variable='latency_metrics')::
+    '%(expr)s < ($%(variable)s * -Inf)' % { expr: query.native, variable: dashboard_variable },
+
   histogramRules(metric, labels, interval='1m', record_native=false)::
     local vars = {
       metric: metric,