Jsonnet: Fix ruler-querier CPU threshold (#3520)

* Jsonnet: Fix ruler-querier CPU threshold. In the auto-scaling Jsonnet logic, fix the ruler-querier CPU threshold so that it is a string-encoded integer millicores value. Signed-off-by: Arve Knudsen <[email protected]>
grafana · Nov 25, 2022 · f93cb3d · f93cb3d
1 parent 31d5ff5
commit f93cb3d
Show file tree

Hide file tree

Showing 4 changed files with 15 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -105,6 +105,7 @@
 * [ENHANCEMENT] Add support for autoscaling distributors. #3378
 * [ENHANCEMENT] Make auto-scaling logic ensure integer KEDA thresholds. #3512
 * [BUGFIX] Fixed query-scheduler ring configuration for dedicated ruler's queries and query-frontends. #3237 #3239
+* [BUGFIX] Jsonnet: Fix auto-scaling so that ruler-querier CPU threshold is a string-encoded integer millicores value. #3520
 
 ### Mimirtool
 

diff --git a/operations/mimir-tests/test-autoscaling-generated.yaml b/operations/mimir-tests/test-autoscaling-generated.yaml
@@ -928,8 +928,8 @@ spec:
           limits:
             memory: 24Gi
           requests:
-            cpu: "1"
-            memory: 12Gi
+            cpu: "0.2"
+            memory: 1Gi
         volumeMounts:
         - mountPath: /etc/mimir
           name: overrides
@@ -1813,6 +1813,7 @@ spec:
   - metadata:
       metricName: cortex_ruler_querier_hpa_default
       query: max_over_time(sum(rate(container_cpu_usage_seconds_total{container="ruler-querier",namespace="default"}[5m]))[15m:])
+        * 1000
       serverAddress: http://prometheus.default:9090/prometheus
-      threshold: "1"
+      threshold: "200"
     type: prometheus
diff --git a/operations/mimir-tests/test-autoscaling.jsonnet b/operations/mimir-tests/test-autoscaling.jsonnet
@@ -34,4 +34,9 @@ mimir {
     // the KEDA threshold
     k.util.resourcesRequests(2, '3.2Gi') +
     k.util.resourcesLimits(null, '6Gi'),
+  ruler_querier_container+::
+    // Test a <1 non-integer CPU request, to verify that this gets converted into an integer for
+    // the KEDA threshold
+    // Also specify CPU request as a string to make sure it works
+    k.util.resourcesRequests('0.2', '1Gi'),
 }
diff --git a/operations/mimir/autoscaling.libsonnet b/operations/mimir/autoscaling.libsonnet
@@ -212,9 +212,10 @@
         // Due to the more predictable nature of the ruler-querier workload we can scale on CPU usage.
         // To scale out relatively quickly, but scale in slower, we look at the average CPU utilization per ruler-querier over 5m (rolling window)
         // and then we pick the highest value over the last 15m.
-        query: 'max_over_time(sum(rate(container_cpu_usage_seconds_total{container="%s",namespace="%s"}[5m]))[15m:])' % [name, $._config.namespace],
+        query: 'max_over_time(sum(rate(container_cpu_usage_seconds_total{container="%s",namespace="%s"}[5m]))[15m:]) * 1000' % [name, $._config.namespace],
 
-        threshold: querier_cpu_requests,
+        // threshold is expected to be a string.
+        threshold: std.toString(cpuToMilliCPUInt(querier_cpu_requests)),
       },
     ],
   }),
@@ -255,8 +256,8 @@
         // Multiply by 1000 to get the result in millicores. This is due to KEDA only working with Ints.
         query: 'max_over_time(sum(rate(container_cpu_usage_seconds_total{container="%s",namespace="%s"}[5m]))[15m:]) * 1000' % [name, $._config.namespace],
 
-        // threshold is expected to be a string, so use '' to cast any ints returned by cpuToMilliCPUInt.
-        threshold: cpuToMilliCPUInt(distributor_cpu_requests) + '',
+        // threshold is expected to be a string.
+        threshold: std.toString(cpuToMilliCPUInt(distributor_cpu_requests)),
       },
       {
         metric_name: 'cortex_%s_memory_hpa_%s' % [std.strReplace(name, '-', '_'), $._config.namespace],