diff --git a/CHANGELOG.md b/CHANGELOG.md index 3057db846ef..85edfcdcfd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -105,6 +105,7 @@ * [ENHANCEMENT] Add support for autoscaling distributors. #3378 * [ENHANCEMENT] Make auto-scaling logic ensure integer KEDA thresholds. #3512 * [BUGFIX] Fixed query-scheduler ring configuration for dedicated ruler's queries and query-frontends. #3237 #3239 +* [BUGFIX] Jsonnet: Fix auto-scaling so that ruler-querier CPU threshold is a string-encoded integer millicores value. ### Mimirtool diff --git a/operations/mimir-tests/test-autoscaling-generated.yaml b/operations/mimir-tests/test-autoscaling-generated.yaml index 03c351c7ff4..d4f3b09c568 100644 --- a/operations/mimir-tests/test-autoscaling-generated.yaml +++ b/operations/mimir-tests/test-autoscaling-generated.yaml @@ -928,8 +928,8 @@ spec: limits: memory: 24Gi requests: - cpu: "1" - memory: 12Gi + cpu: "0.2" + memory: 1Gi volumeMounts: - mountPath: /etc/mimir name: overrides @@ -1813,6 +1813,7 @@ spec: - metadata: metricName: cortex_ruler_querier_hpa_default query: max_over_time(sum(rate(container_cpu_usage_seconds_total{container="ruler-querier",namespace="default"}[5m]))[15m:]) + * 1000 serverAddress: http://prometheus.default:9090/prometheus - threshold: "1" + threshold: "200" type: prometheus diff --git a/operations/mimir-tests/test-autoscaling.jsonnet b/operations/mimir-tests/test-autoscaling.jsonnet index fd5f3283a67..8bd7b8f929b 100644 --- a/operations/mimir-tests/test-autoscaling.jsonnet +++ b/operations/mimir-tests/test-autoscaling.jsonnet @@ -34,4 +34,9 @@ mimir { // the KEDA threshold k.util.resourcesRequests(2, '3.2Gi') + k.util.resourcesLimits(null, '6Gi'), + ruler_querier_container+:: + // Test a <1 non-integer CPU request, to verify that this gets converted into an integer for + // the KEDA threshold + // Also specify CPU request as a string to make sure it works + k.util.resourcesRequests('0.2', '1Gi'), } diff --git a/operations/mimir/autoscaling.libsonnet b/operations/mimir/autoscaling.libsonnet index c69efafc312..7b0190b461e 100644 --- a/operations/mimir/autoscaling.libsonnet +++ b/operations/mimir/autoscaling.libsonnet @@ -212,9 +212,10 @@ // Due to the more predicatable nature of the ruler-querier workload we can scale on CPU usage. // To scale out relatively quickly, but scale in slower, we look at the average CPU utilization per ruler-querier over 5m (rolling window) // and then we pick the highest value over the last 15m. - query: 'max_over_time(sum(rate(container_cpu_usage_seconds_total{container="%s",namespace="%s"}[5m]))[15m:])' % [name, $._config.namespace], + query: 'max_over_time(sum(rate(container_cpu_usage_seconds_total{container="%s",namespace="%s"}[5m]))[15m:]) * 1000' % [name, $._config.namespace], - threshold: querier_cpu_requests, + // threshold is expected to be a string. + threshold: std.toString(cpuToMilliCPUInt(querier_cpu_requests)), }, ], }), @@ -255,8 +256,8 @@ // Multiply by 1000 to get the result in millicores. This is due to KEDA only working with Ints. query: 'max_over_time(sum(rate(container_cpu_usage_seconds_total{container="%s",namespace="%s"}[5m]))[15m:]) * 1000' % [name, $._config.namespace], - // threshold is expected to be a string, so use '' to cast any ints returned by cpuToMilliCPUInt. - threshold: cpuToMilliCPUInt(distributor_cpu_requests) + '', + // threshold is expected to be a string. + threshold: std.toString(cpuToMilliCPUInt(distributor_cpu_requests)), }, { metric_name: 'cortex_%s_memory_hpa_%s' % [std.strReplace(name, '-', '_'), $._config.namespace],