diff --git a/CHANGELOG.md b/CHANGELOG.md index 17f8d27b4d..51e7e65bd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -75,6 +75,7 @@ * [ENHANCEMENT] Make label-selector in ReplicaTemplate/ingester-zone-a object configurable when using ingest-storage. #9480 * [ENHANCEMENT] Add `querier_only_args` option to specify CLI flags that apply only to queriers but not ruler-queriers. #9503 * [ENHANCEMENT] Validate the Kafka client ID configured when ingest storage is enabled. #9573 +* [ENHANCEMENT] Configure pod anti-affinity and tolerations to run etcd pods multi-AZ when `_config.multi_zone_etcd_enabled` is set to `true`. #9725 ### Mimirtool diff --git a/operations/mimir-tests/test-multi-zone-etcd-generated.yaml b/operations/mimir-tests/test-multi-zone-etcd-generated.yaml new file mode 100644 index 0000000000..029ef24457 --- /dev/null +++ b/operations/mimir-tests/test-multi-zone-etcd-generated.yaml @@ -0,0 +1,2406 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: default +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: alertmanager + name: alertmanager + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: alertmanager +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: compactor + name: compactor + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: compactor +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: distributor + name: distributor + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: distributor +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: ingester-rollout + name: ingester-rollout + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + rollout-group: ingester +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: memcached + name: memcached + namespace: default +spec: + maxUnavailable: 1 
+ selector: + matchLabels: + name: memcached +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: memcached-frontend + name: memcached-frontend + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: memcached-frontend +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: memcached-index-queries + name: memcached-index-queries + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: memcached-index-queries +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: memcached-metadata + name: memcached-metadata + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: memcached-metadata +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: querier + name: querier + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: querier +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: query-frontend + name: query-frontend + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: query-frontend +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: query-scheduler + name: query-scheduler + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: query-scheduler +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: rollout-operator + name: rollout-operator + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: rollout-operator +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: ruler + name: ruler + namespace: default +spec: + maxUnavailable: 1 + selector: + matchLabels: + name: ruler +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + name: store-gateway-rollout + name: store-gateway-rollout + namespace: default +spec: + maxUnavailable: 
1 + selector: + matchLabels: + rollout-group: store-gateway +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: rollout-operator + namespace: default +--- +apiVersion: v1 +data: + overrides.yaml: | + overrides: {} +kind: ConfigMap +metadata: + name: overrides + namespace: default +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: rollout-operator-role + namespace: default +rules: +- apiGroups: + - "" + resources: + - pods + verbs: + - list + - get + - watch + - delete +- apiGroups: + - apps + resources: + - statefulsets + verbs: + - list + - get + - watch + - patch +- apiGroups: + - apps + resources: + - statefulsets/status + verbs: + - update +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - update + - create +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: rollout-operator-rolebinding + namespace: default +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: rollout-operator-role +subjects: +- kind: ServiceAccount + name: rollout-operator + namespace: default +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: alertmanager + name: alertmanager + namespace: default +spec: + clusterIP: None + ports: + - name: alertmanager-http-metrics + port: 8080 + targetPort: 8080 + - name: alertmanager-grpc + port: 9095 + targetPort: 9095 + - name: alertmanager-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: alertmanager +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: compactor + name: compactor + namespace: default +spec: + clusterIP: None + ports: + - name: compactor-http-metrics + port: 8080 + targetPort: 8080 + - name: compactor-grpc + port: 9095 + targetPort: 9095 + - name: compactor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: compactor +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: distributor + name: distributor + namespace: default +spec: + clusterIP: None + ports: + - name: 
distributor-http-metrics + port: 8080 + targetPort: 8080 + - name: distributor-grpc + port: 9095 + targetPort: 9095 + - name: distributor-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: distributor +--- +apiVersion: v1 +kind: Service +metadata: + name: gossip-ring + namespace: default +spec: + clusterIP: None + ports: + - appProtocol: tcp + name: gossip-ring + port: 7946 + protocol: TCP + targetPort: 7946 + selector: + gossip_ring_member: "true" +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: ingester-zone-a + name: ingester-zone-a + namespace: default +spec: + clusterIP: None + ports: + - name: ingester-http-metrics + port: 8080 + targetPort: 8080 + - name: ingester-grpc + port: 9095 + targetPort: 9095 + - name: ingester-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: ingester-zone-a + rollout-group: ingester +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: ingester-zone-b + name: ingester-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: ingester-http-metrics + port: 8080 + targetPort: 8080 + - name: ingester-grpc + port: 9095 + targetPort: 9095 + - name: ingester-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: ingester-zone-b + rollout-group: ingester +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: ingester-zone-c + name: ingester-zone-c + namespace: default +spec: + clusterIP: None + ports: + - name: ingester-http-metrics + port: 8080 + targetPort: 8080 + - name: ingester-grpc + port: 9095 + targetPort: 9095 + - name: ingester-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: ingester-zone-c + rollout-group: ingester +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: memcached + name: memcached + namespace: default +spec: + clusterIP: None + ports: + - name: memcached-client + port: 11211 + targetPort: 11211 + - name: exporter-http-metrics + port: 9150 + targetPort: 9150 + selector: + name: memcached +--- 
+apiVersion: v1 +kind: Service +metadata: + labels: + name: memcached-frontend + name: memcached-frontend + namespace: default +spec: + clusterIP: None + ports: + - name: memcached-client + port: 11211 + targetPort: 11211 + - name: exporter-http-metrics + port: 9150 + targetPort: 9150 + selector: + name: memcached-frontend +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: memcached-index-queries + name: memcached-index-queries + namespace: default +spec: + clusterIP: None + ports: + - name: memcached-client + port: 11211 + targetPort: 11211 + - name: exporter-http-metrics + port: 9150 + targetPort: 9150 + selector: + name: memcached-index-queries +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: memcached-metadata + name: memcached-metadata + namespace: default +spec: + clusterIP: None + ports: + - name: memcached-client + port: 11211 + targetPort: 11211 + - name: exporter-http-metrics + port: 9150 + targetPort: 9150 + selector: + name: memcached-metadata +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: querier + name: querier + namespace: default +spec: + ports: + - name: querier-http-metrics + port: 8080 + targetPort: 8080 + - name: querier-grpc + port: 9095 + targetPort: 9095 + - name: querier-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: querier +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: query-frontend + name: query-frontend + namespace: default +spec: + ports: + - name: query-frontend-http-metrics + port: 8080 + targetPort: 8080 + - name: query-frontend-grpc + port: 9095 + targetPort: 9095 + selector: + name: query-frontend +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: query-scheduler + name: query-scheduler + namespace: default +spec: + ports: + - name: query-scheduler-http-metrics + port: 8080 + targetPort: 8080 + - name: query-scheduler-grpc + port: 9095 + targetPort: 9095 + selector: + name: query-scheduler +--- +apiVersion: v1 +kind: Service +metadata: 
+ labels: + name: query-scheduler + name: query-scheduler-discovery + namespace: default +spec: + clusterIP: None + ports: + - name: query-scheduler-http-metrics + port: 8080 + targetPort: 8080 + - name: query-scheduler-grpc + port: 9095 + targetPort: 9095 + publishNotReadyAddresses: true + selector: + name: query-scheduler +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: ruler + name: ruler + namespace: default +spec: + ports: + - name: ruler-http-metrics + port: 8080 + targetPort: 8080 + - name: ruler-grpc + port: 9095 + targetPort: 9095 + selector: + name: ruler +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: store-gateway-multi-zone + name: store-gateway-multi-zone + namespace: default +spec: + ports: + - name: store-gateway-http-metrics + port: 80 + protocol: TCP + targetPort: 80 + selector: + rollout-group: store-gateway +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: store-gateway-zone-a + name: store-gateway-zone-a + namespace: default +spec: + clusterIP: None + ports: + - name: store-gateway-http-metrics + port: 8080 + targetPort: 8080 + - name: store-gateway-grpc + port: 9095 + targetPort: 9095 + - name: store-gateway-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: store-gateway-zone-a + rollout-group: store-gateway +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: store-gateway-zone-b + name: store-gateway-zone-b + namespace: default +spec: + clusterIP: None + ports: + - name: store-gateway-http-metrics + port: 8080 + targetPort: 8080 + - name: store-gateway-grpc + port: 9095 + targetPort: 9095 + - name: store-gateway-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: store-gateway-zone-b + rollout-group: store-gateway +--- +apiVersion: v1 +kind: Service +metadata: + labels: + name: store-gateway-zone-c + name: store-gateway-zone-c + namespace: default +spec: + clusterIP: None + ports: + - name: store-gateway-http-metrics + port: 8080 + targetPort: 8080 + - 
name: store-gateway-grpc + port: 9095 + targetPort: 9095 + - name: store-gateway-gossip-ring + port: 7946 + targetPort: 7946 + selector: + name: store-gateway-zone-c + rollout-group: store-gateway +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: distributor + namespace: default +spec: + minReadySeconds: 10 + replicas: 3 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: distributor + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: distributor + spec: + containers: + - args: + - -distributor.ha-tracker.enable=true + - -distributor.ha-tracker.enable-for-all-users=true + - -distributor.ha-tracker.etcd.endpoints=etcd-client.default.svc.cluster.local.:2379 + - -distributor.ha-tracker.prefix=prom_ha/ + - -distributor.ha-tracker.store=etcd + - -distributor.health-check-ingesters=true + - -distributor.ingestion-burst-size=200000 + - -distributor.ingestion-rate-limit=10000 + - -distributor.ring.heartbeat-period=1m + - -distributor.ring.heartbeat-timeout=4m + - -distributor.ring.prefix= + - -distributor.ring.store=memberlist + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=1073741824 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=60s + - -server.grpc.keepalive.max-connection-age-grace=5m + - -server.grpc.keepalive.max-connection-idle=1m + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=distributor + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "8" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE 
+ value: "1000" + image: grafana/mimir:2.14.0 + imagePullPolicy: IfNotPresent + name: distributor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 4Gi + requests: + cpu: "2" + memory: 2Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 100 + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: distributor + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: querier + namespace: default +spec: + minReadySeconds: 10 + replicas: 6 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: querier + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: querier + spec: + containers: + - args: + - -blocks-storage.bucket-store.metadata-cache.backend=memcached + - -blocks-storage.bucket-store.metadata-cache.memcached.addresses=dnssrvnoa+memcached-metadata.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-item-size=1048576 + - -blocks-storage.bucket-store.sync-dir=/data/tsdb + - -blocks-storage.bucket-store.sync-interval=15m + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -common.storage.backend=gcs + - -distributor.health-check-ingesters=true + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -mem-ballast-size-bytes=268435456 + - -memberlist.bind-port=7946 + - 
-memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -querier.frontend-client.grpc-max-send-msg-size=104857600 + - -querier.max-concurrent=8 + - -querier.max-partial-query-length=768h + - -querier.scheduler-address=query-scheduler-discovery.default.svc.cluster.local.:9095 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -store-gateway.sharding-ring.heartbeat-timeout=4m + - -store-gateway.sharding-ring.prefix=multi-zone/ + - -store-gateway.sharding-ring.replication-factor=3 + - -store-gateway.sharding-ring.store=memberlist + - -store-gateway.sharding-ring.zone-awareness-enabled=true + - -target=querier + - -usage-stats.installation-mode=jsonnet + env: + - name: GOMAXPROCS + value: "5" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "5000" + image: grafana/mimir:2.14.0 + imagePullPolicy: IfNotPresent + name: querier + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 24Gi + requests: + cpu: "1" + memory: 12Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 180 + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: querier + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: query-frontend + namespace: default +spec: + minReadySeconds: 10 + replicas: 2 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: query-frontend + strategy: + rollingUpdate: + maxSurge: 15% + maxUnavailable: 0 + template: + metadata: + labels: + name: query-frontend + spec: + containers: 
+ - args: + - -query-frontend.cache-results=true + - -query-frontend.max-cache-freshness=10m + - -query-frontend.max-total-query-length=12000h + - -query-frontend.query-sharding-target-series-per-shard=2500 + - -query-frontend.results-cache.backend=memcached + - -query-frontend.results-cache.memcached.addresses=dnssrvnoa+memcached-frontend.default.svc.cluster.local.:11211 + - -query-frontend.results-cache.memcached.max-item-size=5242880 + - -query-frontend.results-cache.memcached.timeout=500ms + - -query-frontend.scheduler-address=query-scheduler-discovery.default.svc.cluster.local.:9095 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.max-connection-age=30s + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -shutdown-delay=90s + - -target=query-frontend + - -usage-stats.installation-mode=jsonnet + env: + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "5000" + image: grafana/mimir:2.14.0 + imagePullPolicy: IfNotPresent + name: query-frontend + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 1200Mi + requests: + cpu: "2" + memory: 600Mi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 390 + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: query-frontend + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: query-scheduler + namespace: default +spec: + minReadySeconds: 10 + replicas: 2 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: query-scheduler + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: 
+ labels: + name: query-scheduler + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + name: query-scheduler + topologyKey: kubernetes.io/hostname + containers: + - args: + - -query-scheduler.max-outstanding-requests-per-tenant=100 + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -target=query-scheduler + - -usage-stats.installation-mode=jsonnet + image: grafana/mimir:2.14.0 + imagePullPolicy: IfNotPresent + name: query-scheduler + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 2Gi + requests: + cpu: "2" + memory: 1Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 180 + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: rollout-operator + namespace: default +spec: + minReadySeconds: 10 + replicas: 1 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: rollout-operator + strategy: + rollingUpdate: + maxSurge: 0 + maxUnavailable: 1 + template: + metadata: + labels: + name: rollout-operator + spec: + containers: + - args: + - -kubernetes.namespace=default + - -use-zone-tracker=true + - -zone-tracker.config-map-name=rollout-operator-zone-tracker + image: grafana/rollout-operator:v0.19.1 + imagePullPolicy: IfNotPresent + name: rollout-operator + ports: + - containerPort: 8001 + name: http-metrics + readinessProbe: + httpGet: + path: /ready + port: 8001 + initialDelaySeconds: 5 + timeoutSeconds: 1 + resources: + limits: + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + serviceAccountName: rollout-operator +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ruler + namespace: 
default +spec: + minReadySeconds: 10 + replicas: 2 + revisionHistoryLimit: 10 + selector: + matchLabels: + name: ruler + strategy: + rollingUpdate: + maxSurge: 50% + maxUnavailable: 0 + template: + metadata: + labels: + gossip_ring_member: "true" + name: ruler + spec: + containers: + - args: + - -blocks-storage.bucket-store.metadata-cache.backend=memcached + - -blocks-storage.bucket-store.metadata-cache.memcached.addresses=dnssrvnoa+memcached-metadata.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-item-size=1048576 + - -blocks-storage.bucket-store.sync-dir=/data/tsdb + - -blocks-storage.bucket-store.sync-interval=15m + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -common.storage.backend=gcs + - -distributor.health-check-ingesters=true + - -distributor.remote-timeout=10s + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.zone-awareness-enabled=true + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -querier.max-partial-query-length=768h + - -ruler-storage.cache.backend=memcached + - -ruler-storage.cache.memcached.addresses=dnssrvnoa+memcached-metadata.default.svc.cluster.local.:11211 + - -ruler-storage.cache.memcached.max-async-concurrency=50 + - -ruler-storage.cache.memcached.max-item-size=1048576 + - -ruler-storage.gcs.bucket-name=rules-bucket + - -ruler.alertmanager-url=http://alertmanager.default.svc.cluster.local./alertmanager + - -ruler.max-rule-groups-per-tenant=70 + - -ruler.max-rules-per-rule-group=20 + - -ruler.ring.store=memberlist + - -ruler.rule-path=/rules + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - 
-store-gateway.sharding-ring.heartbeat-timeout=4m + - -store-gateway.sharding-ring.prefix=multi-zone/ + - -store-gateway.sharding-ring.replication-factor=3 + - -store-gateway.sharding-ring.store=memberlist + - -store-gateway.sharding-ring.zone-awareness-enabled=true + - -target=ruler + - -usage-stats.installation-mode=jsonnet + env: + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.14.0 + imagePullPolicy: IfNotPresent + name: ruler + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + cpu: "16" + memory: 16Gi + requests: + cpu: "1" + memory: 6Gi + volumeMounts: + - mountPath: /etc/mimir + name: overrides + terminationGracePeriodSeconds: 600 + topologySpreadConstraints: + - labelSelector: + matchLabels: + name: ruler + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - configMap: + name: overrides + name: overrides +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + labels: + name: alertmanager + name: alertmanager + namespace: default +spec: + replicas: 3 + selector: + matchLabels: + name: alertmanager + serviceName: alertmanager + template: + metadata: + labels: + gossip_ring_member: "true" + name: alertmanager + spec: + containers: + - args: + - -alertmanager-storage.gcs.bucket-name=alerts-bucket + - -alertmanager.sharding-ring.replication-factor=3 + - -alertmanager.sharding-ring.store=memberlist + - -alertmanager.storage.path=/data + - -alertmanager.web.external-url=http://test/alertmanager + - -common.storage.backend=gcs + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-idle-timeout=6m 
+ - -server.http-listen-port=8080 + - -target=alertmanager + - -usage-stats.installation-mode=jsonnet + env: + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: grafana/mimir:2.14.0 + imagePullPolicy: IfNotPresent + name: alertmanager + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 15Gi + requests: + cpu: "2" + memory: 10Gi + volumeMounts: + - mountPath: /data + name: alertmanager-data + - mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 900 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: RollingUpdate + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: alertmanager-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + labels: + name: compactor + name: compactor + namespace: default +spec: + podManagementPolicy: Parallel + replicas: 1 + selector: + matchLabels: + name: compactor + serviceName: compactor + template: + metadata: + labels: + gossip_ring_member: "true" + name: compactor + spec: + containers: + - args: + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -common.storage.backend=gcs + - -compactor.block-ranges=2h,12h,24h + - -compactor.blocks-retention-period=0 + - -compactor.cleanup-interval=15m + - -compactor.compaction-concurrency=1 + - -compactor.compaction-interval=30m + - -compactor.compactor-tenant-shard-size=1 + - -compactor.data-dir=/data + - -compactor.deletion-delay=2h + - -compactor.first-level-compaction-wait-period=25m + - -compactor.max-closing-blocks-concurrency=2 + - -compactor.max-opening-blocks-concurrency=4 + - -compactor.ring.heartbeat-period=1m + - 
-compactor.ring.heartbeat-timeout=4m + - -compactor.ring.prefix= + - -compactor.ring.store=memberlist + - -compactor.ring.wait-stability-min-duration=1m + - -compactor.split-and-merge-shards=0 + - -compactor.split-groups=1 + - -compactor.symbols-flushers-concurrency=4 + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -target=compactor + - -usage-stats.installation-mode=jsonnet + image: grafana/mimir:2.14.0 + imagePullPolicy: IfNotPresent + name: compactor + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 6Gi + requests: + cpu: 1 + memory: 6Gi + volumeMounts: + - mountPath: /data + name: compactor-data + - mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 900 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: RollingUpdate + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: compactor-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 250Gi + storageClassName: standard +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + annotations: + rollout-max-unavailable: "50" + labels: + rollout-group: ingester + name: ingester-zone-a + namespace: default +spec: + podManagementPolicy: Parallel + replicas: 1 + selector: + matchLabels: + name: ingester-zone-a + rollout-group: ingester + serviceName: ingester-zone-a + template: + metadata: + labels: + gossip_ring_member: "true" + name: ingester-zone-a + rollout-group: ingester + spec: + affinity: + 
podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: rollout-group + operator: In + values: + - ingester + - key: name + operator: NotIn + values: + - ingester-zone-a + topologyKey: kubernetes.io/hostname + containers: + - args: + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -blocks-storage.tsdb.block-ranges-period=2h + - -blocks-storage.tsdb.dir=/data/tsdb + - -blocks-storage.tsdb.head-compaction-interval=15m + - -blocks-storage.tsdb.ship-interval=1m + - -blocks-storage.tsdb.wal-replay-concurrency=3 + - -common.storage.backend=gcs + - -distributor.health-check-ingesters=true + - -ingester.max-global-metadata-per-metric=10 + - -ingester.max-global-metadata-per-user=30000 + - -ingester.max-global-series-per-user=150000 + - -ingester.ring.heartbeat-period=2m + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.instance-availability-zone=zone-a + - -ingester.ring.num-tokens=512 + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.tokens-file-path=/data/tokens + - -ingester.ring.unregister-on-shutdown=true + - -ingester.ring.zone-awareness-enabled=true + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc-max-concurrent-streams=500 + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -target=ingester + - -usage-stats.installation-mode=jsonnet + env: + - name: A + value: ingester-a-only + - name: GOGC + value: "off" + - name: GOMAXPROCS + value: "9" + - name: GOMEMLIMIT + value: 1Gi + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + - name: Z + value: "123" + image: grafana/mimir:2.14.0 + imagePullPolicy: IfNotPresent + name: ingester + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 
+ name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 25Gi + requests: + cpu: "4" + memory: 15Gi + volumeMounts: + - mountPath: /data + name: ingester-data + - mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 1200 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: OnDelete + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: ingester-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi + storageClassName: fast +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + annotations: + rollout-max-unavailable: "50" + labels: + rollout-group: ingester + name: ingester-zone-b + namespace: default +spec: + podManagementPolicy: Parallel + replicas: 1 + selector: + matchLabels: + name: ingester-zone-b + rollout-group: ingester + serviceName: ingester-zone-b + template: + metadata: + labels: + gossip_ring_member: "true" + name: ingester-zone-b + rollout-group: ingester + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: rollout-group + operator: In + values: + - ingester + - key: name + operator: NotIn + values: + - ingester-zone-b + topologyKey: kubernetes.io/hostname + containers: + - args: + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -blocks-storage.tsdb.block-ranges-period=2h + - -blocks-storage.tsdb.dir=/data/tsdb + - -blocks-storage.tsdb.head-compaction-interval=15m + - -blocks-storage.tsdb.ship-interval=1m + - -blocks-storage.tsdb.wal-replay-concurrency=3 + - -common.storage.backend=gcs + - -distributor.health-check-ingesters=true + - -ingester.max-global-metadata-per-metric=10 + - -ingester.max-global-metadata-per-user=30000 + - -ingester.max-global-series-per-user=150000 
+ - -ingester.ring.heartbeat-period=2m + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.instance-availability-zone=zone-b + - -ingester.ring.num-tokens=512 + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.tokens-file-path=/data/tokens + - -ingester.ring.unregister-on-shutdown=true + - -ingester.ring.zone-awareness-enabled=true + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc-max-concurrent-streams=500 + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -target=ingester + - -usage-stats.installation-mode=jsonnet + env: + - name: A + value: all-ingesters + - name: GOMAXPROCS + value: "9" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.14.0 + imagePullPolicy: IfNotPresent + name: ingester + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 25Gi + requests: + cpu: "4" + memory: 15Gi + volumeMounts: + - mountPath: /data + name: ingester-data + - mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 1200 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: OnDelete + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: ingester-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi + storageClassName: fast +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + annotations: + rollout-max-unavailable: "50" + labels: + rollout-group: ingester + name: ingester-zone-c + 
namespace: default +spec: + podManagementPolicy: Parallel + replicas: 1 + selector: + matchLabels: + name: ingester-zone-c + rollout-group: ingester + serviceName: ingester-zone-c + template: + metadata: + labels: + gossip_ring_member: "true" + name: ingester-zone-c + rollout-group: ingester + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: rollout-group + operator: In + values: + - ingester + - key: name + operator: NotIn + values: + - ingester-zone-c + topologyKey: kubernetes.io/hostname + containers: + - args: + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -blocks-storage.tsdb.block-ranges-period=2h + - -blocks-storage.tsdb.dir=/data/tsdb + - -blocks-storage.tsdb.head-compaction-interval=15m + - -blocks-storage.tsdb.ship-interval=1m + - -blocks-storage.tsdb.wal-replay-concurrency=3 + - -common.storage.backend=gcs + - -distributor.health-check-ingesters=true + - -ingester.max-global-metadata-per-metric=10 + - -ingester.max-global-metadata-per-user=30000 + - -ingester.max-global-series-per-user=150000 + - -ingester.ring.heartbeat-period=2m + - -ingester.ring.heartbeat-timeout=10m + - -ingester.ring.instance-availability-zone=zone-c + - -ingester.ring.num-tokens=512 + - -ingester.ring.prefix= + - -ingester.ring.replication-factor=3 + - -ingester.ring.store=memberlist + - -ingester.ring.tokens-file-path=/data/tokens + - -ingester.ring.unregister-on-shutdown=true + - -ingester.ring.zone-awareness-enabled=true + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc-max-concurrent-streams=500 + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -target=ingester + - -usage-stats.installation-mode=jsonnet + env: + - name: A + value: all-ingesters + - name: GOMAXPROCS + 
value: "9" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.14.0 + imagePullPolicy: IfNotPresent + name: ingester + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 25Gi + requests: + cpu: "4" + memory: 15Gi + volumeMounts: + - mountPath: /data + name: ingester-data + - mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 1200 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: OnDelete + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: ingester-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi + storageClassName: fast +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: memcached + namespace: default +spec: + replicas: 3 + selector: + matchLabels: + name: memcached + serviceName: memcached + template: + metadata: + labels: + name: memcached + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + name: memcached + topologyKey: kubernetes.io/hostname + containers: + - args: + - -m 6144 + - -I 1m + - -c 16384 + - -v + - --extended=track_sizes + image: memcached:1.6.28-alpine + imagePullPolicy: IfNotPresent + name: memcached + ports: + - containerPort: 11211 + name: client + resources: + limits: + memory: 9Gi + requests: + cpu: 500m + memory: 6552Mi + - args: + - --memcached.address=localhost:11211 + - --web.listen-address=0.0.0.0:9150 + image: prom/memcached-exporter:v0.14.4 + imagePullPolicy: IfNotPresent + name: exporter + ports: + - containerPort: 9150 + name: http-metrics + updateStrategy: + type: RollingUpdate +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + 
name: memcached-frontend + namespace: default +spec: + replicas: 3 + selector: + matchLabels: + name: memcached-frontend + serviceName: memcached-frontend + template: + metadata: + labels: + name: memcached-frontend + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + name: memcached-frontend + topologyKey: kubernetes.io/hostname + containers: + - args: + - -m 1024 + - -I 5m + - -c 16384 + - -v + - --extended=track_sizes + image: memcached:1.6.28-alpine + imagePullPolicy: IfNotPresent + name: memcached + ports: + - containerPort: 11211 + name: client + resources: + limits: + memory: 1536Mi + requests: + cpu: 500m + memory: 1176Mi + - args: + - --memcached.address=localhost:11211 + - --web.listen-address=0.0.0.0:9150 + image: prom/memcached-exporter:v0.14.4 + imagePullPolicy: IfNotPresent + name: exporter + ports: + - containerPort: 9150 + name: http-metrics + updateStrategy: + type: RollingUpdate +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: memcached-index-queries + namespace: default +spec: + replicas: 3 + selector: + matchLabels: + name: memcached-index-queries + serviceName: memcached-index-queries + template: + metadata: + labels: + name: memcached-index-queries + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + name: memcached-index-queries + topologyKey: kubernetes.io/hostname + containers: + - args: + - -m 1024 + - -I 5m + - -c 16384 + - -v + - --extended=track_sizes + image: memcached:1.6.28-alpine + imagePullPolicy: IfNotPresent + name: memcached + ports: + - containerPort: 11211 + name: client + resources: + limits: + memory: 1536Mi + requests: + cpu: 500m + memory: 1176Mi + - args: + - --memcached.address=localhost:11211 + - --web.listen-address=0.0.0.0:9150 + image: prom/memcached-exporter:v0.14.4 + imagePullPolicy: IfNotPresent + name: exporter + ports: + - containerPort: 9150 + name: 
http-metrics + updateStrategy: + type: RollingUpdate +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: memcached-metadata + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + name: memcached-metadata + serviceName: memcached-metadata + template: + metadata: + labels: + name: memcached-metadata + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + name: memcached-metadata + topologyKey: kubernetes.io/hostname + containers: + - args: + - -m 512 + - -I 1m + - -c 16384 + - -v + - --extended=track_sizes + image: memcached:1.6.28-alpine + imagePullPolicy: IfNotPresent + name: memcached + ports: + - containerPort: 11211 + name: client + resources: + limits: + memory: 768Mi + requests: + cpu: 500m + memory: 638Mi + - args: + - --memcached.address=localhost:11211 + - --web.listen-address=0.0.0.0:9150 + image: prom/memcached-exporter:v0.14.4 + imagePullPolicy: IfNotPresent + name: exporter + ports: + - containerPort: 9150 + name: http-metrics + updateStrategy: + type: RollingUpdate +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + annotations: + rollout-max-unavailable: "50" + labels: + rollout-group: store-gateway + name: store-gateway-zone-a + namespace: default +spec: + podManagementPolicy: Parallel + replicas: 1 + selector: + matchLabels: + name: store-gateway-zone-a + rollout-group: store-gateway + serviceName: store-gateway-zone-a + template: + metadata: + labels: + gossip_ring_member: "true" + name: store-gateway-zone-a + rollout-group: store-gateway + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: rollout-group + operator: In + values: + - store-gateway + - key: name + operator: NotIn + values: + - store-gateway-zone-a + topologyKey: kubernetes.io/hostname + containers: + - args: + - -blocks-storage.bucket-store.chunks-cache.backend=memcached + - 
-blocks-storage.bucket-store.chunks-cache.memcached.addresses=dnssrvnoa+memcached.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency=100 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections=150 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-item-size=1048576 + - -blocks-storage.bucket-store.chunks-cache.memcached.timeout=750ms + - -blocks-storage.bucket-store.index-cache.backend=memcached + - -blocks-storage.bucket-store.index-cache.memcached.addresses=dnssrvnoa+memcached-index-queries.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency=100 + - -blocks-storage.bucket-store.index-cache.memcached.max-idle-connections=150 + - -blocks-storage.bucket-store.index-cache.memcached.max-item-size=5242880 + - -blocks-storage.bucket-store.index-cache.memcached.timeout=750ms + - -blocks-storage.bucket-store.metadata-cache.backend=memcached + - -blocks-storage.bucket-store.metadata-cache.memcached.addresses=dnssrvnoa+memcached-metadata.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency=100 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections=150 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-item-size=1048576 + - -blocks-storage.bucket-store.sync-dir=/data/tsdb + - -blocks-storage.bucket-store.sync-interval=15m + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -common.storage.backend=gcs + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - 
-server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -store-gateway.sharding-ring.heartbeat-period=1m + - -store-gateway.sharding-ring.heartbeat-timeout=4m + - -store-gateway.sharding-ring.instance-availability-zone=zone-a + - -store-gateway.sharding-ring.prefix=multi-zone/ + - -store-gateway.sharding-ring.replication-factor=3 + - -store-gateway.sharding-ring.store=memberlist + - -store-gateway.sharding-ring.tokens-file-path=/data/tokens + - -store-gateway.sharding-ring.unregister-on-shutdown=false + - -store-gateway.sharding-ring.wait-stability-min-duration=1m + - -store-gateway.sharding-ring.zone-awareness-enabled=true + - -target=store-gateway + - -usage-stats.installation-mode=jsonnet + env: + - name: A + value: all-store-gateways + - name: GOMAXPROCS + value: "5" + - name: GOMEMLIMIT + value: "12884901888" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.14.0 + imagePullPolicy: IfNotPresent + name: store-gateway + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 18Gi + requests: + cpu: "1" + memory: 12Gi + volumeMounts: + - mountPath: /data + name: store-gateway-data + - mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 120 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: OnDelete + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: store-gateway-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: standard +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + annotations: + rollout-max-unavailable: "50" + labels: + 
rollout-group: store-gateway + name: store-gateway-zone-b + namespace: default +spec: + podManagementPolicy: Parallel + replicas: 1 + selector: + matchLabels: + name: store-gateway-zone-b + rollout-group: store-gateway + serviceName: store-gateway-zone-b + template: + metadata: + labels: + gossip_ring_member: "true" + name: store-gateway-zone-b + rollout-group: store-gateway + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: rollout-group + operator: In + values: + - store-gateway + - key: name + operator: NotIn + values: + - store-gateway-zone-b + topologyKey: kubernetes.io/hostname + containers: + - args: + - -blocks-storage.bucket-store.chunks-cache.backend=memcached + - -blocks-storage.bucket-store.chunks-cache.memcached.addresses=dnssrvnoa+memcached.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency=100 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections=150 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-item-size=1048576 + - -blocks-storage.bucket-store.chunks-cache.memcached.timeout=750ms + - -blocks-storage.bucket-store.index-cache.backend=memcached + - -blocks-storage.bucket-store.index-cache.memcached.addresses=dnssrvnoa+memcached-index-queries.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency=100 + - -blocks-storage.bucket-store.index-cache.memcached.max-idle-connections=150 + - -blocks-storage.bucket-store.index-cache.memcached.max-item-size=5242880 + - -blocks-storage.bucket-store.index-cache.memcached.timeout=750ms + - -blocks-storage.bucket-store.metadata-cache.backend=memcached + - 
-blocks-storage.bucket-store.metadata-cache.memcached.addresses=dnssrvnoa+memcached-metadata.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency=100 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections=150 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-item-size=1048576 + - -blocks-storage.bucket-store.sync-dir=/data/tsdb + - -blocks-storage.bucket-store.sync-interval=15m + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -common.storage.backend=gcs + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -store-gateway.sharding-ring.heartbeat-period=1m + - -store-gateway.sharding-ring.heartbeat-timeout=4m + - -store-gateway.sharding-ring.instance-availability-zone=zone-b + - -store-gateway.sharding-ring.prefix=multi-zone/ + - -store-gateway.sharding-ring.replication-factor=3 + - -store-gateway.sharding-ring.store=memberlist + - -store-gateway.sharding-ring.tokens-file-path=/data/tokens + - -store-gateway.sharding-ring.unregister-on-shutdown=false + - -store-gateway.sharding-ring.wait-stability-min-duration=1m + - -store-gateway.sharding-ring.zone-awareness-enabled=true + - -target=store-gateway + - -usage-stats.installation-mode=jsonnet + env: + - name: A + value: zone-b + - name: GOGC + value: "1000" + - name: GOMAXPROCS + value: "5" + - name: GOMEMLIMIT + value: "12884901888" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.14.0 + imagePullPolicy: IfNotPresent + name: store-gateway + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + 
name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 18Gi + requests: + cpu: "1" + memory: 12Gi + volumeMounts: + - mountPath: /data + name: store-gateway-data + - mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 120 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: OnDelete + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: store-gateway-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: standard +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + annotations: + rollout-max-unavailable: "50" + labels: + rollout-group: store-gateway + name: store-gateway-zone-c + namespace: default +spec: + podManagementPolicy: Parallel + replicas: 1 + selector: + matchLabels: + name: store-gateway-zone-c + rollout-group: store-gateway + serviceName: store-gateway-zone-c + template: + metadata: + labels: + gossip_ring_member: "true" + name: store-gateway-zone-c + rollout-group: store-gateway + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: rollout-group + operator: In + values: + - store-gateway + - key: name + operator: NotIn + values: + - store-gateway-zone-c + topologyKey: kubernetes.io/hostname + containers: + - args: + - -blocks-storage.bucket-store.chunks-cache.backend=memcached + - -blocks-storage.bucket-store.chunks-cache.memcached.addresses=dnssrvnoa+memcached.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency=100 + - -blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections=150 + - 
-blocks-storage.bucket-store.chunks-cache.memcached.max-item-size=1048576 + - -blocks-storage.bucket-store.chunks-cache.memcached.timeout=750ms + - -blocks-storage.bucket-store.index-cache.backend=memcached + - -blocks-storage.bucket-store.index-cache.memcached.addresses=dnssrvnoa+memcached-index-queries.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency=100 + - -blocks-storage.bucket-store.index-cache.memcached.max-idle-connections=150 + - -blocks-storage.bucket-store.index-cache.memcached.max-item-size=5242880 + - -blocks-storage.bucket-store.index-cache.memcached.timeout=750ms + - -blocks-storage.bucket-store.metadata-cache.backend=memcached + - -blocks-storage.bucket-store.metadata-cache.memcached.addresses=dnssrvnoa+memcached-metadata.default.svc.cluster.local.:11211 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency=50 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency=100 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections=150 + - -blocks-storage.bucket-store.metadata-cache.memcached.max-item-size=1048576 + - -blocks-storage.bucket-store.sync-dir=/data/tsdb + - -blocks-storage.bucket-store.sync-interval=15m + - -blocks-storage.gcs.bucket-name=blocks-bucket + - -common.storage.backend=gcs + - -memberlist.bind-port=7946 + - -memberlist.join=dns+gossip-ring.default.svc.cluster.local.:7946 + - -runtime-config.file=/etc/mimir/overrides.yaml + - -server.grpc.keepalive.min-time-between-pings=10s + - -server.grpc.keepalive.ping-without-stream-allowed=true + - -server.http-listen-port=8080 + - -store-gateway.sharding-ring.heartbeat-period=1m + - -store-gateway.sharding-ring.heartbeat-timeout=4m + - -store-gateway.sharding-ring.instance-availability-zone=zone-c + - -store-gateway.sharding-ring.prefix=multi-zone/ + - 
-store-gateway.sharding-ring.replication-factor=3 + - -store-gateway.sharding-ring.store=memberlist + - -store-gateway.sharding-ring.tokens-file-path=/data/tokens + - -store-gateway.sharding-ring.unregister-on-shutdown=false + - -store-gateway.sharding-ring.wait-stability-min-duration=1m + - -store-gateway.sharding-ring.zone-awareness-enabled=true + - -target=store-gateway + - -usage-stats.installation-mode=jsonnet + env: + - name: A + value: all-store-gateways + - name: GOMAXPROCS + value: "5" + - name: GOMEMLIMIT + value: "12884901888" + - name: JAEGER_REPORTER_MAX_QUEUE_SIZE + value: "1000" + image: grafana/mimir:2.14.0 + imagePullPolicy: IfNotPresent + name: store-gateway + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 9095 + name: grpc + - containerPort: 7946 + name: gossip-ring + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + timeoutSeconds: 1 + resources: + limits: + memory: 18Gi + requests: + cpu: "1" + memory: 12Gi + volumeMounts: + - mountPath: /data + name: store-gateway-data + - mountPath: /etc/mimir + name: overrides + securityContext: + runAsUser: 0 + terminationGracePeriodSeconds: 120 + volumes: + - configMap: + name: overrides + name: overrides + updateStrategy: + type: OnDelete + volumeClaimTemplates: + - apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: store-gateway-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: standard +--- +apiVersion: etcd.database.coreos.com/v1beta2 +kind: EtcdCluster +metadata: + annotations: + etcd.database.coreos.com/scope: clusterwide + name: etcd + namespace: default +spec: + pod: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + etcd_cluster: etcd + topologyKey: topology.kubernetes.io/zone + weight: 100 + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + 
etcd_cluster: etcd + topologyKey: kubernetes.io/hostname + annotations: + prometheus.io/port: "2379" + prometheus.io/scrape: "true" + etcdEnv: + - name: ETCD_AUTO_COMPACTION_RETENTION + value: 1h + labels: + name: etcd + resources: + limits: + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + tolerations: + - effect: NoSchedule + key: topology + operator: Equal + value: multi-az + size: 3 + version: 3.3.13 diff --git a/operations/mimir-tests/test-multi-zone-etcd.jsonnet b/operations/mimir-tests/test-multi-zone-etcd.jsonnet new file mode 100644 index 0000000000..b45df3d7ac --- /dev/null +++ b/operations/mimir-tests/test-multi-zone-etcd.jsonnet @@ -0,0 +1,6 @@ +// Based on test-multi-zone.jsonnet. +(import 'test-multi-zone.jsonnet') { + _config+:: { + multi_zone_etcd_enabled: true, + }, +} diff --git a/operations/mimir/mimir.libsonnet b/operations/mimir/mimir.libsonnet index dd54c4a154..f559ae89e2 100644 --- a/operations/mimir/mimir.libsonnet +++ b/operations/mimir/mimir.libsonnet @@ -24,12 +24,15 @@ // Mimir features (import 'shuffle-sharding.libsonnet') + (import 'query-sharding.libsonnet') + -(import 'multi-zone.libsonnet') + -(import 'multi-zone-distributor.libsonnet') + (import 'rollout-operator.libsonnet') + (import 'ruler-remote-evaluation.libsonnet') + (import 'continuous-test.libsonnet') + +// Multi-zone support. +(import 'multi-zone.libsonnet') + +(import 'multi-zone-distributor.libsonnet') + +(import 'multi-zone-etcd.libsonnet') + + // Import autoscaling after other features because it overrides deployments. (import 'autoscaling.libsonnet') + diff --git a/operations/mimir/multi-zone-etcd.libsonnet b/operations/mimir/multi-zone-etcd.libsonnet new file mode 100644 index 0000000000..91a7fd209f --- /dev/null +++ b/operations/mimir/multi-zone-etcd.libsonnet @@ -0,0 +1,45 @@ +{ + _config+:: { + multi_zone_etcd_enabled: false, + }, + + // Enforcing the spread of etcd pods across multi-AZ is not easy because we're limited + // by etcd-operator features. 
The etcd-operator doesn't support setting pod topology spread + // constraints, so we can only rely on pod anti-affinity. + // + // Here we configure a preferred (but not required) anti-affinity rule to increase the likelihood + // that different etcd pods will run in different AZs. We use "preferred" + // instead of "required" because, if the number of etcd pods (e.g. 5) is greater than the + // number of available AZs (e.g. 3), some pods would never be scheduled with "required". A toleration for the "topology=multi-az" NoSchedule taint is added too. + etcd: overrideSuperIfExists('etcd', if !$._config.multi_zone_etcd_enabled then {} else { + local podAntiAffinity = $.apps.v1.deployment.mixin.spec.template.spec.affinity.podAntiAffinity, + local weightedPodAffinityTerm = $.core.v1.weightedPodAffinityTerm, + local deployment = $.apps.v1.deployment, + + spec+: { + pod+: { + affinity+: { + podAntiAffinity+: + podAntiAffinity.withPreferredDuringSchedulingIgnoredDuringExecution([ + weightedPodAffinityTerm.withWeight(100) + + weightedPodAffinityTerm.podAffinityTerm.labelSelector.withMatchLabels({ etcd_cluster: 'etcd' }) + + weightedPodAffinityTerm.podAffinityTerm.withTopologyKey('topology.kubernetes.io/zone'), + ]).spec.template.spec.affinity.podAntiAffinity, + }, + + tolerations+: + deployment.spec.template.spec.withTolerationsMixin([ + $.core.v1.toleration.withKey('topology') + + $.core.v1.toleration.withOperator('Equal') + + $.core.v1.toleration.withValue('multi-az') + + $.core.v1.toleration.withEffect('NoSchedule'), + ]).spec.template.spec.tolerations, + }, + }, + }), + + + // Utility used to override a field only if it exists in super and is neither null nor an empty object; otherwise the field evaluates to null. + local overrideSuperIfExists(name, override) = if !( name in super) || super[name] == null || super[name] == {} then null else + super[name] + override, +}