Skip to content

Commit

Permalink
Add Loki ruler and static rules for tenant OCM (#331)
Browse files Browse the repository at this point in the history
  • Loading branch information
periklis authored Sep 27, 2022
1 parent cd2a8a6 commit 1d6dc77
Show file tree
Hide file tree
Showing 8 changed files with 424 additions and 4 deletions.
8 changes: 4 additions & 4 deletions jsonnetfile.lock.json
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,8 @@
"subdir": "jsonnet/lib"
}
},
"version": "4d5fe09aaab876288355a1fbe5e3eed43734a0d0",
"sum": "o0dOrsTJx4Lt/F0TA/4Pg/wipWPWOwhtxCJ6lHxzvjY=",
"version": "a5834effa711ab261558f489407223e4b3e166cb",
"sum": "ZJlzUf7NGTHUr2RtytxG3i8z7DfQJ3gPtV89hnDNjGE=",
"name": "observatorium-api"
},
{
Expand All @@ -160,8 +160,8 @@
"subdir": "configuration"
}
},
"version": "39bd3b6e85614d09607c41c575a477b9cfd78569",
"sum": "uiddp3srcX6NncnnYYNY6lYEy7oyRBz6Izf8qGfUKnY="
"version": "36071db2f584b7cfb7932946dc9336873c363f09",
"sum": "xKdQFAsNSYo1lK+2HVOSVcyG9sjEduHVbWhxIe5eJ6o="
},
{
"source": {
Expand Down
303 changes: 303 additions & 0 deletions resources/services/observatorium-logs-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -673,6 +673,26 @@ objects:
"update_interval": "1m"
"query_scheduler":
"max_outstanding_requests_per_tenant": 256
"ruler":
"alertmanager_refresh_interval": "1m"
"alertmanager_url": "http://_web._tcp.observatorium-alertmanager.${ALERTMANAGER_NAMESPACE}.svc.cluster.local"
"enable_alertmanager_discovery": true
"enable_alertmanager_v2": true
"enable_api": true
"enable_sharding": true
"ring":
"kvstore":
"store": "memberlist"
"rule_path": "/data"
"storage":
"local":
"directory": "/tmp/rules"
"type": "local"
"wal":
"dir": "/data/loki/wal"
"max_age": "4h"
"min_age": "5m"
"truncate_frequency": "60m"
"schema_config":
"configs":
- "from": "2020-10-01"
Expand Down Expand Up @@ -1969,9 +1989,280 @@ objects:
app.kubernetes.io/instance: observatorium
app.kubernetes.io/name: loki
app.kubernetes.io/part-of: observatorium
# Headless Service (clusterIP: None) in front of the Loki ruler pods' gRPC
# port. The ruler StatefulSet names this Service as its `serviceName`.
- apiVersion: v1
  kind: Service
  metadata:
    name: observatorium-loki-ruler-grpc
    labels:
      app.kubernetes.io/component: ruler
      app.kubernetes.io/instance: observatorium
      app.kubernetes.io/name: loki
      app.kubernetes.io/part-of: observatorium
      app.kubernetes.io/version: ${LOKI_IMAGE_TAG}
  spec:
    clusterIP: None
    selector:
      app.kubernetes.io/component: ruler
      app.kubernetes.io/instance: observatorium
      app.kubernetes.io/name: loki
      app.kubernetes.io/part-of: observatorium
    ports:
    - name: grpc
      port: 9095
      targetPort: 9095
# ClusterIP Service for the Loki ruler's HTTP port (3100). The port is named
# `metrics`, which is the port name the ruler ServiceMonitor scrapes.
- apiVersion: v1
  kind: Service
  metadata:
    name: observatorium-loki-ruler-http
    labels:
      app.kubernetes.io/component: ruler
      app.kubernetes.io/instance: observatorium
      app.kubernetes.io/name: loki
      app.kubernetes.io/part-of: observatorium
      app.kubernetes.io/version: ${LOKI_IMAGE_TAG}
  spec:
    selector:
      app.kubernetes.io/component: ruler
      app.kubernetes.io/instance: observatorium
      app.kubernetes.io/name: loki
      app.kubernetes.io/part-of: observatorium
    ports:
    - name: metrics
      port: 3100
      targetPort: 3100
# ServiceMonitor for the Loki ruler. It selects Services in ${NAMESPACE} that
# carry the ruler labels and scrapes their port named `metrics`; of the two
# ruler Services, only the HTTP one exposes a port with that name.
- apiVersion: monitoring.coreos.com/v1
  kind: ServiceMonitor
  metadata:
    name: observatorium-loki-ruler
    labels:
      app.kubernetes.io/instance: observatorium
      app.kubernetes.io/name: loki
      app.kubernetes.io/part-of: observatorium
      prometheus: app-sre
  spec:
    namespaceSelector:
      matchNames:
      - ${NAMESPACE}
    selector:
      matchLabels:
        app.kubernetes.io/component: ruler
        app.kubernetes.io/instance: observatorium
        app.kubernetes.io/name: loki
        app.kubernetes.io/part-of: observatorium
    endpoints:
    - port: metrics
# Loki ruler StatefulSet: runs Loki with `-target=ruler` alongside a
# jaeger-agent sidecar. Each replica gets its own `storage` volume (mounted at
# /data) and mounts the `observatorium-loki-rules` ConfigMap at /tmp/rules.
- apiVersion: apps/v1
  kind: StatefulSet
  metadata:
    labels:
      app.kubernetes.io/component: ruler
      app.kubernetes.io/instance: observatorium
      app.kubernetes.io/name: loki
      app.kubernetes.io/part-of: observatorium
      app.kubernetes.io/version: ${LOKI_IMAGE_TAG}
    name: observatorium-loki-ruler
  spec:
    # Double-brace form so the template parameter is substituted unquoted
    # (replicas must be an integer, while the parameter value is a string).
    replicas: ${{LOKI_RULER_REPLICAS}}
    selector:
      matchLabels:
        app.kubernetes.io/component: ruler
        app.kubernetes.io/instance: observatorium
        app.kubernetes.io/name: loki
        app.kubernetes.io/part-of: observatorium
        loki.grafana.com/gossip: "true"
    serviceName: observatorium-loki-ruler-grpc
    template:
      metadata:
        labels:
          app.kubernetes.io/component: ruler
          app.kubernetes.io/instance: observatorium
          app.kubernetes.io/name: loki
          app.kubernetes.io/part-of: observatorium
          app.kubernetes.io/tracing: jaeger-agent
          loki.grafana.com/gossip: "true"
      spec:
        containers:
        # --- Loki ruler container ---
        - args:
          - -target=ruler
          - -config.file=/etc/loki/config/config.yaml
          - -limits.per-user-override-config=/etc/loki/config/overrides.yaml
          - -log.level=error
          # $(VAR) references below are expanded by Kubernetes from `env`.
          - -s3.buckets=$(S3_BUCKETS)
          - -s3.region=$(S3_REGION)
          - -s3.access-key-id=$(AWS_ACCESS_KEY_ID)
          - -s3.secret-access-key=$(AWS_SECRET_ACCESS_KEY)
          - -distributor.replication-factor=1
          # NOTE(review): no shell is involved here, so the double quotes are
          # passed to Loki as part of the flag value — confirm the URL flag
          # accepts a quoted value, otherwise drop the quotes.
          - -ruler.external.url="${ALERTMANAGER_EXTERNAL_URL}"
          env:
          # Object-storage settings are read from the ${LOKI_S3_SECRET} Secret.
          - name: S3_BUCKETS
            valueFrom:
              secretKeyRef:
                key: bucket
                name: ${LOKI_S3_SECRET}
          - name: S3_REGION
            valueFrom:
              secretKeyRef:
                key: aws_region
                name: ${LOKI_S3_SECRET}
          - name: AWS_ACCESS_KEY_ID
            valueFrom:
              secretKeyRef:
                key: aws_access_key_id
                name: ${LOKI_S3_SECRET}
          - name: AWS_SECRET_ACCESS_KEY
            valueFrom:
              secretKeyRef:
                key: aws_secret_access_key
                name: ${LOKI_S3_SECRET}
          image: ${LOKI_IMAGE}:${LOKI_IMAGE_TAG}
          imagePullPolicy: IfNotPresent
          livenessProbe:
            failureThreshold: 10
            httpGet:
              path: /metrics
              port: 3100
              scheme: HTTP
            periodSeconds: 30
          name: observatorium-loki-ruler
          ports:
          - containerPort: 3100
            name: metrics
          - containerPort: 9095
            name: grpc
          - containerPort: 7946
            name: gossip-ring
          readinessProbe:
            httpGet:
              path: /ready
              port: 3100
              scheme: HTTP
            initialDelaySeconds: 15
            timeoutSeconds: 1
          resources:
            limits:
              cpu: ${LOKI_RULER_CPU_LIMITS}
              memory: ${LOKI_RULER_MEMORY_LIMITS}
            requests:
              cpu: ${LOKI_RULER_CPU_REQUESTS}
              # Must match the declared template parameter name
              # (LOKI_RULER_MEMORY_REQUESTS); an undeclared placeholder is not
              # substituted and yields an invalid resource quantity.
              memory: ${LOKI_RULER_MEMORY_REQUESTS}
          volumeMounts:
          - mountPath: /etc/loki/config/
            name: config
            readOnly: false
          # Per-replica persistent volume from volumeClaimTemplates.
          - mountPath: /data
            name: storage
            readOnly: false
          # Static rule files from the observatorium-loki-rules ConfigMap.
          - mountPath: /tmp/rules
            name: rules
            readOnly: false
        # --- jaeger-agent sidecar ---
        - args:
          - --reporter.grpc.host-port=dns:///jaeger-collector-headless.${JAEGER_COLLECTOR_NAMESPACE}.svc:14250
          - --reporter.type=grpc
          - --agent.tags=pod.namespace=$(NAMESPACE),pod.name=$(POD)
          env:
          # Downward API: the pod's own namespace and name, used in agent tags.
          - name: NAMESPACE
            valueFrom:
              fieldRef:
                fieldPath: metadata.namespace
          - name: POD
            valueFrom:
              fieldRef:
                fieldPath: metadata.name
          image: ${JAEGER_AGENT_IMAGE}:${JAEGER_AGENT_IMAGE_TAG}
          livenessProbe:
            failureThreshold: 5
            httpGet:
              path: /
              port: 14271
              scheme: HTTP
          name: jaeger-agent
          ports:
          - containerPort: 5778
            name: configs
          - containerPort: 6831
            name: jaeger-thrift
          - containerPort: 14271
            name: metrics
          resources:
            limits:
              cpu: 128m
              memory: 128Mi
            requests:
              cpu: 32m
              memory: 64Mi
        serviceAccountName: ${SERVICE_ACCOUNT_NAME}
        volumes:
        - configMap:
            name: observatorium-loki
          name: config
        - configMap:
            name: observatorium-loki-rules
          name: rules
    volumeClaimTemplates:
    - metadata:
        labels:
          app.kubernetes.io/instance: observatorium
          app.kubernetes.io/name: loki
          app.kubernetes.io/part-of: observatorium
        name: storage
      spec:
        accessModes:
        - ReadWriteOnce
        resources:
          requests:
            storage: ${LOKI_RULER_PVC_REQUEST}
        storageClassName: ${STORAGE_CLASS}
# Static Loki alerting rules for the uhc-stage namespace. This ConfigMap is
# mounted by the ruler StatefulSet at /tmp/rules, the directory configured as
# the ruler's local rule storage.
- apiVersion: v1
  kind: ConfigMap
  metadata:
    annotations:
      qontract.recycle: "true"
    labels:
      app.kubernetes.io/instance: observatorium
      app.kubernetes.io/name: loki
      app.kubernetes.io/part-of: observatorium
      app.kubernetes.io/version: ${LOKI_IMAGE_TAG}
    name: observatorium-loki-rules
  data:
    # One rule group, evaluated every minute. Each rule fires when the
    # per-second rate of matching log lines over the last minute stays above
    # zero for 5 minutes. Every `rate(...)` needs a `[1m]` range on its log
    # selector; a rule without one is not a valid LogQL metric query.
    # NOTE(review): `${labels.kubernetes_labels_app}` is not a template
    # parameter name, so it presumably reaches Loki verbatim — confirm whether
    # `{{ $labels.kubernetes_labels_app }}` was intended.
    rhobs-logs-ocm-alerts: |-
      "groups":
      - "interval": "1m"
        "name": "uhc-stage-logs-based-alerts"
        "rules":
        - "alert": "UHC Server Errors"
          "annotations":
            "summary": "${labels.kubernetes_labels_app} is returning server-side errors"
          "expr": "sum(rate({kubernetes_namespace_name=\"uhc-stage\"} |= `returning http 500` | json | line_format \"{{ .message }}\" [1m])) > 0"
          "for": "5m"
          "labels":
            "namespace": "uhc-stage"
            "service": "${labels.kubernetes_labels_app}"
            "severity": "warn"
        - "alert": "UHC Nil Reference Errors - Stage"
          "annotations":
            "summary": "${labels.kubernetes_labels_app} is throwing nil reference errors"
          "expr": "sum(rate({kubernetes_namespace_name=\"uhc-stage\"} |= `nil reference` | json | line_format \"{{ .message }}\" [1m])) > 0"
          "for": "5m"
          "labels":
            "namespace": "uhc-stage"
            "service": "${labels.kubernetes_labels_app}"
            "severity": "warn"
        - "alert": "UHC Panic Errors - Stage"
          "annotations":
            "summary": "${labels.kubernetes_labels_app} is throwing panic errors"
          "expr": "sum(rate({kubernetes_namespace_name=\"uhc-stage\"} |= `panic` | json | line_format \"{{ .message }}\" [1m])) > 0"
          "for": "5m"
          "labels":
            "namespace": "uhc-stage"
            "service": "${labels.kubernetes_labels_app}"
            "severity": "warn"
parameters:
- name: NAMESPACE
value: observatorium-logs
- name: ALERTMANAGER_NAMESPACE
value: observatorium-mst-stage
- name: ALERTMANAGER_EXTERNAL_URL
value: https://observatorium-alertmanager-mst.api.stage.openshift.com
- name: STORAGE_CLASS
value: gp2
- name: LOKI_IMAGE_TAG
Expand Down Expand Up @@ -2048,6 +2339,16 @@ parameters:
value: 600Mi
- name: LOKI_QUERY_FRONTEND_MEMORY_LIMITS
value: 1200Mi
# Sizing defaults for the Loki ruler StatefulSet: replica count, then CPU and
# memory requests/limits for the ruler container.
# The replica count is quoted because template parameter values are strings.
- name: LOKI_RULER_REPLICAS
value: "2"
- name: LOKI_RULER_CPU_REQUESTS
value: 1000m
- name: LOKI_RULER_CPU_LIMITS
value: 2000m
- name: LOKI_RULER_MEMORY_REQUESTS
value: 4Gi
- name: LOKI_RULER_MEMORY_LIMITS
value: 6Gi
- name: LOKI_REPLICATION_FACTOR
value: "2"
- name: LOKI_QUERIER_MAX_CONCURRENCY
Expand Down Expand Up @@ -2088,6 +2389,8 @@ parameters:
value: 10Gi
- name: LOKI_INGESTER_PVC_REQUEST
value: 150Gi
# Storage requested by each ruler replica's `storage` volume claim.
- name: LOKI_RULER_PVC_REQUEST
value: 10Gi
- name: JAEGER_COLLECTOR_NAMESPACE
value: observatorium
- name: JAEGER_AGENT_IMAGE
Expand Down
1 change: 1 addition & 0 deletions resources/services/observatorium-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1027,6 +1027,7 @@ objects:
- --logs.read.endpoint=http://observatorium-loki-query-frontend-http.${OBSERVATORIUM_LOGS_NAMESPACE}.svc.cluster.local:3100
- --logs.tail.endpoint=http://observatorium-loki-querier-http.${OBSERVATORIUM_LOGS_NAMESPACE}.svc.cluster.local:3100
- --logs.write.endpoint=http://observatorium-loki-distributor-http.${OBSERVATORIUM_LOGS_NAMESPACE}.svc.cluster.local:3100
- --logs.rules.endpoint=http://observatorium-loki-ruler-http.${OBSERVATORIUM_LOGS_NAMESPACE}.svc.cluster.local:3100
- --traces.write.endpoint=observatorium-otel-collector-headless.${OBSERVATORIUM_TRACES_NAMESPACE}.svc.cluster.local:4317
- --grpc.listen=0.0.0.0:8090
- --experimental.traces.read.endpoint-template=http://observatorium-jaeger-{tenant}-query.${OBSERVATORIUM_TRACES_NAMESPACE}.svc.cluster.local:16686/
Expand Down
3 changes: 3 additions & 0 deletions resources/services/observatorium-traces-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ objects:
- "routing"
receivers:
- "otlp"
telemetry:
metrics:
level: "basic"
image: ${OPENTELEMETRY_COLLECTOR_IMAGE}:${OPENTELEMETRY_COLLECTOR_IMAGE_TAG}
mode: deployment
parameters:
Expand Down
Loading

0 comments on commit 1d6dc77

Please sign in to comment.