Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Loki ruler and static rules for tenant OCM #331

Merged
merged 1 commit into from
Sep 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions jsonnetfile.lock.json
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,8 @@
"subdir": "jsonnet/lib"
}
},
"version": "4d5fe09aaab876288355a1fbe5e3eed43734a0d0",
"sum": "o0dOrsTJx4Lt/F0TA/4Pg/wipWPWOwhtxCJ6lHxzvjY=",
"version": "a5834effa711ab261558f489407223e4b3e166cb",
"sum": "ZJlzUf7NGTHUr2RtytxG3i8z7DfQJ3gPtV89hnDNjGE=",
"name": "observatorium-api"
},
{
Expand All @@ -170,8 +170,8 @@
"subdir": "configuration"
}
},
"version": "39bd3b6e85614d09607c41c575a477b9cfd78569",
"sum": "uiddp3srcX6NncnnYYNY6lYEy7oyRBz6Izf8qGfUKnY="
"version": "36071db2f584b7cfb7932946dc9336873c363f09",
"sum": "xKdQFAsNSYo1lK+2HVOSVcyG9sjEduHVbWhxIe5eJ6o="
},
{
"source": {
Expand Down
303 changes: 303 additions & 0 deletions resources/services/observatorium-logs-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -673,6 +673,26 @@ objects:
"update_interval": "1m"
"query_scheduler":
"max_outstanding_requests_per_tenant": 256
"ruler":
"alertmanager_refresh_interval": "1m"
"alertmanager_url": "http://_web._tcp.observatorium-alertmanager.${ALERTMANAGER_NAMESPACE}.svc.cluster.local"
"enable_alertmanager_discovery": true
"enable_alertmanager_v2": true
"enable_api": true
"enable_sharding": true
"ring":
"kvstore":
"store": "memberlist"
"rule_path": "/data"
"storage":
"local":
"directory": "/tmp/rules"
"type": "local"
"wal":
"dir": "/data/loki/wal"
"max_age": "4h"
"min_age": "5m"
"truncate_frequency": "60m"
"schema_config":
"configs":
- "from": "2020-10-01"
Expand Down Expand Up @@ -1969,9 +1989,280 @@ objects:
app.kubernetes.io/instance: observatorium
app.kubernetes.io/name: loki
app.kubernetes.io/part-of: observatorium
# Headless Service (clusterIP: None) for the Loki ruler's gRPC port.
# The ruler StatefulSet names this Service as its serviceName.
- apiVersion: v1
  kind: Service
  metadata:
    name: observatorium-loki-ruler-grpc
    labels:
      app.kubernetes.io/component: ruler
      app.kubernetes.io/instance: observatorium
      app.kubernetes.io/name: loki
      app.kubernetes.io/part-of: observatorium
      app.kubernetes.io/version: ${LOKI_IMAGE_TAG}
  spec:
    clusterIP: None
    # Selects the ruler pods by the same four labels the StatefulSet sets.
    selector:
      app.kubernetes.io/component: ruler
      app.kubernetes.io/instance: observatorium
      app.kubernetes.io/name: loki
      app.kubernetes.io/part-of: observatorium
    ports:
    - name: grpc
      port: 9095
      targetPort: 9095
# Service for the Loki ruler's HTTP port (3100). The port is named "metrics",
# which is the port name the ruler ServiceMonitor scrapes.
- apiVersion: v1
  kind: Service
  metadata:
    name: observatorium-loki-ruler-http
    labels:
      app.kubernetes.io/component: ruler
      app.kubernetes.io/instance: observatorium
      app.kubernetes.io/name: loki
      app.kubernetes.io/part-of: observatorium
      app.kubernetes.io/version: ${LOKI_IMAGE_TAG}
  spec:
    selector:
      app.kubernetes.io/component: ruler
      app.kubernetes.io/instance: observatorium
      app.kubernetes.io/name: loki
      app.kubernetes.io/part-of: observatorium
    ports:
    - name: metrics
      port: 3100
      targetPort: 3100
# ServiceMonitor that scrapes the "metrics" port of Services labelled as the
# Loki ruler, restricted to the ${NAMESPACE} namespace.
- apiVersion: monitoring.coreos.com/v1
  kind: ServiceMonitor
  metadata:
    name: observatorium-loki-ruler
    labels:
      app.kubernetes.io/instance: observatorium
      app.kubernetes.io/name: loki
      app.kubernetes.io/part-of: observatorium
      prometheus: app-sre
  spec:
    namespaceSelector:
      matchNames:
      - ${NAMESPACE}
    selector:
      matchLabels:
        app.kubernetes.io/component: ruler
        app.kubernetes.io/instance: observatorium
        app.kubernetes.io/name: loki
        app.kubernetes.io/part-of: observatorium
    endpoints:
    - port: metrics
# StatefulSet running Loki with -target=ruler plus a jaeger-agent sidecar.
# Rule files are mounted from the observatorium-loki-rules ConfigMap at
# /tmp/rules; /data is backed by a per-replica PersistentVolumeClaim.
- apiVersion: apps/v1
  kind: StatefulSet
  metadata:
    labels:
      app.kubernetes.io/component: ruler
      app.kubernetes.io/instance: observatorium
      app.kubernetes.io/name: loki
      app.kubernetes.io/part-of: observatorium
      app.kubernetes.io/version: ${LOKI_IMAGE_TAG}
    name: observatorium-loki-ruler
  spec:
    # ${{...}} substitutes the parameter as a non-string (integer) value.
    replicas: ${{LOKI_RULER_REPLICAS}}
    selector:
      matchLabels:
        app.kubernetes.io/component: ruler
        app.kubernetes.io/instance: observatorium
        app.kubernetes.io/name: loki
        app.kubernetes.io/part-of: observatorium
        loki.grafana.com/gossip: "true"
    serviceName: observatorium-loki-ruler-grpc
    template:
      metadata:
        labels:
          app.kubernetes.io/component: ruler
          app.kubernetes.io/instance: observatorium
          app.kubernetes.io/name: loki
          app.kubernetes.io/part-of: observatorium
          app.kubernetes.io/tracing: jaeger-agent
          loki.grafana.com/gossip: "true"
      spec:
        containers:
        - args:
          - -target=ruler
          - -config.file=/etc/loki/config/config.yaml
          - -limits.per-user-override-config=/etc/loki/config/overrides.yaml
          - -log.level=error
          # $(VAR) references are expanded by Kubernetes from the env entries below.
          - -s3.buckets=$(S3_BUCKETS)
          - -s3.region=$(S3_REGION)
          - -s3.access-key-id=$(AWS_ACCESS_KEY_ID)
          - -s3.secret-access-key=$(AWS_SECRET_ACCESS_KEY)
          - -distributor.replication-factor=1
          # No quotes around the URL: args are passed to the process verbatim
          # (no shell), so embedded double quotes would become part of the value.
          - -ruler.external.url=${ALERTMANAGER_EXTERNAL_URL}
          env:
          - name: S3_BUCKETS
            valueFrom:
              secretKeyRef:
                key: bucket
                name: ${LOKI_S3_SECRET}
          - name: S3_REGION
            valueFrom:
              secretKeyRef:
                key: aws_region
                name: ${LOKI_S3_SECRET}
          - name: AWS_ACCESS_KEY_ID
            valueFrom:
              secretKeyRef:
                key: aws_access_key_id
                name: ${LOKI_S3_SECRET}
          - name: AWS_SECRET_ACCESS_KEY
            valueFrom:
              secretKeyRef:
                key: aws_secret_access_key
                name: ${LOKI_S3_SECRET}
          image: ${LOKI_IMAGE}:${LOKI_IMAGE_TAG}
          imagePullPolicy: IfNotPresent
          livenessProbe:
            failureThreshold: 10
            httpGet:
              path: /metrics
              port: 3100
              scheme: HTTP
            periodSeconds: 30
          name: observatorium-loki-ruler
          ports:
          - containerPort: 3100
            name: metrics
          - containerPort: 9095
            name: grpc
          - containerPort: 7946
            name: gossip-ring
          readinessProbe:
            httpGet:
              path: /ready
              port: 3100
              scheme: HTTP
            initialDelaySeconds: 15
            timeoutSeconds: 1
          resources:
            limits:
              cpu: ${LOKI_RULER_CPU_LIMITS}
              memory: ${LOKI_RULER_MEMORY_LIMITS}
            requests:
              cpu: ${LOKI_RULER_CPU_REQUESTS}
              # Must match the declared template parameter LOKI_RULER_MEMORY_REQUESTS.
              memory: ${LOKI_RULER_MEMORY_REQUESTS}
          volumeMounts:
          - mountPath: /etc/loki/config/
            name: config
            readOnly: false
          - mountPath: /data
            name: storage
            readOnly: false
          - mountPath: /tmp/rules
            name: rules
            readOnly: false
        - args:
          - --reporter.grpc.host-port=dns:///jaeger-collector-headless.${JAEGER_COLLECTOR_NAMESPACE}.svc:14250
          - --reporter.type=grpc
          - --agent.tags=pod.namespace=$(NAMESPACE),pod.name=$(POD)
          env:
          - name: NAMESPACE
            valueFrom:
              fieldRef:
                fieldPath: metadata.namespace
          - name: POD
            valueFrom:
              fieldRef:
                fieldPath: metadata.name
          image: ${JAEGER_AGENT_IMAGE}:${JAEGER_AGENT_IMAGE_TAG}
          livenessProbe:
            failureThreshold: 5
            httpGet:
              path: /
              port: 14271
              scheme: HTTP
          name: jaeger-agent
          ports:
          - containerPort: 5778
            name: configs
          - containerPort: 6831
            name: jaeger-thrift
          - containerPort: 14271
            name: metrics
          resources:
            limits:
              cpu: 128m
              memory: 128Mi
            requests:
              cpu: 32m
              memory: 64Mi
        serviceAccountName: ${SERVICE_ACCOUNT_NAME}
        volumes:
        - configMap:
            name: observatorium-loki
          name: config
        - configMap:
            name: observatorium-loki-rules
          name: rules
    volumeClaimTemplates:
    - metadata:
        labels:
          app.kubernetes.io/instance: observatorium
          app.kubernetes.io/name: loki
          app.kubernetes.io/part-of: observatorium
        name: storage
      spec:
        accessModes:
        - ReadWriteOnce
        resources:
          requests:
            storage: ${LOKI_RULER_PVC_REQUEST}
        storageClassName: ${STORAGE_CLASS}
# ConfigMap holding the static Loki alerting rules for the uhc-stage namespace.
# It is mounted into the ruler pods at /tmp/rules.
# NOTE(review): the "${labels.kubernetes_labels_app}" placeholders below are
# neither a template parameter reference nor the usual "{{ $labels.<name> }}"
# alert-template form, so they may reach Alertmanager unexpanded - confirm the
# intended syntax.
- apiVersion: v1
  data:
    rhobs-logs-ocm-alerts: |-
      "groups":
      - "interval": "1m"
        "name": "uhc-stage-logs-based-alerts"
        "rules":
        - "alert": "UHC Server Errors"
          "annotations":
            "summary": "${labels.kubernetes_labels_app} is returning server-side errors"
          "expr": "sum(rate({kubernetes_namespace_name=\"uhc-stage\"} |= `returning http 500` | json | line_format \"{{ .message }}\" [1m])) > 0"
          "for": "5m"
          "labels":
            "namespace": "uhc-stage"
            "service": "${labels.kubernetes_labels_app}"
            "severity": "warn"
        - "alert": "UHC Nil Reference Errors - Stage"
          "annotations":
            "summary": "${labels.kubernetes_labels_app} is throwing nil reference errors"
          "expr": "sum(rate({kubernetes_namespace_name=\"uhc-stage\"} |= `nil reference` | json | line_format \"{{ .message }}\" [1m])) > 0"
          "for": "5m"
          "labels":
            "namespace": "uhc-stage"
            "service": "${labels.kubernetes_labels_app}"
            "severity": "warn"
        - "alert": "UHC Panic Errors - Stage"
          "annotations":
            "summary": "${labels.kubernetes_labels_app} is throwing panic errors"
          "expr": "sum(rate({kubernetes_namespace_name=\"uhc-stage\"} |= `panic` | json | line_format \"{{ .message }}\" [1m])) > 0"
          "for": "5m"
          "labels":
            "namespace": "uhc-stage"
            "service": "${labels.kubernetes_labels_app}"
            "severity": "warn"
  kind: ConfigMap
  metadata:
    annotations:
      qontract.recycle: "true"
    labels:
      app.kubernetes.io/instance: observatorium
      app.kubernetes.io/name: loki
      app.kubernetes.io/part-of: observatorium
      app.kubernetes.io/version: ${LOKI_IMAGE_TAG}
    name: observatorium-loki-rules
parameters:
- name: NAMESPACE
value: observatorium-logs
- name: ALERTMANAGER_NAMESPACE
value: observatorium-mst-stage
- name: ALERTMANAGER_EXTERNAL_URL
value: https://observatorium-alertmanager-mst.api.stage.openshift.com
- name: STORAGE_CLASS
value: gp2
- name: LOKI_IMAGE_TAG
Expand Down Expand Up @@ -2048,6 +2339,16 @@ parameters:
value: 600Mi
- name: LOKI_QUERY_FRONTEND_MEMORY_LIMITS
value: 1200Mi
- name: LOKI_RULER_REPLICAS
value: "2"
- name: LOKI_RULER_CPU_REQUESTS
value: 1000m
- name: LOKI_RULER_CPU_LIMITS
value: 2000m
- name: LOKI_RULER_MEMORY_REQUESTS
value: 4Gi
- name: LOKI_RULER_MEMORY_LIMITS
value: 6Gi
- name: LOKI_REPLICATION_FACTOR
value: "2"
- name: LOKI_QUERIER_MAX_CONCURRENCY
Expand Down Expand Up @@ -2088,6 +2389,8 @@ parameters:
value: 10Gi
- name: LOKI_INGESTER_PVC_REQUEST
value: 150Gi
- name: LOKI_RULER_PVC_REQUEST
value: 10Gi
- name: JAEGER_COLLECTOR_NAMESPACE
value: observatorium
- name: JAEGER_AGENT_IMAGE
Expand Down
1 change: 1 addition & 0 deletions resources/services/observatorium-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1019,6 +1019,7 @@ objects:
- --logs.read.endpoint=http://observatorium-loki-query-frontend-http.${OBSERVATORIUM_LOGS_NAMESPACE}.svc.cluster.local:3100
- --logs.tail.endpoint=http://observatorium-loki-querier-http.${OBSERVATORIUM_LOGS_NAMESPACE}.svc.cluster.local:3100
- --logs.write.endpoint=http://observatorium-loki-distributor-http.${OBSERVATORIUM_LOGS_NAMESPACE}.svc.cluster.local:3100
- --logs.rules.endpoint=http://observatorium-loki-ruler-http.${OBSERVATORIUM_LOGS_NAMESPACE}.svc.cluster.local:3100
- --traces.write.endpoint=observatorium-otel-collector-headless.${OBSERVATORIUM_TRACES_NAMESPACE}.svc.cluster.local:4317
- --grpc.listen=0.0.0.0:8090
- --experimental.traces.read.endpoint-template=http://observatorium-jaeger-{tenant}-query.${OBSERVATORIUM_TRACES_NAMESPACE}.svc.cluster.local:16686/
Expand Down
3 changes: 3 additions & 0 deletions resources/services/observatorium-traces-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ objects:
- "routing"
receivers:
- "otlp"
telemetry:
metrics:
level: "basic"
image: ${OPENTELEMETRY_COLLECTOR_IMAGE}:${OPENTELEMETRY_COLLECTOR_IMAGE_TAG}
mode: deployment
parameters:
Expand Down
Loading