Skip to content

Commit

Permalink
feat(alerting): magic alerting severities (#127)
Browse files Browse the repository at this point in the history
* feat(alerting): magic alerting hocus pocus

Signed-off-by: Hy3n4 <[email protected]>

* feat(alert-severities): config like this?

Signed-off-by: Robin Opletal <[email protected]>

* fix(alert-severities): remove NoSlo severity

Signed-off-by: Robin Opletal <[email protected]>

* stylistic fixes

Signed-off-by: Robin Opletal <[email protected]>

* fix(config): nicer Cfg initialization

Signed-off-by: Robin Opletal <[email protected]>

* stop hardcoding 1-target in alertmanagerrule

Signed-off-by: Robin Opletal <[email protected]>

* feat(make): make wait for services better

when deploying dev infrastructure

Signed-off-by: Hy3n4 <[email protected]>

* feat(alerting): alertmanager config loader edits (#130)

* feat(alerting): alertmanager config loader

Signed-off-by: Robin Opletal <[email protected]>

* generated stuff...

Signed-off-by: Robin Opletal <[email protected]>

---------

Signed-off-by: Robin Opletal <[email protected]>

* chore: add release-drafter gh action (#133)

Signed-off-by: Jose Santorum <[email protected]>

* feat(magic): some features some fixes

mainly fixed duration issues
also added some features to devel, osko dashboards, kustomize for easier
deployment and other cool stuff
modified the function responsible for creating the alerting rule when
magicAlerting is enabled, and added some basic mapping for opsgenie,
pagerduty and a custom alerting tool. Currently not working as expected,
though

Signed-off-by: Hy3n4 <[email protected]>

* fix(rules): duplicated rules for 5m window

Signed-off-by: Hy3n4 <[email protected]>

* prometheus helper: simplify unique windows, resolve finalizer API warnings

Signed-off-by: Robin Opletal <[email protected]>

* at least move uniqueStrings to function

Signed-off-by: Robin Opletal <[email protected]>

---------

Signed-off-by: Hy3n4 <[email protected]>
Signed-off-by: Robin Opletal <[email protected]>
Signed-off-by: Jose Santorum <[email protected]>
Co-authored-by: Hy3n4 <[email protected]>
Co-authored-by: Jose Santorum <[email protected]>
  • Loading branch information
3 people authored Dec 27, 2024
1 parent 83c5b03 commit 5ff41f5
Show file tree
Hide file tree
Showing 19 changed files with 981 additions and 75 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ $(ENVTEST): $(LOCALBIN)

.PHONY: deploydev
deploydev:
@$(KUBECTL) apply -R -f devel/
@$(KUBECTL) apply -k devel/
@echo "Waiting for services to come online for the port-forwards..."
@until [ "$$($(KUBECTL) get pods -l app=grafana -o jsonpath='{.items}')}" != "[]" ] && \
[ "$$($(KUBECTL) get pods -l app=grafana -o jsonpath='{.items[0].status.containerStatuses[0].ready}')" == "true" ]; do \
Expand Down
15 changes: 8 additions & 7 deletions api/osko/v1alpha1/mimirrule_types.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package v1alpha1

import (
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"github.com/prometheus/common/model"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
Expand Down Expand Up @@ -33,13 +34,13 @@ type RuleGroup struct {
}

type Rule struct {
Record string `json:"record,omitempty"`
Alert string `json:"alert,omitempty"`
Expr string `json:"expr"`
For model.Duration `json:"for,omitempty"`
KeepFiringFor model.Duration `json:"keep_firing_for,omitempty"`
Labels map[string]string `json:"labels,omitempty"`
Annotations map[string]string `json:"annotations,omitempty"`
Record string `json:"record,omitempty"`
Alert string `json:"alert,omitempty"`
Expr string `json:"expr"`
For *monitoringv1.Duration `json:"for,omitempty"`
KeepFiringFor model.Duration `json:"keep_firing_for,omitempty"`
Labels map[string]string `json:"labels,omitempty"`
Annotations map[string]string `json:"annotations,omitempty"`
}

//+kubebuilder:object:root=true
Expand Down
6 changes: 6 additions & 0 deletions api/osko/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ func main() {
opts.BindFlags(flag.CommandLine)

flag.Parse()
cfg := config.NewConfig()
config.NewConfig()

ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))

Expand Down Expand Up @@ -141,7 +141,7 @@ func main() {
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor("mimirrule-controller"),
RequeueAfterPeriod: cfg.MimirRuleRequeuePeriod,
RequeueAfterPeriod: config.Cfg.MimirRuleRequeuePeriod,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "MimirRule")
os.Exit(1)
Expand Down
10 changes: 5 additions & 5 deletions config/crd/bases/osko.dev_mimirrules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,11 @@ spec:
type: string
for:
description: |-
Duration wraps time.Duration. It is used to parse the custom duration format
from YAML.
This type should not propagate beyond the scope of input/output processing.
format: int64
type: integer
Duration is a valid time duration that can be parsed by Prometheus model.ParseDuration() function.
Supported units: y, w, d, h, m, s, ms
Examples: `30s`, `1m`, `1h20m15s`, `15d`
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
keep_firing_for:
description: |-
Duration wraps time.Duration. It is used to parse the custom duration format
Expand Down
5 changes: 4 additions & 1 deletion config/samples/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
## Append samples of your project ##
resources:
- osko_v1alpha1_alertmanagerconfig.yaml
- openslo_v1_datasource.yaml
- openslo_v1_slo.yaml
- config_secret.yaml
- osko_v1alpha1_alertmanagerconfig.yaml
# +kubebuilder:scaffold:manifestskustomizesamples
2 changes: 0 additions & 2 deletions config/samples/openslo_v1_datasource.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,8 @@ spec:
description: Mimir Datasource for logging tenant
type: mimir
connectionDetails:
address: https://mimir.monitoring.dev.heu.group/
address: http://localhost:9009/
sourceTenants:
- gatekeeper-system
targetTenant: gatekeeper-system
- monitoring
targetTenant: monitoring
28 changes: 14 additions & 14 deletions config/samples/openslo_v1_slo.yaml
Original file line number Diff line number Diff line change
@@ -1,39 +1,39 @@
apiVersion: openslo.com/v1
kind: SLO
metadata:
name: mimir-ingestion-latency
labels:
label.osko.dev/team: "infrastructure"
label.osko.dev/system: "gatekeeper"
label.osko.dev/domain: "security"
label.osko.dev/team: "infra"
label.osko.dev/system: "monitoring"
label.osko.dev/domain: "observability"
label.osko.dev/service: "mimir"
annotations:
osko.dev/datasourceRef: "mimir-infra-ds"
osko.dev/magicAlerting: "true"
name: gatekeeper-webhook-response-time
spec:
budgetingMethod: Occurrences
description: 99% of Gatekeeper webhook requests return in less than 0.5s
description: 95% of all queries should have a latency of less than 300 milliseconds
indicator:
metadata:
name: gatekeeper-webhook-less-than-05s
name: distributor-query-success-latency
spec:
description: 99% of Gatekeeper webhook requests return in less than 0.5s
description: 95% of all queries should have a latency of less than 500 milliseconds
ratioMetric:
good:
metricSource:
metricSourceRef: mimir-infra-ds
type: Mimir
spec:
query: controller_runtime_webhook_latency_seconds_bucket{le="0.5", job="gatekeeper-metrics"}
query: cortex_distributor_query_duration_seconds_bucket{le="0.5", method="Distributor.QueryStream", status_code="200"}
total:
metricSource:
metricSourceRef: mimir-infra-ds
type: Mimir
spec:
query: controller_runtime_webhook_latency_seconds_count{job="gatekeeper-metrics"}
query: cortex_distributor_query_duration_seconds_count{method="Distributor.QueryStream"}
objectives:
- displayName: gatekeeper-webhook-less-than-05s
target: '0.99'
service: testing
- target: "0.99"
service: mimir
timeWindow:
- duration: 28d
isRolling: true
- duration: 28d
isRolling: true
11 changes: 10 additions & 1 deletion devel/grafana-agent/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,19 @@ data:
}
}
}
prometheus.scrape "static" {
prometheus.relabel "cluster" {
rule {
target_label = "cluster"
replacement = "local"
}
forward_to = [
prometheus.remote_write.local.receiver,
]
}
prometheus.scrape "static" {
forward_to = [
prometheus.relabel.cluster.receiver,
]
targets = [
{
"__address__" = "mimir-service:9009",
Expand Down
13 changes: 8 additions & 5 deletions devel/grafana-agent/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ spec:
- name: grafana-agent
image: grafana/agent:latest
args:
- 'run'
- '/etc/agent/agent.river'
- '--storage.path=/tmp/agent'
- '--server.http.listen-addr=127.0.0.1:80'
- '--server.http.ui-path-prefix=/'
- "run"
- "/etc/agent/agent.river"
- "--storage.path=/tmp/agent"
- "--server.http.listen-addr=127.0.0.1:12345"
- "--server.http.ui-path-prefix=/"
volumeMounts:
- name: config-volume
mountPath: /etc/agent
Expand All @@ -31,6 +31,9 @@ spec:
valueFrom:
fieldRef:
fieldPath: spec.nodeName
ports:
- containerPort: 12345
name: http-agent

volumes:
- name: config-volume
Expand Down
Loading

0 comments on commit 5ff41f5

Please sign in to comment.