diff --git a/.github/tools b/.github/tools
index a52be9a07..8e72847f3 100644
--- a/.github/tools
+++ b/.github/tools
@@ -8,9 +8,4 @@ operator-sdk v1.37.0
 opm v1.47.0
 promq v0.0.1
 crdoc v0.5.2
-jsonnet v0.20.0
-jsonnetfmt v0.20.0
-jsonnet-lint v0.20.0
-jb v0.5.1
-gojsontoyaml v0.1.0
 shellcheck 0.10.0
diff --git a/Makefile b/Makefile
index 706491e4f..1d3015ea4 100644
--- a/Makefile
+++ b/Makefile
@@ -24,51 +24,24 @@ test-unit:
 	go test -cover ./cmd/... ./pkg/...
 
 .PHONY: lint
-lint: lint-golang lint-jsonnet lint-shell
+lint: lint-golang lint-shell
 
 .PHONY: lint-golang
 lint-golang: $(GOLANGCI_LINT)
 	$(GOLANGCI_LINT) run ./... --fix
 
-.PHONY: lint-jsonnet
-lint-jsonnet: $(JSONNET_LINT) jsonnet-vendor
-	find jsonnet/ -name 'vendor' -prune \
-		-o -name '*.libsonnet' -print \
-		-o -name '*.jsonnet' -print \
-		| xargs -n 1 -- $(JSONNET_LINT) -J $(JSONNET_VENDOR)
+# TODO(simonpasquier): remove this after #629 merges.
+.PHONY: lint-jsonnet fmt-jsonnet
+lint-jsonnet fmt-jsonnet:
 
 .PHONY: lint-shell
 lint-shell: $(SHELLCHECK)
 	find -name "*.sh" -print0 | xargs --null $(SHELLCHECK)
 
-.PHONY: fmt-jsonnet
-fmt-jsonnet: $(JSONNETFMT) jsonnet-vendor
-	find jsonnet/ -name 'vendor' -prune \
-		-o -name '*.libsonnet' -print \
-		-o -name '*.jsonnet' -print \
-		| xargs -n 1 -- $(JSONNETFMT) $(JSONNETFMT_ARGS) -i
-
-
 .PHONY: check-jq
 check-jq:
 	jq --version > /dev/null
 
-.PHONY: jsonnet-vendor
-jsonnet-vendor: $(JB)
-	cd jsonnet && $(JB) install
-
-.PHONY: generate-prometheus-rules
-generate-prometheus-rules: jsonnet-tools check-jq kustomize jsonnet-vendor
-	for dir in jsonnet/components/*/; do \
-		component=$$(basename $$dir) ;\
-		echo "Generating prometheusrule file for $$component" ;\
-		$(JSONNET) -J $(JSONNET_VENDOR) $$dir/main.jsonnet \
-			| jq .rule \
-			| $(GOJSONTOYAML) > deploy/monitoring/monitoring-$$component-rules.yaml ;\
-		cd deploy/monitoring && \
-			$(KUSTOMIZE) edit add resource "monitoring-$$component-rules.yaml" && cd - ;\
-	done;
-
 .PHONY: docs
 docs: $(CRDOC)
 	mkdir -p docs
@@ -129,7 +102,7 @@ generate-deepcopy: $(CONTROLLER_GEN)
 	$(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./pkg/apis/..."
 
 .PHONY: generate
-generate: generate-crds generate-deepcopy generate-kustomize generate-package-resources generate-prometheus-rules docs
+generate: generate-crds generate-deepcopy generate-kustomize generate-package-resources docs
 
 .PHONY: operator
 operator: generate build
@@ -317,4 +290,4 @@ kind-cluster: $(OPERATOR_SDK)
 
 .PHONY: clean
 clean: clean-tools
-	rm -rf $(JSONNET_VENDOR) bundle/ bundle.Dockerfile
+	rm -rf bundle/ bundle.Dockerfile
diff --git a/Makefile.tools b/Makefile.tools
index 1f8ab9e78..48fd9f848 100644
--- a/Makefile.tools
+++ b/Makefile.tools
@@ -34,21 +34,6 @@ OC_VERSION = v4.8.11
 CRDOC = $(TOOLS_DIR)/crdoc
 CRDOC_VERSION = v0.5.2
 
-# jsonnet related tools and dependencies
-JSONNET = $(TOOLS_DIR)/jsonnet
-JSONNETFMT = $(TOOLS_DIR)/jsonnetfmt
-JSONNET_LINT = $(TOOLS_DIR)/jsonnet-lint
-JSONNET_VERSION = v0.20.0
-
-JB = $(TOOLS_DIR)/jb
-JB_VERSION = v0.5.1
-
-GOJSONTOYAML = $(TOOLS_DIR)/gojsontoyaml
-GOJSONTOYAML_VERSION = v0.1.0
-
-JSONNET_VENDOR = jsonnet/vendor
-JSONNETFMT_ARGS = -n 2 --max-blank-lines 2 --string-style s --comment-style s
-
 SHELLCHECK = $(TOOLS_DIR)/shellcheck
 SHELLCHECK_VERSION = 0.10.0
 
@@ -141,47 +126,6 @@ $(CRDOC) crdoc: $(TOOLS_DIR)
 		GOBIN=$(TOOLS_DIR) go install fybrik.io/crdoc@$(CRDOC_VERSION) ;\
 	}
 
-.PHONY: jsonnet
-$(JSONNET) jsonnet: $(TOOLS_DIR)
-	@{ \
-		set -ex ;\
-		[[ -f $(JSONNET) ]] && exit 0 ;\
-		GOBIN=$(TOOLS_DIR) go install github.com/google/go-jsonnet/cmd/jsonnet@$(JSONNET_VERSION) ;\
-	}
-
-
-.PHONY: jsonnetfmt
-$(JSONNETFMT) jsonnetfmt: $(TOOLS_DIR)
-	@{ \
-		set -ex ;\
-		[[ -f $(JSONNETFMT) ]] && exit 0 ;\
-		GOBIN=$(TOOLS_DIR) go install github.com/google/go-jsonnet/cmd/jsonnetfmt@$(JSONNET_VERSION) ;\
-	}
-
-.PHONY: jsonnet-lint
-$(JSONNET_LINT) jsonnet-lint: $(TOOLS_DIR)
-	@{ \
-		set -ex ;\
-		[[ -f $(JSONNET_LINT) ]] && exit 0 ;\
-		GOBIN=$(TOOLS_DIR) go install github.com/google/go-jsonnet/cmd/jsonnet-lint@$(JSONNET_VERSION) ;\
-	}
-
-.PHONY: jb
-$(JB) jb: $(TOOLS_DIR)
-	@{ \
-		set -ex ;\
-		[[ -f $(JB) ]] && exit 0 ;\
-		GOBIN=$(TOOLS_DIR) go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@$(JB_VERSION) ;\
-	}
-
-.PHONY: gojsontoyaml
-$(GOJSONTOYAML) gojsontoyaml: $(TOOLS_DIR)
-	@{ \
-		set -ex ;\
-		[[ -f $(GOJSONTOYAML) ]] && exit 0 ;\
-		GOBIN=$(TOOLS_DIR) go install github.com/brancz/gojsontoyaml@$(GOJSONTOYAML_VERSION) ;\
-	}
-
 .PHONY: shellcheck
 $(SHELLCHECK) shellcheck: $(TOOLS_DIR)
 	@{ \
@@ -196,9 +140,6 @@ $(SHELLCHECK) shellcheck: $(TOOLS_DIR)
 		$(SHELLCHECK) -V | grep -q $${version##v} ;\
 	}
 
-.PHONY: jsonnet-tools
-jsonnet-tools: jsonnet jsonnetfmt jsonnet-lint jb gojsontoyaml
-
 # Install all required tools
 .PHONY: tools
 tools: $(CONTROLLER_GEN) \
@@ -209,8 +150,7 @@ tools: $(CONTROLLER_GEN) \
 	$(PROMQ) \
 	$(CRDOC) \
 	$(GOLANGCI_LINT) \
-	$(SHELLCHECK) \
-	jsonnet-tools
+	$(SHELLCHECK)
 	@{ \
 		set -ex ;\
 		tools_file=.github/tools ;\
@@ -224,11 +164,6 @@ tools: $(CONTROLLER_GEN) \
 		echo $$(basename $(OPM)) $(OPM_VERSION) >> $$tools_file ;\
 		echo $$(basename $(PROMQ)) $(PROMQ_VERSION) >> $$tools_file ;\
 		echo $$(basename $(CRDOC)) $(CRDOC_VERSION) >> $$tools_file ; \
-		echo $$(basename $(JSONNET)) $(JSONNET_VERSION) >> $$tools_file ;\
-		echo $$(basename $(JSONNETFMT)) $(JSONNET_VERSION) >> $$tools_file ;\
-		echo $$(basename $(JSONNET_LINT)) $(JSONNET_VERSION) >> $$tools_file ;\
-		echo $$(basename $(JB)) $(JB_VERSION) >> $$tools_file ;\
-		echo $$(basename $(GOJSONTOYAML)) $(GOJSONTOYAML_VERSION) >> $$tools_file ;\
 		echo $$(basename $(SHELLCHECK)) $(SHELLCHECK_VERSION) >> $$tools_file ;\
 	}
 
@@ -246,7 +181,4 @@ validate-tools:
 	@$(OPM) version
 	@$(PROMQ) --help | head -n 2
 	@$(CRDOC) --help | head -n 3
-	@$(JSONNETFMT) --version
-	@$(JSONNET_LINT) --version
-	@$(JB) --version
 	@$(SHELLCHECK) -V | head -n 2
diff --git a/bundle/manifests/observability-operator.clusterserviceversion.yaml b/bundle/manifests/observability-operator.clusterserviceversion.yaml
index 2e0a95b2f..04216fc7b 100644
--- a/bundle/manifests/observability-operator.clusterserviceversion.yaml
+++ b/bundle/manifests/observability-operator.clusterserviceversion.yaml
@@ -42,7 +42,7 @@ metadata:
     categories: Monitoring
     certified: "false"
    containerImage: observability-operator:0.4.2
-    createdAt: "2024-11-18T09:45:14Z"
+    createdAt: "2024-11-19T13:38:30Z"
     description: A Go based Kubernetes operator to setup and manage highly available
      Monitoring Stack using Prometheus, Alertmanager and Thanos Querier.
     operatorframework.io/cluster-monitoring: "true"
diff --git a/deploy/monitoring/kustomization.yaml b/deploy/monitoring/kustomization.yaml
index 3c61e4146..5d41193d7 100644
--- a/deploy/monitoring/kustomization.yaml
+++ b/deploy/monitoring/kustomization.yaml
@@ -2,6 +2,3 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
 - observability-operator-rules.yaml
-- monitoring-alertmanager-rules.yaml
-- monitoring-prometheus-rules.yaml
-- monitoring-prometheus-operator-rules.yaml
diff --git a/deploy/monitoring/monitoring-alertmanager-rules.yaml b/deploy/monitoring/monitoring-alertmanager-rules.yaml
deleted file mode 100644
index e103f279d..000000000
--- a/deploy/monitoring/monitoring-alertmanager-rules.yaml
+++ /dev/null
@@ -1,128 +0,0 @@
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  labels:
-    app.kubernetes.io/component: operator
-    app.kubernetes.io/name: observability-operator-alertmanager-rules
-    app.kubernetes.io/part-of: observability-operator
-    prometheus: k8s
-    role: alert-rules
-  name: observability-operator-alertmanager-rules
-spec:
-  groups:
-  - name: alertmanager.rules
-    rules:
-    - alert: AlertmanagerFailedReload
-      annotations:
-        description: Configuration has failed to load for {{$labels.instance}}.
-        summary: Reloading an Alertmanager configuration has failed.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        max_over_time(alertmanager_config_last_reload_successful{job="alertmanager"}[5m]) == 0
-      for: 10m
-      labels:
-        severity: critical
-    - alert: AlertmanagerMembersInconsistent
-      annotations:
-        description: Alertmanager {{$labels.instance}} has only found {{ $value }} members of the {{$labels.job}} cluster.
-        summary: A member of an Alertmanager cluster has not found all other cluster members.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        max_over_time(alertmanager_cluster_members{job="alertmanager"}[5m])
-          < on (job) group_left
-        count by (job) (max_over_time(alertmanager_cluster_members{job="alertmanager"}[5m]))
-      for: 15m
-      labels:
-        severity: critical
-    - alert: AlertmanagerFailedToSendAlerts
-      annotations:
-        description: Alertmanager {{$labels.instance}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
-        summary: An Alertmanager instance failed to send notifications.
-      expr: |
-        (
-          rate(alertmanager_notifications_failed_total{job="alertmanager"}[5m])
-        /
-          rate(alertmanager_notifications_total{job="alertmanager"}[5m])
-        )
-        > 0.01
-      for: 5m
-      labels:
-        severity: warning
-    - alert: AlertmanagerClusterFailedToSendAlerts
-      annotations:
-        description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
-        summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
-      expr: |
-        min by (job, integration) (
-          rate(alertmanager_notifications_failed_total{job="alertmanager", integration=~`.*`}[5m])
-        /
-          rate(alertmanager_notifications_total{job="alertmanager", integration=~`.*`}[5m])
-        )
-        > 0.01
-      for: 5m
-      labels:
-        severity: critical
-    - alert: AlertmanagerClusterFailedToSendAlerts
-      annotations:
-        description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
-        summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
-      expr: |
-        min by (job, integration) (
-          rate(alertmanager_notifications_failed_total{job="alertmanager", integration!~`.*`}[5m])
-        /
-          rate(alertmanager_notifications_total{job="alertmanager", integration!~`.*`}[5m])
-        )
-        > 0.01
-      for: 5m
-      labels:
-        severity: warning
-    - alert: AlertmanagerConfigInconsistent
-      annotations:
-        description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
-        summary: Alertmanager instances within the same cluster have different configurations.
-      expr: |
-        count by (job) (
-          count_values by (job) ("config_hash", alertmanager_config_hash{job="alertmanager"})
-        )
-        != 1
-      for: 20m
-      labels:
-        severity: critical
-    - alert: AlertmanagerClusterDown
-      annotations:
-        description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
-        summary: Half or more of the Alertmanager instances within the same cluster are down.
-      expr: |
-        (
-          count by (job) (
-            avg_over_time(up{job="alertmanager"}[5m]) < 0.5
-          )
-        /
-          count by (job) (
-            up{job="alertmanager"}
-          )
-        )
-        >= 0.5
-      for: 5m
-      labels:
-        severity: critical
-    - alert: AlertmanagerClusterCrashlooping
-      annotations:
-        description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
-        summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
-      expr: |
-        (
-          count by (job) (
-            changes(process_start_time_seconds{job="alertmanager"}[10m]) > 4
-          )
-        /
-          count by (job) (
-            up{job="alertmanager"}
-          )
-        )
-        >= 0.5
-      for: 5m
-      labels:
-        severity: critical
diff --git a/deploy/monitoring/monitoring-prometheus-operator-rules.yaml b/deploy/monitoring/monitoring-prometheus-operator-rules.yaml
deleted file mode 100644
index a2b5a5f0e..000000000
--- a/deploy/monitoring/monitoring-prometheus-operator-rules.yaml
+++ /dev/null
@@ -1,90 +0,0 @@
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  labels:
-    app.kubernetes.io/component: operator
-    app.kubernetes.io/name: observability-operator-prometheus-operator-rules
-    app.kubernetes.io/part-of: observability-operator
-    prometheus: k8s
-    role: alert-rules
-  name: observability-operator-prometheus-operator-rules
-spec:
-  groups:
-  - name: prometheus-operator
-    rules:
-    - alert: PrometheusOperatorListErrors
-      annotations:
-        description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
-        summary: Errors while performing list operations in controller.
-      expr: |
-        (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator"}[10m]))) > 0.4
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusOperatorWatchErrors
-      annotations:
-        description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
-        summary: Errors while performing watch operations in controller.
-      expr: |
-        (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator"}[10m]))) > 0.4
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusOperatorSyncFailed
-      annotations:
-        description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.
-        summary: Last controller reconciliation failed
-      expr: |
-        min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator"}[5m]) > 0
-      for: 10m
-      labels:
-        severity: warning
-    - alert: PrometheusOperatorReconcileErrors
-      annotations:
-        description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
-        summary: Errors while reconciling controller.
-      expr: |
-        (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1
-      for: 10m
-      labels:
-        severity: warning
-    - alert: PrometheusOperatorNodeLookupErrors
-      annotations:
-        description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
-        summary: Errors while reconciling Prometheus.
-      expr: |
-        rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1
-      for: 10m
-      labels:
-        severity: warning
-    - alert: PrometheusOperatorNotReady
-      annotations:
-        description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
-        summary: Prometheus operator not ready
-      expr: |
-        min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator"}[5m]) == 0)
-      for: 5m
-      labels:
-        severity: warning
-    - alert: PrometheusOperatorRejectedResources
-      annotations:
-        description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.
-        summary: Resources rejected by Prometheus operator
-      expr: |
-        min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator"}[5m]) > 0
-      for: 5m
-      labels:
-        severity: warning
-  - name: config-reloaders
-    rules:
-    - alert: ConfigReloaderSidecarErrors
-      annotations:
-        description: |-
-          Errors encountered while the {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}} namespace.
-          As a result, configuration for service running in {{$labels.pod}} may be stale and cannot be updated anymore.
-        summary: config-reloader sidecar has not had a successful reload for 10m
-      expr: |
-        max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
-      for: 10m
-      labels:
-        severity: warning
diff --git a/deploy/monitoring/monitoring-prometheus-rules.yaml b/deploy/monitoring/monitoring-prometheus-rules.yaml
deleted file mode 100644
index 6c48065d8..000000000
--- a/deploy/monitoring/monitoring-prometheus-rules.yaml
+++ /dev/null
@@ -1,229 +0,0 @@
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  labels:
-    app.kubernetes.io/component: operator
-    app.kubernetes.io/name: observability-operator-prometheus-rules
-    app.kubernetes.io/part-of: observability-operator
-    prometheus: k8s
-    role: alert-rules
-  name: observability-operator-prometheus-rules
-spec:
-  groups:
-  - name: prometheus
-    rules:
-    - alert: PrometheusBadConfig
-      annotations:
-        description: Prometheus {{$labels.instance}} has failed to reload its configuration.
-        summary: Failed Prometheus configuration reload.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        max_over_time(prometheus_config_last_reload_successful{job="prometheus"}[5m]) == 0
-      for: 10m
-      labels:
-        severity: critical
-    - alert: PrometheusNotificationQueueRunningFull
-      annotations:
-        description: Alert notification queue of Prometheus {{$labels.instance}} is running full.
-        summary: Prometheus alert notification queue predicted to run full in less than 30m.
-      expr: |
-        # Without min_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        (
-          predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30)
-        >
-          min_over_time(prometheus_notifications_queue_capacity{job="prometheus"}[5m])
-        )
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
-      annotations:
-        description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.'
-        summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
-      expr: |
-        (
-          rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
-        /
-          rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
-        )
-        * 100
-        > 1
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusNotConnectedToAlertmanagers
-      annotations:
-        description: Prometheus {{$labels.instance}} is not connected to any Alertmanagers.
-        summary: Prometheus is not connected to any Alertmanagers.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus"}[5m]) < 1
-      for: 10m
-      labels:
-        severity: warning
-    - alert: PrometheusTSDBReloadsFailing
-      annotations:
-        description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} reload failures over the last 3h.
-        summary: Prometheus has issues reloading blocks from disk.
-      expr: |
-        increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0
-      for: 4h
-      labels:
-        severity: warning
-    - alert: PrometheusTSDBCompactionsFailing
-      annotations:
-        description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} compaction failures over the last 3h.
-        summary: Prometheus has issues compacting blocks.
-      expr: |
-        increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0
-      for: 4h
-      labels:
-        severity: warning
-    - alert: PrometheusNotIngestingSamples
-      annotations:
-        description: Prometheus {{$labels.instance}} is not ingesting samples.
-        summary: Prometheus is not ingesting samples.
-      expr: |
-        (
-          rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0
-        and
-          (
-            sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus"}) > 0
-          or
-            sum without(rule_group) (prometheus_rule_group_rules{job="prometheus"}) > 0
-          )
-        )
-      for: 10m
-      labels:
-        severity: warning
-    - alert: PrometheusDuplicateTimestamps
-      annotations:
-        description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
-        summary: Prometheus is dropping samples with duplicate timestamps.
-      expr: |
-        rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
-      for: 10m
-      labels:
-        severity: warning
-    - alert: PrometheusOutOfOrderTimestamps
-      annotations:
-        description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
-        summary: Prometheus drops samples with out-of-order timestamps.
-      expr: |
-        rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0
-      for: 10m
-      labels:
-        severity: warning
-    - alert: PrometheusRemoteStorageFailures
-      annotations:
-        description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
-        summary: Prometheus fails to send samples to remote storage.
-      expr: |
-        (
-          (rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m]))
-        /
-          (
-            (rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m]))
-          +
-            (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus"}[5m]))
-          )
-        )
-        * 100
-        > 1
-      for: 15m
-      labels:
-        severity: critical
-    - alert: PrometheusRemoteWriteBehind
-      annotations:
-        description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
-        summary: Prometheus remote write is behind.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        (
-          max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus"}[5m])
-        - ignoring(remote_name, url) group_right
-          max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus"}[5m])
-        )
-        > 120
-      for: 15m
-      labels:
-        severity: critical
-    - alert: PrometheusRemoteWriteDesiredShards
-      annotations:
-        description: Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}` $labels.instance | query | first | value }}.
-        summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
-      expr: |
-        # Without max_over_time, failed scrapes could create false negatives, see
-        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        (
-          max_over_time(prometheus_remote_storage_shards_desired{job="prometheus"}[5m])
-        >
-          max_over_time(prometheus_remote_storage_shards_max{job="prometheus"}[5m])
-        )
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusRuleFailures
-      annotations:
-        description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
-        summary: Prometheus is failing rule evaluations.
-      expr: |
-        increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0
-      for: 15m
-      labels:
-        severity: critical
-    - alert: PrometheusMissingRuleEvaluations
-      annotations:
-        description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
-        summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
-      expr: |
-        increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusTargetLimitHit
-      annotations:
-        description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
-        summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
-      expr: |
-        increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusLabelLimitHit
-      annotations:
-        description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.
-        summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
-      expr: |
-        increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus"}[5m]) > 0
-      for: 15m
-      labels:
-        severity: warning
-    - alert: PrometheusTargetSyncFailure
-      annotations:
-        description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.instance}} have failed to sync because invalid configuration was supplied.'
-        summary: Prometheus has failed to sync targets.
-      expr: |
-        increase(prometheus_target_sync_failed_total{job="prometheus"}[30m]) > 0
-      for: 5m
-      labels:
-        severity: critical
-    - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
-      annotations:
-        description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.'
-        summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
-      expr: |
-        min without (alertmanager) (
-          rate(prometheus_notifications_errors_total{job="prometheus",alertmanager!~``}[5m])
-        /
-          rate(prometheus_notifications_sent_total{job="prometheus",alertmanager!~``}[5m])
-        )
-        * 100
-        > 3
-      for: 15m
-      labels:
-        severity: critical
diff --git a/deploy/monitoring/observability-operator-rules.yaml b/deploy/monitoring/observability-operator-rules.yaml
index fedaf4d14..b4b3ff499 100644
--- a/deploy/monitoring/observability-operator-rules.yaml
+++ b/deploy/monitoring/observability-operator-rules.yaml
@@ -1,54 +1,28 @@
 ---
-# NOTE: For PrometheusRule ObO uses platform monitoring in OpenShift
-# instead of the forked PO's CRDs. As the ObO bundle ships the forked PO's CRD,
-# these won't be available at the time OLM installs the bundle
-
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
   labels:
     app.kubernetes.io/component: operator
-    app.kubernetes.io/name: observability-operator-rules
+    app.kubernetes.io/name: observability-operator
     app.kubernetes.io/part-of: observability-operator
-    prometheus: k8s
-    role: alert-rules
-  name: observability-operator-rules
+    openshift.io/user-monitoring: "false"
+  name: observability-operator
 spec:
   groups:
-  - name: observability-operator.rules
+  - name: operator
     rules:
-    - alert: ObservabilityOperatorReconcileErrors
+    - alert: ClusterObservabilityOperatorReconciliationsFailed
       annotations:
-        description: |
-          Observability Operator controller - {{ $labels.controller }} fails to reconcile.
-          Inspect the observability-operator log for potential root causes.
-        summary: Observability Operator controller - {{ $labels.controller }} fails to reconcile
-      expr: |
-        increase(controller_runtime_reconcile_errors_total{job="observability-operator"}[15m]) > 0
+        description: |-
+          {{$value | humanize}}% of reconciliation requests are failing for the '{{ $labels.controller}}' controller.
+
+          Check the logs of the {{$labels.namespace}}/{{$labels.pod}} pod to investigate further.
+        summary: Cluster observability operator fails to reconcile resources
+      expr: |-
+        sum by(controller,pod,namespace) (rate(controller_runtime_reconcile_total{result="error",job="observability-operator"}[5m]))
+        /
+        sum by(controller,pod,namespace) (rate(controller_runtime_reconcile_total{job="observability-operator"}[5m])) > 0.1
       for: 15m
       labels:
         severity: warning
-    - alert: ObservabilityOperatorReconcileLongerThan10Min
-      annotations:
-        description: |
-          Observability Operator controller reconcilation takes longer than 10 minutes for the controller - {{ $labels.controller }}.
-          Inspect the observability-operator log for potential root causes.
-        summary: Observability Operator controller - {{ $labels.controller }} reconcilation takes too long to reconcile
-      expr: |
-        rate(controller_runtime_reconcile_time_seconds_sum{job="observability-operator"}[5m]) /
-        rate(controller_runtime_reconcile_time_seconds_count{job="observability-operator"}[5m])
-        > 600
-      for: 10m
-      labels:
-        severity: warning
-    - alert: ObservabilityOperatorBacklogNotDrained
-      annotations:
-        description: |
-          The backlog of Observability Operator controller - {{ $labels.name }} is not getting drained; an indication that reconcile loop may be stuck
-          Inspect the observability-operator log for potential root causes.
-        summary: Observability Operator controller - {{ $labels.name }} backlog is not being drained.
-      expr: |
-        rate(workqueue_depth{job="observability-operator"}[15m]) > 0
-      for: 15m
-      labels:
-        severity: critical
diff --git a/jsonnet/OWNERS b/jsonnet/OWNERS
deleted file mode 100644
index da9ef2be8..000000000
--- a/jsonnet/OWNERS
+++ /dev/null
@@ -1,4 +0,0 @@
-approvers:
-  - contributors
-reviewers:
-  - contributors
diff --git a/jsonnet/common.libsonnet b/jsonnet/common.libsonnet
deleted file mode 100644
index 5c1c107b2..000000000
--- a/jsonnet/common.libsonnet
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-  // hidden k namespace for this library
-  k:: {
-    prometheusrule: {
-      new(name, labels, rules): {
-        apiVersion: 'monitoring.coreos.com/v1',
-        kind: 'PrometheusRule',
-        metadata: {
-          labels: labels,
-          name: name,
-        },
-        spec: rules,
-      },
-    },
-  },
-}
diff --git a/jsonnet/components/alertmanager/alertmanager-rules.jsonnet b/jsonnet/components/alertmanager/alertmanager-rules.jsonnet
deleted file mode 100644
index 338202fe8..000000000
--- a/jsonnet/components/alertmanager/alertmanager-rules.jsonnet
+++ /dev/null
@@ -1,18 +0,0 @@
-/* cannot use https://github.com/prometheus/prometheus/blob/main/documentation/prometheus-mixin/mixin.libsonnet */
-/* since it generates yaml with double quotes wrapped */
-
-local rules = (
-  import 'github.com/prometheus/alertmanager/doc/alertmanager-mixin/mixin.libsonnet'
-).prometheusAlerts;
-
-{
-  _commonLabels:: {
-    'app.kubernetes.io/component': 'operator',
-    'app.kubernetes.io/name': 'observability-operator-alertmanager-rules',
-    'app.kubernetes.io/part-of': 'observability-operator',
-    prometheus: 'k8s',
-    role: 'alert-rules',
-  },
-
-  rule: $.k.prometheusrule.new('observability-operator-alertmanager-rules', $._commonLabels, rules),
-}
diff --git a/jsonnet/components/alertmanager/main.jsonnet b/jsonnet/components/alertmanager/main.jsonnet
deleted file mode 100644
index fc93341fe..000000000
--- a/jsonnet/components/alertmanager/main.jsonnet
+++ /dev/null
@@ -1,2 +0,0 @@
-(import '../common.libsonnet') +
-(import 'alertmanager-rules.jsonnet')
diff --git a/jsonnet/components/prometheus-operator/main.jsonnet b/jsonnet/components/prometheus-operator/main.jsonnet
deleted file mode 100644
index 49b5cbcd7..000000000
--- a/jsonnet/components/prometheus-operator/main.jsonnet
+++ /dev/null
@@ -1,2 +0,0 @@
-(import '../common.libsonnet') +
-(import 'prometheus-operator-rules.jsonnet')
diff --git a/jsonnet/components/prometheus-operator/prometheus-operator-rules.jsonnet b/jsonnet/components/prometheus-operator/prometheus-operator-rules.jsonnet
deleted file mode 100644
index 5dd617893..000000000
--- a/jsonnet/components/prometheus-operator/prometheus-operator-rules.jsonnet
+++ /dev/null
@@ -1,13 +0,0 @@
-local rules = import 'github.com/rhobs/obo-prometheus-operator/jsonnet/mixin/alerts.jsonnet';
-
-{
-  _commonLabels:: {
-    'app.kubernetes.io/component': 'operator',
-    'app.kubernetes.io/name': 'observability-operator-prometheus-operator-rules',
-    'app.kubernetes.io/part-of': 'observability-operator',
-    prometheus: 'k8s',
-    role: 'alert-rules',
-  },
-
-  rule: $.k.prometheusrule.new('observability-operator-prometheus-operator-rules', $._commonLabels, rules),
-}
diff --git a/jsonnet/components/prometheus/main.jsonnet b/jsonnet/components/prometheus/main.jsonnet
deleted file mode 100644
index c981d70ce..000000000
--- a/jsonnet/components/prometheus/main.jsonnet
+++ /dev/null
@@ -1,2 +0,0 @@
-(import '../common.libsonnet') +
-(import 'prometheus-rules.jsonnet')
diff --git a/jsonnet/components/prometheus/prometheus-rules.jsonnet b/jsonnet/components/prometheus/prometheus-rules.jsonnet
deleted file mode 100644
index 3031ef01b..000000000
--- a/jsonnet/components/prometheus/prometheus-rules.jsonnet
+++ /dev/null
@@ -1,18 +0,0 @@
-/* cannot use import https://github.com/prometheus/prometheus/blob/main/documentation/prometheus-mixin/mixin.libsonnet directly */
-/* since it generates yaml with double quotes wrapped */
-
-local rules = (
-  import 'github.com/prometheus/prometheus/documentation/prometheus-mixin/mixin.libsonnet'
-).prometheusAlerts;
-
-{
-  _commonLabels:: {
-    'app.kubernetes.io/component': 'operator',
-    'app.kubernetes.io/name': 'observability-operator-prometheus-rules',
-    'app.kubernetes.io/part-of': 'observability-operator',
-    prometheus: 'k8s',
-    role: 'alert-rules',
-  },
-
-  rule: $.k.prometheusrule.new('observability-operator-prometheus-rules', $._commonLabels, rules),
-}
diff --git a/jsonnet/jsonnetfile.json b/jsonnet/jsonnetfile.json
deleted file mode 100644
index 4add3801c..000000000
--- a/jsonnet/jsonnetfile.json
+++ /dev/null
@@ -1,33 +0,0 @@
-{
-  "version": 1,
-  "dependencies": [
-    {
-      "source": {
-        "git": {
-          "remote": "https://github.com/rhobs/obo-prometheus-operator.git",
-          "subdir": "jsonnet/mixin"
-        }
-      },
-      "version": "v0.57.0"
-    },
-    {
-      "source": {
-        "git": {
-          "remote": "https://github.com/prometheus/alertmanager.git",
-          "subdir": "doc/alertmanager-mixin"
-        }
-      },
-      "version": "v0.23.0"
-    },
-    {
-      "source": {
-        "git": {
-          "remote": "https://github.com/prometheus/prometheus.git",
-          "subdir": "documentation/prometheus-mixin"
-        }
-      },
-      "version": "v2.31.0"
-    }
-  ],
-  "legacyImports": true
-}
diff --git a/jsonnet/jsonnetfile.lock.json b/jsonnet/jsonnetfile.lock.json
deleted file mode 100644
index 196bd3f42..000000000
--- a/jsonnet/jsonnetfile.lock.json
+++ /dev/null
@@ -1,56 +0,0 @@
-{
-  "version": 1,
-  "dependencies": [
-    {
-      "source": {
-        "git": {
-          "remote": "https://github.com/grafana/grafonnet-lib.git",
-          "subdir": "grafonnet"
-        }
-      },
-      "version": "3626fc4dc2326931c530861ac5bebe39444f6cbf",
-      "sum": "gF8foHByYcB25jcUOBqP6jxk0OPifQMjPvKY0HaCk6w="
-    },
-    {
-      "source": {
-        "git": {
-          "remote": "https://github.com/grafana/jsonnet-libs.git",
"subdir": "grafana-builder" - } - }, - "version": "4452566af0a58f25cda10b3e568fac979fda85c3", - "sum": "0KkygBQd/AFzUvVzezE4qF/uDYgrwUXVpZfINBti0oc=" - }, - { - "source": { - "git": { - "remote": "https://github.com/rhobs/obo-prometheus-operator.git", - "subdir": "jsonnet/mixin" - } - }, - "version": "5f47f12d5943ad575829f48f118382162e2004a2", - "sum": "qZ4WgiweaE6eeKtFK60QUjLO8sf2L9Q8fgafWvDcyfY=" - }, - { - "source": { - "git": { - "remote": "https://github.com/prometheus/alertmanager.git", - "subdir": "doc/alertmanager-mixin" - } - }, - "version": "976297c0dc1f924560a7d0f748513a1802bc0b23", - "sum": "pep+dHzfIjh2SU5pEkwilMCAT/NoL6YYflV4x8cr7vU=" - }, - { - "source": { - "git": { - "remote": "https://github.com/prometheus/prometheus.git", - "subdir": "documentation/prometheus-mixin" - } - }, - "version": "33925c8ebc60c94f3274db944acdf2b650fc6ccf", - "sum": "m4VHwft4fUcxzL4+52lLZG/V5aH5ZEdjaweb88vISL0=" - } - ], - "legacyImports": false -}