From ff49e2854bc5bbffbcfd32e17186ec60009aa431 Mon Sep 17 00:00:00 2001 From: Andrew Pollock Date: Wed, 5 Feb 2025 06:13:42 +0000 Subject: [PATCH] feat(monitoring): add terraform to generate k8s CronJob monitoring This commit adds a new Terraform module definition and instance to generate a resource for every k8s CronJob definition, using the cronLastSuccessfulTimeMins label defined for the CronJob --- .../environments/oss-vdb-test/main.tf | 18 +++++++++++ .../modules/k8s_cron_alert/k8s_cron_alert.tf | 30 +++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 deployment/terraform/modules/k8s_cron_alert/k8s_cron_alert.tf diff --git a/deployment/terraform/environments/oss-vdb-test/main.tf b/deployment/terraform/environments/oss-vdb-test/main.tf index fc889d51bb5..cfd1a7b539e 100644 --- a/deployment/terraform/environments/oss-vdb-test/main.tf +++ b/deployment/terraform/environments/oss-vdb-test/main.tf @@ -1,3 +1,13 @@ +locals { + # Iterate over each yaml configuration and create a key based on kind and name in the yaml file. 
+ kube_manifests = { + for manifest in flatten([for i in fileset("../../..", "./clouddeploy/gke-workers/base/*.yaml") : yamldecode(file("../../../${i}"))]) : + "${try(manifest.kind, "")}--${try(manifest.metadata.name, "")}" => manifest + if try(manifest.kind, "") == "CronJob" # Filter for CronJobs, handling missing kind + } + project_id = "oss-vdb-test" +} + module "osv_test" { source = "../../modules/osv" @@ -19,6 +29,14 @@ module "osv_test" { esp_version = "2.51.0" } +module "k8s_cron_alert" { + for_each = local.kube_manifests + source = "../../modules/k8s_cron_alert" + project_id = local.project_id + cronjob_name = each.value.metadata.name + cronjob_expected_latency_minutes = lookup(each.value.metadata.labels, "cronLastSuccessfulTimeMins", null) +} + import { to = module.osv_test.google_firestore_database.datastore id = "oss-vdb-test/(default)" diff --git a/deployment/terraform/modules/k8s_cron_alert/k8s_cron_alert.tf b/deployment/terraform/modules/k8s_cron_alert/k8s_cron_alert.tf new file mode 100644 index 00000000000..2e670045e95 --- /dev/null +++ b/deployment/terraform/modules/k8s_cron_alert/k8s_cron_alert.tf @@ -0,0 +1,30 @@ +variable "project_id" { + type = string + description = "The project to create the alert policy in." +} + +variable "cronjob_name" { + type = string + description = "Name of the kubernetes cronjob to monitor." +} + +variable "cronjob_expected_latency_minutes" { + type = number + description = "Expected amount of time since last successful run of the job expressed in minutes." +} + +resource "google_monitoring_alert_policy" "cron_alert_policy" { + project = var.project_id + display_name = "Cronjob: ${var.cronjob_name} has not run recently." + combiner = "OR" + conditions { + display_name = "Cronjob: ${var.cronjob_name} has not run recently." 
+ condition_prometheus_query_language { + query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"${var.cronjob_name}\"})/60) > ${var.cronjob_expected_latency_minutes}" + duration = "60s" + evaluation_interval = "60s" + alert_rule = "AlwaysOn" + rule_group = "cronjob ${var.cronjob_name}" + } + } +}