Skip to content

Commit

Permalink
feat(monitoring): add terraform to generate k8s CronJob monitoring
Browse files Browse the repository at this point in the history
This commit adds a new Terraform module definition and instance to
generate a monitoring alert policy for every k8s CronJob definition, using
the cronLastSuccessfulTimeMins label defined for the CronJob as the alert threshold
  • Loading branch information
andrewpollock committed Feb 5, 2025
1 parent d6a9b4b commit ff49e28
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 0 deletions.
18 changes: 18 additions & 0 deletions deployment/terraform/environments/oss-vdb-test/main.tf
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
locals {
  # Iterate over each YAML configuration and create a key based on kind and name in the YAML file.
  # Paths are anchored to path.module so the lookup resolves relative to this
  # configuration's directory rather than the directory Terraform is invoked from.
  # Only manifests whose kind is "CronJob" are kept; the resulting map is keyed
  # as "<kind>--<metadata.name>" and maps to the decoded manifest object.
  kube_manifests = {
    for manifest in flatten([
      for i in fileset("${path.module}/../../..", "./clouddeploy/gke-workers/base/*.yaml") :
      yamldecode(file("${path.module}/../../../${i}"))
    ]) :
    "${try(manifest.kind, "")}--${try(manifest.metadata.name, "")}" => manifest
    if try(manifest.kind, "") == "CronJob" # Filter for CronJobs, handling missing kind
  }

  # GCP project that this environment's resources are created in.
  project_id = "oss-vdb-test"
}

module "osv_test" {
source = "../../modules/osv"

Expand All @@ -19,6 +29,14 @@ module "osv_test" {
esp_version = "2.51.0"
}

# Creates one alert policy per CronJob manifest that declares the
# cronLastSuccessfulTimeMins label.
module "k8s_cron_alert" {
  # Only instantiate the module for CronJobs that actually carry the label.
  # A manifest without metadata.labels, or without this label, has no threshold
  # to alert on; passing null through would fail when the module interpolates
  # the value into its query string. try() also covers a missing labels map.
  # Keys are unchanged from local.kube_manifests, so instance addresses are stable.
  for_each = {
    for key, manifest in local.kube_manifests : key => manifest
    if try(manifest.metadata.labels.cronLastSuccessfulTimeMins, null) != null
  }

  source     = "../../modules/k8s_cron_alert"
  project_id = local.project_id

  cronjob_name = each.value.metadata.name

  # The label value is converted to the module's declared number type by Terraform.
  cronjob_expected_latency_minutes = each.value.metadata.labels.cronLastSuccessfulTimeMins
}

import {
to = module.osv_test.google_firestore_database.datastore
id = "oss-vdb-test/(default)"
Expand Down
30 changes: 30 additions & 0 deletions deployment/terraform/modules/k8s_cron_alert/k8s_cron_alert.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Project that owns the generated alert policy; used as the `project`
# argument of the alert policy resource in this module.
variable "project_id" {
  description = "The project to create the alert policy in."
  type        = string
}

# CronJob name; interpolated into the alert's display names, its rule group,
# and the `cronjob` label matcher of the alert query.
variable "cronjob_name" {
  description = "Name of the kubernetes cronjob to monitor."
  type        = string
}

# Alert threshold in minutes; interpolated into the alert query as the
# right-hand side of the comparison.
variable "cronjob_expected_latency_minutes" {
  type        = number
  description = "Expected amount of time since last successful run of the job expressed in minutes."

  # A null threshold cannot be rendered into the query string, so reject it
  # here, at the module boundary, rather than inside the alert policy resource.
  nullable = false
}

locals {
  # Shared title for the policy and its single condition.
  cron_alert_title = "Cronjob: ${var.cronjob_name} has not run recently."

  # Minutes elapsed since the CronJob's last successful run, compared against
  # the configured threshold.
  cron_alert_query = "((time() - kube_cronjob_status_last_successful_time{cronjob=\"${var.cronjob_name}\"})/60) > ${var.cronjob_expected_latency_minutes}"
}

# Alert policy with one PromQL condition built from the CronJob name and the
# expected-latency threshold passed to this module.
resource "google_monitoring_alert_policy" "cron_alert_policy" {
  project      = var.project_id
  display_name = local.cron_alert_title
  combiner     = "OR"

  conditions {
    display_name = local.cron_alert_title

    condition_prometheus_query_language {
      rule_group          = "cronjob ${var.cronjob_name}"
      alert_rule          = "AlwaysOn"
      query               = local.cron_alert_query
      evaluation_interval = "60s"
      duration            = "60s"
    }
  }
}

0 comments on commit ff49e28

Please sign in to comment.