Alerting feature
Signed-off-by: Raul Sevilla <[email protected]>

Make alert-profile flag optional

Signed-off-by: Raul Sevilla <[email protected]>

Remove sevInfo

Signed-off-by: Raul Sevilla <[email protected]>

Skip metric parsing on error

Signed-off-by: Raul Sevilla <[email protected]>

Add alerting docs

Signed-off-by: Raul Sevilla <[email protected]>

Alerting e2e test

Signed-off-by: Raul Sevilla <[email protected]>
rsevilla87 committed Dec 10, 2020
1 parent f9a15b9 commit e8f9d3e
Showing 14 changed files with 390 additions and 55 deletions.
105 changes: 86 additions & 19 deletions cmd/kube-burner.go
@@ -21,6 +21,7 @@ import (
"time"

"github.com/cloud-bulldozer/kube-burner/log"
"github.com/cloud-bulldozer/kube-burner/pkg/alerting"
"github.com/cloud-bulldozer/kube-burner/pkg/burner"
"github.com/cloud-bulldozer/kube-burner/pkg/config"
"k8s.io/client-go/kubernetes"
@@ -52,10 +53,12 @@ To configure your bash shell to load completions for each session execute:
}

func initCmd() *cobra.Command {
var url, metricsProfile, configFile string
var url, metricsProfile, alertProfile, configFile string
var username, password, uuid, token string
var skipTLSVerify bool
var prometheusStep time.Duration
var prometheusClient *prometheus.Prometheus
var alertM *alerting.AlertManager
cmd := &cobra.Command{
Use: "init",
Short: "Launch benchmark",
@@ -66,26 +69,39 @@ func initCmd() *cobra.Command {
if err != nil {
log.Fatal(err)
}
var p *prometheus.Prometheus
if url != "" {
p, err = prometheus.NewPrometheusClient(url, token, username, password, metricsProfile, uuid, skipTLSVerify, prometheusStep)
prometheusClient, err = prometheus.NewPrometheusClient(url, token, username, password, uuid, skipTLSVerify, prometheusStep)
if err != nil {
log.Fatal(err)
}
// If indexer is enabled or writeToFile is enabled we read the profile
if config.ConfigSpec.GlobalConfig.IndexerConfig.Enabled || config.ConfigSpec.GlobalConfig.WriteToFile {
if err := prometheusClient.ReadProfile(metricsProfile); err != nil {
log.Fatal(err)
}
}
if alertProfile != "" {
if alertM, err = alerting.NewAlertManager(alertProfile, prometheusClient); err != nil {
log.Fatalf("Error creating alert manager: %s", err)
}
}
}
steps(uuid, p, prometheusStep)
steps(uuid, prometheusClient, alertM)
},
}
cmd.Flags().StringVar(&uuid, "uuid", "", "Benchmark UUID")
cmd.Flags().StringVarP(&url, "prometheus-url", "u", "", "Prometheus URL")
cmd.Flags().StringVarP(&token, "token", "t", "", "Prometheus Bearer token")
cmd.Flags().StringVar(&username, "username", "", "Prometheus username for authentication")
cmd.Flags().StringVarP(&password, "password", "p", "", "Prometheus password for basic authentication")
cmd.Flags().StringVarP(&metricsProfile, "metrics-profile", "m", "metrics.yaml", "Metrics profile file")
cmd.Flags().StringVarP(&metricsProfile, "metrics-profile", "m", "metrics.yaml", "Metrics profile file or URL")
cmd.Flags().StringVarP(&alertProfile, "alert-profile", "a", "", "Alert profile file or URL")
cmd.Flags().BoolVar(&skipTLSVerify, "skip-tls-verify", true, "Verify prometheus TLS certificate")
cmd.Flags().DurationVarP(&prometheusStep, "step", "s", 30*time.Second, "Prometheus step size")
cmd.Flags().StringVarP(&configFile, "config", "c", "", "Config file path")
cmd.Flags().StringVarP(&configFile, "config", "c", "", "Config file path or URL")
cmd.MarkFlagRequired("config")
cmd.MarkFlagRequired("uuid")
cmd.Flags().SortFlags = false
return cmd
}

@@ -113,6 +129,7 @@ func destroyCmd() *cobra.Command {
},
}
cmd.Flags().StringVar(&uuid, "uuid", "", "UUID")
cmd.MarkFlagRequired("uuid")
return cmd
}

@@ -135,10 +152,13 @@ func indexCmd() *cobra.Command {
if config.ConfigSpec.GlobalConfig.IndexerConfig.Enabled {
indexer = indexers.NewIndexer()
}
p, err := prometheus.NewPrometheusClient(url, token, username, password, metricsProfile, uuid, skipTLSVerify, prometheusStep)
p, err := prometheus.NewPrometheusClient(url, token, username, password, uuid, skipTLSVerify, prometheusStep)
if err != nil {
log.Fatal(err)
}
if err := p.ReadProfile(metricsProfile); err != nil {
log.Fatal(err)
}
startTime := time.Unix(start, 0)
endTime := time.Unix(end, 0)
log.Infof("Indexing metrics with UUID %s", uuid)
@@ -157,9 +177,52 @@ func indexCmd() *cobra.Command {
cmd.Flags().DurationVarP(&prometheusStep, "step", "s", 30*time.Second, "Prometheus step size")
cmd.Flags().Int64VarP(&start, "start", "", time.Now().Unix()-3600, "Epoch start time")
cmd.Flags().Int64VarP(&end, "end", "", time.Now().Unix(), "Epoch end time")
cmd.Flags().StringVarP(&configFile, "config", "c", "", "Config file path")
cmd.Flags().StringVarP(&configFile, "config", "c", "", "Config file path or URL")
cmd.MarkFlagRequired("prometheus-url")
cmd.MarkFlagRequired("uuid")
cmd.MarkFlagRequired("config")
cmd.Flags().SortFlags = false
return cmd
}

func alertCmd() *cobra.Command {
var url, alertProfile string
var start, end int64
var username, password, uuid, token string
var skipTLSVerify bool
var alertM *alerting.AlertManager
var prometheusStep time.Duration
cmd := &cobra.Command{
Use: "check-alerts",
Short: "Evaluate alerts for the given time range",
Args: cobra.MaximumNArgs(0),
Run: func(cmd *cobra.Command, args []string) {
p, err := prometheus.NewPrometheusClient(url, token, username, password, uuid, skipTLSVerify, prometheusStep)
if err != nil {
log.Fatal(err)
}
startTime := time.Unix(start, 0)
endTime := time.Unix(end, 0)
if alertM, err = alerting.NewAlertManager(alertProfile, p); err != nil {
log.Fatalf("Error creating alert manager: %s", err)
}
result := alertM.Evaluate(startTime, endTime)
log.Info("👋 Exiting kube-burner")
os.Exit(result)
},
}
cmd.Flags().StringVarP(&url, "prometheus-url", "u", "", "Prometheus URL")
cmd.Flags().StringVarP(&token, "token", "t", "", "Prometheus Bearer token")
cmd.Flags().StringVar(&username, "username", "", "Prometheus username for authentication")
cmd.Flags().StringVarP(&password, "password", "p", "", "Prometheus password for basic authentication")
cmd.Flags().StringVarP(&alertProfile, "alert-profile", "a", "alerts.yaml", "Alert profile file or URL")
cmd.Flags().BoolVar(&skipTLSVerify, "skip-tls-verify", true, "Verify prometheus TLS certificate")
cmd.Flags().DurationVarP(&prometheusStep, "step", "s", 30*time.Second, "Prometheus step size")
cmd.Flags().Int64VarP(&start, "start", "", time.Now().Unix()-3600, "Epoch start time")
cmd.Flags().Int64VarP(&end, "end", "", time.Now().Unix(), "Epoch end time")
cmd.MarkFlagRequired("prometheus-url")
cmd.MarkFlagRequired("alert-profile")
cmd.Flags().SortFlags = false
return cmd
}

@@ -182,20 +245,20 @@ func init() {
initCmd(),
destroyCmd(),
indexCmd(),
alertCmd(),
)
for _, c := range rootCmd.Commands() {
logLevel := c.Flags().String("log-level", "info", "Allowed values: debug, info, warn, error, fatal")
c.PreRun = func(cmd *cobra.Command, args []string) {
log.Infof("Setting log level to %s", *logLevel)
log.SetLogLevel(*logLevel)
}
c.MarkFlagRequired("uuid")
logLevel := rootCmd.PersistentFlags().String("log-level", "info", "Allowed values: debug, info, warn, error, fatal")
rootCmd.PersistentPreRun = func(cmd *cobra.Command, args []string) {
log.Infof("Setting log level to %s", *logLevel)
log.SetLogLevel(*logLevel)
}
rootCmd.AddCommand(completionCmd)
cobra.OnInitialize()
rootCmd.Execute()

}

func steps(uuid string, p *prometheus.Prometheus, prometheusStep time.Duration) {
func steps(uuid string, p *prometheus.Prometheus, alertM *alerting.AlertManager) {
start := time.Now().UTC()
verification := true
var rc int
@@ -239,10 +302,14 @@ func steps(uuid string, p *prometheus.Prometheus, prometheusStep time.Duration)
time.Sleep(job.Config.JobPause)
}
}
// If alertManager is configured and the evaluation result != 0, set rc=1
if alertM != nil && alertM.Evaluate(start, time.Now().UTC()) == 1 {
rc = 1
}
// If prometheus is enabled query metrics from the start of the first job to the end of the last one
if p != nil {
log.Infof("Waiting %v extra before scraping prometheus metrics", prometheusStep)
time.Sleep(prometheusStep)
if p != nil && len(p.MetricsProfile.Metrics) > 0 {
log.Infof("Waiting %v extra before scraping prometheus metrics", p.Step)
time.Sleep(p.Step)
if err := p.ScrapeMetrics(start, time.Now().UTC(), indexer); err != nil {
log.Error(err)
}
53 changes: 53 additions & 0 deletions docs/ALERTING.md
@@ -0,0 +1,53 @@
# Alerting

Kube-burner includes an alert mechanism able to evaluate Prometheus expressions after the last Kube-burner job finishes.

## Configuration

Alerting is configured through a configuration file pointed to by the flag `--alert-profile` or `-a`. This file looks like:

```yaml
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[5m:]) > 0.01
description: 5 minutes avg. etcd fsync latency on {{$labels.pod}} higher than 10ms {{$value}}
severity: error

- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))[5m:]) > 0.1
description: 5 minutes avg. etcd network peer round trip on {{$labels.pod}} higher than 100ms {{$value}}
severity: error

- expr: increase(etcd_server_leader_changes_seen_total[2m]) > 0
description: etcd leader changes observed
severity: error
```
Where `expr` holds the Prometheus expression to evaluate and `description` holds a description of the alert. In the description we can make use of Prometheus labels to improve verbosity, using the syntax `{{$labels.<label_name>}}`, and print the expression value that triggered the alarm using `{{$value}}`.
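For illustration, here is a minimal Go sketch of how such a description template can be rendered with the standard `text/template` package. The helper name `renderDescription` and the data layout are assumptions for this sketch, not kube-burner's actual implementation:

```go
package main

import (
	"fmt"
	"strings"
	"text/template"
)

// renderDescription expands {{$labels.<name>}} and {{$value}} in an alert
// description by binding those template variables to the query result.
func renderDescription(desc string, labels map[string]string, value float64) (string, error) {
	// Declare $labels and $value before the user-provided template body.
	tmpl, err := template.New("alert").Parse("{{$labels := .Labels}}{{$value := .Value}}" + desc)
	if err != nil {
		return "", err
	}
	data := struct {
		Labels map[string]string
		Value  float64
	}{labels, value}
	var sb strings.Builder
	if err := tmpl.Execute(&sb, data); err != nil {
		return "", err
	}
	return sb.String(), nil
}

func main() {
	out, err := renderDescription(
		"etcd fsync latency on {{$labels.pod}} higher than 10ms {{$value}}",
		map[string]string{"pod": "etcd-0"},
		0.0102,
	)
	if err != nil {
		panic(err)
	}
	fmt.Println(out) // etcd fsync latency on etcd-0 higher than 10ms 0.0102
}
```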
Each alarm can be configured with a severity, and each severity has a different effect. At the moment they do the following (a sketch of these semantics follows the list):

- info: Prints an info message with the alarm description to stdout. By default all expressions have this severity.
- warning: Prints a warning message with the alarm description to stdout.
- error: Prints an error message with the alarm description to stdout and makes kube-burner return rc = 1.
- critical: Prints a fatal message with the alarm description to stdout and exits execution immediately with rc != 0.
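As a sketch of these severity semantics (using logrus for the logging calls; the function name and its wiring are assumptions, not kube-burner's actual code):

```go
package sketch

import log "github.com/sirupsen/logrus"

// handleAlert applies the documented severity semantics to a triggered alert
// and returns the rc contribution for this alert.
func handleAlert(severity, description string) (rc int) {
	switch severity {
	case "warning":
		log.Warn(description)
	case "error":
		log.Error(description)
		rc = 1 // error severity makes kube-burner return rc = 1
	case "critical":
		log.Fatal(description) // logs and exits immediately with rc != 0
	default:
		log.Info(description) // info is the default severity
	}
	return rc
}
```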

## Checking alerts

It's possible to look for alerts without triggering a kube-burner workload. To do so you can use the `check-alerts` option from the CLI. Similar to the `index` CLI option, this one accepts the flags `--start` and `--end` to evaluate the alerts over a given time range.

```shell
$ kube-burner check-alerts -u https://prometheus.url.com -t ${token} -a alert-profile.yml
INFO[2020-12-10 11:47:23] Setting log level to info
INFO[2020-12-10 11:47:23] 👽 Initializing prometheus client
INFO[2020-12-10 11:47:24] 🔔 Initializaing alert manager
INFO[2020-12-10 11:47:24] Evaluating expression: 'avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[5m:]) > 0.01'
ERRO[2020-12-10 11:47:24] Alert triggered at 2020-12-10 11:01:53 +0100 CET: '5 minutes avg. etcd fsync latency on etcd-ip-10-0-213-209.us-west-2.compute.internal higher than 10ms 0.010281314285714311'
INFO[2020-12-10 11:47:24] Evaluating expression: 'avg_over_time(histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))[5m:]) > 0.1'
INFO[2020-12-10 11:47:24] Evaluating expression: 'increase(etcd_server_leader_changes_seen_total[2m]) > 0'
INFO[2020-12-10 11:47:24] Evaluating expression: 'avg_over_time(histogram_quantile(0.99, sum(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver",verb=~"POST|PUT|DELETE|PATCH|CREATE"}) by (verb,resource,subresource,le))[5m:]) > 1'
INFO[2020-12-10 11:47:25] Evaluating expression: 'avg_over_time(histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver",verb="GET",scope="resource"}[2m])) by (verb,resource,subresource,le))[5m:]) > 1'
INFO[2020-12-10 11:47:25] Evaluating expression: 'avg_over_time(histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver",verb="LIST",scope="namespace"}[2m])) by (verb,resource,subresource,le))[5m:]) > 5'
INFO[2020-12-10 11:47:26] Evaluating expression: 'avg_over_time(histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver",verb="LIST",scope="cluster"}[2m])) by (verb,resource,subresource,le))[5m:]) > 30'
INFO[2020-12-10 11:47:27] Evaluating expression: 'avg_over_time(histogram_quantile(0.99,rate(coredns_kubernetes_dns_programming_duration_seconds_bucket[2m]))[5m:]) > 1'
```
18 changes: 12 additions & 6 deletions docs/CLI.md
@@ -1,3 +1,4 @@
# CLI
kube-burner is a binary client that currently provides the following options:

```console
@@ -10,6 +11,7 @@ Usage:
kube-burner [command]

Available Commands:
check-alerts Evaluate alerts for the given time range
completion Generates completion scripts for bash shell
destroy Destroy old namespaces labeled with the given UUID.
help Help about any command
@@ -23,7 +25,7 @@ Flags:
Use "kube-burner [command] --help" for more information about a command.
```

# Init
## Init

This option is meant to run a Kube-burner benchmark, and it supports these flags:

@@ -58,20 +60,24 @@ If you have no interest in collecting prometheus metrics, kube-burner can also b
$ kube-burner init -c cfg.yml --uuid 67f9ec6d-6a9e-46b6-a3bb-065cde988790`
```

# Index
## Index

This option can be used to collect and index the metrics from a given time range. The time range is given by:

- start: Epoch start time. Defaults to one hour before the current time.
- End: Epoch end time. Defaults to the current time.
- end: Epoch end time. Defaults to the current time.

# Destroy
## Check alerts

This option can be used to evaluate the alerts configured in the given alert profile. Similar to `index`, the time range is given by the start and end flags.

## Destroy

This option requires the above `config` and `UUID` flags to destroy all namespaces labeled with `kube-burner-uuid=<UUID>`.
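For context, a minimal client-go sketch of such a label-based cleanup (an assumed flow for illustration; the real implementation may differ):

```go
package sketch

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// destroyNamespaces deletes every namespace labeled kube-burner-uuid=<uuid>.
func destroyNamespaces(ctx context.Context, clientset kubernetes.Interface, uuid string) error {
	listOptions := metav1.ListOptions{LabelSelector: fmt.Sprintf("kube-burner-uuid=%s", uuid)}
	nsList, err := clientset.CoreV1().Namespaces().List(ctx, listOptions)
	if err != nil {
		return err
	}
	for _, ns := range nsList.Items {
		if err := clientset.CoreV1().Namespaces().Delete(ctx, ns.Name, metav1.DeleteOptions{}); err != nil {
			return err
		}
	}
	return nil
}
```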

# Completion
## Completion
Generates a bash completion script that can be imported with:
`. <(kube-burner completion)`

Or permanently imported with:
`kube-burner completion > /etc/bash_completion.d/kube-burner`
8 changes: 6 additions & 2 deletions docs/CONFIGURATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ In this section is described global job configuration, it holds the following pa
| kubeconfig | Points to a valid kubeconfig file. Can be omitted if using the KUBECONFIG environment variable, or running from a pod | String | ~/mykubeconfig | in-cluster |
| writeToFile | Whether to dump collected metrics to files | Boolean | true | true |
| metricsDirectory | Directory where collected metrics will be dumped into. It will be created if it doesn't exist previously | String | ./metrics | ./collected-metrics |
| measurements | List of measurements. Detailed in the [measurements section](#Measurements) | List | - | [] |
| indexerConfig | Holds the indexer configuration. Detailed in the [indexers section](#Indexers) | Object | - | - |
| measurements | List of measurements. Detailed in the [measurements section] | List | - | [] |
| indexerConfig | Holds the indexer configuration. Detailed in the [indexers section] | Object | - | - |
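Read into Go, the global section above roughly corresponds to a struct of this shape (field names and tags are assumptions for illustration, not kube-burner's actual types):

```go
package sketch

// GlobalConfig sketches the documented global parameters; Measurement and
// IndexerConfig stand in for the types detailed in their own sections.
type GlobalConfig struct {
	Kubeconfig       string        `yaml:"kubeconfig"`
	WriteToFile      bool          `yaml:"writeToFile"`
	MetricsDirectory string        `yaml:"metricsDirectory"`
	Measurements     []Measurement `yaml:"measurements"`
	IndexerConfig    IndexerConfig `yaml:"indexerConfig"`
}

type Measurement struct{}   // placeholder, see the measurements section
type IndexerConfig struct{} // placeholder, see the indexers section
```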

# Jobs

@@ -158,3 +158,7 @@ spec:
dockerImageRepository: {{.image}}
{{ end }}
```


[measurements section]: ../measurements/
[indexers section]: ../indexers/
7 changes: 4 additions & 3 deletions docs/INDEXERS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Configuration
# Indexers
`kube-burner` is able to **index the collected prometheus metrics** into a given Indexer. These metrics are indexed after the execution of the last Kube-burner job.

`kube-burner` is able to **index the collected prometheus metrics** into a given Indexer.
## Indexer configuration
The indexer configuration is described in the `indexerConfig` section and can be configured with the following parameters:


@@ -10,7 +11,7 @@ The indexer configuration is described in the `indexerConfig` section and can be
| type | Type of indexer | String | elastic | "" |


# Elastic
### Elastic

Index documents in Elasticsearch 7 instances.

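Conceptually, an indexer here is anything that can take a batch of collected documents and send them to a given index; a sketch of such an interface (the actual kube-burner interface may differ):

```go
package sketch

// Indexer is a sketch of the indexing abstraction described above.
type Indexer interface {
	// Index sends a batch of documents to the given index.
	Index(documents []interface{}, indexName string) error
}
```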
6 changes: 4 additions & 2 deletions docs/MEASUREMENTS.md
@@ -1,8 +1,10 @@
# Measurements

Apart from Prometheus metrics collection, Kube-burner can collect further metrics using other mechanisms or data sources, such as the Kubernetes API itself; these mechanisms are called measurements.
Measurements are enabled in the measurements section of the configuration file. This section contains a list of measurements with their options.
`kube-burner` supports the following measurements so far:

# Pod latency
## Pod latency

Collects latencies from the different pod startup phases; these **latency metrics are in ms**. Can be enabled with:

Expand Down Expand Up @@ -64,7 +66,7 @@ More information about the pod lifecycle can be found in the [kubernetes docs](h

**Note**: The __esIndex__ option can be used to configure the ES index where metrics will be indexed.
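As an illustration of how one such phase latency can be derived from the pod status using client-go types (assumed logic; the exact phases kube-burner tracks may differ):

```go
package sketch

import corev1 "k8s.io/api/core/v1"

// podReadyLatencyMs returns the milliseconds between pod creation and the
// PodReady condition, the kind of startup latency this measurement reports.
func podReadyLatencyMs(pod *corev1.Pod) (int64, bool) {
	for _, c := range pod.Status.Conditions {
		if c.Type == corev1.PodReady && c.Status == corev1.ConditionTrue {
			return c.LastTransitionTime.Time.Sub(pod.CreationTimestamp.Time).Milliseconds(), true
		}
	}
	return 0, false // pod is not ready yet
}
```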

# Pprof collection
## Pprof collection

This measurement takes care of collecting Golang profiling information from pods. To do so, kube-burner connects to pods with the given labels running in certain namespaces. This measurement uses an implementation similar to `kubectl exec`, and as soon as it connects to a pod it executes the command `curl <pprofURL>` to get the pprof data. Pprof files are collected on a regular basis given by the parameter `pprofInterval`, and these files are stored in the directory configured by the parameter `pprofDirectory`, which by default is `pprof`.
It's also possible to configure a token to get pprof data from authenticated endpoints, such as kube-apiserver, with the variable `bearerToken`.
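A sketch of the periodic collection loop this describes (structure assumed; the pod-exec plumbing that runs `curl <pprofURL>` is abstracted behind a hypothetical callback):

```go
package sketch

import (
	"context"
	"time"
)

// collectPprof triggers one collection round every pprofInterval until the
// context is cancelled. getPprof is a hypothetical callback that fetches and
// stores one round of profiles under pprofDirectory.
func collectPprof(ctx context.Context, pprofInterval time.Duration, getPprof func()) {
	ticker := time.NewTicker(pprofInterval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			getPprof()
		}
	}
}
```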
4 changes: 2 additions & 2 deletions docs/METRICS.md
@@ -1,4 +1,4 @@
# Metrics profile
# Metrics

The metrics-profile flag points to a YAML file, or the URL of one, containing a list of the Prometheus queries kube-burner will collect for each job.
As soon as one job finishes, `kube-burner` makes a range query for each query described in this file, and indexes the result in the index configured by the parameter `defaultIndex`.
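For reference, such a range query can be issued with the Prometheus Go client; the sketch below shows the shape of the call (wiring and auth are simplified assumptions):

```go
package sketch

import (
	"context"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
	"github.com/prometheus/common/model"
)

// rangeQuery runs one profile query over the job's time window.
func rangeQuery(ctx context.Context, url, query string, start, end time.Time, step time.Duration) (model.Value, error) {
	client, err := api.NewClient(api.Config{Address: url})
	if err != nil {
		return nil, err
	}
	result, warnings, err := v1.NewAPI(client).QueryRange(ctx, query, v1.Range{
		Start: start,
		End:   end,
		Step:  step,
	})
	_ = warnings // warnings are non-fatal; a real implementation would log them
	return result, err
}
```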
@@ -28,7 +28,7 @@ metrics:
instant: true
```
# Job Summary
## Job Summary
In case indexing is enabled, at the end of each job, a document holding the job summary is indexed. This is useful to identify the parameters the job was executed with: