Alerting feature
Signed-off-by: Raul Sevilla <[email protected]>

Make alert-profile flag optional

Signed-off-by: Raul Sevilla <[email protected]>

Remove sevInfo

Signed-off-by: Raul Sevilla <[email protected]>

Skip metric parsing on error

Signed-off-by: Raul Sevilla <[email protected]>

Add alerting docs

Signed-off-by: Raul Sevilla <[email protected]>

Alerting e2e test

Signed-off-by: Raul Sevilla <[email protected]>
rsevilla87 committed Dec 10, 2020
1 parent f9a15b9 commit e8f9d3e
Showing 14 changed files with 390 additions and 55 deletions.
105 changes: 86 additions & 19 deletions cmd/kube-burner.go
@@ -21,6 +21,7 @@ import (
"time"

"github.com/cloud-bulldozer/kube-burner/log"
"github.com/cloud-bulldozer/kube-burner/pkg/alerting"
"github.com/cloud-bulldozer/kube-burner/pkg/burner"
"github.com/cloud-bulldozer/kube-burner/pkg/config"
"k8s.io/client-go/kubernetes"
@@ -52,10 +53,12 @@ To configure your bash shell to load completions for each session execute:
}

func initCmd() *cobra.Command {
var url, metricsProfile, configFile string
var url, metricsProfile, alertProfile, configFile string
var username, password, uuid, token string
var skipTLSVerify bool
var prometheusStep time.Duration
var prometheusClient *prometheus.Prometheus
var alertM *alerting.AlertManager
cmd := &cobra.Command{
Use: "init",
Short: "Launch benchmark",
@@ -66,26 +69,39 @@ func initCmd() *cobra.Command {
if err != nil {
log.Fatal(err)
}
var p *prometheus.Prometheus
if url != "" {
p, err = prometheus.NewPrometheusClient(url, token, username, password, metricsProfile, uuid, skipTLSVerify, prometheusStep)
prometheusClient, err = prometheus.NewPrometheusClient(url, token, username, password, uuid, skipTLSVerify, prometheusStep)
if err != nil {
log.Fatal(err)
}
// If indexer is enabled or writeToFile is enabled we read the profile
if config.ConfigSpec.GlobalConfig.IndexerConfig.Enabled || config.ConfigSpec.GlobalConfig.WriteToFile {
if err := prometheusClient.ReadProfile(metricsProfile); err != nil {
log.Fatal(err)
}
}
if alertProfile != "" {
if alertM, err = alerting.NewAlertManager(alertProfile, prometheusClient); err != nil {
log.Fatalf("Error creating alert manager: %s", err)
}
}
}
steps(uuid, p, prometheusStep)
steps(uuid, prometheusClient, alertM)
},
}
cmd.Flags().StringVar(&uuid, "uuid", "", "Benchmark UUID")
cmd.Flags().StringVarP(&url, "prometheus-url", "u", "", "Prometheus URL")
cmd.Flags().StringVarP(&token, "token", "t", "", "Prometheus Bearer token")
cmd.Flags().StringVar(&username, "username", "", "Prometheus username for authentication")
cmd.Flags().StringVarP(&password, "password", "p", "", "Prometheus password for basic authentication")
cmd.Flags().StringVarP(&metricsProfile, "metrics-profile", "m", "metrics.yaml", "Metrics profile file")
cmd.Flags().StringVarP(&metricsProfile, "metrics-profile", "m", "metrics.yaml", "Metrics profile file or URL")
cmd.Flags().StringVarP(&alertProfile, "alert-profile", "a", "", "Alert profile file or URL")
cmd.Flags().BoolVar(&skipTLSVerify, "skip-tls-verify", true, "Verify prometheus TLS certificate")
cmd.Flags().DurationVarP(&prometheusStep, "step", "s", 30*time.Second, "Prometheus step size")
cmd.Flags().StringVarP(&configFile, "config", "c", "", "Config file path")
cmd.Flags().StringVarP(&configFile, "config", "c", "", "Config file path or URL")
cmd.MarkFlagRequired("config")
cmd.MarkFlagRequired("uuid")
cmd.Flags().SortFlags = false
return cmd
}

@@ -113,6 +129,7 @@ func destroyCmd() *cobra.Command {
},
}
cmd.Flags().StringVar(&uuid, "uuid", "", "UUID")
cmd.MarkFlagRequired("uuid")
return cmd
}

@@ -135,10 +152,13 @@ func indexCmd() *cobra.Command {
if config.ConfigSpec.GlobalConfig.IndexerConfig.Enabled {
indexer = indexers.NewIndexer()
}
p, err := prometheus.NewPrometheusClient(url, token, username, password, metricsProfile, uuid, skipTLSVerify, prometheusStep)
p, err := prometheus.NewPrometheusClient(url, token, username, password, uuid, skipTLSVerify, prometheusStep)
if err != nil {
log.Fatal(err)
}
if err := p.ReadProfile(metricsProfile); err != nil {
log.Fatal(err)
}
startTime := time.Unix(start, 0)
endTime := time.Unix(end, 0)
log.Infof("Indexing metrics with UUID %s", uuid)
@@ -157,9 +177,52 @@ func indexCmd() *cobra.Command {
cmd.Flags().DurationVarP(&prometheusStep, "step", "s", 30*time.Second, "Prometheus step size")
cmd.Flags().Int64VarP(&start, "start", "", time.Now().Unix()-3600, "Epoch start time")
cmd.Flags().Int64VarP(&end, "end", "", time.Now().Unix(), "Epoch end time")
cmd.Flags().StringVarP(&configFile, "config", "c", "", "Config file path")
cmd.Flags().StringVarP(&configFile, "config", "c", "", "Config file path or URL")
cmd.MarkFlagRequired("prometheus-url")
cmd.MarkFlagRequired("uuid")
cmd.MarkFlagRequired("config")
cmd.Flags().SortFlags = false
return cmd
}

func alertCmd() *cobra.Command {
var url, alertProfile string
var start, end int64
var username, password, uuid, token string
var skipTLSVerify bool
var alertM *alerting.AlertManager
var prometheusStep time.Duration
cmd := &cobra.Command{
Use: "check-alerts",
Short: "Evaluate alerts for the given time range",
Args: cobra.MaximumNArgs(0),
Run: func(cmd *cobra.Command, args []string) {
p, err := prometheus.NewPrometheusClient(url, token, username, password, uuid, skipTLSVerify, prometheusStep)
if err != nil {
log.Fatal(err)
}
startTime := time.Unix(start, 0)
endTime := time.Unix(end, 0)
if alertM, err = alerting.NewAlertManager(alertProfile, p); err != nil {
log.Fatalf("Error creating alert manager: %s", err)
}
result := alertM.Evaluate(startTime, endTime)
log.Info("👋 Exiting kube-burner")
os.Exit(result)
},
}
cmd.Flags().StringVarP(&url, "prometheus-url", "u", "", "Prometheus URL")
cmd.Flags().StringVarP(&token, "token", "t", "", "Prometheus Bearer token")
cmd.Flags().StringVar(&username, "username", "", "Prometheus username for authentication")
cmd.Flags().StringVarP(&password, "password", "p", "", "Prometheus password for basic authentication")
cmd.Flags().StringVarP(&alertProfile, "alert-profile", "a", "alerts.yaml", "Alert profile file or URL")
cmd.Flags().BoolVar(&skipTLSVerify, "skip-tls-verify", true, "Verify prometheus TLS certificate")
cmd.Flags().DurationVarP(&prometheusStep, "step", "s", 30*time.Second, "Prometheus step size")
cmd.Flags().Int64VarP(&start, "start", "", time.Now().Unix()-3600, "Epoch start time")
cmd.Flags().Int64VarP(&end, "end", "", time.Now().Unix(), "Epoch end time")
cmd.MarkFlagRequired("prometheus-url")
cmd.MarkFlagRequired("alert-profile")
cmd.Flags().SortFlags = false
return cmd
}

@@ -182,20 +245,20 @@ func init() {
initCmd(),
destroyCmd(),
indexCmd(),
alertCmd(),
)
for _, c := range rootCmd.Commands() {
logLevel := c.Flags().String("log-level", "info", "Allowed values: debug, info, warn, error, fatal")
c.PreRun = func(cmd *cobra.Command, args []string) {
log.Infof("Setting log level to %s", *logLevel)
log.SetLogLevel(*logLevel)
}
c.MarkFlagRequired("uuid")
logLevel := rootCmd.PersistentFlags().String("log-level", "info", "Allowed values: debug, info, warn, error, fatal")
rootCmd.PersistentPreRun = func(cmd *cobra.Command, args []string) {
log.Infof("Setting log level to %s", *logLevel)
log.SetLogLevel(*logLevel)
}
rootCmd.AddCommand(completionCmd)
cobra.OnInitialize()
rootCmd.Execute()

}

func steps(uuid string, p *prometheus.Prometheus, prometheusStep time.Duration) {
func steps(uuid string, p *prometheus.Prometheus, alertM *alerting.AlertManager) {
start := time.Now().UTC()
verification := true
var rc int
@@ -239,10 +302,14 @@ func steps(uuid string, p *prometheus.Prometheus, prometheusStep time.Duration)
time.Sleep(job.Config.JobPause)
}
}
// If alertManager is configured and the evaluation result != 0, set rc=1
if alertM != nil && alertM.Evaluate(start, time.Now().UTC()) == 1 {
rc = 1
}
// If prometheus is enabled query metrics from the start of the first job to the end of the last one
if p != nil {
log.Infof("Waiting %v extra before scraping prometheus metrics", prometheusStep)
time.Sleep(prometheusStep)
if p != nil && len(p.MetricsProfile.Metrics) > 0 {
log.Infof("Waiting %v extra before scraping prometheus metrics", p.Step)
time.Sleep(p.Step)
if err := p.ScrapeMetrics(start, time.Now().UTC(), indexer); err != nil {
log.Error(err)
}
53 changes: 53 additions & 0 deletions docs/ALERTING.md
@@ -0,0 +1,53 @@
# Alerting

Kube-burner includes an alert mechanism able to evaluate Prometheus expressions after the last Kube-burner job finishes.

## Configuration

Alerting is configured through a configuration file pointed to by the flag `--alert-profile` or `-a`. This file looks like:

```yaml
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[5m:]) > 0.01
description: 5 minutes avg. etcd fsync latency on {{$labels.pod}} higher than 10ms {{$value}}
severity: error

- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))[5m:]) > 0.1
description: 5 minutes avg. etcd network peer round trip on {{$labels.pod}} higher than 100ms {{$value}}
severity: error

- expr: increase(etcd_server_leader_changes_seen_total[2m]) > 0
description: etcd leader changes observed
severity: error
```
Where `expr` holds the Prometheus expression to evaluate and `description` holds a description of the alert. In the description we can make use of Prometheus labels to improve verbosity, using the syntax `{{$labels.<label_name>}}`, and print the expression value that triggered the alarm using `{{$value}}`.
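For illustration, here is a minimal Go sketch of how such a description template can be rendered with the standard `text/template` package. The helper name `renderDescription` and the data layout are assumptions for this sketch, not kube-burner's actual implementation:

```go
package main

import (
	"fmt"
	"strings"
	"text/template"
)

// renderDescription expands {{$labels.<name>}} and {{$value}} in an alert
// description by binding those template variables to the query result.
func renderDescription(desc string, labels map[string]string, value float64) (string, error) {
	// Declare $labels and $value before the user-provided template body.
	tmpl, err := template.New("alert").Parse("{{$labels := .Labels}}{{$value := .Value}}" + desc)
	if err != nil {
		return "", err
	}
	data := struct {
		Labels map[string]string
		Value  float64
	}{labels, value}
	var sb strings.Builder
	if err := tmpl.Execute(&sb, data); err != nil {
		return "", err
	}
	return sb.String(), nil
}

func main() {
	out, err := renderDescription(
		"etcd fsync latency on {{$labels.pod}} higher than 10ms {{$value}}",
		map[string]string{"pod": "etcd-0"},
		0.0102,
	)
	if err != nil {
		panic(err)
	}
	fmt.Println(out) // etcd fsync latency on etcd-0 higher than 10ms 0.0102
}
```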
Each alarm can be configured with a severity, and each severity has a different effect. At the moment they do the following (a sketch of these semantics follows the list):

- info: Prints an info message with the alarm description to stdout. By default all expressions have this severity.
- warning: Prints a warning message with the alarm description to stdout.
- error: Prints an error message with the alarm description to stdout and makes kube-burner return rc = 1.
- critical: Prints a fatal message with the alarm description to stdout and exits execution immediately with rc != 0.
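As a sketch of these severity semantics (using logrus for the logging calls; the function name and its wiring are assumptions, not kube-burner's actual code):

```go
package sketch

import log "github.com/sirupsen/logrus"

// handleAlert applies the documented severity semantics to a triggered alert
// and returns the rc contribution for this alert.
func handleAlert(severity, description string) (rc int) {
	switch severity {
	case "warning":
		log.Warn(description)
	case "error":
		log.Error(description)
		rc = 1 // error severity makes kube-burner return rc = 1
	case "critical":
		log.Fatal(description) // logs and exits immediately with rc != 0
	default:
		log.Info(description) // info is the default severity
	}
	return rc
}
```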

## Checking alerts

It's possible to look for alerts without triggering a kube-burner workload. To do so you can use the `check-alerts` option from the CLI. Similar to the `index` CLI option, this one accepts the flags `--start` and `--end` to evaluate the alerts over a given time range.

```shell
$ kube-burner check-alerts -u https://prometheus.url.com -t ${token} -a alert-profile.yml
INFO[2020-12-10 11:47:23] Setting log level to info
INFO[2020-12-10 11:47:23] 👽 Initializing prometheus client
INFO[2020-12-10 11:47:24] 🔔 Initializaing alert manager
INFO[2020-12-10 11:47:24] Evaluating expression: 'avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[5m:]) > 0.01'
ERRO[2020-12-10 11:47:24] Alert triggered at 2020-12-10 11:01:53 +0100 CET: '5 minutes avg. etcd fsync latency on etcd-ip-10-0-213-209.us-west-2.compute.internal higher than 10ms 0.010281314285714311'
INFO[2020-12-10 11:47:24] Evaluating expression: 'avg_over_time(histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))[5m:]) > 0.1'
INFO[2020-12-10 11:47:24] Evaluating expression: 'increase(etcd_server_leader_changes_seen_total[2m]) > 0'
INFO[2020-12-10 11:47:24] Evaluating expression: 'avg_over_time(histogram_quantile(0.99, sum(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver",verb=~"POST|PUT|DELETE|PATCH|CREATE"}) by (verb,resource,subresource,le))[5m:]) > 1'
INFO[2020-12-10 11:47:25] Evaluating expression: 'avg_over_time(histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver",verb="GET",scope="resource"}[2m])) by (verb,resource,subresource,le))[5m:]) > 1'
INFO[2020-12-10 11:47:25] Evaluating expression: 'avg_over_time(histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver",verb="LIST",scope="namespace"}[2m])) by (verb,resource,subresource,le))[5m:]) > 5'
INFO[2020-12-10 11:47:26] Evaluating expression: 'avg_over_time(histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver",verb="LIST",scope="cluster"}[2m])) by (verb,resource,subresource,le))[5m:]) > 30'
INFO[2020-12-10 11:47:27] Evaluating expression: 'avg_over_time(histogram_quantile(0.99,rate(coredns_kubernetes_dns_programming_duration_seconds_bucket[2m]))[5m:]) > 1'
```
18 changes: 12 additions & 6 deletions docs/CLI.md
@@ -1,3 +1,4 @@
# CLI
kube-burner is a binary client that currently provides the following options:

```console
@@ -10,6 +11,7 @@ Usage:
kube-burner [command]

Available Commands:
check-alerts Evaluate alerts for the given time range
completion Generates completion scripts for bash shell
destroy Destroy old namespaces labeled with the given UUID.
help Help about any command
@@ -23,7 +25,7 @@ Flags:
Use "kube-burner [command] --help" for more information about a command.
```

# Init
## Init

This option is meant to run a Kube-burner benchmark, and it supports these flags:

@@ -58,20 +60,24 @@ If you have no interest in collecting prometheus metrics, kube-burner can also b
$ kube-burner init -c cfg.yml --uuid 67f9ec6d-6a9e-46b6-a3bb-065cde988790`
```

# Index
## Index

This option can be used to collect and index the metrics from a given time range. The time range is given by:

- start: Epoch start time. Defaults to one hour before the current time.
- End: Epoch end time. Defaults to the current time.
- end: Epoch end time. Defaults to the current time.

# Destroy
## Check alerts

This option can be used to evaluate the alerts configured in the given alert profile. Similar to `index`, the time range is given by the start and end flags.

## Destroy

This option requires the above `config` and `UUID` flags to destroy all namespaces labeled with `kube-burner-uuid=<UUID>`.
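For context, a minimal client-go sketch of such a label-based cleanup (an assumed flow for illustration; the real implementation may differ):

```go
package sketch

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// destroyNamespaces deletes every namespace labeled kube-burner-uuid=<uuid>.
func destroyNamespaces(ctx context.Context, clientset kubernetes.Interface, uuid string) error {
	listOptions := metav1.ListOptions{LabelSelector: fmt.Sprintf("kube-burner-uuid=%s", uuid)}
	nsList, err := clientset.CoreV1().Namespaces().List(ctx, listOptions)
	if err != nil {
		return err
	}
	for _, ns := range nsList.Items {
		if err := clientset.CoreV1().Namespaces().Delete(ctx, ns.Name, metav1.DeleteOptions{}); err != nil {
			return err
		}
	}
	return nil
}
```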

# Completion
## Completion
Generates a bash completion script that can be imported with:
`. <(kube-burner completion)`

Or permanently imported with:
`kube-burner completion > /etc/bash_completion.d/kube-burner`
8 changes: 6 additions & 2 deletions docs/CONFIGURATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ In this section is described global job configuration, it holds the following pa
| kubeconfig | Points to a valid kubeconfig file. Can be omitted if using the KUBECONFIG environment variable, or running from a pod | String | ~/mykubeconfig | in-cluster |
| writeToFile | Whether to dump collected metrics to files | Boolean | true | true |
| metricsDirectory | Directory where collected metrics will be dumped into. It will be created if it doesn't exist previously | String | ./metrics | ./collected-metrics |
| measurements | List of measurements. Detailed in the [measurements section](#Measurements) | List | - | [] |
| indexerConfig | Holds the indexer configuration. Detailed in the [indexers section](#Indexers) | Object | - | - |
| measurements | List of measurements. Detailed in the [measurements section] | List | - | [] |
| indexerConfig | Holds the indexer configuration. Detailed in the [indexers section] | Object | - | - |
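Read into Go, the global section above roughly corresponds to a struct of this shape (field names and tags are assumptions for illustration, not kube-burner's actual types):

```go
package sketch

// GlobalConfig sketches the documented global parameters; Measurement and
// IndexerConfig stand in for the types detailed in their own sections.
type GlobalConfig struct {
	Kubeconfig       string        `yaml:"kubeconfig"`
	WriteToFile      bool          `yaml:"writeToFile"`
	MetricsDirectory string        `yaml:"metricsDirectory"`
	Measurements     []Measurement `yaml:"measurements"`
	IndexerConfig    IndexerConfig `yaml:"indexerConfig"`
}

type Measurement struct{}   // placeholder, see the measurements section
type IndexerConfig struct{} // placeholder, see the indexers section
```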

# Jobs

@@ -158,3 +158,7 @@ spec:
dockerImageRepository: {{.image}}
{{ end }}
```


[measurements section]: ../measurements/
[indexers section]: ../indexers/
7 changes: 4 additions & 3 deletions docs/INDEXERS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Configuration
# Indexers
`kube-burner` is able to **index the collected prometheus metrics** into a given Indexer. These metrics are indexed after the execution of the last Kube-burner job.

`kube-burner` is able to **index the collected prometheus metrics** into a given Indexer.
## Indexer configuration
The indexer configuration is described in the `indexerConfig` section and can be configured with the following parameters:


@@ -10,7 +11,7 @@ The indexer configuration is described in the `indexerConfig` section and can be
| type | Type of indexer | String | elastic | "" |


# Elastic
### Elastic

Index documents in Elasticsearch 7 instances.

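Conceptually, an indexer here is anything that can take a batch of collected documents and send them to a given index; a sketch of such an interface (the actual kube-burner interface may differ):

```go
package sketch

// Indexer is a sketch of the indexing abstraction described above.
type Indexer interface {
	// Index sends a batch of documents to the given index.
	Index(documents []interface{}, indexName string) error
}
```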
6 changes: 4 additions & 2 deletions docs/MEASUREMENTS.md
@@ -1,8 +1,10 @@
# Measurements

Apart from Prometheus metrics collection, Kube-burner can collect further metrics using other mechanisms or data sources, such as the Kubernetes API itself; these mechanisms are called measurements.
Measurements are enabled in the measurements section of the configuration file. This section contains a list of measurements with their options.
`kube-burner` supports the following measurements so far:

# Pod latency
## Pod latency

Collects latencies from the different pod startup phases; these **latency metrics are in ms**. Can be enabled with:

Expand Down Expand Up @@ -64,7 +66,7 @@ More information about the pod lifecycle can be found in the [kubernetes docs](h

**Note**: The __esIndex__ option can be used to configure the ES index where metrics will be indexed.
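As an illustration of how one such phase latency can be derived from the pod status using client-go types (assumed logic; the exact phases kube-burner tracks may differ):

```go
package sketch

import corev1 "k8s.io/api/core/v1"

// podReadyLatencyMs returns the milliseconds between pod creation and the
// PodReady condition, the kind of startup latency this measurement reports.
func podReadyLatencyMs(pod *corev1.Pod) (int64, bool) {
	for _, c := range pod.Status.Conditions {
		if c.Type == corev1.PodReady && c.Status == corev1.ConditionTrue {
			return c.LastTransitionTime.Time.Sub(pod.CreationTimestamp.Time).Milliseconds(), true
		}
	}
	return 0, false // pod is not ready yet
}
```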

# Pprof collection
## Pprof collection

This measurement takes care of collecting Golang profiling information from pods. To do so, kube-burner connects to pods with the given labels running in certain namespaces. This measurement uses an implementation similar to `kubectl exec`, and as soon as it connects to a pod it executes the command `curl <pprofURL>` to get the pprof data. Pprof files are collected on a regular basis given by the parameter `pprofInterval`, and these files are stored in the directory configured by the parameter `pprofDirectory`, which by default is `pprof`.
It's also possible to configure a token to get pprof data from authenticated endpoints, such as kube-apiserver, with the variable `bearerToken`.
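A sketch of the periodic collection loop this describes (structure assumed; the pod-exec plumbing that runs `curl <pprofURL>` is abstracted behind a hypothetical callback):

```go
package sketch

import (
	"context"
	"time"
)

// collectPprof triggers one collection round every pprofInterval until the
// context is cancelled. getPprof is a hypothetical callback that fetches and
// stores one round of profiles under pprofDirectory.
func collectPprof(ctx context.Context, pprofInterval time.Duration, getPprof func()) {
	ticker := time.NewTicker(pprofInterval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			getPprof()
		}
	}
}
```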
4 changes: 2 additions & 2 deletions docs/METRICS.md
@@ -1,4 +1,4 @@
# Metrics profile
# Metrics

The metrics-profile flag points to a YAML file, or the URL of one, containing a list of the Prometheus queries kube-burner will collect for each job.
As soon as one job finishes, `kube-burner` makes a range query for each query described in this file, and indexes the result in the index configured by the parameter `defaultIndex`.
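For reference, such a range query can be issued with the Prometheus Go client; the sketch below shows the shape of the call (wiring and auth are simplified assumptions):

```go
package sketch

import (
	"context"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
	"github.com/prometheus/common/model"
)

// rangeQuery runs one profile query over the job's time window.
func rangeQuery(ctx context.Context, url, query string, start, end time.Time, step time.Duration) (model.Value, error) {
	client, err := api.NewClient(api.Config{Address: url})
	if err != nil {
		return nil, err
	}
	result, warnings, err := v1.NewAPI(client).QueryRange(ctx, query, v1.Range{
		Start: start,
		End:   end,
		Step:  step,
	})
	_ = warnings // warnings are non-fatal; a real implementation would log them
	return result, err
}
```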
@@ -28,7 +28,7 @@ metrics:
instant: true
```
# Job Summary
## Job Summary
In case indexing is enabled, at the end of each job, a document holding the job summary is indexed. This is useful to identify the parameters the job was executed with: