Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unify App Mesh and Istio builtin metric checks #146

Merged
merged 5 commits into from
Apr 12, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 59 additions & 38 deletions pkg/controller/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -494,56 +494,77 @@ func (c *Controller) analyseCanary(r *flaggerv1.Canary) bool {
metric.Interval = r.GetMetricInterval()
}

if metric.Name == "envoy_cluster_upstream_rq" {
val, err := c.observer.GetEnvoySuccessRate(r.Spec.TargetRef.Name, r.Namespace, metric.Name, metric.Interval)
if err != nil {
if strings.Contains(err.Error(), "no values found") {
c.recordEventWarningf(r, "Halt advancement no values found for metric %s probably %s.%s is not receiving traffic",
metric.Name, r.Spec.TargetRef.Name, r.Namespace)
} else {
c.recordEventErrorf(r, "Metrics server %s query failed: %v", c.observer.GetMetricsServer(), err)
// App Mesh checks
if c.meshProvider == "appmesh" {
if metric.Name == "request-success-rate" || metric.Name == "envoy_cluster_upstream_rq" {
val, err := c.observer.GetEnvoySuccessRate(r.Spec.TargetRef.Name, r.Namespace, metric.Name, metric.Interval)
if err != nil {
if strings.Contains(err.Error(), "no values found") {
c.recordEventWarningf(r, "Halt advancement no values found for metric %s probably %s.%s is not receiving traffic",
metric.Name, r.Spec.TargetRef.Name, r.Namespace)
} else {
c.recordEventErrorf(r, "Metrics server %s query failed: %v", c.observer.GetMetricsServer(), err)
}
return false
}
if float64(metric.Threshold) > val {
c.recordEventWarningf(r, "Halt %s.%s advancement success rate %.2f%% < %v%%",
r.Name, r.Namespace, val, metric.Threshold)
return false
}
return false
}
if float64(metric.Threshold) > val {
c.recordEventWarningf(r, "Halt %s.%s advancement success rate %.2f%% < %v%%",
r.Name, r.Namespace, val, metric.Threshold)
return false
}
}

if metric.Name == "istio_requests_total" {
val, err := c.observer.GetIstioSuccessRate(r.Spec.TargetRef.Name, r.Namespace, metric.Name, metric.Interval)
if err != nil {
if strings.Contains(err.Error(), "no values found") {
c.recordEventWarningf(r, "Halt advancement no values found for metric %s probably %s.%s is not receiving traffic",
metric.Name, r.Spec.TargetRef.Name, r.Namespace)
} else {
if metric.Name == "request-duration" || metric.Name == "envoy_cluster_upstream_rq_time_bucket" {
val, err := c.observer.GetEnvoyRequestDuration(r.Spec.TargetRef.Name, r.Namespace, metric.Name, metric.Interval)
if err != nil {
c.recordEventErrorf(r, "Metrics server %s query failed: %v", c.observer.GetMetricsServer(), err)
return false
}
t := time.Duration(metric.Threshold) * time.Millisecond
if val > t {
c.recordEventWarningf(r, "Halt %s.%s advancement request duration %v > %v",
r.Name, r.Namespace, val, t)
return false
}
return false
}
if float64(metric.Threshold) > val {
c.recordEventWarningf(r, "Halt %s.%s advancement success rate %.2f%% < %v%%",
r.Name, r.Namespace, val, metric.Threshold)
return false
}
}

if metric.Name == "istio_request_duration_seconds_bucket" {
val, err := c.observer.GetIstioRequestDuration(r.Spec.TargetRef.Name, r.Namespace, metric.Name, metric.Interval)
if err != nil {
c.recordEventErrorf(r, "Metrics server %s query failed: %v", c.observer.GetMetricsServer(), err)
return false
// Istio checks
if c.meshProvider == "istio" {
if metric.Name == "request-success-rate" || metric.Name == "istio_requests_total" {
val, err := c.observer.GetIstioSuccessRate(r.Spec.TargetRef.Name, r.Namespace, metric.Name, metric.Interval)
if err != nil {
if strings.Contains(err.Error(), "no values found") {
c.recordEventWarningf(r, "Halt advancement no values found for metric %s probably %s.%s is not receiving traffic",
metric.Name, r.Spec.TargetRef.Name, r.Namespace)
} else {
c.recordEventErrorf(r, "Metrics server %s query failed: %v", c.observer.GetMetricsServer(), err)
}
return false
}
if float64(metric.Threshold) > val {
c.recordEventWarningf(r, "Halt %s.%s advancement success rate %.2f%% < %v%%",
r.Name, r.Namespace, val, metric.Threshold)
return false
}
}
t := time.Duration(metric.Threshold) * time.Millisecond
if val > t {
c.recordEventWarningf(r, "Halt %s.%s advancement request duration %v > %v",
r.Name, r.Namespace, val, t)
return false

if metric.Name == "request-duration" || metric.Name == "istio_request_duration_seconds_bucket" {
val, err := c.observer.GetIstioRequestDuration(r.Spec.TargetRef.Name, r.Namespace, metric.Name, metric.Interval)
if err != nil {
c.recordEventErrorf(r, "Metrics server %s query failed: %v", c.observer.GetMetricsServer(), err)
return false
}
t := time.Duration(metric.Threshold) * time.Millisecond
if val > t {
c.recordEventWarningf(r, "Halt %s.%s advancement request duration %v > %v",
r.Name, r.Namespace, val, t)
return false
}
}
}

// custom checks
if metric.Query != "" {
val, err := c.observer.GetScalar(metric.Query)
if err != nil {
Expand Down
54 changes: 54 additions & 0 deletions pkg/metrics/envoy.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"math"
"net/url"
"strconv"
"time"
)

const envoySuccessRateQuery = `
Expand Down Expand Up @@ -63,3 +64,56 @@ func (c *Observer) GetEnvoySuccessRate(name string, namespace string, metric str
}
return *rate, nil
}

// envoyRequestDurationQuery is the query template rendered by
// GetEnvoyRequestDuration. It takes the 0.99 histogram_quantile of the
// envoy_cluster_upstream_rq_time_bucket rate, summed by le, restricted to
// series whose kubernetes_namespace equals .Namespace and whose
// kubernetes_pod_name matches .Name followed by two dash-separated
// alphanumeric segments. .Interval is substituted as the rate window.
const envoyRequestDurationQuery = `
histogram_quantile(0.99, sum(rate(
envoy_cluster_upstream_rq_time_bucket{kubernetes_namespace="{{ .Namespace }}",
kubernetes_pod_name=~"{{ .Name }}-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)"}
[{{ .Interval }}])) by (le))
`

// GetEnvoyRequestDuration returns the 99th percentile request duration
// computed from the envoy_cluster_upstream_rq_time_bucket histogram.
//
// name and namespace select the pods to query and interval is the rate window
// substituted into envoyRequestDurationQuery. metric is only used to label the
// "no values found" error. The sample value is interpreted as milliseconds and
// truncated to whole milliseconds.
//
// When the metrics server is set to "fake" the query is skipped and a fixed
// duration of 1 (one nanosecond) is returned.
//
// An error is returned when the query cannot be rendered or executed, when a
// sample cannot be parsed as a float, or when the result holds no usable
// sample ("no values found for metric ...").
func (c *Observer) GetEnvoyRequestDuration(name string, namespace string, metric string, interval string) (time.Duration, error) {
	if c.metricsServer == "fake" {
		return 1, nil
	}

	meta := struct {
		Name      string
		Namespace string
		Interval  string
	}{
		name,
		namespace,
		interval,
	}

	query, err := render(meta, envoyRequestDurationQuery)
	if err != nil {
		return 0, err
	}

	result, err := c.queryMetric(url.QueryEscape(query))
	if err != nil {
		return 0, err
	}

	// Keep the last string sample of the result; non-string values are skipped.
	var quantile *float64
	for _, v := range result.Data.Result {
		s, ok := v.Value[1].(string)
		if !ok {
			continue
		}
		f, err := strconv.ParseFloat(s, 64)
		if err != nil {
			return 0, err
		}
		quantile = &f
	}

	// histogram_quantile yields NaN when the histogram has no observations.
	// "NaN" parses successfully, but converting NaN to int64 is
	// implementation-dependent and would otherwise surface as a bogus duration
	// that passes any threshold check, so it is reported as a missing value.
	if quantile == nil || math.IsNaN(*quantile) {
		return 0, fmt.Errorf("no values found for metric %s", metric)
	}

	return time.Duration(int64(*quantile)) * time.Millisecond, nil
}
23 changes: 23 additions & 0 deletions pkg/metrics/envoy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,26 @@ func Test_EnvoySuccessRateQueryRender(t *testing.T) {
t.Errorf("\nGot %s \nWanted %s", query, expected)
}
}

// Test_EnvoyRequestDurationQueryRender verifies that rendering
// envoyRequestDurationQuery with a name, namespace and interval produces the
// expected single-line query.
func Test_EnvoyRequestDurationQueryRender(t *testing.T) {
	const want = `histogram_quantile(0.99, sum(rate(envoy_cluster_upstream_rq_time_bucket{kubernetes_namespace="default",kubernetes_pod_name=~"podinfo-[0-9a-zA-Z]+(-[0-9a-zA-Z]+)"}[1m])) by (le))`

	params := struct {
		Name      string
		Namespace string
		Interval  string
	}{
		Name:      "podinfo",
		Namespace: "default",
		Interval:  "1m",
	}

	got, err := render(params, envoyRequestDurationQuery)
	if err != nil {
		t.Fatal(err)
	}
	if got != want {
		t.Errorf("\nGot %s \nWanted %s", got, want)
	}
}
19 changes: 19 additions & 0 deletions pkg/metrics/observer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,25 @@ func TestCanaryObserver_GetEnvoySuccessRate(t *testing.T) {

}

// TestCanaryObserver_GetEnvoyRequestDuration serves a canned vector response
// with a single sample of "200" from a test HTTP server and checks that the
// observer reports it as 200ms.
func TestCanaryObserver_GetEnvoyRequestDuration(t *testing.T) {
	const payload = `{"status":"success","data":{"resultType":"vector","result":[{"metric":{},"value":[1545905245.596,"200"]}]}}`

	handler := func(w http.ResponseWriter, _ *http.Request) {
		w.Write([]byte(payload))
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	observer := NewObserver(srv.URL)
	want := 200 * time.Millisecond

	got, err := observer.GetEnvoyRequestDuration("podinfo", "default", "envoy_cluster_upstream_rq_time_bucket", "1m")
	if err != nil {
		t.Fatal(err.Error())
	}
	if got != want {
		t.Errorf("Got %v wanted %v", got, want)
	}
}

func TestCanaryObserver_GetIstioSuccessRate(t *testing.T) {
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
json := `{"status":"success","data":{"resultType":"vector","result":[{"metric":{},"value":[1545905245.458,"100"]}]}}`
Expand Down
4 changes: 2 additions & 2 deletions test/e2e-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,10 @@ spec:
maxWeight: 50
stepWeight: 10
metrics:
- name: istio_requests_total
- name: request-success-rate
threshold: 99
interval: 1m
- name: istio_request_duration_seconds_bucket
- name: request-duration
threshold: 500
interval: 30s
- name: "404s percentage"
Expand Down