Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🌱 Test spot-migrator Prometheus metric collection #36

Merged
merged 7 commits into from
Jan 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions e2e/config/kube-prometheus-stack-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
global:
rbac:
create: true
pspEnabled: false
fullnameOverride: prometheus
defaultRules:
create: false
alertmanager:
enabled: false
grafana:
enabled: false
kubeApiServer:
enabled: false
kubelet:
enabled: false
kubeControllerManager:
enabled: false
coreDns:
enabled: false
kubeEtcd:
enabled: false
kubeScheduler:
enabled: false
kubeProxy:
enabled: false
kubeStateMetrics:
enabled: false
nodeExporter:
enabled: false
prometheus:
enabled: false
prometheusOperator:
prometheusInstanceNamespaces:
- monitoring
serviceMonitor:
selfMonitor: false
69 changes: 69 additions & 0 deletions e2e/config/prometheus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# https://github.com/prometheus-operator/prometheus-operator/tree/48d3604507e082f4187f39edda9bc22935881a14/example/rbac/prometheus
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups:
- ""
resources:
- nodes
- nodes/metrics
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- configmaps
verbs:
- get
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
- nonResourceURLs:
- /metrics
verbs:
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: monitoring
---
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
name: prometheus
namespace: monitoring
spec:
serviceAccountName: prometheus
scrapeInterval: 5s
# Watch for all PrometheusRules and PodMonitors
ruleSelector: {}
ruleNamespaceSelector: {}
podMonitorSelector: {}
podMonitorNamespaceSelector: {}
42 changes: 38 additions & 4 deletions e2e/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,13 @@ func setup(ctx context.Context, image string) error {
}

// Install CRDs
err = runCommand("kubectl", "apply",
"-f", "https://raw.githubusercontent.com/kubernetes/autoscaler/5469d7912072c1070eedc680c89e27d46b8f4f82/vertical-pod-autoscaler/deploy/vpa-v1-crd-gen.yaml",
"-f", "https://raw.githubusercontent.com/prometheus-community/helm-charts/d616961860a0248f77a2783923511550fad23569/charts/kube-prometheus-stack/charts/crds/crds/crd-prometheusrules.yaml",
"-f", "https://raw.githubusercontent.com/prometheus-community/helm-charts/d616961860a0248f77a2783923511550fad23569/charts/kube-prometheus-stack/charts/crds/crds/crd-podmonitors.yaml")
err = runCommand("kubectl", "apply", "-f", "https://raw.githubusercontent.com/kubernetes/autoscaler/5469d7912072c1070eedc680c89e27d46b8f4f82/vertical-pod-autoscaler/deploy/vpa-v1-crd-gen.yaml")
if err != nil {
return err
}

// Install Prometheus Operator and Prometheus
err = installPrometheus(ctx, image)
if err != nil {
return err
}
Expand Down Expand Up @@ -249,6 +252,37 @@ podMonitor:
return kubernetes.WaitUntilDeploymentAvailable(ctx, kubeClient, "cost-manager", "cost-manager")
}

func installPrometheus(ctx context.Context, image string) (rerr error) {
// Add prometheus-community Helm repository:
// https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack#get-helm-repository-info
err := runCommand("helm", "repo", "add", "prometheus-community", "https://prometheus-community.github.io/helm-charts")
if err != nil {
return err
}
err = runCommand("helm", "repo", "update", "prometheus-community")
if err != nil {
return err
}

// Install Prometheus Operator
err = runCommand("helm", "upgrade", "--install",
"kube-prometheus-stack", "prometheus-community/kube-prometheus-stack",
"--namespace", "monitoring", "--create-namespace",
"--values", "./config/kube-prometheus-stack-values.yaml",
"--wait", "--timeout", "2m")
if err != nil {
return err
}

// Install Prometheus
err = runCommand("kubectl", "apply", "-f", "./config/prometheus.yaml")
if err != nil {
return err
}

return nil
}

func teardown() error {
err := runCommand("kind", "delete", "cluster", "--name", kindClusterName)
if err != nil {
Expand Down
65 changes: 57 additions & 8 deletions e2e/spot_migrator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ import (
cloudproviderfake "github.com/hsbc/cost-manager/pkg/cloudprovider/fake"
"github.com/hsbc/cost-manager/pkg/kubernetes"
"github.com/hsbc/cost-manager/pkg/test"
"github.com/prometheus/client_golang/api"
prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
Expand All @@ -26,7 +29,8 @@ func TestSpotMigrator(t *testing.T) {
t.Parallel()

ctx := context.Background()
kubeClient, err := client.NewWithWatch(config.GetConfigOrDie(), client.Options{})
restConfig := config.GetConfigOrDie()
kubeClient, err := client.NewWithWatch(restConfig, client.Options{})
require.Nil(t, err)

// Find a worker Node
Expand Down Expand Up @@ -102,10 +106,9 @@ func TestSpotMigrator(t *testing.T) {
err = kubeClient.Patch(ctx, node, client.RawPatch(types.StrategicMergePatchType, patch))
require.Nil(t, err)

// Wait for the Node to be marked as unschedulable
// Wait for the Node to be marked as unschedulable. This should not take longer than 2 minutes
// since spot-migrator is configured with a 1 minute migration interval
t.Logf("Waiting for Node %s to be marked as unschedulable...", nodeName)
// spot-migrator is configured with a 1 minute migration interval so this should not take longer
// than 2 minutes
ctxWithTimeout, cancel := context.WithTimeout(ctx, 2*time.Minute)
defer cancel()
listerWatcher = kubernetes.NewListerWatcher(ctx, kubeClient, &corev1.NodeList{})
Expand Down Expand Up @@ -137,6 +140,56 @@ func TestSpotMigrator(t *testing.T) {
require.Nil(t, err)
t.Logf("Deployment %s/%s is unavailable!", deployment.Namespace, deployment.Name)

// Delete Node; typically this would be done by the node controller but we simulate it here:
// https://github.com/hsbc/cost-manager/blob/bf176ada100e19a765d276aee1a0a2d6038275e0/pkg/controller/spot_migrator.go#L242-L250
err = kubeClient.Delete(ctx, node)
require.Nil(t, err)

// Wait for Prometheus metric to indicate successful migration
t.Logf("Waiting for Prometheus metric to indicate successful migration...")
pod, err := kubernetes.WaitForAnyReadyPod(ctx, kubeClient, client.InNamespace("monitoring"), client.MatchingLabels{"app.kubernetes.io/name": "prometheus"})
require.Nil(t, err)
// Port forward to Prometheus in the background
forwardedPort, close, err := kubernetes.PortForward(ctx, restConfig, pod.Namespace, pod.Name, 9090)
require.Nil(t, err)
defer func() {
err := close()
require.Nil(t, err)
}()
// Setup Prometheus client using local forwarded port
prometheusAddress := fmt.Sprintf("http://127.0.0.1:%d", forwardedPort)
prometheusClient, err := api.NewClient(api.Config{
Address: prometheusAddress,
})
require.Nil(t, err)
prometheusAPI := prometheusv1.NewAPI(prometheusClient)
// Wait for the spot-migrator metric to be scraped by Prometheus...
var currentMetricValue model.SampleValue
for {
results, _, err := prometheusAPI.Query(ctx, `sum(cost_manager_spot_migrator_operation_success_total{job="cost-manager",namespace="cost-manager"})`, time.Now())
require.Nil(t, err)
if len(results.(model.Vector)) == 1 {
currentMetricValue = results.(model.Vector)[0].Value
break
}
time.Sleep(time.Second)
}
// ...and then wait for it to increase
for {
results, _, err := prometheusAPI.Query(ctx, `sum(cost_manager_spot_migrator_operation_success_total{job="cost-manager",namespace="cost-manager"})`, time.Now())
require.Nil(t, err)
require.Equal(t, 1, len(results.(model.Vector)))
if results.(model.Vector)[0].Value > currentMetricValue {
break
}
time.Sleep(time.Second)
}
t.Logf("Migration successful!")

// Delete Namespace
err = kubeClient.Delete(ctx, namespace)
require.Nil(t, err)

// Verify that all control plane Nodes are schedulable
controlPlaneNodeSelector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
MatchExpressions: []metav1.LabelSelectorRequirement{
Expand All @@ -154,8 +207,4 @@ func TestSpotMigrator(t *testing.T) {
for _, node := range nodeList.Items {
require.False(t, node.Spec.Unschedulable)
}

// Delete Namespace
err = kubeClient.Delete(ctx, namespace)
require.Nil(t, err)
}
5 changes: 3 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@ go 1.21.3

require (
github.com/go-logr/logr v1.3.0
github.com/hashicorp/go-multierror v1.1.1
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.17.0
github.com/prometheus/common v0.45.0
github.com/robfig/cron/v3 v3.0.1
github.com/stretchr/testify v1.8.4
google.golang.org/api v0.149.0
gopkg.in/robfig/cron.v2 v2.0.0-20150107220207-be2e0b0deed5
k8s.io/api v0.29.0
k8s.io/apimachinery v0.29.0
k8s.io/client-go v0.29.0
Expand Down Expand Up @@ -67,6 +68,7 @@ require (
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 // indirect
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 // indirect
github.com/hashicorp/errwrap v1.0.0 // indirect
github.com/imdario/mergo v0.3.13 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
Expand All @@ -85,7 +87,6 @@ require (
github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 // indirect
github.com/prometheus/common v0.45.0 // indirect
github.com/prometheus/procfs v0.11.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/spf13/cobra v1.7.0 // indirect
Expand Down
10 changes: 8 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,10 @@ github.com/grpc-ecosystem/grpc-gateway v1.16.0 h1:gmcG1KaJ57LophUzW0Hy8NmPhnMZb4
github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 h1:YBftPWNWd4WwGqtY2yeZL2ef8rHAxPBD8KFhJpmcqms=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0/go.mod h1:YN5jB8ie0yfIUg6VvR9Kz84aCaG7AsGZnLjhHbUqwPg=
github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA=
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
github.com/imdario/mergo v0.3.13 h1:lFzP57bqS/wsqKssCGmtLAb8A0wKjLGrve2q3PPVcBk=
github.com/imdario/mergo v0.3.13/go.mod h1:4lJ1jqUDcsbIECGy0RUJAXNIhg+6ocWgb1ALK2O4oXg=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
Expand All @@ -165,6 +169,8 @@ github.com/jonboulle/clockwork v0.2.2 h1:UOGuzwb1PwsrDAObMuhUnj0p5ULPj8V/xJ7Kx9q
github.com/jonboulle/clockwork v0.2.2/go.mod h1:Pkfl5aHPm1nk2H9h0bjmnJD/BcgbGXUBGnn1kMkgxc8=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
Expand Down Expand Up @@ -199,6 +205,8 @@ github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 h1:n6/
github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod h1:Pm3mSP3c5uWn86xMLZ5Sa7JB9GsEZySvHYXCTK4E9q4=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus=
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw=
github.com/nxadm/tail v1.4.4 h1:DQuhQpB1tVlglWS2hLQ5OV6B5r8aGxSrPc5Qo6uTN78=
Expand Down Expand Up @@ -438,8 +446,6 @@ gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc=
gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc=
gopkg.in/robfig/cron.v2 v2.0.0-20150107220207-be2e0b0deed5 h1:E846t8CnR+lv5nE+VuiKTDG/v1U2stad0QzddfJC7kY=
gopkg.in/robfig/cron.v2 v2.0.0-20150107220207-be2e0b0deed5/go.mod h1:hiOFpYm0ZJbusNj2ywpbrXowU3G8U6GIQzqn2mw1UIE=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
Expand Down
27 changes: 27 additions & 0 deletions pkg/kubernetes/pod.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package kubernetes

import (
"context"

corev1 "k8s.io/api/core/v1"
apiwatch "k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/tools/watch"
"k8s.io/kubectl/pkg/util/podutils"
"sigs.k8s.io/controller-runtime/pkg/client"
)

func WaitForAnyReadyPod(ctx context.Context, kubeClient client.WithWatch, opts ...client.ListOption) (*corev1.Pod, error) {
listerWatcher := NewListerWatcher(ctx, kubeClient, &corev1.PodList{}, opts...)
condition := func(event apiwatch.Event) (bool, error) {
pod, err := ParseWatchEventObject[*corev1.Pod](event)
if err != nil {
return false, err
}
return podutils.IsPodReady(pod), nil
}
event, err := watch.UntilWithSync(ctx, listerWatcher, &corev1.Pod{}, nil, condition)
if err != nil {
return nil, err
}
return event.Object.(*corev1.Pod), nil
}
Loading