From 60acf405634d9459d5eac0ad5d8dd7500a9d795d Mon Sep 17 00:00:00 2001 From: Luke Addison Date: Sun, 7 Jan 2024 20:09:37 +0000 Subject: [PATCH 1/7] Test Prometheus metric collection --- e2e/config/kube-prometheus-stack-values.yaml | 36 +++++++++++ e2e/config/prometheus.yaml | 68 ++++++++++++++++++++ e2e/main_test.go | 42 ++++++++++-- e2e/spot_migrator_test.go | 10 ++- 4 files changed, 149 insertions(+), 7 deletions(-) create mode 100644 e2e/config/kube-prometheus-stack-values.yaml create mode 100644 e2e/config/prometheus.yaml diff --git a/e2e/config/kube-prometheus-stack-values.yaml b/e2e/config/kube-prometheus-stack-values.yaml new file mode 100644 index 0000000..a77abae --- /dev/null +++ b/e2e/config/kube-prometheus-stack-values.yaml @@ -0,0 +1,36 @@ +global: + rbac: + create: true + pspEnabled: false +fullnameOverride: prometheus +defaultRules: + create: false +alertmanager: + enabled: false +grafana: + enabled: false +kubeApiServer: + enabled: false +kubelet: + enabled: false +kubeControllerManager: + enabled: false +coreDns: + enabled: false +kubeEtcd: + enabled: false +kubeScheduler: + enabled: false +kubeProxy: + enabled: false +kubeStateMetrics: + enabled: false +nodeExporter: + enabled: false +prometheus: + enabled: false +prometheusOperator: + prometheusInstanceNamespaces: + - monitoring + serviceMonitor: + selfMonitor: false diff --git a/e2e/config/prometheus.yaml b/e2e/config/prometheus.yaml new file mode 100644 index 0000000..be27618 --- /dev/null +++ b/e2e/config/prometheus.yaml @@ -0,0 +1,68 @@ +# https://github.com/prometheus-operator/prometheus-operator/tree/48d3604507e082f4187f39edda9bc22935881a14/example/rbac/prometheus +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: +- apiGroups: + - "" + resources: + - nodes + - nodes/metrics + - services + - endpoints + - pods + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch +- nonResourceURLs: + - /metrics + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: +- kind: ServiceAccount + name: prometheus + namespace: monitoring +--- +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + name: prometheus + namespace: monitoring +spec: + serviceAccountName: prometheus + # Watch for all PrometheusRules and PodMonitors + ruleSelector: {} + ruleNamespaceSelector: {} + podMonitorSelector: {} + podMonitorNamespaceSelector: {} diff --git a/e2e/main_test.go b/e2e/main_test.go index aa3af24..4f8e3c6 100644 --- a/e2e/main_test.go +++ b/e2e/main_test.go @@ -85,10 +85,13 @@ func setup(ctx context.Context, image string) error { } // Install CRDs - err = runCommand("kubectl", "apply", - "-f", "https://raw.githubusercontent.com/kubernetes/autoscaler/5469d7912072c1070eedc680c89e27d46b8f4f82/vertical-pod-autoscaler/deploy/vpa-v1-crd-gen.yaml", - "-f", "https://raw.githubusercontent.com/prometheus-community/helm-charts/d616961860a0248f77a2783923511550fad23569/charts/kube-prometheus-stack/charts/crds/crds/crd-prometheusrules.yaml", - "-f", "https://raw.githubusercontent.com/prometheus-community/helm-charts/d616961860a0248f77a2783923511550fad23569/charts/kube-prometheus-stack/charts/crds/crds/crd-podmonitors.yaml") + err = runCommand("kubectl", "apply", "-f", "https://raw.githubusercontent.com/kubernetes/autoscaler/5469d7912072c1070eedc680c89e27d46b8f4f82/vertical-pod-autoscaler/deploy/vpa-v1-crd-gen.yaml") + if err != nil { + return err + } + + // Install Prometheus Operator and Prometheus + err = installPrometheus(ctx, image) if err != nil { return err } @@ -249,6 +252,37 @@ podMonitor: return kubernetes.WaitUntilDeploymentAvailable(ctx, kubeClient, "cost-manager", "cost-manager") } +func installPrometheus(ctx context.Context, image string) (rerr error) { + // Add prometheus-community Helm repository: + // https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack#get-helm-repository-info + err := runCommand("helm", "repo", "add", "prometheus-community", "https://prometheus-community.github.io/helm-charts") + if err != nil { + return err + } + err = runCommand("helm", "repo", "update", "prometheus-community") + if err != nil { + return err + } + + // Install Prometheus Operator + err = runCommand("helm", "upgrade", "--install", + "kube-prometheus-stack", "prometheus-community/kube-prometheus-stack", + "--namespace", "monitoring", "--create-namespace", + "--values", "./config/kube-prometheus-stack-values.yaml", + "--wait", "--timeout", "2m") + if err != nil { + return err + } + + // Install Prometheus + err = runCommand("kubectl", "apply", "-f", "./config/prometheus.yaml") + if err != nil { + return err + } + + return nil +} + func teardown() error { err := runCommand("kind", "delete", "cluster", "--name", kindClusterName) if err != nil { diff --git a/e2e/spot_migrator_test.go b/e2e/spot_migrator_test.go index bfa506f..5c6a929 100644 --- a/e2e/spot_migrator_test.go +++ b/e2e/spot_migrator_test.go @@ -102,10 +102,9 @@ func TestSpotMigrator(t *testing.T) { err = kubeClient.Patch(ctx, node, client.RawPatch(types.StrategicMergePatchType, patch)) require.Nil(t, err) - // Wait for the Node to be marked as unschedulable + // Wait for the Node to be marked as unschedulable. This should not take longer than 2 minutes + // since spot-migrator is configured with a 1 minute migration interval t.Logf("Waiting for Node %s to be marked as unschedulable...", nodeName) - // spot-migrator is configured with a 1 minute migration interval so this should not take longer - // than 2 minutes ctxWithTimeout, cancel := context.WithTimeout(ctx, 2*time.Minute) defer cancel() listerWatcher = kubernetes.NewListerWatcher(ctx, kubeClient, &corev1.NodeList{}) @@ -155,6 +154,11 @@ func TestSpotMigrator(t *testing.T) { require.False(t, node.Spec.Unschedulable) } + // Delete Node; typically this would be done by the node-controller: + // https://github.com/hsbc/cost-manager/blob/bf176ada100e19a765d276aee1a0a2d6038275e0/pkg/controller/spot_migrator.go#L242-L250 + err = kubeClient.Delete(ctx, node) + require.Nil(t, err) + // Delete Namespace err = kubeClient.Delete(ctx, namespace) require.Nil(t, err) From e6176378d6bf972956165f113563745294a33a9c Mon Sep 17 00:00:00 2001 From: 45132765 Date: Sun, 7 Jan 2024 20:11:17 +0000 Subject: [PATCH 2/7] Implement Go port-forwarding --- go.mod | 3 +- go.sum | 6 ++- pkg/kubernetes/port_forward.go | 79 ++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 3 deletions(-) create mode 100644 pkg/kubernetes/port_forward.go diff --git a/go.mod b/go.mod index 7c5b3f9..b09d21f 100644 --- a/go.mod +++ b/go.mod @@ -4,12 +4,12 @@ go 1.21.3 require ( github.com/go-logr/logr v1.3.0 + github.com/hashicorp/go-multierror v1.1.1 github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.17.0 github.com/robfig/cron/v3 v3.0.1 github.com/stretchr/testify v1.8.4 google.golang.org/api v0.149.0 - gopkg.in/robfig/cron.v2 v2.0.0-20150107220207-be2e0b0deed5 k8s.io/api v0.29.0 k8s.io/apimachinery v0.29.0 k8s.io/client-go v0.29.0 @@ -67,6 +67,7 @@ require ( github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 // indirect github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 // indirect + github.com/hashicorp/errwrap v1.0.0 // indirect github.com/imdario/mergo v0.3.13 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect diff --git a/go.sum b/go.sum index 8202bc3..fb8123f 100644 --- a/go.sum +++ b/go.sum @@ -157,6 +157,10 @@ github.com/grpc-ecosystem/grpc-gateway v1.16.0 h1:gmcG1KaJ57LophUzW0Hy8NmPhnMZb4 github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 h1:YBftPWNWd4WwGqtY2yeZL2ef8rHAxPBD8KFhJpmcqms= github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0/go.mod h1:YN5jB8ie0yfIUg6VvR9Kz84aCaG7AsGZnLjhHbUqwPg= +github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA= +github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= +github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= github.com/imdario/mergo v0.3.13 h1:lFzP57bqS/wsqKssCGmtLAb8A0wKjLGrve2q3PPVcBk= github.com/imdario/mergo v0.3.13/go.mod h1:4lJ1jqUDcsbIECGy0RUJAXNIhg+6ocWgb1ALK2O4oXg= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= @@ -438,8 +442,6 @@ gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= -gopkg.in/robfig/cron.v2 v2.0.0-20150107220207-be2e0b0deed5 h1:E846t8CnR+lv5nE+VuiKTDG/v1U2stad0QzddfJC7kY= -gopkg.in/robfig/cron.v2 v2.0.0-20150107220207-be2e0b0deed5/go.mod h1:hiOFpYm0ZJbusNj2ywpbrXowU3G8U6GIQzqn2mw1UIE= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/pkg/kubernetes/port_forward.go b/pkg/kubernetes/port_forward.go new file mode 100644 index 0000000..7c01d6a --- /dev/null +++ b/pkg/kubernetes/port_forward.go @@ -0,0 +1,79 @@ +package kubernetes + +import ( + "context" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "strings" + + "github.com/hashicorp/go-multierror" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/portforward" + "k8s.io/client-go/transport/spdy" +) + +// PortForward port forwards to the specified Pod in the background. The forwarded port is a random +// available local port which is returned as well as a function to close the listener when finished +func PortForward(ctx context.Context, restConfig *rest.Config, podNamespace, podName string, port int) (uint16, func() error, error) { + stopChan, readyChan, errChan := make(chan struct{}, 1), make(chan struct{}, 1), make(chan error, 1) + forwarder, err := createForwarder(ctx, restConfig, stopChan, readyChan, podNamespace, podName, port) + if err != nil { + return 0, nil, err + } + go func() { + errChan <- forwarder.ForwardPorts() + }() + // Wait for port forward to be ready or fail + select { + case <-readyChan: + case err := <-errChan: + if err != nil { + return 0, nil, err + } + return 0, nil, errors.New("port forward finished") + } + // Create function for the caller to finish port forwarding + close := func() error { + // Make sure any started listeners are stopped... + close(stopChan) + // ...and wait for the port forward to finish + return <-errChan + } + forwardedPorts, err := forwarder.GetPorts() + if err != nil { + return 0, nil, multierror.Append(err, close()) + } + if len(forwardedPorts) != 1 { + err := fmt.Errorf("unexpected number of forwarded ports: %d", len(forwardedPorts)) + return 0, nil, multierror.Append(err, close()) + } + return forwardedPorts[0].Local, close, nil +} + +func createForwarder(ctx context.Context, restConfig *rest.Config, stopChan, readyChan chan struct{}, podNamespace, podName string, port int) (*portforward.PortForwarder, error) { + // Discard output to avoid race conditions + out, errOut := io.Discard, io.Discard + + roundTripper, upgrader, err := spdy.RoundTripperFor(restConfig) + if err != nil { + return nil, err + } + + path := fmt.Sprintf("/api/v1/namespaces/%s/pods/%s/portforward", podNamespace, podName) + hostIP := strings.TrimLeft(restConfig.Host, "htps:/") + serverURL := url.URL{Scheme: "https", Path: path, Host: hostIP} + + dialer := spdy.NewDialer(upgrader, &http.Client{Transport: roundTripper}, http.MethodPost, &serverURL) + // Listen on a random available local port to avoid collisions: + // https://github.com/kubernetes/client-go/blob/86d49e7265f07676cb39f342595a858b032112de/tools/portforward/portforward.go#L75 + forwarderPort := fmt.Sprintf(":%d", port) + forwarder, err := portforward.New(dialer, []string{forwarderPort}, stopChan, readyChan, out, errOut) + if err != nil { + return nil, err + } + + return forwarder, nil +} From 070b36516e174049413a0fe1d516dd3c1977fef4 Mon Sep 17 00:00:00 2001 From: Luke Addison Date: Sun, 7 Jan 2024 20:40:04 +0000 Subject: [PATCH 3/7] Wait for successful Prometheus metric --- e2e/spot_migrator_test.go | 40 ++++++++++++++++++++++++++++++++++++++- go.mod | 2 +- go.sum | 4 ++++ pkg/kubernetes/pod.go | 27 ++++++++++++++++++++++++++ 4 files changed, 71 insertions(+), 2 deletions(-) create mode 100644 pkg/kubernetes/pod.go diff --git a/e2e/spot_migrator_test.go b/e2e/spot_migrator_test.go index 5c6a929..5220487 100644 --- a/e2e/spot_migrator_test.go +++ b/e2e/spot_migrator_test.go @@ -9,6 +9,9 @@ import ( cloudproviderfake "github.com/hsbc/cost-manager/pkg/cloudprovider/fake" "github.com/hsbc/cost-manager/pkg/kubernetes" "github.com/hsbc/cost-manager/pkg/test" + "github.com/prometheus/client_golang/api" + prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/model" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" policyv1 "k8s.io/api/policy/v1" @@ -154,11 +157,46 @@ func TestSpotMigrator(t *testing.T) { require.False(t, node.Spec.Unschedulable) } - // Delete Node; typically this would be done by the node-controller: + // Delete Node; typically this would be done by the node-controller but we simulate it here: // https://github.com/hsbc/cost-manager/blob/bf176ada100e19a765d276aee1a0a2d6038275e0/pkg/controller/spot_migrator.go#L242-L250 err = kubeClient.Delete(ctx, node) require.Nil(t, err) + // Wait for Prometheus metric to indicate successful migration + t.Logf("Waiting for successful Prometheus metric...") + pod, err := kubernetes.WaitForAnyReadyPod(ctx, kubeClient, client.InNamespace("monitoring"), client.MatchingLabels{"app.kubernetes.io/name": "prometheus"}) + require.Nil(t, err) + // Port forward to Prometheus in the background + forwardedPort, close, err := kubernetes.PortForward(ctx, config.GetConfigOrDie(), pod.Namespace, pod.Name, 9090) + require.Nil(t, err) + defer func() { + err := close() + require.Nil(t, err) + }() + // Setup Prometheus client using local port forwarded port + prometheusAddress := fmt.Sprintf("http://127.0.0.1:%d", forwardedPort) + prometheusClient, err := api.NewClient(api.Config{ + Address: prometheusAddress, + }) + prometheusAPI := prometheusv1.NewAPI(prometheusClient) + for { + results, _, err := prometheusAPI.Query(ctx, "cost_manager_spot_migrator_operation_success_total", time.Now()) + require.Nil(t, err) + // Any result with a value greater than 0 indicates migration success + migrationSuccess := false + for _, result := range results.(model.Vector) { + if result.Value > 0 { + migrationSuccess = true + break + } + } + if migrationSuccess { + break + } + time.Sleep(time.Second) + } + t.Logf("Found successful Prometheus metric!") + // Delete Namespace err = kubeClient.Delete(ctx, namespace) require.Nil(t, err) diff --git a/go.mod b/go.mod index b09d21f..5b57e42 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/hashicorp/go-multierror v1.1.1 github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.17.0 + github.com/prometheus/common v0.45.0 github.com/robfig/cron/v3 v3.0.1 github.com/stretchr/testify v1.8.4 google.golang.org/api v0.149.0 @@ -86,7 +87,6 @@ require ( github.com/peterbourgon/diskv v2.0.1+incompatible // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 // indirect - github.com/prometheus/common v0.45.0 // indirect github.com/prometheus/procfs v0.11.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/spf13/cobra v1.7.0 // indirect diff --git a/go.sum b/go.sum index fb8123f..77251a5 100644 --- a/go.sum +++ b/go.sum @@ -169,6 +169,8 @@ github.com/jonboulle/clockwork v0.2.2 h1:UOGuzwb1PwsrDAObMuhUnj0p5ULPj8V/xJ7Kx9q github.com/jonboulle/clockwork v0.2.2/go.mod h1:Pkfl5aHPm1nk2H9h0bjmnJD/BcgbGXUBGnn1kMkgxc8= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= +github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= @@ -203,6 +205,8 @@ github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 h1:n6/ github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod h1:Pm3mSP3c5uWn86xMLZ5Sa7JB9GsEZySvHYXCTK4E9q4= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= github.com/nxadm/tail v1.4.4 h1:DQuhQpB1tVlglWS2hLQ5OV6B5r8aGxSrPc5Qo6uTN78= diff --git a/pkg/kubernetes/pod.go b/pkg/kubernetes/pod.go new file mode 100644 index 0000000..f9f822b --- /dev/null +++ b/pkg/kubernetes/pod.go @@ -0,0 +1,27 @@ +package kubernetes + +import ( + "context" + + corev1 "k8s.io/api/core/v1" + apiwatch "k8s.io/apimachinery/pkg/watch" + "k8s.io/client-go/tools/watch" + "k8s.io/kubectl/pkg/util/podutils" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +func WaitForAnyReadyPod(ctx context.Context, kubeClient client.WithWatch, opts ...client.ListOption) (*corev1.Pod, error) { + listerWatcher := NewListerWatcher(ctx, kubeClient, &corev1.PodList{}, opts...) + condition := func(event apiwatch.Event) (bool, error) { + pod, err := ParseWatchEventObject[*corev1.Pod](event) + if err != nil { + return false, err + } + return podutils.IsPodReady(pod), nil + } + event, err := watch.UntilWithSync(ctx, listerWatcher, &corev1.Pod{}, nil, condition) + if err != nil { + return nil, err + } + return event.Object.(*corev1.Pod), nil +} From 1a9567018a73eeeb3d22796d0f4afa2192290db3 Mon Sep 17 00:00:00 2001 From: Luke Addison Date: Sun, 7 Jan 2024 20:44:04 +0000 Subject: [PATCH 4/7] Fix lint issues --- e2e/spot_migrator_test.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/e2e/spot_migrator_test.go b/e2e/spot_migrator_test.go index 5220487..37e9807 100644 --- a/e2e/spot_migrator_test.go +++ b/e2e/spot_migrator_test.go @@ -157,7 +157,7 @@ func TestSpotMigrator(t *testing.T) { require.False(t, node.Spec.Unschedulable) } - // Delete Node; typically this would be done by the node-controller but we simulate it here: + // Delete Node; typically this would be done by the node controller but we simulate it here: // https://github.com/hsbc/cost-manager/blob/bf176ada100e19a765d276aee1a0a2d6038275e0/pkg/controller/spot_migrator.go#L242-L250 err = kubeClient.Delete(ctx, node) require.Nil(t, err) @@ -173,11 +173,12 @@ func TestSpotMigrator(t *testing.T) { err := close() require.Nil(t, err) }() - // Setup Prometheus client using local port forwarded port + // Setup Prometheus client using local forwarded port prometheusAddress := fmt.Sprintf("http://127.0.0.1:%d", forwardedPort) prometheusClient, err := api.NewClient(api.Config{ Address: prometheusAddress, }) + require.Nil(t, err) prometheusAPI := prometheusv1.NewAPI(prometheusClient) for { results, _, err := prometheusAPI.Query(ctx, "cost_manager_spot_migrator_operation_success_total", time.Now()) From 9ad6dca8ef892f1a9b15f553c78a0664e1f1a81a Mon Sep 17 00:00:00 2001 From: Luke Addison Date: Sun, 7 Jan 2024 20:51:11 +0000 Subject: [PATCH 5/7] Wait for metric increase --- e2e/spot_migrator_test.go | 58 +++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/e2e/spot_migrator_test.go b/e2e/spot_migrator_test.go index 37e9807..29cb755 100644 --- a/e2e/spot_migrator_test.go +++ b/e2e/spot_migrator_test.go @@ -139,31 +139,13 @@ func TestSpotMigrator(t *testing.T) { require.Nil(t, err) t.Logf("Deployment %s/%s is unavailable!", deployment.Namespace, deployment.Name) - // Verify that all control plane Nodes are schedulable - controlPlaneNodeSelector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: "node-role.kubernetes.io/control-plane", - Operator: "Exists", - }, - }, - }) - require.Nil(t, err) - nodeList = &corev1.NodeList{} - err = kubeClient.List(ctx, nodeList, client.MatchingLabelsSelector{Selector: controlPlaneNodeSelector}) - require.Nil(t, err) - require.Greater(t, len(nodeList.Items), 0) - for _, node := range nodeList.Items { - require.False(t, node.Spec.Unschedulable) - } - // Delete Node; typically this would be done by the node controller but we simulate it here: // https://github.com/hsbc/cost-manager/blob/bf176ada100e19a765d276aee1a0a2d6038275e0/pkg/controller/spot_migrator.go#L242-L250 err = kubeClient.Delete(ctx, node) require.Nil(t, err) // Wait for Prometheus metric to indicate successful migration - t.Logf("Waiting for successful Prometheus metric...") + t.Logf("Waiting for Prometheus metric to indicate successful migration...") pod, err := kubernetes.WaitForAnyReadyPod(ctx, kubeClient, client.InNamespace("monitoring"), client.MatchingLabels{"app.kubernetes.io/name": "prometheus"}) require.Nil(t, err) // Port forward to Prometheus in the background @@ -180,25 +162,41 @@ func TestSpotMigrator(t *testing.T) { }) require.Nil(t, err) prometheusAPI := prometheusv1.NewAPI(prometheusClient) + // Wait for the number of successful operations to increase + results, _, err := prometheusAPI.Query(ctx, `sum(cost_manager_spot_migrator_operation_success_total{job="cost-manager",namespace="cost-manager"})`, time.Now()) + require.Nil(t, err) + require.Equal(t, 1, len(results.(model.Vector))) + currentMetricValue := results.(model.Vector)[0].Value for { - results, _, err := prometheusAPI.Query(ctx, "cost_manager_spot_migrator_operation_success_total", time.Now()) + results, _, err := prometheusAPI.Query(ctx, `sum(cost_manager_spot_migrator_operation_success_total{job="cost-manager",namespace="cost-manager"})`, time.Now()) require.Nil(t, err) - // Any result with a value greater than 0 indicates migration success - migrationSuccess := false - for _, result := range results.(model.Vector) { - if result.Value > 0 { - migrationSuccess = true - break - } - } - if migrationSuccess { + require.Equal(t, 1, len(results.(model.Vector))) + if results.(model.Vector)[0].Value > currentMetricValue { break } time.Sleep(time.Second) } - t.Logf("Found successful Prometheus metric!") + t.Logf("Migration successful!") // Delete Namespace err = kubeClient.Delete(ctx, namespace) require.Nil(t, err) + + // Verify that all control plane Nodes are schedulable + controlPlaneNodeSelector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "node-role.kubernetes.io/control-plane", + Operator: "Exists", + }, + }, + }) + require.Nil(t, err) + nodeList = &corev1.NodeList{} + err = kubeClient.List(ctx, nodeList, client.MatchingLabelsSelector{Selector: controlPlaneNodeSelector}) + require.Nil(t, err) + require.Greater(t, len(nodeList.Items), 0) + for _, node := range nodeList.Items { + require.False(t, node.Spec.Unschedulable) + } } From 39d9eb057b1a8fc81467000f365d81200827964e Mon Sep 17 00:00:00 2001 From: Luke Addison Date: Sun, 7 Jan 2024 21:03:30 +0000 Subject: [PATCH 6/7] Fix metric retrieval --- e2e/config/prometheus.yaml | 1 + e2e/spot_migrator_test.go | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/e2e/config/prometheus.yaml b/e2e/config/prometheus.yaml index be27618..0359eb2 100644 --- a/e2e/config/prometheus.yaml +++ b/e2e/config/prometheus.yaml @@ -61,6 +61,7 @@ metadata: namespace: monitoring spec: serviceAccountName: prometheus + scrapeInterval: 5s # Watch for all PrometheusRules and PodMonitors ruleSelector: {} ruleNamespaceSelector: {} diff --git a/e2e/spot_migrator_test.go b/e2e/spot_migrator_test.go index 29cb755..2ac1fa5 100644 --- a/e2e/spot_migrator_test.go +++ b/e2e/spot_migrator_test.go @@ -162,11 +162,18 @@ func TestSpotMigrator(t *testing.T) { }) require.Nil(t, err) prometheusAPI := prometheusv1.NewAPI(prometheusClient) - // Wait for the number of successful operations to increase - results, _, err := prometheusAPI.Query(ctx, `sum(cost_manager_spot_migrator_operation_success_total{job="cost-manager",namespace="cost-manager"})`, time.Now()) - require.Nil(t, err) - require.Equal(t, 1, len(results.(model.Vector))) - currentMetricValue := results.(model.Vector)[0].Value + // Wait for the spot-migrator metric to be scraped by Prometheus... + var currentMetricValue model.SampleValue + for { + results, _, err := prometheusAPI.Query(ctx, `sum(cost_manager_spot_migrator_operation_success_total{job="cost-manager",namespace="cost-manager"})`, time.Now()) + require.Nil(t, err) + if len(results.(model.Vector)) == 1 { + currentMetricValue = results.(model.Vector)[0].Value + break + } + time.Sleep(time.Second) + } + // ...and then wait for it to increase for { results, _, err := prometheusAPI.Query(ctx, `sum(cost_manager_spot_migrator_operation_success_total{job="cost-manager",namespace="cost-manager"})`, time.Now()) require.Nil(t, err) From 126d886b83a94cc3114aa129e3accdc410b9e9aa Mon Sep 17 00:00:00 2001 From: Luke Addison Date: Sun, 7 Jan 2024 21:07:51 +0000 Subject: [PATCH 7/7] Reuse REST config --- e2e/spot_migrator_test.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/e2e/spot_migrator_test.go b/e2e/spot_migrator_test.go index 2ac1fa5..a4ecf75 100644 --- a/e2e/spot_migrator_test.go +++ b/e2e/spot_migrator_test.go @@ -29,7 +29,8 @@ func TestSpotMigrator(t *testing.T) { t.Parallel() ctx := context.Background() - kubeClient, err := client.NewWithWatch(config.GetConfigOrDie(), client.Options{}) + restConfig := config.GetConfigOrDie() + kubeClient, err := client.NewWithWatch(restConfig, client.Options{}) require.Nil(t, err) // Find a worker Node @@ -149,7 +150,7 @@ func TestSpotMigrator(t *testing.T) { pod, err := kubernetes.WaitForAnyReadyPod(ctx, kubeClient, client.InNamespace("monitoring"), client.MatchingLabels{"app.kubernetes.io/name": "prometheus"}) require.Nil(t, err) // Port forward to Prometheus in the background - forwardedPort, close, err := kubernetes.PortForward(ctx, config.GetConfigOrDie(), pod.Namespace, pod.Name, 9090) + forwardedPort, close, err := kubernetes.PortForward(ctx, restConfig, pod.Namespace, pod.Name, 9090) require.Nil(t, err) defer func() { err := close()