Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release-4.16] OCPBUGS-49808: sriov-network-metrics-exporter fix for multiple nodes #1054

Open
wants to merge 7 commits into
base: release-4.16
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bindata/manifests/metrics-exporter/metrics-daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ spec:
readOnly: true
nodeSelector:
{{- range $key, $value := .NodeSelectorField }}
{{ $key }}: {{ $value }}
{{ $key }}: "{{ $value }}"
{{- end }}
restartPolicy: Always
volumes:
Expand Down
38 changes: 38 additions & 0 deletions bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
---
{{ if and .IsPrometheusOperatorInstalled .PrometheusOperatorDeployRules }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: sriov-vf-rules
namespace: {{.Namespace}}
spec:
groups:
- name: sriov-network-metrics-operator.rules
interval: 30s
rules:
- expr: |
sriov_vf_tx_packets * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_packets
- expr: |
sriov_vf_rx_packets * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_packets
- expr: |
sriov_vf_tx_bytes * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_bytes
- expr: |
sriov_vf_rx_bytes * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_bytes
- expr: |
sriov_vf_tx_dropped * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_dropped
- expr: |
sriov_vf_rx_dropped * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_dropped
- expr: |
sriov_vf_rx_broadcast * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_broadcast
- expr: |
sriov_vf_rx_multicast * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_multicast
{{ end }}

11 changes: 11 additions & 0 deletions bindata/manifests/metrics-exporter/metrics-prometheus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,17 @@ spec:
bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token"
scheme: "https"
honorLabels: true
relabelings:
- action: replace
sourceLabels:
- __meta_kubernetes_endpoint_node_name
targetLabel: node
- action: labeldrop
regex: pod
- action: labeldrop
regex: container
- action: labeldrop
regex: namespace
tlsConfig:
serverName: sriov-network-metrics-exporter-service.{{.Namespace}}.svc
caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,8 @@ spec:
value: metrics-exporter-cert
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED
value: "true"
- name: METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES
value: "true"
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE
value: openshift-monitoring
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT
Expand Down Expand Up @@ -553,6 +555,8 @@ spec:
verbs:
- get
- create
- update
- delete
- apiGroups:
- apps
resourceNames:
Expand Down
2 changes: 2 additions & 0 deletions config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ spec:
value: metrics-exporter-cert
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED
value: "true"
- name: METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES
value: "true"
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE
value: openshift-monitoring
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT
Expand Down
2 changes: 2 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ rules:
verbs:
- get
- create
- update
- delete
- apiGroups:
- apps
resourceNames:
Expand Down
1 change: 1 addition & 0 deletions controllers/sriovoperatorconfig_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ func (r *SriovOperatorConfigReconciler) syncMetricsExporter(ctx context.Context,
data.Data["IsOpenshift"] = r.PlatformHelper.IsOpenshiftCluster()

data.Data["IsPrometheusOperatorInstalled"] = strings.ToLower(os.Getenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED")) == trueString
data.Data["PrometheusOperatorDeployRules"] = strings.ToLower(os.Getenv("METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES")) == trueString
data.Data["PrometheusOperatorServiceAccount"] = os.Getenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT")
data.Data["PrometheusOperatorNamespace"] = os.Getenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE")

Expand Down
71 changes: 55 additions & 16 deletions controllers/sriovoperatorconfig_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -242,12 +242,9 @@ var _ = Describe("SriovOperatorConfig controller", Ordered, func() {

It("should be able to update the node selector of sriov-network-config-daemon", func() {
By("specify the configDaemonNodeSelector")
config := &sriovnetworkv1.SriovOperatorConfig{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: testNamespace, Name: "default"}, config)).NotTo(HaveOccurred())

config.Spec.ConfigDaemonNodeSelector = map[string]string{"node-role.kubernetes.io/worker": ""}
err := k8sClient.Update(ctx, config)
Expect(err).NotTo(HaveOccurred())
nodeSelector := map[string]string{"node-role.kubernetes.io/worker": ""}
restore := updateConfigDaemonNodeSelector(nodeSelector)
DeferCleanup(restore)

daemonSet := &appsv1.DaemonSet{}
Eventually(func() map[string]string {
Expand All @@ -257,19 +254,17 @@ var _ = Describe("SriovOperatorConfig controller", Ordered, func() {
return nil
}
return daemonSet.Spec.Template.Spec.NodeSelector
}, util.APITimeout, util.RetryInterval).Should(Equal(config.Spec.ConfigDaemonNodeSelector))
}, util.APITimeout, util.RetryInterval).Should(Equal(nodeSelector))
})

It("should be able to do multiple updates to the node selector of sriov-network-config-daemon", func() {
By("changing the configDaemonNodeSelector")
config := &sriovnetworkv1.SriovOperatorConfig{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: testNamespace, Name: "default"}, config)).NotTo(HaveOccurred())
config.Spec.ConfigDaemonNodeSelector = map[string]string{"labelA": "", "labelB": "", "labelC": ""}
err := k8sClient.Update(ctx, config)
Expect(err).NotTo(HaveOccurred())
config.Spec.ConfigDaemonNodeSelector = map[string]string{"labelA": "", "labelB": ""}
err = k8sClient.Update(ctx, config)
Expect(err).NotTo(HaveOccurred())
firstNodeSelector := map[string]string{"labelA": "", "labelB": "", "labelC": ""}
restore := updateConfigDaemonNodeSelector(firstNodeSelector)
DeferCleanup(restore)

secondNodeSelector := map[string]string{"labelA": "", "labelB": ""}
updateConfigDaemonNodeSelector(secondNodeSelector)

daemonSet := &appsv1.DaemonSet{}
Eventually(func() map[string]string {
Expand All @@ -278,7 +273,7 @@ var _ = Describe("SriovOperatorConfig controller", Ordered, func() {
return nil
}
return daemonSet.Spec.Template.Spec.NodeSelector
}, util.APITimeout, util.RetryInterval).Should(Equal(config.Spec.ConfigDaemonNodeSelector))
}, util.APITimeout, util.RetryInterval).Should(Equal(secondNodeSelector))
})

It("should not render disable-plugins cmdline flag of sriov-network-config-daemon if disablePlugin not provided in spec", func() {
Expand Down Expand Up @@ -381,9 +376,28 @@ var _ = Describe("SriovOperatorConfig controller", Ordered, func() {
Expect(err).ToNot(HaveOccurred())
})

It("should deploy the sriov-network-metrics-exporter using the Spec.ConfigDaemonNodeSelector field", func() {
nodeSelector := map[string]string{
"node-role.kubernetes.io/worker": "",
"bool-key": "true",
}

restore := updateConfigDaemonNodeSelector(nodeSelector)
DeferCleanup(restore)

Eventually(func(g Gomega) {
metricsDaemonset := appsv1.DaemonSet{}
err := util.WaitForNamespacedObject(&metricsDaemonset, k8sClient, testNamespace, "sriov-network-metrics-exporter", util.RetryInterval, util.APITimeout)
g.Expect(err).NotTo(HaveOccurred())
g.Expect(metricsDaemonset.Spec.Template.Spec.NodeSelector).To((Equal(nodeSelector)))
}).Should(Succeed())
})

It("should deploy extra configuration when the Prometheus operator is installed", func() {
DeferCleanup(os.Setenv, "METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED", os.Getenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED"))
os.Setenv("METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED", "true")
DeferCleanup(os.Setenv, "METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES", os.Getenv("METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES"))
os.Setenv("METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES", "true")

err := util.WaitForNamespacedObject(&rbacv1.Role{}, k8sClient, testNamespace, "prometheus-k8s", util.RetryInterval, util.APITimeout)
Expect(err).ToNot(HaveOccurred())
Expand All @@ -398,6 +412,14 @@ var _ = Describe("SriovOperatorConfig controller", Ordered, func() {
Version: "v1",
},
client.ObjectKey{Namespace: testNamespace, Name: "sriov-network-metrics-exporter"})

assertResourceExists(
schema.GroupVersionKind{
Group: "monitoring.coreos.com",
Kind: "PrometheusRule",
Version: "v1",
},
client.ObjectKey{Namespace: testNamespace, Name: "sriov-vf-rules"})
})
})
})
Expand Down Expand Up @@ -544,3 +566,20 @@ func assertResourceDoesNotExist(gvk schema.GroupVersionKind, key client.ObjectKe
WithTimeout(2*time.Second).
Should(Succeed(), "Resource type[%s] name[%s] still present in the cluster", gvk.String(), key.String())
}

func updateConfigDaemonNodeSelector(newValue map[string]string) func() {
config := &sriovnetworkv1.SriovOperatorConfig{}
err := k8sClient.Get(context.Background(), types.NamespacedName{Namespace: testNamespace, Name: "default"}, config)
Expect(err).NotTo(HaveOccurred())

previousValue := config.Spec.ConfigDaemonNodeSelector
ret := func() {
updateConfigDaemonNodeSelector(previousValue)
}

config.Spec.ConfigDaemonNodeSelector = newValue
err = k8sClient.Update(context.Background(), config)
Expect(err).NotTo(HaveOccurred())

return ret
}
2 changes: 2 additions & 0 deletions deploy/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ spec:
value: $METRICS_EXPORTER_KUBE_RBAC_PROXY_IMAGE
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED
value: "$METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED"
- name: METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES
value: "$METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES"
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT
value: $METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE
Expand Down
3 changes: 3 additions & 0 deletions deploy/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,12 @@ rules:
- monitoring.coreos.com
resources:
- servicemonitors
- prometheusrules
verbs:
- get
- create
- update
- delete
- apiGroups:
- apps
resourceNames:
Expand Down
1 change: 1 addition & 0 deletions deployment/sriov-network-operator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ We have introduced the following Chart parameters.
| `operator.metricsExporter.prometheusOperator.enabled` | bool | false | Wheter the operator shoud configure Prometheus resources or not (e.g. `ServiceMonitors`). |
| `operator.metricsExporter.prometheusOperator.serviceAccount` | string | `prometheus-k8s` | The service account used by the Prometheus Operator. This is used to give Prometheus the permission to list resource in the SR-IOV operator namespace |
| `operator.metricsExporter.prometheusOperator.namespace` | string | `monitoring` | The namespace where the Prometheus Operator is installed. Setting this variable makes the operator deploy `monitoring.coreos.com` resources. |
| `operator.metricsExporter.prometheusOperator.deployRules` | bool | false | Whether the operator should deploy `PrometheusRules` to scrape namespace version of metrics. |

#### Admission Controllers parameters

Expand Down
2 changes: 2 additions & 0 deletions deployment/sriov-network-operator/templates/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ spec:
{{- if .Values.operator.metricsExporter.prometheusOperator.enabled }}
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED
value: {{ .Values.operator.metricsExporter.prometheusOperator.enabled | quote}}
- name: METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES
value: {{ .Values.operator.metricsExporter.prometheusOperator.deployRules | quote}}
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT
value: {{ .Values.operator.metricsExporter.prometheusOperator.serviceAccount }}
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE
Expand Down
3 changes: 3 additions & 0 deletions deployment/sriov-network-operator/templates/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,12 @@ rules:
- monitoring.coreos.com
resources:
- servicemonitors
- prometheusrules
verbs:
- get
- create
- update
- delete
- apiGroups:
- apps
resourceNames:
Expand Down
1 change: 1 addition & 0 deletions deployment/sriov-network-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ operator:
enabled: false
serviceAccount: "prometheus-k8s"
namespace: "monitoring"
deployRules: false
admissionControllers:
enabled: false
certificates:
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ require (
github.com/pkg/errors v0.9.1
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.68.0
github.com/prometheus-operator/prometheus-operator/pkg/client v0.68.0
github.com/prometheus/client_model v0.5.0
github.com/prometheus/common v0.45.0
github.com/safchain/ethtool v0.3.0
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.8.4
Expand Down Expand Up @@ -115,8 +117,6 @@ require (
github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_golang v1.17.0 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/common v0.45.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
github.com/robfig/cron v1.2.0 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
Expand Down
3 changes: 2 additions & 1 deletion hack/run-e2e-conformance-virtual-ocp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,9 @@ export DEV_MODE=TRUE
export CLUSTER_HAS_EMULATED_PF=TRUE
export OPERATOR_LEADER_ELECTION_ENABLE=true
export METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED=true
export METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES=true
export METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT=${METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT:-"prometheus-k8s"}
export METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE=${METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE:-"openshfit-monitoring"}
export METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE=${METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE:-"openshift-monitoring"}

export SRIOV_NETWORK_OPERATOR_IMAGE="$registry/$NAMESPACE/sriov-network-operator:latest"
export SRIOV_NETWORK_CONFIG_DAEMON_IMAGE="$registry/$NAMESPACE/sriov-network-config-daemon:latest"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,8 @@ spec:
value: metrics-exporter-cert
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED
value: "true"
- name: METRICS_EXPORTER_PROMETHEUS_DEPLOY_RULES
value: "true"
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_NAMESPACE
value: openshift-monitoring
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_SERVICE_ACCOUNT
Expand Down Expand Up @@ -553,6 +555,8 @@ spec:
verbs:
- get
- create
- update
- delete
- apiGroups:
- apps
resourceNames:
Expand Down
Loading