From 1af2d66bea3cc10f1c5303d3fdbc4d050a944f3b Mon Sep 17 00:00:00 2001 From: Antonin Bas Date: Fri, 14 Feb 2025 17:51:59 -0800 Subject: [PATCH] [SecondaryNetwork] Require resourceName annotations for SRIOV The "k8s.v1.cni.cncf.io/resourceName" annotation should be provided for all SRIOV NetworkAttachmentDefinitions. Otherwise, it is impossible to guarantee that the right device type is allocated for a given interface. Signed-off-by: Antonin Bas --- .../secondarynetwork/podwatch/controller.go | 33 ++++- .../podwatch/controller_test.go | 130 ++++++++++++++---- pkg/agent/secondarynetwork/podwatch/sriov.go | 64 +++++---- 3 files changed, 169 insertions(+), 58 deletions(-) diff --git a/pkg/agent/secondarynetwork/podwatch/controller.go b/pkg/agent/secondarynetwork/podwatch/controller.go index db9c576e614..1dcbfd449d1 100644 --- a/pkg/agent/secondarynetwork/podwatch/controller.go +++ b/pkg/agent/secondarynetwork/podwatch/controller.go @@ -57,6 +57,7 @@ const ( const ( networkAttachDefAnnotationKey = "k8s.v1.cni.cncf.io/networks" + resourceNameAnnotationKey = "k8s.v1.cni.cncf.io/resourceName" cniPath = "/opt/cni/bin/" startIfaceIndex = 1 endIfaceIndex = 101 @@ -324,8 +325,10 @@ func (pc *PodController) processNextWorkItem() bool { func (pc *PodController) configureSecondaryInterface( pod *corev1.Pod, network *netdefv1.NetworkSelectionElement, + resourceName string, podCNIInfo *podCNIInfo, - networkConfig *SecondaryNetworkConfig) error { + networkConfig *SecondaryNetworkConfig, +) error { var ipamResult *ipam.IPAMResult var ifConfigErr error if networkConfig.IPAM != nil { @@ -357,7 +360,7 @@ func (pc *PodController) configureSecondaryInterface( switch networkConfig.NetworkType { case sriovNetworkType: - ifConfigErr = pc.configureSriovAsSecondaryInterface(pod, network, podCNIInfo, int(networkConfig.MTU), &ipamResult.Result) + ifConfigErr = pc.configureSriovAsSecondaryInterface(pod, network, resourceName, podCNIInfo, int(networkConfig.MTU), &ipamResult.Result) case vlanNetworkType: if networkConfig.VLAN > 0 { // Let VLAN ID in the CNI network configuration override the IPPool subnet @@ -384,7 +387,7 @@ func (pc *PodController) configurePodSecondaryNetwork(pod *corev1.Pod, networkLi interfacesConfigured := 0 for _, network := range networkList { klog.V(2).InfoS("Secondary Network attached to Pod", "network", network, "Pod", klog.KObj(pod)) - netDefCRD, err := pc.netAttachDefClient.NetworkAttachmentDefinitions(network.Namespace).Get(context.TODO(), network.Name, metav1.GetOptions{}) + netAttachDef, err := pc.netAttachDefClient.NetworkAttachmentDefinitions(network.Namespace).Get(context.TODO(), network.Name, metav1.GetOptions{}) if err != nil { klog.ErrorS(err, "Failed to get NetworkAttachmentDefinition", "network", network, "Pod", klog.KRef(pod.Namespace, pod.Name)) @@ -392,7 +395,7 @@ func (pc *PodController) configurePodSecondaryNetwork(pod *corev1.Pod, networkLi continue } - cniConfig, err := netdefutils.GetCNIConfig(netDefCRD, "") + cniConfig, err := netdefutils.GetCNIConfig(netAttachDef, "") if err != nil { klog.ErrorS(err, "Failed to parse NetworkAttachmentDefinition", "network", network, "Pod", klog.KRef(pod.Namespace, pod.Name)) @@ -405,14 +408,30 @@ func (pc *PodController) configurePodSecondaryNetwork(pod *corev1.Pod, networkLi if networkConfig != nil && networkConfig.Type != cniserver.AntreaCNIType { // Ignore non-Antrea CNI type. klog.InfoS("Not Antrea CNI type in NetworkAttachmentDefinition, ignoring", - "NetworkAttachmentDefinition", klog.KObj(netDefCRD), "Pod", klog.KRef(pod.Namespace, pod.Name)) + "NetworkAttachmentDefinition", klog.KObj(netAttachDef), "Pod", klog.KRef(pod.Namespace, pod.Name)) } else { klog.ErrorS(err, "NetworkConfig validation failed", - "NetworkAttachmentDefinition", klog.KObj(netDefCRD), "Pod", klog.KRef(pod.Namespace, pod.Name)) + "NetworkAttachmentDefinition", klog.KObj(netAttachDef), "Pod", klog.KRef(pod.Namespace, pod.Name)) } continue } + var resourceName string + if networkConfig.NetworkType == sriovNetworkType { + v, ok := netAttachDef.Annotations[resourceNameAnnotationKey] + if !ok { + // This annotation is required for SRIOV devices, otherwise there is + // no way to make sure that we allocate the "right" type of device. + err := fmt.Errorf("missing annotation: %s", resourceNameAnnotationKey) + klog.ErrorS(err, "Invalid NetworkAttachmentDefinition", "NetworkAttachmentDefinition", klog.KObj(netAttachDef)) + // It is probably worth retrying as the NetworkAttachmentDefinition + // may eventually be updated with the missing annotation. + savedErr = err + continue + } + resourceName = v + } + // Generate a new interface name, if the secondary interface name was not provided in the // Pod annotation. if network.InterfaceRequest == "" { @@ -425,7 +444,7 @@ func (pc *PodController) configurePodSecondaryNetwork(pod *corev1.Pod, networkLi } // Secondary network information retrieved from API server. Proceed to configure secondary interface now. - if err = pc.configureSecondaryInterface(pod, network, podCNIInfo, networkConfig); err != nil { + if err = pc.configureSecondaryInterface(pod, network, resourceName, podCNIInfo, networkConfig); err != nil { klog.ErrorS(err, "Secondary interface configuration failed", "Pod", klog.KRef(pod.Namespace, pod.Name), "interface", network.InterfaceRequest, "networkType", networkConfig.NetworkType) diff --git a/pkg/agent/secondarynetwork/podwatch/controller_test.go b/pkg/agent/secondarynetwork/podwatch/controller_test.go index b65bb02aff0..08bbffaf79d 100644 --- a/pkg/agent/secondarynetwork/podwatch/controller_test.go +++ b/pkg/agent/secondarynetwork/podwatch/controller_test.go @@ -78,27 +78,34 @@ const ( "vlan": {{.VLAN}} }` - defaultCNIVersion = "0.3.0" - defaultMTU = 1500 - sriovDeviceID = "sriov-device-id" - podName = "pod1" - containerID = "container1" - podIP = "1.2.3.4" - networkName = "net" - interfaceName = "eth2" + defaultCNIVersion = "0.3.0" + defaultMTU = 1500 + sriovResourceName1 = "intel.com/intel_sriov_netdevice" + sriovResourceName2 = "mellanox.com/mlnx_connectx5" + sriovDeviceID11 = "sriov-device-id-11" + sriovDeviceID12 = "sriov-device-id-12" + sriovDeviceID21 = "sriov-device-id-21" + podName = "pod1" + containerID = "container1" + podIP = "1.2.3.4" + networkName = "net" + interfaceName = "eth2" ) func testNetwork(name string, networkType networkType) *netdefv1.NetworkAttachmentDefinition { - return testNetworkExt(name, "", "", string(networkType), "", 0, 0, false) + return testNetworkExt(name, "", "", networkType, "", "", 0, 0, false) } -func testNetworkExt(name, cniVersion, cniType, networkType, ipamType string, mtu, vlan int, noIPAM bool) *netdefv1.NetworkAttachmentDefinition { +func testNetworkExt(name, cniVersion, cniType string, networkType networkType, resourceName, ipamType string, mtu, vlan int, noIPAM bool) *netdefv1.NetworkAttachmentDefinition { if cniVersion == "" { cniVersion = defaultCNIVersion } if cniType == "" { cniType = "antrea" } + if networkType == sriovNetworkType && resourceName == "" { + resourceName = sriovResourceName1 + } if ipamType == "" { ipamType = ipam.AntreaIPAMType } @@ -109,7 +116,7 @@ func testNetworkExt(name, cniVersion, cniType, networkType, ipamType string, mtu IPAMType string MTU int VLAN int - }{cniVersion, cniType, networkType, ipamType, mtu, vlan} + }{cniVersion, cniType, string(networkType), ipamType, mtu, vlan} var tmpl *template.Template if !noIPAM { @@ -119,9 +126,14 @@ func testNetworkExt(name, cniVersion, cniType, networkType, ipamType string, mtu } var b bytes.Buffer tmpl.Execute(&b, &data) + annotations := make(map[string]string) + if resourceName != "" { + annotations[resourceNameAnnotationKey] = resourceName + } return &netdefv1.NetworkAttachmentDefinition{ ObjectMeta: metav1.ObjectMeta{ - Name: name, + Name: name, + Annotations: annotations, }, Spec: netdefv1.NetworkAttachmentDefinitionSpec{ Config: b.String(), @@ -189,8 +201,11 @@ func testIPAMResult(cidr string, vlan int) *ipam.IPAMResult { } func init() { - getPodContainerDeviceIDsFn = func(name string, namespace string) ([]string, error) { - return []string{sriovDeviceID}, nil + getPodContainerDeviceIDsFn = func(name string, namespace string) (map[string][]string, error) { + return map[string][]string{ + sriovResourceName1: {sriovDeviceID11, sriovDeviceID12}, + sriovResourceName2: {sriovDeviceID21}, + }, nil } } @@ -258,7 +273,7 @@ func TestPodControllerRun(t *testing.T) { containerNetNs(containerID), interfaceName, defaultMTU, - sriovDeviceID, + sriovDeviceID11, &ipamResult.Result, ).Do(func(string, string, string, string, string, int, string, *current.Result) { atomic.AddInt32(&interfaceConfigured, 1) @@ -277,7 +292,7 @@ func TestPodControllerRun(t *testing.T) { _, err = client.CoreV1().Pods(testNamespace).Create(context.Background(), pod, metav1.CreateOptions{}) require.NoError(t, err, "error when creating test Pod") - // Wait for ConfigureSriovSecondaryInterface is called. + // Wait for ConfigureSriovSecondaryInterface to be called. assert.Eventually(t, func() bool { return atomic.LoadInt32(&interfaceConfigured) == 1 }, 1*time.Second, 10*time.Millisecond) @@ -292,7 +307,8 @@ func TestPodControllerRun(t *testing.T) { containerNetNs(containerID), interfaceName, defaultMTU, - "", + // We haven't updated the vfDeviceIDUsageMap, so a different device will be allocated. + sriovDeviceID12, &ipamResult.Result, ).Do(func(string, string, string, string, string, int, string, *current.Result) { atomic.AddInt32(&interfaceConfigured, 1) @@ -329,7 +345,7 @@ func TestPodControllerRun(t *testing.T) { mockIPAM.EXPECT().SecondaryNetworkRelease(podOwner) // CNI Del event. event.IsAdd = false - // Interfac is not deleted from the interface store, so CNI Del should trigger interface + // Interface is not deleted from the interface store, so CNI Del should trigger interface // deletion again. podController.processCNIUpdate(event) _, exists = cniCache.Load(podKey) @@ -455,7 +471,7 @@ func TestConfigurePodSecondaryNetwork(t *testing.T) { containerNetNs(containerID), interfaceName, 1500, - sriovDeviceID, + sriovDeviceID11, &testIPAMResult("148.14.24.100/24", 0).Result, ) }, @@ -561,7 +577,7 @@ func TestConfigurePodSecondaryNetwork(t *testing.T) { if !tc.doNotCreateNetwork { network1 := testNetworkExt(networkName, tc.cniVersion, tc.cniType, - string(tc.networkType), tc.ipamType, tc.mtu, tc.vlan, tc.noIPAM) + tc.networkType, "", tc.ipamType, tc.mtu, tc.vlan, tc.noIPAM) pc.netAttachDefClient.NetworkAttachmentDefinitions(testNamespace).Create(context.Background(), network1, metav1.CreateOptions{}) } @@ -579,6 +595,74 @@ func TestConfigurePodSecondaryNetwork(t *testing.T) { } +func TestConfigurePodSecondaryNetworkMultipleSriovDevices(t *testing.T) { + ctx := context.Background() + element1 := netdefv1.NetworkSelectionElement{ + Namespace: testNamespace, + Name: "net1", + InterfaceRequest: "eth10", + } + element2 := netdefv1.NetworkSelectionElement{ + Namespace: testNamespace, + Name: "net2", + InterfaceRequest: "eth11", + } + pod, cniInfo := testPod(podName, containerID, podIP, element1, element2) + ctrl := gomock.NewController(t) + pc, _, interfaceConfigurator := testPodControllerStart(ctrl) + + network1 := testNetworkExt("net1", "", "", sriovNetworkType, sriovResourceName1, "", 1500, 0, true /* noIPAM */) + pc.netAttachDefClient.NetworkAttachmentDefinitions(testNamespace).Create(ctx, network1, metav1.CreateOptions{}) + network2 := testNetworkExt("net2", "", "", sriovNetworkType, sriovResourceName2, "", 1500, 0, true /* noIPAM */) + pc.netAttachDefClient.NetworkAttachmentDefinitions(testNamespace).Create(ctx, network2, metav1.CreateOptions{}) + + gomock.InOrder( + interfaceConfigurator.EXPECT().ConfigureSriovSecondaryInterface( + podName, + testNamespace, + containerID, + containerNetNs(containerID), + element2.InterfaceRequest, + 1500, + sriovDeviceID21, + gomock.Any(), + ), + interfaceConfigurator.EXPECT().ConfigureSriovSecondaryInterface( + podName, + testNamespace, + containerID, + containerNetNs(containerID), + element1.InterfaceRequest, + 1500, + sriovDeviceID11, + gomock.Any(), + ), + ) + assert.NoError(t, pc.configurePodSecondaryNetwork(pod, []*netdefv1.NetworkSelectionElement{&element2, &element1}, cniInfo)) + + podKey := podKeyGet(pod.Name, pod.Namespace) + deviceCache, ok := pc.vfDeviceIDUsageMap.Load(podKey) + require.True(t, ok) + expectedDeviceCache := []podSriovVFDeviceIDInfo{ + { + resourceName: sriovResourceName1, + vfDeviceID: sriovDeviceID11, + ifName: element1.InterfaceRequest, + }, + { + resourceName: sriovResourceName1, + vfDeviceID: sriovDeviceID12, + ifName: "", + }, + { + resourceName: sriovResourceName2, + vfDeviceID: sriovDeviceID21, + ifName: element2.InterfaceRequest, + }, + } + assert.ElementsMatch(t, expectedDeviceCache, deviceCache.([]podSriovVFDeviceIDInfo)) +} + func TestPodControllerAddPod(t *testing.T) { pod, _ := testPod(podName, containerID, podIP, netdefv1.NetworkSelectionElement{ Name: networkName, @@ -623,7 +707,7 @@ func TestPodControllerAddPod(t *testing.T) { ) network1 := testNetwork("net1", sriovNetworkType) testVLAN := 100 - network2 := testNetworkExt("net2", "", "", string(vlanNetworkType), "", defaultMTU, testVLAN, false) + network2 := testNetworkExt("net2", "", "", vlanNetworkType, "", "", defaultMTU, testVLAN, false) podOwner1 := &crdv1beta1.PodOwner{Name: podName, Namespace: testNamespace, ContainerID: containerID, IFName: "eth10"} @@ -771,7 +855,7 @@ func TestPodControllerAddPod(t *testing.T) { containerNetNs(containerID), interfaceName, defaultMTU, - sriovDeviceID, + sriovDeviceID11, gomock.Any(), ) mockIPAM.EXPECT().SecondaryNetworkAllocate(gomock.Any(), gomock.Any()).Return(testIPAMResult("148.14.24.100/24", 0), nil) @@ -885,7 +969,7 @@ func TestPodControllerAddPod(t *testing.T) { t.Run("updating deviceID cache per Pod", func(t *testing.T) { ctrl := gomock.NewController(t) podController, _, _ := testPodController(ctrl) - _, err := podController.assignUnusedSriovVFDeviceID(podName, testNamespace, interfaceName) + _, err := podController.assignUnusedSriovVFDeviceID(podName, testNamespace, sriovResourceName1, interfaceName) _, exists := podController.vfDeviceIDUsageMap.Load(podKey) assert.True(t, exists) require.NoError(t, err, "error while assigning unused VfDevice ID") diff --git a/pkg/agent/secondarynetwork/podwatch/sriov.go b/pkg/agent/secondarynetwork/podwatch/sriov.go index bdde76d6592..0e2833bc4a3 100644 --- a/pkg/agent/secondarynetwork/podwatch/sriov.go +++ b/pkg/agent/secondarynetwork/podwatch/sriov.go @@ -46,30 +46,27 @@ var ( getPodContainerDeviceIDsFn = getPodContainerDeviceIDs ) -type kubeletPodResources struct { - resources []*podresourcesv1alpha1.PodResources -} - // Structure to associate a unique VF's PCI Address to the Linux ethernet interface. type podSriovVFDeviceIDInfo struct { - vfDeviceID string - ifName string + resourceName string + vfDeviceID string + ifName string } // getPodContainerDeviceIDs returns the device IDs assigned to a Pod's containers. -func getPodContainerDeviceIDs(podName string, podNamespace string) ([]string, error) { +func getPodContainerDeviceIDs(podName string, podNamespace string) (map[string][]string, error) { conn, err := grpc.NewClient( "unix:///"+path.Join(kubeletPodResourcesPath, kubeletSocket), grpc.WithTransportCredentials(grpcinsecure.NewCredentials()), ) if err != nil { - return []string{}, fmt.Errorf("error getting the gRPC client for Pod resources: %v", err) + return nil, fmt.Errorf("error getting the gRPC client for Pod resources: %v", err) } defer conn.Close() client := podresourcesv1alpha1.NewPodResourcesListerClient(conn) if client == nil { - return []string{}, fmt.Errorf("error getting the lister client for Pod resources") + return nil, fmt.Errorf("error getting the lister client for Pod resources") } ctx, cancel := context.WithTimeout(context.Background(), listTimeout) @@ -77,22 +74,21 @@ func getPodContainerDeviceIDs(podName string, podNamespace string) ([]string, er podResources, err := client.List(ctx, &podresourcesv1alpha1.ListPodResourcesRequest{}) if err != nil { - return []string{}, fmt.Errorf("error getting the Pod resources: %v %v", podResources, err) + return nil, fmt.Errorf("error getting the Pod resources: %v %v", podResources, err) } - var podDeviceIDs []string - var kpr kubeletPodResources - kpr.resources = podResources.GetPodResources() - for _, pr := range kpr.resources { + podDeviceIDs := make(map[string][]string) + resources := podResources.GetPodResources() + for _, pr := range resources { if pr.Name == podName && pr.Namespace == podNamespace { for _, ctr := range pr.Containers { for _, dev := range ctr.Devices { - podDeviceIDs = append(podDeviceIDs, dev.DeviceIds...) + podDeviceIDs[dev.ResourceName] = append(podDeviceIDs[dev.ResourceName], dev.DeviceIds...) } } } } - klog.V(2).Infof("Pod container device IDs of %s/%s are: %v", podNamespace, podName, podDeviceIDs) + klog.V(2).InfoS("Retrieved Pod container device IDs", "pod", klog.KRef(podNamespace, podName), "deviceIDs", podDeviceIDs) return podDeviceIDs, nil } @@ -109,14 +105,19 @@ func (pc *PodController) buildVFDeviceIDListPerPod(podName, podNamespace string) if cacheFound { return deviceCache.([]podSriovVFDeviceIDInfo), nil } - podSriovVFDeviceIDs, err := getPodContainerDeviceIDsFn(podName, podNamespace) + deviceIDsByResourceName, err := getPodContainerDeviceIDsFn(podName, podNamespace) if err != nil { - return nil, fmt.Errorf("getPodContainerDeviceIDs failed: %v", err) + return nil, fmt.Errorf("getPodContainerDeviceIDs failed: %w", err) } var vfDeviceIDInfoCache []podSriovVFDeviceIDInfo - for _, pciAddress := range podSriovVFDeviceIDs { - initSriovVfDeviceID := podSriovVFDeviceIDInfo{vfDeviceID: pciAddress, ifName: ""} - vfDeviceIDInfoCache = append(vfDeviceIDInfoCache, initSriovVfDeviceID) + for resourceName, deviceIDs := range deviceIDsByResourceName { + for _, deviceID := range deviceIDs { + vfDeviceIDInfoCache = append(vfDeviceIDInfoCache, podSriovVFDeviceIDInfo{ + resourceName: resourceName, + vfDeviceID: deviceID, + ifName: "", // we will set this field when allocating the device + }) + } } pc.vfDeviceIDUsageMap.Store(podKey, vfDeviceIDInfoCache) klog.V(2).InfoS("Pod specific SRIOV VF cache created", "Key", podKey) @@ -140,32 +141,39 @@ func (pc *PodController) releaseSriovVFDeviceID(podName, podNamespace, interface return } cache := obj.([]podSriovVFDeviceIDInfo) - for idx := 0; idx < len(cache); idx++ { + for idx := range cache { if cache[idx].ifName == interfaceName { cache[idx].ifName = "" } } } -func (pc *PodController) assignUnusedSriovVFDeviceID(podName, podNamespace, interfaceName string) (string, error) { +func (pc *PodController) assignUnusedSriovVFDeviceID(podName, podNamespace, resourceName, interfaceName string) (string, error) { var cache []podSriovVFDeviceIDInfo cache, err := pc.buildVFDeviceIDListPerPod(podName, podNamespace) if err != nil { return "", err } - for idx := 0; idx < len(cache); idx++ { - if cache[idx].ifName == "" { + for idx := range cache { + if cache[idx].resourceName == resourceName && cache[idx].ifName == "" { // Unused PCI address found. Associate PCI address to the interface. cache[idx].ifName = interfaceName return cache[idx].vfDeviceID, nil } } - return "", err + return "", fmt.Errorf("no available device") } // Configure SRIOV VF as a Secondary Network Interface. -func (pc *PodController) configureSriovAsSecondaryInterface(pod *corev1.Pod, network *netdefv1.NetworkSelectionElement, podCNIInfo *podCNIInfo, mtu int, result *current.Result) error { - podSriovVFDeviceID, err := pc.assignUnusedSriovVFDeviceID(pod.Name, pod.Namespace, network.InterfaceRequest) +func (pc *PodController) configureSriovAsSecondaryInterface( + pod *corev1.Pod, + network *netdefv1.NetworkSelectionElement, + resourceName string, + podCNIInfo *podCNIInfo, + mtu int, + result *current.Result, +) error { + podSriovVFDeviceID, err := pc.assignUnusedSriovVFDeviceID(pod.Name, pod.Namespace, resourceName, network.InterfaceRequest) if err != nil { return err }