From eb2787abc8e18505c5cd4869c9d59d39336760ee Mon Sep 17 00:00:00 2001 From: Kenji Kaneda Date: Fri, 3 Jan 2025 13:09:24 -0800 Subject: [PATCH] feat(engine): ignore cordoned GPU nodes from cluster status --- dispatcher/internal/clusterstatus/manager.go | 5 +++++ .../internal/clusterstatus/manager_test.go | 21 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/dispatcher/internal/clusterstatus/manager.go b/dispatcher/internal/clusterstatus/manager.go index 616ac52..cc1a6c7 100644 --- a/dispatcher/internal/clusterstatus/manager.go +++ b/dispatcher/internal/clusterstatus/manager.go @@ -152,6 +152,11 @@ func toProvisionableResource(np krpv1.NodePool) *v1.ProvisionableResource { } func toGPUNode(node corev1.Node, logger logr.Logger) (*v1.GpuNode, bool) { + // Ignore cordoned nodes. + if node.Spec.Unschedulable { + return nil, false + } + // TODO(kenji): Support other accelerator types. rs := map[corev1.ResourceName]bool{ nvidiaGPU: true, diff --git a/dispatcher/internal/clusterstatus/manager_test.go b/dispatcher/internal/clusterstatus/manager_test.go index ad843bc..c15bb2b 100644 --- a/dispatcher/internal/clusterstatus/manager_test.go +++ b/dispatcher/internal/clusterstatus/manager_test.go @@ -79,6 +79,27 @@ func TestManager(t *testing.T) { GpuNodes: []*v1.GpuNode{}, }, }, + { + name: "cordoned gpu node", + objs: []runtime.Object{ + &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + }, + Spec: corev1.NodeSpec{ + Unschedulable: true, + }, + Status: corev1.NodeStatus{ + Allocatable: corev1.ResourceList{ + nvidiaGPU: resource.MustParse("1"), + }, + }, + }, + }, + want: &v1.ClusterStatus{ + GpuNodes: []*v1.GpuNode{}, + }, + }, { name: "provisionable resources of instance type", objs: []runtime.Object{