Skip to content

Commit

Permalink
Update MIG-related logic and refine logs (#833)
Browse files Browse the repository at this point in the history
Fix an issue where migconfig.yaml for mig-parted may not be generated properly.

Signed-off-by: limengxuan <[email protected]>
  • Loading branch information
archlitchi authored Jan 24, 2025
1 parent 7fe6183 commit 77c724f
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 13 deletions.
16 changes: 13 additions & 3 deletions pkg/device-plugin/nvidiadevice/nvinternal/plugin/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ func (nv *NvidiaDevicePlugin) ApplyMigTemplate() {
if err != nil {
klog.Error("marshal failed", err.Error())
}
klog.Infoln("Applying data=", string(data))
os.WriteFile("/tmp/migconfig.yaml", data, os.ModePerm)
cmd := exec.Command("nvidia-mig-parted", "apply", "-f", "/tmp/migconfig.yaml")
var stdout, stderr bytes.Buffer
Expand Down Expand Up @@ -232,13 +233,22 @@ func (nv *NvidiaDevicePlugin) GenerateMigTemplate(devtype string, devindex int,

if !ok || currentCount != expectedCount {
needsreset = true
nv.migCurrent.MigConfigs["current"][migidx].MigDevices[migTemplateEntry.Name] = expectedCount
klog.InfoS("updated mig device count", "TemplateName", migTemplateEntry.Name, "Count", expectedCount)
klog.InfoS("updated mig device count", "Template", v)
} else {
nv.migCurrent.MigConfigs["current"][migidx].MigDevices[migTemplateEntry.Name]++
klog.InfoS("incremented mig device count", "TemplateName", migTemplateEntry.Name, "Count", currentCount+1)
}
}

if needsreset {
for k := range nv.migCurrent.MigConfigs["current"][migidx].MigDevices {
delete(nv.migCurrent.MigConfigs["current"][migidx].MigDevices, k)
}

for _, migTemplateEntry := range v {
nv.migCurrent.MigConfigs["current"][migidx].MigDevices[migTemplateEntry.Name] = migTemplateEntry.Count
nv.migCurrent.MigConfigs["current"][migidx].MigEnabled = true
}
}
break
}
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/scheduler/pods.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ func (m *podManager) ListPodsInfo() []*podInfo {
"devices", pi.Devices,
)
}
klog.InfoS("Listed pod infos",
klog.V(5).InfoS("Listed pod infos",
"podCount", len(pods),
)
return pods
Expand Down
18 changes: 9 additions & 9 deletions pkg/scheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
for {
select {
case <-s.nodeNotify:
klog.InfoS("Received node notification")
klog.V(5).InfoS("Received node notification")
case <-ticker.C:
klog.InfoS("Ticker triggered")
case <-s.stopCh:
Expand All @@ -181,17 +181,17 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
klog.ErrorS(err, "Failed to list nodes with selector", "selector", labelSelector.String())
continue
}
klog.InfoS("Listed nodes", "nodeCount", len(rawNodes))
klog.V(5).InfoS("Listed nodes", "nodeCount", len(rawNodes))
var nodeNames []string
for _, val := range rawNodes {
nodeNames = append(nodeNames, val.Name)
klog.InfoS("Processing node", "nodeName", val.Name)
klog.V(5).InfoS("Processing node", "nodeName", val.Name)

for devhandsk, devInstance := range device.GetDevices() {
klog.InfoS("Checking device health", "nodeName", val.Name, "deviceVendor", devhandsk)
klog.V(5).InfoS("Checking device health", "nodeName", val.Name, "deviceVendor", devhandsk)

health, needUpdate := devInstance.CheckHealth(devhandsk, val)
klog.InfoS("Device health check result", "nodeName", val.Name, "deviceVendor", devhandsk, "health", health, "needUpdate", needUpdate)
klog.V(5).InfoS("Device health check result", "nodeName", val.Name, "deviceVendor", devhandsk, "health", health, "needUpdate", needUpdate)

if !health {
klog.Warning("Device is unhealthy, cleaning up node", "nodeName", val.Name, "deviceVendor", devhandsk)
Expand All @@ -208,7 +208,7 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
continue
}
if !needUpdate {
klog.InfoS("No update needed for device", "nodeName", val.Name, "deviceVendor", devhandsk)
klog.V(5).InfoS("No update needed for device", "nodeName", val.Name, "deviceVendor", devhandsk)
continue
}
_, ok := util.HandshakeAnnos[devhandsk]
Expand All @@ -221,15 +221,15 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
klog.ErrorS(err, "Failed to get node", "nodeName", val.Name)
continue
}
klog.InfoS("Patching node annotations", "nodeName", val.Name, "annotations", tmppat)
klog.V(5).InfoS("Patching node annotations", "nodeName", val.Name, "annotations", tmppat)
if err := util.PatchNodeAnnotations(n, tmppat); err != nil {
klog.ErrorS(err, "Failed to patch node annotations", "nodeName", val.Name)
}
}
nodeInfo := &util.NodeInfo{}
nodeInfo.ID = val.Name
nodeInfo.Node = val
klog.InfoS("Fetching node devices", "nodeName", val.Name, "deviceVendor", devhandsk)
klog.V(5).InfoS("Fetching node devices", "nodeName", val.Name, "deviceVendor", devhandsk)
nodedevices, err := devInstance.GetNodeDevices(*val)
if err != nil {
klog.ErrorS(err, "Failed to get node devices", "nodeName", val.Name, "deviceVendor", devhandsk)
Expand All @@ -242,7 +242,7 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
s.addNode(val.Name, nodeInfo)
if s.nodes[val.Name] != nil && len(nodeInfo.Devices) > 0 {
if printedLog[val.Name] {
klog.InfoS("Node device updated", "nodeName", val.Name, "deviceVendor", devhandsk, "nodeInfo", nodeInfo, "totalDevices", s.nodes[val.Name].Devices)
klog.V(5).InfoS("Node device updated", "nodeName", val.Name, "deviceVendor", devhandsk, "nodeInfo", nodeInfo, "totalDevices", s.nodes[val.Name].Devices)
} else {
klog.InfoS("Node device added", "nodeName", val.Name, "deviceVendor", devhandsk, "nodeInfo", nodeInfo, "totalDevices", s.nodes[val.Name].Devices)
printedLog[val.Name] = true
Expand Down

0 comments on commit 77c724f

Please sign in to comment.