Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(nvml): handle "not supported" error to not fail-fast for NVML get calls #291

Merged
merged 13 commits into from
Jan 10, 2025
Merged
18 changes: 15 additions & 3 deletions components/accelerator/nvidia/query/nvml/clock_events.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,11 @@ func ClockEventsSupportedByDevice(dev device.Device) (bool, error) {
// undefined symbol: nvmlDeviceGetCurrentClocksEventReasons
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7e505374454a0d4fc7339b6c885656d6
_, ret := dev.GetCurrentClocksEventReasons()
if ret == nvml.ERROR_NOT_SUPPORTED {
if IsNotSupportError(ret) {
return false, nil
}

// not a "not supported" error, not a success return, thus return an error here
if ret != nvml.SUCCESS {
return false, fmt.Errorf("could not get current clock events: %v", nvml.ErrorString(ret))
}
Expand Down Expand Up @@ -97,6 +99,9 @@ type ClockEvents struct {
HWSlowdownThermal bool `json:"hw_thermal_slowdown"`
// Set true if the HW Power Brake Slowdown reason due to the external power brake assertion is active.
HWSlowdownPowerBrake bool `json:"hw_slowdown_power_brake"`

// Supported is true if the clock events are supported by the device.
Supported bool `json:"supported"`
}

func (evs *ClockEvents) JSON() ([]byte, error) {
Expand All @@ -115,15 +120,22 @@ func (evs *ClockEvents) YAML() ([]byte, error) {

func GetClockEvents(uuid string, dev device.Device) (ClockEvents, error) {
clockEvents := ClockEvents{
Time: metav1.Time{Time: time.Now().UTC()},
UUID: uuid,
Time: metav1.Time{Time: time.Now().UTC()},
UUID: uuid,
Supported: true,
}

// clock events are supported in versions 535 and above
// otherwise, CGO call just exits with
// undefined symbol: nvmlDeviceGetCurrentClocksEventReasons
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7e505374454a0d4fc7339b6c885656d6
reasons, ret := dev.GetCurrentClocksEventReasons()
if IsNotSupportError(ret) {
clockEvents.Supported = false
return clockEvents, nil
}

// not a "not supported" error, not a success return, thus return an error here
if ret != nvml.SUCCESS {
return clockEvents, fmt.Errorf("failed to get device clock event reasons: %v", nvml.ErrorString(ret))
}
Expand Down
20 changes: 16 additions & 4 deletions components/accelerator/nvidia/query/nvml/clock_speed.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ type ClockSpeed struct {

GraphicsMHz uint32 `json:"graphics_mhz"`
MemoryMHz uint32 `json:"memory_mhz"`

// ClockGraphicsSupported is true if querying the graphics clock speed is supported by the device.
ClockGraphicsSupported bool `json:"clock_graphics_supported"`

// ClockMemorySupported is true if querying the memory clock speed is supported by the device.
ClockMemorySupported bool `json:"clock_memory_supported"`
}

func GetClockSpeed(uuid string, dev device.Device) (ClockSpeed, error) {
Expand All @@ -25,16 +31,22 @@ func GetClockSpeed(uuid string, dev device.Device) (ClockSpeed, error) {

// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g2efc4dd4096173f01d80b2a8bbfd97ad
graphicsClock, ret := dev.GetClockInfo(nvml.CLOCK_GRAPHICS)
if ret != nvml.SUCCESS {
return ClockSpeed{}, fmt.Errorf("failed to get device clock info for nvml.CLOCK_GRAPHICS: %v", nvml.ErrorString(ret))
if IsNotSupportError(ret) {
clockSpeed.ClockGraphicsSupported = false
} else if ret != nvml.SUCCESS { // not a "not supported" error, not a success return, thus return an error here
return clockSpeed, fmt.Errorf("failed to get device clock info for nvml.CLOCK_GRAPHICS: %v", nvml.ErrorString(ret))
}
clockSpeed.ClockGraphicsSupported = true
clockSpeed.GraphicsMHz = graphicsClock

// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g2efc4dd4096173f01d80b2a8bbfd97ad
memClock, ret := dev.GetClockInfo(nvml.CLOCK_MEM)
if ret != nvml.SUCCESS {
return ClockSpeed{}, fmt.Errorf("failed to get device clock info for nvml.CLOCK_MEM: %v", nvml.ErrorString(ret))
if IsNotSupportError(ret) {
clockSpeed.ClockMemorySupported = false
} else if ret != nvml.SUCCESS { // not a "not supported" error, not a success return, thus return an error here
return clockSpeed, fmt.Errorf("failed to get device clock info for nvml.CLOCK_MEM: %v", nvml.ErrorString(ret))
}
clockSpeed.ClockMemorySupported = true
clockSpeed.MemoryMHz = memClock

return clockSpeed, nil
Expand Down
Loading
Loading