Skip to content

Commit

Permalink
skip old architecture version GPU settings time slice
Browse files Browse the repository at this point in the history
Signed-off-by: wawa0210 <[email protected]>
  • Loading branch information
wawa0210 committed Apr 3, 2024
1 parent 0e01612 commit 40dae3c
Showing 1 changed file with 30 additions and 0 deletions.
30 changes: 30 additions & 0 deletions cmd/nvidia-dra-plugin/sharing.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"fmt"
"os"
"os/exec"
"strconv"
"strings"
"text/template"
"time"
Expand Down Expand Up @@ -102,6 +103,17 @@ func (t *TimeSlicingManager) SetTimeSlice(devices *PreparedDevices, config *shar
return fmt.Errorf("setting a TimeSlice duration on MIG devices is unsupported")
}

for _, gpu := range devices.Gpu.Devices {
err, isSupportTimeSlice := detectSupportTimeSliceByCudaComputeCapability(gpu.cudaComputeCapability)
if err != nil {
return fmt.Errorf("failed to detectSupportTimeSliceByCudaComputeCapability : %w", err)
}
if !isSupportTimeSlice {
klog.InfoS("the current card does not support setting time slices and will be ignored.", "arch", gpu.architecture, "uuid", gpu.uuid, "cudaComputeCapability", gpu.cudaComputeCapability)
return fmt.Errorf("setting a TimeSlice duration on devices uuid=%v is unsupported", gpu.uuid)
}
}

timeSlice := sharing.DefaultTimeSlice
if config != nil && config.TimeSlice != nil {
timeSlice = *config.TimeSlice
Expand Down Expand Up @@ -390,3 +402,21 @@ func (m *MpsControlDaemon) Stop(ctx context.Context) error {

return nil
}

// detectSupportTimeSliceByCudaComputeCapability reports whether a GPU supports
// setting a time slice, judged solely by its CUDA compute capability.
//
// cudaComputeCapability is expected in "<major>.<minor>" form (for example
// "7.5"). Only the text before the first "." is examined; a value without a
// "." is treated as the major version on its own.
//
// Note the result order: the error is returned first and the bool second.
// The bool is true when the major version is 7 or higher. If the major
// version cannot be parsed as an integer, a non-nil error wrapping the
// strconv failure is returned together with false.
func detectSupportTimeSliceByCudaComputeCapability(cudaComputeCapability string) (error, bool) {
	// ref https://github.com/NVIDIA/k8s-dra-driver/pull/58#discussion_r1469338562
	// Time-slicing is believed to be available on Volta+ architectures, so the
	// check is simply cudaComputeCapability >= 7.0, which only needs the major
	// version.
	const minTimeSliceMajor = 7

	// Per https://github.com/NVIDIA/go-nvlib/blob/main/pkg/nvlib/device/device.go#L149
	// the CUDA major and minor versions are concatenated with ".".
	majorPart, _, _ := strings.Cut(cudaComputeCapability, ".")

	major, err := strconv.Atoi(majorPart)
	if err != nil {
		// Wrap the parse failure so callers keep the root cause.
		return fmt.Errorf("error to get cudaComputeCapability major version %v: %w", cudaComputeCapability, err), false
	}
	return nil, major >= minTimeSliceMajor
}

0 comments on commit 40dae3c

Please sign in to comment.