From f08aeb2134c7090d50cab9dd0ca301500d303fb9 Mon Sep 17 00:00:00 2001 From: wawa0210 Date: Wed, 24 Jan 2024 23:50:46 +0800 Subject: [PATCH] skip old architecture version GPU settings time slice Signed-off-by: wawa0210 --- cmd/nvidia-dra-plugin/sharing.go | 52 ++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/cmd/nvidia-dra-plugin/sharing.go b/cmd/nvidia-dra-plugin/sharing.go index 9f177373..5d25c031 100644 --- a/cmd/nvidia-dra-plugin/sharing.go +++ b/cmd/nvidia-dra-plugin/sharing.go @@ -101,17 +101,32 @@ func (t *TimeSlicingManager) SetTimeSlice(devices *PreparedDevices, config *nasc return fmt.Errorf("setting a TimeSlice duration on MIG devices is unsupported") } + noSupportTimeSliceIDs := []string{} + for _, gpu := range devices.Gpu.Devices { + if !detectSupportTimeSliceByArch(gpu.architecture) { + klog.InfoS("the current card does not support setting time slices and will be ignored.", "arch", gpu.architecture, "uuid", gpu) + noSupportTimeSliceIDs = append(noSupportTimeSliceIDs, gpu.uuid) + continue + } + } + + supportTimeSliceIDs := difference(devices.UUIDs(), noSupportTimeSliceIDs) + + if len(supportTimeSliceIDs) == 0 { + klog.InfoS("all card does not support setting time slices and will be ignored.", "uuids", noSupportTimeSliceIDs) + } + timeSlice := nascrd.DefaultTimeSlice if config != nil && config.TimeSlice != nil { timeSlice = *config.TimeSlice } - err := t.nvdevlib.setComputeMode(devices.UUIDs(), "DEFAULT") + err := t.nvdevlib.setComputeMode(supportTimeSliceIDs, "DEFAULT") if err != nil { return fmt.Errorf("error setting compute mode: %w", err) } - err = t.nvdevlib.setTimeSlice(devices.UUIDs(), timeSlice.Int()) + err = t.nvdevlib.setTimeSlice(supportTimeSliceIDs, timeSlice.Int()) if err != nil { return fmt.Errorf("error setting time slice: %w", err) } @@ -389,3 +404,36 @@ func (m *MpsControlDaemon) Stop(ctx context.Context) error { return nil } + +// detactSupportTimeSliceByArch Determine whether the 
// detectSupportTimeSliceByArch reports whether GPUs of the given architecture
// series accept a time-slice setting.
//
// Only "Pascal" is currently treated as unsupported. The architectures listed
// explicitly below, and every other value (including the empty string), are
// treated as supported.
func detectSupportTimeSliceByArch(arch string) bool {
	// todo: More information is needed to determine the support of various architectures
	switch arch {
	case "Pascal":
		return false
	case "Ada", "Ampere", "Turing", "Hopper":
		// Go cases never fall through implicitly, so the known-good
		// architectures share one clause instead of a run of empty cases.
		return true
	default:
		// Architectures not named above are assumed to support time slicing.
		return true
	}
}

// difference returns the elements in `a` that aren't in `b`.
//
// The relative order of `a` is kept and duplicates in `a` are not collapsed.
// The result is nil when no element of `a` survives.
func difference(a, b []string) []string {
	// Set of values to exclude, sized up front since len(b) is known.
	excluded := make(map[string]struct{}, len(b))
	for _, x := range b {
		excluded[x] = struct{}{}
	}

	var diff []string
	for _, x := range a {
		if _, found := excluded[x]; !found {
			diff = append(diff, x)
		}
	}
	return diff
}