diff --git a/cmd/gpud/command/command.go b/cmd/gpud/command/command.go index 59aa6988..1fc1d834 100644 --- a/cmd/gpud/command/command.go +++ b/cmd/gpud/command/command.go @@ -374,6 +374,12 @@ sudo rm /etc/systemd/system/gpud.service }, }, + { + Name: "is-nvidia", + + Usage: "quick check if the host has NVIDIA GPUs installed", + Action: cmdIsNvidia, + }, { Name: "accelerator", Aliases: []string{"a"},
diff --git a/cmd/gpud/command/is-nvidia.go b/cmd/gpud/command/is-nvidia.go
new file mode 100644
index 00000000..cb6d16de
--- /dev/null
+++ b/cmd/gpud/command/is-nvidia.go
@@ -0,0 +1,27 @@
+package command
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
+
+	"github.com/urfave/cli"
+)
+
+// cmdIsNvidia implements the "is-nvidia" subcommand: it prints whether
+// nvidia_query.GPUsInstalled detects NVIDIA GPUs, giving up after one minute.
+func cmdIsNvidia(cliContext *cli.Context) error {
+	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
+	defer cancel()
+
+	nvidiaInstalled, err := nvidia_query.GPUsInstalled(ctx)
+	if err != nil {
+		return err
+	}
+
+	// End with a newline so the shell prompt does not land on the same line.
+	fmt.Printf("NVIDIA installed: %v\n", nvidiaInstalled)
+	return nil
+}
diff --git a/components/accelerator/detect.go b/components/accelerator/detect.go index dd85390e..34f5b449 100644 --- a/components/accelerator/detect.go +++ b/components/accelerator/detect.go @@ -2,13 +2,9 @@ package accelerator import ( "context" - "fmt" + nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" "github.com/leptonai/gpud/pkg/file" - - "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" - nvinfo "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" - "github.com/NVIDIA/go-nvml/pkg/nvml" ) type Type string @@ -21,7 +17,7 @@ const ( // Returns the GPU type (e.g., "NVIDIA") and product name (e.g., "A100") func DetectTypeAndProductName(ctx context.Context) (Type, string, error) { if p, err := file.LocateExecutable("nvidia-smi"); p != "" && err == nil { - productName, err := LoadNVIDIAProductName(ctx) + productName, err := nvidia_query.LoadGPUDeviceName(ctx) if err != nil { return TypeNVIDIA, "unknown", 
err } @@ -30,38 +26,3 @@ func DetectTypeAndProductName(ctx context.Context) (Type, string, error) { return TypeUnknown, "unknown", nil } - -func LoadNVIDIAProductName(ctx context.Context) (string, error) { - nvmlLib := nvml.New() - if ret := nvmlLib.Init(); ret != nvml.SUCCESS { - return "", fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret)) - } - - deviceLib := device.New(nvmlLib) - infoLib := nvinfo.New( - nvinfo.WithNvmlLib(nvmlLib), - nvinfo.WithDeviceLib(deviceLib), - ) - - nvmlExists, nvmlExistsMsg := infoLib.HasNvml() - if !nvmlExists { - return "", fmt.Errorf("NVML not found: %s", nvmlExistsMsg) - } - - devices, err := deviceLib.GetDevices() - if err != nil { - return "", err - } - - for _, d := range devices { - name, ret := d.GetName() - if ret != nvml.SUCCESS { - return "", fmt.Errorf("failed to get device name: %v", nvml.ErrorString(ret)) - } - if name != "" { - return name, nil - } - } - - return "", nil -} diff --git a/components/accelerator/nvidia/query/detect.go b/components/accelerator/nvidia/query/detect.go new file mode 100644 index 00000000..2f400746 --- /dev/null +++ b/components/accelerator/nvidia/query/detect.go @@ -0,0 +1,145 @@ +package query + +import ( + "bufio" + "context" + "fmt" + "strings" + + "github.com/leptonai/gpud/log" + "github.com/leptonai/gpud/pkg/file" + "github.com/leptonai/gpud/pkg/process" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + nvinfo "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +// Returns true if the local machine has NVIDIA GPUs installed. 
+func GPUsInstalled(ctx context.Context) (bool, error) {
+	if !SMIExists() {
+		return false, nil
+	}
+	log.Logger.Debugw("nvidia-smi installed")
+
+	// nvidia-smi alone does not prove that a GPU is attached,
+	// so look for NVIDIA devices on the PCI bus next.
+	pciDevices, err := ListNVIDIAPCIs(ctx)
+	if err != nil {
+		return false, err
+	}
+	if len(pciDevices) == 0 {
+		return false, nil
+	}
+	log.Logger.Infow("nvidia PCI devices found", "devices", len(pciDevices))
+
+	// PCI devices exist; confirm through NVML that a GPU can actually be queried.
+	gpuDeviceName, err := LoadGPUDeviceName(ctx)
+	if err != nil {
+		return false, err
+	}
+	log.Logger.Infow("detected nvidia gpu", "gpuDeviceName", gpuDeviceName)
+
+	return true, nil
+}
+
+// LoadGPUDeviceName returns the product name of the first NVIDIA GPU that
+// reports a non-empty name via NVML, or "" when no device does. ctx is unused.
+func LoadGPUDeviceName(ctx context.Context) (string, error) {
+	nvmlLib := nvml.New()
+	if ret := nvmlLib.Init(); ret != nvml.SUCCESS {
+		return "", fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret))
+	}
+	// Balance the successful Init with a Shutdown; without it every call leaks an NVML reference.
+	defer func() { _ = nvmlLib.Shutdown() }()
+
+	deviceLib := device.New(nvmlLib)
+	infoLib := nvinfo.New(
+		nvinfo.WithNvmlLib(nvmlLib),
+		nvinfo.WithDeviceLib(deviceLib),
+	)
+
+	nvmlExists, nvmlExistsMsg := infoLib.HasNvml()
+	if !nvmlExists {
+		return "", fmt.Errorf("NVML not found: %s", nvmlExistsMsg)
+	}
+
+	devices, err := deviceLib.GetDevices()
+	if err != nil {
+		return "", err
+	}
+
+	for _, d := range devices {
+		name, ret := d.GetName()
+		if ret != nvml.SUCCESS {
+			return "", fmt.Errorf("failed to get device name: %v", nvml.ErrorString(ret))
+		}
+		if name != "" {
+			return name, nil
+		}
+	}
+	return "", nil
+}
+
+// ListNVIDIAPCIs lists the lspci output lines that mention NVIDIA.
+func ListNVIDIAPCIs(ctx context.Context) ([]string, error) {
+	// Runs lspci and returns every output line that contains "NVIDIA" once the
+	// process has exited. When lspci cannot be located the result is (nil, nil),
+	// so a missing tool looks the same to callers as a host without devices.
+	lspciPath, err := file.LocateExecutable("lspci")
+	if err != nil {
+		return nil, nil
+	}
+	if lspciPath == "" {
+		return nil, nil
+	}
+
+	p, err := process.New(
+		process.WithCommand(lspciPath),
+		process.WithRunAsBashScript(),
+	)
+	if err != nil {
+		return nil, err
+	}
+
+	if err := p.Start(ctx); err != nil {
+		return nil, err
+	}
+
+	lines := make([]string, 0)
+
+	// Drain stdout first and read the exit status only afterwards. If p.Wait()
+	// delivers that status just once, polling it inside this loop would consume
+	// it and leave the select below waiting on ctx.Done() alone.
+	scanner := bufio.NewScanner(p.StdoutReader())
+	for scanner.Scan() { // returns false at the end of the output
+		line := scanner.Text()
+
+		// e.g.,
+		// 01:00.0 VGA compatible controller: NVIDIA Corporation Device 2684 (rev a1)
+		// 01:00.1 Audio device: NVIDIA Corporation Device 22ba (rev a1)
+		if strings.Contains(line, "NVIDIA") {
+			lines = append(lines, line)
+		}
+	}
+	if serr := scanner.Err(); serr != nil {
+		// process already dead, thus ignore
+		// e.g., "read |0: file already closed"
+		if !strings.Contains(serr.Error(), "file already closed") {
+			return nil, serr
+		}
+	}
+
+	// Wait for lspci to exit so that a failed run is reported as an error,
+	// but give up as soon as ctx is done.
+	select {
+	case err := <-p.Wait():
+		if err != nil {
+			return nil, err
+		}
+	case <-ctx.Done():
+		return nil, ctx.Err()
+	}
+
+	return lines, nil
+}
diff --git a/components/diagnose/scan.go b/components/diagnose/scan.go index 428adfbc..5abce2ad 100644 --- a/components/diagnose/scan.go +++ b/components/diagnose/scan.go @@ -43,7 +43,13 @@ func Scan(ctx context.Context, opts ...OpOption) error { fmt.Printf("\n\n%s scanning the host\n\n", inProgress) - if nvidia_query.SMIExists() { + nvidiaInstalled, err := nvidia_query.GPUsInstalled(ctx) + if err != nil { + log.Logger.Warnw("error checking nvidia gpu installation", "error", err) + return err + } + + if nvidiaInstalled { fmt.Printf("%s scanning nvidia accelerators\n", inProgress) for _, lib := range defaultNVIDIALibraries { diff --git a/components/dmesg/filters_nvidia.go b/components/dmesg/filters_nvidia.go index 
30def526..9bfac16a 100644 --- a/components/dmesg/filters_nvidia.go +++ b/components/dmesg/filters_nvidia.go @@ -1,6 +1,9 @@ package dmesg import ( + "context" + "time" + nvidia_error "github.com/leptonai/gpud/components/accelerator/nvidia/error" nvidia_nccl_id "github.com/leptonai/gpud/components/accelerator/nvidia/nccl/id" nvidia_peermem_id "github.com/leptonai/gpud/components/accelerator/nvidia/peermem/id" @@ -15,7 +18,15 @@ import ( ) func init() { - if nvidia_query.SMIExists() { + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + + nvidiaInstalled, err := nvidia_query.GPUsInstalled(ctx) + if err != nil { + return + } + + if nvidiaInstalled { defaultFilters = append(defaultFilters, DefaultDmesgFiltersForNvidia()...) } for i := range defaultFilters { diff --git a/config/default.go b/config/default.go index 6da5d6e8..93c2dfaf 100644 --- a/config/default.go +++ b/config/default.go @@ -195,7 +195,12 @@ func DefaultConfig(ctx context.Context, opts ...OpOption) (*Config, error) { log.Logger.Debugw("auto-detect tailscale not supported -- skipping", "os", runtime.GOOS) } - if runtime.GOOS == "linux" && nvidia_query.SMIExists() { + nvidiaInstalled, err := nvidia_query.GPUsInstalled(ctx) + if err != nil { + return nil, err + } + + if runtime.GOOS == "linux" && nvidiaInstalled { driverVersion, err := nvidia_query_nvml.GetDriverVersion() if err != nil { return nil, err diff --git a/internal/server/handlers_root.go b/internal/server/handlers_root.go index 4af7ea42..f49e96f7 100644 --- a/internal/server/handlers_root.go +++ b/internal/server/handlers_root.go @@ -1,14 +1,15 @@ package server import ( + "context" "embed" "fmt" "html/template" stdos "os" "runtime" "strings" + "time" - "github.com/dustin/go-humanize" "github.com/leptonai/gpud/components" nvidia_clock "github.com/leptonai/gpud/components/accelerator/nvidia/clock" nvidia_clockspeed "github.com/leptonai/gpud/components/accelerator/nvidia/clock-speed" @@ -28,6 +29,7 @@ import 
( "github.com/leptonai/gpud/log" "github.com/leptonai/gpud/version" + "github.com/dustin/go-humanize" "github.com/gin-gonic/gin" "github.com/shirou/gopsutil/v4/process" ) @@ -80,8 +82,15 @@ func createRootHandler(handlerDescs []componentHandlerDescription, webConfig con nvidiaClockSpeedChart := false nvidiaErrsChart := false + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + nvidiaInstalled, err := nvidia_query.GPUsInstalled(ctx) + cancel() + if err != nil { + log.Logger.Fatalw("failed to check if nvidia is installed", "error", err) + } + var nvidiaInfoOutputProvider components.OutputProvider - if nvidia_query.SMIExists() { + if nvidiaInstalled { nvidiaInfoComponent, err := components.GetComponent(nvidia_info.Name) if err != nil { panic(fmt.Sprintf("component %q required but not set", nvidia_info.Name))