Skip to content

Commit

Permalink
Support NVIDIA CDI in Ubuntu Core environment (#14347)
Browse files Browse the repository at this point in the history
Tested using testflinger with `core24-latest` image deployed.
  • Loading branch information
tomponline authored Oct 25, 2024
2 parents 7d86766 + e322dff commit 41eac02
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 8 deletions.
5 changes: 3 additions & 2 deletions lxd/device/cdi/configure.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"tags.cncf.io/container-device-interface/specs-go"

"github.com/canonical/lxd/lxd/instance"
"github.com/canonical/lxd/lxd/state"
"github.com/canonical/lxd/shared"
"github.com/canonical/lxd/shared/logger"
)
Expand Down Expand Up @@ -277,9 +278,9 @@ func applyContainerEdits(edits specs.ContainerEdits, configDevices *ConfigDevice
// 4. Now we process all the mounts we collected from the spec in order to turn them into disk devices.
// This operation has a side effect: it generates a list of indirect symlinks (see `specMountToNativeDev`)
// 5. Merge all the hooks (direct + indirect) into a single list of hooks.
func GenerateFromCDI(inst instance.Instance, cdiID ID, l logger.Logger) (*ConfigDevices, *Hooks, error) {
func GenerateFromCDI(s *state.State, inst instance.Instance, cdiID ID, l logger.Logger) (*ConfigDevices, *Hooks, error) {
// 1. Generate the CDI specification
spec, err := generateSpec(cdiID, inst)
spec, err := generateSpec(s, cdiID, inst)
if err != nil {
return nil, nil, fmt.Errorf("Failed to generate CDI spec: %w", err)
}
Expand Down
26 changes: 21 additions & 5 deletions lxd/device/cdi/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"tags.cncf.io/container-device-interface/specs-go"

"github.com/canonical/lxd/lxd/instance"
"github.com/canonical/lxd/lxd/state"
"github.com/canonical/lxd/shared"
"github.com/canonical/lxd/shared/logger"
)
Expand All @@ -39,7 +40,7 @@ func defaultNvidiaTegraCSVFiles(rootPath string) []string {
}

// generateNvidiaSpec generates a CDI spec for an Nvidia vendor.
func generateNvidiaSpec(cdiID ID, inst instance.Instance) (*specs.Spec, error) {
func generateNvidiaSpec(s *state.State, cdiID ID, inst instance.Instance) (*specs.Spec, error) {
l := logger.AddContext(logger.Ctx{"instanceName": inst.Name(), "projectName": inst.Project().Name, "cdiID": cdiID.String()})
mode := nvcdi.ModeAuto
if cdiID.Class == IGPU {
Expand All @@ -62,15 +63,30 @@ func generateNvidiaSpec(cdiID ID, inst instance.Instance) (*specs.Spec, error) {
}

rootPath := ""
if shared.InSnap() {
devRootPath := ""
if s.OS.InUbuntuCore() {
//
// This magic "gpu-2404-2" name comes from:
// https://github.com/canonical/mesa-2404/blob/0e48b4d1b8e5cb4d3098d64417025824decd9846/scripts/bin/gpu-2404-provider-wrapper.in#L5
//
// Also, we can't use `/snap/lxd/current/gpu-2404-2` as a rootPath because of a bug in the nvcdi package
// which causes nvcdi to fail to look up a library when the driver root path contains a symlink
// (in our case it's `/snap/lxd/current`).
// We work around it by using the $SNAP environment variable, which is not a symlink but a path to the lxd snap
// with a revision number like `/snap/lxd/12345`.
//
rootPath = os.Getenv("SNAP") + "/gpu-2404-2"
devRootPath = "/"
} else if shared.InSnap() {
rootPath = "/var/lib/snapd/hostfs"
devRootPath = rootPath
}

cdilib, err := nvcdi.New(
nvcdi.WithDeviceNamers(indexDeviceNamer, uuidDeviceNamer),
nvcdi.WithLogger(NewCDILogger(l)),
nvcdi.WithDriverRoot(rootPath),
nvcdi.WithDevRoot(rootPath),
nvcdi.WithDevRoot(devRootPath),
nvcdi.WithNVIDIACDIHookPath(nvidiaCTKPath),
nvcdi.WithMode(mode),
nvcdi.WithCSVFiles(defaultNvidiaTegraCSVFiles(rootPath)),
Expand Down Expand Up @@ -109,10 +125,10 @@ func generateNvidiaSpec(cdiID ID, inst instance.Instance) (*specs.Spec, error) {
}

// generateSpec generates a CDI spec for the given CDI ID.
func generateSpec(cdiID ID, inst instance.Instance) (*specs.Spec, error) {
func generateSpec(s *state.State, cdiID ID, inst instance.Instance) (*specs.Spec, error) {
switch cdiID.Vendor {
case NVIDIA:
return generateNvidiaSpec(cdiID, inst)
return generateNvidiaSpec(s, cdiID, inst)
default:
return nil, fmt.Errorf("Unsupported CDI vendor (%q) for the spec generation", cdiID.Vendor)
}
Expand Down
2 changes: 1 addition & 1 deletion lxd/device/gpu_physical.go
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ func (d *gpuPhysical) startContainer() (*deviceConfig.RunConfig, error) {
return nil, fmt.Errorf(`MIG GPU notation detected for a "physical" gputype device. Choose a "mig" gputype device instead.`)
}

configDevices, hooks, err := cdi.GenerateFromCDI(d.inst, cdiID, d.logger)
configDevices, hooks, err := cdi.GenerateFromCDI(d.state, d.inst, cdiID, d.logger)
if err != nil {
return nil, err
}
Expand Down
4 changes: 4 additions & 0 deletions lxd/instance/instance_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,10 @@ func ValidConfig(sysOS *sys.OS, config map[string]string, expanded bool, instanc
return fmt.Errorf("nvidia.runtime is incompatible with privileged containers")
}

if sysOS.InUbuntuCore() && shared.IsTrue(config["nvidia.runtime"]) {
return fmt.Errorf("nvidia.runtime is incompatible with Ubuntu Core")
}

// Validate pinning strategy when limits.cpu specifies static pinning.
cpuPinStrategy := config["limits.cpu.pin_strategy"]
cpuLimit := config["limits.cpu"]
Expand Down
13 changes: 13 additions & 0 deletions lxd/sys/os.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,16 @@ func (s *OS) Init() ([]cluster.Warning, error) {
func (s *OS) InitStorage() error {
// Delegate entirely to initStorageDirs and hand its error back to the caller unchanged.
return s.initStorageDirs()
}

// InUbuntuCore reports whether we're running on Ubuntu Core.
// The result is true only when shared.InSnap() holds and the "NAME" entry of
// s.ReleaseInfo is exactly "Ubuntu Core"; in every other case it is false.
func (s *OS) InUbuntuCore() bool {
	// The snap check is evaluated first, so the release info is only consulted
	// when we are inside a snap. A missing "NAME" key yields "" and compares false.
	return shared.InSnap() && s.ReleaseInfo["NAME"] == "Ubuntu Core"
}

0 comments on commit 41eac02

Please sign in to comment.