Address issues with running DRA driver on GKE #23

Merged · 5 commits · Nov 21, 2023
2 changes: 2 additions & 0 deletions cmd/nvidia-dra-plugin/cdi.go
@@ -50,6 +50,7 @@ type CDIHandler struct {
nvcdi nvcdi.Interface
registry cdiapi.Registry
driverRoot string
devRoot string
targetDriverRoot string
nvidiaCTKPath string

@@ -84,6 +85,7 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) {
nvcdilib, err := nvcdi.New(
nvcdi.WithDeviceLib(h.nvdevice),
nvcdi.WithDriverRoot(h.driverRoot),
nvcdi.WithDevRoot(h.devRoot),
nvcdi.WithLogger(h.logger),
nvcdi.WithNvmlLib(h.nvml),
nvcdi.WithMode("nvml"),
11 changes: 8 additions & 3 deletions cmd/nvidia-dra-plugin/device_state.go
@@ -22,6 +22,7 @@ import (
"sync"

"github.com/NVIDIA/go-nvlib/pkg/nvml"
"k8s.io/klog/v2"

nascrd "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1"
)
@@ -126,7 +127,8 @@ type DeviceState struct {
}

func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) {
nvdevlib, err := newDeviceLib(root(config.flags.containerDriverRoot))
containerDriverRoot := root(config.flags.containerDriverRoot)
nvdevlib, err := newDeviceLib(containerDriverRoot)
if err != nil {
return nil, fmt.Errorf("failed to create device library: %w", err)
}
@@ -136,12 +138,15 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) {
return nil, fmt.Errorf("error enumerating all possible devices: %w", err)
}

devRoot := containerDriverRoot.getDevRoot()
klog.Infof("using devRoot=%v", devRoot)

hostDriverRoot := config.flags.hostDriverRoot
containerDriverRoot := config.flags.containerDriverRoot
cdi, err := NewCDIHandler(
WithNvml(nvdevlib.nvmllib),
WithDeviceLib(nvdevlib),
WithDriverRoot(containerDriverRoot),
WithDriverRoot(string(containerDriverRoot)),
WithDevRoot(devRoot),
WithTargetDriverRoot(hostDriverRoot),
WithNvidiaCTKPath(config.flags.nvidiaCTKPath),
WithCDIRoot(config.flags.cdiRoot),
20 changes: 20 additions & 0 deletions cmd/nvidia-dra-plugin/find.go
@@ -18,6 +18,7 @@ package main

import (
"fmt"
"os"
"path/filepath"
)

@@ -60,6 +61,25 @@ func (r root) getNvidiaSMIPath() (string, error) {
return binaryPath, nil
}

// isDevRoot checks whether the specified root is a dev root.
// A dev root is defined as a root containing a /dev folder.
func (r root) isDevRoot() bool {
stat, err := os.Stat(filepath.Join(string(r), "dev"))
if err != nil {
return false
}
return stat.IsDir()
}

// getDevRoot returns the dev root associated with the root.
// If the root is not a dev root, this defaults to "/".
func (r root) getDevRoot() string {
if r.isDevRoot() {
return string(r)
}
return "/"
}

// findFile searches the root for a specified file.
// A number of folders can be specified to search in addition to the root itself.
// If the file represents a symlink, this is resolved and the final path is returned.
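
A minimal standalone Go sketch (not part of this diff) that mirrors the `isDevRoot`/`getDevRoot` methods above, to illustrate the intended behavior: a driver root only doubles as a dev root when it contains a `dev/` directory; otherwise device nodes are resolved against `/`. The `/opt/nvidia` path in `main` is just an illustrative GKE-style layout.

```go
// Standalone illustration of the devRoot resolution semantics; mirrors the
// methods added in find.go but is not part of the driver code itself.
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

type root string

// isDevRoot reports whether the root contains a dev/ directory.
func (r root) isDevRoot() bool {
	stat, err := os.Stat(filepath.Join(string(r), "dev"))
	if err != nil {
		return false
	}
	return stat.IsDir()
}

// getDevRoot returns the root itself if it is a dev root and "/" otherwise.
func (r root) getDevRoot() string {
	if r.isDevRoot() {
		return string(r)
	}
	return "/"
}

func main() {
	// On a GKE node where the driver installer populates /opt/nvidia but
	// device nodes remain under /dev, this prints "/".
	fmt.Println(root("/opt/nvidia").getDevRoot())
}
```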
7 changes: 7 additions & 0 deletions cmd/nvidia-dra-plugin/options.go
@@ -31,6 +31,13 @@ func WithDriverRoot(root string) cdiOption {
}
}

// WithDevRoot provides a cdiOption to set the device root used by the 'cdi' interface.
func WithDevRoot(root string) cdiOption {
return func(c *CDIHandler) {
c.devRoot = root
}
}

// WithTargetDriverRoot provides a cdiOption to set the target driver root used by the 'cdi' interface.
func WithTargetDriverRoot(root string) cdiOption {
return func(c *CDIHandler) {
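
For reference, the sketch below (simplified, illustrative names rather than the driver's actual types) shows how functional options such as `WithDevRoot` compose: each option is a closure that mutates the handler, and the constructor applies them in order.

```go
// Simplified sketch of the functional-options pattern used for CDIHandler
// configuration; the types here are illustrative, not the driver's own.
package main

import "fmt"

type handler struct {
	driverRoot string
	devRoot    string
}

type option func(*handler)

func withDriverRoot(root string) option {
	return func(h *handler) { h.driverRoot = root }
}

func withDevRoot(root string) option {
	return func(h *handler) { h.devRoot = root }
}

func newHandler(opts ...option) *handler {
	h := &handler{}
	// Apply each option in the order it was passed.
	for _, opt := range opts {
		opt(h)
	}
	return h
}

func main() {
	h := newHandler(withDriverRoot("/opt/nvidia"), withDevRoot("/"))
	fmt.Printf("driverRoot=%s devRoot=%s\n", h.driverRoot, h.devRoot)
}
```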
135 changes: 135 additions & 0 deletions demo/clusters/gke/create-cluster.sh
@@ -0,0 +1,135 @@
#!/bin/bash

# Copyright 2023 NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

: ${PROJECT_NAME:=$(gcloud config list --format 'value(core.project)' 2>/dev/null)}

if [[ -z ${PROJECT_NAME} ]]; then
echo "Project name could not be determined"
echo "Please run 'gcloud config set project'"
exit 1
fi

CURRENT_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
PROJECT_DIR="$(cd -- "$( dirname -- "${CURRENT_DIR}/../../../.." )" &> /dev/null && pwd)"

# We extract information from versions.mk
function from_versions_mk() {
local makevar=$1
local value=$(grep -E "^\s*${makevar}\s+[\?:]= " ${PROJECT_DIR}/versions.mk)
echo ${value##*= }
}
DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")

NETWORK_NAME="${DRIVER_NAME}-net"
CLUSTER_NAME="${DRIVER_NAME}-cluster"

## Create the Network for the cluster
gcloud compute networks create "${NETWORK_NAME}" \
--quiet \
--project="${PROJECT_NAME}" \
--description=Manually\ created\ network\ for\ TMS\ DRA\ Alpha\ cluster \
--subnet-mode=auto \
--mtu=1460 \
--bgp-routing-mode=regional

## Create the cluster
gcloud container clusters create "${CLUSTER_NAME}" \
--quiet \
--enable-kubernetes-alpha \
--no-enable-autorepair \
--no-enable-autoupgrade \
--region us-west1 \
--network "${NETWORK_NAME}" \
--node-labels=nvidia.com/dra.controller=true

# Create t4 node pool
gcloud beta container node-pools create "pool-1" \
--quiet \
--project "${PROJECT_NAME}" \
--cluster "${CLUSTER_NAME}" \
--region "us-west1" \
--node-version "1.27.3-gke.100" \
--machine-type "n1-standard-8" \
--accelerator "type=nvidia-tesla-t4,count=1" \
--image-type "UBUNTU_CONTAINERD" \
--disk-type "pd-standard" \
--disk-size "100" \
--metadata disable-legacy-endpoints=true \
--scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" \
--num-nodes "2" \
--enable-autoscaling \
--min-nodes "2" \
--max-nodes "6" \
--location-policy "ANY" \
--no-enable-autoupgrade \
--no-enable-autorepair \
--max-surge-upgrade 1 \
--max-unavailable-upgrade 0 \
--node-locations "us-west1-a" \
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu=present,nvidia.com/dra.kubelet-plugin=true

# Create v100 node pool
gcloud beta container node-pools create "pool-2" \
--quiet \
--project "${PROJECT_NAME}" \
--cluster "${CLUSTER_NAME}" \
--region "us-west1" \
--node-version "1.27.3-gke.100" \
--machine-type "n1-standard-8" \
--accelerator "type=nvidia-tesla-v100,count=1" \
--image-type "UBUNTU_CONTAINERD" \
--disk-type "pd-standard" \
--disk-size "100" \
--metadata disable-legacy-endpoints=true \
--scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" \
--num-nodes "1" \
--enable-autoscaling \
--min-nodes "1" \
--max-nodes "6" \
--location-policy "ANY" \
--no-enable-autoupgrade \
--no-enable-autorepair \
--max-surge-upgrade 1 \
--max-unavailable-upgrade 0 \
--node-locations "us-west1-a" \
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu=present,nvidia.com/dra.kubelet-plugin=true

## Allow the GPU nodes access to the internet
gcloud compute routers create ${NETWORK_NAME}-nat-router \
--quiet \
--project "${PROJECT_NAME}" \
--network "${NETWORK_NAME}" \
--region "us-west1"

gcloud compute routers nats create "${NETWORK_NAME}-nat-config" \
--quiet \
--project "${PROJECT_NAME}" \
--router "${NETWORK_NAME}-nat-router" \
--nat-all-subnet-ip-ranges \
--auto-allocate-nat-external-ips \
--router-region "us-west1"

## Start using this cluster for kubectl
gcloud container clusters get-credentials "${CLUSTER_NAME}" --location="us-west1"

## Launch the nvidia-driver-installer daemonset to install the GPU drivers on any GPU nodes that come online:
kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/ubuntu/daemonset-preloaded.yaml

## Create the nvidia namespace
kubectl create namespace nvidia

## Deploy a custom daemonset that prepares a node for use with DRA
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-dra-driver/456d097feb452cca1351817bab2ccd0782e96c9f/demo/prepare-gke-nodes-for-dra.yaml
55 changes: 55 additions & 0 deletions demo/clusters/gke/delete-cluster.sh
@@ -0,0 +1,55 @@
#!/bin/bash

# Copyright 2023 NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

: ${PROJECT_NAME:=$(gcloud config list --format 'value(core.project)' 2>/dev/null)}

CURRENT_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
PROJECT_DIR="$(cd -- "$( dirname -- "${CURRENT_DIR}/../../../.." )" &> /dev/null && pwd)"

# We extract information from versions.mk
function from_versions_mk() {
local makevar=$1
local value=$(grep -E "^\s*${makevar}\s+[\?:]= " ${PROJECT_DIR}/versions.mk)
echo ${value##*= }
}
DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")

NETWORK_NAME="${DRIVER_NAME}-net"
CLUSTER_NAME="${DRIVER_NAME}-cluster"

## Delete the cluster
gcloud container clusters delete "${CLUSTER_NAME}" \
--quiet \
--project "${PROJECT_NAME}" \
--region "us-west1"

## Delete the nat config
gcloud compute routers nats delete "${NETWORK_NAME}-nat-config" \
--quiet \
--project "${PROJECT_NAME}" \
--router "${NETWORK_NAME}-nat-router" \
--router-region "us-west1"

## Delete the nat router
gcloud compute routers delete ${NETWORK_NAME}-nat-router \
--quiet \
--project "${PROJECT_NAME}" \
--region "us-west1"

## Delete the network
gcloud compute networks delete "${NETWORK_NAME}" \
--quiet \
--project "${PROJECT_NAME}"
41 changes: 41 additions & 0 deletions demo/clusters/gke/install-dra-driver.sh
@@ -0,0 +1,41 @@
#!/bin/bash

# Copyright 2023 NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

CURRENT_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
PROJECT_DIR="$(cd -- "$( dirname -- "${CURRENT_DIR}/../../../.." )" &> /dev/null && pwd)"

# We extract information from versions.mk
function from_versions_mk() {
local makevar=$1
local value=$(grep -E "^\s*${makevar}\s+[\?:]= " ${PROJECT_DIR}/versions.mk)
echo ${value##*= }
}
DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")

: ${IMAGE_REGISTRY:=registry.gitlab.com/nvidia/cloud-native/k8s-dra-driver/staging}
: ${IMAGE_NAME:=${DRIVER_NAME}}
: ${IMAGE_TAG:=530b16c-ubuntu20.04}

helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJECT_DIR}/deployments/helm/k8s-dra-driver \
--set image.repository=${IMAGE_REGISTRY}/${IMAGE_NAME} \
--set image.tag=${IMAGE_TAG} \
--set image.pullPolicy=Always \
--set controller.priorityClassName="" \
--set kubeletPlugin.priorityClassName="" \
--set nvidiaDriverRoot="/opt/nvidia" \
--set kubeletPlugin.tolerations[0].key=nvidia.com/gpu \
--set kubeletPlugin.tolerations[0].operator=Exists \
--set kubeletPlugin.tolerations[0].effect=NoSchedule
48 changes: 48 additions & 0 deletions demo/specs/selectors/README.md
@@ -0,0 +1,48 @@
#### List the set of nodes in the cluster
```console
kubectl get nodes -A
```

#### Show the set of nodes which have GPUs available
```console
kubectl get nodeallocationstates.nas.gpu.resource.nvidia.com -A
```

#### Show the set of allocatable GPUs from each node
```console
kubectl get nodeallocationstates.nas.gpu.resource.nvidia.com -A -o=json \
| jq -r '.items[]
| "\(.metadata.name):",
(.spec.allocatableDevices[])'
```

#### Open the yaml files with the specs for the demo
```console
vi -O parameters.yaml claims.yaml pods.yaml
```

#### Create a namespace for the demo and deploy the demo pods
```console
kubectl create namespace kubecon-demo
kubectl apply -f parameters.yaml -f claims.yaml -f pods.yaml
```

#### Show the pods running
```console
kubectl get pod -n kubecon-demo
```

#### Show the set of GPUs allocated to some claim
```console
kubectl get nodeallocationstates.nas.gpu.resource.nvidia.com -A -o=json \
| jq -r '.items[]
| select(.spec.allocatedClaims)
| "\(.metadata.name):",
(.spec.allocatedClaims[])'
```

#### Show the logs of the inference and training pods
```console
kubectl logs -n kubecon-demo inference-pod
kubectl logs -n kubecon-demo training-pod
```