Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cuda 10.0 support #1

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions hooks/nvidia-bootstrap/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ all: image

.PHONY: image push

image:
docker build -t kopeio/nvidia-bootstrap:1.6.0 -f image/Dockerfile image/
# Builds one image per supported CUDA version; the version is passed to the
# Dockerfile through the CUDA_VERSION build argument.
# NOTE(review): the tags below use the name "nvidia-device-plugin" although
# this Makefile belongs to hooks/nvidia-bootstrap — confirm the image name is
# intended, since identical tags built from another directory would presumably
# overwrite each other when pushed.
image: ## Build the images
docker build --build-arg CUDA_VERSION=9.1 -t qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1 -f image/Dockerfile image/
docker build --build-arg CUDA_VERSION=10.0 -t qntfy/nvidia-device-plugin:0.2.0-CUDA-10.0 -f image/Dockerfile image/

push: image
docker push kopeio/nvidia-bootstrap:1.6.0
push: ## Push the images
docker push qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1
docker push qntfy/nvidia-device-plugin:0.2.0-CUDA-10.0
4 changes: 4 additions & 0 deletions hooks/nvidia-bootstrap/image/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,8 @@ RUN apt-get update && apt-get -yq install curl jq

ADD run.sh /run.sh

# CUDA toolkit version selected for this image. Override at build time with
# `docker build --build-arg CUDA_VERSION=<version>`; defaults to 9.1.
ARG CUDA_VERSION=9.1

# Copy the build argument into the image environment so that it is still set
# when the container's CMD runs. Uses the key="value" form of ENV rather than
# the legacy space-separated form.
ENV CUDA_VERSION="${CUDA_VERSION}"

CMD [ "/bin/bash", "/run.sh" ]
101 changes: 69 additions & 32 deletions hooks/nvidia-bootstrap/image/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,43 +26,80 @@ CACHE_DIR_CONTAINER="${ROOTFS_DIR}${CACHE_DIR_HOST}"
# AWS Instance Types to Nvidia Card Mapping (cut and pasted from AWS docs)
# Load the correct driver for the correct instance type
# Instances Product Type Product Series Product
# G2 GRID GRID Series GRID K520 <-- I think they meant G3
# G2 GRID GRID Series GRID K520 (deprecated)
# G3 Tesla M-Series M-60
# G3S Tesla M-Series M-60
# G4 Tesla T-Series T-4
# P2 Tesla K-Series K-80
# P3 Tesla V-Series V100
# Both P2 and P3 are set for Cuda Toolkit 9.1
# http://www.nvidia.com/Download/index.aspx
declare -A class_to_driver_file
class_to_driver_file=( \
["g3"]="http://us.download.nvidia.com/XFree86/Linux-x86_64/367.124/NVIDIA-Linux-x86_64-367.124.run" \
["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
)
declare -A class_to_driver_checksum
class_to_driver_checksum=( \
["g3"]="77f37939efeea4b6505842bed50445971992e303" \
["g3s"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
)

# CUDA Files that need to be installed ~1.4GB
# First one is main installation
# Subsequent files are patches which need to be applied in order
# Order in the arrays below matters
# https://developer.nvidia.com/cuda-downloads
cuda_files=( \
"https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/cuda_9.1.85_387.26_linux" \
"https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/1/cuda_9.1.85.1_linux" \
"https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/2/cuda_9.1.85.2_linux" \
"https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/3/cuda_9.1.85.3_linux" \
)
cuda_files_checksums=( \
"1540658f4fe657dddd8b0899555b7468727d4aa8" \
"7ec6970ecd81163b0d02ef30d35599e7fd6e97d8" \
"cfa3b029b58fc117d8ce510a70efc848924dd565" \
"6269a2c5784b08997edb97ea0020fb4e6c8769ed" \
)
# Select the NVIDIA driver and CUDA installer downloads, together with their
# expected checksums, according to the value of CUDA_VERSION.
# Each branch fills in:
#   class_to_driver_file / class_to_driver_checksum - keyed by instance class
#   cuda_files / cuda_files_checksums               - one checksum per file,
#                                                     listed in the same order
# Any other CUDA_VERSION value prints a message and exits with status 1.
case $CUDA_VERSION in
9.1)
# All listed instance classes share the same 390.46 driver download.
class_to_driver_file=( \
["g3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
["g4dn"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
)

class_to_driver_checksum=( \
["g3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
["g3s"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
["g4dn"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
)

# CUDA Files that need to be installed ~1.4GB
# First one is main installation
# Subsequent files are patches which need to be applied in order
# Order in the arrays below matters
# https://developer.nvidia.com/cuda-downloads
cuda_files=( \
"https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/cuda_9.1.85_387.26_linux" \
"https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/1/cuda_9.1.85.1_linux" \
"https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/2/cuda_9.1.85.2_linux" \
"https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/3/cuda_9.1.85.3_linux" \
)

cuda_files_checksums=( \
"1540658f4fe657dddd8b0899555b7468727d4aa8" \
"7ec6970ecd81163b0d02ef30d35599e7fd6e97d8" \
"cfa3b029b58fc117d8ce510a70efc848924dd565" \
"6269a2c5784b08997edb97ea0020fb4e6c8769ed" \
)
;;
10.0)
# All listed instance classes share the same 410.129 driver download.
class_to_driver_file=( \
["g3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
["g3s"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
["g4dn"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
["p2"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
["p3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
)

class_to_driver_checksum=( \
["g3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
["g3s"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
["g4dn"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
["p2"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
["p3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
)

# Single installer, no patch files for this branch.
# NOTE(review): this branch is selected by CUDA_VERSION=10.0 and uses the
# 410.129 driver above, but the installer URL below is for CUDA 10.1.243 (its
# file name carries driver version 418.87.00). Confirm whether the CUDA 10.0
# installer was intended; the checksum below would have to change with it.
cuda_files=( \
"http://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.243_418.87.00_linux.run" \
)

cuda_files_checksums=( \
"36706e2c0fb7efa14aea6a3c889271d97fd3575d" \
)
;;
# Unknown version: report it and stop instead of continuing with empty tables.
*) echo "CUDA ${CUDA_VERSION} not supported by kops hook" && exit 1
esac

# containsElement NEEDLE [CANDIDATE...]
# Returns 0 when NEEDLE is exactly equal to at least one CANDIDATE and 1
# otherwise (including when no candidates are given).
# The loop variable is local so that a global `e` in the caller is not
# overwritten as a side effect of the search.
containsElement () {
  local e
  for e in "${@:2}"; do
    # Quoted right-hand side: literal string comparison, not a glob match.
    [[ "$e" = "$1" ]] && return 0
  done
  return 1
}

Expand Down Expand Up @@ -170,7 +207,7 @@ for (( i=0; i<${length}; i++ )); do
touch $filepath_installed # Mark successful installation
elif [[ $download =~ .*local_installers.*cuda.* ]]; then
# Install the primary cuda library (using gcc)
chroot ${ROOTFS_DIR} $filepath_host --toolkit --silent --verbose
chroot ${ROOTFS_DIR} $filepath_host --toolkit --silent
touch $filepath_installed # Mark successful installation
elif [[ $download =~ .*patches.*cuda.* ]]; then
# Install an update to the primary cuda library (using gcc)
Expand Down
10 changes: 6 additions & 4 deletions hooks/nvidia-device-plugin/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@ all: image

.PHONY: image push

image: ## Build the image
docker build -t dcwangmit01/nvidia-device-plugin:0.1.0 -f image/Dockerfile image/
image: ## Build the images
docker build --build-arg CUDA_VERSION=9.1 -t qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1 -f image/Dockerfile image/
docker build --build-arg CUDA_VERSION=10.0 -t qntfy/nvidia-device-plugin:0.2.0-CUDA-10.0 -f image/Dockerfile image/

push: ## Push the image
docker push dcwangmit01/nvidia-device-plugin:0.1.0
push: ## Push the images
docker push qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1
docker push qntfy/nvidia-device-plugin:0.2.0-CUDA-10.0

help: ## Print list of Makefile targets
@# Taken from https://github.com/spf13/hugo/blob/master/Makefile
Expand Down
18 changes: 13 additions & 5 deletions hooks/nvidia-device-plugin/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ Using this hook indicates that you agree to the Nvidia
Although this hook *may* work with many combinations of software versions and
images, it has only been tested with the following:

* kops: **1.9**
* kubernetes: 1.10, **1.11**
* OS Image: **`kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27`**
* kops: 1.9, **1.13**
* kubernetes: 1.10, 1.11, **1.13**
* OS Image: **`kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2019-06-21`**
* This is most certainly not the default image for kops. The OS image must
be explicitly overridden in the cluster or instancegroup spec.
* Debian stretch is needed because `nvidia-docker` requires a newer version
Expand All @@ -56,10 +56,18 @@ This kops hook was developed against the following version combinations.

| Kops Version | Kubernetes Version | GPU Mode | OS Image |
| ------------- | ------------------ | ------------ | -------- |
| 1.13.0 | 1.13 | deviceplugin | kope.io/k8s-1.12-debian-stretch-amd64-hvm-ebs-2019-06-21
| 1.10-beta.1 | 1.10 | deviceplugin | kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27
| 1.9.1 | 1.11 | deviceplugin | kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27
| 1.9.1 | 1.10 | legacy | kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27

#### Driver Support

| Image | NVIDIA Driver Version | CUDA Version |
| ------------------------------------------ | --------------------- | ------------ |
| qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1 | 390.46 | 9.1
| qntfy/nvidia-device-plugin:0.2.0-CUDA-10.0 | 410.129 | 10.0

## Using this DevicePlugin

### Create a Cluster with GPU Nodes
Expand Down Expand Up @@ -95,7 +103,7 @@ spec:
image: kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27
hooks:
- execContainer:
image: dcwangmit01/nvidia-device-plugin:0.1.0
image: qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1

### The settings below are only necessary for kubernetes <= 1.11.0, where
### deviceplugins are not enabled by default.
Expand All @@ -120,7 +128,7 @@ spec:
image: kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27
hooks:
- execContainer:
image: dcwangmit01/nvidia-device-plugin:0.1.0
image: qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1
environment:
NVIDIA_DEVICE_PLUGIN_MODE: legacy
kubelet:
Expand Down
6 changes: 6 additions & 0 deletions hooks/nvidia-device-plugin/image/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,10 @@ RUN apt-get update && \
COPY run.sh /run.sh
COPY files/* /nvidia-device-plugin/

# CUDA toolkit version selected for this image. Override at build time with
# `docker build --build-arg CUDA_VERSION=<version>`; defaults to 9.1.
ARG CUDA_VERSION=9.1

# Copy the build argument into the image environment so that it is still set
# when the container's CMD runs. Uses the key="value" form of ENV rather than
# the legacy space-separated form.
ENV CUDA_VERSION="${CUDA_VERSION}"

# Also record the selected version in a KEY=value file next to the other
# plugin files (the format expected by a systemd EnvironmentFile= directive).
RUN echo "CUDA_VERSION=${CUDA_VERSION}" > /nvidia-device-plugin/nvidia.env

CMD [ "/bin/bash", "/run.sh" ]
101 changes: 67 additions & 34 deletions hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,43 +29,76 @@ CACHE_DIR=/nvidia-device-plugin
# G2 GRID GRID Series GRID K520 (deprecated)
# G3 Tesla M-Series M-60
# G3S Tesla M-Series M-60
# G4 Tesla T-Series T-4
# P2 Tesla K-Series K-80
# P3 Tesla V-Series V100
# http://www.nvidia.com/Download/index.aspx
declare -A class_to_driver_file
class_to_driver_file=( \
["g2"]="http://us.download.nvidia.com/XFree86/Linux-x86_64/367.124/NVIDIA-Linux-x86_64-367.124.run" \
["g3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
)
declare -A class_to_driver_checksum
class_to_driver_checksum=( \
["g2"]="77f37939efeea4b6505842bed50445971992e303" \
["g3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
["g3s"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
)

# CUDA Files that need to be installed ~1.4GB
# First one is main installation
# Subsequent files are patches which need to be applied in order
# Order in the arrays below matters
# https://developer.nvidia.com/cuda-downloads
cuda_files=( \
"https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/cuda_9.1.85_387.26_linux" \
"https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/1/cuda_9.1.85.1_linux" \
"https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/2/cuda_9.1.85.2_linux" \
"https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/3/cuda_9.1.85.3_linux" \
)
cuda_files_checksums=( \
"1540658f4fe657dddd8b0899555b7468727d4aa8" \
"7ec6970ecd81163b0d02ef30d35599e7fd6e97d8" \
"cfa3b029b58fc117d8ce510a70efc848924dd565" \
"6269a2c5784b08997edb97ea0020fb4e6c8769ed" \
)
# Select the NVIDIA driver and CUDA installer downloads, together with their
# expected checksums, according to the value of CUDA_VERSION.
# Each branch fills in:
#   class_to_driver_file / class_to_driver_checksum - keyed by instance class
#   cuda_files / cuda_files_checksums               - one checksum per file,
#                                                     listed in the same order
# Any other CUDA_VERSION value prints a message and exits with status 1.
case $CUDA_VERSION in
9.1)
# All listed instance classes share the same 390.46 driver download.
class_to_driver_file=( \
["g3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
["g4dn"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
)

class_to_driver_checksum=( \
["g3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
["g3s"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
["g4dn"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
)

# CUDA Files that need to be installed ~1.4GB
# First one is main installation
# Subsequent files are patches which need to be applied in order
# Order in the arrays below matters
# https://developer.nvidia.com/cuda-downloads
cuda_files=( \
"https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/cuda_9.1.85_387.26_linux" \
"https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/1/cuda_9.1.85.1_linux" \
"https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/2/cuda_9.1.85.2_linux" \
"https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/3/cuda_9.1.85.3_linux" \
)

cuda_files_checksums=( \
"1540658f4fe657dddd8b0899555b7468727d4aa8" \
"7ec6970ecd81163b0d02ef30d35599e7fd6e97d8" \
"cfa3b029b58fc117d8ce510a70efc848924dd565" \
"6269a2c5784b08997edb97ea0020fb4e6c8769ed" \
)
;;
10.0)
# All listed instance classes share the same 410.129 driver download.
class_to_driver_file=( \
["g3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
["g3s"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
["g4dn"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
["p2"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
["p3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
)

class_to_driver_checksum=( \
["g3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
["g3s"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
["g4dn"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
["p2"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
["p3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
)

# Single installer, no patch files for this branch.
# NOTE(review): this branch is selected by CUDA_VERSION=10.0 and uses the
# 410.129 driver above, but the installer URL below is for CUDA 10.1.243 (its
# file name carries driver version 418.87.00). Confirm whether the CUDA 10.0
# installer was intended; the checksum below would have to change with it.
cuda_files=( \
"http://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.243_418.87.00_linux.run" \
)

cuda_files_checksums=( \
"36706e2c0fb7efa14aea6a3c889271d97fd3575d" \
)
;;
# Unknown version: report it and stop instead of continuing with empty tables.
*) echo "CUDA ${CUDA_VERSION} not supported by kops hook" && exit 1
esac

# containsElement NEEDLE [CANDIDATE...]
# Returns 0 when NEEDLE is exactly equal to at least one CANDIDATE and 1
# otherwise (including when no candidates are given).
# The loop variable is local so that a global `e` in the caller is not
# overwritten as a side effect of the search.
containsElement () {
  local e
  for e in "${@:2}"; do
    # Quoted right-hand side: literal string comparison, not a glob match.
    [[ "$e" = "$1" ]] && return 0
  done
  return 1
}

Expand Down Expand Up @@ -102,7 +135,7 @@ echo "Identified machine as AWS_INSTANCE_TYPE[$AWS_INSTANCE_TYPE] AWS_INSTANCE_C
# Install with --no-upgrade so that the c-libs are not upgraded, possibly
# breaking programs and requiring restart
apt-get -y update
apt-get -y --no-upgrade install gcc libc-dev linux-headers-$(uname -r)
apt-get -y --no-upgrade install gcc libc-dev libxml2-dev linux-headers-$(uname -r)
apt-get -y clean
apt-get -y autoremove

Expand Down Expand Up @@ -153,7 +186,7 @@ for (( i=0; i<${length}; i++ )); do
touch $filepath_installed # Mark successful installation
elif [[ $download =~ .*local_installers.*cuda.* ]]; then
# Install the primary cuda library
$filepath --toolkit --silent --verbose
$filepath --toolkit --silent
touch $filepath_installed # Mark successful installation
elif [[ $download =~ .*patches.*cuda.* ]]; then
# Install an update to the primary cuda library
Expand Down
4 changes: 2 additions & 2 deletions hooks/nvidia-device-plugin/image/files/02-nvidia-docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ EOF
# --force-confold prevents prompt for replacement of daemon.json
apt-get -y update
# Stop protokube to ensure not bring kubelet up again
systemctl stop protokube
# `|| true` forces a zero exit status, so a failure to stop protokube is
# ignored and the script carries on.
# NOTE(review): presumably this covers hosts where the protokube unit is
# absent — confirm.
systemctl stop protokube || true
# Stop kubelet so that it does not bring stopped containers back up and leave
# them behind as orphan containers
systemctl stop kubelet
Expand All @@ -86,4 +86,4 @@ systemctl mask cloud-init.service
systemctl mask kops-configuration.service

# Restore protokube and protokube will bring up kubelet
systemctl start protokube
# If protokube cannot be started, fall back to starting kubelet directly.
systemctl start protokube || systemctl start kubelet
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ After=cloud-config.target cloud-init.target kops-configuration.service apt-daily

[Service]
Type=oneshot
EnvironmentFile=/nvidia-device-plugin/nvidia.env
ExecStart=/bin/bash -c "/nvidia-device-plugin/nvidia-device-plugin.sh"

[Install]
Expand Down