From d9eea32883ad7105cd05b591b9a3e25f8e653c1d Mon Sep 17 00:00:00 2001 From: Casey Hilland Date: Thu, 7 Nov 2019 15:55:13 -0500 Subject: [PATCH 1/5] Add cuda 10.0 support --- hooks/nvidia-bootstrap/Makefile | 10 +- hooks/nvidia-bootstrap/image/Dockerfile | 4 + hooks/nvidia-bootstrap/image/run.sh | 92 ++++++++++++------ hooks/nvidia-device-plugin/Makefile | 10 +- hooks/nvidia-device-plugin/image/Dockerfile | 6 ++ .../image/files/01-aws-nvidia-driver.sh | 96 ++++++++++++------- .../image/files/02-nvidia-docker.sh | 4 +- .../image/files/nvidia-device-plugin.service | 1 + 8 files changed, 148 insertions(+), 75 deletions(-) diff --git a/hooks/nvidia-bootstrap/Makefile b/hooks/nvidia-bootstrap/Makefile index f7c65f5b1a6e2..ba478e94441bf 100644 --- a/hooks/nvidia-bootstrap/Makefile +++ b/hooks/nvidia-bootstrap/Makefile @@ -16,8 +16,10 @@ all: image .PHONY: image push -image: - docker build -t kopeio/nvidia-bootstrap:1.6.0 -f image/Dockerfile image/ +image: ## Build the images + docker build --build-arg CUDA_VERSION=9.1 -t qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1 -f image/Dockerfile image/ + docker build --build-arg CUDA_VERSION=10.0 -t qntfy/nvidia-device-plugin:0.2.0-CUDA-10.0 -f image/Dockerfile image/ -push: image - docker push kopeio/nvidia-bootstrap:1.6.0 +push: ## Push the images + docker push qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1 + docker push qntfy/nvidia-device-plugin:0.2.0-CUDA-10.0 diff --git a/hooks/nvidia-bootstrap/image/Dockerfile b/hooks/nvidia-bootstrap/image/Dockerfile index f9c2ba7fb6bc3..fedb5cae5e74c 100644 --- a/hooks/nvidia-bootstrap/image/Dockerfile +++ b/hooks/nvidia-bootstrap/image/Dockerfile @@ -20,4 +20,8 @@ RUN apt-get update && apt-get -yq install curl jq ADD run.sh /run.sh +ARG CUDA_VERSION=9.1 + +ENV CUDA_VERSION ${CUDA_VERSION} + CMD [ "/bin/bash", "/run.sh" ] diff --git a/hooks/nvidia-bootstrap/image/run.sh b/hooks/nvidia-bootstrap/image/run.sh index 875f77716f048..1d08a613e67c4 100755 --- a/hooks/nvidia-bootstrap/image/run.sh +++ b/hooks/nvidia-bootstrap/image/run.sh @@ -32,37 +32,67 @@ CACHE_DIR_CONTAINER="${ROOTFS_DIR}${CACHE_DIR_HOST}" # Both P2 and P3 are set for Cuda Toolkit 9.1 # http://www.nvidia.com/Download/index.aspx declare -A class_to_driver_file -class_to_driver_file=( \ - ["g3"]="http://us.download.nvidia.com/XFree86/Linux-x86_64/367.124/NVIDIA-Linux-x86_64-367.124.run" \ - ["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ - ["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ - ["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ -) declare -A class_to_driver_checksum -class_to_driver_checksum=( \ - ["g3"]="77f37939efeea4b6505842bed50445971992e303" \ - ["g3s"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ - ["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ - ["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ -) - -# CUDA Files that need to be installed ~1.4GB -# First one is main installation -# Subsequent files are patches which need to be applied in order -# Order in the arrays below matters -# https://developer.nvidia.com/cuda-downloads -cuda_files=( \ - "https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/cuda_9.1.85_387.26_linux" \ - "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/1/cuda_9.1.85.1_linux" \ - "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/2/cuda_9.1.85.2_linux" \ - "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/3/cuda_9.1.85.3_linux" \ -) -cuda_files_checksums=( \ - "1540658f4fe657dddd8b0899555b7468727d4aa8" \ - "7ec6970ecd81163b0d02ef30d35599e7fd6e97d8" \ - "cfa3b029b58fc117d8ce510a70efc848924dd565" \ - "6269a2c5784b08997edb97ea0020fb4e6c8769ed" \ -) +case $CUDA_VERSION in + 9.1) + class_to_driver_file=( \ + ["g3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ + ["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ + ["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ + ["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ + ) + + class_to_driver_checksum=( \ + ["g3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ + ["g3s"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ + ["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ + ["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ + ) + + # CUDA Files that need to be installed ~1.4GB + # First one is main installation + # Subsequent files are patches which need to be applied in order + # Order in the arrays below matters + # https://developer.nvidia.com/cuda-downloads + cuda_files=( \ + "https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/cuda_9.1.85_387.26_linux" \ + "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/1/cuda_9.1.85.1_linux" \ + "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/2/cuda_9.1.85.2_linux" \ + "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/3/cuda_9.1.85.3_linux" \ + ) + + cuda_files_checksums=( \ + "1540658f4fe657dddd8b0899555b7468727d4aa8" \ + "7ec6970ecd81163b0d02ef30d35599e7fd6e97d8" \ + "cfa3b029b58fc117d8ce510a70efc848924dd565" \ + "6269a2c5784b08997edb97ea0020fb4e6c8769ed" \ + ) + ;; + 10.0) + class_to_driver_file=( \ + ["g3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ + ["g3s"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ + ["p2"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ + ["p3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ + ) + + class_to_driver_checksum=( \ + ["g3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ + ["g3s"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ + ["p2"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ + ["p3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ + ) + + cuda_files=( \ + "http://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.243_418.87.00_linux.run" \ + ) + + cuda_files_checksums=( \ + "36706e2c0fb7efa14aea6a3c889271d97fd3575d" \ + ) + ;; + *) echo "CUDA ${CUDA_VERSION} not supported by kops hook" && exit 1 +esac containsElement () { for e in "${@:2}"; do [[ "$e" = "$1" ]] && return 0; done; return 1; } @@ -170,7 +200,7 @@ for (( i=0; i<${length}; i++ )); do touch $filepath_installed # Mark successful installation elif [[ $download =~ .*local_installers.*cuda.* ]]; then # Install the primary cuda library (using gcc) - chroot ${ROOTFS_DIR} $filepath_host --toolkit --silent --verbose + chroot ${ROOTFS_DIR} $filepath_host --toolkit --silent touch $filepath_installed # Mark successful installation elif [[ $download =~ .*patches.*cuda.* ]]; then # Install an update to the primary cuda library (using gcc) diff --git a/hooks/nvidia-device-plugin/Makefile b/hooks/nvidia-device-plugin/Makefile index cb19c580e01a8..ead76ca361e90 100644 --- a/hooks/nvidia-device-plugin/Makefile +++ b/hooks/nvidia-device-plugin/Makefile @@ -18,11 +18,13 @@ all: image .PHONY: image push -image: ## Build the image - docker build -t dcwangmit01/nvidia-device-plugin:0.1.0 -f image/Dockerfile image/ +image: ## Build the images + docker build --build-arg CUDA_VERSION=9.1 -t qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1 -f image/Dockerfile image/ + docker build --build-arg CUDA_VERSION=10.0 -t qntfy/nvidia-device-plugin:0.2.0-CUDA-10.0 -f image/Dockerfile image/ -push: ## Push the image - docker push dcwangmit01/nvidia-device-plugin:0.1.0 +push: ## Push the images + docker push qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1 + docker push qntfy/nvidia-device-plugin:0.2.0-CUDA-10.0 help: ## Print list of Makefile targets @# Taken from https://github.com/spf13/hugo/blob/master/Makefile diff --git a/hooks/nvidia-device-plugin/image/Dockerfile b/hooks/nvidia-device-plugin/image/Dockerfile index 5cfe7857db60f..1310805477799 100644 --- a/hooks/nvidia-device-plugin/image/Dockerfile +++ b/hooks/nvidia-device-plugin/image/Dockerfile @@ -24,4 +24,10 @@ RUN apt-get update && \ COPY run.sh /run.sh COPY files/* /nvidia-device-plugin/ +ARG CUDA_VERSION=9.1 + +ENV CUDA_VERSION ${CUDA_VERSION} + +RUN echo "CUDA_VERSION=${CUDA_VERSION}" > /nvidia-device-plugin/nvidia.env + CMD [ "/bin/bash", "/run.sh" ] diff --git a/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh b/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh index 5140c3426605a..9e0ecaea326bb 100755 --- a/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh +++ b/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh @@ -33,39 +33,67 @@ CACHE_DIR=/nvidia-device-plugin # P3 Tesla V-Series V100 # http://www.nvidia.com/Download/index.aspx declare -A class_to_driver_file -class_to_driver_file=( \ - ["g2"]="http://us.download.nvidia.com/XFree86/Linux-x86_64/367.124/NVIDIA-Linux-x86_64-367.124.run" \ - ["g3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ - ["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ - ["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ - ["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ -) declare -A class_to_driver_checksum -class_to_driver_checksum=( \ - ["g2"]="77f37939efeea4b6505842bed50445971992e303" \ - ["g3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ - ["g3s"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ - ["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ - ["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ -) - -# CUDA Files that need to be installed ~1.4GB -# First one is main installation -# Subsequent files are patches which need to be applied in order -# Order in the arrays below matters -# https://developer.nvidia.com/cuda-downloads -cuda_files=( \ - "https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/cuda_9.1.85_387.26_linux" \ - "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/1/cuda_9.1.85.1_linux" \ - "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/2/cuda_9.1.85.2_linux" \ - "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/3/cuda_9.1.85.3_linux" \ -) -cuda_files_checksums=( \ - "1540658f4fe657dddd8b0899555b7468727d4aa8" \ - "7ec6970ecd81163b0d02ef30d35599e7fd6e97d8" \ - "cfa3b029b58fc117d8ce510a70efc848924dd565" \ - "6269a2c5784b08997edb97ea0020fb4e6c8769ed" \ -) +case $CUDA_VERSION in + 9.1) + class_to_driver_file=( \ + ["g3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ + ["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ + ["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ + ["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ + ) + + class_to_driver_checksum=( \ + ["g3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ + ["g3s"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ + ["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ + ["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ + ) + + # CUDA Files that need to be installed ~1.4GB + # First one is main installation + # Subsequent files are patches which need to be applied in order + # Order in the arrays below matters + # https://developer.nvidia.com/cuda-downloads + cuda_files=( \ + "https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/cuda_9.1.85_387.26_linux" \ + "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/1/cuda_9.1.85.1_linux" \ + "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/2/cuda_9.1.85.2_linux" \ + "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/3/cuda_9.1.85.3_linux" \ + ) + + cuda_files_checksums=( \ + "1540658f4fe657dddd8b0899555b7468727d4aa8" \ + "7ec6970ecd81163b0d02ef30d35599e7fd6e97d8" \ + "cfa3b029b58fc117d8ce510a70efc848924dd565" \ + "6269a2c5784b08997edb97ea0020fb4e6c8769ed" \ + ) + ;; + 10.0) + class_to_driver_file=( \ + ["g3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ + ["g3s"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ + ["p2"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ + ["p3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ + ) + + class_to_driver_checksum=( \ + ["g3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ + ["g3s"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ + ["p2"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ + ["p3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ + ) + + cuda_files=( \ + "http://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.243_418.87.00_linux.run" \ + ) + + cuda_files_checksums=( \ + "36706e2c0fb7efa14aea6a3c889271d97fd3575d" \ + ) + ;; + *) echo "CUDA ${CUDA_VERSION} not supported by kops hook" && exit 1 +esac containsElement () { for e in "${@:2}"; do [[ "$e" = "$1" ]] && return 0; done; return 1; } @@ -102,7 +130,7 @@ echo "Identified machine as AWS_INSTANCE_TYPE[$AWS_INSTANCE_TYPE] AWS_INSTANCE_C # Install with --no-upgrade so that the c-libs are not upgraded, possibly # breaking programs and requiring restart apt-get -y update -apt-get -y --no-upgrade install gcc libc-dev linux-headers-$(uname -r) +apt-get -y --no-upgrade install gcc libc-dev libxml2-dev linux-headers-$(uname -r) apt-get -y clean apt-get -y autoremove @@ -153,7 +181,7 @@ for (( i=0; i<${length}; i++ )); do touch $filepath_installed # Mark successful installation elif [[ $download =~ .*local_installers.*cuda.* ]]; then # Install the primary cuda library - $filepath --toolkit --silent --verbose + $filepath --toolkit --silent touch $filepath_installed # Mark successful installation elif [[ $download =~ .*patches.*cuda.* ]]; then # Install an update to the primary cuda library diff --git a/hooks/nvidia-device-plugin/image/files/02-nvidia-docker.sh b/hooks/nvidia-device-plugin/image/files/02-nvidia-docker.sh index 77f62bd2b4365..b285f00284e6c 100755 --- a/hooks/nvidia-device-plugin/image/files/02-nvidia-docker.sh +++ b/hooks/nvidia-device-plugin/image/files/02-nvidia-docker.sh @@ -67,7 +67,7 @@ EOF # --force-confold prevents prompt for replacement of daemon.json apt-get -y update # Stop protokube to ensure not bring kubelet up again -systemctl stop protokube +systemctl stop protokube || true # Stop kubelet to ensure not bring stopped containers up again and leak # them as orphan containers systemctl stop kubelet @@ -86,4 +86,4 @@ systemctl mask cloud-init.service systemctl mask kops-configuration.service # Restore protokube and protokube will bring up kubelet -systemctl start protokube +systemctl start protokube || systemctl start kubelet diff --git a/hooks/nvidia-device-plugin/image/files/nvidia-device-plugin.service b/hooks/nvidia-device-plugin/image/files/nvidia-device-plugin.service index 660d7e48c5ed4..e688e06f7a808 100644 --- a/hooks/nvidia-device-plugin/image/files/nvidia-device-plugin.service +++ b/hooks/nvidia-device-plugin/image/files/nvidia-device-plugin.service @@ -4,6 +4,7 @@ After=cloud-config.target cloud-init.target kops-configuration.service apt-daily [Service] Type=oneshot +EnvironmentFile=/nvidia-device-plugin/nvidia.env ExecStart=/bin/bash -c "/nvidia-device-plugin/nvidia-device-plugin.sh" [Install] From 531611a3f6b63bb704e7a8237ff6d8d139f34bfa Mon Sep 17 00:00:00 2001 From: Casey Hilland Date: Sat, 9 Nov 2019 18:16:07 -0500 Subject: [PATCH 2/5] Update docs --- hooks/nvidia-device-plugin/README.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/hooks/nvidia-device-plugin/README.md b/hooks/nvidia-device-plugin/README.md index 6cf68895cab16..ca461c89d8104 100644 --- a/hooks/nvidia-device-plugin/README.md +++ b/hooks/nvidia-device-plugin/README.md @@ -34,9 +34,9 @@ Using this hook indicates that you agree to the Nvidia Although this hook *may* work among many combinatorial versions of software and images, it has only been tested with the following: -* kops: **1.9** -* kubernetes: 1.10, **1.11** -* OS Image: **`kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27`** +* kops: 1.9, **1.13** +* kubernetes: 1.10, 1.11, **1.13** +* OS Image: **`kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2019-06-21`** * This is most certainly not the default image for kops. The OS image must be explicitly overridden in the cluster or instancegroup spec. * Debian stretch is needed because `nvidia-docker` requires a newer version @@ -56,10 +56,18 @@ This kops hook was developed against the following version combinations. | Kops Version | Kubernetes Version | GPU Mode | OS Image | | ------------- | ------------------ | ------------ | -------- | +| 1.13.0 | 1.13 | deviceplugin | kope.io/k8s-1.12-debian-stretch-amd64-hvm-ebs-2019-06-21 | 1.10-beta.1 | 1.10 | deviceplugin | kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27 | 1.9.1 | 1.11 | deviceplugin | kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27 | 1.9.1 | 1.10 | legacy | kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27 +#### Driver Support + +| Image | NVIDIA Driver Version | CUDA Version | +| ------------------------------------------ | --------------------- | ------------ | +| qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1 | 390.46 | 9.1 +| qntfy/nvidia-device-plugin:0.2.0-CUDA-10.0 | 410.129 | 10.0 + ## Using this DevicePlugin ### Create a Cluster with GPU Nodes @@ -95,7 +103,7 @@ spec: image: kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27 hooks: - execContainer: - image: dcwangmit01/nvidia-device-plugin:0.1.0 + image: qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1 ### The settings below are only necessary for kubernetes <= 1.11.0, where ### deviceplugins are not enabled by default. @@ -120,7 +128,7 @@ spec: image: kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27 hooks: - execContainer: - image: dcwangmit01/nvidia-device-plugin:0.1.0 + image: qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1 environment: NVIDIA_DEVICE_PLUGIN_MODE: legacy kubelet: From 8c5a0a2242079945bdff305d30bd75a43dc73c0b Mon Sep 17 00:00:00 2001 From: Casey Hilland Date: Wed, 27 Nov 2019 09:30:14 -0500 Subject: [PATCH 3/5] Add AWS g4 support --- hooks/nvidia-bootstrap/image/run.sh | 9 ++++++++- .../image/files/01-aws-nvidia-driver.sh | 5 +++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/hooks/nvidia-bootstrap/image/run.sh b/hooks/nvidia-bootstrap/image/run.sh index 1d08a613e67c4..fc6c256c0b626 100755 --- a/hooks/nvidia-bootstrap/image/run.sh +++ b/hooks/nvidia-bootstrap/image/run.sh @@ -26,7 +26,10 @@ CACHE_DIR_CONTAINER="${ROOTFS_DIR}${CACHE_DIR_HOST}" # AWS Instance Types to Nvidia Card Mapping (cut and pasted from AWS docs) # Load the correct driver for the correct instance type # Instances Product Type Product Series Product -# G2 GRID GRID Series GRID K520 <-- I think they meant G3 +# G2 GRID GRID Series GRID K520 (deprecated) +# G3 Tesla M-Series M-60 +# G3S Tesla M-Series M-60 +# G4 Tesla T-Series T-4 # P2 Tesla K-Series K-80 # P3 Tesla V-Series V100 # Both P2 and P3 are set for Cuda Toolkit 9.1 @@ -38,6 +41,7 @@ case $CUDA_VERSION in class_to_driver_file=( \ ["g3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ + ["g4"]="http://us.download.nvidia.com/tesla/418.87/NVIDIA-Linux-x86_64-390.46.00.run" \ ["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ) @@ -45,6 +49,7 @@ case $CUDA_VERSION in class_to_driver_checksum=( \ ["g3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ ["g3s"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ + ["g4"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ ["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ ["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ ) @@ -72,6 +77,7 @@ case $CUDA_VERSION in class_to_driver_file=( \ ["g3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ ["g3s"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ + ["g4"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ ["p2"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ ["p3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ ) @@ -79,6 +85,7 @@ case $CUDA_VERSION in class_to_driver_checksum=( \ ["g3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ ["g3s"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ + ["g4"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ ["p2"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ ["p3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ ) diff --git a/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh b/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh index 9e0ecaea326bb..c4edf2e227e91 100755 --- a/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh +++ b/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh @@ -29,6 +29,7 @@ CACHE_DIR=/nvidia-device-plugin # G2 GRID GRID Series GRID K520 (deprecated) # G3 Tesla M-Series M-60 # G3S Tesla M-Series M-60 +# G4 Tesla T-Series T-4 # P2 Tesla K-Series K-80 # P3 Tesla V-Series V100 # http://www.nvidia.com/Download/index.aspx @@ -39,6 +40,7 @@ case $CUDA_VERSION in class_to_driver_file=( \ ["g3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ + ["g4"]="http://us.download.nvidia.com/tesla/418.87/NVIDIA-Linux-x86_64-390.46.00.run" \ ["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ) @@ -46,6 +48,7 @@ case $CUDA_VERSION in class_to_driver_checksum=( \ ["g3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ ["g3s"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ + ["g4"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ ["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ ["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ ) @@ -73,6 +76,7 @@ case $CUDA_VERSION in class_to_driver_file=( \ ["g3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ ["g3s"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ + ["g4"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ ["p2"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ ["p3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ ) @@ -80,6 +84,7 @@ case $CUDA_VERSION in class_to_driver_checksum=( \ ["g3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ ["g3s"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ + ["g4"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ ["p2"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ ["p3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ ) From d529d73df862df42c8dfa172d4fd87e908453356 Mon Sep 17 00:00:00 2001 From: Casey Hilland Date: Sun, 12 Jan 2020 17:11:00 -0500 Subject: [PATCH 4/5] Fix g4dn instance name --- hooks/nvidia-bootstrap/image/run.sh | 8 ++++---- .../image/files/01-aws-nvidia-driver.sh | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/hooks/nvidia-bootstrap/image/run.sh b/hooks/nvidia-bootstrap/image/run.sh index fc6c256c0b626..a8acb0143fdaa 100755 --- a/hooks/nvidia-bootstrap/image/run.sh +++ b/hooks/nvidia-bootstrap/image/run.sh @@ -41,7 +41,7 @@ case $CUDA_VERSION in class_to_driver_file=( \ ["g3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ - ["g4"]="http://us.download.nvidia.com/tesla/418.87/NVIDIA-Linux-x86_64-390.46.00.run" \ + ["g4dn"]="http://us.download.nvidia.com/tesla/418.87/NVIDIA-Linux-x86_64-390.46.00.run" \ ["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ) @@ -49,7 +49,7 @@ case $CUDA_VERSION in class_to_driver_checksum=( \ ["g3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ ["g3s"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ - ["g4"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ + ["g4dn"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ ["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ ["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ ) @@ -77,7 +77,7 @@ case $CUDA_VERSION in class_to_driver_file=( \ ["g3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ ["g3s"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ - ["g4"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ + ["g4dn"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ ["p2"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ ["p3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ ) @@ -85,7 +85,7 @@ case $CUDA_VERSION in class_to_driver_checksum=( \ ["g3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ ["g3s"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ - ["g4"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ + ["g4dn"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ ["p2"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ ["p3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ ) diff --git a/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh b/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh index c4edf2e227e91..b3c2342554b3f 100755 --- a/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh +++ b/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh @@ -40,7 +40,7 @@ case $CUDA_VERSION in class_to_driver_file=( \ ["g3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ - ["g4"]="http://us.download.nvidia.com/tesla/418.87/NVIDIA-Linux-x86_64-390.46.00.run" \ + ["g4dn"]="http://us.download.nvidia.com/tesla/418.87/NVIDIA-Linux-x86_64-390.46.00.run" \ ["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ) @@ -48,7 +48,7 @@ case $CUDA_VERSION in class_to_driver_checksum=( \ ["g3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ ["g3s"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ - ["g4"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ + ["g4dn"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ ["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ ["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \ ) @@ -76,7 +76,7 @@ case $CUDA_VERSION in class_to_driver_file=( \ ["g3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ ["g3s"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ - ["g4"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ + ["g4dn"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ ["p2"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ ["p3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \ ) @@ -84,7 +84,7 @@ case $CUDA_VERSION in class_to_driver_checksum=( \ ["g3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ ["g3s"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ - ["g4"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ + ["g4dn"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ ["p2"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ ["p3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \ ) From b7443a1f61221f7a7690f1ea52b7a56594f1a6a7 Mon Sep 17 00:00:00 2001 From: Casey Hilland Date: Wed, 22 Jan 2020 15:22:01 -0500 Subject: [PATCH 5/5] Fix driver address --- hooks/nvidia-bootstrap/image/run.sh | 2 +- hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hooks/nvidia-bootstrap/image/run.sh b/hooks/nvidia-bootstrap/image/run.sh index a8acb0143fdaa..ed00939b7afac 100755 --- a/hooks/nvidia-bootstrap/image/run.sh +++ b/hooks/nvidia-bootstrap/image/run.sh @@ -41,7 +41,7 @@ case $CUDA_VERSION in class_to_driver_file=( \ ["g3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ - ["g4dn"]="http://us.download.nvidia.com/tesla/418.87/NVIDIA-Linux-x86_64-390.46.00.run" \ + ["g4dn"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ) diff --git a/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh b/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh index b3c2342554b3f..b2ef43fb28dd4 100755 --- a/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh +++ b/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh @@ -40,7 +40,7 @@ case $CUDA_VERSION in class_to_driver_file=( \ ["g3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ - ["g4dn"]="http://us.download.nvidia.com/tesla/418.87/NVIDIA-Linux-x86_64-390.46.00.run" \ + ["g4dn"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ ["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \ )