qntfy · chilland · Nov 7, 2019 · Nov 9, 2019 · Nov 27, 2019 · Jan 12, 2020
diff --git a/hooks/nvidia-bootstrap/Makefile b/hooks/nvidia-bootstrap/Makefile
@@ -16,8 +16,10 @@ all: image
 
 .PHONY: image push
 
-image:
-	docker build -t kopeio/nvidia-bootstrap:1.6.0 -f image/Dockerfile image/
+image:  ## Build the images
+	docker build --build-arg CUDA_VERSION=9.1 -t qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1 -f image/Dockerfile image/
+	docker build --build-arg CUDA_VERSION=10.0 -t qntfy/nvidia-device-plugin:0.2.0-CUDA-10.0 -f image/Dockerfile image/
 
-push: image
-	docker push kopeio/nvidia-bootstrap:1.6.0
+push:   ## Push the images
+	docker push qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1
+	docker push qntfy/nvidia-device-plugin:0.2.0-CUDA-10.0
diff --git a/hooks/nvidia-bootstrap/image/Dockerfile b/hooks/nvidia-bootstrap/image/Dockerfile
@@ -20,4 +20,8 @@ RUN apt-get update && apt-get -yq install curl jq
 
 ADD run.sh /run.sh
 
+# CUDA_VERSION selects the driver/toolkit set used by run.sh; override with --build-arg.
+ARG CUDA_VERSION=9.1
+ENV CUDA_VERSION=${CUDA_VERSION}
+
 CMD [ "/bin/bash", "/run.sh" ]
diff --git a/hooks/nvidia-bootstrap/image/run.sh b/hooks/nvidia-bootstrap/image/run.sh
@@ -26,43 +26,80 @@ CACHE_DIR_CONTAINER="${ROOTFS_DIR}${CACHE_DIR_HOST}"
 # AWS Instance Types to Nvidia Card Mapping (cut and pasted from AWS docs)
 # Load the correct driver for the correct instance type
 #   Instances  Product Type  Product Series  Product
-#   G2         GRID          GRID Series     GRID K520   <-- I think they meant G3
+#   G2         GRID          GRID Series     GRID K520 (deprecated)
+#   G3         Tesla         M-Series        M-60
+#   G3S        Tesla         M-Series        M-60
+#   G4         Tesla         T-Series        T-4
 #   P2         Tesla         K-Series        K-80
 #   P3         Tesla         V-Series        V100
 # Both P2 and P3 are set for Cuda Toolkit 9.1
 # http://www.nvidia.com/Download/index.aspx
 declare -A class_to_driver_file
-class_to_driver_file=( \
-    ["g3"]="http://us.download.nvidia.com/XFree86/Linux-x86_64/367.124/NVIDIA-Linux-x86_64-367.124.run" \
-    ["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
-    ["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
-    ["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
-)
 declare -A class_to_driver_checksum
-class_to_driver_checksum=( \
-    ["g3"]="77f37939efeea4b6505842bed50445971992e303" \
-    ["g3s"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
-    ["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
-    ["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
-)
-
-# CUDA Files that need to be installed ~1.4GB
-#   First one is main installation
-#   Subsequent files are patches which need to be applied in order
-#   Order in the arrays below matters
-# https://developer.nvidia.com/cuda-downloads
-cuda_files=( \
-  "https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/cuda_9.1.85_387.26_linux" \
-  "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/1/cuda_9.1.85.1_linux" \
-  "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/2/cuda_9.1.85.2_linux" \
-  "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/3/cuda_9.1.85.3_linux" \
-)
-cuda_files_checksums=( \
-  "1540658f4fe657dddd8b0899555b7468727d4aa8" \
-  "7ec6970ecd81163b0d02ef30d35599e7fd6e97d8" \
-  "cfa3b029b58fc117d8ce510a70efc848924dd565" \
-  "6269a2c5784b08997edb97ea0020fb4e6c8769ed" \
-)
+case $CUDA_VERSION in
+    9.1)
+        class_to_driver_file=( \
+            ["g3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
+            ["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
+            ["g4dn"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
+            ["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
+            ["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
+        )
+        # NOTE(review): g4dn (Tesla T4) is mapped to driver 390.46 above; T4 support appears to start with the 410 driver branch - confirm or drop g4dn for 9.1.
+        class_to_driver_checksum=( \
+            ["g3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
+            ["g3s"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
+            ["g4dn"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
+            ["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
+            ["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
+        )
+
+        # CUDA Files that need to be installed ~1.4GB
+        #   First one is main installation
+        #   Subsequent files are patches which need to be applied in order
+        #   Order in the arrays below matters
+        # https://developer.nvidia.com/cuda-downloads
+        cuda_files=( \
+        "https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/cuda_9.1.85_387.26_linux" \
+        "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/1/cuda_9.1.85.1_linux" \
+        "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/2/cuda_9.1.85.2_linux" \
+        "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/3/cuda_9.1.85.3_linux" \
+        )
+
+        cuda_files_checksums=( \
+        "1540658f4fe657dddd8b0899555b7468727d4aa8" \
+        "7ec6970ecd81163b0d02ef30d35599e7fd6e97d8" \
+        "cfa3b029b58fc117d8ce510a70efc848924dd565" \
+        "6269a2c5784b08997edb97ea0020fb4e6c8769ed" \
+        )
+        ;;
+    10.0)
+        class_to_driver_file=( \
+            ["g3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
+            ["g3s"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
+            ["g4dn"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
+            ["p2"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
+            ["p3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
+        )
+
+        class_to_driver_checksum=( \
+            ["g3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
+            ["g3s"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
+            ["g4dn"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
+            ["p2"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
+            ["p3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
+        )
+        # NOTE(review): URL below is the CUDA 10.1.243 / 418.87.00 installer, but this branch is CUDA_VERSION=10.0 with driver 410.129 - confirm; URL and checksum must change together.
+        cuda_files=( \
+        "http://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.243_418.87.00_linux.run" \
+        )
+
+        cuda_files_checksums=( \
+        "36706e2c0fb7efa14aea6a3c889271d97fd3575d" \
+        )
+        ;;
+    *) echo "CUDA ${CUDA_VERSION} not supported by kops hook" >&2; exit 1 ;;  # unknown version: report on stderr and always abort
+esac
 
 containsElement () { for e in "${@:2}"; do [[ "$e" = "$1" ]] && return 0; done; return 1; }
 
@@ -170,7 +207,7 @@ for (( i=0; i<${length}; i++ )); do
       touch $filepath_installed # Mark successful installation
     elif [[ $download =~ .*local_installers.*cuda.* ]]; then
       # Install the primary cuda library (using gcc)
-      chroot ${ROOTFS_DIR} $filepath_host --toolkit --silent --verbose
+      chroot ${ROOTFS_DIR} $filepath_host --toolkit --silent
       touch $filepath_installed # Mark successful installation
     elif [[ $download =~ .*patches.*cuda.* ]]; then
       # Install an update to the primary cuda library (using gcc)

diff --git a/hooks/nvidia-device-plugin/Makefile b/hooks/nvidia-device-plugin/Makefile
@@ -18,11 +18,13 @@ all: image
 
 .PHONY: image push
 
-image:  ## Build the image
-	docker build -t dcwangmit01/nvidia-device-plugin:0.1.0 -f image/Dockerfile image/
+image:  ## Build the images
+	docker build --build-arg CUDA_VERSION=9.1 -t qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1 -f image/Dockerfile image/
+	docker build --build-arg CUDA_VERSION=10.0 -t qntfy/nvidia-device-plugin:0.2.0-CUDA-10.0 -f image/Dockerfile image/
 
-push:   ## Push the image
-	docker push dcwangmit01/nvidia-device-plugin:0.1.0
+push:   ## Push the images
+	docker push qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1
+	docker push qntfy/nvidia-device-plugin:0.2.0-CUDA-10.0
 
 help:   ## Print list of Makefile targets
 	@# Taken from https://github.com/spf13/hugo/blob/master/Makefile

diff --git a/hooks/nvidia-device-plugin/README.md b/hooks/nvidia-device-plugin/README.md
@@ -34,9 +34,9 @@ Using this hook indicates that you agree to the Nvidia
 Although this hook *may* work among many combinatorial versions of software and
 images, it has only been tested with the following:
 
-* kops: **1.9**
-* kubernetes: 1.10, **1.11**
-* OS Image: **`kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27`**
+* kops: 1.9, **1.13**
+* kubernetes: 1.10, 1.11, **1.13**
+* OS Image: **`kope.io/k8s-1.12-debian-stretch-amd64-hvm-ebs-2019-06-21`**
   * This is most certainly not the default image for kops.  The OS image must
     be explicitly overridden in the cluster or instancegroup spec.
   * Debian stretch is needed because `nvidia-docker` requires a newer version
@@ -56,10 +56,18 @@ This kops hook was developed against the following version combinations.
 
 | Kops Version  | Kubernetes Version | GPU Mode     | OS Image |
 | ------------- | ------------------ | ------------ | -------- |
+| 1.13.0        | 1.13               | deviceplugin | kope.io/k8s-1.12-debian-stretch-amd64-hvm-ebs-2019-06-21
 | 1.10-beta.1   | 1.10               | deviceplugin | kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27
 | 1.9.1         | 1.11               | deviceplugin | kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27
 | 1.9.1         | 1.10               | legacy       | kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27
 
+#### Driver Support
+
+| Image                                      | NVIDIA Driver Version | CUDA Version |
+| ------------------------------------------ | --------------------- | ------------ | 
+| qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1  | 390.46                | 9.1
+| qntfy/nvidia-device-plugin:0.2.0-CUDA-10.0 | 410.129               | 10.0 
+
 ## Using this DevicePlugin
 
 ### Create a Cluster with GPU Nodes
@@ -95,7 +103,7 @@ spec:
   image: kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27
   hooks:
   - execContainer:
-      image: dcwangmit01/nvidia-device-plugin:0.1.0
+      image: qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1
 
 ### The settings below are only necessary for kubernetes <= 1.11.0, where
 ###   deviceplugins are not enabled by default.
@@ -120,7 +128,7 @@ spec:
   image: kope.io/k8s-1.10-debian-stretch-amd64-hvm-ebs-2018-05-27
   hooks:
   - execContainer:
-      image: dcwangmit01/nvidia-device-plugin:0.1.0
+      image: qntfy/nvidia-device-plugin:0.2.0-CUDA-9.1
       environment:
         NVIDIA_DEVICE_PLUGIN_MODE: legacy
   kubelet:

diff --git a/hooks/nvidia-device-plugin/image/Dockerfile b/hooks/nvidia-device-plugin/image/Dockerfile
@@ -24,4 +24,10 @@ RUN apt-get update && \
 COPY run.sh /run.sh
 COPY files/* /nvidia-device-plugin/
 
+# CUDA_VERSION selects the driver/toolkit set used by the install scripts; override with --build-arg.
+ARG CUDA_VERSION=9.1
+ENV CUDA_VERSION=${CUDA_VERSION}
+# Also record it in an env file; nvidia-device-plugin.service loads this path via EnvironmentFile=.
+RUN echo "CUDA_VERSION=${CUDA_VERSION}" > /nvidia-device-plugin/nvidia.env
+
 CMD [ "/bin/bash", "/run.sh" ]
diff --git a/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh b/hooks/nvidia-device-plugin/image/files/01-aws-nvidia-driver.sh
@@ -29,43 +29,76 @@ CACHE_DIR=/nvidia-device-plugin
 #   G2         GRID          GRID Series     GRID K520 (deprecated)
 #   G3         Tesla         M-Series        M-60
 #   G3S        Tesla         M-Series        M-60
+#   G4         Tesla         T-Series        T-4
 #   P2         Tesla         K-Series        K-80
 #   P3         Tesla         V-Series        V100
 # http://www.nvidia.com/Download/index.aspx
 declare -A class_to_driver_file
-class_to_driver_file=( \
-    ["g2"]="http://us.download.nvidia.com/XFree86/Linux-x86_64/367.124/NVIDIA-Linux-x86_64-367.124.run" \
-    ["g3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
-    ["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
-    ["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
-    ["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
-)
 declare -A class_to_driver_checksum
-class_to_driver_checksum=( \
-    ["g2"]="77f37939efeea4b6505842bed50445971992e303" \
-    ["g3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
-    ["g3s"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
-    ["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
-    ["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
-)
-
-# CUDA Files that need to be installed ~1.4GB
-#   First one is main installation
-#   Subsequent files are patches which need to be applied in order
-#   Order in the arrays below matters
-# https://developer.nvidia.com/cuda-downloads
-cuda_files=( \
-  "https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/cuda_9.1.85_387.26_linux" \
-  "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/1/cuda_9.1.85.1_linux" \
-  "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/2/cuda_9.1.85.2_linux" \
-  "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/3/cuda_9.1.85.3_linux" \
-)
-cuda_files_checksums=( \
-  "1540658f4fe657dddd8b0899555b7468727d4aa8" \
-  "7ec6970ecd81163b0d02ef30d35599e7fd6e97d8" \
-  "cfa3b029b58fc117d8ce510a70efc848924dd565" \
-  "6269a2c5784b08997edb97ea0020fb4e6c8769ed" \
-)
+case $CUDA_VERSION in
+    9.1)
+        class_to_driver_file=( \
+            ["g3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
+            ["g3s"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
+            ["g4dn"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
+            ["p2"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
+            ["p3"]="http://us.download.nvidia.com/tesla/390.46/NVIDIA-Linux-x86_64-390.46.run" \
+        )
+        # NOTE(review): g4dn (Tesla T4) is mapped to driver 390.46 above; T4 support appears to start with the 410 driver branch - confirm or drop g4dn for 9.1.
+        class_to_driver_checksum=( \
+            ["g3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
+            ["g3s"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
+            ["g4dn"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
+            ["p2"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
+            ["p3"]="57569ecb6f6d839ecc77fa10a2c573cc069990cc" \
+        )
+
+        # CUDA Files that need to be installed ~1.4GB
+        #   First one is main installation
+        #   Subsequent files are patches which need to be applied in order
+        #   Order in the arrays below matters
+        # https://developer.nvidia.com/cuda-downloads
+        cuda_files=( \
+        "https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/cuda_9.1.85_387.26_linux" \
+        "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/1/cuda_9.1.85.1_linux" \
+        "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/2/cuda_9.1.85.2_linux" \
+        "https://developer.nvidia.com/compute/cuda/9.1/Prod/patches/3/cuda_9.1.85.3_linux" \
+        )
+
+        cuda_files_checksums=( \
+        "1540658f4fe657dddd8b0899555b7468727d4aa8" \
+        "7ec6970ecd81163b0d02ef30d35599e7fd6e97d8" \
+        "cfa3b029b58fc117d8ce510a70efc848924dd565" \
+        "6269a2c5784b08997edb97ea0020fb4e6c8769ed" \
+        )
+        ;;
+    10.0)
+        class_to_driver_file=( \
+            ["g3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
+            ["g3s"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
+            ["g4dn"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
+            ["p2"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
+            ["p3"]="http://us.download.nvidia.com/tesla/410.129/NVIDIA-Linux-x86_64-410.129-diagnostic.run" \
+        )
+
+        class_to_driver_checksum=( \
+            ["g3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
+            ["g3s"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
+            ["g4dn"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
+            ["p2"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
+            ["p3"]="e5d234cc8acb35f425f60e1923e07e7e50272d9c" \
+        )
+        # NOTE(review): URL below is the CUDA 10.1.243 / 418.87.00 installer, but this branch is CUDA_VERSION=10.0 with driver 410.129 - confirm; URL and checksum must change together.
+        cuda_files=( \
+        "http://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.243_418.87.00_linux.run" \
+        )
+
+        cuda_files_checksums=( \
+        "36706e2c0fb7efa14aea6a3c889271d97fd3575d" \
+        ) 
+        ;;
+    *) echo "CUDA ${CUDA_VERSION} not supported by kops hook" >&2; exit 1 ;;  # unknown version: report on stderr and always abort
+esac
 
 containsElement () { for e in "${@:2}"; do [[ "$e" = "$1" ]] && return 0; done; return 1; }
 
@@ -102,7 +135,7 @@ echo "Identified machine as AWS_INSTANCE_TYPE[$AWS_INSTANCE_TYPE] AWS_INSTANCE_C
 #   Install with --no-upgrade so that the c-libs are not upgraded, possibly
 #   breaking programs and requiring restart
 apt-get -y update
-apt-get -y --no-upgrade install gcc libc-dev linux-headers-$(uname -r)
+apt-get -y --no-upgrade install gcc libc-dev libxml2-dev linux-headers-$(uname -r)
 apt-get -y clean
 apt-get -y autoremove
 
@@ -153,7 +186,7 @@ for (( i=0; i<${length}; i++ )); do
       touch $filepath_installed # Mark successful installation
     elif [[ $download =~ .*local_installers.*cuda.* ]]; then
       # Install the primary cuda library
-      $filepath --toolkit --silent --verbose
+      $filepath --toolkit --silent
       touch $filepath_installed # Mark successful installation
     elif [[ $download =~ .*patches.*cuda.* ]]; then
       # Install an update to the primary cuda library

diff --git a/hooks/nvidia-device-plugin/image/files/02-nvidia-docker.sh b/hooks/nvidia-device-plugin/image/files/02-nvidia-docker.sh
@@ -67,7 +67,7 @@ EOF
 # --force-confold prevents prompt for replacement of daemon.json
 apt-get -y update
 # Stop protokube to ensure not bring kubelet up again
-systemctl stop protokube
+systemctl stop protokube || true
 # Stop kubelet to ensure not bring stopped containers up again and leak
 # them as orphan containers
 systemctl stop kubelet
@@ -86,4 +86,4 @@ systemctl mask cloud-init.service
 systemctl mask kops-configuration.service
 
 # Restore protokube and protokube will bring up kubelet
-systemctl start protokube
+systemctl start protokube || systemctl start kubelet
diff --git a/hooks/nvidia-device-plugin/image/files/nvidia-device-plugin.service b/hooks/nvidia-device-plugin/image/files/nvidia-device-plugin.service
@@ -4,6 +4,7 @@ After=cloud-config.target cloud-init.target kops-configuration.service apt-daily
 
 [Service]
 Type=oneshot
+EnvironmentFile=/nvidia-device-plugin/nvidia.env
 ExecStart=/bin/bash -c "/nvidia-device-plugin/nvidia-device-plugin.sh"
 
 [Install]