From b86d05edfa8498859e6cde3b55789b08721e45ca Mon Sep 17 00:00:00 2001 From: Larry Ruckman Date: Tue, 9 Jul 2024 10:45:23 -0700 Subject: [PATCH 1/5] overhauling the comp_and_load_drivers.sh script --- data_gpu/driver/comp_and_load_drivers.sh | 58 ++++++------------------ 1 file changed, 13 insertions(+), 45 deletions(-) diff --git a/data_gpu/driver/comp_and_load_drivers.sh b/data_gpu/driver/comp_and_load_drivers.sh index 26fd8f2..5df90f7 100755 --- a/data_gpu/driver/comp_and_load_drivers.sh +++ b/data_gpu/driver/comp_and_load_drivers.sh @@ -1,48 +1,20 @@ #!/bin/bash -# Provide defaults for CC -[ -z "$CC" ] && CC=gcc - -# Function to check if the script is run with sudo -check_sudo() { - if [ "$EUID" -ne 0 ]; then - echo "Error: This script must be run with sudo." >&2 - exit 1 - fi -} -# Checks that GCC matches what the kernel was built with -check_gcc_version() { - _GCC_VER="$($CC --version | grep -Eo "\s[0-9]+\.[0-9]+\.[0-9]+\s" | awk '{$1=$1};1')" - if ! cat /proc/version | grep -Eoq "gcc version $_GCC_VER"; then - echo "Error: GCC version 'gcc version $_GCC_VER' does not match what the kernel was built with: '$(cat /proc/version | grep -Eo "gcc version [0-9]+\.[0-9]+\.[0-9]+")'" - echo " You can specify an alternative compiler by setting the 'CC' environment variable" - exit 1 - fi -} - # Check if the script is run with sudo -check_sudo - -# Check that our GCC matches what the kernel was built with -check_gcc_version - -# Function to find the latest Nvidia version directory -get_latest_nvidia_path() { - # Navigate to the /usr/src directory - cd /usr/src +if [ "$EUID" -ne 0 ]; then + echo "Error: This script must be run with sudo." 
>&2 + exit 1 +fi - # List and sort NVIDIA directories, then get the last one (the latest) - latest_nvidia_path=$(ls -d nvidia-* | sort -V | tail -n 1) +# Get the gcc that kernel was built with +version_info=$(cat /proc/version) +CC=$(echo "$version_info" | grep -oP '\b\w+-\w+-gcc-\d+\b') +echo "CC: $CC" - # Check if no NVIDIA directory was found - if [ -z "$latest_nvidia_path" ]; then - echo "Error: No NVIDIA directory found in /usr/src" >&2 - exit 1 - else - # Print the full path of the latest NVIDIA directory - echo "/usr/src/$latest_nvidia_path" - fi -} +# Define Nvidia path +output=$(find /usr -name nv-p2p.h) +NVIDIA_PATH=$(echo "$output" | grep -oP '^/usr/src/nvidia-\d+\.\d+\.\d+') +echo "Using Nvidia path: $NVIDIA_PATH" # Return directory RET_DIR=$PWD @@ -55,12 +27,8 @@ echo "Using RET_DIR: $RET_DIR" /usr/sbin/rmmod nvidia-modeset 2>/dev/null /usr/sbin/rmmod nvidia 2>/dev/null -# Define Nvidia path -NVIDIA_PATH=$(get_latest_nvidia_path) -echo "Using Nvidia path: $NVIDIA_PATH" - +# Go to nvidia path and build cd $NVIDIA_PATH - make if modinfo ecc >/dev/null 2>&1; then From 718176734e7f57d0c0008358d65fa367eee3fb89 Mon Sep 17 00:00:00 2001 From: Larry Ruckman Date: Tue, 9 Jul 2024 12:46:48 -0700 Subject: [PATCH 2/5] addin CC arg to Makefile --- data_gpu/driver/Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/data_gpu/driver/Makefile b/data_gpu/driver/Makefile index fa83210..37c954c 100644 --- a/data_gpu/driver/Makefile +++ b/data_gpu/driver/Makefile @@ -13,8 +13,9 @@ # contained in the LICENSE.txt file. # ---------------------------------------------------------------------------- -# Optional path to NVIDIA drivers, if not specified, use empty string. +# Args to this Makefile NVIDIA_DRIVERS ?= "" +CC ?= "" # Define the module name. NAME := datagpu @@ -63,10 +64,10 @@ obj-m := $(NAME).o # Default target: Display git version and build the module. 
all: @echo $(GITV) - $(MAKE) ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) -C $(KERNELDIR) M=$(HOME) modules + $(MAKE) ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) CC=$(CC) -C $(KERNELDIR) M=$(HOME) modules # Clean target: Remove built module files and object files. clean: - $(MAKE) ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) -C $(KERNELDIR) M=$(HOME) clean + $(MAKE) ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) CC=$(CC) -C $(KERNELDIR) M=$(HOME) clean rm -f $(OBJS) From a4e2c64ba4cdc3421da0c1ffb7d9d22525e71cc8 Mon Sep 17 00:00:00 2001 From: Larry Ruckman Date: Tue, 9 Jul 2024 12:47:17 -0700 Subject: [PATCH 3/5] overhauling the comp_and_load_drivers.sh script --- data_gpu/driver/comp_and_load_drivers.sh | 50 +++++++++++++----------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/data_gpu/driver/comp_and_load_drivers.sh b/data_gpu/driver/comp_and_load_drivers.sh index 5df90f7..c2e4c18 100755 --- a/data_gpu/driver/comp_and_load_drivers.sh +++ b/data_gpu/driver/comp_and_load_drivers.sh @@ -7,43 +7,49 @@ if [ "$EUID" -ne 0 ]; then fi # Get the gcc that kernel was built with -version_info=$(cat /proc/version) -CC=$(echo "$version_info" | grep -oP '\b\w+-\w+-gcc-\d+\b') +version_content=$(cat /proc/version) +CC=$(echo "$version_content" | grep -oP 'x86_64-linux-gnu-gcc-\d+') echo "CC: $CC" # Define Nvidia path -output=$(find /usr -name nv-p2p.h) -NVIDIA_PATH=$(echo "$output" | grep -oP '^/usr/src/nvidia-\d+\.\d+\.\d+') +output=$(find /usr -name nv-p2p.h 2>/dev/null) +NVIDIA_PATH=$(echo "$output" | grep -oP '^/usr/src/nvidia-\d+\.\d+\.\d+' | head -n 1) echo "Using Nvidia path: $NVIDIA_PATH" # Return directory RET_DIR=$PWD echo "Using RET_DIR: $RET_DIR" +# Stop the Xserver and nvidia-persistenced to prevent rmmod due to Module XXX is in use by: YYY +# https://forums.developer.nvidia.com/t/cant-install-new-driver-cannot-unload-module/63639 +systemctl stop gdm # For GNOME Display Manager +systemctl stop lightdm # For LightDM +systemctl stop sddm # For SDDM 
+systemctl stop nvidia-persistenced + # Remove existing Nvidia modules (if any) -/usr/sbin/rmmod datagpu 2>/dev/null -/usr/sbin/rmmod nvidia-drm 2>/dev/null -/usr/sbin/rmmod nvidia-uvm 2>/dev/null -/usr/sbin/rmmod nvidia-modeset 2>/dev/null -/usr/sbin/rmmod nvidia 2>/dev/null +/usr/sbin/rmmod datagpu +/usr/sbin/rmmod nvidia-drm +/usr/sbin/rmmod nvidia-uvm +/usr/sbin/rmmod nvidia-modeset +/usr/sbin/rmmod nvidia -# Go to nvidia path and build -cd $NVIDIA_PATH -make +# Go to nvidia path and build Nvidia driver +cd "$NVIDIA_PATH" || { echo "Error: Failed to change directory to $NVIDIA_PATH"; exit 1; } +make CC=$CC if modinfo ecc >/dev/null 2>&1; then modprobe ecc || { echo "Error: Failed to insert ecc module."; exit 1; } fi -/usr/sbin/insmod nvidia.ko NVreg_OpenRmEnableUnsupportedGpus=1 NVreg_EnableStreamMemOPs=1 || { echo "Error: Failed to insert nvidia.ko."; exit 1; } - -/usr/sbin/insmod nvidia-modeset.ko || { echo "Error: Failed to insert nvidia-modeset.ko."; exit 1; } - -/usr/sbin/insmod nvidia-uvm.ko || { echo "Error: Failed to insert nvidia-uvm.ko."; exit 1; } - -/usr/sbin/insmod nvidia-drm.ko modeset=1 || { echo "Error: Failed to insert nvidia-drm.ko."; exit 1; } +# Load the nvidia kernel drivers +/usr/sbin/insmod $NVIDIA_PATH/nvidia.ko NVreg_OpenRmEnableUnsupportedGpus=1 NVreg_EnableStreamMemOPs=1 || { echo "Error: Failed to insert nvidia.ko."; exit 1; } +/usr/sbin/insmod $NVIDIA_PATH/nvidia-modeset.ko || { echo "Error: Failed to insert nvidia-modeset.ko."; exit 1; } +/usr/sbin/insmod $NVIDIA_PATH/nvidia-drm.ko modeset=1 || { echo "Error: Failed to insert nvidia-drm.ko."; exit 1; } +/usr/sbin/insmod $NVIDIA_PATH/nvidia-uvm.ko || { echo "Error: Failed to insert nvidia-uvm.ko."; exit 1; } -cd $RET_DIR +# Return to the original directory, then build and load the datagpu driver +cd "$RET_DIR" || { echo "Error: Failed to change directory to $RET_DIR"; exit 1; } +make CC=$CC NVIDIA_DRIVERS=$NVIDIA_PATH +/usr/sbin/insmod $RET_DIR/datagpu.ko || { echo "Error: Failed to insert datagpu.ko."; exit 1; } 
-make NVIDIA_DRIVERS=$NVIDIA_PATH -/usr/sbin/insmod datagpu.ko || { echo "Error: Failed to insert datagpu.ko."; exit 1; } From e19b731d112abae1c938a30ef7a6a1d14d91bbdd Mon Sep 17 00:00:00 2001 From: Larry Ruckman Date: Tue, 9 Jul 2024 12:52:31 -0700 Subject: [PATCH 4/5] moving systemctl disable to README.md --- data_gpu/driver/README.md | 14 +++++++++----- data_gpu/driver/comp_and_load_drivers.sh | 17 +++++------------ 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/data_gpu/driver/README.md b/data_gpu/driver/README.md index fe04c8d..002715e 100644 --- a/data_gpu/driver/README.md +++ b/data_gpu/driver/README.md @@ -14,10 +14,14 @@ $ sudo apt-get install nvidia-cuda-toolkit $ sudo ./comp_and_load_drivers.sh ``` -If NVLink support is required, install cuda-drivers-fabricmanager- as well. However, as of Febuary 2024, the latest version of fabricmanager is 535. -When it becomes available on Ubuntu, install with: +Disable the Xserver and nvidia-persistenced so that rmmod does not fail with "Module XXX is in use by: YYY", +which happens because the Nvidia driver gets loaded by default at startup. -``` -$ sudo apt-get install cuda-drivers-fabricmanager-545 -``` +https://forums.developer.nvidia.com/t/cant-install-new-driver-cannot-unload-module/63639 +```bash +$ sudo systemctl disable gdm # For GNOME Display Manager +$ sudo systemctl disable lightdm # For LightDM +$ sudo systemctl disable sddm # For SDDM +$ sudo systemctl disable nvidia-persistenced +``` diff --git a/data_gpu/driver/comp_and_load_drivers.sh b/data_gpu/driver/comp_and_load_drivers.sh index c2e4c18..e32c65c 100755 --- a/data_gpu/driver/comp_and_load_drivers.sh +++ b/data_gpu/driver/comp_and_load_drivers.sh @@ -20,19 +20,12 @@ echo "Using Nvidia path: $NVIDIA_PATH" RET_DIR=$PWD echo "Using RET_DIR: $RET_DIR" -# Stop the Xserver and nvidia-persistenced to prevent rmmod due to Module XXX is in use by: YYY -# https://forums.developer.nvidia.com/t/cant-install-new-driver-cannot-unload-module/63639 -systemctl stop gdm # For 
GNOME Display Manager -systemctl stop lightdm # For LightDM -systemctl stop sddm # For SDDM -systemctl stop nvidia-persistenced - # Remove existing Nvidia modules (if any) -/usr/sbin/rmmod datagpu -/usr/sbin/rmmod nvidia-drm -/usr/sbin/rmmod nvidia-uvm -/usr/sbin/rmmod nvidia-modeset -/usr/sbin/rmmod nvidia +/usr/sbin/rmmod datagpu 2>&1 +/usr/sbin/rmmod nvidia-drm 2>&1 +/usr/sbin/rmmod nvidia-uvm 2>&1 +/usr/sbin/rmmod nvidia-modeset 2>&1 +/usr/sbin/rmmod nvidia 2>&1 # Go to nvidia path and build Nvidia driver cd "$NVIDIA_PATH" || { echo "Error: Failed to change directory to $NVIDIA_PATH"; exit 1; } From f7b4fa85fbceb52fd899b8fe4bf90a353e0439cc Mon Sep 17 00:00:00 2001 From: Larry Ruckman Date: Tue, 9 Jul 2024 12:57:00 -0700 Subject: [PATCH 5/5] restoring this back --- data_gpu/driver/comp_and_load_drivers.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/data_gpu/driver/comp_and_load_drivers.sh b/data_gpu/driver/comp_and_load_drivers.sh index e32c65c..7a38c13 100755 --- a/data_gpu/driver/comp_and_load_drivers.sh +++ b/data_gpu/driver/comp_and_load_drivers.sh @@ -21,11 +21,11 @@ RET_DIR=$PWD echo "Using RET_DIR: $RET_DIR" # Remove existing Nvidia modules (if any) -/usr/sbin/rmmod datagpu 2>&1 -/usr/sbin/rmmod nvidia-drm 2>&1 -/usr/sbin/rmmod nvidia-uvm 2>&1 -/usr/sbin/rmmod nvidia-modeset 2>&1 -/usr/sbin/rmmod nvidia 2>&1 +/usr/sbin/rmmod datagpu 2>/dev/null +/usr/sbin/rmmod nvidia-drm 2>/dev/null +/usr/sbin/rmmod nvidia-uvm 2>/dev/null +/usr/sbin/rmmod nvidia-modeset 2>/dev/null +/usr/sbin/rmmod nvidia 2>/dev/null # Go to nvidia path and build Nvidia driver cd "$NVIDIA_PATH" || { echo "Error: Failed to change directory to $NVIDIA_PATH"; exit 1; }