From b6216ea004947b9af964db66325ea86fd8af62a0 Mon Sep 17 00:00:00 2001 From: Sharvil Shah Date: Thu, 24 Oct 2024 13:11:29 -0700 Subject: [PATCH] Gaudi 1.18 openshift notebook container (#480) Signed-off-by: sharvil10 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Tyler Titsworth --- .../redhat/openshift-ai/gaudi/README.md | 1 + .../gaudi/docker/Dockerfile.rhel9.2 | 20 ++++++--- .../gaudi/docker/Dockerfile.rhel9.4 | 45 +++++++++++++------ .../gaudi/docker/docker-compose.yaml | 22 ++++----- .../openshift-ai/gaudi/docker/install_efa.sh | 31 +++++++++---- 5 files changed, 82 insertions(+), 37 deletions(-) diff --git a/enterprise/redhat/openshift-ai/gaudi/README.md b/enterprise/redhat/openshift-ai/gaudi/README.md index c1338f70..72cf54c8 100644 --- a/enterprise/redhat/openshift-ai/gaudi/README.md +++ b/enterprise/redhat/openshift-ai/gaudi/README.md @@ -8,6 +8,7 @@ Intel® Gaudi AI Software Tools for OpenShift AI(RedHat OpenShift Data Science/R | -----------------------------| ------------- | ------------- | | Intel Gaudi Notebook Container 1.17.0-495 | [Intel® Gaudi Software Stack*](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html), [Intel® Gaudi PyTorch](https://docs.habana.ai/en/latest/PyTorch/index.html), [Intel® Gaudi vLLM](https://github.com/HabanaAI/vllm-fork.git), [Intel® Gaudi DeepSpeed](https://github.com/HabanaAI/DeepSpeed) | [`registry.connect.redhat.com/intel/gaudi-notebooks:1.17.0-495-rhel-9.2`](registry.connect.redhat.com/intel/gaudi-notebooks@sha256:a62baf968caa7dd23b7f4cdcddc26e109d894f1436e247b4ea1e2fb4a5c94d54) | | Intel Gaudi Notebook Container 1.17.1-40 | [Intel® Gaudi Software Stack*](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html), [Intel® Gaudi PyTorch](https://docs.habana.ai/en/latest/PyTorch/index.html), [Intel® Gaudi vLLM](https://github.com/HabanaAI/vllm-fork.git), [Intel® Gaudi DeepSpeed](https://github.com/HabanaAI/DeepSpeed) | 
[`registry.connect.redhat.com/intel/gaudi-notebooks:1.17.1-40-rhel-9.2`](registry.connect.redhat.com/intel/gaudi-notebooks@sha256:00ca535956b7fcdd91e71bc4a3cd4493ddcaceea9b8d7bb95a7edc0e1cb0bac4) | +| Intel Gaudi Notebook Container 1.18.0-524 | [Intel® Gaudi Software Stack*](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html), [Intel® Gaudi PyTorch](https://docs.habana.ai/en/latest/PyTorch/index.html), [Intel® Gaudi vLLM](https://github.com/HabanaAI/vllm-fork.git), [Intel® Gaudi DeepSpeed](https://github.com/HabanaAI/DeepSpeed) | [`registry.connect.redhat.com/intel/gaudi-notebooks:1.18.0-524-rhel-9.2`](registry.connect.redhat.com/intel/gaudi-notebooks@sha256:142b11253e5708ff9744c895868b2adda2f6f01c40127b71f1aca3d7a6e6bc29) | ## Run Gaudi Notebook Containers diff --git a/enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.2 b/enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.2 index 6e1685dc..accf3f94 100644 --- a/enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.2 +++ b/enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.2 @@ -51,6 +51,7 @@ RUN dnf install -y \ wget \ git \ libffi-devel \ + bzip2 \ bzip2-devel \ zlib-devel \ mesa-libGL \ @@ -59,7 +60,8 @@ RUN dnf install -y \ # update pkgs (except OS version) for resolving potentials CVEs dnf versionlock add redhat-release* && \ dnf update -y && \ - dnf clean all && rm -rf /var/cache/yum + dnf clean all && rm -rf /var/cache/yum && \ + rm -f /etc/ssh/ssh_host_*_key* RUN mkdir -p /licenses && \ wget -O /licenses/LICENSE https://raw.githubusercontent.com/intel/ai-containers/main/LICENSE @@ -72,20 +74,22 @@ ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH COPY install_efa.sh . 
RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh +ENV OPENMPI_VERSION=4.1.6 ENV LIBFABRIC_VERSION="1.20.0" ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}" -ENV MPI_ROOT=/opt/amazon/openmpi +ENV MPI_ROOT=/opt/habanalabs/openmpi ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH ENV PATH=${LIBFABRIC_ROOT}/bin:${MPI_ROOT}/bin:$PATH ENV OPAL_PREFIX=${MPI_ROOT} ENV MPICC=${MPI_ROOT}/bin/mpicc ENV RDMAV_FORK_SAFE=1 -ENV FI_EFA_USE_DEVICE_RDMA=1 +ENV FI_EFA_USE_DEVICE_RDMA=0 +ENV OMPI_MCA_btl=^openib RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2" >> /etc/yum.repos.d/habanalabs.repo && \ - echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/rhel/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo && \ + echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \ echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo # for Habana GPG key with SHA-1 signature @@ -112,6 +116,12 @@ RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/o ./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \ make && make install && cd / && rm -rf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 /tmp/libfabric-${LIBFABRIC_VERSION} +RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \ + tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ + cd /tmp/openmpi-${OPENMPI_VERSION} && \ + ./configure --prefix=${MPI_ROOT} --with-libfabric=$LIBFABRIC_ROOT --with-verbs && \ + make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz 
/tmp/openmpi-${OPENMPI_VERSION} + RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \ unzip /tmp/main.zip -d /tmp && \ cd /tmp/hccl_ofi_wrapper-main && \ @@ -126,7 +136,7 @@ ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins ENV APP_ROOT="/opt/app-root" -RUN python3.10 -m pip install "pip>=23.3" "setuptools>=70.0.0" "wheel==0.38.4" +RUN python3.10 -m pip install "pip==24.2" "setuptools==75.1.0" "wheel==0.44.0" WORKDIR ${APP_ROOT} diff --git a/enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.4 b/enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.4 index 18eeef28..194c6811 100644 --- a/enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.4 +++ b/enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.4 @@ -49,14 +49,15 @@ RUN dnf install -y \ lsof \ python3-devel \ openssh-clients \ - openssl-1:3.0.7-27.el9 \ - openssl-devel-1:3.0.7-27.el9 \ + openssl-1:3.0.7-28.el9_4 \ + openssl-devel-1:3.0.7-28.el9_4 \ libjpeg-devel \ openssh-server \ lsb_release \ wget \ git \ libffi-devel \ + bzip2 \ bzip2-devel \ zlib-devel \ mesa-libGL \ @@ -64,13 +65,14 @@ RUN dnf install -y \ python3.11 \ python3.11-pip \ python3.11-devel \ + python3.11-rpm \ ffmpeg-free \ - perl-Net-SSLeay-1.92-2.el9 \ python3-dnf-plugin-versionlock && \ # update pkgs (except OS version) for resolving potentials CVEs - dnf versionlock add redhat-release* openssl* perl-Net-SSLeay && \ + dnf versionlock add redhat-release* openssl* libcurl-minimal curl-minimal ima-evm-utils python3-rpm rpm* && \ dnf update -y && \ - dnf clean all && rm -rf /var/cache/yum + dnf clean all && rm -rf /var/cache/yum && \ + rm -f /etc/ssh/ssh_host_*_key* RUN mkdir -p /licenses && \ wget -O /licenses/LICENSE https://raw.githubusercontent.com/intel/ai-containers/main/LICENSE @@ -85,15 +87,17 @@ RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \ COPY install_efa.sh . 
RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh -ENV LIBFABRIC_VERSION="1.20.0" +ENV OPENMPI_VERSION=4.1.6 +ENV LIBFABRIC_VERSION="1.22.0" ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}" -ENV MPI_ROOT=/opt/amazon/openmpi +ENV MPI_ROOT=/opt/habanalabs/openmpi ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH ENV PATH=${LIBFABRIC_ROOT}/bin:${MPI_ROOT}/bin:$PATH ENV OPAL_PREFIX=${MPI_ROOT} ENV MPICC=${MPI_ROOT}/bin/mpicc ENV RDMAV_FORK_SAFE=1 -ENV FI_EFA_USE_DEVICE_RDMA=1 +ENV FI_EFA_USE_DEVICE_RDMA=0 +ENV OMPI_MCA_btl=^openib RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ @@ -125,6 +129,12 @@ RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/o ./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \ make && make install && cd / && rm -rf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 /tmp/libfabric-${LIBFABRIC_VERSION} +RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \ + tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ + cd /tmp/openmpi-${OPENMPI_VERSION} && \ + ./configure --prefix=${MPI_ROOT} --with-libfabric=$LIBFABRIC_ROOT --with-verbs && \ + make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION} + RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \ unzip /tmp/main.zip -d /tmp && \ cd /tmp/hccl_ofi_wrapper-main && \ @@ -134,7 +144,7 @@ RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archi ENV APP_ROOT="/opt/app-root" -RUN python3.11 -m pip install pip==23.3.1 setuptools==67.3.3 wheel==0.38.4 +RUN python3.11 -m pip install pip==24.2 
setuptools==75.1.0 wheel==0.44.0 WORKDIR ${APP_ROOT} @@ -170,7 +180,7 @@ ARG ARTIFACTORY_URL ENV BASE_NAME=rhel9.4 LABEL name="PyTorch Installer" -LABEL summary="Habanalabs PyTorch installer layer for RHEL9.2" +LABEL summary="Habanalabs PyTorch installer layer for RHEL9.4" LABEL description="Image with pre installed Habanalabs packages for PyTorch" RUN echo "/usr/lib/habanalabs" > $(python -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pt @@ -184,7 +194,7 @@ RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo RUN dnf install --allowerasing -y \ - curl \ + curl-7.76.1-29.el9_4.1 \ cairo-devel \ numactl-devel \ iproute \ @@ -196,10 +206,19 @@ RUN dnf install --allowerasing -y \ gperftools-devel && \ dnf clean all && rm -rf /var/cache/yum -RUN dnf config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo -y && \ - dnf install --allowerasing -y intel-mkl-64bit-2020.4-912 && \ +RUN echo "[oneAPI]" > /etc/yum.repos.d/oneAPI.repo && \ + echo "name=Intel® oneAPI repository" >> /etc/yum.repos.d/oneAPI.repo && \ + echo "baseurl=https://yum.repos.intel.com/oneapi" >> /etc/yum.repos.d/oneAPI.repo && \ + echo 'enabled=1' >> /etc/yum.repos.d/oneAPI.repo && \ + echo "gpgcheck=1" >> /etc/yum.repos.d/oneAPI.repo && \ + echo "repo_gpgcheck=1" >> /etc/yum.repos.d/oneAPI.repo && \ + echo "gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB" >> /etc/yum.repos.d/oneAPI.repo + +RUN dnf install --allowerasing -y intel-oneapi-mkl-2024.2.0 && \ dnf clean all && rm -rf /var/cache/yum +ENV LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/2024.2/lib:${LD_LIBRARY_PATH} + RUN rm -rf /tmp/* USER 1001 diff --git a/enterprise/redhat/openshift-ai/gaudi/docker/docker-compose.yaml b/enterprise/redhat/openshift-ai/gaudi/docker/docker-compose.yaml index 5fde6ead..804b4b62 100644 --- a/enterprise/redhat/openshift-ai/gaudi/docker/docker-compose.yaml +++ 
b/enterprise/redhat/openshift-ai/gaudi/docker/docker-compose.yaml @@ -22,12 +22,12 @@ services: https_proxy: ${https_proxy} no_proxy: "" ARTIFACTORY_URL: ${ARTIFACTORY_URL:-vault.habana.ai} - VERSION: ${VERSION:-1.17.1} - REVISION: ${REVISION:-40} + VERSION: ${VERSION:-1.18.0} + REVISION: ${REVISION:-524} context: . target: gaudi-base dockerfile: Dockerfile.rhel${RHEL_OS:-9.2} - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-base-${VERSION:-1.17.1}-${REVISION:-40}-rhel-${RHEL_OS:-9.2} + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-base-${VERSION:-1.18.0}-${REVISION:-524}-rhel-${RHEL_OS:-9.2} entrypoint: ["/bin/bash", "-c"] command: > "hl-smi" @@ -37,17 +37,17 @@ services: BASE_IMAGE: ${BASE_IMAGE:-registry.access.redhat.com/ubi9/ubi} BASE_TAG: ${RHEL_OS:-9.2} BASE_NAME: rhel${RHEL_OS:-rhel9.2} - PT_VERSION: ${PT_VERSION:-2.3.1} + PT_VERSION: ${PT_VERSION:-2.4.0} http_proxy: ${http_proxy} https_proxy: ${https_proxy} no_proxy: "" ARTIFACTORY_URL: ${ARTIFACTORY_URL:-vault.habana.ai} - VERSION: ${VERSION:-1.17.1} - REVISION: ${REVISION:-40} + VERSION: ${VERSION:-1.18.0} + REVISION: ${REVISION:-524} context: . 
target: gaudi-pytorch dockerfile: Dockerfile.rhel${RHEL_OS:-9.2} - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-pytorch-${VERSION:-1.17.1}-${REVISION:-40}-rhel-${RHEL_OS:-9.2} + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-pytorch-${VERSION:-1.18.0}-${REVISION:-524}-rhel-${RHEL_OS:-9.2} entrypoint: ["/bin/bash", "-c"] command: > "python -c 'import torch'" @@ -57,17 +57,17 @@ services: BASE_IMAGE: ${BASE_IMAGE:-registry.access.redhat.com/ubi9/ubi} BASE_TAG: ${RHEL_OS:-9.2} BASE_NAME: ${BASE_NAME:-rhel9.2} - PT_VERSION: ${PT_VERSION:-2.3.1} + PT_VERSION: ${PT_VERSION:-2.4.0} http_proxy: ${http_proxy} https_proxy: ${https_proxy} no_proxy: "" ARTIFACTORY_URL: ${ARTIFACTORY_URL:-vault.habana.ai} - VERSION: ${VERSION:-1.17.1} - REVISION: ${REVISION:-40} + VERSION: ${VERSION:-1.18.0} + REVISION: ${REVISION:-524} context: . target: gaudi-notebooks dockerfile: Dockerfile.rhel${RHEL_OS:-9.2} - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-notebook-${VERSION:-1.17.1}-${REVISION:-40}-rhel-${RHEL_OS:-9.2} + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-notebook-${VERSION:-1.18.0}-${REVISION:-524}-rhel-${RHEL_OS:-9.2} entrypoint: ["/bin/bash", "-c"] command: > "python -m jupyter notebook --version" diff --git a/enterprise/redhat/openshift-ai/gaudi/docker/install_efa.sh b/enterprise/redhat/openshift-ai/gaudi/docker/install_efa.sh index 4175e8f8..5de8e790 100755 --- a/enterprise/redhat/openshift-ai/gaudi/docker/install_efa.sh +++ b/enterprise/redhat/openshift-ai/gaudi/docker/install_efa.sh @@ -14,27 +14,42 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-DEFAULT_EFA_INSTALLER_VER=1.29.0 +set -ex # explicit: a shebang below line 1 is only a comment, so its -ex flags were never applied + +DEFAULT_EFA_INSTALLER_VER=1.34.0 efa_installer_version=${1:-$DEFAULT_EFA_INSTALLER_VER} tmp_dir=$(mktemp -d) wget -nv https://efa-installer.amazonaws.com/aws-efa-installer-"$efa_installer_version".tar.gz -P "$tmp_dir" tar -xf "$tmp_dir"/aws-efa-installer-"$efa_installer_version".tar.gz -C "$tmp_dir" +RUN_EFA_INSTALLER=(./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify) pushd "$tmp_dir"/aws-efa-installer # shellcheck disable=SC1091 -case $( - . /etc/os-release - echo -n "$ID" -) in +. /etc/os-release +case $ID in rhel) # we cannot install dkms packages on RHEL images due to OCP rules - rm -f RPMS/RHEL8/x86_64/dkms*.rpm + find RPMS/ -name 'dkms*.rpm' -exec rm -f {} \; + find RPMS/ -name 'efa-*.rpm' -exec rm -f {} \; + case $VERSION_ID in + 8*) + dnf install -y RPMS/ROCKYLINUX8/x86_64/rdma-core/*.rpm + ;; + 9*) + dnf install -y RPMS/ROCKYLINUX9/x86_64/rdma-core/*.rpm + ;; + *) + echo "Unsupported RHEL version: $VERSION_ID" + exit 1 + ;; + esac + RUN_EFA_INSTALLER=(echo 'Skipping EFA installer on RHEL') ;; tencentos) - dnf install -y RPMS/ROCKYLINUX8/x86_64/rdma-core/libibverbs-46.0-1.el8.x86_64.rpm RPMS/ROCKYLINUX8/x86_64/rdma-core/libibverbs-utils-46.0-1.el8.x86_64.rpm + dnf install -y RPMS/ROCKYLINUX8/x86_64/rdma-core/*.rpm patch -f -p1 -i /tmp/tencentos_efa_patch.txt --reject-file=tencentos_efa_patch.rej --no-backup-if-mismatch ;; esac -./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify +"${RUN_EFA_INSTALLER[@]}" popd rm -rf "$tmp_dir"