Skip to content

Commit

Permalink
Gaudi 1.18 openshift notebook container (#480)
Browse files Browse the repository at this point in the history
Signed-off-by: sharvil10 <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Tyler Titsworth <[email protected]>
  • Loading branch information
3 people authored Oct 24, 2024
1 parent caa06a4 commit b6216ea
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 37 deletions.
1 change: 1 addition & 0 deletions enterprise/redhat/openshift-ai/gaudi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Intel® Gaudi AI Software Tools for OpenShift AI(RedHat OpenShift Data Science/R
| -----------------------------| ------------- | ------------- |
| Intel Gaudi Notebook Container 1.17.0-495 | [Intel® Gaudi Software Stack*](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html), [Intel® Gaudi PyTorch](https://docs.habana.ai/en/latest/PyTorch/index.html), [Intel® Gaudi vLLM](https://github.com/HabanaAI/vllm-fork.git), [Intel® Gaudi DeepSpeed](https://github.com/HabanaAI/DeepSpeed) | [`registry.connect.redhat.com/intel/gaudi-notebooks:1.17.0-495-rhel-9.2`](registry.connect.redhat.com/intel/gaudi-notebooks@sha256:a62baf968caa7dd23b7f4cdcddc26e109d894f1436e247b4ea1e2fb4a5c94d54) |
| Intel Gaudi Notebook Container 1.17.1-40 | [Intel® Gaudi Software Stack*](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html), [Intel® Gaudi PyTorch](https://docs.habana.ai/en/latest/PyTorch/index.html), [Intel® Gaudi vLLM](https://github.com/HabanaAI/vllm-fork.git), [Intel® Gaudi DeepSpeed](https://github.com/HabanaAI/DeepSpeed) | [`registry.connect.redhat.com/intel/gaudi-notebooks:1.17.1-40-rhel-9.2`](registry.connect.redhat.com/intel/gaudi-notebooks@sha256:00ca535956b7fcdd91e71bc4a3cd4493ddcaceea9b8d7bb95a7edc0e1cb0bac4) |
| Intel Gaudi Notebook Container 1.18.0-524 | [Intel® Gaudi Software Stack*](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html), [Intel® Gaudi PyTorch](https://docs.habana.ai/en/latest/PyTorch/index.html), [Intel® Gaudi vLLM](https://github.com/HabanaAI/vllm-fork.git), [Intel® Gaudi DeepSpeed](https://github.com/HabanaAI/DeepSpeed) | [`registry.connect.redhat.com/intel/gaudi-notebooks:1.18.0-524-rhel-9.2`](registry.connect.redhat.com/intel/gaudi-notebooks@sha256:142b11253e5708ff9744c895868b2adda2f6f01c40127b71f1aca3d7a6e6bc29) |

## Run Gaudi Notebook Containers

Expand Down
20 changes: 15 additions & 5 deletions enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.2
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ RUN dnf install -y \
wget \
git \
libffi-devel \
bzip2 \
bzip2-devel \
zlib-devel \
mesa-libGL \
Expand All @@ -59,7 +60,8 @@ RUN dnf install -y \
# update pkgs (except OS version) to resolve potential CVEs
dnf versionlock add redhat-release* && \
dnf update -y && \
dnf clean all && rm -rf /var/cache/yum
dnf clean all && rm -rf /var/cache/yum && \
rm -f /etc/ssh/ssh_host_*_key*

RUN mkdir -p /licenses && \
wget -O /licenses/LICENSE https://raw.githubusercontent.com/intel/ai-containers/main/LICENSE
Expand All @@ -72,20 +74,22 @@ ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
COPY install_efa.sh .
RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh

ENV OPENMPI_VERSION=4.1.6
ENV LIBFABRIC_VERSION="1.20.0"
ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}"
ENV MPI_ROOT=/opt/amazon/openmpi
ENV MPI_ROOT=/opt/habanalabs/openmpi
ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH
ENV PATH=${LIBFABRIC_ROOT}/bin:${MPI_ROOT}/bin:$PATH
ENV OPAL_PREFIX=${MPI_ROOT}
ENV MPICC=${MPI_ROOT}/bin/mpicc
ENV RDMAV_FORK_SAFE=1
ENV FI_EFA_USE_DEVICE_RDMA=1
ENV FI_EFA_USE_DEVICE_RDMA=0
ENV OMPI_MCA_btl=^openib

RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \
echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \
echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2" >> /etc/yum.repos.d/habanalabs.repo && \
echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/rhel/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo && \
echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \
echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo

# for Habana GPG key with SHA-1 signature
Expand All @@ -112,6 +116,12 @@ RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/o
./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \
make && make install && cd / && rm -rf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 /tmp/libfabric-${LIBFABRIC_VERSION}

RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \
tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \
cd /tmp/openmpi-${OPENMPI_VERSION} && \
./configure --prefix=${MPI_ROOT} --with-libfabric=$LIBFABRIC_ROOT --with-verbs && \
make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION}

RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \
unzip /tmp/main.zip -d /tmp && \
cd /tmp/hccl_ofi_wrapper-main && \
Expand All @@ -126,7 +136,7 @@ ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins

ENV APP_ROOT="/opt/app-root"

RUN python3.10 -m pip install "pip>=23.3" "setuptools>=70.0.0" "wheel==0.38.4"
RUN python3.10 -m pip install "pip==24.2" "setuptools==75.1.0" "wheel==0.44.0"

WORKDIR ${APP_ROOT}

Expand Down
45 changes: 32 additions & 13 deletions enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.4
Original file line number Diff line number Diff line change
Expand Up @@ -49,28 +49,30 @@ RUN dnf install -y \
lsof \
python3-devel \
openssh-clients \
openssl-1:3.0.7-27.el9 \
openssl-devel-1:3.0.7-27.el9 \
openssl-1:3.0.7-28.el9_4 \
openssl-devel-1:3.0.7-28.el9_4 \
libjpeg-devel \
openssh-server \
lsb_release \
wget \
git \
libffi-devel \
bzip2 \
bzip2-devel \
zlib-devel \
mesa-libGL \
iproute \
python3.11 \
python3.11-pip \
python3.11-devel \
python3.11-rpm \
ffmpeg-free \
perl-Net-SSLeay-1.92-2.el9 \
python3-dnf-plugin-versionlock && \
# update pkgs (except OS version) to resolve potential CVEs
dnf versionlock add redhat-release* openssl* perl-Net-SSLeay && \
dnf versionlock add redhat-release* openssl* libcurl-minimal curl-minimal ima-evm-utils python3-rpm rpm* && \
dnf update -y && \
dnf clean all && rm -rf /var/cache/yum
dnf clean all && rm -rf /var/cache/yum && \
rm -f /etc/ssh/ssh_host_*_key*

RUN mkdir -p /licenses && \
wget -O /licenses/LICENSE https://raw.githubusercontent.com/intel/ai-containers/main/LICENSE
Expand All @@ -85,15 +87,17 @@ RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \
COPY install_efa.sh .
RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh

ENV LIBFABRIC_VERSION="1.20.0"
ENV OPENMPI_VERSION=4.1.6
ENV LIBFABRIC_VERSION="1.22.0"
ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}"
ENV MPI_ROOT=/opt/amazon/openmpi
ENV MPI_ROOT=/opt/habanalabs/openmpi
ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH
ENV PATH=${LIBFABRIC_ROOT}/bin:${MPI_ROOT}/bin:$PATH
ENV OPAL_PREFIX=${MPI_ROOT}
ENV MPICC=${MPI_ROOT}/bin/mpicc
ENV RDMAV_FORK_SAFE=1
ENV FI_EFA_USE_DEVICE_RDMA=1
ENV FI_EFA_USE_DEVICE_RDMA=0
ENV OMPI_MCA_btl=^openib

RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \
echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \
Expand Down Expand Up @@ -125,6 +129,12 @@ RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/o
./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \
make && make install && cd / && rm -rf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 /tmp/libfabric-${LIBFABRIC_VERSION}

RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \
tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \
cd /tmp/openmpi-${OPENMPI_VERSION} && \
./configure --prefix=${MPI_ROOT} --with-libfabric=$LIBFABRIC_ROOT --with-verbs && \
make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION}

RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \
unzip /tmp/main.zip -d /tmp && \
cd /tmp/hccl_ofi_wrapper-main && \
Expand All @@ -134,7 +144,7 @@ RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archi

ENV APP_ROOT="/opt/app-root"

RUN python3.11 -m pip install pip==23.3.1 setuptools==67.3.3 wheel==0.38.4
RUN python3.11 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0

WORKDIR ${APP_ROOT}

Expand Down Expand Up @@ -170,7 +180,7 @@ ARG ARTIFACTORY_URL
ENV BASE_NAME=rhel9.4

LABEL name="PyTorch Installer"
LABEL summary="Habanalabs PyTorch installer layer for RHEL9.2"
LABEL summary="Habanalabs PyTorch installer layer for RHEL9.4"
LABEL description="Image with pre installed Habanalabs packages for PyTorch"

RUN echo "/usr/lib/habanalabs" > $(python -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pt
Expand All @@ -184,7 +194,7 @@ RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo

RUN dnf install --allowerasing -y \
curl \
curl-7.76.1-29.el9_4.1 \
cairo-devel \
numactl-devel \
iproute \
Expand All @@ -196,10 +206,19 @@ RUN dnf install --allowerasing -y \
gperftools-devel && \
dnf clean all && rm -rf /var/cache/yum

RUN dnf config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo -y && \
dnf install --allowerasing -y intel-mkl-64bit-2020.4-912 && \
RUN echo "[oneAPI]" >> /etc/yum.repos.d/oneAPI.repo && \
echo "name=Intel® oneAPI repository" >> /etc/yum.repos.d/oneAPI.repo && \
echo "baseurl=https://yum.repos.intel.com/oneapi" >> /etc/yum.repos.d/oneAPI.repo && \
echo 'enabled=1' >> /etc/yum.repos.d/oneAPI.repo && \
echo "gpgcheck=1" >> /etc/yum.repos.d/oneAPI.repo && \
echo "repo_gpgcheck=1" >> /etc/yum.repos.d/oneAPI.repo && \
echo "gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB" >> /etc/yum.repos.d/oneAPI.repo

RUN dnf install --allowerasing -y intel-oneapi-mkl-2024.2.0 && \
dnf clean all && rm -rf /var/cache/yum

ENV LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/2024.2/lib:${LD_LIBRARY_PATH}

RUN rm -rf /tmp/*

USER 1001
Expand Down
22 changes: 11 additions & 11 deletions enterprise/redhat/openshift-ai/gaudi/docker/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@ services:
https_proxy: ${https_proxy}
no_proxy: ""
ARTIFACTORY_URL: ${ARTIFACTORY_URL:-vault.habana.ai}
VERSION: ${VERSION:-1.17.1}
REVISION: ${REVISION:-40}
VERSION: ${VERSION:-1.18.0}
REVISION: ${REVISION:-524}
context: .
target: gaudi-base
dockerfile: Dockerfile.rhel${RHEL_OS:-9.2}
image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-base-${VERSION:-1.17.1}-${REVISION:-40}-rhel-${RHEL_OS:-9.2}
image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-base-${VERSION:-1.18.0}-${REVISION:-524}-rhel-${RHEL_OS:-9.2}
entrypoint: ["/bin/bash", "-c"]
command: >
"hl-smi"
Expand All @@ -37,17 +37,17 @@ services:
BASE_IMAGE: ${BASE_IMAGE:-registry.access.redhat.com/ubi9/ubi}
BASE_TAG: ${RHEL_OS:-9.2}
BASE_NAME: rhel${RHEL_OS:-rhel9.2}
PT_VERSION: ${PT_VERSION:-2.3.1}
PT_VERSION: ${PT_VERSION:-2.4.0}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
no_proxy: ""
ARTIFACTORY_URL: ${ARTIFACTORY_URL:-vault.habana.ai}
VERSION: ${VERSION:-1.17.1}
REVISION: ${REVISION:-40}
VERSION: ${VERSION:-1.18.0}
REVISION: ${REVISION:-524}
context: .
target: gaudi-pytorch
dockerfile: Dockerfile.rhel${RHEL_OS:-9.2}
image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-pytorch-${VERSION:-1.17.1}-${REVISION:-40}-rhel-${RHEL_OS:-9.2}
image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-pytorch-${VERSION:-1.18.0}-${REVISION:-524}-rhel-${RHEL_OS:-9.2}
entrypoint: ["/bin/bash", "-c"]
command: >
"python -c 'import torch'"
Expand All @@ -57,17 +57,17 @@ services:
BASE_IMAGE: ${BASE_IMAGE:-registry.access.redhat.com/ubi9/ubi}
BASE_TAG: ${RHEL_OS:-9.2}
BASE_NAME: ${BASE_NAME:-rhel9.2}
PT_VERSION: ${PT_VERSION:-2.3.1}
PT_VERSION: ${PT_VERSION:-2.4.0}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
no_proxy: ""
ARTIFACTORY_URL: ${ARTIFACTORY_URL:-vault.habana.ai}
VERSION: ${VERSION:-1.17.1}
REVISION: ${REVISION:-40}
VERSION: ${VERSION:-1.18.0}
REVISION: ${REVISION:-524}
context: .
target: gaudi-notebooks
dockerfile: Dockerfile.rhel${RHEL_OS:-9.2}
image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-notebook-${VERSION:-1.17.1}-${REVISION:-40}-rhel-${RHEL_OS:-9.2}
image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-notebook-${VERSION:-1.18.0}-${REVISION:-524}-rhel-${RHEL_OS:-9.2}
entrypoint: ["/bin/bash", "-c"]
command: >
"python -m jupyter notebook --version"
31 changes: 23 additions & 8 deletions enterprise/redhat/openshift-ai/gaudi/docker/install_efa.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,27 +14,42 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Downloads the AWS EFA installer bundle and installs the user-space pieces
# needed inside the container image.
#
# Usage: install_efa.sh [EFA_INSTALLER_VERSION]
#   $1 - optional installer version; defaults to DEFAULT_EFA_INSTALLER_VER.
#
# NOTE: a "#!" line is only honored on the very first line of a file. This
# point is past the license header, so a shebang here would be an ordinary
# comment and its "-ex" flags would never apply. Enable them explicitly:
#   -e  abort on the first failing command (download, extract, dnf, ...)
#   -x  trace every command into the build log
set -ex

DEFAULT_EFA_INSTALLER_VER=1.34.0
efa_installer_version=${1:-$DEFAULT_EFA_INSTALLER_VER}

# Command executed after the distro-specific preparation below. Held as an
# array so it can be swapped out per distro and run without `eval`.
efa_installer_cmd=(./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify)

tmp_dir=$(mktemp -d)
# Remove the scratch directory on every exit path, including early failures.
trap 'rm -rf -- "${tmp_dir:?}"' EXIT

wget -nv https://efa-installer.amazonaws.com/aws-efa-installer-"$efa_installer_version".tar.gz -P "$tmp_dir"
tar -xf "$tmp_dir"/aws-efa-installer-"$efa_installer_version".tar.gz -C "$tmp_dir"
pushd "$tmp_dir"/aws-efa-installer

# /etc/os-release provides ID and VERSION_ID used for the dispatch below.
# shellcheck disable=SC1091
. /etc/os-release
case "$ID" in
  rhel)
    # we cannot install dkms packages on RHEL images due to OCP rules
    find RPMS/ -name 'dkms*.rpm' -exec rm -f {} \;
    find RPMS/ -name 'efa-*.rpm' -exec rm -f {} \;
    # Install only the rdma-core RPMs from the matching Rocky Linux directory.
    case "$VERSION_ID" in
      8*)
        dnf install -y RPMS/ROCKYLINUX8/x86_64/rdma-core/*.rpm
        ;;
      9*)
        dnf install -y RPMS/ROCKYLINUX9/x86_64/rdma-core/*.rpm
        ;;
      *)
        echo "Unsupported RHEL version: $VERSION_ID" >&2
        exit 1
        ;;
    esac
    # On RHEL the bundled installer itself is not run; only log that fact.
    efa_installer_cmd=(echo 'Skipping EFA installer on RHEL')
    ;;
  tencentos)
    dnf install -y RPMS/ROCKYLINUX8/x86_64/rdma-core/*.rpm
    patch -f -p1 -i /tmp/tencentos_efa_patch.txt --reject-file=tencentos_efa_patch.rej --no-backup-if-mismatch
    ;;
esac

"${efa_installer_cmd[@]}"
popd

0 comments on commit b6216ea

Please sign in to comment.