From ce84349d92a6ccd9083526b9231af348c818772e Mon Sep 17 00:00:00 2001 From: Jefferson Fialho Date: Thu, 31 Oct 2024 10:56:51 -0300 Subject: [PATCH] rocm compiling Signed-off-by: Jefferson Fialho --- Dockerfile.ubi | 248 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 243 insertions(+), 5 deletions(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index a2630bc5a2a67..ee43d6684700f 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -1,13 +1,251 @@ -# Start from released image -FROM quay.io/opendatahub/vllm:cuda-pr-212 as vllm-grpc-adapter +## Global Args ################################################################## +ARG BASE_UBI_IMAGE_TAG=9.4 +ARG PYTHON_VERSION=3.12 +# Default ROCm ARCHes to build vLLM for. +ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100" +ARG MAX_JOBS=12 + +FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base + +ARG PYTHON_VERSION + +ENV VIRTUAL_ENV=/opt/vllm +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +RUN --mount=type=cache,target=/root/.cache/pip \ + microdnf -y update && \ + microdnf install -y --setopt=install_weak_deps=0 --nodocs \ + python${PYTHON_VERSION}-devel \ + python${PYTHON_VERSION}-pip \ + python${PYTHON_VERSION}-wheel && \ + python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \ + pip install -U pip wheel setuptools uv + + +FROM base AS rocm_base +ENV ROCM_VERSION=6.1.2 + +RUN printf "[amdgpu]\n\ +name=amdgpu\n\ +baseurl=https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/9.4/main/x86_64/\n\ +enabled=1\n\ +priority=50\n\ +gpgcheck=1\n\ +gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key\n\ +[ROCm-${ROCM_VERSION}]\n\ +name=ROCm${ROCM_VERSION}\n\ +baseurl=https://repo.radeon.com/rocm/rhel9/${ROCM_VERSION}/main\n\ +enabled=1\n\ +priority=50\n\ +gpgcheck=1\n\ +gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key" > /etc/yum.repos.d/amdgpu.repo + + +RUN microdnf -y install \ + rocm-hip-libraries rocm-hip-runtime \ + miopen-hip && \ + microdnf clean all + +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=/root/.cache/uv \ + uv pip install -v --index-url "https://download.pytorch.org/whl/nightly/rocm6.1" \ + torch==2.5.0.dev20240912+rocm6.1 \ + torchvision==0.20.0.dev20240912+rocm6.1 + +FROM rocm_base as rocm_devel + +ENV CCACHE_DIR=/root/.cache/ccache + +RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ + rpm -ql epel-release && \ + microdnf -y update && \ + microdnf -y install \ + ccache \ + git \ + rocm \ + hipcc \ + wget \ + which && \ + microdnf clean all + +WORKDIR /workspace + +ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer +ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin +ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include:/opt/rocm/include + + +FROM rocm_devel AS build_amdsmi + +# Build AMD SMI wheel +RUN cd /opt/rocm/share/amd_smi && \ + python3 -m pip wheel . --wheel-dir=/install + +################################################################################################## + +FROM rocm_devel AS build_flashattention + +# Whether to install CK-based flash-attention +ARG BUILD_FA="1" +ARG TRY_FA_WHEEL="1" +# Note: The ROCm fork provides a wheel built for ROCm but only for 2.5.9 and python 3.9, so this will be incompatible with the current build +ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl" +# only required when not using triton backend +ARG FA_GFX_ARCHS="gfx90a;gfx942" +ARG FLASH_ATTENTION_USE_TRITON_ROCM="TRUE" +# FA_BRANCH is the main_perf branch as of Sep 4 2024 which includes triton backend support, see https://github.com/Dao-AILab/flash-attention/pull/1203 +ARG FA_BRANCH="75b5360" +ARG MAX_JOBS +ENV MAX_JOBS=${MAX_JOBS} +ENV FLASH_ATTENTION_USE_TRITON_ROCM=${FLASH_ATTENTION_USE_TRITON_ROCM} + +# Build ROCm flash-attention wheel if `BUILD_FA` is set to 1 +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/workspace/build \ + if [ "$BUILD_FA" = "1" ]; then \ + if [ "$TRY_FA_WHEEL" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \ + # If a suitable wheel exists, download it instead of building FA + mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \ + else \ + mkdir -p /libs && \ + cd /libs && \ + git clone https://github.com/ROCm/flash-attention.git && \ + cd flash-attention && \ + git checkout ${FA_BRANCH} && \ + git submodule update --init && \ + uv pip install cmake ninja packaging && \ + env \ + GPU_ARCHS="${FA_GFX_ARCHS}" \ + BUILD_TARGET="rocm" \ + python3 setup.py bdist_wheel --dist-dir=/install; \ + fi; \ + else \ + # Create an empty directory otherwise AS later build stages expect one + mkdir -p /install; \ + fi + +################################################################################################## + +FROM rocm_devel AS build_vllm +ARG PYTORCH_ROCM_ARCH +ARG MAX_JOBS +ENV MAX_JOBS=${MAX_JOBS} +ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} + + +COPY . . + + +ENV VLLM_TARGET_DEVICE="rocm" +ENV MAX_JOBS=${MAX_JOBS} +# Make sure punica kernels are built (for LoRA) +ENV VLLM_INSTALL_PUNICA_KERNELS=1 + +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=/root/.cache/uv \ + env CFLAGS="-march=haswell" \ + CXXFLAGS="$CFLAGS $CXXFLAGS" \ + CMAKE_BUILD_TYPE=Release \ + uv pip install -v -U \ + ninja setuptools-scm>=8 "cmake>=3.26" packaging && \ + python3 setup.py bdist_wheel --dist-dir=dist + +#################### libsodium Build IMAGE #################### +FROM rocm_base as libsodium-builder + +RUN microdnf install -y gcc gzip tar \ + && microdnf clean all + +WORKDIR /usr/src/libsodium + +ARG LIBSODIUM_VERSION=1.0.20 +RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \ + && tar -xzvf libsodium*.tar.gz \ + && rm -f libsodium*.tar.gz \ + && mv libsodium*/* ./ + +RUN CFLAGS="-O3 -Wall -Werror=format-security -Wno-unused-function -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection"\ ./configure --prefix="/usr/" --libdir=/usr/lib64 && make -j $MAX_JOBS && make check + +################################################################################################## + +FROM rocm_base AS vllm-openai +ARG MAX_JOBS + +WORKDIR /workspace + +ENV VIRTUAL_ENV=/opt/vllm +ENV PATH=$VIRTUAL_ENV/bin:$PATH + +# Required for triton +RUN microdnf install -y --setopt=install_weak_deps=0 --nodocs gcc rsync && \ + microdnf clean all + +# Install libsodium for Tensorizer encryption +RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \ + cd /usr/src/libsodium \ + && make install + +RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install/amdsmi/ \ + --mount=type=bind,from=build_flashattention,src=/install,target=/install/flashattention \ + --mount=type=bind,from=build_vllm,src=/workspace/dist,target=/install/vllm/ \ + --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=/root/.cache/uv \ + uv pip install -v \ + --index-strategy=unsafe-best-match \ + --extra-index-url "https://download.pytorch.org/whl/nightly/rocm6.1" \ + /install/amdsmi/*.whl\ + /install/flashattention/*.whl\ + /install/vllm/*.whl + +# Set up a non-root user for OpenShift +RUN umask 002 && \ + useradd --uid 2000 --gid 0 vllm && \ + mkdir -p /licenses && \ + chmod g+rwx $HOME /usr/src /workspace && \ + chmod 0775 /var/log/rocm_smi_lib && \ + chmod 0664 /var/log/rocm_smi_lib/* + +COPY LICENSE /licenses/vllm.md +COPY examples/*.jinja /app/data/template/ + +ENV HF_HUB_OFFLINE=1 \ + PORT=8000 \ + HOME=/home/vllm \ + # Allow requested max length to exceed what is extracted from the + # config.json + # see: https://github.com/vllm-project/vllm/pull/7080 + VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \ + VLLM_USAGE_SOURCE=production-docker-image \ + VLLM_WORKER_MULTIPROC_METHOD=fork \ + VLLM_NO_USAGE_STATS=1 \ + # Silences the HF Tokenizers warning + TOKENIZERS_PARALLELISM=false \ + RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 \ + FLASH_ATTENTION_USE_TRITON_ROCM="TRUE" \ + OUTLINES_CACHE_DIR=/tmp/outlines \ + NUMBA_CACHE_DIR=/tmp/numba \ + TRITON_CACHE_DIR=/tmp/triton \ + LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/vllm/lib/python3.12/site-packages/pillow.libs/:/opt/vllm/lib/python3.12/site-packages/numpy.libs/" + +# Switch to the non-root user +USER 2000 + +# Set the entrypoint +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] + + +FROM vllm-openai as vllm-grpc-adapter USER root # Copy source code changes into the installed location to overwrite the installed python code -COPY vllm /opt/vllm/lib64/python3.12/site-packages/vllm +# COPY vllm /opt/vllm/lib64/python3.12/site-packages/vllm # RUN --mount=type=cache,target=/root/.cache/pip \ -# pip install vllm-tgis-adapter==0.5.1 +# --mount=type=bind,from=build_vllm,src=/workspace/dist,target=/install/vllm/ \ +# uv pip install /install/vllm/*.whl vllm-tgis-adapter==0.5.3 + RUN --mount=type=cache,target=/root/.cache/pip \ pip install git+https://github.com/opendatahub-io/vllm-tgis-adapter@ibm-20241024-adapter @@ -20,4 +258,4 @@ ENV GRPC_PORT=8033 \ DISABLE_LOGPROBS_DURING_SPEC_DECODING=false USER 2000 -ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"] +ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"] \ No newline at end of file