Commit ce84349

rocm compiling

Signed-off-by: Jefferson Fialho <[email protected]>
fialhocoelho committed Oct 31, 2024
1 parent 26564a0 commit ce84349

Showing 1 changed file with 243 additions and 5 deletions.

Dockerfile.ubi
@@ -1,13 +1,251 @@
# Start from released image
FROM quay.io/opendatahub/vllm:cuda-pr-212 as vllm-grpc-adapter
## Global Args ##################################################################
ARG BASE_UBI_IMAGE_TAG=9.4
ARG PYTHON_VERSION=3.12
# Default ROCm ARCHes to build vLLM for.
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
ARG MAX_JOBS=12
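# The global args above can be overridden at build time. An illustrative
# (not prescriptive) invocation that targets only MI200/MI300-class GPUs
# and builds the OpenAI server image might look like:
#   docker build -f Dockerfile.ubi \
#     --build-arg PYTORCH_ROCM_ARCH="gfx90a;gfx942" \
#     --build-arg MAX_JOBS=16 \
#     --target vllm-openai -t vllm-rocm-openai .
# The vllm-rocm-openai tag is a placeholder, not part of this commit.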

FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base

ARG PYTHON_VERSION

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

RUN --mount=type=cache,target=/root/.cache/pip \
microdnf -y update && \
microdnf install -y --setopt=install_weak_deps=0 --nodocs \
python${PYTHON_VERSION}-devel \
python${PYTHON_VERSION}-pip \
python${PYTHON_VERSION}-wheel && \
python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
pip install -U pip wheel setuptools uv


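# rocm_base: layers the ROCm ${ROCM_VERSION} runtime and nightly ROCm builds
# of torch/torchvision on top of the Python base image.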
FROM base AS rocm_base
ENV ROCM_VERSION=6.1.2

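# Register AMD's amdgpu and ROCm dnf repositories so the HIP runtime
# packages can be installed from repo.radeon.com.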
RUN printf "[amdgpu]\n\
name=amdgpu\n\
baseurl=https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/9.4/main/x86_64/\n\
enabled=1\n\
priority=50\n\
gpgcheck=1\n\
gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key\n\
[ROCm-${ROCM_VERSION}]\n\
name=ROCm${ROCM_VERSION}\n\
baseurl=https://repo.radeon.com/rocm/rhel9/${ROCM_VERSION}/main\n\
enabled=1\n\
priority=50\n\
gpgcheck=1\n\
gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key" > /etc/yum.repos.d/amdgpu.repo


RUN microdnf -y install \
rocm-hip-libraries rocm-hip-runtime \
miopen-hip && \
microdnf clean all

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
uv pip install -v --index-url "https://download.pytorch.org/whl/nightly/rocm6.1" \
torch==2.5.0.dev20240912+rocm6.1 \
torchvision==0.20.0.dev20240912+rocm6.1

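# rocm_devel: adds EPEL and the toolchain (ccache, git, hipcc, rocm) needed
# to compile the wheels built in the stages below.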
FROM rocm_base AS rocm_devel

ENV CCACHE_DIR=/root/.cache/ccache

RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
rpm -ql epel-release && \
microdnf -y update && \
microdnf -y install \
ccache \
git \
rocm \
hipcc \
wget \
which && \
microdnf clean all

WORKDIR /workspace

ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include:/opt/rocm/include


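# build_amdsmi: the amd-smi Python package sources ship with ROCm under
# /opt/rocm/share/amd_smi; package them into a wheel for the runtime stage.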
FROM rocm_devel AS build_amdsmi

# Build AMD SMI wheel
RUN cd /opt/rocm/share/amd_smi && \
python3 -m pip wheel . --wheel-dir=/install

##################################################################################################

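# build_flashattention: place a flash-attention wheel in /install, either by
# downloading the prebuilt wheel (TRY_FA_WHEEL) or by building the ROCm fork
# at FA_BRANCH; when BUILD_FA=0 only an empty /install is created.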
FROM rocm_devel AS build_flashattention

# Whether to install CK-based flash-attention
ARG BUILD_FA="1"
ARG TRY_FA_WHEEL="1"
# Note: the ROCm fork publishes a pre-built wheel, but only for flash-attn 2.5.9 and Python 3.9, so it is incompatible with the current (Python 3.12) build
ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
# only required when not using triton backend
ARG FA_GFX_ARCHS="gfx90a;gfx942"
ARG FLASH_ATTENTION_USE_TRITON_ROCM="TRUE"
# FA_BRANCH is the main_perf branch as of Sep 4 2024 which includes triton backend support, see https://github.com/Dao-AILab/flash-attention/pull/1203
ARG FA_BRANCH="75b5360"
ARG MAX_JOBS
ENV MAX_JOBS=${MAX_JOBS}
ENV FLASH_ATTENTION_USE_TRITON_ROCM=${FLASH_ATTENTION_USE_TRITON_ROCM}

# Build ROCm flash-attention wheel if `BUILD_FA` is set to 1
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/workspace/build \
if [ "$BUILD_FA" = "1" ]; then \
if [ "$TRY_FA_WHEEL" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
# If a suitable wheel exists, download it instead of building FA
mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
else \
mkdir -p /libs && \
cd /libs && \
git clone https://github.com/ROCm/flash-attention.git && \
cd flash-attention && \
git checkout ${FA_BRANCH} && \
git submodule update --init && \
uv pip install cmake ninja packaging && \
env \
GPU_ARCHS="${FA_GFX_ARCHS}" \
BUILD_TARGET="rocm" \
python3 setup.py bdist_wheel --dist-dir=/install; \
fi; \
else \
# Create an empty directory; later build stages expect one to exist
mkdir -p /install; \
fi

##################################################################################################

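# build_vllm: compile vLLM itself for the ROCm ARCHes listed in
# PYTORCH_ROCM_ARCH and produce a wheel under dist/.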
FROM rocm_devel AS build_vllm
ARG PYTORCH_ROCM_ARCH
ARG MAX_JOBS
ENV MAX_JOBS=${MAX_JOBS}
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}


COPY . .


ENV VLLM_TARGET_DEVICE="rocm"
# Make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
env CFLAGS="-march=haswell" \
CXXFLAGS="$CFLAGS $CXXFLAGS" \
CMAKE_BUILD_TYPE=Release \
uv pip install -v -U \
ninja "setuptools-scm>=8" "cmake>=3.26" packaging && \
python3 setup.py bdist_wheel --dist-dir=dist

#################### libsodium Build IMAGE ####################
FROM rocm_base AS libsodium-builder

RUN microdnf install -y gcc gzip tar \
&& microdnf clean all

WORKDIR /usr/src/libsodium

ARG LIBSODIUM_VERSION=1.0.20
RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
&& tar -xzvf libsodium*.tar.gz \
&& rm -f libsodium*.tar.gz \
&& mv libsodium*/* ./

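# Configure with hardened compiler flags, then build and run the test suite.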
ARG MAX_JOBS
RUN CFLAGS="-O3 -Wall -Werror=format-security -Wno-unused-function -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection" \
    ./configure --prefix="/usr/" --libdir=/usr/lib64 && \
    make -j $MAX_JOBS && \
    make check

##################################################################################################

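# vllm-openai: final runtime image; installs the wheels produced by the
# build stages into the /opt/vllm venv and runs as a non-root user.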
FROM rocm_base AS vllm-openai
ARG MAX_JOBS

WORKDIR /workspace

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH=$VIRTUAL_ENV/bin:$PATH

# Required for triton
RUN microdnf install -y --setopt=install_weak_deps=0 --nodocs gcc rsync && \
microdnf clean all

# Install libsodium for Tensorizer encryption
RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
cd /usr/src/libsodium \
&& make install

RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install/amdsmi/ \
--mount=type=bind,from=build_flashattention,src=/install,target=/install/flashattention \
--mount=type=bind,from=build_vllm,src=/workspace/dist,target=/install/vllm/ \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
uv pip install -v \
--index-strategy=unsafe-best-match \
--extra-index-url "https://download.pytorch.org/whl/nightly/rocm6.1" \
/install/amdsmi/*.whl \
/install/flashattention/*.whl \
/install/vllm/*.whl

# Set up a non-root user for OpenShift
RUN umask 002 && \
useradd --uid 2000 --gid 0 vllm && \
mkdir -p /licenses && \
chmod g+rwx $HOME /usr/src /workspace && \
chmod 0775 /var/log/rocm_smi_lib && \
chmod 0664 /var/log/rocm_smi_lib/*

COPY LICENSE /licenses/vllm.md
COPY examples/*.jinja /app/data/template/

ENV HF_HUB_OFFLINE=1 \
PORT=8000 \
HOME=/home/vllm \
# Allow requested max length to exceed what is extracted from the
# config.json
# see: https://github.com/vllm-project/vllm/pull/7080
VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
VLLM_USAGE_SOURCE=production-docker-image \
VLLM_WORKER_MULTIPROC_METHOD=fork \
VLLM_NO_USAGE_STATS=1 \
# Silences the HF Tokenizers warning
TOKENIZERS_PARALLELISM=false \
RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 \
FLASH_ATTENTION_USE_TRITON_ROCM="TRUE" \
OUTLINES_CACHE_DIR=/tmp/outlines \
NUMBA_CACHE_DIR=/tmp/numba \
TRITON_CACHE_DIR=/tmp/triton \
LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/vllm/lib/python3.12/site-packages/pillow.libs/:/opt/vllm/lib/python3.12/site-packages/numpy.libs/"

# Switch to the non-root user
USER 2000

# Set the entrypoint
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]


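# vllm-grpc-adapter: layers the vllm-tgis-adapter gRPC front end on top of
# the OpenAI server image.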
FROM vllm-openai AS vllm-grpc-adapter

USER root

# Copy source code changes into the installed location to overwrite the installed python code
COPY vllm /opt/vllm/lib64/python3.12/site-packages/vllm

# RUN --mount=type=cache,target=/root/.cache/pip \
# pip install vllm-tgis-adapter==0.5.1
# --mount=type=bind,from=build_vllm,src=/workspace/dist,target=/install/vllm/ \
# uv pip install /install/vllm/*.whl vllm-tgis-adapter==0.5.3

RUN --mount=type=cache,target=/root/.cache/pip \
pip install git+https://github.com/opendatahub-io/vllm-tgis-adapter@ibm-20241024-adapter

@@ -20,4 +258,4 @@
ENV GRPC_PORT=8033 \
    DISABLE_LOGPROBS_DURING_SPEC_DECODING=false

USER 2000
ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"]
