From dd6c2c4607bb90a639e59ce68f2c93953124b0b4 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Fri, 11 Oct 2024 18:24:49 -0300 Subject: [PATCH] two stage build Signed-off-by: Max de Bayser --- Dockerfile.ubi | 205 ++----------------------------------------------- 1 file changed, 5 insertions(+), 200 deletions(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 4fc1915feb2c0..e3cd994c1ba42 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -1,206 +1,11 @@ -## Global Args ################################################################# -ARG BASE_UBI_IMAGE_TAG=9.4 -ARG PYTHON_VERSION=3.12 - -ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" -ARG vllm_fa_cmake_gpu_arches='80-real;90-real' - -## Base Layer ################################################################## -FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base -ARG PYTHON_VERSION -ENV PYTHON_VERSION=${PYTHON_VERSION} -RUN microdnf -y update && microdnf install -y \ - python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \ - && microdnf clean all - -WORKDIR /workspace - -ENV LANG=C.UTF-8 \ - LC_ALL=C.UTF-8 - -# Some utils for dev purposes - tar required for kubectl cp -RUN microdnf install -y \ - which procps findutils tar vim git\ - && microdnf clean all - - -## Python Installer ############################################################ -FROM base as python-install -ARG PYTHON_VERSION - -ENV VIRTUAL_ENV=/opt/vllm -ENV PATH="$VIRTUAL_ENV/bin:$PATH" -ENV PYTHON_VERSION=${PYTHON_VERSION} -RUN microdnf install -y \ - python${PYTHON_VERSION}-devel && \ - python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all - - -## CUDA Base ################################################################### -FROM python-install as cuda-base - -RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \ - https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo - -RUN microdnf install -y \ - cuda-nvcc-12-4 
cuda-nvtx-12-4 cuda-libraries-devel-12-4 && \ - microdnf clean all - -ENV CUDA_HOME="/usr/local/cuda" \ - PATH="${CUDA_HOME}/bin:${PATH}" \ - LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}" - -## Python cuda base ################################################################# -FROM cuda-base AS python-cuda-base - -ENV VIRTUAL_ENV=/opt/vllm -ENV PATH="$VIRTUAL_ENV/bin:$PATH" - -# install cuda and common dependencies -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=cache,target=/root/.cache/uv \ - --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \ - --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \ - uv pip install \ - -r requirements-cuda.txt - - -## Development ################################################################# -FROM python-cuda-base AS dev - -# install build and runtime dependencies -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=cache,target=/root/.cache/uv \ - --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \ - --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \ - --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \ - --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \ - --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \ - uv pip install \ - -r requirements-cuda.txt \ - -r requirements-dev.txt - -## Builder ##################################################################### -FROM dev AS build - -# install build dependencies -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=cache,target=/root/.cache/uv \ - --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \ - uv pip install -r requirements-build.txt - -# install compiler cache to speed up compilation leveraging local or remote caching -# git is required for the cutlass kernels -RUN rpm -ivh 
https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y git ccache && microdnf clean all - -COPY . . - -ARG TORCH_CUDA_ARCH_LIST -ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST -ARG vllm_fa_cmake_gpu_arches -ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches} - -# max jobs used by Ninja to build extensions -ARG max_jobs=2 -ENV MAX_JOBS=${max_jobs} -# number of threads used by nvcc -ARG nvcc_threads=8 -ENV NVCC_THREADS=$nvcc_threads -# make sure punica kernels are built (for LoRA) -ENV VLLM_INSTALL_PUNICA_KERNELS=1 - -# Make sure the cuda environment is in the PATH -ENV PATH=/usr/local/cuda/bin:$PATH - -ENV CCACHE_DIR=/root/.cache/ccache -RUN --mount=type=cache,target=/root/.cache/ccache \ - --mount=type=cache,target=/root/.cache/pip \ - --mount=type=cache,target=/root/.cache/uv \ - --mount=type=bind,src=.git,target=/workspace/.git \ - env CFLAGS="-march=haswell" \ - CXXFLAGS="$CFLAGS $CXXFLAGS" \ - CMAKE_BUILD_TYPE=Release \ - python3 setup.py bdist_wheel --dist-dir=dist - -#################### libsodium Build IMAGE #################### -FROM base as libsodium-builder - -RUN microdnf install -y gcc gzip \ - && microdnf clean all - -WORKDIR /usr/src/libsodium - -ARG LIBSODIUM_VERSION=1.0.20 -RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \ - && tar -xzvf libsodium*.tar.gz \ - && rm -f libsodium*.tar.gz \ - && mv libsodium*/* ./ - -RUN CFLAGS="-O3 -Wall -Werror=format-security -Wno-unused-function -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection"\ - ./configure --prefix="/usr/" && make -j $MAX_JOBS && make check - -## Release ##################################################################### -FROM python-install AS vllm-openai -ARG PYTHON_VERSION - -WORKDIR /workspace - -ENV VIRTUAL_ENV=/opt/vllm -ENV PATH=$VIRTUAL_ENV/bin/:$PATH - -# force using the python venv's cuda 
runtime libraries -ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH}" -ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH}" -ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}" - -# Triton needs a CC compiler -RUN microdnf install -y gcc \ - && microdnf clean all - -# install vllm wheel first, so that torch etc will be installed -RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \ - --mount=type=cache,target=/root/.cache/pip \ - --mount=type=cache,target=/root/.cache/uv \ - uv pip install $(echo dist/*.whl)'[tensorizer]' --verbose - -# Install libsodium for Tensorizer encryption -RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \ - cd /usr/src/libsodium \ - && make install - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=cache,target=/root/.cache/uv \ - uv pip install \ - "https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp312-cp312-linux_x86_64.whl" - -ENV HF_HUB_OFFLINE=1 \ - HOME=/home/vllm \ - # Allow requested max length to exceed what is extracted from the - # config.json - # see: https://github.com/vllm-project/vllm/pull/7080 - VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \ - VLLM_USAGE_SOURCE=production-docker-image \ - VLLM_WORKER_MULTIPROC_METHOD=fork \ - VLLM_NO_USAGE_STATS=1 - -# setup non-root user for OpenShift -RUN umask 002 \ - && useradd --uid 2000 --gid 0 vllm \ - && chmod g+rwx $HOME /usr/src /workspace - -COPY LICENSE /licenses/vllm.md - -# Copy only .jinja files from example directory to template directory -COPY examples/*.jinja /app/data/template/ - -USER 2000 -ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] - - -FROM vllm-openai as vllm-grpc-adapter +# Start from released image +FROM 
quay.io/opendatahub/vllm:cuda-pr-196 AS vllm-grpc-adapter USER root +# Copy source code changes into the installed location to overwrite the installed python code +COPY vllm /opt/vllm/lib64/python3.12/site-packages/vllm + # RUN --mount=type=cache,target=/root/.cache/pip \ # pip install vllm-tgis-adapter==0.5.1 RUN --mount=type=cache,target=/root/.cache/pip \