20 changes: 20 additions & 0 deletions Makefile
@@ -0,0 +1,20 @@

NPROCS := $(shell nproc)
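# The image tag combines the base version from vllm/version.py with the short git SHA of the current checkout.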
BASE_VERSION := $(shell grep -oP '(?<=__version__ = ")[^"]*' vllm/version.py)

ifeq ($(BASE_VERSION),)
$(error Failed to extract version number from vllm/version.py)
endif

VERSION := $(BASE_VERSION)-$(shell git rev-parse --short HEAD)
DOCKER_TEST_IMAGE_VLLM := harbor.h2o.ai/h2ogpt/test-image-vllm:$(VERSION)

VLLM_CUDA_VERSION ?= 12.1.0
VLLM_BASE_IMAGE ?= 353750902984.dkr.ecr.us-east-1.amazonaws.com/h2ogpt-vllm-wolfi-base:2

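# Build the vLLM test image, pre-pulling the base images so buildx can reuse local copies.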
docker_build:
	docker pull $(VLLM_BASE_IMAGE)
	docker pull nvidia/cuda:$(VLLM_CUDA_VERSION)-devel-ubuntu22.04
	docker buildx build --load --build-arg max_jobs=$(NPROCS) --build-arg PYTHON_VERSION=3.10 --build-arg CUDA_VERSION=$(VLLM_CUDA_VERSION) --build-arg WOLFI_OS_BASE_IMAGE=$(VLLM_BASE_IMAGE) --tag $(DOCKER_TEST_IMAGE_VLLM) --file docker/Dockerfile .

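# Retag the locally built test image and push it to the ECR registry.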
docker_push:
	docker tag $(DOCKER_TEST_IMAGE_VLLM) 353750902984.dkr.ecr.us-east-1.amazonaws.com/h2ogpte-vllm:$(VERSION)
	docker push 353750902984.dkr.ecr.us-east-1.amazonaws.com/h2ogpte-vllm:$(VERSION)
92 changes: 21 additions & 71 deletions docker/Dockerfile
@@ -2,8 +2,9 @@
# to run the OpenAI compatible server.

# Please update any changes made here to
# docs/source/contributing/dockerfile/dockerfile.md and
# docs/source/assets/contributing/dockerfile-stages-dependency.png
# docs/source/dev/dockerfile/dockerfile.rst and
# docs/source/assets/dev/dockerfile-stages-dependency.png
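# Wolfi OS base image for the runtime stage; overridden from the Makefile via --build-arg WOLFI_OS_BASE_IMAGE.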
ARG WOLFI_OS_BASE_IMAGE=none

ARG CUDA_VERSION=12.4.1
#################### BASE BUILD IMAGE ####################
@@ -98,6 +99,9 @@ ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi

# set version explicitly
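# SETUPTOOLS_SCM_PRETEND_VERSION overrides the version setuptools-scm would otherwise derive from git metadata.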
ENV SETUPTOOLS_SCM_PRETEND_VERSION=0.8.4

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
@@ -141,8 +145,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \

# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=400
# Default max size of the wheel is 550MB
ARG VLLM_MAX_SIZE_MB=550
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
@@ -168,31 +172,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \

#################### vLLM installation IMAGE ####################
# image with vLLM installed
# TODO: Restore to base image after FlashInfer AOT wheel fixed
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
FROM ${WOLFI_OS_BASE_IMAGE} AS vllm-base
ARG WOLFI_OS_BASE_IMAGE=none
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM

RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
USER root

WORKDIR /workspace

# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
@@ -205,7 +192,8 @@ ENV UV_HTTP_TIMEOUT=500
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
#RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/lib64/stubs/
#RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/lib64/
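# NOTE: the CUDA compat ldconfig step above is left disabled; it is assumed to be unnecessary on the Wolfi base image.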

# arm64 (GH200) build follows the practice of "use existing pytorch" build,
# we need to install torch and torchvision from the nightly builds first,
@@ -251,41 +239,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \

#################### vLLM installation IMAGE ####################

#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
FROM vllm-base AS test

ADD . /vllm-workspace/

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/dev.txt

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils

# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1

# Copy in the v1 package for testing (it isn't distributed yet)
COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1

# doc requires source code
# we hide them inside `test_docs/` , so that this source code
# will not be imported by other tests
RUN mkdir test_docs
RUN mv docs test_docs/
RUN mv vllm test_docs/
#################### TEST IMAGE ####################

#################### OPENAI API SERVER ####################
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
@@ -302,16 +255,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
fi

ENV VLLM_USAGE_SOURCE production-docker-image

# define sagemaker first, so it is not default from `docker build`
FROM vllm-openai-base AS vllm-sagemaker

COPY examples/online_serving/sagemaker-entrypoint.sh .
RUN chmod +x sagemaker-entrypoint.sh
ENTRYPOINT ["./sagemaker-entrypoint.sh"]
ENV VLLM_USAGE_SOURCE=production-docker-image
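# Point vLLM at the NCCL shared library from the pip-installed nvidia-nccl package (path assumes Python 3.10 site-packages).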
ENV VLLM_NCCL_SO_PATH=/usr/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2
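# Give Numba a writable cache directory so JIT compilation works for a non-root runtime user.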
ENV NUMBA_CACHE_DIR=/workspace/numba_cache
RUN mkdir -p ${NUMBA_CACHE_DIR}
RUN chmod -R a+rwx /workspace

FROM vllm-openai-base AS vllm-openai
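# Run the API server as the non-root h2ogpt user (assumed to be provided by the Wolfi base image).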
USER h2ogpt

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################