20 changes: 20 additions & 0 deletions Makefile
@@ -0,0 +1,20 @@

NPROCS := $(shell nproc)
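# The image tag combines the base version from vllm/version.py with the short git SHA of the current checkout.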
BASE_VERSION := $(shell grep -oP '(?<=__version__ = ")[^"]*' vllm/version.py)

ifeq ($(BASE_VERSION),)
$(error Failed to extract version number from vllm/version.py)
endif

VERSION := $(BASE_VERSION)-$(shell git rev-parse --short HEAD)
DOCKER_TEST_IMAGE_VLLM := harbor.h2o.ai/h2ogpt/test-image-vllm:$(VERSION)

VLLM_CUDA_VERSION ?= 12.1.0
VLLM_BASE_IMAGE ?= 353750902984.dkr.ecr.us-east-1.amazonaws.com/h2ogpt-vllm-wolfi-base:2

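# Build the vLLM test image, pre-pulling the base images so buildx can reuse local copies.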
docker_build:
	docker pull $(VLLM_BASE_IMAGE)
	docker pull nvidia/cuda:$(VLLM_CUDA_VERSION)-devel-ubuntu22.04
	docker buildx build --load --build-arg max_jobs=$(NPROCS) --build-arg PYTHON_VERSION=3.10 --build-arg CUDA_VERSION=$(VLLM_CUDA_VERSION) --build-arg WOLFI_OS_BASE_IMAGE=$(VLLM_BASE_IMAGE) --tag $(DOCKER_TEST_IMAGE_VLLM) --file docker/Dockerfile .

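# Retag the locally built test image and push it to the ECR registry.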
docker_push:
	docker tag $(DOCKER_TEST_IMAGE_VLLM) 353750902984.dkr.ecr.us-east-1.amazonaws.com/h2ogpte-vllm:$(VERSION)
	docker push 353750902984.dkr.ecr.us-east-1.amazonaws.com/h2ogpte-vllm:$(VERSION)
92 changes: 21 additions & 71 deletions docker/Dockerfile
@@ -2,8 +2,9 @@
# to run the OpenAI compatible server.

# Please update any changes made here to
# docs/source/contributing/dockerfile/dockerfile.md and
# docs/source/assets/contributing/dockerfile-stages-dependency.png
# docs/source/dev/dockerfile/dockerfile.rst and
# docs/source/assets/dev/dockerfile-stages-dependency.png
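# Wolfi OS base image for the runtime stage; overridden from the Makefile via --build-arg WOLFI_OS_BASE_IMAGE.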
ARG WOLFI_OS_BASE_IMAGE=none

ARG CUDA_VERSION=12.4.1
#################### BASE BUILD IMAGE ####################
@@ -98,6 +99,9 @@ ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi

# set version explicitly
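# SETUPTOOLS_SCM_PRETEND_VERSION overrides the version setuptools-scm would otherwise derive from git metadata.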
ENV SETUPTOOLS_SCM_PRETEND_VERSION=0.8.4

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
@@ -141,8 +145,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \

# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=400
# Default max size of the wheel is 550MB
ARG VLLM_MAX_SIZE_MB=550
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
@@ -168,31 +172,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \

#################### vLLM installation IMAGE ####################
# image with vLLM installed
# TODO: Restore to base image after FlashInfer AOT wheel fixed
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
FROM ${WOLFI_OS_BASE_IMAGE} AS vllm-base
ARG WOLFI_OS_BASE_IMAGE=none
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM

RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
USER root

WORKDIR /workspace

# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
@@ -205,7 +192,8 @@ ENV UV_HTTP_TIMEOUT=500
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
#RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/lib64/stubs/
#RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/lib64/
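# NOTE: the CUDA compat ldconfig step above is left disabled; it is assumed to be unnecessary on the Wolfi base image.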

# arm64 (GH200) build follows the practice of "use existing pytorch" build,
# we need to install torch and torchvision from the nightly builds first,
@@ -251,41 +239,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \

#################### vLLM installation IMAGE ####################

#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
FROM vllm-base AS test

ADD . /vllm-workspace/

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/dev.txt

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils

# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1

# Copy in the v1 package for testing (it isn't distributed yet)
COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1

# doc requires source code
# we hide them inside `test_docs/` , so that this source code
# will not be imported by other tests
RUN mkdir test_docs
RUN mv docs test_docs/
RUN mv vllm test_docs/
#################### TEST IMAGE ####################

#################### OPENAI API SERVER ####################
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
@@ -302,16 +255,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
fi

ENV VLLM_USAGE_SOURCE production-docker-image

# define sagemaker first, so it is not default from `docker build`
FROM vllm-openai-base AS vllm-sagemaker

COPY examples/online_serving/sagemaker-entrypoint.sh .
RUN chmod +x sagemaker-entrypoint.sh
ENTRYPOINT ["./sagemaker-entrypoint.sh"]
ENV VLLM_USAGE_SOURCE=production-docker-image
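# Point vLLM at the NCCL shared library from the pip-installed nvidia-nccl package (path assumes Python 3.10 site-packages).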
ENV VLLM_NCCL_SO_PATH=/usr/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2
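# Give Numba a writable cache directory so JIT compilation works for a non-root runtime user.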
ENV NUMBA_CACHE_DIR=/workspace/numba_cache
RUN mkdir -p ${NUMBA_CACHE_DIR}
RUN chmod -R a+rwx /workspace

FROM vllm-openai-base AS vllm-openai
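# Run the API server as the non-root h2ogpt user (assumed to be provided by the Wolfi base image).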
USER h2ogpt

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################