
Docker image improvements (#276)
1. Allowed generating the Docker image for both nightly and release builds
2. Removed the build stage; wheels are now installed directly from the nm PyPI index
3. Updated the workflow files to automate Docker image generation

---------

Co-authored-by: dhuangnm <dhuang@MacBook-Pro-2.local>
Co-authored-by: dhuangnm <dhuang@ip-192-168-198-30.ec2.internal>
3 people authored Jun 3, 2024
1 parent fec3563 commit bd14cbd
Showing 4 changed files with 129 additions and 77 deletions.
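
The images produced by this pipeline wrap vLLM's OpenAI-compatible server. As a rough sketch of how a published image could be run (the tag and model name below are hypothetical examples, and --gpus all assumes the NVIDIA Container Toolkit configured by the action added in this commit):

# hypothetical tag and model; GPU access requires the NVIDIA Container Toolkit
docker run --gpus all -p 8000:8000 \
  ghcr.io/neuralmagic/nm-vllm-openai:nightly-20240603 \
  --model facebook/opt-125m
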
33 changes: 33 additions & 0 deletions .github/actions/nm-build-docker/action.yml
@@ -0,0 +1,33 @@
name: Build docker image
description: 'build docker image for nm-vllm'
inputs:
  docker_tag:
    description: "tag to be used for the docker image"
    type: string
    required: true
  build_type:
    description: "type of nm-vllm to install for the docker image: nightly (default) or release"
    type: string
    default: 'nightly'
  build_version:
    description: "version of nm-vllm to install for the docker image: latest (default) or specific version e.g. 0.4.0, 0.4.0.20240531"
    type: string
    default: 'latest'
runs:
  using: composite
  steps:
    - run: |
        # clean up
        docker stop $(docker ps -a -q) || echo 'no container to stop'
        docker rm $(docker ps -a -q) || echo 'no container to remove'
        docker rmi -f $(docker images -aq) || echo 'no image to remove'
        docker system prune --all --force
        # build
        docker build --tag ghcr.io/neuralmagic/nm-vllm-openai:${{ inputs.docker_tag }} \
          --build-arg build_type=${{ inputs.build_type }} \
          --build-arg build_version=${{ inputs.build_version }} \
          --target vllm-openai . || status=$?
        echo "status=${status}" >> $GITHUB_OUTPUT
        echo "status=${status}"
        exit ${status}
      shell: bash
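
As an illustration, with the hypothetical inputs docker_tag=nightly-20240603, build_type=nightly, and build_version=latest, the build step above reduces to roughly:

# sketch with inputs substituted by hand; run from the repository root
docker build --tag ghcr.io/neuralmagic/nm-vllm-openai:nightly-20240603 \
  --build-arg build_type=nightly \
  --build-arg build_version=latest \
  --target vllm-openai .
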
20 changes: 20 additions & 0 deletions .github/actions/nm-setup-nvidia-container-toolkit/action.yml
@@ -0,0 +1,20 @@
name: set up nvidia-container-toolkit for docker
description: 'sets up nvidia-container-toolkit for docker'
runs:
  using: composite
  steps:
    - run: |
        # install nvidia-container-toolkit
        curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
          && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
            sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
            sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
        sudo sed -i -e '/experimental/ s/^#//g' /etc/apt/sources.list.d/nvidia-container-toolkit.list
        sudo killall apt apt-get || echo 'no apt or apt-get process to kill'
        sudo apt-get update
        sudo apt-get install -y nvidia-container-toolkit
        # config and restart docker
        sudo systemctl stop docker
        sudo nvidia-ctk runtime configure --runtime=docker
        sudo systemctl start docker
      shell: bash
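
A quick way to sanity-check the result of this action (not part of the workflow itself, and the CUDA base image tag here is an assumption) is to confirm that Docker can now hand a GPU to a container:

# assumes an NVIDIA GPU on the runner and this CUDA base image tag
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
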
56 changes: 36 additions & 20 deletions .github/workflows/publish-docker.yml
@@ -1,26 +1,36 @@
name: Docker Build + Publish

on:
  # For now, just manually trigger
  # push:
  #   branches:
  #     - main
  # pull_request:
  #   branches:
  #     - main
  workflow_dispatch:
    inputs:
      docker_tag:
        description: "tag to be used for the docker image"
        type: string
        required: true
      push_to_repository:
        description: "whether to push out the docker image: no (default) or yes"
        type: string
        default: 'no'
      gitref:
        description: "git commit hash or branch name"
        type: string
        default: 'main'
      build_type:
        description: "type of nm-vllm to install for the docker image: nightly (default) or release"
        type: string
        default: 'nightly'
      build_version:
        description: "version of nm-vllm to install for the docker image: latest (default) or specific version e.g. 0.4.0, 0.4.0.20240531"
        type: string
        default: 'latest'

jobs:
  build-docker-image:

    runs-on: aws-avx2-192G-4-a10g-96G
    timeout-minutes: 240
    runs-on: aws-avx2-32G-a10g-24G
    timeout-minutes: 60

    steps:

      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Github Packages
        uses: docker/login-action@v3
@@ -35,17 +45,23 @@ jobs:
          fetch-depth: 1
          submodules: recursive

      - name: Get version tag
        id: extract_tag
        run: echo "tag=$(date +%Y%m%d)" >> $GITHUB_OUTPUT
      - name: Set up nvidia-container-toolkit
        id: setup
        uses: ./.github/actions/nm-setup-nvidia-container-toolkit/

      - name: Current Version Name
        run: echo ${{ steps.extract_tag.outputs.tag }}
      - name: Build image
        id: build
        uses: ./.github/actions/nm-build-docker/
        with:
          docker_tag: ${{ inputs.docker_tag }}
          build_type: ${{ inputs.build_type }}
          build_version: ${{ inputs.build_version }}

      - name: nm-vllm latest
      - name: Push image
        uses: docker/build-push-action@v5
        if: ${{ inputs.push_to_repository == 'yes' && steps.build.outputs.status == 0 }}
        with:
          context: .
          target: vllm-openai
          push: true
          tags: ghcr.io/neuralmagic/nm-vllm-openai:${{ steps.extract_tag.outputs.tag }}
          tags: ghcr.io/neuralmagic/nm-vllm-openai:${{ inputs.docker_tag }}
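
Because the workflow is manual-only (workflow_dispatch), a run could be kicked off from the GitHub CLI roughly as follows; the input values are examples only:

# requires an authenticated gh CLI with access to the repository
gh workflow run publish-docker.yml \
  -f docker_tag=nightly-20240603 \
  -f push_to_repository=no \
  -f gitref=main \
  -f build_type=nightly \
  -f build_version=latest
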
97 changes: 40 additions & 57 deletions Dockerfile
@@ -31,61 +31,29 @@ COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt

# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################


#################### WHEEL BUILD IMAGE ####################
FROM dev AS build

# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-build.txt

# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache

# files and directories related to build wheels
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm vllm

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
python3 setup.py bdist_wheel --dist-dir=dist
#################### EXTENSION Build IMAGE ####################

# check the size of the wheel, we cannot upload wheels larger than 100MB
COPY .buildkite/check-wheel-size.py check-wheel-size.py
RUN python3 check-wheel-size.py dist
#################### FLASH_ATTENTION Build IMAGE ####################
FROM dev as flash-attn-builder
# flash attention version
ARG flash_attn_version=v2.5.8
ENV FLASH_ATTN_VERSION=${flash_attn_version}

# the `vllm_nccl` package must be installed from source distribution
# pip is too smart to store a wheel in the cache, and other CI jobs
# will directly use the wheel from the cache, which is not what we want.
# we need to remove it manually
RUN --mount=type=cache,target=/root/.cache/pip \
pip cache remove vllm_nccl*
#################### EXTENSION Build IMAGE ####################
WORKDIR /usr/src/flash-attention-v2

# Download the wheel or build it if a pre-compiled release doesn't exist
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
--no-build-isolation --no-deps --no-cache-dir

#################### FLASH_ATTENTION Build IMAGE ####################

#################### vLLM installation IMAGE ####################
# image with vLLM installed
@@ -101,28 +69,43 @@ RUN apt-get update -y \
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.4/compat/

# UPSTREAM SYNC: Install sparsity extras
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
# install nm-vllm wheel first, so that torch etc will be installed
ARG build_type="nightly"
ARG build_version="latest"
ENV INSTALL_TYPE=${build_type}
ENV INSTALL_VERSION=${build_version}
# UPSTREAM SYNC: Install nm-vllm with sparsity extras
# use nm pypi for now for testing
RUN --mount=type=bind,from=build \
--mount=type=cache,target=/root/.cache/pip \
pip install nm-magic-wand-nightly --extra-index-url https://pypi.neuralmagic.com/simple

# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
if [ "${INSTALL_TYPE}" = "nightly" ]; then \
if [ "${INSTALL_VERSION}" = "latest" ]; then \
pip install nm-vllm-nightly[sparse] --extra-index-url https://pypi.neuralmagic.com/simple; \
else \
pip install nm-vllm-nightly[sparse]==${INSTALL_VERSION} --extra-index-url https://pypi.neuralmagic.com/simple; \
fi; \
else \
if [ "${INSTALL_VERSION}" = "latest" ]; then \
pip install nm-vllm[sparse] --extra-index-url https://pypi.neuralmagic.com/simple; \
else \
pip install nm-vllm[sparse]==${INSTALL_VERSION} --extra-index-url https://pypi.neuralmagic.com/simple; \
fi; \
fi

RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
--mount=type=cache,target=/root/.cache/pip \
pip install dist/*.whl --verbose
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
#################### vLLM installation IMAGE ####################


#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
FROM vllm-base AS test

ADD . /vllm-workspace/

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
# check installed version
RUN pip freeze | grep -e nm-vllm -e nm-magic-wand

# doc requires source code
# we hide them inside `test_docs/` , so that this source code
@@ -144,4 +127,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \
ENV VLLM_USAGE_SOURCE production-docker-image

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
#################### OPENAI API SERVER ####################
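
With a container from this image running, the OpenAI-compatible endpoints exposed by the entrypoint can be exercised directly; a minimal sketch, assuming the server is reachable on localhost:8000 and serving a hypothetical model:

# list the models the server is serving
curl http://localhost:8000/v1/models
# request a completion (model name is an example)
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "facebook/opt-125m", "prompt": "Hello", "max_tokens": 16}'
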
