diff --git a/.github/actions/nm-build-docker/action.yml b/.github/actions/nm-build-docker/action.yml
new file mode 100644
index 0000000000000..db128c5d103d2
--- /dev/null
+++ b/.github/actions/nm-build-docker/action.yml
@@ -0,0 +1,33 @@
+name: Build docker image
+description: 'build docker image for nm-vllm'
+inputs:
+  docker_tag:
+    description: "tag to be used for the docker image"
+    type: string
+    required: true
+  build_type:
+    description: "type of nm-vllm to install for the docker image: nightly (default) or release"
+    type: string
+    default: 'nightly'
+  build_version:
+    description: "version of nm-vllm to install for the docker image: latest (default) or specific version e.g. 0.4.0, 0.4.0.20240531"
+    type: string
+    default: 'latest'
+runs:
+  using: composite
+  steps:
+    - run: |
+        # clean up
+        docker stop $(docker ps -a -q) || echo 'no container to stop'
+        docker rm $(docker ps -a -q) || echo 'no container to remove'
+        docker rmi -f $(docker images -aq) || echo 'no image to remove'
+        docker system prune --all --force
+        # build
+        docker build --tag ghcr.io/neuralmagic/nm-vllm-openai:${{ inputs.docker_tag }} \
+            --build-arg build_type=${{ inputs.build_type }} \
+            --build-arg build_version=${{ inputs.build_version }} \
+            --target vllm-openai . || status=$?
+        echo "status=${status}" >> $GITHUB_OUTPUT
+        echo "status=${status}"
+        exit ${status}
+      shell: bash
diff --git a/.github/actions/nm-setup-nvidia-container-toolkit/action.yml b/.github/actions/nm-setup-nvidia-container-toolkit/action.yml
new file mode 100644
index 0000000000000..2e95d43d87827
--- /dev/null
+++ b/.github/actions/nm-setup-nvidia-container-toolkit/action.yml
@@ -0,0 +1,20 @@
+name: set up nvidia-container-toolkit for docker
+description: 'sets up nvidia-container-toolkit for docker'
+runs:
+  using: composite
+  steps:
+    - run: |
+        # install nvidia-container-toolkit
+        curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
+          && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+            sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+            sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+        sudo sed -i -e '/experimental/ s/^#//g' /etc/apt/sources.list.d/nvidia-container-toolkit.list
+        sudo killall apt apt-get || echo 'no apt or apt-get process to kill'
+        sudo apt-get update
+        sudo apt-get install -y nvidia-container-toolkit
+        # config and restart docker
+        sudo systemctl stop docker
+        sudo nvidia-ctk runtime configure --runtime=docker
+        sudo systemctl start docker
+      shell: bash
diff --git a/.github/workflows/publish-docker.yml b/.github/workflows/publish-docker.yml
index 7e8a08064f459..1e129ebc55996 100644
--- a/.github/workflows/publish-docker.yml
+++ b/.github/workflows/publish-docker.yml
@@ -1,26 +1,36 @@
 name: Docker Build + Publish

 on:
-  # For now, just manually trigger
-  # push:
-  #   branches:
-  #     - main
-  # pull_request:
-  #   branches:
-  #     - main
   workflow_dispatch:
+    inputs:
+      docker_tag:
+        description: "tag to be used for the docker image"
+        type: string
+        required: true
+      push_to_repository:
+        description: "whether to push out the docker image: no (default) or yes"
+        type: string
+        default: 'no'
+      gitref:
+        description: "git commit hash or branch name"
+        type: string
+        default: 'main'
+      build_type:
+        description: "type of nm-vllm to install for the docker image: nightly (default) or release"
+        type: string
+        default: 'nightly'
+      build_version:
+        description: "version of nm-vllm to install for the docker image: latest (default) or specific version e.g. 0.4.0, 0.4.0.20240531"
+        type: string
+        default: 'latest'

 jobs:
   build-docker-image:

-    runs-on: aws-avx2-192G-4-a10g-96G
-    timeout-minutes: 240
+    runs-on: aws-avx2-32G-a10g-24G
+    timeout-minutes: 60

     steps:
-
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@v3

       - name: Login to Github Packages
         uses: docker/login-action@v3
@@ -35,17 +45,23 @@ jobs:
           fetch-depth: 1
           submodules: recursive

-      - name: Get version tag
-        id: extract_tag
-        run: echo "tag=$(date +%Y%m%d)" >> $GITHUB_OUTPUT
+      - name: Set up nvidia-container-toolkit
+        id: setup
+        uses: ./.github/actions/nm-setup-nvidia-container-toolkit/

-      - name: Current Version Name
-        run: echo ${{ steps.extract_tag.outputs.tag }}
+      - name: Build image
+        id: build
+        uses: ./.github/actions/nm-build-docker/
+        with:
+          docker_tag: ${{ inputs.docker_tag }}
+          build_type: ${{ inputs.build_type }}
+          build_version: ${{ inputs.build_version }}

-      - name: nm-vllm latest
+      - name: Push image
         uses: docker/build-push-action@v5
+        if: ${{ inputs.push_to_repository == 'yes' && steps.build.outputs.status == 0 }}
         with:
           context: .
           target: vllm-openai
           push: true
-          tags: ghcr.io/neuralmagic/nm-vllm-openai:${{ steps.extract_tag.outputs.tag }}
+          tags: ghcr.io/neuralmagic/nm-vllm-openai:${{ inputs.docker_tag }}
diff --git a/Dockerfile b/Dockerfile
index 29c561fdfe884..9ec091a103741 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -31,61 +31,29 @@ COPY requirements-dev.txt requirements-dev.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-dev.txt

-# cuda arch list used by torch
-# can be useful for both `dev` and `test`
-# explicitly set the list to avoid issues with torch 2.2
-# see https://github.com/pytorch/pytorch/pull/123243
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
-ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### BASE BUILD IMAGE ####################
-
 #################### WHEEL BUILD IMAGE ####################
 FROM dev AS build

-# install build dependencies
-COPY requirements-build.txt requirements-build.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-build.txt
-
 # install compiler cache to speed up compilation leveraging local or remote caching
 RUN apt-get update -y && apt-get install -y ccache

-# files and directories related to build wheels
-COPY csrc csrc
-COPY setup.py setup.py
-COPY cmake cmake
-COPY CMakeLists.txt CMakeLists.txt
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cuda.txt requirements-cuda.txt
-COPY pyproject.toml pyproject.toml
-COPY vllm vllm
-
-# max jobs used by Ninja to build extensions
-ARG max_jobs=2
-ENV MAX_JOBS=${max_jobs}
-# number of threads used by nvcc
-ARG nvcc_threads=8
-ENV NVCC_THREADS=$nvcc_threads
-# make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1
-
-ENV CCACHE_DIR=/root/.cache/ccache
-RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/pip \
-    python3 setup.py bdist_wheel --dist-dir=dist
+#################### EXTENSION Build IMAGE ####################

-# check the size of the wheel, we cannot upload wheels larger than 100MB
-COPY .buildkite/check-wheel-size.py check-wheel-size.py
-RUN python3 check-wheel-size.py dist
+#################### FLASH_ATTENTION Build IMAGE ####################
+FROM dev as flash-attn-builder
+# flash attention version
+ARG flash_attn_version=v2.5.8
+ENV FLASH_ATTN_VERSION=${flash_attn_version}

-# the `vllm_nccl` package must be installed from source distribution
-# pip is too smart to store a wheel in the cache, and other CI jobs
-# will directly use the wheel from the cache, which is not what we want.
-# we need to remove it manually
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip cache remove vllm_nccl*
-#################### EXTENSION Build IMAGE ####################
+WORKDIR /usr/src/flash-attention-v2
+
+# Download the wheel or build it if a pre-compiled release doesn't exist
+RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
+    --no-build-isolation --no-deps --no-cache-dir
+
+#################### FLASH_ATTENTION Build IMAGE ####################

 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
@@ -101,18 +69,34 @@ RUN apt-get update -y \
 # or future versions of triton.
 RUN ldconfig /usr/local/cuda-12.4/compat/

-# UPSTREAM SYNC: Install sparsity extras
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+# install nm-vllm wheel first, so that torch etc will be installed
+ARG build_type="nightly"
+ARG build_version="latest"
+ENV INSTALL_TYPE=${build_type}
+ENV INSTALL_VERSION=${build_version}
+# UPSTREAM SYNC: Install nm-vllm with sparsity extras
+# use nm pypi for now for testing
+RUN --mount=type=bind,from=build \
     --mount=type=cache,target=/root/.cache/pip \
-    pip install nm-magic-wand-nightly --extra-index-url https://pypi.neuralmagic.com/simple
-
-# install vllm wheel first, so that torch etc will be installed
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+    if [ "${INSTALL_TYPE}" = "nightly" ]; then \
+        if [ "${INSTALL_VERSION}" = "latest" ]; then \
+            pip install nm-vllm-nightly[sparse] --extra-index-url https://pypi.neuralmagic.com/simple; \
+        else \
+            pip install nm-vllm-nightly[sparse]==${INSTALL_VERSION} --extra-index-url https://pypi.neuralmagic.com/simple; \
+        fi; \
+    else \
+        if [ "${INSTALL_VERSION}" = "latest" ]; then \
+            pip install nm-vllm[sparse] --extra-index-url https://pypi.neuralmagic.com/simple; \
+        else \
+            pip install nm-vllm[sparse]==${INSTALL_VERSION} --extra-index-url https://pypi.neuralmagic.com/simple; \
+        fi; \
+    fi
+
+RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
     --mount=type=cache,target=/root/.cache/pip \
-    pip install dist/*.whl --verbose
+    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
 #################### vLLM installation IMAGE ####################
-
 #################### TEST IMAGE ####################
 # image to run unit testing suite
 # note that this uses vllm installed by `pip`
@@ -120,9 +104,8 @@ FROM vllm-base AS test

 ADD . /vllm-workspace/

-# install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-dev.txt
+# check installed version
+RUN pip freeze | grep -e nm-vllm -e nm-magic-wand

 # doc requires source code
 # we hide them inside `test_docs/` , so that this source code
@@ -144,4 +127,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 ENV VLLM_USAGE_SOURCE production-docker-image

 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
-#################### OPENAI API SERVER ####################
\ No newline at end of file
+#################### OPENAI API SERVER ####################
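
For reviewers who want to exercise this change, a minimal sketch follows. It assumes the standard GitHub CLI (`gh workflow run`) and a local Docker daemon; the `docker_tag` and `build_version` values below are hypothetical examples, not values from this PR:

# Hypothetical dispatch of the updated workflow with the new
# workflow_dispatch inputs (illustrative tag and version values).
gh workflow run publish-docker.yml \
  --ref main \
  -f docker_tag=nightly-20240531 \
  -f push_to_repository=no \
  -f gitref=main \
  -f build_type=nightly \
  -f build_version=latest

# Local equivalent of the nm-build-docker composite action's build step,
# passing the Dockerfile's new build args (illustrative tag).
docker build \
  --build-arg build_type=nightly \
  --build-arg build_version=latest \
  --target vllm-openai \
  --tag ghcr.io/neuralmagic/nm-vllm-openai:nightly-20240531 .

Because the Dockerfile now installs nm-vllm from the Neural Magic PyPI index instead of building a wheel in-tree, the local build should succeed without CUDA compilation; only the flash-attn stage compiles anything, and only when no pre-built wheel exists for the pinned version.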