
Merge branch 'main' into cleanup-req
WoosukKwon committed Apr 5, 2024
2 parents 335f64b + e0dd4d3 commit 10ba462
Showing 10 changed files with 143 additions and 78 deletions.
2 changes: 1 addition & 1 deletion .buildkite/test-pipeline.yaml
@@ -90,7 +90,7 @@ steps:
- bash run-benchmarks.sh

- label: Documentation Build
working_dir: "/vllm-workspace/docs"
working_dir: "/vllm-workspace/test_docs/docs"
no_gpu: True
commands:
- pip install -r requirements-docs.txt
86 changes: 48 additions & 38 deletions Dockerfile
@@ -2,6 +2,7 @@
# to run the OpenAI compatible server.

#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev

RUN apt-get update -y \
@@ -35,7 +36,7 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################


#################### EXTENSION BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM dev AS build

# install build dependencies
@@ -46,15 +47,15 @@ RUN --mount=type=cache,target=/root/.cache/pip \
# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache

# copy input files
# files and directories related to building wheels
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py
COPY vllm vllm

# max jobs used by Ninja to build extensions
ARG max_jobs=2
@@ -67,7 +68,15 @@ ENV VLLM_INSTALL_PUNICA_KERNELS=1

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
python3 setup.py build_ext --inplace
--mount=type=cache,target=/root/.cache/pip \
python3 setup.py bdist_wheel --dist-dir=dist

# the `vllm_nccl` package must be installed from its source distribution;
# pip automatically caches a built wheel for it, and other CI jobs would then
# install that cached wheel directly, which is not what we want,
# so we remove it from the cache manually
RUN --mount=type=cache,target=/root/.cache/pip \
pip cache remove vllm_nccl*
#################### WHEEL BUILD IMAGE ####################

#################### FLASH_ATTENTION Build IMAGE ####################
@@ -87,58 +96,59 @@ RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \

#################### FLASH_ATTENTION Build IMAGE ####################

#################### TEST IMAGE ####################
# image to run unit testing suite
FROM dev AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
WORKDIR /vllm-workspace
# ADD is used to preserve directory structure
ADD . /vllm-workspace/
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# Install flash attention (from pre-built wheel)

RUN apt-get update -y \
&& apt-get install -y python3-pip git vim

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/

# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
pip install dist/*.whl --verbose

RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
--mount=type=cache,target=/root/.cache/pip \
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
# skip installing build dependencies because we are using pre-compiled extensions
RUN rm pyproject.toml
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
#################### TEST IMAGE ####################
#################### vLLM installation IMAGE ####################


#################### RUNTIME BASE IMAGE ####################
# We used base cuda image because pytorch installs its own cuda libraries.
# However pynccl depends on cuda libraries so we had to switch to the runtime image
# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base
#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
FROM vllm-base AS test

# libnccl required for ray
RUN apt-get update -y \
&& apt-get install -y python3-pip
ADD . /vllm-workspace/

WORKDIR /workspace
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-cuda.txt

# Install flash attention (from pre-built wheel)
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
pip install -r requirements-dev.txt

#################### RUNTIME BASE IMAGE ####################
# the doc build requires the source code
# we hide it inside `test_docs/` so that this source code
# will not be imported by other tests
RUN mkdir test_docs
RUN mv docs test_docs/
RUN mv vllm test_docs/

#################### TEST IMAGE ####################

#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer modelscope

COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

ENV VLLM_USAGE_SOURCE production-docker-image

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
3 changes: 0 additions & 3 deletions docs/source/conf.py
@@ -11,13 +11,10 @@
# documentation root, use os.path.abspath to make it absolute, like shown here.

import logging
import os
import sys

from sphinx.ext import autodoc

sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))

logger = logging.getLogger(__name__)

# -- Project information -----------------------------------------------------
16 changes: 16 additions & 0 deletions docs/source/models/engine_args.rst
@@ -118,3 +118,19 @@ Below, you can find an explanation of every engine argument for vLLM:
.. option:: --quantization (-q) {awq,squeezellm,None}

Method used to quantize the weights.

Async Engine Arguments
----------------------
Below are the additional arguments related to the asynchronous engine:

.. option:: --engine-use-ray

Use Ray to start the LLM engine in a process separate from the server process.

.. option:: --disable-log-requests

Disable logging requests.

.. option:: --max-log-len

Maximum number of prompt characters or prompt token IDs printed in the log. Defaults to unlimited.
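
The flags above also have programmatic counterparts on vLLM's async engine API. Below is a minimal sketch of constructing an async engine with these options from Python; the import paths, the AsyncEngineArgs field names, and the model choice are assumptions about vLLM's layout at the time of this commit, not something this diff adds.

# Hedged sketch: programmatic equivalent of the async engine CLI flags above.
# Import paths, field names, and the model are assumptions, not part of this diff.
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

engine_args = AsyncEngineArgs(
    model="facebook/opt-125m",    # hypothetical model choice
    engine_use_ray=False,         # --engine-use-ray
    disable_log_requests=True,    # --disable-log-requests
    max_log_len=100,              # --max-log-len
)
engine = AsyncLLMEngine.from_engine_args(engine_args)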
30 changes: 14 additions & 16 deletions examples/fp8/quantizer/quantize.py
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # noqa: E501
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -131,7 +131,8 @@ def get_tokenizer(ckpt_path, max_seq_len=MAX_SEQ_LEN, model_type=None):
tokenizer.pad_token = tokenizer.eos_token
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
assert tokenizer.pad_token is not None, f"Pad token for {model_type} cannot be set!"
assert (tokenizer.pad_token
is not None), f"Pad token for {model_type} cannot be set!"

return tokenizer

@@ -158,9 +159,9 @@ def get_model(ckpt_path, dtype="fp16", device="cuda"):

model_dtype = next(model.parameters()).dtype
if dtype != model_dtype:
print(
f"[TensorRT-LLM][WARNING] The manually set model data type is {dtype}, "
f"but the data type of the HuggingFace model is {model_dtype}.")
print("[TensorRT-LLM][WARNING] The manually set model data type is "
f"{dtype}, but the data type of the HuggingFace model is "
f"{model_dtype}.")

return model

@@ -244,15 +245,13 @@ def main(args):
else:
if "awq" in args.qformat:
if args.calib_size > 32:
print(
f"AWQ calibration could take longer with calib_size = {args.calib_size}, Using"
" calib_size=32 instead")
print("AWQ calibration could take longer with calib_size = "
f"{args.calib_size}, Using calib_size=32 instead")
args.calib_size = 32
print(
"\nAWQ calibration could take longer than other calibration methods. Please"
" increase the batch size to speed up the calibration process. Batch size can be"
" set by adding the argument --batch_size <batch_size> to the command line.\n"
)
print("\nAWQ calibration could take longer than other calibration "
"methods. Please increase the batch size to speed up the "
"calibration process. Batch size can be set by adding the "
"argument --batch_size <batch_size> to the command line.\n")

calib_dataloader = get_calib_dataloader(
tokenizer=tokenizer,
@@ -287,9 +286,8 @@ def main(args):

with torch.inference_mode():
if model_type is None:
print(
f"Unknown model type {type(model).__name__}. Continue exporting..."
)
print(f"Unknown model type {type(model).__name__}. Continue "
"exporting...")
model_type = f"unknown:{type(model).__name__}"

export_path = args.output_dir
6 changes: 5 additions & 1 deletion tests/conftest.py
@@ -56,11 +56,15 @@ def cleanup():


@pytest.fixture()
def should_do_global_cleanup_after_test() -> bool:
def should_do_global_cleanup_after_test(request) -> bool:
"""Allow subdirectories to skip global cleanup by overriding this fixture.
This can provide a ~10x speedup for non-GPU unit tests since they don't need
to initialize torch.
"""

if request.node.get_closest_marker("skip_global_cleanup"):
return False

return True


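Per the docstring above, a test subdirectory can opt out of the global cleanup either with the new skip_global_cleanup marker or by overriding the fixture wholesale. A minimal sketch of such an override follows; the file placement (tests/<subdir>/conftest.py) is an assumption, not part of this diff.

# Hedged sketch: a subdirectory-level conftest.py that disables global cleanup
# for every test it collects. Placement is assumed, not taken from this commit.
import pytest


@pytest.fixture()
def should_do_global_cleanup_after_test() -> bool:
    # Overrides the root fixture by name for all tests in this directory.
    return False
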
3 changes: 3 additions & 0 deletions tests/spec_decode/test_batch_expansion.py
@@ -7,6 +7,7 @@


@pytest.mark.parametrize('num_target_seq_ids', [100])
@pytest.mark.skip_global_cleanup
def test_create_target_seq_id_iterator(num_target_seq_ids: int):
"""Verify all new sequence ids are greater than all input
seq ids.
@@ -27,6 +28,7 @@ def test_create_target_seq_id_iterator(num_target_seq_ids: int):


@pytest.mark.parametrize('k', [1, 2, 6])
@pytest.mark.skip_global_cleanup
def test_get_token_ids_to_score(k: int):
"""Verify correct tokens are selected for scoring.
"""
@@ -53,6 +55,7 @@ def test_get_token_ids_to_score(k: int):


@pytest.mark.parametrize('k', [1, 2, 6])
@pytest.mark.skip_global_cleanup
def test_create_single_target_seq_group_metadata(k: int):
"""Verify correct creation of a batch-expanded seq group metadata.
"""
6 changes: 3 additions & 3 deletions tests/spec_decode/test_spec_decode_worker.py
@@ -487,7 +487,7 @@ def test_empty_input_batch(k: int, batch_size: int):
**execute_model_data.to_dict())


@torch.inference_mode()
@pytest.mark.skip_global_cleanup
def test_init_device():
"""Verify SpecDecodeWorker invokes proposer/scorer worker init_device, as
well as other GPU initialization.
@@ -537,7 +537,7 @@ def test_init_cache_engine():
@pytest.mark.parametrize('available_cpu_blocks', [500])
@pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096])
@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096])
@torch.inference_mode()
@pytest.mark.skip_global_cleanup
def test_profile_num_available_blocks(available_gpu_blocks: int,
available_cpu_blocks: int,
target_cache_block_size_bytes: int,
@@ -584,7 +584,7 @@ def test_profile_num_available_blocks(available_gpu_blocks: int,
@pytest.mark.parametrize('target_cache_block_size_bytes',
[2 * 2 * 4096, 2 * 2 * 8192])
@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096])
@torch.inference_mode()
@pytest.mark.skip_global_cleanup
def test_split_num_cache_blocks_evenly(available_gpu_blocks: int,
target_cache_block_size_bytes: int,
draft_kv_size_bytes: int):