
Merge branch 'main' into cleanup-req
WoosukKwon committed Apr 5, 2024
2 parents 335f64b + e0dd4d3 commit 10ba462
Showing 10 changed files with 143 additions and 78 deletions.
2 changes: 1 addition & 1 deletion .buildkite/test-pipeline.yaml
@@ -90,7 +90,7 @@ steps:
- bash run-benchmarks.sh

- label: Documentation Build
working_dir: "/vllm-workspace/docs"
working_dir: "/vllm-workspace/test_docs/docs"
no_gpu: True
commands:
- pip install -r requirements-docs.txt
86 changes: 48 additions & 38 deletions Dockerfile
@@ -2,6 +2,7 @@
# to run the OpenAI compatible server.

#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev

RUN apt-get update -y \
@@ -35,7 +36,7 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################


#################### EXTENSION BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM dev AS build

# install build dependencies
@@ -46,15 +47,15 @@ RUN --mount=type=cache,target=/root/.cache/pip \
# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache

# copy input files
# files and directories related to building wheels
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py
COPY vllm vllm

# max jobs used by Ninja to build extensions
ARG max_jobs=2
@@ -67,7 +68,15 @@ ENV VLLM_INSTALL_PUNICA_KERNELS=1

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
python3 setup.py build_ext --inplace
--mount=type=cache,target=/root/.cache/pip \
python3 setup.py bdist_wheel --dist-dir=dist

# the `vllm_nccl` package must be installed from its source distribution;
# pip automatically caches a built wheel for it, and other CI jobs would then
# install that cached wheel directly, which is not what we want,
# so we remove it from the cache manually
RUN --mount=type=cache,target=/root/.cache/pip \
pip cache remove vllm_nccl*
#################### WHEEL BUILD IMAGE ####################

#################### FLASH_ATTENTION Build IMAGE ####################
@@ -87,58 +96,59 @@ RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \

#################### FLASH_ATTENTION Build IMAGE ####################

#################### TEST IMAGE ####################
# image to run unit testing suite
FROM dev AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
WORKDIR /vllm-workspace
# ADD is used to preserve directory structure
ADD . /vllm-workspace/
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# Install flash attention (from pre-built wheel)

RUN apt-get update -y \
&& apt-get install -y python3-pip git vim

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/

# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
pip install dist/*.whl --verbose

RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
--mount=type=cache,target=/root/.cache/pip \
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
# skip installing build dependencies because we are using pre-compiled extensions
RUN rm pyproject.toml
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
#################### TEST IMAGE ####################
#################### vLLM installation IMAGE ####################


#################### RUNTIME BASE IMAGE ####################
# We used base cuda image because pytorch installs its own cuda libraries.
# However pynccl depends on cuda libraries so we had to switch to the runtime image
# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base
#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
FROM vllm-base AS test

# libnccl required for ray
RUN apt-get update -y \
&& apt-get install -y python3-pip
ADD . /vllm-workspace/

WORKDIR /workspace
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-cuda.txt

# Install flash attention (from pre-built wheel)
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
pip install -r requirements-dev.txt

#################### RUNTIME BASE IMAGE ####################
# the doc build requires the source code
# we hide it inside `test_docs/` so that this source code
# will not be imported by other tests
RUN mkdir test_docs
RUN mv docs test_docs/
RUN mv vllm test_docs/

#################### TEST IMAGE ####################

#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer modelscope

COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

ENV VLLM_USAGE_SOURCE production-docker-image

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
3 changes: 0 additions & 3 deletions docs/source/conf.py
@@ -11,13 +11,10 @@
# documentation root, use os.path.abspath to make it absolute, like shown here.

import logging
import os
import sys

from sphinx.ext import autodoc

sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))

logger = logging.getLogger(__name__)

# -- Project information -----------------------------------------------------
16 changes: 16 additions & 0 deletions docs/source/models/engine_args.rst
@@ -118,3 +118,19 @@ Below, you can find an explanation of every engine argument for vLLM:
.. option:: --quantization (-q) {awq,squeezellm,None}

Method used to quantize the weights.

Async Engine Arguments
----------------------
Below are the additional arguments related to the asynchronous engine:

.. option:: --engine-use-ray

Use Ray to start the LLM engine in a process separate from the server process.

.. option:: --disable-log-requests

Disable logging requests.

.. option:: --max-log-len

Maximum number of prompt characters or prompt token IDs printed in the log. Defaults to unlimited.
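
The flags above also have programmatic counterparts on vLLM's async engine API. Below is a minimal sketch of constructing an async engine with these options from Python; the import paths, the AsyncEngineArgs field names, and the model choice are assumptions about vLLM's layout at the time of this commit, not something this diff adds.

# Hedged sketch: programmatic equivalent of the async engine CLI flags above.
# Import paths, field names, and the model are assumptions, not part of this diff.
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

engine_args = AsyncEngineArgs(
    model="facebook/opt-125m",    # hypothetical model choice
    engine_use_ray=False,         # --engine-use-ray
    disable_log_requests=True,    # --disable-log-requests
    max_log_len=100,              # --max-log-len
)
engine = AsyncLLMEngine.from_engine_args(engine_args)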
30 changes: 14 additions & 16 deletions examples/fp8/quantizer/quantize.py
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # noqa: E501
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -131,7 +131,8 @@ def get_tokenizer(ckpt_path, max_seq_len=MAX_SEQ_LEN, model_type=None):
tokenizer.pad_token = tokenizer.eos_token
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
assert tokenizer.pad_token is not None, f"Pad token for {model_type} cannot be set!"
assert (tokenizer.pad_token
is not None), f"Pad token for {model_type} cannot be set!"

return tokenizer

@@ -158,9 +159,9 @@ def get_model(ckpt_path, dtype="fp16", device="cuda"):

model_dtype = next(model.parameters()).dtype
if dtype != model_dtype:
print(
f"[TensorRT-LLM][WARNING] The manually set model data type is {dtype}, "
f"but the data type of the HuggingFace model is {model_dtype}.")
print("[TensorRT-LLM][WARNING] The manually set model data type is "
f"{dtype}, but the data type of the HuggingFace model is "
f"{model_dtype}.")

return model

@@ -244,15 +245,13 @@ def main(args):
else:
if "awq" in args.qformat:
if args.calib_size > 32:
print(
f"AWQ calibration could take longer with calib_size = {args.calib_size}, Using"
" calib_size=32 instead")
print("AWQ calibration could take longer with calib_size = "
f"{args.calib_size}, Using calib_size=32 instead")
args.calib_size = 32
print(
"\nAWQ calibration could take longer than other calibration methods. Please"
" increase the batch size to speed up the calibration process. Batch size can be"
" set by adding the argument --batch_size <batch_size> to the command line.\n"
)
print("\nAWQ calibration could take longer than other calibration "
"methods. Please increase the batch size to speed up the "
"calibration process. Batch size can be set by adding the "
"argument --batch_size <batch_size> to the command line.\n")

calib_dataloader = get_calib_dataloader(
tokenizer=tokenizer,
@@ -287,9 +286,8 @@ def main(args):

with torch.inference_mode():
if model_type is None:
print(
f"Unknown model type {type(model).__name__}. Continue exporting..."
)
print(f"Unknown model type {type(model).__name__}. Continue "
"exporting...")
model_type = f"unknown:{type(model).__name__}"

export_path = args.output_dir
6 changes: 5 additions & 1 deletion tests/conftest.py
@@ -56,11 +56,15 @@ def cleanup():


@pytest.fixture()
def should_do_global_cleanup_after_test() -> bool:
def should_do_global_cleanup_after_test(request) -> bool:
"""Allow subdirectories to skip global cleanup by overriding this fixture.
This can provide a ~10x speedup for non-GPU unit tests since they don't need
to initialize torch.
"""

if request.node.get_closest_marker("skip_global_cleanup"):
return False

return True


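Per the docstring above, a test subdirectory can opt out of the global cleanup either with the new skip_global_cleanup marker or by overriding the fixture wholesale. A minimal sketch of such an override follows; the file placement (tests/<subdir>/conftest.py) is an assumption, not part of this diff.

# Hedged sketch: a subdirectory-level conftest.py that disables global cleanup
# for every test it collects. Placement is assumed, not taken from this commit.
import pytest


@pytest.fixture()
def should_do_global_cleanup_after_test() -> bool:
    # Overrides the root fixture by name for all tests in this directory.
    return False
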
3 changes: 3 additions & 0 deletions tests/spec_decode/test_batch_expansion.py
@@ -7,6 +7,7 @@


@pytest.mark.parametrize('num_target_seq_ids', [100])
@pytest.mark.skip_global_cleanup
def test_create_target_seq_id_iterator(num_target_seq_ids: int):
"""Verify all new sequence ids are greater than all input
seq ids.
@@ -27,6 +28,7 @@ def test_create_target_seq_id_iterator(num_target_seq_ids: int):


@pytest.mark.parametrize('k', [1, 2, 6])
@pytest.mark.skip_global_cleanup
def test_get_token_ids_to_score(k: int):
"""Verify correct tokens are selected for scoring.
"""
@@ -53,6 +55,7 @@ def test_get_token_ids_to_score(k: int):


@pytest.mark.parametrize('k', [1, 2, 6])
@pytest.mark.skip_global_cleanup
def test_create_single_target_seq_group_metadata(k: int):
"""Verify correct creation of a batch-expanded seq group metadata.
"""
6 changes: 3 additions & 3 deletions tests/spec_decode/test_spec_decode_worker.py
@@ -487,7 +487,7 @@ def test_empty_input_batch(k: int, batch_size: int):
**execute_model_data.to_dict())


@torch.inference_mode()
@pytest.mark.skip_global_cleanup
def test_init_device():
"""Verify SpecDecodeWorker invokes proposer/scorer worker init_device, as
well as other GPU initialization.
@@ -537,7 +537,7 @@ def test_init_cache_engine():
@pytest.mark.parametrize('available_cpu_blocks', [500])
@pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096])
@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096])
@torch.inference_mode()
@pytest.mark.skip_global_cleanup
def test_profile_num_available_blocks(available_gpu_blocks: int,
available_cpu_blocks: int,
target_cache_block_size_bytes: int,
@@ -584,7 +584,7 @@ def test_profile_num_available_blocks(available_gpu_blocks: int,
@pytest.mark.parametrize('target_cache_block_size_bytes',
[2 * 2 * 4096, 2 * 2 * 8192])
@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096])
@torch.inference_mode()
@pytest.mark.skip_global_cleanup
def test_split_num_cache_blocks_evenly(available_gpu_blocks: int,
target_cache_block_size_bytes: int,
draft_kv_size_bytes: int):