This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Upstream sync 2024 04 08 #173

Merged: 292 commits, Apr 10, 2024

Changes shown are from 250 of the 292 commits.

Commits (292)
baee28c
Reorder kv dtype check to avoid nvcc not found error on AMD platform …
cloudhan Mar 2, 2024
ce4f5a2
Add Automatic Prefix Caching (#2762)
SageMoore Mar 2, 2024
d65fac2
Add vLLM version info to logs and openai API server (#3161)
jasonacox Mar 3, 2024
996d095
[FIX] Fix styles in automatic prefix caching & add a automatic prefix…
zhuohan123 Mar 3, 2024
17c3103
Make it easy to profile workers with nsight (#3162)
pcmoritz Mar 4, 2024
d0fae88
[DOC] add setup document to support neuron backend (#2777)
liangfu Mar 4, 2024
901cf4c
[Minor Fix] Remove unused code in benchmark_prefix_caching.py (#3171)
gty111 Mar 4, 2024
27a7b07
Add document for vllm paged attention kernel. (#2978)
pian13131 Mar 4, 2024
9cbc7e5
enable --gpu-memory-utilization in benchmark_throughput.py (#3175)
AllenDou Mar 4, 2024
76e8a70
[Minor fix] The domain dns.google may cause a socket.gaierror excepti…
ttbachyinsda Mar 4, 2024
22de452
Push logprob generation to LLMEngine (#3065)
Yard1 Mar 4, 2024
ff578ca
Add health check, make async Engine more robust (#3015)
Yard1 Mar 4, 2024
9a4548b
Fix the openai benchmarking requests to work with latest OpenAI apis …
wangchen615 Mar 4, 2024
05af6da
[ROCm] enable cupy in order to enable cudagraph mode for AMD GPUs (#…
hongxiayang Mar 5, 2024
8999ec3
Store `eos_token_id` in `Sequence` for easy access (#3166)
njhill Mar 5, 2024
2efce05
[Fix] Avoid pickling entire LLMEngine for Ray workers (#3207)
njhill Mar 6, 2024
24aecf4
[Tests] Add block manager and scheduler tests (#3108)
rkooo567 Mar 6, 2024
a33ce60
[Testing] Fix core tests (#3224)
cadedaniel Mar 6, 2024
4cb3b92
Add tqdm `dynamic_ncols=True` (#3242)
chujiezheng Mar 6, 2024
d3c04b6
Add GPTQ support for Gemma (#3200)
TechxGenus Mar 7, 2024
cbf4c05
Update requirements-dev.txt to include package for benchmarking scrip…
wangchen615 Mar 7, 2024
2daf23a
Separate attention backends (#3005)
WoosukKwon Mar 7, 2024
385da2d
Measure model memory usage (#3120)
mgoin Mar 7, 2024
8cbba46
Possible fix for conflict between Automated Prefix Caching (#2762) an…
jacobthebanana Mar 7, 2024
b35cc93
Fix auto prefix bug (#3239)
ElizaWszola Mar 8, 2024
d2339d6
Connect engine healthcheck to openai server (#3260)
njhill Mar 8, 2024
c59e120
Feature add lora support for Qwen2 (#3177)
whyiug Mar 8, 2024
1ece1ae
[Minor Fix] Fix comments in benchmark_serving (#3252)
gty111 Mar 8, 2024
99c3cfb
[Docs] Fix Unmocked Imports (#3275)
ywang96 Mar 8, 2024
1cb0cc2
[FIX] Make `flash_attn` optional (#3269)
WoosukKwon Mar 8, 2024
c2c5e09
Move model filelocks from `/tmp/` to `~/.cache/vllm/locks/` dir (#3241)
mgoin Mar 8, 2024
f48c679
[FIX] Fix prefix test error on main (#3286)
zhuohan123 Mar 9, 2024
8437bae
[Speculative decoding 3/9] Worker which speculates, scores, and appli…
cadedaniel Mar 9, 2024
0bba88d
Enhance lora tests with more layer and rank variations (#3243)
tterrysun Mar 10, 2024
e4a28e5
[ROCM] Fix blockReduceSum to use correct warp counts for ROCm and CUD…
dllehr-amd Mar 10, 2024
9e8744a
[BugFix] Fix get tokenizer when using ray (#3301)
esmeetu Mar 11, 2024
4b59f00
[Fix] Fix best_of behavior when n=1 (#3298)
njhill Mar 11, 2024
2f8844b
Re-enable the 80 char line width limit (#3305)
zhuohan123 Mar 11, 2024
657061f
[docs] Add LoRA support information for models (#3299)
pcmoritz Mar 11, 2024
4c92270
Add distributed model executor abstraction (#3191)
zhuohan123 Mar 11, 2024
c9415c1
[ROCm] Fix warp and lane calculation in blockReduceSum (#3321)
kliuae Mar 11, 2024
654865e
Support Mistral Model Inference with transformers-neuronx (#3153)
DAIZHENWEI Mar 11, 2024
b0925b3
docs: Add BentoML deployment doc (#3336)
Sherlock113 Mar 12, 2024
49a3c86
Fixes #1556 double free (#3347)
br3no Mar 13, 2024
602358f
Add kernel for GeGLU with approximate GELU (#3337)
WoosukKwon Mar 13, 2024
b167109
[Fix] Fix quantization="gptq" when using Marlin (#3319)
DreamTeamWangbowen Mar 13, 2024
e221910
add hf_transfer to requirements.txt (#3031)
RonanKMcGovern Mar 13, 2024
ba8dc95
[Minor] Fix bias in if to remove ambiguity (#3259)
hliuca Mar 13, 2024
739c350
[Minor Fix] Use cupy-cuda11x in CUDA 11.8 build (#3256)
chenxu2048 Mar 13, 2024
ae0ccb4
Add missing kernel for CodeLlama-34B on A/H100 (no tensor parallelism…
orsharir Mar 13, 2024
7e9bd08
Add batched RoPE kernel (#3095)
tterrysun Mar 13, 2024
c33afd8
Fix lint (#3388)
Yard1 Mar 13, 2024
eeab52a
[FIX] Simpler fix for async engine running on ray (#3371)
zhuohan123 Mar 13, 2024
81653d9
[Hotfix] [Debug] test_openai_server.py::test_guided_regex_completion …
simon-mo Mar 14, 2024
a37415c
allow user to chose which vllm's merics to display in grafana (#3393)
AllenDou Mar 14, 2024
8fe8386
[Kernel] change benchmark script so that result can be directly used;…
youkaichao Mar 14, 2024
06ec486
Install `flash_attn` in Docker image (#3396)
tdoublep Mar 14, 2024
c17ca8e
Add args for mTLS support (#3410)
declark1 Mar 14, 2024
dfc7740
[issue templates] add some issue templates (#3412)
youkaichao Mar 14, 2024
54be8a0
Fix assertion failure in Qwen 1.5 with prefix caching enabled (#3373)
chenxu2048 Mar 14, 2024
b983ba3
fix marlin config repr (#3414)
qeternity Mar 14, 2024
78b6c48
Dynamically configure shared memory size for moe_align_block_size_ker…
akhoroshev Mar 15, 2024
b522c44
[Misc] add HOST_IP env var (#3419)
youkaichao Mar 15, 2024
21539e6
Add chat templates for Falcon (#3420)
Dinghow Mar 15, 2024
253a980
Add chat templates for ChatGLM (#3418)
Dinghow Mar 15, 2024
429284d
Fix `dist.broadcast` stall without group argument (#3408)
GindaChen Mar 15, 2024
a7c8716
Fix tie_word_embeddings for Qwen2. (#3344)
fyabc Mar 15, 2024
03d37f2
[Fix] Add args for mTLS support (#3430)
declark1 Mar 15, 2024
14b8ae0
Fixes the misuse/mixuse of time.time()/time.monotonic() (#3220)
sighingnow Mar 15, 2024
604f235
[Misc] add error message in non linux platform (#3438)
youkaichao Mar 15, 2024
a7af453
Fix issue templates (#3436)
hmellor Mar 15, 2024
8fa7357
fix document error for value and v_vec illustration (#3421)
laneeeee Mar 15, 2024
fb96c1e
Asynchronous tokenization (#2879)
Yard1 Mar 15, 2024
10585e0
Removed Extraneous Print Message From OAI Server (#3440)
robertgshaw2-neuralmagic Mar 16, 2024
413366e
[Misc] PR templates (#3413)
youkaichao Mar 16, 2024
3123f15
Fixes the incorrect argument in the prefix-prefill test cases (#3246)
sighingnow Mar 16, 2024
14e3f9a
Replace `lstrip()` with `removeprefix()` to fix Ruff linter warning (…
ronensc Mar 16, 2024
cf6ff18
Fix Baichuan chat template (#3340)
Dinghow Mar 16, 2024
ad50bf4
fix lint
simon-mo Mar 16, 2024
8e67598
[Misc] fix line length for entire codebase (#3444)
simon-mo Mar 16, 2024
120157f
Support arbitrary json_object in OpenAI and Context Free Grammar (#3211)
simon-mo Mar 16, 2024
6b78837
Fix setup.py neuron-ls issue (#2671)
simon-mo Mar 16, 2024
abfc4f3
[Misc] Use dataclass for InputMetadata (#3452)
WoosukKwon Mar 17, 2024
93348d9
[CI] Shard tests for LoRA and Kernels to speed up (#3445)
simon-mo Mar 17, 2024
9101d83
[Bugfix] Make moe_align_block_size AMD-compatible (#3470)
WoosukKwon Mar 18, 2024
8c654c0
CI: Add ROCm Docker Build (#2886)
simon-mo Mar 18, 2024
482b0ad
[Testing] Add test_config.py to CI (#3437)
cadedaniel Mar 18, 2024
097aa0e
[CI/Build] Fix Bad Import In Test (#3473)
robertgshaw2-neuralmagic Mar 18, 2024
c0c17d4
[Misc] Fix PR Template (#3478)
zhuohan123 Mar 18, 2024
9fdf3de
Cmake based build system (#2830)
bnellnm Mar 18, 2024
49eedea
[Core] Zero-copy asdict for InputMetadata (#3475)
Yard1 Mar 18, 2024
b30880a
[Misc] Update README for the Third vLLM Meetup (#3479)
zhuohan123 Mar 18, 2024
b37cdce
[Core] Cache some utils (#3474)
Yard1 Mar 19, 2024
6a9c583
[Core] print error before deadlock (#3459)
youkaichao Mar 19, 2024
ef65dcf
[Doc] Add docs about OpenAI compatible server (#3288)
simon-mo Mar 19, 2024
7341c77
[BugFix] Avoid initializing CUDA too early (#3487)
njhill Mar 19, 2024
c614cfe
Update dockerfile with ModelScope support (#3429)
ifsheldon Mar 19, 2024
2a60c9b
[Doc] minor fix to neuron-installation.rst (#3505)
jimburtoft Mar 19, 2024
cc63d03
Revert "[Core] Cache some utils" (#3507)
simon-mo Mar 19, 2024
63e8b28
[Doc] minor fix of spelling in amd-installation.rst (#3506)
jimburtoft Mar 19, 2024
20478c4
Use lru_cache for some environment detection utils (#3508)
simon-mo Mar 19, 2024
9474e89
[PREFIX CACHING FOLLOW UP] A bunch of fixes to block allocator perfor…
ElizaWszola Mar 20, 2024
4ad521d
[Core] Add generic typing to `LRUCache` (#3511)
njhill Mar 20, 2024
5ee1449
[Misc] Remove cache stream and cache events (#3461)
WoosukKwon Mar 20, 2024
84eaa68
Abort when nvcc command is not found in the PATH (#3527)
AllenDou Mar 20, 2024
ba8ae1d
Check for _is_cuda() in compute_num_jobs (#3481)
bnellnm Mar 20, 2024
80e2548
[Bugfix] Fix ROCm support in CMakeLists.txt (#3534)
jamestwhedbee Mar 20, 2024
426ec4e
[1/n] Triton sampling kernel (#3186)
Yard1 Mar 20, 2024
6e435de
[1/n][Chunked Prefill] Refactor input query shapes (#3236)
rkooo567 Mar 20, 2024
f1c0fc3
Migrate `logits` computation and gather to `model_runner` (#3233)
esmeetu Mar 20, 2024
523e30e
[BugFix] Hot fix in setup.py for neuron build (#3537)
zhuohan123 Mar 21, 2024
6ebd02b
[PREFIX CACHING FOLLOW UP] OrderedDict-based evictor (#3431)
ElizaWszola Mar 21, 2024
3bbff9e
Fix 1D query issue from `_prune_hidden_states` (#3539)
rkooo567 Mar 21, 2024
4c07dd2
[🚀 Ready to be merged] Added support for Jais models (#3183)
grandiose-pizza Mar 21, 2024
8657323
[Misc][Log] Add log for tokenizer length not equal to vocabulary size…
esmeetu Mar 21, 2024
c188ecb
[Misc] Bump up transformers to v4.39.0 & Remove StarCoder2Config (#3551)
WoosukKwon Mar 21, 2024
b7050ca
[BugFix] gemma loading after quantization or LoRA. (#3553)
taeminlee Mar 21, 2024
ea5f14e
[Bugfix][Model] Fix Qwen2 (#3554)
esmeetu Mar 22, 2024
e90fc21
[Hardware][Neuron] Refactor neuron support (#3471)
zhuohan123 Mar 22, 2024
f721096
[BugFix] Some fixes for custom allreduce kernels (#2760)
hanzhi713 Mar 22, 2024
cf2f084
Dynamic scheduler delay to improve ITL performance (#3279)
tdoublep Mar 22, 2024
bfdb1ba
[Core] Improve detokenization performance for prefill (#3469)
Yard1 Mar 22, 2024
743a0b7
[Bugfix] use SoftLockFile instead of LockFile (#3578)
kota-iizuka Mar 23, 2024
3c5ab9b
[Misc] Fix BLOOM copyright notice (#3591)
WoosukKwon Mar 24, 2024
f8a12ec
[Misc] Bump transformers version (#3592)
ywang96 Mar 24, 2024
af9e534
[BugFix] Fix Falcon tied embeddings (#3590)
WoosukKwon Mar 24, 2024
41deac4
[BugFix] 1D query fix for MoE models (#3597)
njhill Mar 24, 2024
8b268a4
[CI] typo fix: is_hip --> is_hip() (#3595)
youkaichao Mar 24, 2024
42bc386
[CI/Build] respect the common environment variable MAX_JOBS (#3600)
youkaichao Mar 25, 2024
837e185
[CI/Build] fix flaky test (#3602)
youkaichao Mar 25, 2024
6d93d35
[BugFix] tensor.get_device() -> tensor.device (#3604)
jikunshang Mar 25, 2024
56a8652
[Bugfix] store lock file in tmp directory (#3578)" (#3599)
WoosukKwon Mar 25, 2024
b0dfa91
[Model] Add starcoder2 awq support (#3569)
shaonianyr Mar 25, 2024
925f333
[Core] Refactor Attention Take 2 (#3462)
WoosukKwon Mar 25, 2024
e67c295
[Bugfix] fix automatic prefix args and add log info (#3608)
gty111 Mar 25, 2024
01bfb22
[CI] Try introducing isort. (#3495)
rkooo567 Mar 25, 2024
819924e
[Core] Adding token ranks along with logprobs (#3516)
SwapnilDreams100 Mar 25, 2024
c13ad1b
feat: implement the min_tokens sampling parameter (#3124)
tjohnson31415 Mar 25, 2024
0b4997e
[Bugfix] API stream returning two stops (#3450)
dylanwhawk Mar 25, 2024
f408d05
hotfix isort on logprobs ranks pr (#3622)
simon-mo Mar 25, 2024
64172a9
[Feature] Add vision language model support. (#3042)
xwjiang2010 Mar 25, 2024
3a24309
Optimize `_get_ranks` in Sampler (#3623)
Yard1 Mar 25, 2024
dfeb2ec
[Misc] Include matched stop string/token in responses (#2976)
njhill Mar 26, 2024
8af890a
Enable more models to inference based on LoRA (#3382)
jeejeelee Mar 26, 2024
a979d97
[Bugfix] Fix ipv6 address parsing bug (#3641)
liiliiliil Mar 26, 2024
0dc7227
[BugFix] Fix ipv4 address parsing regression (#3645)
njhill Mar 26, 2024
566b57c
[Kernel] support non-zero cuda devices in punica kernels (#3636)
jeejeelee Mar 27, 2024
7687934
[Doc]add lora support (#3649)
jeejeelee Mar 27, 2024
e66b629
[Misc] Minor fix in KVCache type (#3652)
WoosukKwon Mar 27, 2024
8f44fac
[Core] remove cupy dependency (#3625)
youkaichao Mar 27, 2024
82c540b
[Bugfix] More faithful implementation of Gemma (#3653)
WoosukKwon Mar 27, 2024
d18f4e7
[Bugfix] [Hotfix] fix nccl library name (#3661)
youkaichao Mar 27, 2024
e24336b
[Model] Add support for DBRX (#3660)
megha95 Mar 27, 2024
1956931
[Misc] add the "download-dir" option to the latency/throughput benchm…
AmadeusChan Mar 27, 2024
45b6ef6
feat(benchmarks): Add Prefix Caching Benchmark to Serving Benchmark (…
ywang96 Mar 27, 2024
1182607
Add support for Cohere's Command-R model (#3433)
zeppombal Mar 27, 2024
6d9aa00
[Docs] Add Command-R to supported models (#3669)
WoosukKwon Mar 27, 2024
10e6322
[Model] Fix and clean commandr (#3671)
esmeetu Mar 28, 2024
098e177
[Model] Add support for xverse (#3610)
hxer7963 Mar 28, 2024
3492859
[CI/Build] update default number of jobs and nvcc threads to avoid ov…
youkaichao Mar 28, 2024
8267b06
[Kernel] Add Triton MoE kernel configs for DBRX on A100 (#3679)
WoosukKwon Mar 28, 2024
14ccd94
[Core][Bugfix]Refactor block manager for better testability (#3492)
cadedaniel Mar 28, 2024
d6ea427
[Model] Add support for Qwen2MoeModel (#3346)
wenyujin333 Mar 28, 2024
ce567a2
[Kernel] DBRX Triton MoE kernel H100 (#3692)
ywang96 Mar 28, 2024
b51c1cc
[2/N] Chunked prefill data update (#3538)
rkooo567 Mar 28, 2024
1715056
[Bugfix] Update neuron_executor.py to add optional vision_language_co…
adamrb Mar 28, 2024
96aa014
fix benchmark format reporting in buildkite (#3693)
simon-mo Mar 28, 2024
a4075cb
[CI] Add test case to run examples scripts (#3638)
simon-mo Mar 28, 2024
515386e
[Core] Support multi-node inference(eager and cuda graph) (#3686)
esmeetu Mar 28, 2024
cb40b3a
[Kernel] Add MoE Triton kernel configs for A100 40GB (#3700)
WoosukKwon Mar 28, 2024
c0935c9
[Bugfix] Set enable_prefix_caching=True in prefix caching example (#3…
WoosukKwon Mar 28, 2024
4716a32
fix logging msg for block manager (#3701)
simon-mo Mar 28, 2024
0267fef
[Core] fix del of communicator (#3702)
youkaichao Mar 29, 2024
98a42e7
[Benchmark] Change mii to use persistent deployment and support tenso…
IKACE Mar 29, 2024
27a57ca
bump version to v0.4.0 (#3705)
simon-mo Mar 29, 2024
f342153
Revert "bump version to v0.4.0" (#3708)
youkaichao Mar 29, 2024
26422e4
[Test] Make model tests run again and remove --forked from pytest (#3…
rkooo567 Mar 29, 2024
395aa82
[Misc] Minor type annotation fix (#3716)
WoosukKwon Mar 29, 2024
756b30a
[Core][Test] move local_rank to the last arg with default value(#3711)
youkaichao Mar 29, 2024
7bc94a0
add ccache to docker build image (#3704)
simon-mo Mar 29, 2024
d8658c8
Usage Stats Collection (#2852)
yhu422 Mar 29, 2024
6110c39
[BugFix] Fix tokenizer out of vocab size (#3685)
esmeetu Mar 29, 2024
f510395
[BugFix][Frontend] Fix completion logprobs=0 error (#3731)
esmeetu Mar 29, 2024
97356f3
[Bugfix] Command-R Max Model Length (#3727)
ywang96 Mar 29, 2024
430530f
bump version to v0.4.0 (#3712)
simon-mo Mar 29, 2024
9765b5c
[ROCm][Bugfix] Fixed several bugs related to rccl path and attention …
hongxiayang Mar 29, 2024
8b2d3cb
usage lib get version another way (#3735)
simon-mo Mar 29, 2024
991143c
[BugFix] Use consistent logger everywhere (#3738)
njhill Mar 29, 2024
203d4f8
[Core][Bugfix] cache len of tokenizer (#3741)
youkaichao Mar 30, 2024
3ad438c
Fix build when nvtools is missing (#3698)
bnellnm Mar 30, 2024
51c31bc
CMake build elf without PTX (#3739)
simon-mo Mar 30, 2024
b6d1035
[Kernel] Layernorm performance optimization (#3662)
mawong-amd Mar 30, 2024
9c82a1b
[Doc] Update installation doc (#3746)
youkaichao Mar 30, 2024
563c1d7
[CI/Build] Make Marlin Tests Green (#3753)
robertgshaw2-neuralmagic Mar 31, 2024
f03cc66
[Misc] Minor fixes in requirements.txt (#3769)
WoosukKwon Apr 1, 2024
49782fc
[Misc] Some minor simplifications to detokenization logic (#3670)
njhill Apr 1, 2024
ccb58b2
[Misc] Fix Benchmark TTFT Calculation for Chat Completions (#3768)
ywang96 Apr 1, 2024
93deb0b
[Speculative decoding 4/9] Lookahead scheduling for speculative decod…
cadedaniel Apr 1, 2024
7d4e1b8
[Misc] Add support for new autogptq checkpoint_format (#3689)
Qubitium Apr 1, 2024
eb69d68
[Misc] [CI/Build] Speed up block manager CPU-only unit tests ~10x by …
cadedaniel Apr 2, 2024
0e3f06f
[Hardware][Intel] Add CPU inference backend (#3634)
bigPYJ1151 Apr 2, 2024
77a6572
[HotFix] [CI/Build] Minor fix for CPU backend CI (#3787)
bigPYJ1151 Apr 2, 2024
0739b19
[Frontend][Bugfix] allow using the default middleware with a root pat…
A-Mahla Apr 2, 2024
3bec41f
[Doc] Fix vLLMEngine Doc Page (#3791)
ywang96 Apr 2, 2024
205b949
[CI/Build] fix TORCH_CUDA_ARCH_LIST in wheel build (#3801)
youkaichao Apr 2, 2024
ad6eca4
Fix early CUDA init via get_architecture_class_name import (#3770)
leiwen83 Apr 2, 2024
b321d48
[Bugfix] Add `__init__.py` files for `vllm/core/block/` and `vllm/spe…
mgoin Apr 2, 2024
a3c226e
[CI/Build] 0.4.0.post1, fix sm 7.0/7.5 binary (#3803)
youkaichao Apr 2, 2024
5757d90
[Speculative decoding] Adding configuration object for speculative de…
cadedaniel Apr 3, 2024
c9b506d
[BugFix] Use different mechanism to get vllm version in `is_cpu()` (#…
njhill Apr 3, 2024
76b889b
[Doc] Update README.md (#3806)
robertgshaw2-neuralmagic Apr 3, 2024
c64cf38
[Doc] Update contribution guidelines for better onboarding (#3819)
michaelfeil Apr 3, 2024
3dcb3e8
[3/N] Refactor scheduler for chunked prefill scheduling (#3550)
rkooo567 Apr 3, 2024
2ff767b
Enable scaled FP8 (e4m3fn) KV cache on ROCm (AMD GPU) (#3290)
AdrianAbeyta Apr 3, 2024
b95047f
[Misc] Publish 3rd meetup slides (#3835)
WoosukKwon Apr 3, 2024
294f8f6
[BugFix] Pass tokenizer_config to local_tokenizer_group (#3754)
sighingnow Apr 4, 2024
537ee25
[Core] Enable hf_transfer by default if available (#3817)
michaelfeil Apr 4, 2024
498eb5c
[Bugfix] Add kv_scale input parameter to CPU backend (#3840)
WoosukKwon Apr 4, 2024
aabe8f4
[Core] [Frontend] Make detokenization optional (#3749)
mgerstgrasser Apr 4, 2024
819a309
[Bugfix] Fix args in benchmark_serving (#3836)
CatherineSue Apr 4, 2024
b778200
[Benchmark] Refactor sample_requests in benchmark_throughput (#3613)
gty111 Apr 4, 2024
ca81ff5
[Core] manage nccl via a pypi package & upgrade to pt 2.2.1 (#3805)
youkaichao Apr 4, 2024
db2a6a4
[Hardware][CPU] Update cpu torch to match default of 2.2.1 (#3854)
mgoin Apr 4, 2024
9117f89
[Model] Cohere CommandR+ (#3829)
saurabhdash2512 Apr 4, 2024
c391e4b
[Core] improve robustness of pynccl (#3860)
youkaichao Apr 4, 2024
78107fa
[Doc]Add asynchronous engine arguments to documentation. (#3810)
SeanGallen Apr 5, 2024
d03d64f
[CI/Build] refactor dockerfile & fix pip cache
youkaichao Apr 5, 2024
e5043a3
[Misc] Add pytest marker to opt-out of global test cleanup (#3863)
cadedaniel Apr 5, 2024
e0dd4d3
[Misc] Fix linter issues in examples/fp8/quantizer/quantize.py (#3864)
cadedaniel Apr 5, 2024
9edec65
[Bugfix] Fixing requirements.txt (#3865)
noamgat Apr 5, 2024
cfaf49a
[Misc] Define common requirements (#3841)
WoosukKwon Apr 5, 2024
1d7c940
Add option to completion API to truncate prompt tokens (#3144)
tdoublep Apr 5, 2024
18de883
[Chunked Prefill][4/n] Chunked prefill scheduler. (#3853)
rkooo567 Apr 5, 2024
54951ac
[Bugfix] Fix incorrect output on OLMo models in Tensor Parallelism (#…
Isotr0py Apr 5, 2024
e4be7d7
[CI/Benchmark] add more iteration and use median for robust latency b…
youkaichao Apr 6, 2024
95baec8
[Core] enable out-of-tree model register (#3871)
youkaichao Apr 7, 2024
2f19283
[Core] latency optimization (#3890)
youkaichao Apr 7, 2024
0ce0539
[Bugfix] Fix Llava inference with Tensor Parallelism. (#3883)
Isotr0py Apr 7, 2024
b4543c8
[Model] add minicpm (#3893)
SUDA-HLT-ywfang Apr 8, 2024
52d61ba
init
SageMoore Apr 8, 2024
ae82a4e
format
SageMoore Apr 8, 2024
a4c891a
update dockerfile
SageMoore Apr 8, 2024
d3a00f9
update github actions
SageMoore Apr 8, 2024
ae21cef
update xformers
SageMoore Apr 9, 2024
aab3f5b
fix scheduler tests
SageMoore Apr 9, 2024
61ef56c
updated setup.py, collect_env.py, and the docker file to use magic-wa…
SageMoore Apr 9, 2024
33415d0
updated requirements-benchmark.txt to use nm-magic-wand-nightly
SageMoore Apr 9, 2024
7020a7c
updated dockerfile to use external magic-wand-nightly
SageMoore Apr 9, 2024
2763ce3
format
SageMoore Apr 9, 2024
d4e55ba
add test_compressed_memory to skipped list
SageMoore Apr 10, 2024
7 changes: 5 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -34,7 +34,10 @@ steps:
command: pytest -v -s engine tokenization test_sequence.py test_config.py

- label: Entrypoints Test
command: pytest -v -s entrypoints
commands:
# these tests have to be separated, because each one will allocate all possible GPU memory
- pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py
- pytest -v -s entrypoints/test_server_oot_registration.py

- label: Examples Test
working_dir: "/vllm-workspace/examples"
@@ -90,7 +93,7 @@ steps:
- bash run-benchmarks.sh

- label: Documentation Build
working_dir: "/vllm-workspace/docs"
working_dir: "/vllm-workspace/test_docs/docs"
no_gpu: True
commands:
- pip install -r requirements-docs.txt
2 changes: 1 addition & 1 deletion .github/workflows/publish.yml
@@ -49,7 +49,7 @@ jobs:
matrix:
os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11']
pytorch-version: ['2.1.2'] # Must be the most recent version that meets requirements.txt.
pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']

steps:
2 changes: 1 addition & 1 deletion .github/workflows/scripts/build.sh
@@ -9,7 +9,7 @@ LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH

# Install requirements
$python_executable -m pip install wheel packaging
$python_executable -m pip install -r requirements.txt
$python_executable -m pip install -r requirements-cuda.txt

# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
1 change: 1 addition & 0 deletions .gitignore
@@ -183,6 +183,7 @@ _build/
# hip files generated by PyTorch
*.hip
*_hip*
hip_compat.h

# Benchmark dataset
*.json
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -19,7 +19,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")

# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")

#
# Supported/expected torch versions for CUDA/ROCm.
@@ -31,7 +31,7 @@ set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.1.2")
set(TORCH_SUPPORTED_VERSION_CUDA "2.2.1")
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")

3 changes: 2 additions & 1 deletion CONTRIBUTING.md
@@ -21,7 +21,6 @@ Express your support on Twitter if vLLM aids you, or simply offer your appreciat
### Build from source

```bash
pip install -r requirements.txt
pip install -e . # This may take several minutes.
```

@@ -30,6 +29,8 @@ pip install -e . # This may take several minutes.
```bash
pip install -r requirements-dev.txt

# linting and formatting
bash format.sh
# Static type checking
mypy
# Unit tests
84 changes: 66 additions & 18 deletions Dockerfile
@@ -2,6 +2,7 @@
# to run the OpenAI compatible server.

#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev

RUN apt-get update -y \
@@ -16,18 +17,26 @@ RUN ldconfig /usr/local/cuda-12.1/compat/
WORKDIR /workspace

# install build and runtime dependencies
COPY requirements.txt requirements.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
pip install -r requirements-cuda.txt

# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt

# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################


#################### EXTENSION BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM dev AS build

# install build dependencies
@@ -38,18 +47,16 @@ RUN --mount=type=cache,target=/root/.cache/pip \
# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache

# copy input files
# files and directories related to build wheels
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements.txt requirements.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
Review thread on this line:
Member: do we need "requirements-dev.txt" or is this file being deprecated?
Author: Not sure. I'll check to see if it looks like we need anything in there.

COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py
COPY vllm vllm

# cuda arch list used by torch
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
@@ -61,7 +68,15 @@ ENV VLLM_INSTALL_PUNICA_KERNELS=1

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
python3 setup.py build_ext --inplace
--mount=type=cache,target=/root/.cache/pip \
python3 setup.py bdist_wheel --dist-dir=dist

# the `vllm_nccl` package must be installed from source distribution
# pip is too smart to store a wheel in the cache, and other CI jobs
# will directly use the wheel from the cache, which is not what we want.
# we need to remove it manually
RUN --mount=type=cache,target=/root/.cache/pip \
pip cache remove vllm_nccl*
#################### EXTENSION Build IMAGE ####################

#################### FLASH_ATTENTION Build IMAGE ####################
@@ -79,17 +94,36 @@ WORKDIR /usr/src/flash-attention-v2
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
--no-build-isolation --no-deps --no-cache-dir

#################### FLASH_ATTENTION Build IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
WORKDIR /vllm-workspace

RUN apt-get update -y \
&& apt-get install -y python3-pip git vim

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/

# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
pip install dist/*.whl --verbose

RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
--mount=type=cache,target=/root/.cache/pip \
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir

#################### TEST IMAGE ####################
# image to run unit testing suite
FROM dev AS test
# note that this uses vllm installed by `pip`
FROM vllm-base AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
WORKDIR /vllm-workspace
# ADD is used to preserve directory structure
ADD . /vllm-workspace/
<<<<<<< HEAD
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# Install flash attention (from pre-built wheel)
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
@@ -124,17 +158,31 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install nm-magic-wand

#################### RUNTIME BASE IMAGE ####################
=======

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
>>>>>>> upstream/main

# doc requires source code
# we hide them inside `test_docs/` , so that this source code
# will not be imported by other tests
RUN mkdir test_docs
RUN mv docs test_docs/
RUN mv vllm test_docs/

#################### TEST IMAGE ####################
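A side note on the `test_docs/` shuffle above: it works because a `vllm/` source tree sitting in the working directory would shadow the wheel installed into site-packages, since Python puts the script directory (or the current directory, for `-c`/interactive use) at the front of `sys.path`. A minimal illustrative check, assuming a vLLM wheel is already installed (this snippet is not part of the diff):

```python
# Run this once from an empty directory and once from a checkout that still
# contains a vllm/ source tree to see which copy gets imported.
import sys
import vllm

print(sys.path[0])    # script directory, or '' (the CWD) for `python -c` / interactive use
print(vllm.__file__)  # a site-packages path for the wheel, or ./vllm/... when shadowed
```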

#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer modelscope

COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm
ENV VLLM_USAGE_SOURCE production-docker-image

ENV VLLM_USAGE_SOURCE production-docker-image

3 changes: 2 additions & 1 deletion MANIFEST.in
@@ -1,5 +1,6 @@
include LICENSE
include requirements.txt
include requirements-common.txt
include requirements-cuda.txt
include CMakeLists.txt

recursive-include licenses *
2 changes: 0 additions & 2 deletions README.md
@@ -6,8 +6,6 @@

## Installation

The [nm-vllm PyPi package](https://pypi.org/project/nm-vllm/) includes pre-compiled binaries for CUDA (version 12.1) kernels, streamlining the setup process. For other PyTorch or CUDA versions, please compile the package from source.

Install it using pip:
```bash
pip install nm-vllm
32 changes: 28 additions & 4 deletions benchmarks/benchmark_latency.py
@@ -24,6 +24,7 @@ def main(args: argparse.Namespace):
dtype=args.dtype,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
enable_chunked_prefill=args.enable_chunked_prefill,
@@ -67,7 +68,8 @@ def run_to_completion(profile_dir: Optional[str] = None):
return latency

print("Warming up...")
run_to_completion(profile_dir=None)
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
run_to_completion(profile_dir=None)

if args.profile:
profile_dir = args.profile_result_dir
@@ -83,7 +85,12 @@ def run_to_completion(profile_dir: Optional[str] = None):
latencies = []
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile_dir=None))
latencies = np.array(latencies)
percentages = [10, 25, 50, 75, 90]
percentiles = np.percentile(latencies, percentages)
print(f'Avg latency: {np.mean(latencies)} seconds')
for percentage, percentile in zip(percentages, percentiles):
print(f'{percentage}% percentile latency: {percentile} seconds')


if __name__ == '__main__':
@@ -105,9 +112,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
default=1,
help='Number of generated sequences per prompt.')
parser.add_argument('--use-beam-search', action='store_true')
parser.add_argument('--num-iters-warmup',
type=int,
default=10,
help='Number of iterations to run for warmup.')
parser.add_argument('--num-iters',
type=int,
default=3,
default=30,
help='Number of iterations to run.')
parser.add_argument('--trust-remote-code',
action='store_true',
@@ -127,10 +138,23 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=['auto', 'fp8_e5m2'],
choices=['auto', 'fp8'],
default='auto',
help=
'Data type for kv cache storage. If "auto", will use model data type.')
'Data type for kv cache storage. If "auto", will use model data type. '
'FP8_E5M2 (without scaling) is only supported on cuda version greater '
'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
'common inference criteria.')
parser.add_argument(
'--quantization-param-path',
type=str,
default=None,
help='Path to the JSON file containing the KV cache scaling factors. '
'This should generally be supplied, when KV cache dtype is FP8. '
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.')
parser.add_argument(
'--profile',
action='store_true',
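For context on the reporting change in this file: the new summary block is a plain NumPy mean/percentile computation over the per-iteration latencies, and with the defaults now at 10 warmup plus 30 measured iterations the percentiles are far more stable than the old 3-iteration average. A self-contained sketch with made-up latency values (not taken from any real run):

```python
import numpy as np

# Hypothetical per-iteration latencies in seconds, standing in for the values
# collected from run_to_completion() in the measurement loop.
latencies = np.array([0.91, 0.93, 0.95, 1.02, 1.10, 1.25])

percentages = [10, 25, 50, 75, 90]
percentiles = np.percentile(latencies, percentages)  # interpolates between samples

print(f'Avg latency: {np.mean(latencies)} seconds')
for percentage, percentile in zip(percentages, percentiles):
    print(f'{percentage}% percentile latency: {percentile} seconds')
```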
23 changes: 13 additions & 10 deletions benchmarks/benchmark_serving.py
@@ -112,7 +112,9 @@ def sample_sonnet_requests(
prefix_len: int,
tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, str, int, int]]:
assert input_len > prefix_len, "input_len must be greater than prefix_len."
assert (
input_len > prefix_len
), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."

# Load the dataset.
with open(dataset_path) as f:
@@ -133,16 +135,17 @@ def sample_sonnet_requests(
base_message, add_generation_prompt=True, tokenize=False)
base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)

assert (input_len > base_prompt_offset
), f"Please set 'args.input-len' higher than {base_prompt_offset}."
assert (
input_len > base_prompt_offset
), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}."
num_input_lines = round(
(input_len - base_prompt_offset) / average_poem_len)

# First approximately `prefix_len` number of tokens in the
# prompt are fixed poem lines.
assert (
prefix_len > base_prompt_offset
), f"Please set 'args.prefix-len' higher than {base_prompt_offset}."
), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}."

num_prefix_lines = round(
(prefix_len - base_prompt_offset) / average_poem_len)
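To make the guarded arithmetic above concrete, here is a rough, self-contained sketch of the line-count computation with made-up numbers; in the real script, base_prompt_offset and average_poem_len come from the tokenizer and the sonnet dataset rather than being fixed constants:

```python
# Hypothetical values, for illustration only.
input_len = 550          # requested prompt length in tokens (args.sonnet_input_len)
prefix_len = 200         # requested shared-prefix length in tokens (args.sonnet_prefix_len)
base_prompt_offset = 30  # tokens the chat template adds around the poem lines
average_poem_len = 10.0  # average tokens per poem line in the dataset

# The assertions in the diff guard exactly these inequalities.
assert input_len > prefix_len
assert input_len > base_prompt_offset
assert prefix_len > base_prompt_offset

num_input_lines = round((input_len - base_prompt_offset) / average_poem_len)    # 52
num_prefix_lines = round((prefix_len - base_prompt_offset) / average_poem_len)  # 17
```

The reworded assertion messages simply point users at the sonnet-specific flag names that main() now reads, as shown in the hunks below.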
@@ -375,9 +378,9 @@ def main(args: argparse.Namespace):
input_requests = sample_sonnet_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
input_len=args.input_len,
output_len=args.output_len,
prefix_len=args.prefix_len,
input_len=args.sonnet_input_len,
output_len=args.sonnet_output_len,
prefix_len=args.sonnet_prefix_len,
tokenizer=tokenizer,
)
input_requests = [(prompt, prompt_len, output_len)
@@ -390,9 +393,9 @@ def main(args: argparse.Namespace):
input_requests = sample_sonnet_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
input_len=args.input_len,
output_len=args.output_len,
prefix_len=args.prefix_len,
input_len=args.sonnet_input_len,
output_len=args.sonnet_output_len,
prefix_len=args.sonnet_prefix_len,
tokenizer=tokenizer,
)
input_requests = [(prompt_formatted, prompt_len, output_len)