diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index c331a9c49c0d0..2dbeee8562971 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -46,7 +46,7 @@ docker exec cpu-test bash -c " docker exec cpu-test bash -c " export VLLM_CPU_KVCACHE_SPACE=10 export VLLM_CPU_OMP_THREADS_BIND=48-92 - python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & + python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 python3 benchmarks/benchmark_serving.py \ --backend vllm \ diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 9444dc43ea97e..1eb749f64d36b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -321,7 +321,6 @@ steps: - tests/models/decoder_only/language commands: - pytest -v -s models/decoder_only/language/test_models.py - - pytest -v -s models/decoder_only/language/test_big_models.py - label: Decoder-only Language Models Test (Extended) # 1h20min nightly: true @@ -329,7 +328,7 @@ steps: - vllm/ - tests/models/decoder_only/language commands: - - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py + - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py - label: Decoder-only Multi-Modal Models Test (Standard) #mirror_hardwares: [amd] diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index b80749aaa8fec..5eddf6b7c649b 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -6,12 +6,14 @@ on: paths: - '.github/workflows/*.ya?ml' - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' pull_request: branches: - "main" paths: - '.github/workflows/*.ya?ml' - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' env: LC_ALL: en_US.UTF-8 diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index 68d60d7365ed1..167c115d8956f 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -6,9 +6,21 @@ on: push: branches: - main + paths: + - '**/*.h' + - '**/*.cpp' + - '**/*.cu' + - '**/*.cuh' + - '.github/workflows/clang-format.yml' pull_request: branches: - main + paths: + - '**/*.h' + - '**/*.cpp' + - '**/*.cu' + - '**/*.cuh' + - '.github/workflows/clang-format.yml' jobs: clang-format: diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 5f1e5f8eeaf7d..18b354948f0cc 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -6,9 +6,19 @@ on: push: branches: - main + paths: + - '**/*.py' + - '.github/workflows/mypy.yaml' + - 'tools/mypy.sh' + - 'pyproject.toml' pull_request: branches: - main + paths: + - '**/*.py' + - '.github/workflows/mypy.yaml' + - 'tools/mypy.sh' + - 'pyproject.toml' jobs: mypy: diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 9cc8a9e914474..197f918765e7d 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -6,16 +6,28 @@ on: push: branches: - main + paths: + - "**/*.py" + - pyproject.toml + - requirements-lint.txt + - .github/workflows/matchers/ruff.json + - .github/workflows/ruff.yml pull_request: branches: - main + paths: + - "**/*.py" + - pyproject.toml + - requirements-lint.txt + - .github/workflows/matchers/ruff.json + - 
.github/workflows/ruff.yml jobs: ruff: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.12"] steps: - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} @@ -30,9 +42,6 @@ jobs: run: | echo "::add-matcher::.github/workflows/matchers/ruff.json" ruff check --output-format github . - - name: Spelling check with codespell - run: | - codespell --toml pyproject.toml - name: Run isort run: | isort . --check-only diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index 9f06b35c19e32..35579302c5c14 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -6,15 +6,22 @@ on: push: branches: - main + paths: + - "**/*.py" + - .github/workflows/yapf.yml pull_request: branches: - main + paths: + - "**/*.py" + - .github/workflows/yapf.yml + jobs: yapf: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.12"] steps: - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} diff --git a/Dockerfile.cpu b/Dockerfile.cpu index f1a21d6bd13fc..287b4958da4e5 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li RUN echo 'ulimit -c 0' >> ~/.bashrc -RUN pip install intel_extension_for_pytorch==2.4.0 +RUN pip install intel_extension_for_pytorch==2.5.0 WORKDIR /workspace diff --git a/Dockerfile.tpu b/Dockerfile.tpu index b43442e4c0af1..0a507b6ecdf60 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -9,12 +9,6 @@ RUN apt-get update && apt-get install -y \ git \ ffmpeg libsm6 libxext6 libgl1 -# Install the TPU and Pallas dependencies. -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html - # Build vLLM. COPY . . ARG GIT_REPO_CHECK=0 @@ -25,7 +19,6 @@ ENV VLLM_TARGET_DEVICE="tpu" RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ python3 -m pip install \ - 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ -r requirements-tpu.txt RUN python3 setup.py develop diff --git a/benchmarks/README.md b/benchmarks/README.md index 192d6c4022c83..2aa4a285021f1 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -6,3 +6,14 @@ You can download the dataset by running: ```bash wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json ``` + +## Downloading the ShareGPT4V dataset + +The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts +will ignore a datapoint if the referred image is missing. 
+```bash +wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json +mkdir coco -p +wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip +unzip coco/train2017.zip -d coco/ +``` diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 262b8652e49ff..159cf055737ce 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -8,6 +8,7 @@ import torch import uvloop +from PIL import Image from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) @@ -38,12 +39,33 @@ class SampleRequest: multi_modal_data: Optional[MultiModalDataDict] = None -def sample_requests( - dataset_path: str, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int], -) -> List[SampleRequest]: +def _get_prompt_for_image_model(question: str, *, model: str) -> str: + """Prepend and append special tokens around the question to form a prompt. + + Args: + question: The input question text to wrap with special tokens + model: The name of the model being used, to determine which special + tokens to add + + Returns: + The formatted prompt string with appropriate special tokens for the + model + + Raises: + ValueError: If an unsupported model name is provided + """ + model = model.lower() + if "pixtral" in model: + return f"[INST]{question}\n[IMG][/INST]" + raise ValueError(f"Unsupported model {model}") + + +def sample_requests(tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace) -> List[SampleRequest]: + dataset_path: str = args.dataset + num_requests: int = args.num_prompts + fixed_output_len: Optional[int] = args.output_len + model: str = args.model if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -52,23 +74,36 @@ def sample_requests( dataset = json.load(f) # Filter out the conversations with less than 2 turns. dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. - dataset = [(data["conversations"][0]["value"], - data["conversations"][1]["value"]) for data in dataset] - # Shuffle the dataset. random.shuffle(dataset) # Filter out sequences that are too long or too short filtered_dataset: List[SampleRequest] = [] - for i in range(len(dataset)): + for data in dataset: if len(filtered_dataset) == num_requests: break + # Only keep the first two turns of each conversation. + prompt = data["conversations"][0]["value"] + completion = data["conversations"][1]["value"] + + multi_modal_data: Optional[MultiModalDataDict] = None + if "image" in data: + multi_modal_data = multi_modal_data or {} + image_path = data["image"] + # TODO(vllm-project/vllm/issues/9778): Support multiple images. + assert isinstance(image_path, + str), "Only support single image input" + try: + multi_modal_data["image"] = Image.open(image_path).convert( + "RGB") + except FileNotFoundError: + # Ignore datapoint where asset is missing + continue + prompt = _get_prompt_for_image_model(question=prompt, model=model) + # Tokenize the prompts and completions. 
- prompt = dataset[i][0] prompt_token_ids = tokenizer(prompt).input_ids - completion = dataset[i][1] completion_token_ids = tokenizer(completion).input_ids prompt_len = len(prompt_token_ids) output_len = len(completion_token_ids @@ -82,7 +117,8 @@ def sample_requests( filtered_dataset.append( SampleRequest(prompt=prompt, prompt_len=prompt_len, - expected_output_len=output_len)) + expected_output_len=output_len, + multi_modal_data=multi_modal_data)) return filtered_dataset @@ -99,7 +135,9 @@ def run_vllm( prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] for request in requests: - prompts.append(TextPrompt(prompt=request.prompt)) + prompts.append( + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, @@ -148,7 +186,9 @@ async def run_vllm_async( prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] for request in requests: - prompts.append(TextPrompt(prompt=request.prompt)) + prompts.append( + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, @@ -272,9 +312,10 @@ def main(args: argparse.Namespace): for _ in range(args.num_prompts) ] else: - requests = sample_requests(args.dataset, args.num_prompts, tokenizer, - args.output_len) + requests = sample_requests(tokenizer, args) + is_multi_modal = any(request.multi_modal_data is not None + for request in requests) if args.backend == "vllm": if args.async_engine: elapsed_time = uvloop.run( @@ -300,6 +341,11 @@ def main(args: argparse.Namespace): for request in requests) total_output_tokens = sum(request.expected_output_len for request in requests) + if is_multi_modal: + print("\033[91mWARNING\033[0m: Multi-modal request detected. The " + "following metrics are not accurate because image tokens are not" + " counted. See vllm-project/vllm/issues/9778 for details.") + # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_output_tokens / elapsed_time:.2f} output tokens/s") diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 7237d246ddf55..776a0bb11ae64 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -18,6 +18,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc") # list(APPEND CXX_COMPILE_FLAGS "-fopenmp" + "-mf16c" "-DVLLM_CPU_EXTENSION") execute_process(COMMAND cat /proc/cpuinfo diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index abb4e3bea14bb..e3953c7c45719 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -22,6 +22,16 @@ struct KernelVecType { using v_load_vec_type = vec_op::FP32Vec16; }; +template <> +struct KernelVecType { + using q_load_vec_type = vec_op::FP16Vec8; + using q_vec_type = vec_op::FP32Vec16; + using k_load_vec_type = vec_op::FP16Vec16; + using k_vec_type = vec_op::FP32Vec16; + using qk_acc_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::FP16Vec16; +}; + #ifdef __AVX512BF16__ template <> struct KernelVecType { diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index a325153b470cc..12d5757b495be 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -11,10 +11,10 @@ static_assert(false, "AVX2 must be supported for the current implementation."); namespace vec_op { -// FIXME: FP16 is not fully supported in Torch-CPU #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) 
\ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) @@ -50,37 +50,37 @@ template struct Vec { struct FP32Vec8; struct FP32Vec16; -#ifdef __AVX512FP16__ struct FP16Vec8 : public Vec { constexpr static int VEC_ELEM_NUM = 8; - __m128h reg; + __m128i reg; - explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {} + explicit FP16Vec8(const void *ptr) + : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} - explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {} + explicit FP16Vec8(const FP32Vec8 &); - explicit FP16Vec8(__m128h data) : reg(data) {} + void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } +}; - FP16Vec8 operator*(const FP16Vec8 &b) const { - return FP16Vec8(_mm_mul_ph(reg, b.reg)); - } +struct FP16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; - FP16Vec8 operator+(const FP16Vec8 &b) const { - return FP16Vec8(_mm_add_ph(reg, b.reg)); - } + __m256i reg; - FP16Vec8 operator-(const FP16Vec8 &b) const { - return FP16Vec8(_mm_sub_ph(reg, b.reg)); - } + explicit FP16Vec16(const void *ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} - FP16Vec8 operator/(const FP16Vec8 &b) const { - return FP16Vec8(_mm_div_ph(reg, b.reg)); - } + explicit FP16Vec16(const FP32Vec16 &); - void save(void *ptr) const { _mm_storeu_ph(ptr, reg); } + void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } + + void save(void* ptr, const int elem_num) const { + constexpr uint32_t M = 0xFFFFFFFF; + __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); + _mm256_mask_storeu_epi16(ptr, mask, reg); + } }; -#endif struct BF16Vec8 : public Vec { constexpr static int VEC_ELEM_NUM = 8; @@ -202,9 +202,7 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} -#ifdef __AVX512FP16__ - explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {} -#endif + explicit FP32Vec8(const FP16Vec8 &v) : reg(_mm256_cvtph_ps(v.reg)) {} explicit FP32Vec8(const BF16Vec8 &v) : reg(_mm256_castsi256_ps( @@ -323,6 +321,10 @@ struct FP32Vec16 : public Vec { : reg(_mm512_castsi512_ps( _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} + explicit FP32Vec16(const FP16Vec16 &v) : reg(_mm512_cvtph_ps(v.reg)) {} + + explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} explicit FP32Vec16(const INT32Vec16 &v) @@ -534,24 +536,34 @@ template using vec_t = typename VecType::vec_type; template <> struct VecType { using vec_type = FP32Vec8; }; -#ifdef __AVX512FP16__ -template <> struct VecType { using vec_type = FP16Vec16; }; -#endif +template <> struct VecType { using vec_type = FP16Vec8; }; template <> struct VecType { using vec_type = BF16Vec8; }; template void storeFP32(float v, T *ptr) { *ptr = v; } -#ifdef __AVX512FP16__ -template <> inline void storeFP32(float v, c10::Half *ptr) { - *reinterpret_cast<_Float16 *>(ptr) = v; -} -#endif - inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { acc = acc + a * b; } +template <> inline void storeFP32(float v, c10::Half *ptr) { + *reinterpret_cast(ptr) = + _cvtss_sh(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} + +inline FP16Vec8::FP16Vec8(const FP32Vec8 &v) + : reg(_mm256_cvtps_ph(v.reg, 
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} + +#ifdef __AVX512F__ +inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) + : reg(_mm512_cvtps_ph(v.reg, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} +#else +inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) + : reg(_mm256_insertf128_si256(_mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} +#endif + #ifdef __AVX512BF16__ template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); diff --git a/csrc/cpu/dnnl_helper.hpp b/csrc/cpu/dnnl_helper.hpp index 024ad4ae43da8..8b5011dc065f0 100644 --- a/csrc/cpu/dnnl_helper.hpp +++ b/csrc/cpu/dnnl_helper.hpp @@ -2,6 +2,7 @@ #define DNNL_HELPER_HPP #include +#include #include "oneapi/dnnl/dnnl.hpp" @@ -32,6 +33,11 @@ struct DNNLType { static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16; }; +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16; +}; + template constexpr inline dnnl::memory::data_type get_dnnl_type() { return DNNLType>::type; diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp index b493fd793818a..f42fa2361a2db 100644 --- a/csrc/cpu/quant.cpp +++ b/csrc/cpu/quant.cpp @@ -23,6 +23,13 @@ struct KernelVecType { using cvt_vec_type = vec_op::FP32Vec16; }; +template <> +struct KernelVecType { + using load_vec_type = vec_op::FP16Vec16; + using azp_adj_load_vec_type = vec_op::INT32Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; + #ifdef __AVX512F__ template void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index d12aeebbbc184..69530fd778c55 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -3,13 +3,13 @@ Installation with CPU ======================== -vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32 and BF16. vLLM CPU backend supports the following vLLM features: +vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features: - Tensor Parallel (``-tp = N``) - Quantization (``INT8 W8A8, AWQ``) .. note:: - FP16 data type and more advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon. + More advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon. Table of contents: @@ -72,8 +72,6 @@ Build from source $ VLLM_TARGET_DEVICE=cpu python setup.py install .. note:: - - BF16 is the default data type in the current CPU backend (that means the backend will cast FP16 to BF16), and is compatible will all CPUs with AVX512 ISA support. - - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. 
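For reference, here is a minimal sketch of exercising the FP16 data type that this change enables on the CPU backend. It assumes a CPU build of vLLM is installed; `facebook/opt-125m` is the same model used in the updated CI script, and `dtype="half"` selects FP16:

```python
from vllm import LLM, SamplingParams

# Minimal offline-inference check of the FP16 ("half") path on the CPU backend.
# Assumes vLLM was built with VLLM_TARGET_DEVICE=cpu as described above.
llm = LLM(model="facebook/opt-125m", dtype="half")
params = SamplingParams(temperature=0.0, max_tokens=16)
outputs = llm.generate(["The capital of France is"], params)
print(outputs[0].outputs[0].text)
```

The updated `.buildkite/run-cpu-test.sh` exercises the same path on the serving side by passing `--dtype half` to the OpenAI-compatible API server.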
diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index f0c812b941c1f..75ab2b6ba02dc 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -119,27 +119,19 @@ Uninstall the existing `torch` and `torch_xla` packages: pip uninstall torch torch-xla -y -Install `torch` and `torch_xla` +Install build dependencies: .. code-block:: bash - pip install --pre torch==2.6.0.dev20241028+cpu torchvision==0.20.0.dev20241028+cpu --index-url https://download.pytorch.org/whl/nightly/cpu - pip install 'torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl' -f https://storage.googleapis.com/libtpu-releases/index.html + pip install -r requirements-tpu.txt + sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev -Install JAX and Pallas: +Run the setup script: .. code-block:: bash - pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html - pip install jaxlib==0.4.32.dev20240829 jax==0.4.32.dev20240829 -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html - -Install other build dependencies: + VLLM_TARGET_DEVICE="tpu" python setup.py develop -.. code-block:: bash - - pip install -r requirements-tpu.txt - VLLM_TARGET_DEVICE="tpu" python setup.py develop - sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev Provision Cloud TPUs with GKE ----------------------------- @@ -168,45 +160,6 @@ Run the Docker image with the following command: $ # Make sure to add `--privileged --net host --shm-size=16G`. $ docker run --privileged --net host --shm-size=16G -it vllm-tpu - -.. _build_from_source_tpu: - -Build from source ------------------ - -You can also build and install the TPU backend from source. - -First, install the dependencies: - -.. code-block:: console - - $ # (Recommended) Create a new conda environment. - $ conda create -n myenv python=3.10 -y - $ conda activate myenv - - $ # Clean up the existing torch and torch-xla packages. - $ pip uninstall torch torch-xla -y - - $ # Install PyTorch and PyTorch XLA. - $ export DATE="20241017" - $ export TORCH_VERSION="2.6.0" - $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl - $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl - - $ # Install JAX and Pallas. - $ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html - $ pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html - - $ # Install other build dependencies. - $ pip install -r requirements-tpu.txt - - -Next, build vLLM from source. This will only take a few seconds: - -.. code-block:: console - - $ VLLM_TARGET_DEVICE="tpu" python setup.py develop - .. note:: Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each different shape. 
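As a quick sanity check that the pinned `torch`/`torch_xla` wheels from `requirements-tpu.txt` resolved correctly, a short sketch (assumes it is run on a provisioned TPU VM; `torch_xla.core.xla_model` is the standard torch_xla device API):

```python
# Verify the pinned torch / torch_xla nightlies and confirm an XLA (TPU) device is visible.
import torch
import torch_xla.core.xla_model as xm

print(torch.__version__)  # expected to report the pinned nightly build
print(xm.xla_device())    # e.g. "xla:0" on a TPU VM
```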
diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 27ca8ca5dbc58..749b03a0603d8 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -2,5 +2,5 @@ -r requirements-common.txt # Dependencies for x86_64 CPUs -torch == 2.4.0+cpu; platform_machine != "ppc64le" +torch == 2.5.1+cpu; platform_machine != "ppc64le" torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 4c606cf0a9105..f9a0770804e55 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -2,6 +2,22 @@ -r requirements-common.txt # Dependencies for TPU -# Currently, the TPU backend uses a nightly version of PyTorch XLA. -# You can install the dependencies in Dockerfile.tpu. +cmake>=3.26 +ninja +packaging +setuptools-scm>=8 +wheel +jinja2 ray[default] + +# Install torch_xla +--pre +--extra-index-url https://download.pytorch.org/whl/nightly/cpu +--find-links https://storage.googleapis.com/libtpu-releases/index.html +--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html +--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html +torch==2.6.0.dev20241028+cpu +torchvision==0.20.0.dev20241028+cpu +torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241028-cp310-cp310-linux_x86_64.whl +jaxlib==0.4.32.dev20240829 +jax==0.4.32.dev20240829 diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 79647589d5204..7f16baa65a644 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -156,3 +156,29 @@ def test_model_with_failure(vllm_runner) -> None: ModelInputForGPUWithSamplingMetadata) finally: os.remove(filename) + + +def test_failure_with_async_out_proc(vllm_runner) -> None: + + filename = None + try: + with vllm_runner("facebook/opt-125m", + dtype="half", + enforce_eager=False, + gpu_memory_utilization=0.7) as vllm_model,\ + patch("vllm.model_executor.models.opt.OPTForCausalLM.forward", + side_effect=ValueError()): + model_config = vllm_model.model.llm_engine.model_config + assert model_config.use_async_output_proc + with pytest.raises(ValueError) as exc_info: + vllm_model.generate_greedy('how to make pizza?', 250) + matches = re.search(r"input dumped to (.+).pkl", + str(exc_info.value)) + assert matches is not None + + filename = f"{matches.group(1)}.pkl" + finally: + # Clean up + if filename is not None: + os.remove(filename) + pass diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index e3e5a7d0fc5a5..9c65059c6b348 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -1,6 +1,10 @@ """ Test the piecewise compilation with a simple model, comparing the output with and without the piecewise compilation. + +This is a tractable model, the weights and computation are specially designed +if the config `tractable_init` is set to True. Otherwise, the weights are +initialized randomly with a fixed seed. 
""" import os from dataclasses import dataclass @@ -49,6 +53,12 @@ class LlamaConfig: mlp_size: int = 256 vocab_size: int = 128 num_layers: int = 2 + init_value: float = 1.0 + tractable_init: bool = False + random_seed: int = 0 + + def __post_init__(self): + assert self.mlp_size >= self.hidden_size class LlamaMLP(nn.Module): @@ -66,10 +76,23 @@ def __init__(self, config: LlamaConfig) -> None: bias=False, ) - self.gate_up_projection.weight.data.fill_(0.0) - self.down_projection.weight.data.fill_(0.0) + if config.tractable_init: + nn.init.eye_(self.gate_up_projection.weight.data[:config.mlp_size]) + nn.init.eye_(self.gate_up_projection.weight.data[config.mlp_size:]) + nn.init.eye_(self.down_projection.weight.data) + else: + nn.init.xavier_normal_(self.gate_up_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) + nn.init.xavier_normal_(self.down_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) def forward(self, x): + # for tractable_init and positive input, this is + # essentially an elementwise-square x = self.gate_up_projection(x) x = x[:, :x.size(1) // 2] * torch.nn.functional.relu( x[:, x.size(1) // 2:]) @@ -84,21 +107,39 @@ def __init__(self, config: LlamaConfig) -> None: self.qkv_projection = nn.Linear( in_features=config.hidden_size, out_features=config.hidden_size * 3, + bias=False, ) self.output_projection = nn.Linear( in_features=config.hidden_size, out_features=config.hidden_size, + bias=False, ) - self.qkv_projection.weight.data.fill_(0.0) - self.output_projection.weight.data.fill_(0.0) + if config.tractable_init: + nn.init.eye_(self.qkv_projection.weight.data[:config.hidden_size]) + nn.init.eye_(self.qkv_projection.weight.data[config.hidden_size:2 * + config.hidden_size]) + nn.init.eye_(self.qkv_projection.weight.data[2 * + config.hidden_size:]) + nn.init.eye_(self.output_projection.weight.data) + else: + nn.init.xavier_normal_(self.qkv_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) + nn.init.xavier_normal_(self.output_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, ) -> torch.Tensor: + # for tractable_init, this is: + # output = (hidden_states * 3 + positions * 2) qkv = self.qkv_projection(hidden_states) hidden_size = qkv.size(-1) // 3 q, k, v = qkv.split([hidden_size, hidden_size, hidden_size], dim=-1) @@ -126,20 +167,29 @@ def forward( hidden_states: torch.Tensor, residual: Optional[torch.Tensor], ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + For tractable computation: + - if residual is None, the outputs are: + - residual = (hidden_states + 1) * 3 + positions * 2 + hidden_states = hidden_states * 4 + positions * 2 + 3 + - hidden_states = (residual + 1) ** 2 + - if residual is not None, the outputs are: + - residual = (hidden_states + residual + 1) * 3 + positions * 2 + hidden_states + residual = (hidden_states + residual) * 4 + positions * 2 + 3 + - hidden_states = (residual + 1) ** 2 + """ # noqa if residual is None: residual = hidden_states - hidden_states = hidden_states / 2 + hidden_states = hidden_states + 1 else: hidden_states = hidden_states + residual residual = hidden_states - hidden_states = hidden_states / 2 + hidden_states = hidden_states + 1 hidden_states = self.self_attention(positions=positions, hidden_states=hidden_states) hidden_states = hidden_states + residual 
residual = hidden_states - hidden_states = hidden_states / 2 + hidden_states = hidden_states + 1 hidden_states = self.mlp(hidden_states) return hidden_states, residual @@ -156,7 +206,8 @@ def __init__(self, config: LlamaConfig) -> None: self.layers = nn.ModuleList( [LlamaDecoderLayer(config) for _ in range(config.num_layers)]) - self.embedding_tokens.weight.data.fill_(0.0) + # this is the initial value of the hidden states + self.embedding_tokens.weight.data.fill_(config.init_value) def forward( self, @@ -170,6 +221,28 @@ def forward( return hidden_states +def tractable_computation(input_ids: torch.Tensor, + positions: torch.Tensor, + config: LlamaConfig, + init_value: float = 1.0) -> torch.Tensor: + hidden_states = torch.ones(input_ids.size(0), + config.hidden_size, + device=input_ids.device, + dtype=input_ids.dtype) * init_value + + # first layer + residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3 + hidden_states = (residual + 1)**2 + + # following layers + for _ in range(config.num_layers - 1): + hidden_states = hidden_states + residual + residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3 + hidden_states = (residual + 1)**2 + + return hidden_states + + @torch.inference_mode def run_model(llama_config, use_compile: bool, @@ -213,7 +286,15 @@ def run_model(llama_config, del os.environ["VLLM_TORCH_COMPILE_LEVEL"] set_compilation_config(None) - return output.cpu() + output = output.cpu() + + if llama_config.tractable_init: + expected_output = tractable_computation(input_ids[:2], positions[:2], + llama_config).cpu() + + assert torch.allclose(output, expected_output) + else: + return output.cpu() def test_toy_llama(): @@ -222,7 +303,13 @@ def test_toy_llama(): llama_config = LlamaConfig(hidden_size=128, mlp_size=256, vocab_size=128, - num_layers=2) + num_layers=12) + + tractable_config = LlamaConfig(hidden_size=128, + mlp_size=256, + vocab_size=128, + num_layers=2, + tractable_init=True) outputs = [] with compilation_counter.expect( @@ -233,6 +320,8 @@ def test_toy_llama(): num_cudagraph_caputured=0, ): outputs.append(run_model(llama_config, use_compile=False)) + run_model(tractable_config, use_compile=False) + with compilation_counter.expect( num_graphs_seen=1, # one graph for the model num_piecewise_graphs_seen=1, @@ -242,6 +331,7 @@ def test_toy_llama(): 2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): outputs.append(run_model(llama_config, use_compile=True)) + run_model(tractable_config, use_compile=True) with compilation_counter.expect( num_graphs_seen=1, # one graph for the model @@ -257,6 +347,7 @@ def test_toy_llama(): ): outputs.append( run_model(llama_config, use_compile=True, split_attn=True)) + run_model(tractable_config, use_compile=True, split_attn=True) for i in range(1, len(outputs)): assert torch.allclose(outputs[0], outputs[i]) diff --git a/tests/core/utils.py b/tests/core/utils.py index a95a573db7cd3..cd0caa4704e11 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -4,6 +4,7 @@ from typing import Tuple from vllm import SamplingParams +from vllm.inputs import EncoderDecoderInputs, token_inputs from vllm.lora.request import LoRARequest from vllm.sequence import Logprob, Sequence, SequenceGroup @@ -27,10 +28,7 @@ def create_dummy_prompt( prompt_tokens = list(range(prompt_length)) prompt_str = " ".join([str(t) for t in prompt_tokens]) prompt = Sequence(int(request_id), - inputs={ - "prompt": prompt_str, - "prompt_token_ids": prompt_tokens, - }, + inputs=token_inputs(prompt_tokens, prompt=prompt_str), block_size=block_size) 
seq_group = SequenceGroup(request_id=request_id, seqs=[prompt], @@ -63,23 +61,21 @@ def create_dummy_prompt_encoder_decoder( encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length)))) encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens]) - inputs = { - "prompt": decoder_prompt_str, - "prompt_token_ids": decoder_prompt_tokens, - "encoder_prompt": encoder_prompt_str, - "encoder_prompt_token_ids": encoder_prompt_tokens, - "multi_modal_data": None, + inputs: EncoderDecoderInputs = { + "decoder": token_inputs(decoder_prompt_tokens, + prompt=decoder_prompt_str), + "encoder": token_inputs(encoder_prompt_tokens, + prompt=encoder_prompt_str), } decoder_prompt = Sequence(int(request_id), - inputs=inputs, - block_size=block_size, - from_decoder_prompt=True) + inputs=inputs["decoder"], + block_size=block_size) encoder_prompt = Sequence(int(request_id), - inputs=inputs, - block_size=block_size, - from_decoder_prompt=False) + inputs=inputs["encoder"], + block_size=block_size) + seq_group = SequenceGroup(request_id=request_id, seqs=[decoder_prompt], sampling_params=SamplingParams(best_of=best_of), @@ -108,7 +104,7 @@ def create_seq_group( for seq_id_offset, output_len in enumerate(seq_output_lens): seq = Sequence( seq_id=seq_id_start + seq_id_offset, - inputs={"prompt_token_ids": prompt_token_ids}, + inputs=token_inputs(prompt_token_ids), block_size=16, ) @@ -143,21 +139,19 @@ def create_seq_group_encoder_decoder( prompt_token_ids = [0] * seq_prompt_len - inputs = { - "prompt": "", - "prompt_token_ids": prompt_token_ids, - "encoder_prompt": "", - "encoder_prompt_token_ids": prompt_token_ids, - "multi_modal_data": None, + inputs: EncoderDecoderInputs = { + "decoder": token_inputs(prompt_token_ids), + "encoder": token_inputs(prompt_token_ids), } seqs = [] for seq_id_offset, output_len in enumerate(seq_output_lens): # Construct decoder input sequences - seq = Sequence(seq_id=seq_id_start + seq_id_offset, - inputs=inputs, - block_size=16, - from_decoder_prompt=True) + seq = Sequence( + seq_id=seq_id_start + seq_id_offset, + inputs=inputs["decoder"], + block_size=16, + ) for i in range(output_len): seq.append_token_id( @@ -167,10 +161,11 @@ def create_seq_group_encoder_decoder( seqs.append(seq) # Encoder input sequence - encoder_seq = Sequence(seq_id=seq_id_start + len(seq_output_lens), - inputs=inputs, - block_size=16, - from_decoder_prompt=False) + encoder_seq = Sequence( + seq_id=seq_id_start + len(seq_output_lens), + inputs=inputs["encoder"], + block_size=16, + ) return SequenceGroup(request_id=request_id, seqs=seqs, diff --git a/tests/engine/output_processor/test_stop_checker.py b/tests/engine/output_processor/test_stop_checker.py index 0d84443c51f99..cc14e8cbf75df 100644 --- a/tests/engine/output_processor/test_stop_checker.py +++ b/tests/engine/output_processor/test_stop_checker.py @@ -4,6 +4,7 @@ from transformers import PreTrainedTokenizer from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.inputs import token_inputs from vllm.sampling_params import SamplingParams from vllm.sequence import Logprob, Sequence, SequenceStatus @@ -15,7 +16,7 @@ def sequence_with_eos(text: str, eos_token: str, """ seq = Sequence( seq_id=0, - inputs={"prompt_token_ids": []}, + inputs=token_inputs([]), block_size=16, eos_token_id=eos_token_id, ) diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 057a11746014c..a84501f9c303f 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -14,7 +14,7 @@ DTYPES 
= [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing -D = [512, 4096, 5120, 13824] # Arbitrary values for testing +D = [512, 13824] # Arbitrary values for testing SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 4ecd0fc1a21ad..3e3c0668198ad 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -33,7 +33,7 @@ # FlashAttention forward only supports head dimension at most 128 # https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62 -HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] +HEAD_SIZES = [64, 80, 120, 256] BLOCK_SIZES = [16, 32] USE_ALIBI = [False, True] diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/test_awq_marlin.py index 59917dd2c58ad..238d6426bf099 100644 --- a/tests/kernels/test_awq_marlin.py +++ b/tests/kernels/test_awq_marlin.py @@ -14,13 +14,17 @@ awq_marlin_quantize) from vllm.scalar_type import scalar_types +NUM_EXPERTS = [8, 64] +TOP_KS = [2, 6] +GROUP_SIZES = [-1, 32, 128] -@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) -@pytest.mark.parametrize("k", [128, 1024, 512]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) -@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) + +@pytest.mark.parametrize("m", [1, 33, 64, 222]) +@pytest.mark.parametrize("n", [128, 2048]) +@pytest.mark.parametrize("k", [128, 1024]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("group_size", GROUP_SIZES) @pytest.mark.skipif(not (ops.supports_moe_ops and hasattr(torch.ops._moe_C, "marlin_gemm_moe")), reason="Marlin is not supported on this GPU type.") diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index fb601852dd523..fad342d1b5923 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -25,10 +25,10 @@ DTYPES = [torch.half, torch.bfloat16] NUM_GEN_SEQS = [3] # Arbitrary values for testing NUM_PREFILL_SEQS = [3] # Arbitrary values for testing -NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing +NUM_HEADS = [(40, 40)] # Arbitrary values for testing HEAD_SIZES = [64, 112] -BLOCK_SIZES = [16, 32] +BLOCK_SIZES = [16] USE_ALIBI = [False, True] KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] @@ -37,7 +37,7 @@ BLOCKSPARSE_VERT_STRIDES = [8] BLOCKSPARSE_BLOCK_SIZES = [64] -BLOCKSPARSE_HEADS_SLIDINGS = [0, 2, -1] +BLOCKSPARSE_HEADS_SLIDINGS = [2, -1] BLOCKSPARSE_HOMO_HEADS = [True, False] diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index e2b4778b94b9e..40550ed51e2c7 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -13,7 +13,7 @@ NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing NUM_HEADS = [8] # Arbitrary values for testing -HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] +HEAD_SIZES = [64, 80, 120, 256] BLOCK_SIZES = [8, 16, 32] # Arbitrary values for testing diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 993e67e827ea0..afe53797322f9 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -11,6 +11,28 @@ from vllm import _custom_ops as ops from 
vllm.platforms import current_platform +MNK_FACTORS = [ + (1, 256, 128), + (1, 16384, 1024), + (1, 24576, 496), + (16, 256, 496), + (16, 16384, 128), + (16, 24576, 4096), + (32, 8192, 4096), + (32, 16384, 4096), + (33, 1024, 1024), + (33, 8192, 128), + (64, 2048, 496), + (64, 16384, 1024), + (100, 8192, 496), + (128, 32768, 4096), + (256, 4096, 4096), + (512, 256, 1024), + (512, 8192, 4096), + (512, 16384, 128), + (512, 24576, 128), +] + CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] @@ -116,9 +138,7 @@ def cutlass_int8_gemm_helper(m: int, (out, a, b, scale_a, scale_b, bias)) -@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33]) -@pytest.mark.parametrize("n", [2048, 4096, 8192, 16384, 24576, 256, 1024]) -@pytest.mark.parametrize("k", [128, 496, 1024]) +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("use_bias", [True, False]) @@ -129,9 +149,7 @@ def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool, cutlass_fp8_gemm_helper(m, n, k, per_act_token, per_out_ch, use_bias) -@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [2048, 8192, 16384, 256, 1024]) -@pytest.mark.parametrize("k", [128, 496, 1024]) +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("use_bias", [True, False]) diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index 8db6a0d0d9fa4..12c578db0893c 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -7,11 +7,10 @@ from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] -HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192, - 8193] # Arbitrary values for testing +HIDDEN_SIZES = [16, 67, 768, 5137, 8193] # Arbitrary values for testing NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing SEEDS = [0] -SCALE = [0.1, 0.5, 0.8, 1.2, 2.1] +SCALE = [0.1, 2.1] def opcheck_int8_quant_static(output, input, scale, azp=None): @@ -132,7 +131,7 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("scale", SCALE[2:]) # Reduce test time +@pytest.mark.parametrize("scale", SCALE) @pytest.mark.parametrize("azp", [-255, 54]) @torch.inference_mode() def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index 5cfd4d6da7a86..b6dd68cc51a9f 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -35,7 +35,7 @@ USE_FP32_REDUCE_OPTS = [False, True] MARLIN_K_CHUNKS = [128] -MARLIN_N_CHUNKS = [64, 128, 256] +MARLIN_N_CHUNKS = [64, 256] MARLIN_24_K_CHUNKS = [128] MARLIN_24_N_CHUNKS = [512] diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 19c3fc1e1fe3a..17428ebfc2e28 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -20,12 +20,15 @@ from vllm.platforms import current_platform from vllm.scalar_type import scalar_types +NUM_EXPERTS = [8, 64] +TOP_KS = [2, 6] -@pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", 
[2048, 256, 1024]) + +@pytest.mark.parametrize("m", [1, 33, 64, 222, 1024 * 128]) +@pytest.mark.parametrize("n", [128, 1024, 2048]) @pytest.mark.parametrize("k", [128, 511, 1024]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) def test_fused_moe( m: int, @@ -93,12 +96,12 @@ def test_mixtral_moe(dtype: torch.dtype): atol=mixtral_moe_tol[dtype]) -@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) -@pytest.mark.parametrize("k", [128, 1024, 512]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) -@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) +@pytest.mark.parametrize("m", [1, 33, 64, 222]) +@pytest.mark.parametrize("n", [128, 2048]) +@pytest.mark.parametrize("k", [128, 1024]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("group_size", [-1, 32, 128]) @pytest.mark.parametrize("act_order", [True, False]) @pytest.mark.parametrize("num_bits", [4, 8]) @pytest.mark.parametrize("is_k_full", [True, False]) diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index b408559cc0b07..eee77c22ab81a 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -11,10 +11,10 @@ IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] -HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] +HEAD_SIZES = [64, 80, 112, 120, 256] ROTARY_DIMS = [None, 32] # None means rotary dim == head size -NUM_HEADS = [7, 17] # Arbitrary values for testing -BATCH_SIZES = [1, 5] # Arbitrary values for testing +NUM_HEADS = [17] # Arbitrary values for testing +BATCH_SIZES = [5] # Arbitrary values for testing SEQ_LENS = [11, 8192] # Arbitrary values for testing SEEDS = [0] CUDA_DEVICES = [ diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py index fcfc159e4f5a0..cbbeb464ccac9 100644 --- a/tests/models/decoder_only/language/test_big_models.py +++ b/tests/models/decoder_only/language/test_big_models.py @@ -28,8 +28,7 @@ "h2oai/h2o-danube3-4b-base", ] -# TODO: remove this after CPU float16 support ready -target_dtype = "float" if current_platform.is_cpu() else "half" +target_dtype = "half" @pytest.mark.parametrize("model", MODELS) diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py index 5a947ce62c785..f874bf6c73142 100644 --- a/tests/models/decoder_only/language/test_fp8.py +++ b/tests/models/decoder_only/language/test_fp8.py @@ -21,11 +21,11 @@ "kv_cache_dtype,base_model,test_model,scale_path", [ # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors. - ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct", - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None), + ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct", + "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None), # Test FP16 checkpoint w. fp8_e5m2 kv-cache. - ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct", - "meta-llama/Meta-Llama-3-8B-Instruct", None), + ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct", + "meta-llama/Llama-3.2-1B-Instruct", None), # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json. 
("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-7b-chat-hf", @@ -33,7 +33,7 @@ ]) # Due to low-precision numerical divergence, we only test logprob of 4 tokens @pytest.mark.parametrize("max_tokens", [4]) -@pytest.mark.parametrize("enforce_eager", [False, True]) +@pytest.mark.parametrize("enforce_eager", [True]) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"]) # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. diff --git a/tests/models/decoder_only/language/test_gptq_marlin.py b/tests/models/decoder_only/language/test_gptq_marlin.py index 2155e83dbe915..a896f145c11f1 100644 --- a/tests/models/decoder_only/language/test_gptq_marlin.py +++ b/tests/models/decoder_only/language/test_gptq_marlin.py @@ -22,24 +22,11 @@ MAX_MODEL_LEN = 1024 MODELS = [ - # act_order==False, group_size=channelwise - ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"), - # act_order==False, group_size=128 - ("TheBloke/Llama-2-7B-GPTQ", "main"), - # act_order==True, group_size=128 ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"), - # act_order==True, group_size=64 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"), - # act_order==True, group_size=32 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"), # 8-bit, act_order==True, group_size=channelwise ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"), - # 8-bit, act_order==True, group_size=128 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"), - # 8-bit, act_order==True, group_size=32 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"), # 4-bit, act_order==True, group_size=128 ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main") diff --git a/tests/models/decoder_only/language/test_gptq_marlin_24.py b/tests/models/decoder_only/language/test_gptq_marlin_24.py index d65be05f141b4..aa63f9f36a3a8 100644 --- a/tests/models/decoder_only/language/test_gptq_marlin_24.py +++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py @@ -25,16 +25,16 @@ class ModelPair: # 4-bit, group_size == 128 ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128", model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"), - # 4-bit, group_size == channelwise - ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise", - model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"), + # # 4-bit, group_size == channelwise + # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise", + # model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"), # 8-bit, group_size == 128 ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128", model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"), - # 8-bit, group_size == channelwise - ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise", - model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"), + # # 8-bit, group_size == channelwise + # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise", + # model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"), ] diff --git a/tests/models/decoder_only/language/test_marlin.py b/tests/models/decoder_only/language/test_marlin.py deleted file mode 100644 index c802346dee8af..0000000000000 --- a/tests/models/decoder_only/language/test_marlin.py +++ /dev/null @@ -1,69 +0,0 @@ -"""Compare the outputs of a GPTQ model to a Marlin model. 
- -Note: GPTQ and Marlin do not have bitwise correctness. -As a result, in this test, we just confirm that the top selected tokens of the -Marlin/GPTQ models are in the top 3 selections of each other. - -Note: Marlin internally uses locks to synchronize the threads. This can -result in very slight nondeterminism for Marlin. As a result, we re-run the test -up to 3 times to see if we pass. - -Run `pytest tests/models/test_marlin.py`. -""" -from dataclasses import dataclass - -import pytest - -from tests.quantization.utils import is_quant_method_supported - -from ...utils import check_logprobs_close - - -@dataclass -class ModelPair: - model_marlin: str - model_gptq: str - - -model_pairs = [ - ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128", - model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"), - ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin", - model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"), - ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", - model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq") -] - - -@pytest.mark.flaky(reruns=2) -@pytest.mark.skipif(not is_quant_method_supported("marlin"), - reason="Marlin is not supported on this GPU type.") -@pytest.mark.parametrize("model_pair", model_pairs) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - vllm_runner, - example_prompts, - model_pair: ModelPair, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: - with vllm_runner(model_pair.model_marlin, - dtype=dtype, - quantization="marlin") as marlin_model: - marlin_outputs = marlin_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - with vllm_runner(model_pair.model_gptq, dtype=dtype, - quantization="gptq") as gptq_model: - gptq_outputs = gptq_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - check_logprobs_close( - outputs_0_lst=gptq_outputs, - outputs_1_lst=marlin_outputs, - name_0="gptq", - name_1="marlin", - ) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 174b905d9cbb9..5be44c54a717c 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -4,7 +4,7 @@ """ import pytest -from vllm import LLM, SamplingParams +from vllm import SamplingParams from ...utils import check_logprobs_close @@ -15,6 +15,10 @@ # "mistralai/Mistral-Nemo-Instruct-2407" ] +MISTRAL_FORMAT_MODELS = [ + "mistralai/Mistral-7B-Instruct-v0.3", +] + SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5) SYMBOLIC_LANG_PROMPTS = [ "勇敢な船乗りについての詩を書く", # japanese @@ -95,7 +99,7 @@ def test_models( ) -@pytest.mark.parametrize("model", MODELS[1:]) +@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) @@ -135,28 +139,29 @@ def test_mistral_format( ) -@pytest.mark.parametrize("model", MODELS[1:]) +@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("prompt", SYMBOLIC_LANG_PROMPTS) def test_mistral_symbolic_languages( + vllm_runner, model: str, dtype: str, - prompt: str, ) -> None: - prompt = "hi" - msg = {"role": "user", "content": prompt} - llm = LLM(model=model, - dtype=dtype, - 
max_model_len=8192, - tokenizer_mode="mistral", - config_format="mistral", - load_format="mistral") - outputs = llm.chat([msg], sampling_params=SAMPLING_PARAMS) - assert "�" not in outputs[0].outputs[0].text.strip() + with vllm_runner(model, + dtype=dtype, + max_model_len=8192, + tokenizer_mode="mistral", + config_format="mistral", + load_format="mistral") as vllm_model: + for prompt in SYMBOLIC_LANG_PROMPTS: + msg = {"role": "user", "content": prompt} + outputs = vllm_model.model.chat([msg], + sampling_params=SAMPLING_PARAMS) + assert "�" not in outputs[0].outputs[0].text.strip() @pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("model", MODELS[1:]) # v1 can't do func calling +@pytest.mark.parametrize("model", + MISTRAL_FORMAT_MODELS) # v1 can't do func calling def test_mistral_function_calling( vllm_runner, model: str, diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 68055cbe29095..d705909c24bf8 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -7,25 +7,38 @@ """ import pytest -from ...utils import check_outputs_equal +from vllm.platforms import current_platform + +from ...utils import check_logprobs_close MODELS = [ - "facebook/opt-125m", - "gpt2", - "bigcode/tiny_starcoder_py", - "EleutherAI/pythia-70m", - "bigscience/bloom-560m", # Testing alibi slopes. - "microsoft/phi-2", - "stabilityai/stablelm-3b-4e1t", - # "allenai/OLMo-1B", # Broken - "bigcode/starcoder2-3b", - "google/gemma-1.1-2b-it", + "facebook/opt-125m", # opt + "openai-community/gpt2", # gpt2 + # "Milos/slovak-gpt-j-405M", # gptj + # "bigcode/tiny_starcoder_py", # gpt_bigcode + # "EleutherAI/pythia-70m", # gpt_neox + "bigscience/bloom-560m", # bloom - testing alibi slopes + "microsoft/phi-2", # phi + # "stabilityai/stablelm-3b-4e1t", # stablelm + # "bigcode/starcoder2-3b", # starcoder2 + "google/gemma-1.1-2b-it", # gemma + "Qwen/Qwen2.5-0.5B-Instruct", # qwen2 + "meta-llama/Llama-3.2-1B-Instruct", # llama ] +if not current_platform.is_cpu(): + MODELS += [ + # fused_moe which not supported on CPU + "openbmb/MiniCPM3-4B", + ] + +target_dtype = "half" + @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) def test_models( hf_runner, vllm_runner, @@ -33,33 +46,24 @@ def test_models( model: str, dtype: str, max_tokens: int, + num_logprobs: int, ) -> None: - # To pass the small model tests, we need full precision. - assert dtype == "float" with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. 
+ model_runner.model) - check_outputs_equal( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, name_0="hf", name_1="vllm", ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/decoder_only/language/test_qwen.py b/tests/models/decoder_only/language/test_qwen.py deleted file mode 100644 index 128fe65afbb84..0000000000000 --- a/tests/models/decoder_only/language/test_qwen.py +++ /dev/null @@ -1,34 +0,0 @@ -"""Ensure that a text-only Qwen model can be run without throwing an error. -We explicitly test this because Qwen is implemented as a multimodal and -supports a visual encoder for models like Qwen-VL. -""" -from typing import List, Type - -import pytest - -from ....conftest import VllmRunner - -models = [ - "Qwen/Qwen-7B-Chat" # Has no visual encoder -] - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_text_only_qwen_model_can_be_loaded_and_run( - vllm_runner: Type[VllmRunner], - example_prompts: List[str], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, -): - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_model.generate_greedy_logprobs( - example_prompts, - max_tokens, - num_logprobs=num_logprobs, - ) diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index 3576a4834ebc3..e8f8499aa88ca 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -6,6 +6,7 @@ import pytest +from vllm.inputs import token_inputs from vllm.lora.request import LoRARequest from vllm.sequence import Sequence from vllm.transformers_utils.tokenizer_group import TokenizerGroup @@ -70,10 +71,8 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, hashes[-1].append([]) prompt_token_ids = tokenizer.encode(prompt) seq = Sequence(seq_id, - inputs={ - "prompt": prompt, - "prompt_token_ids": prompt_token_ids, - }, + inputs=token_inputs(prompt_token_ids, + prompt=prompt), block_size=block_size, eos_token_id=tokenizer.tokenizer.eos_token_id, lora_request=lora_request) diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 1d07885349409..a3e70a40db979 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -3,6 +3,7 @@ import pytest from transformers import AutoTokenizer +from vllm.inputs import token_inputs from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup from vllm.transformers_utils.detokenizer import (Detokenizer, detokenize_incrementally) @@ -169,10 +170,7 @@ def create_sequence(prompt_token_ids=None): prompt_token_ids = prompt_token_ids or [1] return Sequence( seq_id=0, - inputs={ - "prompt": "", - "prompt_token_ids": prompt_token_ids, - }, + inputs=token_inputs(prompt_token_ids, prompt=""), block_size=16, ) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 96ddcba467c5b..de32cabbe6d07 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -6,6 +6,7 @@ import torch import torch.fx as fx 
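
# --- Illustrative sketch (not part of the patch) -----------------------------
# The test updates above replace hand-built Sequence input dicts with the
# `token_inputs(...)` constructor. A minimal usage sketch, assuming the
# post-patch `vllm.inputs` API and hypothetical token IDs / prompt text:
from vllm.inputs import token_inputs

inputs = token_inputs([1, 2, 3], prompt="example prompt")
assert inputs["type"] == "token"                 # the constructor tags the input kind
assert inputs["prompt_token_ids"] == [1, 2, 3]
assert inputs.get("multi_modal_data") is None    # optional keys stay absent when unset
# --- end sketch ---------------------------------------------------------------
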
+import vllm.envs as envs from vllm.logger import init_logger from vllm.utils import weak_ref_tensors @@ -193,6 +194,7 @@ def wrap_inductor(graph, @dataclasses.dataclass class SplitItem: submod_name: str + graph_id: int is_splitting_graph: bool graph: fx.GraphModule @@ -226,9 +228,7 @@ def split_graph(graph: fx.GraphModule, outputs = [] - # sort the names to make sure the order is deterministic names = [name for (name, module) in split_gm.named_modules()] - names.sort() for name in names: if "." in name or name == "": @@ -238,7 +238,11 @@ def split_graph(graph: fx.GraphModule, module = getattr(split_gm, name) graph_id = int(name.replace("submod_", "")) - outputs.append(SplitItem(name, graph_id in split_op_graphs, module)) + outputs.append( + SplitItem(name, graph_id, (graph_id in split_op_graphs), module)) + + # sort by intetger graph_id, rather than string name + outputs.sort(key=lambda x: x.graph_id) return split_gm, outputs @@ -252,6 +256,11 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): It runs the given graph with fake inputs, and compile some submodules specified by `compile_submod_names` with the given compilation configs. + + NOTE: the order in `compile_submod_names` matters, because + it will be used to determine the order of the compiled piecewise + graphs. The first graph will handle logging, and the last graph + has some special cudagraph output handling. """ def __init__(self, module: torch.fx.GraphModule, @@ -263,7 +272,6 @@ def __init__(self, module: torch.fx.GraphModule, self.compile_submod_names = compile_submod_names self.compilation_configs = compilation_configs self.graph_pool = graph_pool - self.have_seen_first_graph = False def run(self, *args): fake_args = [ @@ -279,6 +287,7 @@ def call_module(self, target: torch.fx.node.Target, output = super().call_module(target, args, kwargs) if target in self.compile_submod_names: + index = self.compile_submod_names.index(target) submod = self.fetch_attr(target) sym_shape_indices = [ i for i, x in enumerate(args) if isinstance(x, torch.SymInt) @@ -288,15 +297,14 @@ def call_module(self, target: torch.fx.node.Target, args, self.compilation_configs.inductor_compile_config, runtime_shape=None, - do_logging=not self.have_seen_first_graph, + do_logging=index == 0, use_inductor=self.compilation_configs.use_inductor) self.module.__dict__[target] = PiecewiseBackend( - submod, self.compilation_configs, self.graph_pool, - not self.have_seen_first_graph, sym_shape_indices, + submod, self.compilation_configs, self.graph_pool, index, + len(self.compile_submod_names), sym_shape_indices, compiled_graph_for_general_shape) - self.have_seen_first_graph = True compilation_counter.num_piecewise_capturable_graphs_seen += 1 return output @@ -352,8 +360,9 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: graph, self.compilation_configs.non_cudagraph_ops) from torch._dynamo.utils import lazy_format_graph_code - logger.debug("%s", - lazy_format_graph_code("stiching module", self.split_gm)) + logger.debug("%s", lazy_format_graph_code("before split", self.graph)) + logger.debug("%s", lazy_format_graph_code("after split", + self.split_gm)) compilation_counter.num_piecewise_graphs_seen += len( self.piecewise_graphs) @@ -385,12 +394,17 @@ class ConcreteSizeEntry: cudagraph: Optional[torch.cuda.CUDAGraph] = None output: Optional[Any] = None + # for cudagraph debugging, track the input addresses + # during capture, and check if they are the same during replay + input_addresses: Optional[List[int]] = None + class 
PiecewiseBackend: def __init__(self, graph: fx.GraphModule, compilation_configs: CompilationConfig, graph_pool: Any, - is_first_graph: bool, sym_shape_indices: List[int], + piecewise_compile_index: int, total_piecewise_compiles: int, + sym_shape_indices: List[int], compiled_graph_for_general_shape: Callable): """ The backend for piecewise compilation. @@ -408,7 +422,12 @@ def __init__(self, graph: fx.GraphModule, self.graph = graph self.compilation_configs = compilation_configs self.graph_pool = graph_pool - self.is_first_graph = is_first_graph + self.piecewise_compile_index = piecewise_compile_index + self.total_piecewise_compiles = total_piecewise_compiles + + self.is_first_graph = piecewise_compile_index == 0 + self.is_last_graph = ( + piecewise_compile_index == total_piecewise_compiles - 1) self.compile_sizes: Set[int] = set( self.compilation_configs.compile_sizes) @@ -422,6 +441,8 @@ def __init__(self, graph: fx.GraphModule, self.sym_shape_indices = sym_shape_indices + self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG" + # the entries for different shapes that we need to either # compile or capture cudagraph self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {} @@ -476,14 +497,45 @@ def __call__(self, *args) -> Any: logger.info("Capturing a cudagraph for shape %s", runtime_shape) + input_addresses = [ + x.data_ptr() for x in args if isinstance(x, torch.Tensor) + ] + entry.input_addresses = input_addresses cudagraph = torch.cuda.CUDAGraph() + + # mind-exploding: carefully manage the reference and memory. with torch.cuda.graph(cudagraph, pool=self.graph_pool): - entry.output = weak_ref_tensors(entry.runnable(*args)) + # `output` is managed by pytorch's cudagraph pool + output = entry.runnable(*args) + if self.is_last_graph: + # by converting it to weak ref, + # the original `output` will immediately be released + # to save memory. It is only safe to do this for + # the last graph, because the output of the last graph + # will not be used by any other cuda graph. + output = weak_ref_tensors(output) + + # here we always use weak ref for the output + # to save memory + entry.output = weak_ref_tensors(output) + entry.cudagraph = cudagraph compilation_counter.num_cudagraph_caputured += 1 - entry.cudagraph = cudagraph - return entry.output + # important: we need to return the output, rather than + # the weak ref of the output, so that pytorch can correctly + # manage the memory during cuda graph capture + return output + + if self.is_debugging_mode: + # check if the input addresses are the same + new_input_addresses = [ + x.data_ptr() for x in args if isinstance(x, torch.Tensor) + ] + assert new_input_addresses == entry.input_addresses, ( + "Input addresses for cudagraphs are different during replay." 
+ f" Expected {entry.input_addresses}, got {new_input_addresses}" + ) entry.cudagraph.replay() return entry.output diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 2c584218485c8..a1809b1a9dd26 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -10,7 +10,7 @@ from typing import Set, Type, Union, cast, overload import torch -from typing_extensions import TypeIs, TypeVar +from typing_extensions import TypeVar import vllm.envs as envs from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, @@ -29,9 +29,9 @@ from vllm.executor.executor_base import ExecutorBase from vllm.executor.gpu_executor import GPUExecutor from vllm.executor.ray_utils import initialize_ray_cluster -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, - EncoderDecoderInputs, InputRegistry, PromptType, - TokensPrompt) +from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, + PromptType) +from vllm.inputs.parse import is_encoder_decoder_inputs, is_token_prompt from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.logits_process import get_bad_words_logits_processors @@ -638,7 +638,7 @@ def _verify_args(self) -> None: def _add_processed_request( self, request_id: str, - processed_inputs: Union[DecoderOnlyInputs, EncoderDecoderInputs], + processed_inputs: ProcessorInputs, params: Union[SamplingParams, PoolingParams], arrival_time: float, lora_request: Optional[LoRARequest], @@ -669,18 +669,19 @@ def _add_processed_request( seq_id = next(self.seq_counter) eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) - seq = Sequence(seq_id, processed_inputs, block_size, eos_token_id, + if is_encoder_decoder_inputs(processed_inputs): + decoder_inputs = processed_inputs["decoder"] + encoder_inputs = processed_inputs["encoder"] + else: + decoder_inputs = processed_inputs + encoder_inputs = None + + seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id, lora_request, prompt_adapter_request) - encoder_seq = None - if 'encoder_prompt_token_ids' in processed_inputs: - encoder_seq = Sequence(seq_id, - processed_inputs, - block_size, - eos_token_id, - lora_request, - prompt_adapter_request, - from_decoder_prompt=False) + encoder_seq = (None if encoder_inputs is None else Sequence( + seq_id, encoder_inputs, block_size, eos_token_id, lora_request, + prompt_adapter_request)) # Create a SequenceGroup based on SamplingParams or PoolingParams if isinstance(params, SamplingParams): @@ -874,7 +875,7 @@ def _validate_token_prompt(self, prompt: PromptType, # This needs to happen before multimodal input pre-processing, which # may add dummy tokens that aren't part of the tokenizer's # vocabulary. 
- if self._is_token_prompt(prompt): + if is_token_prompt(prompt): prompt_ids = prompt["prompt_token_ids"] if len(prompt_ids) == 0: # Empty prompt check is handled later @@ -884,10 +885,6 @@ def _validate_token_prompt(self, prompt: PromptType, raise ValueError( "Token id {} is out of vocabulary".format(max_input_id)) - @staticmethod - def _is_token_prompt(prompt: PromptType) -> TypeIs[TokensPrompt]: - return isinstance(prompt, dict) and "prompt_token_ids" in prompt - def _create_sequence_group_with_sampling( self, request_id: str, @@ -1978,17 +1975,17 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None: def is_encoder_decoder_model(self): return self.input_preprocessor.is_encoder_decoder_model() - def _validate_model_inputs(self, inputs: Union[DecoderOnlyInputs, - EncoderDecoderInputs], + def _validate_model_inputs(self, inputs: ProcessorInputs, lora_request: Optional[LoRARequest]): - if self.model_config.is_multimodal_model: + if is_encoder_decoder_inputs(inputs): # For encoder-decoder multimodal models, the max_prompt_len # restricts the decoder prompt length - prompt_ids = inputs.get("prompt_token_ids") - elif self.is_encoder_decoder_model(): - prompt_ids = inputs.get("encoder_prompt_token_ids") + prompt_inputs = inputs["decoder" if self.model_config. + is_multimodal_model else "encoder"] else: - prompt_ids = inputs.get("prompt_token_ids") + prompt_inputs = inputs + + prompt_ids = prompt_inputs.get("prompt_token_ids") if prompt_ids is None or len(prompt_ids) == 0: raise ValueError("Prompt cannot be empty") diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 9dd6fa5b14315..e1dcb82829d76 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -5,7 +5,6 @@ import cloudpickle import zmq -from ray.exceptions import RayTaskError from vllm import AsyncEngineArgs, SamplingParams # yapf conflicts with isort for this block @@ -306,11 +305,17 @@ def _health_check(self): def _send_outputs(self, outputs: REQUEST_OUTPUTS_T): """Send List of RequestOutput to RPCClient.""" if outputs: - # RayTaskError might not pickelable here. We need to unpack the - # underlying exception as the real exception in the output. - if (isinstance(outputs, RPCError) - and isinstance(outputs.exception, RayTaskError)): - outputs.exception = outputs.exception.cause + try: + from ray.exceptions import RayTaskError + + # RayTaskError might not pickelable here. We need to unpack the + # underlying exception as the real exception in the output. 
+ if (isinstance(outputs, RPCError) + and isinstance(outputs.exception, RayTaskError)): + outputs.exception = outputs.exception.cause + except ImportError: + pass + output_bytes = pickle.dumps(outputs) self.output_socket.send_multipart((output_bytes, ), copy=False) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 6a09361c56865..e0b59d94cfdc3 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -1,11 +1,12 @@ import asyncio from abc import ABC, abstractmethod -from typing import AsyncGenerator, List, Mapping, Optional, Union +from typing import AsyncGenerator, List, Mapping, Optional from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function from vllm.config import DecodingConfig, ModelConfig from vllm.core.scheduler import SchedulerOutputs from vllm.inputs.data import PromptType, TokensPrompt +from vllm.inputs.parse import is_explicit_encoder_decoder_prompt from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -60,7 +61,7 @@ def generate( async def beam_search( self, - prompt: Union[PromptType, List[int]], + prompt: PromptType, model_config: ModelConfig, request_id: str, params: BeamSearchParams, @@ -76,11 +77,19 @@ async def beam_search( tokenizer = await self.get_tokenizer() input_preprocessor = InputPreprocessor(model_config, tokenizer) - (prompt_text, prompt_token_ids, multi_modal_data, - mm_processor_kwargs) = input_preprocessor._extract_prompt_components( - prompt, - request_id=request_id, - ) + if is_explicit_encoder_decoder_prompt(prompt): + raise NotImplementedError + else: + processed_inputs = input_preprocessor._prompt_to_llm_inputs( + prompt, + request_id=request_id, + ) + + prompt_token_ids = processed_inputs["prompt_token_ids"] + prompt_text = processed_inputs.get("prompt") + multi_modal_data = processed_inputs.get("multi_modal_data") + mm_processor_kwargs = processed_inputs.get("mm_processor_kwargs") + tokenized_length = len(prompt_token_ids) sort_beams_key = create_sort_beams_key_function( diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index bef36ffdbfcd3..917b347ff1161 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -569,7 +569,8 @@ async def run_server(args, **uvicorn_kwargs) -> None: # This avoids race conditions with ray. 
# see https://github.com/vllm-project/vllm/issues/8204 sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.bind(("", args.port)) + sock.bind((args.host or "", args.port)) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) def signal_handler(*_) -> None: # Interrupt server on sigterm while initializing @@ -593,13 +594,14 @@ def signal_handler(*_) -> None: ssl_certfile=args.ssl_certfile, ssl_ca_certs=args.ssl_ca_certs, ssl_cert_reqs=args.ssl_cert_reqs, - fd=sock.fileno(), **uvicorn_kwargs, ) # NB: Await server shutdown only after the backend context is exited await shutdown_task + sock.close() + if __name__ == "__main__": # NOTE(simon): diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index ab3ebb4e43d18..4ceb5a837dd7f 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -2,8 +2,6 @@ from functools import partial from typing import Any, Awaitable, List, Optional, Set, Tuple, Union -import torch - import vllm.envs as envs from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, SchedulerConfig) @@ -316,9 +314,6 @@ async def check_health_async(self) -> None: def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: - if config.dtype == torch.float16: - logger.warning("float16 is not supported on CPU, casting to bfloat16.") - config.dtype = torch.bfloat16 # Reminder: Please update docs/source/serving/compatibility_matrix.rst # If the feature combo become valid if not config.enforce_eager: diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index ac7b3ca28b406..68ac50a2c5a16 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -1,8 +1,8 @@ from .data import (DecoderOnlyInputs, EncoderDecoderInputs, - ExplicitEncoderDecoderPrompt, PromptType, SingletonInputs, - SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt, - build_explicit_enc_dec_prompt, to_enc_dec_tuple_list, - token_inputs, zip_enc_dec_prompts) + ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType, + SingletonInputs, SingletonPrompt, TextPrompt, TokenInputs, + TokensPrompt, build_explicit_enc_dec_prompt, + to_enc_dec_tuple_list, token_inputs, zip_enc_dec_prompts) from .registry import DummyData, InputContext, InputRegistry INPUT_REGISTRY = InputRegistry() @@ -22,9 +22,10 @@ "ExplicitEncoderDecoderPrompt", "TokenInputs", "token_inputs", - "SingletonInputs", "DecoderOnlyInputs", "EncoderDecoderInputs", + "ProcessorInputs", + "SingletonInputs", "build_explicit_enc_dec_prompt", "to_enc_dec_tuple_list", "zip_enc_dec_prompts", diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index ba393cbcce4eb..46b41f431bec7 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -1,4 +1,4 @@ -from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, +from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, Literal, Optional, Tuple, Union, cast) from typing_extensions import NotRequired, TypedDict, TypeVar @@ -122,27 +122,30 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]): class TokenInputs(TypedDict): """Represents token-based inputs.""" + + type: Literal["token"] + """The type of inputs.""" + prompt_token_ids: List[int] """The token IDs of the prompt.""" - prompt: NotRequired[Optional[str]] + prompt: NotRequired[str] """ The original prompt text corresponding to the token IDs, if available. 
""" - multi_modal_data: NotRequired[Optional["MultiModalDataDict"]] + multi_modal_data: NotRequired["MultiModalDataDict"] """ Optional multi-modal data to pass to the model, if the model supports it. """ - multi_modal_placeholders: NotRequired[ - Optional["MultiModalPlaceholderDict"]] + multi_modal_placeholders: NotRequired["MultiModalPlaceholderDict"] """ Placeholder ranges for the multi-modal data. """ - mm_processor_kwargs: NotRequired[Optional[Dict[str, Any]]] + mm_processor_kwargs: NotRequired[Dict[str, Any]] """ Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities @@ -159,7 +162,7 @@ def token_inputs( mm_processor_kwargs: Optional[Dict[str, Any]] = None, ) -> TokenInputs: """Construct :class:`TokenInputs` from optional values.""" - inputs = TokenInputs(prompt_token_ids=prompt_token_ids) + inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids) if prompt is not None: inputs["prompt"] = prompt @@ -173,12 +176,6 @@ def token_inputs( return inputs -SingletonInputs = TokenInputs -""" -A processed :class:`SingletonPrompt` which can be passed to -:class:`vllm.sequence.Sequence`. -""" - DecoderOnlyInputs = TokenInputs """ The inputs in :class:`~vllm.LLMEngine` before they are @@ -187,28 +184,30 @@ def token_inputs( """ -class EncoderDecoderInputs(TokenInputs): +class EncoderDecoderInputs(TypedDict): """ The inputs in :class:`~vllm.LLMEngine` before they are passed to the model executor. This specifies the required data for encoder-decoder models. """ - encoder_prompt_token_ids: List[int] - """The token IDs of the encoder prompt.""" + encoder: TokenInputs + """The inputs for the encoder portion.""" - encoder_prompt: NotRequired[Optional[str]] - """ - The original encoder prompt text corresponding to the token IDs, if - available. - """ + decoder: TokenInputs + """The inputs for the decoder portion.""" - encoder_multi_modal_data: NotRequired[Optional["MultiModalDataDict"]] - """ - Optional multi-modal data to pass to the encoder model, - if the model supports it. - """ +SingletonInputs = TokenInputs +""" +A processed :class:`SingletonPrompt` which can be passed to +:class:`vllm.sequence.Sequence`. +""" + +ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] +""" +The inputs to :data:`vllm.inputs.InputProcessor`. 
+""" _T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt) _T2 = TypeVar("_T2", bound=SingletonPrompt, default=SingletonPrompt) diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index e79d2c813bb4f..09f1ff2cb42e9 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -4,9 +4,9 @@ from vllm.utils import is_list_of -from .data import (DecoderOnlyInputs, EncoderDecoderInputs, - ExplicitEncoderDecoderPrompt, PromptType, SingletonPrompt, - TextPrompt, TokensPrompt) +from .data import (EncoderDecoderInputs, ExplicitEncoderDecoderPrompt, + ProcessorInputs, PromptType, SingletonPrompt, TextPrompt, + TokensPrompt) class ParsedText(TypedDict): @@ -98,12 +98,15 @@ def parse_singleton_prompt( raise TypeError("inputs must be a string, TextPrompt, or TokensPrompt") +def is_token_prompt(prompt: PromptType) -> TypeIs[TokensPrompt]: + return isinstance(prompt, dict) and "prompt_token_ids" in prompt + + def is_explicit_encoder_decoder_prompt( prompt: PromptType) -> TypeIs[ExplicitEncoderDecoderPrompt]: return isinstance(prompt, dict) and "encoder_prompt" in prompt def is_encoder_decoder_inputs( - inputs: Union[DecoderOnlyInputs, EncoderDecoderInputs], -) -> TypeIs[EncoderDecoderInputs]: - return "encoder_prompt_token_ids" in inputs + inputs: ProcessorInputs) -> TypeIs[EncoderDecoderInputs]: + return "encoder" in inputs and "decoder" in inputs diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 82ce7d392b719..a5c787a56b5a9 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -1,5 +1,5 @@ import asyncio -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import List, Optional from typing_extensions import assert_never @@ -10,22 +10,12 @@ from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.utils import print_warning_once -from .data import (DecoderOnlyInputs, EncoderDecoderInputs, PromptType, - SingletonPrompt) +from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs, + PromptType, SingletonInputs, SingletonPrompt, token_inputs) from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt -if TYPE_CHECKING: - from vllm.multimodal import MultiModalDataDict - logger = init_logger(__name__) -PromptComponents = Tuple[Optional[str], List[int], - Optional["MultiModalDataDict"], Optional[Dict[str, - Any]]] -DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]], - Optional["MultiModalDataDict"], - Optional[Dict[str, Any]]] - class InputPreprocessor: @@ -115,7 +105,7 @@ def _get_default_enc_dec_decoder_prompt(self) -> List[int]: "default" decoder prompt be . However, it is possible that in the future - other models may have different or more + other models may have different or more complex logic for the default decoder prompt. This motivates having a special helper method for default decoder prompts. @@ -132,7 +122,6 @@ def _get_default_enc_dec_decoder_prompt(self) -> List[int]: def _prepare_decoder_input_ids_for_generation( self, decoder_input_ids: Optional[List[int]], - force_bos: bool = True, ) -> List[int]: """ Prepares `decoder_input_ids` for generation with encoder-decoder models. 
@@ -162,8 +151,8 @@ def _prepare_decoder_input_ids_for_generation( # use decoder_start_token_id as decoder_input_ids decoder_input_ids = self._get_default_enc_dec_decoder_prompt() - if force_bos and (len(decoder_input_ids) == 0 - or decoder_input_ids[0] != decoder_start_token_id): + if (len(decoder_input_ids) == 0 + or decoder_input_ids[0] != decoder_start_token_id): decoder_input_ids = [decoder_start_token_id] + decoder_input_ids return decoder_input_ids @@ -209,12 +198,12 @@ async def _tokenize_prompt_async( prompt=prompt, lora_request=lora_request) - def _extract_prompt_components( + def _prompt_to_llm_inputs( self, prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, - ) -> PromptComponents: + ) -> SingletonInputs: ''' Extract the components of any single encoder or decoder input prompt. @@ -241,34 +230,52 @@ def _extract_prompt_components( request_id=request_id, lora_request=lora_request, ) - multi_modal_data = None - mm_processor_kwargs = None - elif parsed["type"] == "tokens": - prompt_text = None - prompt_token_ids = parsed["content"]["prompt_token_ids"] - multi_modal_data = parsed["content"].get("multi_modal_data") - mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") - elif parsed["type"] == "text": - prompt_text = parsed["content"]["prompt"] + + return token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + ) + + if parsed["type"] == "tokens": + tokens_content = parsed["content"] + + prompt_token_ids = tokens_content["prompt_token_ids"] + multi_modal_data = tokens_content.get("multi_modal_data") + mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") + + return token_inputs( + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + mm_processor_kwargs=mm_processor_kwargs, + ) + + if parsed["type"] == "text": + text_content = parsed["content"] + + prompt_text = text_content["prompt"] prompt_token_ids = self._tokenize_prompt( prompt_text, request_id=request_id, lora_request=lora_request, ) - multi_modal_data = parsed["content"].get("multi_modal_data") - mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") - else: - assert_never(parsed) + multi_modal_data = text_content.get("multi_modal_data") + mm_processor_kwargs = text_content.get("mm_processor_kwargs") + + return token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + mm_processor_kwargs=mm_processor_kwargs, + ) - return (prompt_text, prompt_token_ids, multi_modal_data, - mm_processor_kwargs) + assert_never(parsed) - async def _extract_prompt_components_async( + async def _prompt_to_llm_inputs_async( self, prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, - ) -> PromptComponents: + ) -> SingletonInputs: """Async version of :meth:`_extract_prompt_components`.""" parsed = parse_singleton_prompt(prompt) @@ -279,59 +286,74 @@ async def _extract_prompt_components_async( request_id=request_id, lora_request=lora_request, ) - multi_modal_data = None - mm_processor_kwargs = None - elif parsed["type"] == "tokens": - prompt_text = None - prompt_token_ids = parsed["content"]["prompt_token_ids"] - multi_modal_data = parsed["content"].get("multi_modal_data") - mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") - elif parsed["type"] == "text": - prompt_text = parsed["content"]["prompt"] + + return token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + ) + + if parsed["type"] == "tokens": + tokens_content = 
parsed["content"] + + prompt_token_ids = tokens_content["prompt_token_ids"] + multi_modal_data = tokens_content.get("multi_modal_data") + mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") + + return token_inputs( + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + mm_processor_kwargs=mm_processor_kwargs, + ) + + if parsed["type"] == "text": + text_content = parsed["content"] + + prompt_text = text_content["prompt"] prompt_token_ids = await self._tokenize_prompt_async( prompt_text, request_id=request_id, lora_request=lora_request, ) - multi_modal_data = parsed["content"].get("multi_modal_data") - mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") - else: - assert_never(parsed) + multi_modal_data = text_content.get("multi_modal_data") + mm_processor_kwargs = text_content.get("mm_processor_kwargs") + + return token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + mm_processor_kwargs=mm_processor_kwargs, + ) - return (prompt_text, prompt_token_ids, multi_modal_data, - mm_processor_kwargs) + assert_never(parsed) def _build_enc_dec_llm_inputs( self, - encoder_comps: PromptComponents, - decoder_comps: DecoderPromptComponents, - mm_processor_kwargs: Dict[str, Any], + encoder_inputs: SingletonInputs, + decoder_inputs: Optional[SingletonInputs], ) -> EncoderDecoderInputs: - encoder_prompt, encoder_prompt_ids, encoder_mm_data, _ = encoder_comps - decoder_prompt, decoder_prompt_ids, decoder_mm_data, _ = decoder_comps - - # Reminder: Please update docs/source/serving/compatibility_matrix.rst - # If the feature combo become valid - if decoder_mm_data is not None: - raise ValueError( - "Multi-modality decoder inputs of encoder-decoder models are " - "not supported yet") - - # For Multi-Modal models (e.g., mllama), the text input can be - # <|image|><|begin_of_text|>hello world. And we should not add - # another <|begin_of_text|> to the beginning. - decoder_prompt_ids = (self._prepare_decoder_input_ids_for_generation( - decoder_prompt_ids, - force_bos=(encoder_mm_data is None and decoder_mm_data is None))) + if encoder_inputs["type"] == "token": + pass + else: + assert_never(encoder_inputs) + + if decoder_inputs is None: + dec_token_ids = self._prepare_decoder_input_ids_for_generation( + None) + decoder_inputs = token_inputs(dec_token_ids) + elif decoder_inputs["type"] == "token": + dec_token_ids = self._prepare_decoder_input_ids_for_generation( + decoder_inputs["prompt_token_ids"]) + decoder_inputs["prompt_token_ids"] = dec_token_ids + + if "multi_modal_data" in decoder_inputs: + raise ValueError("Multi-modal decoder inputs of encoder-" + "decoder models are not supported yet") + else: + assert_never(encoder_inputs) return EncoderDecoderInputs( - prompt_token_ids=decoder_prompt_ids, - prompt=decoder_prompt, - multi_modal_data=decoder_mm_data, - mm_processor_kwargs=mm_processor_kwargs, - encoder_prompt_token_ids=encoder_prompt_ids, - encoder_prompt=encoder_prompt, - encoder_multi_modal_data=encoder_mm_data, + encoder=encoder_inputs, + decoder=decoder_inputs, ) def _process_encoder_decoder_prompt( @@ -341,8 +363,7 @@ def _process_encoder_decoder_prompt( ) -> EncoderDecoderInputs: ''' For encoder/decoder models only: - Process an input prompt into an - :class:`EncoderDecoderInputs` instance. + Process an input prompt into an :class:`EncoderDecoderInputs` instance. 
There are two types of input prompts: singleton prompts which carry only the @@ -361,7 +382,7 @@ def _process_encoder_decoder_prompt( have any possible singleton type; thus this method relies on helper functions to obtain token ids for the sub-prompts. - + Arguments: * prompt: an input prompt @@ -372,40 +393,31 @@ def _process_encoder_decoder_prompt( * :class:`EncoderDecoderInputs` instance ''' - encoder_comps: PromptComponents - decoder_comps: DecoderPromptComponents + encoder_inputs: SingletonInputs + decoder_inputs: Optional[SingletonInputs] if is_explicit_encoder_decoder_prompt(prompt): - encoder_comps = self._extract_prompt_components( + encoder_inputs = self._prompt_to_llm_inputs( prompt["encoder_prompt"], request_id=request_id, ) if (decoder_input := prompt["decoder_prompt"]) is None: - decoder_comps = None, None, None, None + decoder_inputs = None else: - decoder_comps = self._extract_prompt_components( + decoder_inputs = self._prompt_to_llm_inputs( decoder_input, request_id=request_id, ) - # Handle this carefully in case it was directly initialized by user - mm_processor_kwargs = prompt.get("mm_processor_kwargs", {}) else: - encoder_comps = self._extract_prompt_components( + encoder_inputs = self._prompt_to_llm_inputs( prompt, request_id=request_id, ) - # If there are no decoder components, we assume the - # mm_processor_kwargs are in the encoder prompt - mm_processor_kwargs = encoder_comps[-1] if encoder_comps[ - -1] is not None else {} - decoder_comps = None, None, None, None - - return self._build_enc_dec_llm_inputs( - encoder_comps, - decoder_comps, - mm_processor_kwargs, - ) + + decoder_inputs = None + + return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs) async def _process_encoder_decoder_prompt_async( self, @@ -413,59 +425,50 @@ async def _process_encoder_decoder_prompt_async( request_id: str, ) -> EncoderDecoderInputs: """Async version of :meth:`_process_encoder_decoder_prompt`.""" - encoder_comps: PromptComponents - decoder_comps: DecoderPromptComponents + encoder_inputs: SingletonInputs + decoder_inputs: Optional[SingletonInputs] if is_explicit_encoder_decoder_prompt(prompt): - encoder_task = self._extract_prompt_components_async( + encoder_task = self._prompt_to_llm_inputs_async( prompt["encoder_prompt"], request_id=request_id, ) if (decoder_input := prompt["decoder_prompt"]) is None: - encoder_comps = await encoder_task - decoder_comps = None, None, None, None + encoder_inputs = await encoder_task + decoder_inputs = None else: - decoder_task = self._extract_prompt_components_async( + decoder_task = self._prompt_to_llm_inputs_async( decoder_input, request_id=request_id, ) - encoder_comps, decoder_comps = await asyncio.gather( + encoder_inputs, decoder_inputs = await asyncio.gather( encoder_task, decoder_task) - mm_processor_kwargs = prompt["mm_processor_kwargs"] else: - encoder_comps = await self._extract_prompt_components_async( + encoder_inputs = await self._prompt_to_llm_inputs_async( prompt, request_id=request_id, ) - # If there are no decoder components, we assume the - # mm_processor_kwargs are in the encoder prompt - mm_processor_kwargs = encoder_comps[-1] if encoder_comps[ - -1] is not None else {} - decoder_comps = None, None, None, None - - return self._build_enc_dec_llm_inputs( - encoder_comps, - decoder_comps, - mm_processor_kwargs, - ) + + decoder_inputs = None + + return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs) def _build_decoder_only_llm_inputs( self, - prompt_comps: PromptComponents, + prompt_inputs: 
DecoderOnlyInputs, prompt_adapter_request: Optional[PromptAdapterRequest], ) -> DecoderOnlyInputs: - (prompt, prompt_token_ids, multi_modal_data, - mm_processor_kwargs) = prompt_comps - - prompt_token_ids = self._apply_prompt_adapter( - prompt_token_ids, prompt_adapter_request=prompt_adapter_request) + if prompt_inputs["type"] == "token": + prompt_inputs["prompt_token_ids"] = self._apply_prompt_adapter( + prompt_inputs["prompt_token_ids"], + prompt_adapter_request=prompt_adapter_request, + ) + else: + assert_never(prompt_inputs) - return DecoderOnlyInputs(prompt_token_ids=prompt_token_ids, - prompt=prompt, - multi_modal_data=multi_modal_data, - mm_processor_kwargs=mm_processor_kwargs) + return prompt_inputs def _process_decoder_only_prompt( self, @@ -490,7 +493,7 @@ def _process_decoder_only_prompt( * :class:`DecoderOnlyInputs` instance ''' - prompt_comps = self._extract_prompt_components( + prompt_comps = self._prompt_to_llm_inputs( prompt, request_id=request_id, lora_request=lora_request, @@ -509,7 +512,7 @@ async def _process_decoder_only_prompt_async( prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> DecoderOnlyInputs: """Async version of :meth:`_process_decoder_only_prompt`.""" - prompt_comps = await self._extract_prompt_components_async( + prompt_comps = await self._prompt_to_llm_inputs_async( prompt, request_id=request_id, lora_request=lora_request, @@ -526,7 +529,7 @@ def preprocess( request_id: str, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> Union[DecoderOnlyInputs, EncoderDecoderInputs]: + ) -> ProcessorInputs: """Preprocess the input prompt.""" if self.is_encoder_decoder_model(): # Encoder-decoder model requires special mapping of @@ -554,7 +557,7 @@ async def preprocess_async( request_id: str, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> Union[DecoderOnlyInputs, EncoderDecoderInputs]: + ) -> ProcessorInputs: """Async version of :meth:`preprocess`.""" if self.is_encoder_decoder_model(): # Encoder-decoder model requires special mapping of diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index fbf912a212568..7d7a797be4f60 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -2,7 +2,7 @@ from collections import UserDict from dataclasses import dataclass from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, NamedTuple, - Optional, Protocol, Type) + Optional, Protocol, Type, cast) from torch import nn from transformers import PretrainedConfig @@ -12,7 +12,7 @@ from vllm.utils import (get_allowed_kwarg_only_overrides, print_warning_once, resolve_mm_processor_kwargs) -from .data import DecoderOnlyInputs +from .data import ProcessorInputs if TYPE_CHECKING: from vllm.config import ModelConfig @@ -109,7 +109,7 @@ def __getitem__(self, key: str) -> int: raise KeyError(msg) from exc -InputProcessor = Callable[[InputContext, DecoderOnlyInputs], DecoderOnlyInputs] +InputProcessor = Callable[[InputContext, ProcessorInputs], ProcessorInputs] """Preprocess the inputs to the model.""" @@ -254,8 +254,8 @@ def dummy_data_for_profiling( def _default_input_processor( self, ctx: InputContext, - inputs: DecoderOnlyInputs, - ) -> DecoderOnlyInputs: + inputs: ProcessorInputs, + ) -> ProcessorInputs: """The default input processor is a no-op.""" return inputs @@ -288,7 +288,7 @@ def _get_model_input_processor(self, model_cls: Type[nn.Module]): .get(model_cls, self._default_input_processor) def 
process_input(self, model_config: "ModelConfig", - inputs: DecoderOnlyInputs) -> DecoderOnlyInputs: + inputs: ProcessorInputs) -> ProcessorInputs: """ Apply an input processor to an instance of model inputs. @@ -308,7 +308,7 @@ def process_input(self, model_config: "ModelConfig", # If it's empty, it'll fall back to the default kwarg values mm_processor_kwargs = resolve_mm_processor_kwargs( model_config.mm_processor_kwargs, - inputs.get("mm_processor_kwargs"), + cast(Dict[str, Any], inputs.get("mm_processor_kwargs")), processor, ) diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 658a3700f33d6..e347ca80ff765 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -299,3 +299,33 @@ def get_act_fn( return ScaledActivation(act_fn, intermediate_size, input_is_parallel, params_dtype) return act_fn + + +_ACTIVATION_AND_MUL_REGISTRY = LazyDict({ + "gelu": lambda: GeluAndMul(), + "silu": lambda: SiluAndMul(), +}) + + +def get_act_and_mul_fn( + act_fn_name: str, + quant_config: Optional[QuantizationConfig] = None, + intermediate_size: Optional[int] = None, + input_is_parallel: bool = True, + params_dtype: Optional[torch.dtype] = None, +) -> nn.Module: + """Get an activation-and-mul (i.e. SiluAndMul) function by name.""" + act_fn_name = act_fn_name.lower() + if act_fn_name not in _ACTIVATION_AND_MUL_REGISTRY: + raise ValueError( + f"Activation function {act_fn_name!r} is not supported.") + + act_fn = _ACTIVATION_AND_MUL_REGISTRY[act_fn_name] + if (quant_config is not None + and act_fn_name in quant_config.get_scaled_act_names()): + if intermediate_size is None: + raise ValueError("intermediate_size must be specified for scaled " + "activation functions.") + return ScaledActivation(act_fn, intermediate_size, input_is_parallel, + params_dtype) + return act_fn diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 718967a065192..78965d7b9495c 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -203,8 +203,9 @@ def create_qweight_for_4bit(): qweight = create_qweight_for_8bit() else: qweight = create_qweight_for_4bit() - - layer.register_parameter("qweight", qweight) + # Enable parameters to have the same name as in the BNB + # checkpoint format. 
+ layer.register_parameter("weight", qweight) set_weight_attrs(qweight, extra_weight_attrs) def apply(self, @@ -234,7 +235,7 @@ def _apply_8bit_weight( reshape_after_matmul = True bf_x = x.to(torch.bfloat16) - qweight = layer.qweight + qweight = layer.weight offsets = qweight.bnb_shard_offsets quant_states = qweight.bnb_quant_state matmul_states = qweight.matmul_state @@ -313,7 +314,7 @@ def _apply_4bit_weight( reshape_after_matmul = True bf_x = x.to(torch.bfloat16) - qweight = layer.qweight + qweight = layer.weight quant_states = qweight.bnb_quant_state offsets = qweight.bnb_shard_offsets diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index e54052632e468..cca8170ada6fd 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -54,7 +54,7 @@ def get_name(cls) -> str: @classmethod def get_supported_act_dtypes(cls) -> List[torch.dtype]: - return [torch.bfloat16] + return [torch.bfloat16, torch.float16] @classmethod def get_min_capability(cls) -> int: diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py index bce91f1d7fd5e..bca44d2bf2e28 100644 --- a/vllm/model_executor/layers/resampler.py +++ b/vllm/model_executor/layers/resampler.py @@ -177,7 +177,7 @@ def __init__(self, embed_dim, bias=False, quant_config=quant_config, - prefix=prefix) + prefix=f"{prefix}.kv_proj") else: # Maintain the same return value with ReplicatedLinear.forward self.kv_proj = lambda *args, **kwargs: ( # type: ignore # noqa diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index c3e0290f270ae..1f8d531198324 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -892,7 +892,7 @@ def _quantized_8bit_generator(self, hf_weights_files, use_safetensors, if not weight_name.lower().endswith(".scb"): continue - weight_key = weight_name.lower().replace(".scb", ".qweight") + weight_key = weight_name.lower().replace(".scb", ".weight") quant_state_dict[weight_key] = weight_tensor for weight_name, weight_tensor in self._hf_weight_iter( @@ -901,11 +901,9 @@ def _quantized_8bit_generator(self, hf_weights_files, use_safetensors, if self._is_8bit_weight_name(weight_name): continue - qweight_name = weight_name.replace(".weight", ".qweight") - - if qweight_name in quant_state_dict: + if weight_name in quant_state_dict: set_weight_attrs(weight_tensor, {"load_in_8bit": True}) - yield qweight_name, weight_tensor + yield weight_name, weight_tensor else: yield weight_name, weight_tensor @@ -950,9 +948,8 @@ def _parse_quant_state(param_name: str, (f"{weight_name}.quant_state.bitsandbytes__fp4" \ in temp_state_dict): quant_state = _parse_quant_state(weight_name, temp_state_dict) - weight_name = weight_name.replace(".weight", ".qweight") quant_state_dict[weight_name] = quant_state - yield weight_name.replace(".weight", ".qweight"), weight_tensor + yield weight_name, weight_tensor else: yield weight_name, weight_tensor @@ -967,7 +964,6 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors, if any(target_module in weight_name for target_module in self.target_modules) and weight_name.endswith(".weight"): - weight_name = weight_name.replace(".weight", ".qweight") # Without sharding if any( weight_name.startswith(module) @@ -1093,7 +1089,7 @@ def _load_weights(self, model_config: ModelConfig, # Some models, such as MiniCPM V2.5/2.6, contain both # 
module names 'kv_proj' and 'qkv_proj'. To prevent 'kv_proj' # from being incorrectly identified as being present in - # 'vpm.encoder.layers.0.self_attn.qkv_proj.qweight + # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight if shard_pos > 0 and quant_param_name[shard_pos - 1] == ".": shard_index = index quant_param_name = quant_param_name.replace( diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index c3c9ec703c1e6..181f3c2b0fc35 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -54,8 +54,9 @@ def mm_input_mapper_for_glmv( data: MultiModalData[object], ) -> Dict: model_config = ctx.model_config - tokenizer = cached_get_tokenizer(model_config.tokenizer, - trust_remote_code=True) + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code) if tokenizer is None: raise RuntimeError("No HuggingFace processor is available " "to process the image object") @@ -525,7 +526,7 @@ def _parse_and_validate_image_input( elif isinstance(pixel_values, list): return torch.concat(pixel_values) else: - raise TypeError("""pixel_values must be a torch.Tensor + raise TypeError("""pixel_values must be a torch.Tensor or a list of torch.Tensor """) return GLMImagePixelInputs(pixel_values=pixel_values) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index ec726dc4ff4fa..985ba6f3c60c1 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -39,8 +39,8 @@ def __init__(self, super().__init__() self.config = config self.is_falcon_mamba = config.model_type == "falcon_mamba" - mixer_rms_rps = config.mixer_rms_rps if self.is_falcon_mamba else None - self.mamba = MambaMixer(hidden_size=config.hidden_size, + mixer_rms_eps = config.mixer_rms_eps if self.is_falcon_mamba else None + self.mixer = MambaMixer(hidden_size=config.hidden_size, ssm_state_size=config.state_size, conv_kernel_size=config.conv_kernel, intermediate_size=config.intermediate_size, @@ -48,7 +48,7 @@ def __init__(self, use_conv_bias=config.use_conv_bias, use_bias=config.use_bias, use_rms_norm=self.is_falcon_mamba, - rms_norm_eps=mixer_rms_rps, + rms_norm_eps=mixer_rms_eps, activation=config.hidden_act) self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) @@ -99,7 +99,6 @@ def __init__( for i in range(config.num_hidden_layers): decoder_layers.append( MambaDecoderLayer(config, - layer_idx=i, cache_config=cache_config, quant_config=quant_config)) self.layers = nn.ModuleList(decoder_layers) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index d30b9addd09f1..251bfc079684e 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -36,8 +36,8 @@ from vllm.attention.ops.paged_attn import PagedAttention from vllm.config import CacheConfig, MultiModalConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - EncoderDecoderInputs, InputContext) +from vllm.inputs import (INPUT_REGISTRY, DummyData, EncoderDecoderInputs, + InputContext, TokenInputs, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -52,6 +52,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sequence import 
SequenceData +from vllm.utils import is_list_of from .clip import CLIPMLP from .interfaces import SupportsMultiModal @@ -86,41 +87,58 @@ def _get_num_image_in_last_group(prompt_token_ids: List[int]) -> int: return num_images -def input_processor_for_mllama(ctx: InputContext, - inputs: Union[DecoderOnlyInputs, - EncoderDecoderInputs]): - # move encoder_prompt to prompt - if inputs.get("prompt") is None: - inputs["prompt"] = inputs["encoder_prompt"] - inputs["prompt_token_ids"] = inputs["encoder_prompt_token_ids"] +def input_processor_for_mllama( + ctx: InputContext, + inputs: EncoderDecoderInputs, +) -> EncoderDecoderInputs: + # Example input to processor: + # { + # 'encoder': { + # 'type': 'token', + # 'prompt_token_ids': [128000, 128256, 128000, 3923, 374, 279, 2262, 315, 420, 2217, 30], # noqa: E501 + # 'prompt': '<|image|><|begin_of_text|>What is the content of this image?', # noqa: E501 + # 'multi_modal_data': {'image': }, # noqa: E501 + # }, + # 'decoder': { + # 'type': 'token', + # 'prompt_token_ids': [128000], + # }, + # } + + # move encoder prompt to decoder + dec_inputs = TokenInputs(**inputs["encoder"]) + + multi_modal_data = dec_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + # text-only + return EncoderDecoderInputs( + encoder=token_inputs([]), + decoder=dec_inputs, + ) - # process multi-modal data - multi_modal_data = inputs.get("encoder_multi_modal_data") + image_data = multi_modal_data["image"] + if isinstance(image_data, Image.Image): + image_data = [image_data] - if multi_modal_data is None or "image" not in multi_modal_data \ - or multi_modal_data["image"] is None: - # text-only - inputs["encoder_prompt"] = "" - inputs["encoder_prompt_token_ids"] = [] - inputs["encoder_multi_modal_data"] = {} - return inputs + assert is_list_of(image_data, Image.Image) - if isinstance(multi_modal_data['image'], Image.Image): - multi_modal_data['image'] = [multi_modal_data['image']] # Since only the last group of consecutive images # are attended by the decoded tokens, we only need to # get the number of tiles for those images. num_decode_images = _get_num_image_in_last_group( - inputs["prompt_token_ids"]) + dec_inputs["prompt_token_ids"]) + hf_config = ctx.model_config.hf_config + vision_config = hf_config.vision_config + num_tiles = 0 - for image in multi_modal_data["image"][::-1]: + for image in image_data[::-1]: width, height = image.size - tile_size = hf_config.vision_config.image_size + tile_size = vision_config.image_size canvas_height, canvas_width = get_optimal_tiled_canvas( image_height=height, image_width=width, - max_image_tiles=hf_config.vision_config.max_num_tiles, + max_image_tiles=vision_config.max_num_tiles, tile_size=tile_size, ) num_tiles_height = canvas_height // tile_size @@ -133,14 +151,34 @@ def input_processor_for_mllama(ctx: InputContext, # Set encoder prompt length based on the number of tiles. # This tells the block manager to allocate correct number # of slots for encoder tokens. 
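
# --- Illustrative sketch (not part of the patch) -----------------------------
# Worked example of the encoder-length arithmetic this hunk computes
# (num_tiles above, token_per_chunk just below), taking image_size=560 and a
# 2x2 tiled canvas purely as example values; the assert below requires
# image_size to be a multiple of 14:
tile_size = 560                                # stand-in for vision_config.image_size
token_per_chunk = (tile_size // 14) ** 2 + 1   # 40 * 40 patches + 1 = 1601
num_tiles = 2 * 2                              # num_tiles_height * num_tiles_width
num_tokens = num_tiles * token_per_chunk       # 6404 encoder slots to allocate
# --- end sketch ---------------------------------------------------------------
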
- assert hf_config.vision_config.image_size % 14 == 0, \ + assert vision_config.image_size % 14 == 0, \ "chunk size should be multiple of 14" - token_per_chunk = (hf_config.vision_config.image_size // 14)**2 + 1 + token_per_chunk = (vision_config.image_size // 14)**2 + 1 num_tokens = num_tiles * token_per_chunk - inputs["encoder_prompt"] = MLLAMA_IMAGE_TOKEN * num_tokens - inputs["encoder_prompt_token_ids"] = [MLLAMA_IMAGE_TOKEN_ID] * num_tokens - return inputs + # Example output from processor: + # { + # 'encoder': { + # 'type': 'token', + # 'prompt_token_ids': [128256, 128256, ..., 128256], + # 'prompt': '<|image|><|image|>...<|image|>', + # 'multi_modal_data': {'image': }, # noqa: E501 + # }, + # 'decoder': { + # 'type': 'token', + # 'prompt_token_ids': [128000, 128256, 128000, 3923, 374, 279, 2262, 315, 420, 2217, 30], # noqa: E501 + # 'prompt': '<|image|><|begin_of_text|>What is the content of this image?', # noqa: E501 + # 'multi_modal_data': {'image': }, # noqa: E501 + # }, + # } + return EncoderDecoderInputs( + encoder=token_inputs( + prompt_token_ids=[MLLAMA_IMAGE_TOKEN_ID] * num_tokens, + prompt=MLLAMA_IMAGE_TOKEN * num_tokens, + multi_modal_data=multi_modal_data, + ), + decoder=dec_inputs, + ) def get_max_mllama_image_tokens(ctx: InputContext) -> int: diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index ba798833e26a9..07c06149f0206 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -844,9 +844,10 @@ def get_max_tokens(max_crops: int, crop_patches: int, left_margin: int, def get_max_molmo_image_tokens(ctx: InputContext) -> int: - processor = cached_get_processor(ctx.model_config.model, - trust_remote_code=True, - revision=ctx.model_config.code_revision) + processor = cached_get_processor( + ctx.model_config.model, + trust_remote_code=ctx.model_config.trust_remote_code, + revision=ctx.model_config.code_revision) image_processor = processor.image_processor max_llm_image_tokens = get_max_tokens( image_processor.max_crops, @@ -870,9 +871,10 @@ def image_input_mapper_for_molmo( def dummy_data_for_molmo(ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]): - processor = cached_get_processor(ctx.model_config.model, - trust_remote_code=True, - revision=ctx.model_config.code_revision) + processor = cached_get_processor( + ctx.model_config.model, + trust_remote_code=ctx.model_config.trust_remote_code, + revision=ctx.model_config.code_revision) image_processor = processor.image_processor base_image_input_d = image_processor.image_patch_size @@ -935,11 +937,11 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): multi_modal_data = inputs.get("multi_modal_data") image = None if multi_modal_data is None else multi_modal_data.get("image") - processor = cached_get_processor(ctx.model_config.model, - trust_remote_code=True, - revision=ctx.model_config.code_revision) - model_config = ctx.model_config + processor = cached_get_processor( + ctx.model_config.model, + trust_remote_code=model_config.trust_remote_code, + revision=ctx.model_config.code_revision) tokenizer = cached_get_tokenizer( model_config.tokenizer, trust_remote_code=model_config.trust_remote_code) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 051454c49bff8..ee9f150b17cfc 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -19,8 +19,11 @@ from vllm.config import CacheConfig, ModelConfig, MultiModalConfig from vllm.inputs 
import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) -from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -798,20 +801,24 @@ def __init__( super().__init__() assert config.intermediate_size is not None - # TODO: Use quant_config and prefix after optimizing this - self.gate_proj = nn.Linear(config.hidden_size, - config.intermediate_size, - bias=False) - self.up_proj = nn.Linear(config.hidden_size, - config.intermediate_size, - bias=False) - self.down_proj = nn.Linear(config.intermediate_size, - config.hidden_size, - bias=False) - self.act = get_act_fn(config.hidden_act) + self.gate_up_proj = MergedColumnParallelLinear( + input_size=config.hidden_size, + output_sizes=[config.intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = RowParallelLinear(input_size=config.intermediate_size, + output_size=config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") + self.act_and_mul = get_act_and_mul_fn(config.hidden_act) def forward(self, x: torch.Tensor) -> torch.Tensor: - return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x)) + gate_up, _ = self.gate_up_proj(x) + x = self.act_and_mul(gate_up) + x, _ = self.down_proj(x) + return x class PixtralHFAttention(nn.Module): @@ -830,21 +837,21 @@ def __init__( self.n_heads = config.num_attention_heads self.head_dim = config.hidden_size // config.num_attention_heads - self.scale = self.head_dim**-0.5 - - # TODO: Use quant_config and prefix after optimizing this - self.q_proj = nn.Linear(config.hidden_size, - config.hidden_size, - bias=False) - self.k_proj = nn.Linear(config.hidden_size, - config.hidden_size, - bias=False) - self.v_proj = nn.Linear(config.hidden_size, - config.hidden_size, - bias=False) - self.o_proj = nn.Linear(config.hidden_size, - config.hidden_size, - bias=False) + self.qkv_proj = QKVParallelLinear( + hidden_size=config.hidden_size, + head_size=self.head_dim, + total_num_heads=self.n_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + input_size=config.hidden_size, + output_size=config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) def forward( self, @@ -854,13 +861,13 @@ def forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: batch, patches, _ = hidden_states.size() - q = self.q_proj(hidden_states) - k = self.k_proj(hidden_states) - v = self.v_proj(hidden_states) + qkv_states, _ = self.qkv_proj(hidden_states) + q, k, v = qkv_states.chunk(3, dim=-1) # Transpose q and k to apply HF's Rotary Position Embedding q = q.view(batch, patches, self.n_heads, self.head_dim).transpose(1, 2) k = k.view(batch, patches, self.n_heads, self.head_dim).transpose(1, 2) + v = v.view(batch, patches, self.n_heads, self.head_dim) cos, sin = position_embeddings q, k = apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=0) @@ -868,22 +875,21 @@ def forward( # Transpose q and k back for attention q = q.transpose(1, 2).contiguous() k = 
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 3a929f5cb5195..af52fbffba19e 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -343,6 +343,11 @@ def register_model(
     def _raise_for_unsupported(self, architectures: List[str]):
         all_supported_archs = self.get_supported_archs()

+        if any(arch in all_supported_archs for arch in architectures):
+            raise ValueError(
+                f"Model architectures {architectures} failed "
+                "to be inspected. Please check the logs for more details.")
+
         raise ValueError(
             f"Model architectures {architectures} are not supported for now. "
             f"Supported architectures: {all_supported_archs}")
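The registry change above splits one error into two: the architecture is known but could not be inspected (for example, its module failed to import), versus the architecture simply is not registered. A rough sketch of the same branching, with a plain dict standing in for the registry and hypothetical names throughout:

```python
from typing import List, Optional, Type

# Hypothetical registry: arch name -> model class, or None when the arch is
# known but inspection/import failed earlier in the process.
_REGISTRY: dict = {"LlamaForCausalLM": object, "BrokenArch": None}


def resolve(architectures: List[str]) -> Type:
    for arch in architectures:
        model_cls: Optional[Type] = _REGISTRY.get(arch)
        if model_cls is not None:
            return model_cls

    if any(arch in _REGISTRY for arch in architectures):
        # Known architecture, but inspection failed: point the user at the logs
        # instead of claiming the model is unsupported.
        raise ValueError(f"Model architectures {architectures} failed "
                         "to be inspected. Please check the logs for details.")

    raise ValueError(f"Model architectures {architectures} are not supported. "
                     f"Supported architectures: {list(_REGISTRY)}")
```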
" f"Supported architectures: {all_supported_archs}") diff --git a/vllm/sequence.py b/vllm/sequence.py index 44a9257c9a4c1..7d7ddc7ec4447 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -9,12 +9,12 @@ from typing import (TYPE_CHECKING, Any, Callable, DefaultDict, Dict, List, Mapping, Optional) from typing import Sequence as GenericSequence -from typing import Set, Tuple, Union, cast +from typing import Set, Tuple, Union import msgspec import torch +from typing_extensions import assert_never -from vllm.inputs.parse import is_encoder_decoder_inputs from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict from vllm.pooling_params import PoolingParams @@ -379,15 +379,10 @@ def __repr__(self) -> str: class Sequence: """Stores the data, status, and block information of a sequence. - - The sequence is constructed from the :code:`SingletonInputs` instance - passed in through the :code:`inputs` constructor argument. - - For encoder/decoder models, SingletonInputs encapsulates both a - decoder and encoder prompt, creating an ambiguity about which - prompt to construct the sequence from. The `from_decoder_prompt` - constructor argument signals whether to construct the Sequence - from the SingletonInputs decoder prompt, or encoder prompt. + + The sequence is constructed from the :data:`DecoderOnlyInputs` + (for decoder-only) or :data:`EncoderDecoderInputs` (for encoder-decoder) + instance passed in through the :code:`inputs` constructor argument. Args: seq_id: The ID of the sequence. @@ -397,10 +392,6 @@ class Sequence: eos_token_id: The end-of-sequence (EOS) token id recognized by this LLM. lora_request: LoRA request. prompt_adapter_request: Prompt Adapter request. - from_decoder_prompt: Construct Sequence from SingletonInputs decoder - prompt (True) or encoder prompt (False.) Must be - True for decoder-only model. - """ def __init__( @@ -411,7 +402,6 @@ def __init__( eos_token_id: Optional[int] = None, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - from_decoder_prompt: bool = True, ) -> None: self.seq_id = seq_id self.inputs = inputs @@ -419,33 +409,6 @@ def __init__( self.eos_token_id = eos_token_id self.lora_request = lora_request self.prompt_adapter_request = prompt_adapter_request - self.from_decoder_prompt = from_decoder_prompt - - # For decoder-only models, a Sequence is constructed - # from an DecoderOnlyInputs instance (the `inputs` arg.) - # - # For encoder/decoder models the same `inputs` - # instance could be utilized to construct either an - # encoder sequence or a decoder sequence, because - # `DecoderOnlyInputs` has both decoder- and encoder-oriented - # member variables (i.e. it encapsulates both an encoder - # and a decoder prompt.) The decision of which type of sequence - # to generate is determined by the `from_decoder_prompt` argument. - # - # When constructing a encoder sequence - # (`from_decoder_prompt` False) it matters that - # the `DecoderOnlyInputs` instance stored in `inputs` is valid - # in the sense that its encoder-related member variables are - # populated; below, an exception is raised if this is - # not the case. - # - # When constructing a decoder sequence (`from_decoder_prompt` True) - # it does not matter whether `inputs` has its encoder-related - # member variables populated. 
diff --git a/vllm/utils.py b/vllm/utils.py
index a742ec8d76908..0b75e8761c916 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1148,9 +1148,23 @@ def __call__(self, parser, namespace, values, option_string=None):
                              "Expected 'true' or 'false'.")


+class SortedHelpFormatter(argparse.HelpFormatter):
+    """SortedHelpFormatter that sorts arguments by their option strings."""
+
+    def add_arguments(self, actions):
+        actions = sorted(actions, key=lambda x: x.option_strings)
+        super(SortedHelpFormatter, self).add_arguments(actions)
+
+
 class FlexibleArgumentParser(argparse.ArgumentParser):
     """ArgumentParser that allows both underscore and dash in names."""

+    def __init__(self, *args, **kwargs):
+        # Set the default 'formatter_class' to SortedHelpFormatter
+        if 'formatter_class' not in kwargs:
+            kwargs['formatter_class'] = SortedHelpFormatter
+        super().__init__(*args, **kwargs)
+
     def parse_args(self, args=None, namespace=None):
         if args is None:
             args = sys.argv[1:]
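`SortedHelpFormatter` above only overrides `add_arguments` so that `--help` lists options alphabetically, and `FlexibleArgumentParser` now installs it as the default `formatter_class` unless the caller supplies one. The same idea in isolation, using a plain `argparse.ArgumentParser` rather than the vLLM parser:

```python
import argparse


class SortedHelpFormatter(argparse.HelpFormatter):
    """Sort arguments alphabetically by their option strings in --help."""

    def add_arguments(self, actions):
        actions = sorted(actions, key=lambda x: x.option_strings)
        super().add_arguments(actions)


parser = argparse.ArgumentParser(formatter_class=SortedHelpFormatter)
parser.add_argument("--zeta")
parser.add_argument("--alpha")
parser.add_argument("--model")

# --alpha now appears before --model and --zeta in the generated help text.
parser.print_help()
```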
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 328dab598f8ef..1e8ea4e8e79cf 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -48,9 +48,10 @@
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
 from vllm.transformers_utils.config import uses_mrope
-from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, async_tensor_h2d,
-                        flatten_2d_lists, is_pin_memory_available,
-                        supports_dynamo, weak_ref_tensor)
+from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache,
+                        async_tensor_h2d, flatten_2d_lists,
+                        is_pin_memory_available, supports_dynamo,
+                        weak_ref_tensor)
 from vllm.worker.model_runner_base import (
     ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase,
     _add_attn_metadata_broadcastable_dict,
@@ -135,6 +136,18 @@ def from_broadcasted_tensor_dict(
                 attn_backend, tensor_dict)
         return cls(**tensor_dict)

+    # Exclude `async_callback` to be able to pickle this object
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        del state["async_callback"]
+        return state
+
+    # TODO: What happens when we depickle this object?
+    # How can we update this callback to properly pass it to the engine?
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        self.__dict__.update({'async_callback': None})
+

 @dataclass(frozen=True)
 class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU):
@@ -1383,16 +1396,16 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
            per sequence in the batch.
         """
         assert not self.model_config.enforce_eager
-        logger.info("Capturing the model for CUDA graphs. This may lead to "
+        logger.info("Capturing cudagraphs for decoding. This may lead to "
                     "unexpected consequences if the model is not static. To "
                     "run the model in eager mode, set 'enforce_eager=True' or "
                     "use '--enforce-eager' in the CLI.")
-        logger.info("CUDA graphs can take additional 1~3 GiB memory per GPU. "
-                    "If you are running out of memory, consider decreasing "
-                    "`gpu_memory_utilization` or enforcing eager mode. "
-                    "You can also reduce the `max_num_seqs` as needed "
-                    "to decrease memory usage.")
+        logger.info("If out-of-memory error occurs during cudagraph capture,"
+                    " consider decreasing `gpu_memory_utilization` or "
+                    "switching to eager mode. You can also reduce the "
+                    "`max_num_seqs` as needed to decrease memory usage.")
         start_time = time.perf_counter()
+        start_free_gpu_memory = torch.cuda.mem_get_info()[0]

         # Prepare dummy inputs. These will be reused for all batch sizes.
         max_batch_size = self.max_batchsize_to_capture
@@ -1497,9 +1510,12 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
                                       graph_runner)

         end_time = time.perf_counter()
+        end_free_gpu_memory = torch.cuda.mem_get_info()[0]
         elapsed_time = end_time - start_time
+        cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory
         # This usually takes < 10 seconds.
-        logger.info("Graph capturing finished in %.0f secs.", elapsed_time)
+        logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
+                    elapsed_time, cuda_graph_size / GiB_bytes)

     def _update_inputs_to_capture_for_enc_dec_model(self,
                                                     capture_inputs: Dict[str,