diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c65ab04b8ddda..42a1eacb6de57 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -13,6 +13,9 @@ steps: - label: Basic Correctness Test command: pytest -v -s --forked basic_correctness + +- label: Core Test + command: pytest -v -s core - label: Distributed Comm Ops Test command: pytest -v -s --forked test_comm_ops.py @@ -25,7 +28,7 @@ steps: num_gpus: 2 # only support 1 or 2 for now. - label: Engine Test - command: pytest -v -s engine + command: pytest -v -s engine test_sequence.py - label: Entrypoints Test command: pytest -v -s entrypoints @@ -49,6 +52,9 @@ steps: - label: Worker Test command: pytest -v -s worker +- label: Speculative decoding tests + command: pytest -v -s spec_decode + - label: LoRA Test command: pytest -v -s lora --forked diff --git a/.github/workflows/remote-push.yml b/.github/workflows/remote-push.yml index 5bad8eab365a7..03aeabf5278d7 100644 --- a/.github/workflows/remote-push.yml +++ b/.github/workflows/remote-push.yml @@ -21,7 +21,7 @@ jobs: uses: ./.github/workflows/build-test.yml with: label: aws-avx2-192G-4-a10g-96G - timeout: 180 + timeout: 240 gitref: '${{ github.ref }}' Gi_per_thread: 4 python: ${{ matrix.python }} diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 54ae06be6e101..a45265d79a6ac 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -23,6 +23,9 @@ RUN echo "FA_BRANCH is $FA_BRANCH" # In that case, we need to use the python reference attention implementation in vllm ARG BUILD_FA="1" +# whether to build cupy on rocm +ARG BUILD_CUPY="1" + # Install some basic utilities RUN apt-get update && apt-get install python3 python3-pip -y @@ -70,16 +73,33 @@ RUN if [ "$BUILD_FA" = "1" ]; then \ && cd ..; \ fi -COPY ./ /app/vllm - -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install xformers==0.0.23 --no-deps - # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. # Manually removed it so that later steps of numpy upgrade can continue RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi +# build cupy +RUN if [ "$BUILD_CUPY" = "1" ]; then \ + mkdir -p libs \ + && cd libs \ + && git clone -b hipgraph_enablement --recursive https://github.com/ROCm/cupy.git \ + && cd cupy \ + && pip install mpi4py-mpich \ + && pip install scipy==1.9.3 \ + && pip install cython==0.29.* \ + && env CC=$MPI_HOME/bin/mpicc python -m pip install mpi4py \ + && export CUPY_INSTALL_USE_HIP=1 \ + && export ROCM_HOME=/opt/rocm \ + && export HCC_AMDGPU_TARGET="gfx90a,gfx942,gfx1100" \ + && pip install . \ + && cd ..; \ + fi + +COPY ./ /app/vllm + +RUN python3 -m pip install --upgrade pip +RUN python3 -m pip install xformers==0.0.23 --no-deps + RUN cd /app \ && cd vllm \ && pip install -U -r requirements-rocm.txt \ diff --git a/README.md b/README.md index 22304eb48f4f8..9e4d1e7879216 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ pip install -e . ## Quickstart -Neural Magic maintains a variety of sparse models on our Hugging Face organization profiles, [neuralmagic](https://huggingface.co/neuralmagic) and [nm-testing](https://huggingface.co/nm-testing). +Neural Magic maintains a variety of sparse models on our Hugging Face organization profiles, [neuralmagic](https://huggingface.co/neuralmagic) and [nm-testing](https://huggingface.co/nm-testing). 
A collection of ready-to-use SparseGPT and GPTQ models in inference optimized marlin format are [available on Hugging Face](https://huggingface.co/collections/neuralmagic/compressed-llms-for-nm-vllm-65e73e3d51d3200e34b77431) @@ -63,7 +63,7 @@ For a quick demonstration, here's how to run a small [50% sparse llama2-110M](ht from vllm import LLM, SamplingParams model = LLM( - "neuralmagic/llama2.c-stories110M-pruned50", + "neuralmagic/llama2.c-stories110M-pruned50", sparsity="sparse_w16a16", # If left off, model will be loaded as dense ) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 11e1b6a04e093..26d2c24d5655c 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -277,10 +277,80 @@ async def async_request_openai_completions( return output +async def async_request_openai_chat_completions( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith( + "v1/chat/completions" + ), "OpenAI Chat API URL must end with 'v1/chat/completions'." + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + assert not request_func_input.use_beam_search + payload = { + "model": request_func_input.model, + "messages": [ + { + "role": "user", + "content": request_func_input.prompt, + }, + ], + "temperature": 0.0, + "max_tokens": request_func_input.output_len, + "stream": True, + } + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0 + st = time.perf_counter() + try: + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: + async for chunk in response.content: + if ttft == 0: + ttft = time.perf_counter() - st + output.ttft = ttft + + chunk = chunk.strip() + if not chunk: + continue + + chunk = chunk.decode("utf-8").lstrip("data: ") + if chunk == "[DONE]": + latency = time.perf_counter() - st + else: + body = json.loads(chunk) + if "content" in body["choices"][0]["delta"]: + generated_text += body["choices"][0]["delta"][ + "content"] + + output.generated_text = generated_text + output.success = True + output.latency = latency + else: + output.success = False + except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError): + output.success = False + + if pbar: + pbar.update(1) + return output + + ASYNC_REQUEST_FUNCS = { "tgi": async_request_tgi, "vllm": async_request_vllm, "deepspeed-mii": async_request_deepspeed_mii, "openai": async_request_openai_completions, + "openai-chat": async_request_openai_chat_completions, "tensorrt-llm": async_request_trt_llm, } diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 7d389a9c7d703..3f5e2d9c8f4dc 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -12,7 +12,7 @@ On the client side, run: python benchmarks/benchmark_serving.py \ --backend \ - --tokenizer --dataset \ + --model --dataset \ --request-rate """ import argparse @@ -171,10 +171,10 @@ async def benchmark( else: raise ValueError(f"Unknown backend: {backend}") - pbar = None if disable_tqdm else tqdm(total=len(input_requests)) - print(f"Traffic request rate: {request_rate}") + pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + benchmark_start_time = time.perf_counter() tasks = [] async for request 
in get_request(input_requests, request_rate): diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index b5be3befa07e2..5e61668d5cc1a 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -15,9 +15,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifdef USE_ROCM -#include -#endif #include #include @@ -31,11 +28,6 @@ #include -#ifndef USE_ROCM -#define WARP_SIZE 32 -#else -#define WARP_SIZE warpSize -#endif #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) diff --git a/csrc/cuda_compat.h b/csrc/cuda_compat.h index aa58dd73c148a..c711d8d1b24b9 100644 --- a/csrc/cuda_compat.h +++ b/csrc/cuda_compat.h @@ -1,5 +1,15 @@ #pragma once +#ifdef USE_ROCM +#include +#endif + +#ifndef USE_ROCM + #define WARP_SIZE 32 +#else + #define WARP_SIZE warpSize +#endif + #ifndef USE_ROCM #define VLLM_LDG(arg) __ldg(arg) #else diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index d5fee9c40d00c..4dc90de1ab42a 100644 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -14,6 +14,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 128) \ f(in_T, out_T, W_T, narrow, 256) \ f(in_T, out_T, W_T, narrow, 512) \ + f(in_T, out_T, W_T, narrow, 768) \ f(in_T, out_T, W_T, narrow, 1024) \ f(in_T, out_T, W_T, narrow, 1280) \ f(in_T, out_T, W_T, narrow, 1728) \ @@ -21,6 +22,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 2048) \ f(in_T, out_T, W_T, narrow, 2560) \ f(in_T, out_T, W_T, narrow, 2752) \ + f(in_T, out_T, W_T, narrow, 2816) \ f(in_T, out_T, W_T, narrow, 3072) \ f(in_T, out_T, W_T, narrow, 3456) \ f(in_T, out_T, W_T, narrow, 3584) \ @@ -36,6 +38,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 10240) \ f(in_T, out_T, W_T, narrow, 11008) \ f(in_T, out_T, W_T, narrow, 12288) \ + f(in_T, out_T, W_T, narrow, 13696) \ f(in_T, out_T, W_T, narrow, 13824) \ f(in_T, out_T, W_T, narrow, 14336) \ f(in_T, out_T, W_T, narrow, 16384) \ diff --git a/csrc/reduction_utils.cuh b/csrc/reduction_utils.cuh index b95ccef162072..210bf0b023ab2 100644 --- a/csrc/reduction_utils.cuh +++ b/csrc/reduction_utils.cuh @@ -24,7 +24,7 @@ namespace vllm { template __inline__ __device__ T warpReduceSum(T val) { #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) + for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) val += VLLM_SHFL_XOR_SYNC(val, mask); return val; } @@ -32,7 +32,7 @@ __inline__ __device__ T warpReduceSum(T val) { /* Calculate the sum of all elements in a block */ template __inline__ __device__ T blockReduceSum(T val) { - static __shared__ T shared[32]; + static __shared__ T shared[WARP_SIZE]; int lane = threadIdx.x & 0x1f; int wid = threadIdx.x >> 5; @@ -45,7 +45,7 @@ __inline__ __device__ T blockReduceSum(T val) { // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent // blockDim.x is not divided by 32 - val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f); + val = (threadIdx.x < (blockDim.x / (WARP_SIZE * 1.0f))) ? shared[lane] : (T)(0.0f); val = warpReduceSum(val); return val; } diff --git a/docs/source/conf.py b/docs/source/conf.py index 5a45c6f9d1e0a..61d24e1612128 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -72,8 +72,15 @@ # Mock out external dependencies here. 
autodoc_mock_imports = [ - "torch", "transformers", "psutil", "prometheus_client", "sentencepiece", - "vllm.cuda_utils", "vllm._C" + "torch", + "transformers", + "psutil", + "prometheus_client", + "sentencepiece", + "vllm.cuda_utils", + "vllm._C", + "numpy", + "tqdm", ] for mock_target in autodoc_mock_imports: diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst index 21b18c75fc552..71211189d184f 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/models/lora.rst @@ -90,9 +90,9 @@ Requests can specify the LoRA adapter as if it were any other model via the ``mo processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other LoRA adapter requests if they were provided and ``max_loras`` is set high enough). -The following is an example request +The following is an example request -.. code-block::bash +.. code-block::bash curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 9d4ec663a16e5..3b30dfaf3b47b 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -8,84 +8,109 @@ The following is the list of model architectures that are currently supported by Alongside each architecture, we include some popular models that use it. .. list-table:: - :widths: 25 25 50 + :widths: 25 25 50 5 :header-rows: 1 * - Architecture - Models - Example HuggingFace Models + - :ref:`LoRA ` * - :code:`AquilaForCausalLM` - Aquila - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. + - ✅︎ * - :code:`BaiChuanForCausalLM` - Baichuan - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc. + - * - :code:`ChatGLMModel` - ChatGLM - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. + - * - :code:`DeciLMForCausalLM` - DeciLM - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc. + - * - :code:`BloomForCausalLM` - BLOOM, BLOOMZ, BLOOMChat - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. + - * - :code:`FalconForCausalLM` - Falcon - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. + - * - :code:`GemmaForCausalLM` - Gemma - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. + - ✅︎ * - :code:`GPT2LMHeadModel` - GPT-2 - :code:`gpt2`, :code:`gpt2-xl`, etc. + - * - :code:`GPTBigCodeForCausalLM` - StarCoder, SantaCoder, WizardCoder - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. + - * - :code:`GPTJForCausalLM` - GPT-J - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. + - * - :code:`GPTNeoXForCausalLM` - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. + - * - :code:`InternLMForCausalLM` - InternLM - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. + - ✅︎ * - :code:`InternLM2ForCausalLM` - InternLM2 - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc. + - * - :code:`LlamaForCausalLM` - LLaMA, LLaMA-2, Vicuna, Alpaca, Yi - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc. 
+ - ✅︎ * - :code:`MistralForCausalLM` - Mistral, Mistral-Instruct - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc. + - ✅︎ * - :code:`MixtralForCausalLM` - Mixtral-8x7B, Mixtral-8x7B-Instruct - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, etc. + - ✅︎ * - :code:`MPTForCausalLM` - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. + - * - :code:`OLMoForCausalLM` - OLMo - :code:`allenai/OLMo-1B`, :code:`allenai/OLMo-7B`, etc. + - * - :code:`OPTForCausalLM` - OPT, OPT-IML - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. + - * - :code:`OrionForCausalLM` - Orion - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc. + - * - :code:`PhiForCausalLM` - Phi - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc. + - * - :code:`QWenLMHeadModel` - Qwen - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. + - * - :code:`Qwen2ForCausalLM` - Qwen2 - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc. + - ✅︎ * - :code:`StableLmForCausalLM` - StableLM - :code:`stabilityai/stablelm-3b-4e1t/` , :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc. + - If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. Otherwise, please refer to :ref:`Adding a New Model ` for instructions on how to implement support for your model. diff --git a/pyproject.toml b/pyproject.toml index c5db016cebdb7..f74e50265be24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,10 @@ requires = [ ] build-backend = "setuptools.build_meta" +[tool.ruff] +# Allow lines to be as long as 80. +line-length = 80 + [tool.ruff.lint] select = [ # pycodestyle diff --git a/requirements-dev.txt b/requirements-dev.txt index c5fc6f7f07993..18de4b7420de2 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -23,3 +23,7 @@ einops # required for MPT openai requests ray +peft + +# Benchmarking +aiohttp diff --git a/setup.py b/setup.py index 84c452225a0dc..af78769695811 100644 --- a/setup.py +++ b/setup.py @@ -144,8 +144,8 @@ def get_pytorch_rocm_arch() -> Set[str]: # If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator if env_arch_list is None: command = "rocm_agent_enumerator" - env_arch_list = subprocess.check_output([command]).decode('utf-8')\ - .strip().replace("\n", ";") + env_arch_list = (subprocess.check_output( + [command]).decode('utf-8').strip().replace("\n", ";")) arch_source_str = "rocm_agent_enumerator" else: arch_source_str = "PYTORCH_ROCM_ARCH env variable" diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 1edb19c550010..cb125a7bfec30 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -25,12 +25,8 @@ async def step_async(self): return [RequestOutput( request_id=self.request_id)] if self.request_id else [] - async def encode_request_async( - self, - *args, - **kwargs, - ): - return [1] + async def encode_request_async(self, *args, **kwargs): + pass def generate(self, request_id): self.request_id = request_id @@ -43,13 +39,16 @@ def add_request(self, **kwargs): self.add_request_calls += 1 async def add_request_async(self, **kwargs): - del kwargs # Unused self.add_request_calls += 1 + return def abort_request(self, request_id): del request_id # Unused self.abort_request_calls += 1 + def has_unfinished_requests(self): 
+ return self.request_id is not None + class MockAsyncLLMEngine(AsyncLLMEngine): @@ -72,20 +71,24 @@ async def test_new_requests_event(): await engine.add_request("2", "", None) engine.engine.generate("2") await asyncio.sleep(0) - assert engine.engine.add_request_calls == 2 - assert engine.engine.step_calls == 2 await asyncio.sleep(0) - assert engine.engine.step_calls == 3 + assert engine.engine.add_request_calls == 2 + assert engine.engine.step_calls >= 2 + await asyncio.sleep(0.001) + assert engine.engine.step_calls >= 3 engine.engine.stop_generating() - await asyncio.sleep(0) - assert engine.engine.step_calls == 4 - await asyncio.sleep(0) - assert engine.engine.step_calls == 4 + await asyncio.sleep(0.001) + old_step_calls = engine.engine.step_calls + await asyncio.sleep(0.001) + assert engine.engine.step_calls == old_step_calls await engine.add_request("3", "", None) await asyncio.sleep(0.01) assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == 5 + assert engine.engine.step_calls == old_step_calls + 1 await asyncio.sleep(0.01) assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == 5 + assert engine.engine.step_calls == old_step_calls + 1 + + engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True) + assert engine.get_tokenizer() is not None diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py index 32d110e0f0b47..e98bba8d43b49 100644 --- a/tests/async_engine/test_chat_template.py +++ b/tests/async_engine/test_chat_template.py @@ -73,7 +73,7 @@ def test_load_chat_template(): assert template_content is not None # Hard coded value for template_chatml.jinja assert template_content == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %} -{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" +{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" # noqa: E501 def test_no_load_chat_template(): @@ -117,4 +117,6 @@ async def test_get_gen_prompt(model, template, add_generation_prompt, add_generation_prompt=mock_request.add_generation_prompt) # Test assertion - assert result == expected_output, f"The generated prompt does not match the expected output for model {model} and template {template}" + assert result == expected_output, ( + f"The generated prompt does not match the expected output for " + f"model {model} and template {template}") diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py index 4043558bae919..7b1f4a9e1eb2f 100644 --- a/tests/async_engine/test_request_tracker.py +++ b/tests/async_engine/test_request_tracker.py @@ -4,25 +4,14 @@ from vllm.outputs import RequestOutput -class DummyEvent: - - def __init__(self): - self.flag = False - - def set(self): - self.flag = True - - def clear(self): - self.flag = False - - -def test_request_tracker(): +@pytest.mark.asyncio +async def test_request_tracker(): tracker = RequestTracker() - tracker.new_requests_event = DummyEvent() stream_1 = tracker.add_request("1") - assert tracker.new_requests_event.flag + assert tracker.new_requests_event.is_set() + await tracker.wait_for_new_requests() new, finished = tracker.get_new_and_finished_requests() - assert not tracker.new_requests_event.flag + assert not 
tracker.new_requests_event.is_set() assert len(new) == 1 assert new[0]["request_id"] == "1" assert not finished @@ -30,9 +19,10 @@ def test_request_tracker(): stream_2 = tracker.add_request("2") stream_3 = tracker.add_request("3") - assert tracker.new_requests_event.flag + assert tracker.new_requests_event.is_set() + await tracker.wait_for_new_requests() new, finished = tracker.get_new_and_finished_requests() - assert not tracker.new_requests_event.flag + assert not tracker.new_requests_event.is_set() assert len(new) == 2 assert new[0]["request_id"] == "2" assert new[1]["request_id"] == "3" @@ -43,7 +33,7 @@ def test_request_tracker(): # request_ids must be unique with pytest.raises(KeyError): tracker.add_request("1") - assert not tracker.new_requests_event.flag + assert not tracker.new_requests_event.is_set() tracker.abort_request("1") new, finished = tracker.get_new_and_finished_requests() @@ -54,7 +44,8 @@ def test_request_tracker(): stream_4 = tracker.add_request("4") tracker.abort_request("4") - assert tracker.new_requests_event.flag + assert tracker.new_requests_event.is_set() + await tracker.wait_for_new_requests() new, finished = tracker.get_new_and_finished_requests() assert len(finished) == 1 assert "4" in finished @@ -62,11 +53,12 @@ def test_request_tracker(): assert stream_4.finished stream_5 = tracker.add_request("5") - assert tracker.new_requests_event.flag + assert tracker.new_requests_event.is_set() tracker.process_request_output( - RequestOutput("2", "output", [], [], [], bool(finished))) + RequestOutput("2", "output", [], [], [], finished=True)) + await tracker.wait_for_new_requests() new, finished = tracker.get_new_and_finished_requests() - assert not tracker.new_requests_event.flag + assert not tracker.new_requests_event.is_set() assert len(finished) == 1 assert "2" in finished assert len(new) == 1 diff --git a/tests/worker/spec_decode/__init__.py b/tests/core/__init__.py similarity index 100% rename from tests/worker/spec_decode/__init__.py rename to tests/core/__init__.py diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py new file mode 100644 index 0000000000000..b280fd1d73c2f --- /dev/null +++ b/tests/core/test_block_manager.py @@ -0,0 +1,276 @@ +import pytest +import time +from typing import List + +from vllm import SamplingParams +from vllm.block import PhysicalTokenBlock +from vllm.core.block_manager import (BlockAllocator, BlockSpaceManager, + AllocStatus) +from vllm.utils import Device +from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob + +from .utils import create_dummy_prompt + + +def test_block_allocator_allocate(): + block_size = 4 + num_cpu_blocks = 4 + cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks) + + # Allocate all available cpu blocks. + num_free = num_cpu_blocks + assert cpu_allocator.get_num_free_blocks() == num_free + for _ in range(num_cpu_blocks): + block = cpu_allocator.allocate() + num_free -= 1 + + assert block.block_hash not in cpu_allocator.evictor + assert cpu_allocator.get_num_free_blocks() == num_free + + with pytest.raises(ValueError): + cpu_allocator.allocate() + + +def test_block_allocator_free(): + block_size = 4 + num_cpu_blocks = 4 + cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks) + + # Allocate all available cpu blocks. 
+ blocks: List[PhysicalTokenBlock] = [] + for _ in range(num_cpu_blocks): + block = cpu_allocator.allocate() + blocks.append(block) + assert block.block_hash not in cpu_allocator.evictor + + # Free all allocated cpu blocks. + num_free = 0 + assert cpu_allocator.get_num_free_blocks() == num_free + for block in blocks: + cpu_allocator.free(block) + num_free += 1 + assert block.block_hash in cpu_allocator.evictor + assert cpu_allocator.get_num_free_blocks() == num_free + + with pytest.raises(ValueError): + cpu_allocator.free(block) + + +def test_allocate(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate same sequence group to all available gpu blocks. + for i in range(num_gpu_blocks): + _, seq_group = create_dummy_prompt(str(i), block_size) + assert block_manager.can_allocate(seq_group) + block_manager.allocate(seq_group) + assert block_manager.can_allocate(seq_group) != AllocStatus.OK + + # Allocate same sequence group to all available gpu blocks. + # Use watermark to reserve one gpu block. + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=1 / num_gpu_blocks) + for i in range(num_gpu_blocks - 1): + _, seq_group = create_dummy_prompt(str(i), block_size) + assert block_manager.can_allocate(seq_group) + block_manager.allocate(seq_group) + assert block_manager.can_allocate(seq_group) != AllocStatus.OK + + +def test_append_slot_single_seq(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate single seq to gpu block. + prompt, seq_group = create_dummy_prompt("1", block_size) + block_manager.allocate(seq_group) + + # Nothing to append. Sequence has no new logical blocks. + assert block_manager.can_append_slot(seq_group) + before_blocks = block_manager.get_num_free_gpu_blocks() + assert not block_manager.append_slot(prompt) + after_blocks = block_manager.get_num_free_gpu_blocks() + assert before_blocks == after_blocks + + # Add block_size number of new tokens and append slot. + for i in range(block_size): + token_id = i + 5 + prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) + + assert block_manager.can_append_slot(seq_group) + before_blocks = block_manager.get_num_free_gpu_blocks() + assert not block_manager.append_slot(prompt) + after_blocks = block_manager.get_num_free_gpu_blocks() + assert before_blocks - after_blocks == 1 + + +def test_append_slot_cow(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size=block_size, + num_cpu_blocks=num_cpu_blocks, + num_gpu_blocks=num_gpu_blocks, + watermark=0) + + # Allocate prompt to gpu block. There is one slot left in the block. + prompt = Sequence(seq_id=1, + prompt="one two three", + prompt_token_ids=[1, 2, 3], + block_size=block_size) + + # Fork the sequence, such that a COW will be required when we append a new + # token id. + child = prompt.fork(new_seq_id=2) + + # Allocate space for the sequence group. + seq_group = SequenceGroup("1", [prompt, child], SamplingParams(), + time.time(), time.perf_counter) + block_manager.allocate(seq_group) + + # Fork and append a new token id. We expect a COW to be scheduled. 
+ token_id = 4 + child.append_token_id(token_id, {token_id: Logprob(0.0)}) + block_manager.fork(prompt, child) + + assert block_manager.can_append_slot(seq_group) + before_blocks = block_manager.get_num_free_gpu_blocks() + + maybe_src_dst_block = block_manager.append_slot(child) + assert maybe_src_dst_block is not None + src_block, dst_block = maybe_src_dst_block + assert src_block != dst_block + + after_blocks = block_manager.get_num_free_gpu_blocks() + assert before_blocks - after_blocks == 1 + + +def test_fork(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + prompt, seq_group = create_dummy_prompt("1", + block_size - 1, + block_size=block_size) + block_manager.allocate(seq_group) + + # Fork prompt and copy block tables. + child = prompt.fork(2) + block_manager.fork(prompt, child) + assert block_manager.get_block_table( + prompt) == block_manager.get_block_table(child) + token_id = 4 + # Append token to child. Block is shared so copy on write occurs. + child.append_token_id(token_id, {token_id: Logprob(0.0)}) + block_manager.append_slot(child) + assert block_manager.get_block_table( + prompt) != block_manager.get_block_table(child) + + +def test_swap(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1) + prompt.status = SequenceStatus.WAITING + block_manager.allocate(seq_group) + + # Emulate a forward pass by appending a single token. + # The block manager then knows how many unprocessed + # tokens will be written in the next forward pass. + token_id = 0 + prompt.status = SequenceStatus.RUNNING + prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) + + # Swap seq group from GPU -> CPU. + gpu_blocks = block_manager.get_block_table(prompt) + assert block_manager.can_swap_out(seq_group) + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_out(seq_group) + assert list(mapping.keys()) == gpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks) + assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks + prompt.status = SequenceStatus.SWAPPED + + # Swap seq group from CPU -> GPU. + cpu_blocks = block_manager.get_block_table(prompt) + assert block_manager.can_swap_in(seq_group) + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_in(seq_group) + assert list(mapping.keys()) == cpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks + assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) + + +def test_free(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + prompt, seq_group = create_dummy_prompt("1", block_size) + block_manager.allocate(seq_group) + + # Free allocated seq. 
+ prompt_blocks = len(block_manager.get_block_table(prompt)) + before_blocks = block_manager.get_num_free_gpu_blocks() + block_manager.free(prompt) + after_blocks = block_manager.get_num_free_gpu_blocks() + assert after_blocks == before_blocks + prompt_blocks + + # Block table for freed seq is deleted. + with pytest.raises(KeyError): + block_manager.get_block_table(prompt) + + +def test_reset(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate same seq group on all available gpu blocks. + original_blocks = block_manager.get_num_free_gpu_blocks() + for i in range(num_gpu_blocks): + _, seq_group = create_dummy_prompt(str(i), block_size) + block_manager.allocate(seq_group) + assert block_manager.get_num_free_gpu_blocks() == 0 + + # Resetting block manager frees all allocated blocks. + block_manager.reset() + assert block_manager.get_num_free_gpu_blocks() == original_blocks diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py new file mode 100644 index 0000000000000..ebfeb8ba04812 --- /dev/null +++ b/tests/core/test_scheduler.py @@ -0,0 +1,170 @@ +from typing import List +import pytest # noqa + +from vllm.config import CacheConfig, SchedulerConfig +from vllm.core.scheduler import Scheduler +from vllm.sequence import SequenceGroup, Logprob + +from .utils import create_dummy_prompt + + +def test_scheduler_add_seq_group(): + block_size = 4 + scheduler_config = SchedulerConfig(100, 64, 1, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 4 + cache_config.num_gpu_blocks = 4 + scheduler = Scheduler(scheduler_config, cache_config, None) + + # Add seq group to scheduler. + num_seq_group = 4 + for i in range(num_seq_group): + _, seq_group = create_dummy_prompt(str(i), block_size) + scheduler.add_seq_group(seq_group) + assert scheduler.get_num_unfinished_seq_groups() == i + 1 + + +def test_scheduler_abort_seq_group(): + block_size = 4 + scheduler_config = SchedulerConfig(100, 64, 1, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 4 + cache_config.num_gpu_blocks = 4 + scheduler = Scheduler(scheduler_config, cache_config, None) + + # Add multiple seq groups to scheduler. + num_seq_group = 4 + request_ids = set() + for i in range(num_seq_group): + _, seq_group = create_dummy_prompt(str(i), block_size) + scheduler.add_seq_group(seq_group) + request_ids.add(str(i)) + + # Abort all added seq groups. + assert scheduler.get_num_unfinished_seq_groups() == num_seq_group + scheduler.abort_seq_group(request_ids) + assert scheduler.get_num_unfinished_seq_groups() == 0 + + +def test_scheduler_schedule_simple(): + block_size = 4 + num_seq_group = 4 + max_model_len = 16 + scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 8 + cache_config.num_gpu_blocks = 8 + scheduler = Scheduler(scheduler_config, cache_config, None) + + # Add seq groups to scheduler. + running: List[SequenceGroup] = [] + for i in range(num_seq_group): + _, seq_group = create_dummy_prompt(str(i), prompt_length=block_size) + scheduler.add_seq_group(seq_group) + running.append(seq_group) + + # Schedule seq groups prompts. 
+ seq_group_meta, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set(running) + assert out.num_batched_tokens == num_seq_group * seq_group.get_seqs( + )[0].get_len() + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == num_seq_group + + # Schedule seq groups generation. + seq_group_meta, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set(running) + assert out.num_batched_tokens == num_seq_group + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == num_seq_group + + +def test_scheduler_schedule_preempt_abort(): + block_size = 4 + max_model_len = 16 + scheduler_config = SchedulerConfig(64, 2, max_model_len, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 2 + cache_config.num_gpu_blocks = 2 + scheduler = Scheduler(scheduler_config, cache_config, None) + + # Add seq groups to scheduler. + seq_a, seq_group_a = create_dummy_prompt("1", block_size) + seq_b, seq_group_b = create_dummy_prompt("2", block_size) + scheduler.add_seq_group(seq_group_a) + scheduler.add_seq_group(seq_group_b) + + # Schedule seq groups prompts. + seq_group_meta, out = scheduler.schedule() + assert out.scheduled_seq_groups == [seq_group_a, seq_group_b] + assert out.num_batched_tokens == seq_group_a.get_seqs()[0].get_len() * 2 + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == 2 + assert scheduler.get_num_unfinished_seq_groups() == 2 + + # Append "generated" tokens, allowing the sequence to mark prompt tokens as + # processed. + token_id = 0 + seq_a.append_token_id(token_id, {token_id: Logprob(0.0)}) + seq_b.append_token_id(token_id, {token_id: Logprob(0.0)}) + + # Schedule seq groups generation and preempt seq group b. + seq_group_meta, out = scheduler.schedule() + assert out.scheduled_seq_groups == [seq_group_a] + assert out.num_batched_tokens == 1 + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == 1 + assert scheduler.get_num_unfinished_seq_groups() == 2 + + # Abort seq group a. Re-schedule seq group b prompt with recomputation. + scheduler.abort_seq_group("1") + seq_group_meta, out = scheduler.schedule() + assert out.scheduled_seq_groups == [seq_group_b] + assert out.num_batched_tokens == seq_group_b.get_seqs()[0].get_len() + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == 1 + assert scheduler.get_num_unfinished_seq_groups() == 1 + + +def test_scheduler_max_seqs(): + block_size = 4 + num_seq_group = 4 + max_seq_group = 2 + max_model_len = 16 + scheduler_config = SchedulerConfig(64, max_seq_group, max_model_len, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 8 + cache_config.num_gpu_blocks = 8 + scheduler = Scheduler(scheduler_config, cache_config, None) + + all_seq_groups: List[SequenceGroup] = [] + # Add seq groups to scheduler. + for i in range(num_seq_group): + _, seq_group = create_dummy_prompt(str(i), prompt_length=block_size) + all_seq_groups.append(seq_group) + + # Append 1 seq group + scheduler.add_seq_group(all_seq_groups[0]) + + # Schedule seq groups prompts. + _, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set([all_seq_groups[0]]) + + # Schedule seq groups generation. 
+ _, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set([all_seq_groups[0]]) + + # Append 2 more seq group + scheduler.add_seq_group(all_seq_groups[1]) + scheduler.add_seq_group(all_seq_groups[2]) + + # Schedule seq groups prompts. + # Only 1 seq group should be scheduled since max_seq_group is 2 + # and one is prompting. + _, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set([all_seq_groups[1]]) diff --git a/tests/core/utils.py b/tests/core/utils.py new file mode 100644 index 0000000000000..6469789e89386 --- /dev/null +++ b/tests/core/utils.py @@ -0,0 +1,27 @@ +import time +from typing import Tuple + +from vllm import SamplingParams +from vllm.sequence import Sequence, SequenceGroup + + +def create_dummy_prompt( + request_id: str, + prompt_length: int, + block_size: int = None) -> Tuple[Sequence, SequenceGroup]: + if not block_size: + block_size = prompt_length + + # Create dummy prompt sequence with tokens 0...block_size-1 + # and prompt "0 ... block_size". + prompt_tokens = list(range(prompt_length)) + prompt_str = " ".join([str(t) for t in prompt_tokens]) + prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size) + seq_group = SequenceGroup(request_id, [prompt], SamplingParams(), + time.time(), None) + + return prompt, seq_group + + +def round_up_to_next_block(seq_len: int, block_size: int) -> int: + return (seq_len + block_size - 1) // block_size diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py new file mode 100644 index 0000000000000..ed35212cc3f11 --- /dev/null +++ b/tests/engine/test_computed_prefix_blocks.py @@ -0,0 +1,34 @@ +import pytest + +from vllm.engine.arg_utils import EngineArgs +from vllm.engine.llm_engine import LLMEngine +from vllm.sampling_params import SamplingParams + + +@pytest.mark.parametrize("model", ["facebook/opt-125m"]) +@pytest.mark.parametrize("block_size", [16]) +def test_computed_prefix_blocks(model: str, block_size: int): + # This test checks if we are able to run the engine to completion + # without triggering asserts. + # We are in a scenario where all blocks from the second request's prompt + # are full and already computed when the second request arrives. + prompt = ( + "You are a helpful assistant. How do I build a car from cardboard and " + "paper clips? 
Is there an easy to follow video tutorial available " + "online for free?") + prompt2 = ( + " Please recommend to me some resources where I can learn not only to " + "handle technical difficulties of building a car, but also " + "decoration.") + + engine_args = EngineArgs(model=model, + block_size=block_size, + enable_prefix_caching=True) + + engine = LLMEngine.from_engine_args(engine_args) + sampling_params = SamplingParams() + + engine.add_request("0", prompt + prompt2, sampling_params) + engine.step() + engine.add_request("1", prompt, sampling_params) + engine.step() diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py index 5b39269916f8b..4a0e3e759e25a 100644 --- a/tests/entrypoints/test_guided_processors.py +++ b/tests/entrypoints/test_guided_processors.py @@ -46,8 +46,8 @@ "required": ["name", "age", "skills", "work history"] } -TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \ - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" +TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") def test_guided_logits_processors(): diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 35e83e1c70cc3..4a2b89befd93f 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -7,9 +7,12 @@ import sys import pytest import requests -import ray # using Ray for overall ease of process management, parallel requests, and debugging. +# using Ray for overall ease of process management, parallel requests, +# and debugging. +import ray import openai # use the official client for correctness check -from huggingface_hub import snapshot_download # downloading lora to test lora requests +# downloading lora to test lora requests +from huggingface_hub import snapshot_download # imports for guided decoding tests import json @@ -19,8 +22,11 @@ from vllm.transformers_utils.tokenizer import get_tokenizer MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # any model with a chat template should work here -LORA_NAME = "typeof/zephyr-7b-beta-lora" # technically this needs Mistral-7B-v0.1 as base, but we're not testing generation quality here +# any model with a chat template should work here +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +# technically this needs Mistral-7B-v0.1 as base, but we're not testing +# generation quality here +LORA_NAME = "typeof/zephyr-7b-beta-lora" TEST_SCHEMA = { "type": "object", @@ -61,8 +67,8 @@ "required": ["name", "age", "skills", "work history"] } -TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \ - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" +TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") TEST_CHOICE = [ "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", @@ -122,8 +128,9 @@ def server(zephyr_lora_files): server_runner = ServerRunner.remote([ "--model", MODEL_NAME, + # use half precision for speed and memory savings in CI environment "--dtype", - "bfloat16", # use half precision for speed and memory savings in CI environment + "bfloat16", "--max-model-len", "8192", "--enforce-eager", @@ -394,7 +401,8 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but not necessary for official client. 
+ # NOTE: this has to be true for n > 1 in vLLM, but not necessary + # for official client. use_beam_search=True), ) assert len(batch.choices) == 4 @@ -471,8 +479,8 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI): async def test_guided_json_completion(server, client: openai.AsyncOpenAI): completion = await client.completions.create( model=MODEL_NAME, - prompt= - f"Give an example JSON for an employee profile that fits this schema: {TEST_SCHEMA}", + prompt=f"Give an example JSON for an employee profile " + f"that fits this schema: {TEST_SCHEMA}", n=3, temperature=1.0, max_tokens=500, @@ -491,9 +499,11 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI): "role": "system", "content": "you are a helpful assistant" }, { - "role": "user", - "content": "Give an example JSON for an employee profile that " + \ - f"fits this schema: {TEST_SCHEMA}" + "role": + "user", + "content": + f"Give an example JSON for an employee profile that " + f"fits this schema: {TEST_SCHEMA}" }] chat_completion = await client.chat.completions.create( model=MODEL_NAME, diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index c402fe3e98c7f..6165225d2d819 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -57,7 +57,8 @@ def test_fused_moe( [torch.float32, torch.float16, torch.bfloat16]) @torch.inference_mode() def test_mixtral_moe(dtype: torch.dtype): - "Make sure our Mixtral MoE implementation agrees with the one from huggingface." + """Make sure our Mixtral MoE implementation agrees with the one from + huggingface.""" # Instantiate our and huggingface's MoE blocks config = MixtralConfig() diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 97753d1b659c5..204cc325f7da8 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -3,7 +3,7 @@ import time import torch -from vllm.model_executor.layers.triton_kernel.prefix_prefill import ( +from vllm.model_executor.layers.attention.ops.prefix_prefill import ( context_attention_fwd) from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask @@ -117,7 +117,8 @@ def test_contexted_kv_attention( v_cache = v_cache.view(-1, block_size, num_kv_heads, head_size).permute(0, 2, 3, 1).contiguous() - # Warm up the Triton kernel by calling it once before actually measuring generation time + # Warm up the Triton kernel by calling it once before actually measuring + # generation time context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table, b_start_loc, b_seq_len, b_ctx_len, max_input_len) torch.cuda.synchronize() diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 4e7cba1e68b08..838f56f7fd05c 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -28,7 +28,6 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: @pytest.mark.skip(reason="high likelihood sproadic failure in GHA") -@pytest.mark.flaky(reruns=2) def test_gemma_lora(gemma_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py new file mode 100644 index 0000000000000..95cf0cede8729 --- /dev/null +++ b/tests/lora/test_layer_variation.py @@ -0,0 +1,104 @@ +from typing import List, Optional +import peft +import pytest +from random import sample +import tempfile +from transformers import AutoModelForCausalLM + +import vllm +from vllm.lora.request import LoRARequest +from .conftest 
import cleanup + +MODEL_PATH = "Felladrin/Llama-68M-Chat-v1" +PROMPTS = [ + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", # noqa: E501 +] + + +def get_lora_model(model_id: str, target_modules: List[str], rank: int): + model = AutoModelForCausalLM.from_pretrained(model_id) + lora_config = peft.tuners.lora.LoraConfig(target_modules, rank) + lora_model = peft.PeftModel(model, lora_config) + return lora_model + + +def do_sample(llm, + lora_path: Optional[str] = None, + lora_id: Optional[int] = None, + logprobs: int = 0, + n_tokens: int = 256): + prompts = PROMPTS + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=n_tokens, + logprobs=logprobs, + stop=["[/assistant]"]) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + # Print the outputs. 
+ generated_texts = [] + generated_logprobs = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + generated_logprobs.append([ + list(logprob.keys()) for out in output.outputs + for logprob in out.logprobs + ]) + return generated_logprobs if logprobs else generated_texts + + +SUPPORTED_MODULES = [ + "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens", + "lm_head" +] +TARGET_MODULES_LIST = [] +for length in range(2, 6): + TARGET_MODULES_LIST.extend( + [sample(SUPPORTED_MODULES, length) for _ in range(3)]) + + +# Test the correctness when layer and rank are varied +# step 1: init a base model and serve with LoRA to get the reference results +# step 2: merge the same LoRA to the base model, serve the merged model +# step 3: compare the results from step 1 and step 2 +@pytest.mark.parametrize("tp_size", [1]) +@pytest.mark.parametrize("target_modules", TARGET_MODULES_LIST) +@pytest.mark.parametrize("rank", [8, 16, 32, 64]) +def test_layer_variation_correctness(tp_size, target_modules, rank): + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=tp_size, + worker_use_ray=True) + model = get_lora_model(MODEL_PATH, target_modules, rank) + with tempfile.TemporaryDirectory() as tmpdir: + model.save_pretrained(tmpdir) + merged_probs = do_sample(llm, tmpdir, 1, logprobs=5, n_tokens=32) + del llm + cleanup() + reference_id_sets = [set(prob[0]) for prob in merged_probs] + + model = get_lora_model(MODEL_PATH, target_modules, rank) + with tempfile.TemporaryDirectory() as tmpdir: + merged_model = model.merge_and_unload() + merged_model.save_pretrained(tmpdir) + llm = vllm.LLM(tmpdir, + tokenizer=MODEL_PATH, + enable_lora=False, + max_num_seqs=16, + tensor_parallel_size=tp_size, + worker_use_ray=True) + probs = do_sample(llm, logprobs=5, n_tokens=32) + del llm + cleanup() + # verify the top-5 tokens are identical for each token + id_sets = [set(prob[0]) for prob in probs] + assert id_sets == reference_id_sets diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 1e06274133dd6..675e9136ea823 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -17,14 +17,16 @@ LoRAMapping, BaseLayerWithLoRA, ) -from vllm.lora.models import LoRALayerWeights, convert_mapping, PackedLoRALayerWeights +from vllm.lora.models import (LoRALayerWeights, convert_mapping, + PackedLoRALayerWeights) from vllm.config import LoRAConfig from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, RowParallelLinear, QKVParallelLinear) -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) from vllm.model_executor.utils import set_random_seed from .utils import DummyLoRAManager @@ -260,7 +262,8 @@ def create_random_embedding_layer(): @torch.inference_mode() -# @pytest.mark.skip(reason="Fails when loras are in any slot other than the first.") +# @pytest.mark.skip( +# reason="Fails when loras are in any slot other than the first.") @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None: @@ -684,9 +687,9 @@ class FakeConfig: 
result = linear(input_)[0] subloras = sublora_dict[lora_id] for i, sublora in enumerate(subloras): - result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] * ( - i + 1 - )] += input_ @ sublora.lora_a @ sublora.lora_b * sublora.scaling + result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] * + (i + 1)] += (input_ @ sublora.lora_a @ sublora.lora_b * + sublora.scaling) expected_results.append(result) expected_result = torch.cat(expected_results) diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index dfaf8c700695a..130906c3d584d 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -10,12 +10,12 @@ def do_sample(llm, lora_path: str, lora_id: int): prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. 
[/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256, @@ -48,20 +48,20 @@ def test_llama_lora(sql_lora_files, tp_size): tensor_parallel_size=tp_size) expected_no_lora_output = [ - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", - "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
", - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501 + "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
", # noqa: E501 + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501 ] expected_lora_output = [ - " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", - " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", - " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", - " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", - " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", - " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 + " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 + " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 + " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 + " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 ] print("lora adapter created") @@ -121,7 +121,8 @@ def test_llama_tensor_parallel_equality(sql_lora_files): def test_llama_lora_warmup(sql_lora_files): - """Test that the LLM initialization works with a warmup LORA path and is more conservative""" + """Test that the LLM initialization works with a warmup LORA path and + is more conservative""" @ray.remote(num_gpus=1) def get_num_gpu_blocks_lora(): @@ -132,13 +133,15 @@ def get_num_gpu_blocks_lora(): @ray.remote(num_gpus=1) def get_num_gpu_blocks_no_lora(): llm = vllm.LLM(MODEL_PATH, max_num_seqs=16) - num_gpu_blocks_no_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks + num_gpu_blocks_no_lora_warmup = ( + llm.llm_engine.cache_config.num_gpu_blocks) return num_gpu_blocks_no_lora_warmup num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote()) num_gpu_blocks_no_lora_warmup = ray.get( get_num_gpu_blocks_no_lora.remote()) assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, ( - "The warmup with lora should be more" - " conservative than without lora, therefore the number of memory blocks for the KV cache should be " + "The warmup with lora should be more " + "conservative than without lora, therefore the number of " + "memory blocks for the KV cache should be " "less 
when using lora than when not using lora") diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 07779e1f5b889..47172853a1c66 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -9,9 +9,9 @@ def do_sample(llm, lora_path: str, lora_id: int): prompts = [ - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. 
This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. 
[/user] [assistant]", # noqa: E501 ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256) outputs = llm.generate( @@ -43,9 +43,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size): worker_use_ray=True) expected_lora_output = [ - "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", - "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", - "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", + "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501 + "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501 + "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", # noqa: E501 ] assert do_sample(llm, mixtral_lora_files, diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 410bdfa5c69e2..0ab9c63ce4377 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -21,7 +21,8 @@ def test_metric_counter_prompt_tokens( gpu_memory_utilization=0.4) tokenizer = vllm_model.model.get_tokenizer() prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts] - # This test needs at least 2 prompts in a batch of different lengths to verify their token count is correct despite padding. + # This test needs at least 2 prompts in a batch of different lengths to + # verify their token count is correct despite padding. assert len(example_prompts) > 1, "at least 2 prompts are required" assert prompt_token_counts[0] != prompt_token_counts[1], ( "prompts of different lengths are required") @@ -33,8 +34,8 @@ def test_metric_counter_prompt_tokens( **stat_logger.labels)._value.get() assert vllm_prompt_token_count == metric_count, ( - f"prompt token count: {vllm_prompt_token_count!r}\nmetric: {metric_count!r}" - ) + f"prompt token count: {vllm_prompt_token_count!r}\n" + f"metric: {metric_count!r}") @pytest.mark.parametrize("model", MODELS) @@ -60,9 +61,10 @@ def test_metric_counter_generation_tokens( for i in range(len(example_prompts)): vllm_output_ids, vllm_output_str = vllm_outputs[i] prompt_ids = tokenizer.encode(example_prompts[i]) - # vllm_output_ids contains both prompt tokens and generation tokens. We're interested only in the count of the generation tokens. + # vllm_output_ids contains both prompt tokens and generation tokens. + # We're interested only in the count of the generation tokens. 
vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) assert vllm_generation_count == metric_count, ( - f"generation token count: {vllm_generation_count!r}\nmetric: {metric_count!r}" - ) + f"generation token count: {vllm_generation_count!r}\n" + f"metric: {metric_count!r}") diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 7accb425cd666..bd9b75b1c1540 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -45,6 +45,7 @@ class ModelPair: ] +@pytest.mark.skip(reason="out of memory") @pytest.mark.flaky(reruns=2) @pytest.mark.skipif(marlin_not_supported, reason="Marlin is not supported on this GPU type.") @@ -66,7 +67,9 @@ def test_models( marlin_outputs = marlin_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - # Note: deleting just the model does not always free the GPU memory, not sure why. + # Note: not sure why, but deleting just the model on Ada Lovelace + # does not free the GPU memory. On Ampere, deleting just the model + # frees the memory. del marlin_model.model.llm_engine.driver_worker del marlin_model @@ -77,11 +80,14 @@ def test_models( max_tokens, num_logprobs) - # Note: deleting just the model does not always free the GPU memory, not sure why. + # Note: not sure why, but deleting just the model on Ada Lovelace + # does not free the GPU memory. On Ampere, deleting just the model + # frees the memory. del gptq_model.model.llm_engine.driver_worker del gptq_model # loop through the prompts + # use logprobs or else this will consistently run out of memory check_logprobs_close( outputs_0_lst=gptq_outputs, outputs_1_lst=marlin_outputs, diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 7ef8dde7bb8f6..c83551c36ef10 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -20,20 +20,23 @@ def test_block_allocator( num_blocks, enable_caching=True) - # Allocate two PysicalTokenBlocks with the same hash and check that they are the same PhysicalTokenBlock + # Allocate two PhysicalTokenBlocks with the same hash and check + # that they are the same PhysicalTokenBlock first_block = block_allocator.allocate(block_hash, 0) second_block = block_allocator.allocate(block_hash, 0) assert (first_block == second_block) assert (second_block.ref_count == 2) - # Free the first_block and confirm that the ref_count is correctly decremented on the second block + # Free the first_block and confirm that the ref_count is correctly + # decremented on the second block block_allocator.free(first_block) assert (second_block.ref_count == 1) # Free the second block block_allocator.free(second_block) - # Reallocate the first block and confirm that, even after the block had its ref_count go to 0, we still get the same block back + # Reallocate the first block and confirm that, even after the block + # had its ref_count go to 0, we still get the same block back first_block = block_allocator.allocate(block_hash, 0) assert (first_block == second_block) assert (first_block.block_hash == block_hash) @@ -56,7 +59,8 @@ def test_eviction(num_blocks: int, ): for block in blocks: block_allocator.free(block) - # Allocate a new block and confirm that it's the first block freed. I.E The Least Recently Used block + # Allocate a new block and confirm that it's the first block freed.
+ # i.e. the Least Recently Used block new_block_hash = block_size new_block = block_allocator.allocate(new_block_hash, 0) assert (new_block == blocks[0]) @@ -68,7+72,8 @@ def test_eviction(num_blocks: int, ): assert (realloc_block == blocks[realloc_block_hash]) assert (realloc_block.block_hash == realloc_block_hash) - # Allocate a new block and confirm that it's not the realloc_block, since the realloc_block shouldn't be in the free list + # Allocate a new block and confirm that it's not the realloc_block, + # since the realloc_block shouldn't be in the free list new_block_hash = block_size + 1 new_block = block_allocator.allocate(new_block_hash, 0) assert (realloc_block != new_block) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 1abb55f021214..14f1872c45258 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -70,8 +70,8 @@ def test_get_prompt_logprobs( hf_logprob[i][-1][token_id].item(), atol=1e-2, rtol=1e-2) - assert isinstance(sample_logprob.decoded_token, str), \ - ("The token should be decoded by the time it is returned " + assert isinstance(sample_logprob.decoded_token, str), ( + "The token should be decoded by the time it is returned " " to the user.") diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index eb384dd1a73c5..31c43454d47ac 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -257,9 +257,10 @@ def test_sampling(model_runner: ModelRunner): if metadata.sampling_params.use_beam_search: continue - if metadata.sampling_params.seed is not None \ - and expected_tokens[i] is None: - # Record seeded random result to compare with results of second invocation + if (metadata.sampling_params.seed is not None + and expected_tokens[i] is None): + # Record seeded random result to compare with results of + # second invocation expected_tokens[i] = [ nth_output.output_token for nth_output in sequence_output.samples @@ -267,11 +268,13 @@ def test_sampling(model_runner: ModelRunner): continue for n, nth_output in enumerate(sequence_output.samples): - if metadata.sampling_params.temperature == 0 or metadata.sampling_params.seed is not None: + if (metadata.sampling_params.temperature == 0 + or metadata.sampling_params.seed is not None): # Ensure exact matches for greedy or random with seed assert nth_output.output_token == expected_tokens[i][n] else: - # For non-seeded random check that one of the high-logit tokens were chosen + # For non-seeded random, check that one of the high-logit + # tokens was chosen assert nth_output.output_token in expected_tokens[i] # Test batch @@ -286,8 +289,8 @@ def test_sampling(model_runner: ModelRunner): input_tensor.data = input_tensor.index_select(0, target_index) fake_logits.data = fake_logits.index_select(0, target_index) - # This time, results of seeded random samples will be compared with the corresponding - # sample in the pre-shuffled batch + # This time, results of seeded random samples will be compared with + # the corresponding sample in the pre-shuffled batch test_sampling(model_runner) del model_runner diff --git a/vllm/model_executor/layers/triton_kernel/__init__.py b/tests/spec_decode/__init__.py similarity index 100% rename from vllm/model_executor/layers/triton_kernel/__init__.py rename to tests/spec_decode/__init__.py diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py new file mode 100644 index 0000000000000..fddc3995452cc --- /dev/null +++ 
b/tests/spec_decode/test_batch_expansion.py @@ -0,0 +1,95 @@ +import torch +import pytest + +from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer + +from .utils import mock_worker, create_seq_group_metadata_from_prompts + + +@pytest.mark.parametrize('num_target_seq_ids', [100]) +def test_create_target_seq_id_iterator(num_target_seq_ids: int): + """Verify all new sequence ids are greater than all input + seq ids. + """ + scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) + + all_seq_ids = [ + [1, 3, 5, 7], + list(range(100)) + [0], + [100], + ] + + for seq_ids in all_seq_ids: + max_seq_id = max(seq_ids) + iterator = scorer._create_target_seq_id_iterator(seq_ids) # pylint: disable=protected-access + for _ in range(num_target_seq_ids): + assert next(iterator) > max_seq_id + + +@pytest.mark.parametrize('k', [1, 2, 6]) +def test_get_token_ids_to_score(k: int): + """Verify correct tokens are selected for scoring. + """ + proposal_token_ids = torch.tensor( + list(range(k)), + dtype=torch.int64, + device='cuda', + ) + + expected_output = [ + [], + ] + for i in range(proposal_token_ids.shape[0]): + expected_output.append(proposal_token_ids[:i + 1].tolist()) + + scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) + actual_output = scorer._get_token_ids_to_score(proposal_token_ids) # pylint: disable=protected-access + + actual_output = [ + x.tolist() if isinstance(x, torch.Tensor) else x for x in actual_output + ] + + assert actual_output == expected_output + + +@pytest.mark.parametrize('k', [1, 2, 6]) +def test_create_single_target_seq_group_metadata(k: int): + """Verify correct creation of a batch-expanded seq group metadata. + """ + + prompt_tokens = [1, 2, 3] + prev_output_tokens = [4, 5, 6] + + token_ids = list(range(k)) + + num_tokens_processed = len(prompt_tokens) + len(prev_output_tokens) - 1 + + final_seq_len = len(prompt_tokens) + len(prev_output_tokens) + len( + token_ids) + + block_size = 32 + input_seq_group_metadata = create_seq_group_metadata_from_prompts( + [prompt_tokens], 2048 // block_size, block_size, [final_seq_len], + [prev_output_tokens], [num_tokens_processed])[0] + + input_seq_id = list(input_seq_group_metadata.seq_data.keys())[0] + target_seq_id = 100 + + scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) + output = scorer._create_single_target_seq_group_metadata( # pylint: disable=protected-access + input_seq_group_metadata, + input_seq_id, + target_seq_id, + token_ids, + ) + + assert output.request_id == input_seq_group_metadata.request_id + assert len(output.seq_data) == 1 + assert output.seq_data[target_seq_id].get_prompt_token_ids( + ) == prompt_tokens + assert output.seq_data[target_seq_id].get_output_token_ids( + ) == prev_output_tokens + token_ids + + assert len(output.block_tables) == 1 + assert output.block_tables[ + target_seq_id] == input_seq_group_metadata.block_tables[input_seq_id] diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py new file mode 100644 index 0000000000000..09847136d13e9 --- /dev/null +++ b/tests/spec_decode/test_metrics.py @@ -0,0 +1,159 @@ +import torch +import math +import pytest + +from unittest.mock import MagicMock + +from vllm.spec_decode.metrics import AsyncMetricsCollector + + +def test_initial_call_returns_none(): + """Expect first call to get metrics to return None. 
+ """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collector = AsyncMetricsCollector(rej_sampler) + collector.init_gpu_tensors(rank=0) + maybe_metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert maybe_metrics is None + + +def test_second_call_returns_metrics(): + """Expect second call to not return None. + """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collect_interval_s = 5.0 + timer = MagicMock() + timer.side_effect = [ + 0.0, collect_interval_s + 0.1, collect_interval_s + 0.2 + ] + + collector = AsyncMetricsCollector(rejection_sampler=rej_sampler, + timer=timer, + collect_interval_s=collect_interval_s) + collector.init_gpu_tensors(rank=0) + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is not None + + +@pytest.mark.parametrize("rank", [1, 2, 3, 4]) +def test_nonzero_rank_noop(rank): + """Verify nonzero ranks don't collect metrics. + """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collector = AsyncMetricsCollector(rej_sampler) + collector.init_gpu_tensors(rank=rank) + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is None + + +def test_noop_until_time(): + """Verify metrics aren't collected until enough time passes. + """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collect_interval_s = 5.0 + timer = MagicMock() + timer.side_effect = [ + 0.0, collect_interval_s - 0.1, collect_interval_s - 0.1, + collect_interval_s + 0.1, collect_interval_s + 0.1 + ] + + collector = AsyncMetricsCollector(rejection_sampler=rej_sampler, + timer=timer, + collect_interval_s=collect_interval_s) + collector.init_gpu_tensors(rank=0) + + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is None + + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is not None + + +@pytest.mark.parametrize("has_data", [True, False]) +def test_initial_metrics_has_correct_values(has_data: bool): + """Test correctness of metrics data. 
+ """ + if has_data: + num_accepted_tokens = 103 + num_emitted_tokens = 104 + num_draft_tokens = 105 + else: + num_accepted_tokens = 0 + num_emitted_tokens = 0 + num_draft_tokens = 0 + k = 5 + + num_possible_tokens = AsyncMetricsCollector.get_max_num_accepted_tokens( + num_draft_tokens, k) + + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(num_accepted_tokens, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(num_emitted_tokens, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = num_draft_tokens + + collect_interval_s = 5.0 + timer = MagicMock() + timer.side_effect = [ + 0.0, collect_interval_s + 0.1, collect_interval_s + 0.2 + ] + + collector = AsyncMetricsCollector(rejection_sampler=rej_sampler, + timer=timer, + collect_interval_s=collect_interval_s) + collector.init_gpu_tensors(rank=0) + _ = collector.maybe_collect_rejsample_metrics(k) + metrics = collector.maybe_collect_rejsample_metrics(k) + + assert metrics.num_spec_tokens == k + assert metrics.accepted_tokens == num_accepted_tokens + assert metrics.draft_tokens == num_draft_tokens + assert metrics.emitted_tokens == num_emitted_tokens + + if has_data: + assert (metrics.draft_acceptance_rate == num_accepted_tokens / + num_draft_tokens) + assert (metrics.system_efficiency == num_emitted_tokens / + num_possible_tokens) + else: + assert math.isnan(metrics.draft_acceptance_rate) + assert math.isnan(metrics.system_efficiency) diff --git a/tests/worker/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py similarity index 61% rename from tests/worker/spec_decode/test_multi_step_worker.py rename to tests/spec_decode/test_multi_step_worker.py index ea54802903578..45b43ec59ee8f 100644 --- a/tests/worker/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -3,14 +3,16 @@ import pytest from unittest.mock import MagicMock -from vllm.worker.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.multi_step_worker import (MultiStepWorker, + DraftModelTop1Proposer) from vllm.worker.worker import Worker from vllm.model_executor.utils import set_random_seed +from vllm.sequence import SamplerOutput from .utils import (create_execute_model_data, create_worker, create_seq_group_metadata_from_prompts, zero_kv_cache, patch_execute_model_with_seeds, - assert_logprobs_dict_allclose) + assert_logprobs_dict_allclose, create_batch) @pytest.mark.parametrize('num_steps', list(range(1, 17))) @@ -259,3 +261,160 @@ def test_same_output_for_multi_step(): multi_step_output_logprobs, single_step_output_logprobs): assert_logprobs_dict_allclose(multi_step_logprobs, single_step_logprobs) + + +@torch.inference_mode() +def test_draft_proposals_full_speculation_len(): + """Verify DraftModelTop1Proposer correctly handles case where all sequences + can speculate. 
+ """ + k = 10 + batch_size = 32 + vocab_size = 32_000 + device = 'cuda:0' + + draft_worker = MagicMock() + proposer = DraftModelTop1Proposer( + draft_worker=draft_worker, + device=device, + max_model_len=2048, + vocab_size=vocab_size, + ) + draft_worker.execute_model_multi_step.return_value = [ + SamplerOutput( + outputs=[], + sampled_token_probs=torch.rand(batch_size, + vocab_size, + device=device, + dtype=torch.float32), + sampled_token_ids=torch.randint(low=0, + high=vocab_size, + size=(batch_size, ), + device=device, + dtype=torch.long), + ) for _ in range(k) + ] + + execute_model_data, _, _ = create_batch(batch_size, k) + + proposals = proposer.get_proposals( + **execute_model_data.to_dict(), + max_proposal_len=k, + ) + + assert torch.is_tensor(proposals.proposal_token_ids) + assert torch.is_tensor(proposals.proposal_probs) + + assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k]) + assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k]) + + assert proposals.proposal_lens.shape == torch.Size([batch_size]) + assert proposals.proposal_lens.tolist() == [k for _ in range(batch_size)] + + +@torch.inference_mode() +def test_draft_proposals_no_speculations(): + """Verify DraftModelTop1Proposer correctly handles case where no sequences + can speculate. + """ + k = 10 + batch_size = 32 + vocab_size = 32_000 + device = 'cuda:0' + prompt_len = 10 + + draft_worker = MagicMock() + proposer = DraftModelTop1Proposer( + draft_worker=draft_worker, + device=device, + max_model_len=prompt_len + k - 1, + vocab_size=vocab_size, + ) + + execute_model_data, _, _ = create_batch(batch_size, + k, + prompt_len=prompt_len) + + proposals = proposer.get_proposals( + **execute_model_data.to_dict(), + max_proposal_len=k, + ) + + assert torch.is_tensor(proposals.proposal_token_ids) + assert torch.is_tensor(proposals.proposal_probs) + + assert proposals.proposal_token_ids.shape == torch.Size([0, k]) + assert proposals.proposal_probs.shape[:-1] == torch.Size([0, k]) + + assert proposals.proposal_lens.shape == torch.Size([batch_size]) + assert proposals.proposal_lens.tolist() == [0 for _ in range(batch_size)] + + +@torch.inference_mode() +def test_draft_proposals_mixed_k(): + """Verify DraftModelTop1Proposer correctly handles case some sequences can + speculate and some can't. 
+ """ + k = 10 + batch_size = 32 + vocab_size = 32_000 + device = 'cuda:0' + + small_prompt_len = 5 + long_prompt_len = 10 + prev_output_token_len = 20 + + expected_num_proposal_seqs = 6 + expected_num_no_proposal_seqs = batch_size - expected_num_proposal_seqs + + prompt_len = [ + small_prompt_len for _ in range(expected_num_proposal_seqs - 1) + ] + [long_prompt_len + for _ in range(expected_num_no_proposal_seqs)] + [small_prompt_len] + + draft_worker = MagicMock() + proposer = DraftModelTop1Proposer( + draft_worker=draft_worker, + device=device, + max_model_len=long_prompt_len + prev_output_token_len + k - 1, + vocab_size=vocab_size, + ) + + draft_worker.execute_model_multi_step.return_value = [ + SamplerOutput( + outputs=[], + sampled_token_probs=torch.rand(expected_num_proposal_seqs, + vocab_size, + device=device, + dtype=torch.float32), + sampled_token_ids=torch.randint( + low=0, + high=vocab_size, + size=(expected_num_proposal_seqs, ), + device=device, + dtype=torch.long), + ) for _ in range(k) + ] + + execute_model_data, _, _ = create_batch( + batch_size, + k, + prompt_len=prompt_len, + prev_output_token_len=prev_output_token_len, + ) + + proposals = proposer.get_proposals( + **execute_model_data.to_dict(), + max_proposal_len=k, + ) + + assert torch.is_tensor(proposals.proposal_token_ids) + assert torch.is_tensor(proposals.proposal_probs) + + assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k]) + assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k]) + + assert proposals.proposal_lens.shape == torch.Size([batch_size]) + assert proposals.proposal_lens.tolist() == [ + k for _ in range(expected_num_proposal_seqs - 1) + ] + [0 for _ in range(expected_num_no_proposal_seqs)] + [k] diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py new file mode 100644 index 0000000000000..bfc69e01e3eb9 --- /dev/null +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -0,0 +1,597 @@ +import torch +import random +import pytest +from unittest.mock import MagicMock + +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, + split_num_cache_blocks_evenly) +from vllm.spec_decode.interfaces import SpeculativeProposals +from vllm.model_executor.utils import set_random_seed +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from .utils import (mock_worker, create_batch, ExecuteModelData, + create_sampler_output_list) +from vllm.spec_decode.metrics import (SpecDecodeWorkerMetrics, + AsyncMetricsCollector) + + +@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_calls_draft_model(k: int, batch_size: int): + """Verify SpecDecodeWorker calls the draft worker with correct + inputs. Everything else is mocked out. 
+ """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + exception_secret = 'artificial stop' + draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret) + + execute_model_data, _, _ = create_batch(batch_size, k) + + with pytest.raises(ValueError, match=exception_secret): + worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + + call_args_list = draft_worker.get_spec_proposals.call_args_list + assert len(call_args_list) == 1 + + for args, _ in call_args_list: + (seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, + blocks_to_copy, actual_k) = args + actual_execute_model_data = ExecuteModelData(seq_group_metadata_list, + blocks_to_swap_in, + blocks_to_swap_out, + blocks_to_copy) + assert actual_execute_model_data == execute_model_data + assert actual_k == k + + +@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_calls_target_model(k: int, batch_size: int): + """Verify SpecDecodeWorker calls the target model with correct + inputs. Everything else is mocked out. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + vocab_size = 32_000 + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, prompts, prev_output_tokens = create_batch( + batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + exception_secret = 'artificial stop' + target_worker.execute_model.side_effect = ValueError(exception_secret) + + with pytest.raises(ValueError, match=exception_secret): + worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + + seen_contexts = [] + + call_args_list = target_worker.execute_model.call_args_list + assert len(call_args_list) == 1 + for args, kwargs in call_args_list: + target_execute_model_data = ExecuteModelData.from_dict(kwargs) + + assert len(target_execute_model_data.seq_group_metadata_list) == ( + k + 1) * batch_size + for seq_group_metadata in ( + target_execute_model_data.seq_group_metadata_list): + for seq_data in seq_group_metadata.seq_data.values(): + seen_contexts.append(seq_data.get_token_ids()) + + expected_seen_contexts = [] + + for prompt, prev_generated, draft_tokens in zip( + prompts, prev_output_tokens, proposal_token_ids.tolist()): + + for i in range(len(draft_tokens) + 1): + expected_seen_contexts.append(prompt + prev_generated + + draft_tokens[:i]) + + seen_contexts.sort() + expected_seen_contexts.sort() + assert expected_seen_contexts == seen_contexts + + 
+@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_calls_rejection_sampler(k: int, batch_size: int): + """Verify SpecDecodeWorker calls the rejection sampler with + correct inputs. Everything else is mocked out. + """ + vocab_size = 32_000 + + draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) + target_worker = mock_worker(vocab_size=vocab_size) + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, _, _ = create_batch(batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + target_token_ids = torch.randint(low=0, + high=vocab_size, + size=(1, batch_size * (k + 1)), + dtype=torch.int64, + device='cuda') + target_token_probs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_output = create_sampler_output_list(target_token_ids, + target_token_probs) + + target_worker.execute_model.return_value = target_output[0] + + exception_secret = 'artificial stop' + rejection_sampler.side_effect = ValueError(exception_secret) + + with pytest.raises(ValueError, match=exception_secret): + worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + + assert len(rejection_sampler.call_args_list) == 1 + args, _ = rejection_sampler.call_args_list[0] + (actual_proposal_scores, actual_bonus_token_ids, actual_proposal_probs, + actual_proposal_token_ids) = args + + assert torch.equal(actual_bonus_token_ids, + target_token_ids.reshape(batch_size, k + 1)[:, -1:]) + assert torch.equal( + actual_proposal_scores, + target_token_probs.reshape(batch_size, k + 1, -1)[:, :-1]) + assert torch.equal(actual_proposal_token_ids, proposal_token_ids) + assert torch.equal(actual_proposal_probs, proposal_probs) + + +@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_formats_output(k: int, batch_size: int): + """Verify SpecDecodeWorker formats sampler output correctly. + Everything else is mocked out. 
+ """ + vocab_size = 32_000 + + draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) + target_worker = mock_worker(vocab_size=vocab_size) + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, _, _ = create_batch(batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + target_token_ids = torch.randint(low=0, + high=vocab_size, + size=(1, batch_size * (k + 1)), + dtype=torch.int64, + device='cuda') + target_token_probs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_output = create_sampler_output_list(target_token_ids, + target_token_probs) + + target_worker.execute_model.return_value = target_output[0] + + rejection_sampler_output = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k + 1), + dtype=torch.int64, + device='cuda') + for i in range(batch_size): + minimum_accepted_tokens = 1 + rejection_sampler_output[i][ + -random.randint(minimum_accepted_tokens, k + 1):] = -1 + + rejection_sampler.return_value = rejection_sampler_output + + output = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + + expected_output = create_sampler_output_list( + rejection_sampler_output.transpose(0, 1), [None for _ in range(k + 1)]) + + seq_ids = [ + next(iter(seq_group_metadata.seq_data.keys())) + for seq_group_metadata in execute_model_data.seq_group_metadata_list + ] + actual_output_by_seq = {seq_id: [] for seq_id in seq_ids} + expected_output_by_seq = {seq_id: [] for seq_id in seq_ids} + + for step in output: + for seq_group in step: + for sample in seq_group.samples: + seq_id = sample.parent_seq_id + actual_output_by_seq[seq_id].append(sample) + + for step in expected_output: + for seq_group in step: + for sample in seq_group.samples: + seq_id = sample.parent_seq_id + expected_output_by_seq[seq_id].append(sample) + + all_seen_seq_ids = set( + list(actual_output_by_seq.keys()) + + list(expected_output_by_seq.keys())) + for seq_id in all_seen_seq_ids: + actual_by_step = actual_output_by_seq[seq_id] + expected_by_step = expected_output_by_seq[seq_id] + + for i in range(k + 1): + if i >= len(actual_by_step): + assert expected_by_step[i].output_token == -1 + continue + assert actual_by_step[i].output_token == expected_by_step[ + i].output_token + assert actual_by_step[i].logprobs == expected_by_step[i].logprobs + + +@pytest.mark.parametrize('k', [1, 2]) +@pytest.mark.parametrize('batch_size', [1]) +@pytest.mark.parametrize('returns_metrics', [True, False]) +@torch.inference_mode() +def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): + """Verify SpecDecodeWorker collects metrics. 
+ """ + vocab_size = 32_000 + + draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) + target_worker = mock_worker(vocab_size=vocab_size) + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, _, _ = create_batch(batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + target_token_ids = torch.randint(low=0, + high=vocab_size, + size=(1, batch_size * (k + 1)), + dtype=torch.int64, + device='cuda') + target_token_probs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_output = create_sampler_output_list(target_token_ids, + target_token_probs) + + target_worker.execute_model.return_value = target_output[0] + + rejection_sampler_output = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k + 1), + dtype=torch.int64, + device='cuda') + for i in range(batch_size): + minimum_accepted_tokens = 1 + rejection_sampler_output[i][ + -random.randint(minimum_accepted_tokens, k + 1):] = -1 + + rejection_sampler.return_value = rejection_sampler_output + + mock_rejsample_metrics = MagicMock( + spec=SpecDecodeWorkerMetrics) if returns_metrics else None + metrics_collector.maybe_collect_rejsample_metrics.return_value = ( + mock_rejsample_metrics) + + output = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + assert output[0].spec_decode_worker_metrics == mock_rejsample_metrics + + call_args_list = ( + metrics_collector.maybe_collect_rejsample_metrics.call_args_list) + assert len(call_args_list) == 1 + args, kwargs = call_args_list[0] + assert args[0] == k or kwargs.get('k', -1) == k + + +@pytest.mark.parametrize('k', [0]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_k_equals_zero(k: int, batch_size: int): + """Verify that the SpecDecodeWorker calls the draft and target workers + when k is zero. This happens during prefill. 
+ """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + execute_model_data, prompts, prev_output_tokens = create_batch( + batch_size, k, prev_output_token_len=0) + + out = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + + assert len(out) == 1, f"expected only one token output when {k=}" + assert out[0].probs is None, "expect gpu tensor references to be None" + assert out[ + 0].sampled_tokens is None, "expect gpu tensor references to be None" + + draft_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict(), return_python_output=False) + target_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict()) + + +@pytest.mark.parametrize('k', [0, 5]) +@pytest.mark.parametrize('batch_size', [0]) +@torch.inference_mode() +def test_empty_input_batch(k: int, batch_size: int): + """Verify that the SpecDecodeWorker calls the draft and target workers + when the input batch is empty. This can happen if the engine communicates + to the workers information without scheduling a batch. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + execute_model_data, prompts, prev_output_tokens = create_batch( + batch_size, k, prev_output_token_len=0) + + out = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + + assert len(out) == 1, f"expected only one token output when {k=}" + assert out[0].probs is None, "expect gpu tensor references to be None" + assert out[ + 0].sampled_tokens is None, "expect gpu tensor references to be None" + + draft_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict(), return_python_output=False) + target_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict()) + + +@torch.inference_mode() +def test_init_model(): + """Verify SpecDecodeWorker invokes proposer/scorer worker init_model, as + well as other GPU initialization. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + worker.init_model() + + draft_worker.init_model.assert_called_once() + + target_worker.init_model.assert_called_once() + + metrics_collector.init_gpu_tensors.assert_called_once() + rejection_sampler.init_gpu_tensors.assert_called_once() + + +@torch.inference_mode() +def test_init_cache_engine(): + """Verify SpecDecodeWorker invokes init_cache_engine on proposer/scorer + workers. 
+ """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + cache_config = MagicMock() + + worker.init_cache_engine(cache_config) + + draft_worker.init_cache_engine.assert_called_once_with(cache_config) + target_worker.init_cache_engine.assert_called_once_with(cache_config) + + +@pytest.mark.parametrize('available_gpu_blocks', [1, 1024]) +@pytest.mark.parametrize('available_cpu_blocks', [500]) +@pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096]) +@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) +@torch.inference_mode() +def test_profile_num_available_blocks(available_gpu_blocks: int, + available_cpu_blocks: int, + target_cache_block_size_bytes: int, + draft_kv_size_bytes: int): + """Verify SpecDecodeWorker correctly profiles num available GPU blocks. + Specifically, it should run profiling in the scorer worker, and then evenly + split the blocks between proposer and scorer worker. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + target_worker.profile_num_available_blocks.return_value = ( + available_gpu_blocks, available_cpu_blocks) + target_worker.get_cache_block_size_bytes.return_value = ( + target_cache_block_size_bytes) + draft_worker.get_cache_block_size_bytes.return_value = draft_kv_size_bytes + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + # These values do not directly impact the adjusted block size calculation, + # so they can be fixed. + gpu_memory_utilization = 0.9 + cpu_swap_space = 100 + block_size = 16 + + num_gpu_blocks, num_cpu_blocks = worker.profile_num_available_blocks( + block_size, gpu_memory_utilization, cpu_swap_space, cache_dtype="auto") + + target_worker.profile_num_available_blocks.assert_called_once_with( + block_size, gpu_memory_utilization, cpu_swap_space, "auto") + assert num_cpu_blocks == available_cpu_blocks + + assert num_gpu_blocks == split_num_cache_blocks_evenly( + target_cache_block_size_bytes, draft_kv_size_bytes, + available_gpu_blocks) + + +@pytest.mark.parametrize('available_gpu_blocks', + list(range(20)) + [1024, 1024**2]) +@pytest.mark.parametrize('target_cache_block_size_bytes', + [2 * 2 * 4096, 2 * 2 * 8192]) +@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) +@torch.inference_mode() +def test_split_num_cache_blocks_evenly(available_gpu_blocks: int, + target_cache_block_size_bytes: int, + draft_kv_size_bytes: int): + """Verify split_num_cache_blocks_evenly does not exceed original memory + allocation in bytes. 
+ """ + num_blocks = split_num_cache_blocks_evenly(target_cache_block_size_bytes, + draft_kv_size_bytes, + available_gpu_blocks) + assert (num_blocks * target_cache_block_size_bytes) + ( + num_blocks * draft_kv_size_bytes) <= (available_gpu_blocks * + target_cache_block_size_bytes) diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py new file mode 100644 index 0000000000000..19833ddb06154 --- /dev/null +++ b/tests/spec_decode/test_utils.py @@ -0,0 +1,111 @@ +from vllm.spec_decode.util import get_all_seq_ids +from vllm.sequence import SequenceGroupMetadata +from vllm.spec_decode.util import split_batch_by_proposal_len + +import pytest +from unittest.mock import MagicMock + + +def test_get_all_seq_ids(): + """Verify get_all_seq_ids extracts all seq ids. + """ + expected_seq_ids = list(range(10)) + list(range(100, 110)) + + seq_group_metadata_list = [ + SequenceGroupMetadata( + request_id=str(seq_id), + is_prompt=True, + seq_data={ + seq_id: MagicMock(), + }, + sampling_params=MagicMock(), + block_tables={ + seq_id: MagicMock(), + }, + lora_request=None, + ) for seq_id in expected_seq_ids + ] + + actual_seq_ids = get_all_seq_ids(seq_group_metadata_list) + assert actual_seq_ids == expected_seq_ids + + +@pytest.fixture +def fake_sequence_group_metadata(): + seq_ids = list(range(3)) + return [ + SequenceGroupMetadata( + request_id=str(i), + is_prompt=True, + seq_data={ + i: MagicMock(), + }, + sampling_params=MagicMock(), + block_tables={ + i: MagicMock(), + }, + lora_request=None, + ) for i in seq_ids + ] + + +def test_filter_zero_length_proposals(fake_sequence_group_metadata): + proposal_lens = [0, 1, 0] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=True) + + expected_groups = [ + fake_sequence_group_metadata[0], fake_sequence_group_metadata[2] + ] + expected_indices = [0, 2] + + assert filtered_groups == expected_groups + assert indices == expected_indices + + +def test_filter_non_zero_length_proposals(fake_sequence_group_metadata): + proposal_lens = [0, 1, 2] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=False) + + expected_groups = [ + fake_sequence_group_metadata[1], fake_sequence_group_metadata[2] + ] + expected_indices = [1, 2] + + assert filtered_groups == expected_groups + assert indices == expected_indices + + +def test_empty_inputs(): + filtered_groups, indices = split_batch_by_proposal_len( + [], [], select_proposal_len_zero=True) + + assert filtered_groups == [] + assert indices == [] + + +def test_all_zero_with_non_zero_filter(fake_sequence_group_metadata): + proposal_lens = [0, 0, 0] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=False) + + assert filtered_groups == [] + assert indices == [] + + +def test_all_non_zero_with_zero_filter(fake_sequence_group_metadata): + proposal_lens = [1, 1, 1] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=True) + + assert filtered_groups == [] + assert indices == [] diff --git a/tests/worker/spec_decode/utils.py b/tests/spec_decode/utils.py similarity index 60% rename from tests/worker/spec_decode/utils.py rename to tests/spec_decode/utils.py index fa8767cf898aa..997093988c0eb 100644 --- a/tests/worker/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -1,13 +1,16 @@ 
import torch -from typing import List, Optional, Dict +from typing import List, Optional, Dict, Iterable, Union +from unittest.mock import MagicMock from vllm.worker.worker import Worker from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.engine.arg_utils import EngineArgs -from vllm.sequence import Logprob, SequenceGroupMetadata, SequenceData +from vllm.sequence import (Logprob, SequenceGroupMetadata, SequenceData, + SamplerOutput, SequenceGroupOutput, SequenceOutput) from vllm.sampling_params import SamplingParams from vllm.worker.cache_engine import CacheEngine from vllm.model_executor.utils import set_random_seed +from itertools import count from dataclasses import dataclass, fields @@ -24,6 +27,11 @@ def to_dict(self): return dict( (field.name, getattr(self, field.name)) for field in fields(self)) + @classmethod + def from_dict(cls, d): + cleaned = dict((field.name, d[field.name]) for field in fields(cls)) + return cls(**cleaned) + def round_up_to_next_block(seq_len: int, block_size: int) -> int: return (seq_len + block_size - 1) // block_size @@ -50,6 +58,21 @@ def create_execute_model_data( ) +def mock_worker(cls=None, + vocab_size: int = 30_000, + max_model_len: int = 2048, + rank: int = 0) -> MagicMock: + if cls is None: + cls = Worker + + worker = MagicMock(spec=cls) + worker.vocab_size = vocab_size + worker.max_model_len = max_model_len + worker.rank = rank + worker.device = 'cuda:0' + return worker + + def patch_execute_model_with_seeds(worker: Worker, rand_seeds: List[int]): seed_iter = iter(rand_seeds) original_execute_model = worker.execute_model @@ -117,25 +140,12 @@ def create_seq_group_metadata_from_prompts( block_size: int, final_seq_lens: List[int], continuations: Optional[List[List[int]]] = None, - num_tokens_processed: Optional[List[int]] = None, seq_ids: Optional[List[int]] = None, ) -> List[SequenceGroupMetadata]: if continuations is None: continuations = [[] for _ in prompts] - if num_tokens_processed is None: - # Default to 1 token missing from kv cache for generation sequences. - num_tokens_processed = [] - for continuation, prompt in zip(continuations, prompts): - # If prefill, then default to zero tokens processed. - if not continuation: - num_tokens_processed.append(0) - else: - # If generation, then default to all but one tokens processed. 
- num_tokens_processed.append( - len(continuation) + len(prompt) - 1) - if seq_ids is None: seq_ids = list(i for i, _ in enumerate(prompts)) @@ -155,13 +165,15 @@ def create_seq_group_metadata_from_prompts( is_prompt=len(cont_token_ids) == 0, seq_data={ i: - SequenceData(prompt_token_ids=prompt_token_ids[:] + - cont_token_ids[:]) + SequenceData( + prompt_token_ids=prompt_token_ids[:], + output_token_ids=cont_token_ids[:], + ), }, sampling_params=SamplingParams(temperature=0.0, ), block_tables={i: block_allocations[i][:]}, - ) for i, (prompt_token_ids, cont_token_ids, num_tokens_saved) in - enumerate(zip(prompts, continuations, num_tokens_processed)) + ) for i, (prompt_token_ids, + cont_token_ids) in enumerate(zip(prompts, continuations)) ] @@ -178,3 +190,68 @@ def assert_logprobs_dict_allclose( expected = torch.tensor( single_step_expected_logprobs[token_id].logprob) assert torch.allclose(actual, expected) + + +def create_sampler_output_list( + token_ids: torch.Tensor, + probs: Iterable[Optional[torch.Tensor]], + seq_ids: Optional[List[int]] = None) -> List[SamplerOutput]: + num_steps, batch_size = token_ids.shape + token_ids_by_step = token_ids.tolist() + + if seq_ids is None: + seq_ids = list(range(batch_size)) + + return [ + SamplerOutput(outputs=[ + SequenceGroupOutput( + samples=[ + SequenceOutput( + output_token=token_id, + parent_seq_id=seq_ids[seq_index], + logprobs={token_id: 0}, + ) + ], + prompt_logprobs=None, + ) for seq_index, token_id in enumerate(token_ids_by_step[step]) + ], + sampled_token_probs=probs[step], + sampled_token_ids=token_ids[step]) + for step in range(num_steps) + ] + + +def create_batch(batch_size, + k, + prompt_len: Union[int, List[int]] = 10, + prev_output_token_len: int = 10, + seq_ids: Optional[List[int]] = None, + num_gpu_blocks: Optional[int] = None, + block_size: Optional[int] = None): + if block_size is None: + block_size = 8 + + if num_gpu_blocks is None: + num_gpu_blocks = 2048 // block_size + + iterator = count() + + if isinstance(prompt_len, int): + prompt_lens = [prompt_len for _ in range(batch_size)] + else: + prompt_lens = prompt_len + + prompts = [[next(iterator) for _ in range(p_len)] for p_len in prompt_lens] + prev_output_tokens = [[ + next(iterator) for _ in range(prev_output_token_len) + ] for _ in range(batch_size)] + final_seq_lens = [ + len(prompt) + len(prev_output_token) + k + 1 + for prompt, prev_output_token in zip(prompts, prev_output_tokens) + ] + + execute_model_data = create_execute_model_data( + create_seq_group_metadata_from_prompts(prompts, num_gpu_blocks, + block_size, final_seq_lens, + prev_output_tokens, seq_ids), ) + return execute_model_data, prompts, prev_output_tokens diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index 7c4ade7f8c8ed..fb541f38f3489 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -2,8 +2,11 @@ Run `pytest tests/test_cache_block_hashing.py`. 
""" +from typing import List, Optional + import pytest +from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer import TokenizerGroup from vllm.sequence import Sequence @@ -36,7 +39,10 @@ def flatten_2d(li): @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("max_num_seqs", [256]) -def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): +@pytest.mark.parametrize("concurrent_lora_int_ids", + [[None], [1], [None, 1], [None, 1, 2], [1, 2]]) +def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, + concurrent_lora_int_ids: List[Optional[int]]): tokenizer = TokenizerGroup( tokenizer_id="facebook/opt-125m", @@ -48,19 +54,30 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): hashes = [] for prefix in prefixes: - hashes.append([]) - prompts = [prefix + prompt for prompt in sample_prompts] - seq_id = 0 - for prompt in prompts: - hashes[-1].append([]) - prompt_token_ids = tokenizer.encode(prompt) - seq = Sequence(seq_id, prompt, prompt_token_ids, block_size) - - num_blocks = len(prompt_token_ids) // block_size - for idx in range(num_blocks): - hashes[-1][-1].append(seq.hash_of_block(idx)) - - seq_id += 1 + for lora_int_id in concurrent_lora_int_ids: + lora_request = None + + if lora_int_id is not None: + lora_request = LoRARequest( + f"example_lora_{lora_int_id}", + lora_int_id, + f"example/path/to/lora_{lora_int_id}", + ) + + hashes.append([]) + prompts = [prefix + prompt for prompt in sample_prompts] + seq_id = 0 + for prompt in prompts: + hashes[-1].append([]) + prompt_token_ids = tokenizer.encode(prompt) + seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, + tokenizer.tokenizer.eos_token_id, lora_request) + + num_blocks = len(prompt_token_ids) // block_size + for idx in range(num_blocks): + hashes[-1][-1].append(seq.hash_of_block(idx)) + + seq_id += 1 # Check that hashes made with two prefixes with different first blocks are # different everywhere. 
diff --git a/tests/test_sequence.py b/tests/test_sequence.py new file mode 100644 index 0000000000000..e18df059d770f --- /dev/null +++ b/tests/test_sequence.py @@ -0,0 +1,50 @@ +import pytest + +from vllm.sequence import SequenceGroupOutput, SamplerOutput, SequenceOutput + + +@pytest.fixture +def sample_outputs(): + return [ + SequenceGroupOutput(samples=[ + SequenceOutput(parent_seq_id=0, output_token=i, logprobs={}) + ], + prompt_logprobs=None) for i in range(5) + ] + + +@pytest.fixture +def sampler_output(sample_outputs): + return SamplerOutput(outputs=sample_outputs) + + +def test_sampler_output_initialization(sampler_output, sample_outputs): + assert len(sampler_output) == len(sample_outputs) + assert sampler_output.sampled_token_probs is None + assert sampler_output.sampled_token_ids is None + assert sampler_output.spec_decode_worker_metrics is None + + +def test_sampler_output_getitem(sampler_output, sample_outputs): + assert sampler_output[2] == sample_outputs[2] + + +def test_sampler_output_setitem(sampler_output): + new_output = SequenceGroupOutput(samples=[ + SequenceOutput(parent_seq_id=0, output_token=99, logprobs={}) + ], + prompt_logprobs=None) + sampler_output[2] = new_output + assert sampler_output[2] == new_output + + +def test_sampler_output_len(sampler_output, sample_outputs): + assert len(sampler_output) == len(sample_outputs) + + +def test_sampler_output_eq(sample_outputs): + sampler_output1 = SamplerOutput(outputs=sample_outputs) + sampler_output2 = SamplerOutput(outputs=sample_outputs.copy()) + sampler_output3 = SamplerOutput(outputs=sample_outputs[:-1]) + assert sampler_output1 == sampler_output2 + assert sampler_output1 != sampler_output3 diff --git a/vllm/config.py b/vllm/config.py index 0f3c064a90386..c8e261afc8377 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -219,8 +219,8 @@ def _verify_quantization(self) -> None: if is_hip( ) and self.quantization in rocm_not_supported_quantization: raise ValueError( - f"{self.quantization} quantization is currently not supported " - f"in ROCm.") + f"{self.quantization} quantization is currently not " + f"supported in ROCm.") if self.quantization != "marlin": logger.warning( f"{self.quantization} quantization is not fully " @@ -351,7 +351,8 @@ def __init__( self.num_cpu_blocks = None def metrics_info(self): - # convert cache_config to dict(key: str, value: str) for prometheus metrics info + # convert cache_config to dict(key: str, value: str) for prometheus + # metrics info return {key: str(value) for key, value in self.__dict__.items()} def _verify_args(self) -> None: @@ -429,8 +430,9 @@ def __init__( ) -> None: self.pipeline_parallel_size = pipeline_parallel_size if is_neuron(): - # For Neuron device support, here we assign TP=1 to avoid sharding within vLLM directly. - # Transformer-neuronx would take neuron_tp_degree attribute, and distribute the workload + # For Neuron device support, here we assign TP=1 to avoid sharding + # within vLLM directly. Transformer-neuronx would take + # neuron_tp_degree attribute, and distribute the workload # to multiple NeuronCores. 
self.tensor_parallel_size = 1 self.neuron_tp_degree = tensor_parallel_size diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index daf83827a7e52..8bfc14999f0a7 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,6 +1,6 @@ """A block manager that manages token blocks.""" import enum -from itertools import count +from itertools import count, takewhile from os.path import commonprefix from typing import Dict, List, Optional, Set, Tuple @@ -95,13 +95,15 @@ def free(self, block: PhysicalTokenBlock) -> None: del self.cached_blocks[block.block_hash] def get_num_free_blocks(self) -> int: - return self.num_blocks - self.current_num_blocks + self.evictor.num_blocks + return (self.num_blocks - self.current_num_blocks + + self.evictor.num_blocks) def contains_block(self, block_hash: int) -> bool: return block_hash in self.cached_blocks or block_hash in self.evictor def update_hash(self, block_hash: int, block: PhysicalTokenBlock): - # If caching is enabled, update the hash of block and the cached_blocks dictionary. + # If caching is enabled, update the hash of block and the + # cached_blocks dictionary. if self.enable_caching: assert not self.contains_block(block_hash) old_hash = block.block_hash @@ -218,10 +220,12 @@ def _promote_last_block( seq: Sequence, last_block: PhysicalTokenBlock, ) -> PhysicalTokenBlock: - # Compute a new hash for the block so that it can be shared by other Sequences + # Compute a new hash for the block so that it can be shared by + # other Sequences new_hash = seq.hash_of_block(len(seq.logical_token_blocks) - 1) - # if new_hash is already in the cached table, then free last_block and return the cached version + # if new_hash is already in the cached table, then free last_block + # and return the cached version if self.gpu_allocator.contains_block(new_hash): self.gpu_allocator.free(last_block) return self.gpu_allocator.allocate(new_hash) @@ -289,7 +293,8 @@ def append_slot( assert last_block.device == Device.GPU if last_block.ref_count == 1: # Not shared with other sequences. Appendable. - # If the last block is now complete, promote it to a full block so that it can be shared + # If the last block is now complete, promote it to a full block so + # that it can be shared new_block = self._maybe_promote_last_block(seq, last_block) block_table[-1] = new_block return None @@ -426,23 +431,29 @@ def access_all_blocks_in_seq( for block in block_table: block.last_accessed = access_time - def compute_last_full_block_in_seq(self, seq: Sequence): + def compute_full_blocks_in_seq(self, seq: Sequence): if seq.seq_id not in self.block_tables: return max_full_block = seq.get_len() // self.block_size - 1 block_table = self.block_tables[seq.seq_id] if max_full_block == -1: return - block_table[max_full_block].computed = True + for i in reversed(range(max_full_block)): + if block_table[i].computed: + break + block_table[i].computed = True - def get_all_block_ids_till_computed(self, seq: Sequence) -> List[int]: + def get_all_computed_blocks(self, seq: Sequence) -> List[int]: if seq.seq_id not in self.block_tables: return [] block_table = self.block_tables[seq.seq_id] - for block_idx in reversed(range(len(block_table))): - if block_table[block_idx].computed: - return [b.block_number for b in block_table[:block_idx + 1]] - return [] + # NOTE We exclude the last block to avoid the case where the entire + # prompt is cached. This would cause erroneous behavior in model + # runner. 
+ return [ + b.block_number + for b in takewhile(lambda b: b.computed, block_table[:-1]) + ] def get_common_computed_block_ids(self, seq_group: SequenceGroup) -> List[int]: @@ -451,14 +462,12 @@ def get_common_computed_block_ids(self, return [] ids_list = [ - self.get_all_block_ids_till_computed(seq) + self.get_all_computed_blocks(seq) for seq in iter(seq_group.seqs_dict.values()) ] return commonprefix([ids for ids in ids_list if ids != []]) def mark_blocks_as_computed(self, seq_group: SequenceGroup): - # NOTE: We only mark the last full block because with prefix caching, - # all blocks until the marked one are guaranteed to be computed. if self.enable_caching: for seq in seq_group.seqs_dict.values(): - self.compute_last_full_block_in_seq(seq) + self.compute_full_blocks_in_seq(seq) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index b538ea574b604..1d81f5a97d71c 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -39,9 +39,9 @@ def add(self, block: PhysicalTokenBlock): @abstractmethod def remove(self, block_hash: int) -> PhysicalTokenBlock: """Simply removes the block with the hash value block_hash from the - evictor. Caller is responsible for making sure that block_hash is contained - in the evictor before calling remove. Should be used to "bring back" blocks - that have been freed but not evicted yet. + evictor. Caller is responsible for making sure that block_hash is + contained in the evictor before calling remove. Should be used to + "bring back" blocks that have been freed but not evicted yet. """ pass diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 1ae58f525b0fb..9255f91be55cb 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -59,10 +59,9 @@ def is_empty(self) -> bool: and not self.blocks_to_swap_out and not self.blocks_to_copy) def _sort_by_lora_ids(self) -> bool: - self.scheduled_seq_groups = sorted( - self.scheduled_seq_groups, - key=lambda g: (g.lora_request.lora_int_id - if g.lora_request else 0, g.request_id)) + self.scheduled_seq_groups = sorted(self.scheduled_seq_groups, + key=lambda g: + (g.lora_int_id, g.request_id)) @property def lora_requests(self) -> Set[LoRARequest]: @@ -215,8 +214,8 @@ def _schedule(self) -> SchedulerOutputs: lora_int_id = 0 if self.lora_enabled: lora_int_id = seq_group.lora_int_id - if lora_int_id > 0 and lora_int_id not in curr_loras and len( - curr_loras) >= self.lora_config.max_loras: + if (lora_int_id > 0 and lora_int_id not in curr_loras + and len(curr_loras) >= self.lora_config.max_loras): # We don't have a space for another LoRA, so # we ignore this request for now. leftover_waiting_sequences.appendleft(seq_group) @@ -310,8 +309,8 @@ def _schedule(self) -> SchedulerOutputs: lora_int_id = 0 if self.lora_enabled: lora_int_id = seq_group.lora_int_id - if lora_int_id > 0 and lora_int_id not in curr_loras and len( - curr_loras) >= self.lora_config.max_loras: + if (lora_int_id > 0 and lora_int_id not in curr_loras + and len(curr_loras) >= self.lora_config.max_loras): # We don't have a space for another LoRA, so # we ignore this request for now. 
leftover_swapped.appendleft(seq_group) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index df66139fddcd1..5629d1a863d04 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1,8 +1,11 @@ import asyncio +import os import time from functools import partial from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type, - Union, AsyncIterator) + Union, AsyncIterator, Callable) + +from transformers import PreTrainedTokenizer from vllm.lora.request import LoRARequest from vllm.config import ModelConfig @@ -14,28 +17,31 @@ from vllm.sampling_params import SamplingParams logger = init_logger(__name__) +ENGINE_ITERATION_TIMEOUT_S = int( + os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")) class AsyncEngineDeadError(RuntimeError): pass -def _raise_exception_on_finish(task: asyncio.Task, - request_tracker: "RequestTracker") -> None: +def _raise_exception_on_finish( + task: asyncio.Task, error_callback: Callable[[Exception], + None]) -> None: msg = ("Task finished unexpectedly. This should never happen! " "Please open an issue on Github.") + + exception = None try: - try: - task.result() - except asyncio.CancelledError: - return - except Exception as exc: - raise AsyncEngineDeadError( - msg + " See stack trace above for the actual cause.") from exc + task.result() + # NOTE: This will be thrown if task exits normally (which it should not) raise AsyncEngineDeadError(msg) - except Exception as exc: - request_tracker.propagate_exception(exc) - raise exc + except Exception as e: + exception = e + logger.error("Engine background task failed", exc_info=e) + error_callback(exception) + raise AsyncEngineDeadError( + msg + " See stack trace above for the actual cause.") from e class AsyncStream: @@ -78,13 +84,13 @@ def __init__(self) -> None: self._finished_requests: asyncio.Queue[str] = asyncio.Queue() self._new_requests: asyncio.Queue[Tuple[AsyncStream, dict]] = asyncio.Queue() - self.new_requests_event = None + self.new_requests_event = asyncio.Event() def __contains__(self, item): return item in self._request_streams - def init_event(self): - self.new_requests_event = asyncio.Event() + def __len__(self) -> int: + return len(self._request_streams) def propagate_exception(self, exc: Exception, @@ -93,9 +99,11 @@ def propagate_exception(self, (all if request_id is None).""" if request_id is not None: self._request_streams[request_id].put(exc) + self.abort_request(request_id) else: - for stream in self._request_streams.values(): + for rid, stream in self._request_streams.items(): stream.put(exc) + self.abort_request(rid) def process_request_output(self, request_output: RequestOutput, @@ -172,12 +180,15 @@ def get_new_and_finished_requests(self) -> Tuple[List[Dict], Set[str]]: self._request_streams[stream.request_id] = stream new_requests.append(new_request) - self.new_requests_event.clear() - return new_requests, finished_requests async def wait_for_new_requests(self): - await self.new_requests_event.wait() + if not self.has_new_requests(): + await self.new_requests_event.wait() + self.new_requests_event.clear() + + def has_new_requests(self): + return not self._new_requests.empty() class _AsyncLLMEngine(LLMEngine): @@ -285,6 +296,10 @@ async def _run_workers_async( all_outputs = await asyncio.gather(*coros) return all_outputs + async def check_health_async(self): + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + class AsyncLLMEngine: """An asynchronous wrapper for LLMEngine. 
@@ -335,27 +350,51 @@ def __init__(self, # collected self._background_loop_unshielded = None self.start_engine_loop = start_engine_loop - self._request_tracker = RequestTracker() + self._request_tracker: Optional[RequestTracker] = None + self._errored_with: Optional[BaseException] = None @property def is_running(self) -> bool: return (self.background_loop is not None - and not self.background_loop.done()) + and not self._background_loop_unshielded.done()) + + @property + def is_stopped(self) -> bool: + return self.errored or (self.background_loop is not None + and self._background_loop_unshielded.done()) - def get_tokenizer(self): - return self.engine.tokenizer.tokenizer + @property + def errored(self) -> bool: + return self._errored_with is not None + + def set_errored(self, exc: Exception) -> None: + self._errored_with = exc + + def _error_callback(self, exc: Exception) -> None: + self.set_errored(exc) + self._request_tracker.propagate_exception(exc) + + async def get_tokenizer(self) -> "PreTrainedTokenizer": + if self.engine_use_ray: + return await self.engine.get_tokenizer.remote() + else: + return self.engine.get_tokenizer() def start_background_loop(self) -> None: """Start the background loop.""" + if self.errored: + raise AsyncEngineDeadError( + "Background loop has errored already.") from self._errored_with if self.is_running: raise RuntimeError("Background loop is already running.") - self._request_tracker.init_event() + # Initialize the RequestTracker here so it uses the right event loop. + self._request_tracker = RequestTracker() self._background_loop_unshielded = asyncio.get_event_loop( ).create_task(self.run_engine_loop()) self._background_loop_unshielded.add_done_callback( partial(_raise_exception_on_finish, - request_tracker=self._request_tracker)) + error_callback=self._error_callback)) self.background_loop = asyncio.shield(self._background_loop_unshielded) def _init_engine(self, *args, @@ -423,12 +462,23 @@ async def _engine_abort(self, request_ids: Iterable[str]): self.engine.abort_request(request_ids) async def run_engine_loop(self): - # Initialize the RequestTracker here so it uses the right event loop. has_requests_in_progress = False while True: if not has_requests_in_progress: + logger.debug("Waiting for new requests...") await self._request_tracker.wait_for_new_requests() - has_requests_in_progress = await self.engine_step() + logger.debug("Got new requests!") + + # Abort if iteration takes too long due to unrecoverable errors + # (eg. NCCL timeouts). + try: + has_requests_in_progress = await asyncio.wait_for( + self.engine_step(), ENGINE_ITERATION_TIMEOUT_S) + except asyncio.TimeoutError as exc: + logger.error( + "Engine iteration timed out. 
This should never happen!") + self.set_errored(exc) + raise await asyncio.sleep(0) async def add_request( @@ -647,3 +697,19 @@ async def do_log_stats(self) -> None: await self.engine.do_log_stats.remote() else: self.engine.do_log_stats() + + async def check_health(self): + """Raises an error if engine is unhealthy.""" + t = time.perf_counter() + logger.debug("Starting health check...") + if self.is_stopped: + raise AsyncEngineDeadError("Background loop is stopped.") + + if self.engine_use_ray: + try: + await self.engine.check_health.remote() + except ray.exceptions.RayActorError as e: + raise RuntimeError("Engine is dead.") from e + else: + await self.engine.check_health_async() + logger.debug(f"Health check took {time.perf_counter()-t}s") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 936bff652ee2b..3e35e04107fa2 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -9,6 +9,8 @@ from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union) +from transformers import PreTrainedTokenizer + import vllm from vllm.lora.request import LoRARequest from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, @@ -100,7 +102,8 @@ def __init__( f"download_dir={model_config.download_dir!r}, " f"load_format={model_config.load_format}, " f"tensor_parallel_size={parallel_config.tensor_parallel_size}, " - f"disable_custom_all_reduce={parallel_config.disable_custom_all_reduce}, " + f"disable_custom_all_reduce=" + f"{parallel_config.disable_custom_all_reduce}, " f"quantization={model_config.quantization}, " f"sparsity={model_config.sparsity}, " f"enforce_eager={model_config.enforce_eager}, " @@ -161,7 +164,16 @@ def __init__( if USE_RAY_COMPILED_DAG: self.forward_dag = self._compiled_ray_dag() - def get_tokenizer_for_seq(self, sequence: Sequence): + def __reduce__(self): + # This is to ensure that the LLMEngine is not referenced in + # the closure used to initialize Ray worker actors + raise RuntimeError("LLMEngine should not be pickled!") + + def get_tokenizer(self) -> "PreTrainedTokenizer": + return self.tokenizer.get_lora_tokenizer() + + def get_tokenizer_for_seq(self, + sequence: Sequence) -> "PreTrainedTokenizer": return self.tokenizer.get_lora_tokenizer(sequence.lora_request) def _dispatch_worker(self): @@ -283,6 +295,8 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", parallel_config = copy.deepcopy(self.parallel_config) scheduler_config = copy.deepcopy(self.scheduler_config) device_config = copy.deepcopy(self.device_config) + lora_config = copy.deepcopy(self.lora_config) + kv_cache_dtype = self.cache_config.cache_dtype for rank, (worker, (node_id, _)) in enumerate(zip(self.workers, @@ -298,22 +312,22 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", local_rank, rank, distributed_init_method, - lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, + lora_config=lora_config, + kv_cache_dtype=kv_cache_dtype, )) driver_rank = 0 driver_local_rank = node_workers[driver_node_id].index(driver_rank) self.driver_worker = Worker( - model_config, - parallel_config, - scheduler_config, - device_config, + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, driver_local_rank, driver_rank, distributed_init_method, lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, + kv_cache_dtype=kv_cache_dtype, is_driver_worker=True, ) @@ -494,8 +508,10 @@ def add_request( # Create the sequences. 
block_size = self.cache_config.block_size seq_id = next(self.seq_counter) + eos_token_id = self.tokenizer.get_lora_tokenizer( + lora_request).eos_token_id seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, - lora_request) + eos_token_id, lora_request) # Defensive copy of SamplingParams, which are used by the sampler, # this doesn't deep-copy LogitsProcessor objects @@ -551,15 +567,13 @@ def _check_beam_search_early_stopping( if early_stopping is True: return True - current_worst_score = (current_worst_seq.get_beam_search_score( + current_worst_score = current_worst_seq.get_beam_search_score( length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - current_worst_seq).eos_token_id)) + eos_token_id=current_worst_seq.eos_token_id) if early_stopping is False: - highest_attainable_score = (best_running_seq.get_beam_search_score( + highest_attainable_score = best_running_seq.get_beam_search_score( length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - best_running_seq).eos_token_id)) + eos_token_id=best_running_seq.eos_token_id) else: assert early_stopping == "never" if length_penalty > 0.0: @@ -573,8 +587,7 @@ def _check_beam_search_early_stopping( highest_attainable_score = ( best_running_seq.get_beam_search_score( length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - best_running_seq).eos_token_id, + eos_token_id=best_running_seq.eos_token_id, seq_len=max_possible_length)) else: # Otherwise, beam search will prefer shorter sequences. The @@ -583,8 +596,7 @@ def _check_beam_search_early_stopping( highest_attainable_score = ( best_running_seq.get_beam_search_score( length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - best_running_seq).eos_token_id)) + eos_token_id=best_running_seq.eos_token_id)) return current_worst_score >= highest_attainable_score def _process_sequence_group_outputs(self, seq_group: SequenceGroup, @@ -682,8 +694,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, all_finished_seqs = existing_finished_seqs + new_finished_seqs # Sort the finished sequences by their scores. all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id), + length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), reverse=True) for seq, parent, is_new in all_finished_seqs[:beam_width]: if is_new: @@ -710,8 +721,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, if not seq.is_finished()] # Sort the running sequences by their scores. running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id), + length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), reverse=True) # Check if we can stop the beam search. @@ -923,7 +933,8 @@ def _get_stats(self, # Latency Timings. time_last_iters = [] for seq_group in scheduler_outputs.scheduled_seq_groups: - # Time since last token. (n.b. updates seq_group.metrics.last_token_time) + # Time since last token. + # (n.b. updates seq_group.metrics.last_token_time) time_last_iters.append(seq_group.get_last_latency(now)) # Time since arrival for all finished requests. 
if seq_group.is_finished(): @@ -955,16 +966,17 @@ def _decode_logprobs(self, seq: Sequence, prms: SamplingParams, for token_id, sample_logprob in logprobs.items(): if (sample_logprob.decoded_token is None and token_id != -1): all_input_ids_with_logprob = all_input_ids[:-1] + [token_id] - _, new_text, prefix_offset, read_offset = detokenize_incrementally( - self.get_tokenizer_for_seq(seq), - all_input_ids=all_input_ids_with_logprob, - prev_tokens=seq.tokens, - prefix_offset=seq.prefix_offset, - read_offset=seq.read_offset, - skip_special_tokens=prms.skip_special_tokens, - spaces_between_special_tokens=prms. - spaces_between_special_tokens, - ) + (_, new_text, prefix_offset, + read_offset) = detokenize_incrementally( + self.get_tokenizer_for_seq(seq), + all_input_ids=all_input_ids_with_logprob, + prev_tokens=seq.tokens, + prefix_offset=seq.prefix_offset, + read_offset=seq.read_offset, + skip_special_tokens=prms.skip_special_tokens, + spaces_between_special_tokens=prms. + spaces_between_special_tokens, + ) sample_logprob.decoded_token = new_text def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None: @@ -1017,8 +1029,8 @@ def _check_stop(self, seq: Sequence, return # Check if the sequence has generated the EOS token. - if ((not sampling_params.ignore_eos) and seq.get_last_token_id() - == self.get_tokenizer_for_seq(seq).eos_token_id): + if ((not sampling_params.ignore_eos) + and seq.get_last_token_id() == seq.eos_token_id): seq.status = SequenceStatus.FINISHED_STOPPED return @@ -1122,3 +1134,23 @@ def _compiled_ray_dag(self): for worker in self.workers ]) return forward_dag.experimental_compile() + + def check_health(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + + def _check_if_any_actor_is_dead(self): + if not self.parallel_config.worker_use_ray: + return + + if not self.workers: + return + + dead_actors = [] + for actor in self.workers: + actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access + if actor_state["State"] == "DEAD": + dead_actors.append(actor) + if dead_actors: + raise RuntimeError("At least one Worker is dead. " + f"Dead Workers: {dead_actors}. ") diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index d31542159e4a4..17b1852f5b0a3 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,5 +1,6 @@ from vllm.logger import init_logger -from prometheus_client import Counter, Gauge, Histogram, Info, REGISTRY, disable_created_metrics +from prometheus_client import (Counter, Gauge, Histogram, Info, REGISTRY, + disable_created_metrics) import time import numpy as np @@ -177,10 +178,12 @@ def _log_prometheus(self, stats: Stats) -> None: def _log_prometheus_interval(self, prompt_throughput: float, generation_throughput: float) -> None: # Logs metrics to prometheus that are computed every logging_interval. - # Support legacy gauge metrics that make throughput calculations on the vLLM side. - # Moving forward, we should use counters like counter_prompt_tokens, counter_generation_tokens - # Which log raw data and calculate summaries using rate() on the grafana/prometheus side. - # See https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 + # Support legacy gauge metrics that make throughput calculations on + # the vLLM side. Moving forward, we should use counters like + # counter_prompt_tokens, counter_generation_tokens + # Which log raw data and calculate summaries using rate() on the + # grafana/prometheus side. 
See + # https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 self.metrics.gauge_avg_prompt_throughput.labels( **self.labels).set(prompt_throughput) self.metrics.gauge_avg_generation_throughput.labels( @@ -188,7 +191,7 @@ def _log_prometheus_interval(self, prompt_throughput: float, def log(self, stats: Stats) -> None: """Called by LLMEngine. - Logs to prometheus and tracked stats every iteration. + Logs to prometheus and tracked stats every iteration. Logs to Stdout every self.local_interval seconds.""" # Log to prometheus. @@ -200,8 +203,8 @@ def log(self, stats: Stats) -> None: # Log locally every local_interval seconds. if self._local_interval_elapsed(stats.now): - - # Compute summary metrics for tracked stats (and log them to promethus if applicable). + # Compute summary metrics for tracked stats (and log them + # to promethus if applicable). prompt_throughput = self._get_throughput(self.num_prompt_tokens, now=stats.now) generation_throughput = self._get_throughput( @@ -213,7 +216,8 @@ def log(self, stats: Stats) -> None: # Log to stdout. logger.info( f"Avg prompt throughput: {prompt_throughput:.1f} tokens/s, " - f"Avg generation throughput: {generation_throughput:.1f} tokens/s, " + f"Avg generation throughput: " + f"{generation_throughput:.1f} tokens/s, " f"Running: {stats.num_running} reqs, " f"Swapped: {stats.num_swapped} reqs, " f"Pending: {stats.num_waiting} reqs, " diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 1eb4ab8b06b64..86b6c4c67cfa4 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -1,7 +1,9 @@ """ -NOTE: This API server is used only for demonstrating usage of AsyncEngine and simple performance benchmarks. -It is not intended for production use. For production use, we recommend using our OpenAI compatible server. -We are also not going to accept PRs modifying this file, please change `vllm/entrypoints/openai/api_server.py` instead. +NOTE: This API server is used only for demonstrating usage of AsyncEngine +and simple performance benchmarks. It is not intended for production use. +For production use, we recommend using our OpenAI compatible server. +We are also not going to accept PRs modifying this file, please +change `vllm/entrypoints/openai/api_server.py` instead. """ import argparse diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 2d46e918ddc23..8938f655d75da 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -200,7 +200,9 @@ def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: # Initialize tqdm. if use_tqdm: num_requests = self.llm_engine.get_num_unfinished_requests() - pbar = tqdm(total=num_requests, desc="Processed prompts") + pbar = tqdm(total=num_requests, + desc="Processed prompts", + dynamic_ncols=True) # Run the engine. 
outputs: List[RequestOutput] = [] while self.llm_engine.has_unfinished_requests(): diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 993a834e5a720..00407bc0e809c 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -18,7 +18,9 @@ import vllm from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest, ErrorResponse +from vllm.entrypoints.openai.protocol import (CompletionRequest, + ChatCompletionRequest, + ErrorResponse) from vllm.logger import init_logger from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion @@ -84,13 +86,11 @@ def parse_args(): type=json.loads, default=["*"], help="allowed headers") - parser.add_argument( - "--api-key", - type=str, - default=None, - help= - "If provided, the server will require this key to be presented in the header." - ) + parser.add_argument("--api-key", + type=str, + default=None, + help="If provided, the server will require this key " + "to be presented in the header.") parser.add_argument("--served-model-name", type=str, default=None, @@ -103,9 +103,8 @@ def parse_args(): default=None, nargs='+', action=LoRAParserAction, - help= - "LoRA module configurations in the format name=path. Multiple modules can be specified." - ) + help="LoRA module configurations in the format name=path. " + "Multiple modules can be specified.") parser.add_argument("--chat-template", type=str, default=None, @@ -138,9 +137,10 @@ def parse_args(): help="Additional ASGI middleware to apply to the app. " "We accept multiple --middleware arguments. " "The value should be an import path. " - "If a function is provided, vLLM will add it to the server using @app.middleware('http'). " - "If a class is provided, vLLM will add it to the server using app.add_middleware(). " - ) + "If a function is provided, vLLM will add it to the server " + "using @app.middleware('http'). " + "If a class is provided, vLLM will add it to the server " + "using app.add_middleware(). ") parser = AsyncEngineArgs.add_cli_args(parser) return parser.parse_args() @@ -160,6 +160,7 @@ async def validation_exception_handler(_, exc): @app.get("/health") async def health() -> Response: """Health check.""" + await openai_serving_chat.engine.check_health() return Response(status_code=200) @@ -234,9 +235,8 @@ async def authentication(request: Request, call_next): elif inspect.iscoroutinefunction(imported): app.middleware("http")(imported) else: - raise ValueError( - f"Invalid middleware {middleware}. Must be a function or a class." - ) + raise ValueError(f"Invalid middleware {middleware}. 
" + f"Must be a function or a class.") logger.info(f"vLLM API server version {vllm.__version__}") logger.info(f"args: {args}") diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index ba352f18f6454..d2fb9ca001b15 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -12,7 +12,8 @@ UsageInfo) from vllm.outputs import RequestOutput from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA -from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor +from vllm.model_executor.guided_decoding import ( + get_guided_decoding_logits_processor) logger = init_logger(__name__) @@ -37,8 +38,9 @@ async def create_chat_completion( ChatCompletionResponse]: """Completion API similar to OpenAI's API. - See https://platform.openai.com/docs/api-reference/chat/create - for the API specification. This API mimics the OpenAI ChatCompletion API. + See https://platform.openai.com/docs/api-reference/chat/create + for the API specification. This API mimics the OpenAI + ChatCompletion API. NOTE: Currently we do not support the following feature: - function_call (Users should implement this by themselves) @@ -65,7 +67,7 @@ async def create_chat_completion( lora_request = self._maybe_get_lora(request) guided_decode_logits_processor = ( await get_guided_decoding_logits_processor( - request, self.engine.get_tokenizer())) + request, await self.engine.get_tokenizer())) if guided_decode_logits_processor: if sampling_params.logits_processors is None: sampling_params.logits_processors = [] @@ -116,7 +118,8 @@ async def chat_completion_stream_generator( # the result_generator, it needs to be sent as the FIRST # response (by the try...catch). if first_iteration: - # Send first response for each request.n (index) with the role + # Send first response for each request.n (index) with + # the role role = self.get_chat_request_role(request) for i in range(request.n): choice_data = ChatCompletionResponseStreamChoice( @@ -133,7 +136,8 @@ async def chat_completion_stream_generator( data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" - # Send response to echo the input portion of the last message + # Send response to echo the input portion of the + # last message if request.echo: last_msg_content = "" if request.messages and isinstance( @@ -145,11 +149,12 @@ async def chat_completion_stream_generator( if last_msg_content: for i in range(request.n): - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage( - content=last_msg_content), - finish_reason=None) + choice_data = ( + ChatCompletionResponseStreamChoice( + index=i, + delta=DeltaMessage( + content=last_msg_content), + finish_reason=None)) chunk = ChatCompletionStreamResponse( id=request_id, object=chunk_object_type, diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index a8244fd150753..b78f053800f3c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -1,7 +1,8 @@ import asyncio import time from fastapi import Request -from typing import AsyncGenerator, AsyncIterator, Callable, List, Optional, Dict, Tuple +from typing import (AsyncGenerator, AsyncIterator, Callable, List, Optional, + Dict, Tuple) from vllm.logger import init_logger from vllm.utils import random_uuid from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -16,7 +17,8 @@ ) from vllm.outputs import RequestOutput from 
vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA -from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor +from vllm.model_executor.guided_decoding import ( + get_guided_decoding_logits_processor) logger = init_logger(__name__) @@ -44,9 +46,8 @@ def parse_prompt_format(prompt) -> Tuple[bool, list]: prompt_is_tokens = True prompts = prompt # case 4: array of token arrays else: - raise ValueError( - "prompt must be a string, array of strings, array of tokens, or array of token arrays" - ) + raise ValueError("prompt must be a string, array of strings, " + "array of tokens, or array of token arrays") return prompt_is_tokens, prompts @@ -126,7 +127,7 @@ async def create_completion(self, request: CompletionRequest, lora_request = self._maybe_get_lora(request) guided_decode_logit_processor = ( await get_guided_decoding_logits_processor( - request, self.engine.get_tokenizer())) + request, await self.engine.get_tokenizer())) if guided_decode_logit_processor is not None: if sampling_params.logits_processors is None: sampling_params.logits_processors = [] @@ -156,7 +157,8 @@ async def create_completion(self, request: CompletionRequest, int, RequestOutput]] = merge_async_iterators(*generators) # Similar to the OpenAI API, when n != best_of, we do not stream the - # results. In addition, we do not stream the results when use beam search. + # results. In addition, we do not stream the results when use + # beam search. stream = (request.stream and (request.best_of is None or request.n == request.best_of) and not request.use_beam_search) @@ -223,7 +225,8 @@ async def completion_stream_generator( for output in res.outputs: i = output.index + prompt_idx * request.n - # TODO(simon): optimize the performance by avoiding full text O(n^2) sending. + # TODO(simon): optimize the performance by avoiding full + # text O(n^2) sending. 
if request.echo and request.max_tokens == 0: # only return the prompt @@ -231,11 +234,12 @@ async def completion_stream_generator( delta_token_ids = res.prompt_token_ids top_logprobs = res.prompt_logprobs has_echoed[i] = True - elif request.echo and request.max_tokens > 0 and not has_echoed[ - i]: + elif (request.echo and request.max_tokens > 0 + and not has_echoed[i]): # echo the prompt and first token delta_text = res.prompt + output.text - delta_token_ids = res.prompt_token_ids + output.token_ids + delta_token_ids = (res.prompt_token_ids + + output.token_ids) top_logprobs = res.prompt_logprobs + (output.logprobs or []) has_echoed[i] = True @@ -248,7 +252,9 @@ async def completion_stream_generator( i]:] if output.logprobs else None if request.logprobs is not None: - assert top_logprobs is not None, "top_logprobs must be provided when logprobs is requested" + assert top_logprobs is not None, ( + "top_logprobs must be provided when logprobs " + "is requested") logprobs = self._create_logprobs( token_ids=delta_token_ids, top_logprobs=top_logprobs, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 230d13d97dbba..2db884945c491 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -50,10 +50,12 @@ def __init__(self, except RuntimeError: event_loop = None - if event_loop is not None and event_loop.is_running( - ): # If the current is instanced by Ray Serve, there is already a running event loop + if event_loop is not None and event_loop.is_running(): + # If the current is instanced by Ray Serve, + # there is already a running event loop event_loop.create_task(self._post_init()) - else: # When using single vLLM without engine_use_ray + else: + # When using single vLLM without engine_use_ray asyncio.run(self._post_init()) async def _post_init(self): @@ -178,8 +180,9 @@ def _validate_prompt_and_tokenize( if token_num + request.max_tokens > self.max_model_len: raise ValueError( - f"This model's maximum context length is {self.max_model_len} tokens. " - f"However, you requested {request.max_tokens + token_num} tokens " + f"This model's maximum context length is " + f"{self.max_model_len} tokens. However, you requested " + f"{request.max_tokens + token_num} tokens " f"({token_num} in the messages, " f"{request.max_tokens} in the completion). 
" f"Please reduce the length of the messages or completion.", ) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e667d70f71e39..99e6cdeee6364 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -20,10 +20,12 @@ RowParallelLinear, QKVParallelLinear, MergedColumnParallelLinear) -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.parallel_utils.utils import split_tensor_along_last_dim +from vllm.model_executor.parallel_utils.utils import ( + split_tensor_along_last_dim) if TYPE_CHECKING: pass @@ -84,7 +86,8 @@ def _apply_lora_packed_nslice( lora_b_stacked: 3 element tuple of (num_loras, output_dim, lora_rank) indices: (batch_size) output: (batch_size, q_slice_size + 2*kv_slice_size) - output_slices: n-1 element tuple of (slice_size...), where n is number of slices + output_slices: n-1 element tuple of (slice_size...), + where n is number of slices """ org_output = output x = x.view(-1, x.shape[-1]) @@ -819,9 +822,8 @@ def create_lora_weights( ) -> None: # Keep this in sync with csrc/punica/bgmv/bgmv_config.h if 32000 < self.base_layer.vocab_size > 33024: - raise ValueError( - "When using LoRA, vocab size must be 32000 >= vocab_size <= 33024" - ) + raise ValueError("When using LoRA, vocab size must be " + "32000 >= vocab_size <= 33024") self.lora_a_stacked = torch.zeros( ( max_loras, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 7386d21c58e4e..238da256b7cdc 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -13,7 +13,8 @@ from vllm.config import LoRAConfig from vllm.utils import LRUCache, in_wsl -from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping, from_layer, from_layer_sampler +from vllm.lora.layers import (BaseLayerWithLoRA, LoRAMapping, from_layer, + from_layer_sampler) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 7e92bc93ab472..911115d63a639 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -154,10 +154,9 @@ def _load_lora(self, lora_request: LoRARequest) -> LoRAModel: f"LoRA rank {lora.rank} is greater than max_lora_rank " f"{self.lora_config.max_lora_rank}.") if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size: - raise ValueError( - f"LoRA added vocab size {lora.extra_vocab_size} is greater than " - f"lora_extra_vocab_size {self.lora_config.lora_extra_vocab_size}." 
- ) + raise ValueError(f"LoRA added vocab size {lora.extra_vocab_size} " + f"is greater than lora_extra_vocab_size " + f"{self.lora_config.lora_extra_vocab_size}.") return lora def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool: diff --git a/vllm/model_executor/guided_decoding.py b/vllm/model_executor/guided_decoding.py index a8573f8bdc6c8..00984460d79a6 100644 --- a/vllm/model_executor/guided_decoding.py +++ b/vllm/model_executor/guided_decoding.py @@ -8,8 +8,10 @@ from typing import Union, Tuple from pydantic import BaseModel -from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest -from vllm.model_executor.guided_logits_processors import JSONLogitsProcessor, RegexLogitsProcessor +from vllm.entrypoints.openai.protocol import (CompletionRequest, + ChatCompletionRequest) +from vllm.model_executor.guided_logits_processors import (JSONLogitsProcessor, + RegexLogitsProcessor) class GuidedDecodingMode(Enum): diff --git a/vllm/model_executor/guided_logits_processors.py b/vllm/model_executor/guided_logits_processors.py index 1b3e5e71a5911..76d41aa37dd7b 100644 --- a/vllm/model_executor/guided_logits_processors.py +++ b/vllm/model_executor/guided_logits_processors.py @@ -107,12 +107,15 @@ def __init__(self, Parameters ---------- schema - A JSON schema that encodes the structure we want the model to generate + A JSON schema that encodes the structure we want the model to + generate tokenizer The model's tokenizer whitespace_pattern - Pattern to use for JSON syntactic whitespace (doesn't impact string literals) - Example: allow only a single space or newline with `whitespace_pattern=r"[\n ]?"` + Pattern to use for JSON syntactic whitespace (doesn't impact + string literals) + Example: allow only a single space or newline with + `whitespace_pattern=r"[\n ]?"` """ if isinstance(schema, type(BaseModel)): schema_str = json.dumps(schema.model_json_schema()) @@ -122,8 +125,8 @@ def __init__(self, schema_str = schema else: raise ValueError( - f"Cannot parse schema {schema}. The schema must be either " + - "a Pydantic object, a dictionary or a string that contains the JSON " - + "Schema specification") + f"Cannot parse schema {schema}. The schema must be either " + f"a Pydantic object, a dictionary or a string that contains " + f"the JSON Schema specification") regex_string = build_regex_from_schema(schema_str, whitespace_pattern) super().__init__(regex_string, tokenizer) diff --git a/vllm/model_executor/layers/attention/__init__.py b/vllm/model_executor/layers/attention/__init__.py new file mode 100644 index 0000000000000..1c42a3d28f976 --- /dev/null +++ b/vllm/model_executor/layers/attention/__init__.py @@ -0,0 +1,5 @@ +from vllm.model_executor.layers.attention.attention import Attention + +__all__ = [ + "Attention", +] diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py new file mode 100644 index 0000000000000..4b63b9eaf59a7 --- /dev/null +++ b/vllm/model_executor/layers/attention/attention.py @@ -0,0 +1,84 @@ +"""Attention layer.""" +from functools import lru_cache +from typing import List, Optional + +import torch +import torch.nn as nn + +from vllm.logger import init_logger +from vllm.model_executor.input_metadata import InputMetadata +from vllm.utils import is_hip + +logger = init_logger(__name__) + + +class Attention(nn.Module): + """Attention layer. + + This class takes query, key, and value tensors as input. The input tensors + can either contain prompt tokens or generation tokens. 
+ The class does the following: + + 1. Store the input key and value tensors in the KV cache. + 2. Perform (multi-head/multi-query/grouped-query) attention. + 3. Return the output tensor. + """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: Optional[int] = None, + alibi_slopes: Optional[List[float]] = None, + sliding_window: Optional[int] = None, + ) -> None: + super().__init__() + if _use_flash_attn(): + from vllm.model_executor.layers.attention.backends.flash_attn import FlashAttentionBackend # noqa: E501 + self.backend = FlashAttentionBackend(num_heads, head_size, scale, + num_kv_heads, alibi_slopes, + sliding_window) + else: + from vllm.model_executor.layers.attention.backends.xformers import XFormersBackend # noqa: E501 + self.backend = XFormersBackend(num_heads, head_size, scale, + num_kv_heads, alibi_slopes, + sliding_window) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: Optional[torch.Tensor], + value_cache: Optional[torch.Tensor], + input_metadata: InputMetadata, + ) -> torch.Tensor: + return self.backend.forward(query, key, value, key_cache, value_cache, + input_metadata) + + +@lru_cache(maxsize=1) +def _use_flash_attn() -> bool: + try: + import flash_attn # noqa: F401 + except ImportError: + logger.info("flash_attn is not found. Using xformers backend.") + return False + + if is_hip(): + # AMD GPUs. + return False + if torch.cuda.get_device_capability()[0] < 8: + # Volta and Turing NVIDIA GPUs. + logger.info("flash_attn is not supported on Turing or older GPUs. " + "Using xformers backend.") + return False + if torch.get_default_dtype() not in (torch.float16, torch.bfloat16): + logger.info( + "flash_attn only supports torch.float16 or torch.bfloat16. 
" + "Using xformers backend.") + return False + + logger.info("Using flash_attn backend.") + return True diff --git a/vllm/model_executor/layers/attention/backends/__init__.py b/vllm/model_executor/layers/attention/backends/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py new file mode 100644 index 0000000000000..58ccd461b993e --- /dev/null +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -0,0 +1,121 @@ +"""Attention layer with Flash and PagedAttention.""" +from typing import List, Optional + +from flash_attn import flash_attn_func +import torch + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.attention.ops.paged_attn import ( + PagedAttentionImpl) + + +class FlashAttentionBackend: + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: Optional[int] = None, + alibi_slopes: Optional[List[float]] = None, + sliding_window: Optional[int] = None, + ) -> None: + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.sliding_window = sliding_window + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + suppored_head_sizes = PagedAttentionImpl.get_supported_head_sizes() + if head_size not in suppored_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {suppored_head_sizes}.") + + self.sliding_window = ((self.sliding_window, self.sliding_window) if + self.sliding_window is not None else (-1, -1)) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: Optional[torch.Tensor], + value_cache: Optional[torch.Tensor], + input_metadata: InputMetadata, + ) -> torch.Tensor: + """Forward pass with FlashAttention and PagedAttention. + + Args: + query: shape = [batch_size, seq_len, num_heads * head_size] + key: shape = [batch_size, seq_len, num_kv_heads * head_size] + value: shape = [batch_size, seq_len, num_kv_heads * head_size] + key_cache: shape = [num_blocks, num_kv_heads, head_size/x, + block_size, x] + value_cache: shape = [num_blocks, num_kv_heads, head_size, + block_size] + input_metadata: metadata for the inputs. + Returns: + shape = [batch_size, seq_len, num_heads * head_size] + """ + batch_size, seq_len, hidden_size = query.shape + # Reshape the query, key, and value tensors. + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + + # Reshape the keys and values and store them in the cache. + # If key_cache and value_cache are not provided, the new key and value + # vectors will not be cached. This happens during the initial memory + # profiling run. + if key_cache is not None and value_cache is not None: + PagedAttentionImpl.reshape_and_cache(key, value, key_cache, + value_cache, input_metadata) + + if input_metadata.is_prompt: + # Prompt run. 
+ if (key_cache is None or value_cache is None + or input_metadata.block_tables.numel() == 0): + # normal attention + query = query.unflatten(0, (batch_size, seq_len)) + key = key.unflatten(0, (batch_size, seq_len)) + value = value.unflatten(0, (batch_size, seq_len)) + output = flash_attn_func( + query, + key, + value, + softmax_scale=self.scale, + causal=True, + window_size=self.sliding_window, + alibi_slopes=self.alibi_slopes, + ) + else: + # prefix-enabled attention + output = PagedAttentionImpl.forward_prefix( + query, + key, + value, + key_cache, + value_cache, + input_metadata, + self.alibi_slopes, + ) + else: + # Decoding run. + output = PagedAttentionImpl.forward_decode( + query, + key_cache, + value_cache, + input_metadata, + self.num_kv_heads, + self.scale, + self.alibi_slopes, + ) + + # Reshape the output tensor. + return output.view(batch_size, seq_len, hidden_size) diff --git a/vllm/model_executor/layers/attention/backends/xformers.py b/vllm/model_executor/layers/attention/backends/xformers.py new file mode 100644 index 0000000000000..bad2a648b6703 --- /dev/null +++ b/vllm/model_executor/layers/attention/backends/xformers.py @@ -0,0 +1,255 @@ +"""Attention layer with xFormers and PagedAttention.""" +import importlib +from typing import List, Optional + +import torch +from xformers import ops as xops +from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, + LowerTriangularMaskWithTensorBias) + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.attention.ops.paged_attn import ( + PagedAttentionImpl) +from vllm.utils import is_hip + + +class XFormersBackend: + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: Optional[int] = None, + alibi_slopes: Optional[List[float]] = None, + sliding_window: Optional[int] = None, + ) -> None: + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.sliding_window = sliding_window + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + suppored_head_sizes = PagedAttentionImpl.get_supported_head_sizes() + if head_size not in suppored_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {suppored_head_sizes}.") + + self.use_ref_attention = _check_use_ref_attention() + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: Optional[torch.Tensor], + value_cache: Optional[torch.Tensor], + input_metadata: InputMetadata, + ) -> torch.Tensor: + """Forward pass with xFormers and PagedAttention. + + Args: + query: shape = [batch_size, seq_len, num_heads * head_size] + key: shape = [batch_size, seq_len, num_kv_heads * head_size] + value: shape = [batch_size, seq_len, num_kv_heads * head_size] + key_cache: shape = [num_blocks, num_kv_heads, head_size/x, + block_size, x] + value_cache: shape = [num_blocks, num_kv_heads, head_size, + block_size] + input_metadata: metadata for the inputs. + Returns: + shape = [batch_size, seq_len, num_heads * head_size] + """ + batch_size, seq_len, hidden_size = query.shape + # Reshape the query, key, and value tensors. 
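As a rough sanity check on the cache layouts documented in the docstrings above, the per-layer KV-cache footprint follows directly from those shapes. The numbers below are illustrative only, and `x` is assumed to be 16 bytes divided by the element size, matching the `head_size/x ... x` split in the key-cache shape.

# Rough per-layer KV-cache sizing for the documented cache layouts.
num_blocks, block_size = 1024, 16
num_kv_heads, head_size = 8, 128
bytes_per_elem = 2                        # fp16
x = 16 // bytes_per_elem                  # assumed vectorization width
key_cache_elems = num_blocks * num_kv_heads * (head_size // x) * block_size * x
value_cache_elems = num_blocks * num_kv_heads * head_size * block_size
total_gib = (key_cache_elems + value_cache_elems) * bytes_per_elem / 2**30
print(f"{total_gib:.3f} GiB per layer")   # ~0.06 GiB with these numbers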
+ query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + + # Reshape the keys and values and store them in the cache. + # If key_cache and value_cache are not provided, the new key and value + # vectors will not be cached. This happens during the initial memory + # profiling run. + if key_cache is not None and value_cache is not None: + PagedAttentionImpl.reshape_and_cache(key, value, key_cache, + value_cache, input_metadata) + + if input_metadata.is_prompt: + # Prompt run. + if (key_cache is None or value_cache is None + or input_metadata.block_tables.numel() == 0): + # normal attention + if self.num_kv_heads != self.num_heads: + # As of Nov 2023, xformers only supports MHA. For MQA/GQA, + # project the key and value tensors to the desired number of + # heads. + # TODO(woosuk): Use MQA/GQA kernels for higher performance. + query = query.view(query.shape[0], self.num_kv_heads, + self.num_queries_per_kv, + query.shape[-1]) + key = key[:, :, + None, :].expand(key.shape[0], self.num_kv_heads, + self.num_queries_per_kv, + key.shape[-1]) + value = value[:, :, + None, :].expand(value.shape[0], + self.num_kv_heads, + self.num_queries_per_kv, + value.shape[-1]) + + # Set attention bias if not provided. This typically happens at + # the very attention layer of every iteration. + # FIXME(woosuk): This is a hack. + if input_metadata.attn_bias is None: + if self.alibi_slopes is None: + attn_bias = BlockDiagonalCausalMask.from_seqlens( + [seq_len] * batch_size) + if self.sliding_window is not None: + attn_bias = attn_bias.make_local_attention( + self.sliding_window) + input_metadata.attn_bias = attn_bias + else: + input_metadata.attn_bias = _make_alibi_bias( + self.alibi_slopes, self.num_kv_heads, batch_size, + seq_len, query.dtype) + + if self.use_ref_attention: + output = _ref_masked_attention( + query, + key, + value, + self.num_heads, + self.num_kv_heads, + self.head_size, + self.scale, + ) + # Using view got RuntimeError: view size is not compatible + # with input tensor's size and stride (at least one + # dimension spans across two contiguous subspaces). + # Use reshape instead. + return output.reshape(batch_size, seq_len, hidden_size) + + # TODO(woosuk): Too many view operations. Let's try to reduce + # them in the future for code readability. + if self.alibi_slopes is None: + query = query.unsqueeze(0) + key = key.unsqueeze(0) + value = value.unsqueeze(0) + else: + query = query.unflatten(0, (batch_size, seq_len)) + key = key.unflatten(0, (batch_size, seq_len)) + value = value.unflatten(0, (batch_size, seq_len)) + + out = xops.memory_efficient_attention_forward( + query, + key, + value, + attn_bias=input_metadata.attn_bias, + p=0.0, + scale=self.scale, + op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if + (is_hip()) else None, + ) + output = out.view_as(query) + + else: + # prefix-enabled attention + output = PagedAttentionImpl.forward_prefix( + query, + key, + value, + key_cache, + value_cache, + input_metadata, + self.alibi_slopes, + ) + else: + # Decoding run. + output = PagedAttentionImpl.forward_decode( + query, + key_cache, + value_cache, + input_metadata, + self.num_kv_heads, + self.scale, + self.alibi_slopes, + ) + + # Reshape the output tensor. 
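The MQA/GQA handling above expands each key/value head so xformers sees matching query and key/value head counts. A small CPU-only sketch of that expansion, with illustrative shapes:

import torch

# GQA -> MHA expansion as in the xformers path above: each KV head is repeated
# num_queries_per_kv times so query and key/value head counts line up.
num_tokens, num_heads, num_kv_heads, head_size = 3, 8, 2, 4
num_queries_per_kv = num_heads // num_kv_heads

query = torch.randn(num_tokens, num_heads, head_size)
key = torch.randn(num_tokens, num_kv_heads, head_size)

query = query.view(num_tokens, num_kv_heads, num_queries_per_kv, head_size)
key = key[:, :, None, :].expand(num_tokens, num_kv_heads, num_queries_per_kv,
                                head_size)
print(query.shape, key.shape)   # both torch.Size([3, 2, 4, 4])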
+ return output.view(batch_size, seq_len, hidden_size) + + +def _make_alibi_bias( + alibi_slopes: torch.Tensor, + num_kv_heads: int, + batch_size: int, + seq_len: int, + dtype: torch.dtype, +) -> LowerTriangularMaskWithTensorBias: + bias = torch.arange(seq_len, dtype=dtype) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(prompt_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. + bias = bias[None, :] - bias[:, None] + + # When using custom attention bias, xformers requires the bias to + # be sliced from a tensor whose length is a multiple of 8. + padded_len = (seq_len + 7) // 8 * 8 + num_heads = alibi_slopes.shape[0] + bias = torch.empty( + batch_size, + num_heads, + seq_len, + padded_len, + device=alibi_slopes.device, + dtype=dtype, + )[:, :, :, :seq_len].copy_(bias) + bias.mul_(alibi_slopes[:, None, None]) + if num_heads != num_kv_heads: + bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) + attn_bias = LowerTriangularMaskWithTensorBias(bias) + return attn_bias + + +def _check_use_ref_attention() -> bool: + if not is_hip(): + return False + # For ROCm, check whether flash attention is installed or not. + # if not, use_ref_attention needs to be True + return importlib.util.find_spec("flash_attn") is None + + +def _ref_masked_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + num_heads: int, + num_kv_heads: int, + head_size: int, + scale: float, +) -> torch.Tensor: + query = query.view(-1, num_heads, head_size) + key = key.view(-1, num_kv_heads, head_size) + value = value.view(-1, num_kv_heads, head_size) + + seq_len, _, _ = query.shape + attn_mask = torch.triu(torch.ones(seq_len, + seq_len, + dtype=query.dtype, + device=query.device), + diagonal=1) + attn_mask = attn_mask * torch.finfo(query.dtype).min + + attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float() + attn_weights = attn_weights + attn_mask.float() + attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) + out = torch.einsum("hqk,khd->qhd", attn_weights, value) + return out diff --git a/vllm/model_executor/layers/attention/ops/__init__.py b/vllm/model_executor/layers/attention/ops/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/model_executor/layers/attention/ops/paged_attn.py b/vllm/model_executor/layers/attention/ops/paged_attn.py new file mode 100644 index 0000000000000..c5a9618c2395b --- /dev/null +++ b/vllm/model_executor/layers/attention/ops/paged_attn.py @@ -0,0 +1,138 @@ +from typing import List, Optional + +import torch + +from vllm._C import cache_ops +from vllm._C import ops +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.attention.ops.prefix_prefill import ( + context_attention_fwd) + +# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. 
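The `_make_alibi_bias` helper above builds a per-head bias from relative positions scaled by the ALiBi slopes, plus padding so xformers can slice from a length that is a multiple of 8. A minimal sketch of the core computation, leaving out the padding step; the slope values are arbitrary:

import torch

# Relative-position bias as in _make_alibi_bias, without the pad-to-8 step.
seq_len = 4
slopes = torch.tensor([0.5, 0.25])                # two heads
pos = torch.arange(seq_len, dtype=torch.float32)
bias = pos[None, :] - pos[:, None]                # (seq_len, seq_len), j - i
bias = bias[None, :, :] * slopes[:, None, None]   # (num_heads, seq_len, seq_len)
print(bias[0])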
+_PARTITION_SIZE = 512 + + +class PagedAttentionImpl: + + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [64, 80, 96, 112, 128, 256] + + @staticmethod + def reshape_and_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + input_metadata: InputMetadata, + ) -> None: + cache_ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + input_metadata.slot_mapping.flatten(), + input_metadata.kv_cache_dtype, + ) + + @staticmethod + def forward_decode( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + input_metadata: InputMetadata, + num_kv_heads: int, + scale: float, + alibi_slopes: Optional[torch.Tensor], + ) -> torch.Tensor: + output = torch.empty_like(query) + + block_size = value_cache.shape[3] + num_seqs, num_heads, head_size = query.shape + max_num_partitions = ( + (input_metadata.max_context_len + _PARTITION_SIZE - 1) // + _PARTITION_SIZE) + # NOTE(woosuk): We use a simple heuristic to decide whether to use + # PagedAttention V1 or V2. If the number of partitions is 1, we use + # V1 to avoid the overhead of reduction. Also, if the number of + # sequences or heads is large, we use V1 since there is enough work + # to parallelize. + # TODO(woosuk): Tune this heuristic. + # For context len > 8192, use V2 kernel to avoid shared memory shortage. + use_v1 = input_metadata.max_context_len <= 8192 and ( + max_num_partitions == 1 or num_seqs * num_heads > 512) + if use_v1: + # Run PagedAttention V1. + ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + input_metadata.block_tables, + input_metadata.context_lens, + block_size, + input_metadata.max_context_len, + alibi_slopes, + input_metadata.kv_cache_dtype, + ) + else: + # Run PagedAttention V2. 
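The heuristic above picks PagedAttention V1 when a single partition suffices or there is already enough parallel work across sequences and heads, and forces V2 for very long contexts. Restated as a standalone check; the helper name and sample values are illustrative.

# The V1/V2 dispatch above as a standalone check.
_PARTITION_SIZE = 512

def use_paged_attention_v1(max_context_len: int, num_seqs: int,
                           num_heads: int) -> bool:
    max_num_partitions = ((max_context_len + _PARTITION_SIZE - 1)
                          // _PARTITION_SIZE)
    return max_context_len <= 8192 and (max_num_partitions == 1
                                        or num_seqs * num_heads > 512)

print(use_paged_attention_v1(400, num_seqs=8, num_heads=32))    # True
print(use_paged_attention_v1(9000, num_seqs=8, num_heads=32))   # False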
+ assert _PARTITION_SIZE % block_size == 0 + tmp_output = torch.empty( + size=(num_seqs, num_heads, max_num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, max_num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + input_metadata.block_tables, + input_metadata.context_lens, + block_size, + input_metadata.max_context_len, + alibi_slopes, + input_metadata.kv_cache_dtype, + ) + return output + + @staticmethod + def forward_prefix( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + input_metadata: InputMetadata, + alibi_slopes: Optional[torch.Tensor], + ) -> torch.Tensor: + output = torch.empty_like(query) + context_attention_fwd( + query, + key, + value, + output, + key_cache, + value_cache, + input_metadata.block_tables, # [BS, max_block_per_request] + input_metadata.start_loc, + input_metadata.prompt_lens, + input_metadata.context_lens, + input_metadata.max_seq_len, + alibi_slopes, + ) + return output diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/attention/ops/prefix_prefill.py similarity index 100% rename from vllm/model_executor/layers/triton_kernel/prefix_prefill.py rename to vllm/model_executor/layers/attention/ops/prefix_prefill.py diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 08e3c2d5b706e..3e6dd0dfe2eb3 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -30,9 +30,10 @@ def fused_moe_kernel( K, EM, num_valid_tokens, - # The stride variables represent how much to increase the ptr by when moving by 1 - # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr` - # by to get the element one row down (A has M rows). + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). stride_am, stride_ak, stride_be, @@ -50,17 +51,30 @@ def fused_moe_kernel( compute_type: tl.constexpr, ): """ - Implements the fused computation for a Mixture of Experts (MOE) using token and expert matrices. + Implements the fused computation for a Mixture of Experts (MOE) using + token and expert matrices. Key Parameters: - - A: The input tensor representing tokens with shape (*, K), where '*' can be any shape representing batches and K is the feature dimension of each token. - - B: The stacked MOE weight tensor with shape (E, N, K), where E is the number of experts, K is the input feature dimension, and N is the output feature dimension. - - C: The output cache tensor with shape (M, topk, N), where M is the total number of tokens post padding, topk is the number of times each token is repeated, - and N is the output feature dimension. - - sorted_token_ids: A tensor containing the sorted indices of tokens, repeated topk times and arranged by the expert index they are assigned to. - - expert_ids: A tensor containing the indices of the expert for each block. It determines which expert matrix from B should be used for each block in A. 
- This kernel performs the multiplication of a token by its corresponding expert matrix as determined by `expert_ids`. The sorting of `sorted_token_ids` - by expert index and padding ensures divisibility by BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix multiplication across different blocks processed by the same expert. + - A: The input tensor representing tokens with shape (*, K), where '*' can + be any shape representing batches and K is the feature dimension of + each token. + - B: The stacked MOE weight tensor with shape (E, N, K), where E is + the number of experts, K is the input feature dimension, and N is + the output feature dimension. + - C: The output cache tensor with shape (M, topk, N), where M is the + total number of tokens post padding, topk is the number of times + each token is repeated, and N is the output feature dimension. + - sorted_token_ids: A tensor containing the sorted indices of tokens, + repeated topk times and arranged by the expert index they are + assigned to. + - expert_ids: A tensor containing the indices of the expert for each + block. It determines which expert matrix from B should be used for + each block in A. + This kernel performs the multiplication of a token by its corresponding + expert matrix as determined by `expert_ids`. The sorting of + `sorted_token_ids` by expert index and padding ensures divisibility by + BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix + multiplication across different blocks processed by the same expert. """ # ----------------------------------------------------------- # Map program ids `pid` to the block of C it should compute. @@ -105,7 +119,8 @@ def fused_moe_kernel( accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): - # Load the next block of A and B, generate a mask by checking the K dimension. + # Load the next block of A and B, generate a mask by checking the + # K dimension. a = tl.load(a_ptrs, mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K), @@ -139,30 +154,41 @@ def moe_align_block_size( topk_ids: torch.Tensor, block_size: int, num_experts: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ - Aligns the token distribution across experts to be compatible with block size for matrix multiplication. + Aligns the token distribution across experts to be compatible with block + size for matrix multiplication. Parameters: - - topk_ids: A tensor of shape [total_tokens, top_k] representing the top-k expert indices for each token. + - topk_ids: A tensor of shape [total_tokens, top_k] representing the + top-k expert indices for each token. - block_size: The block size used in block matrix multiplication. - num_experts: The total number of experts. Returns: - - sorted_token_ids: A tensor containing the sorted token indices according to their allocated expert. + - sorted_token_ids: A tensor containing the sorted token indices according + to their allocated expert. - expert_ids: A tensor indicating the assigned expert index for each block. - - num_tokens_post_padded: The total number of tokens after padding, ensuring divisibility by block_size. + - num_tokens_post_padded: The total number of tokens after padding, + ensuring divisibility by block_size. - This function pads the number of tokens that each expert needs to process so that it is divisible by block_size. - Padding ensures that during block matrix multiplication, the dimensions align correctly. 
+ This function pads the number of tokens that each expert needs to process + so that it is divisible by block_size. + Padding ensures that during block matrix multiplication, the dimensions + align correctly. Example: - Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], block_size = 4, and num_experts = 4: - - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts, with each expert needing to process 3 tokens. + Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], + block_size = 4, and num_experts = 4: + - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts, + with each expert needing to process 3 tokens. - As block_size is 4, we pad 1 token for each expert. - First, flatten topk_ids to [2, 3, 4, 1, 2, 4, 1, 3, 4, 1, 2, 3]. - Then append padding tokens [12, 12, 12, 12] for each block. - - After sorting by expert index, we obtain token_ids [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]. - Tokens 12 are non-existent (padding) and are ignored in the subsequent matrix multiplication. - - The padding ensures that the total number of tokens is now divisible by block_size for proper block matrix operations. + - After sorting by expert index, we obtain token_ids + [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]. + Tokens 12 are non-existent (padding) and are ignored in + the subsequent matrix multiplication. + - The padding ensures that the total number of tokens is now divisible + by block_size for proper block matrix operations. """ sorted_ids = torch.empty( (topk_ids.numel() + num_experts * (block_size - 1), ), @@ -224,13 +250,14 @@ def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: """ Return optimized configurations for the fused MoE kernel. - The return value will be a dictionary that maps an irregular grid of batch sizes - to configurations of the fused_moe kernel. To evaluate the kernel on a given batch - size bs, the closest batch size in the grid should be picked and the associated - configuration chosen to invoke the kernel. + The return value will be a dictionary that maps an irregular grid of + batch sizes to configurations of the fused_moe kernel. To evaluate the + kernel on a given batch size bs, the closest batch size in the grid should + be picked and the associated configuration chosen to invoke the kernel. """ - # First look up if an optimized configuration is available in the configs directory + # First look up if an optimized configuration is available in the configs + # directory device_name = torch.cuda.get_device_name().replace(" ", "_") config_file_path = os.path.join( @@ -243,7 +270,8 @@ def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: # If a configuration has been found, return it return {int(key): val for key, val in json.load(f).items()} - # If no optimized configuration is available, we will use the default configuration + # If no optimized configuration is available, we will use the default + # configuration return None @@ -258,18 +286,22 @@ def fused_moe( override_config: Optional[Dict[str, Any]] = None, ) -> torch.Tensor: """ - This function computes a Mixture of Experts (MoE) layer using two sets of weights, w1 and w2, and top-k gating mechanism. - + This function computes a Mixture of Experts (MoE) layer using two sets of + weights, w1 and w2, and top-k gating mechanism. + Parameters: - hidden_states (torch.Tensor): The input tensor to the MoE layer. - w1 (torch.Tensor): The first set of expert weights. 
- w2 (torch.Tensor): The second set of expert weights. - - gating_output (torch.Tensor): The output of the gating operation (before softmax). + - gating_output (torch.Tensor): The output of the gating operation + (before softmax). - topk (int): The number of top-k experts to select. - renormalize (bool): If True, renormalize the top-k weights to sum to 1. - - inplace (bool): If True, perform the operation in-place. Defaults to False. - - override_config (Optional[Dict[str, Any]]): Optional override for the kernel configuration. - + - inplace (bool): If True, perform the operation in-place. + Defaults to False. + - override_config (Optional[Dict[str, Any]]): Optional override + for the kernel configuration. + Returns: - torch.Tensor: The output tensor after applying the MoE layer. """ @@ -325,7 +357,8 @@ def fused_moe( configs = get_moe_configs(E, w2.shape[2]) if configs: - # If an optimal configuration map has been found, look up the optimal config + # If an optimal configuration map has been found, look up the + # optimal config config = configs[min(configs.keys(), key=lambda x: abs(x - M))] else: # Else use the default config diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 45f6ea8438043..8d26271bd60a5 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -293,7 +293,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -315,7 +316,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -431,7 +433,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -460,7 +463,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. 
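Back in the fused-MoE hunk above, the `moe_align_block_size` docstring walks through an example with `topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]]` and `block_size = 4`. That example can be reproduced with a short pure-Python sketch; the `align` helper is hypothetical, and the real function returns tensors plus the padded token count, with 0-indexed experts.

# Reproducing the moe_align_block_size docstring example in pure Python.
def align(topk_ids, block_size):
    flat = [e for row in topk_ids for e in row]
    sentinel = len(flat)                  # id of the non-existent pad token
    sorted_ids, expert_ids = [], []
    for expert in sorted(set(flat)):
        idxs = [i for i, e in enumerate(flat) if e == expert]
        idxs += [sentinel] * ((-len(idxs)) % block_size)
        sorted_ids += idxs
        expert_ids += [expert] * (len(idxs) // block_size)
    return sorted_ids, expert_ids

print(align([[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], block_size=4))
# ([3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12], [1, 2, 3, 4])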
shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index b05790b339243..1891ed4c2b8ee 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -2,7 +2,8 @@ from typing import Type -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 3e1c814dd233c..2caef5f1ebf50 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -6,7 +6,8 @@ from vllm._C import ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) class AWQConfig(QuantizationConfig): @@ -50,7 +51,8 @@ def get_min_capability(self) -> int: def get_config_filenames() -> List[str]: return [ "quant_config.json", # E.g., casperhansen/vicuna-7b-v1.5-awq - "quantize_config.json", # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq + # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq + "quantize_config.json", ] @classmethod diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 2e6aabb232673..bb69c7235a133 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -31,8 +31,8 @@ def __init__( self.pack_factor = Fraction(32, self.weight_bits) if self.weight_bits not in [2, 3, 4, 8]: raise ValueError( - "Currently, only 2/3/4/8-bit weight quantization is supported for " - f"GPTQ, but got {self.weight_bits} bits.") + "Currently, only 2/3/4/8-bit weight quantization is " + f"supported for GPTQ, but got {self.weight_bits} bits.") def __repr__(self) -> str: return (f"GPTQConfig(weight_bits={self.weight_bits}, " @@ -101,7 +101,8 @@ def create_weights( "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " "tensor parallel size.") - if output_size_per_partition % self.quant_config.pack_factor.numerator != 0: + if (output_size_per_partition % self.quant_config.pack_factor.numerator + != 0): raise ValueError( "The output size is not aligned with the quantized " "weight shape. 
This can be caused by too large " @@ -114,7 +115,8 @@ def create_weights( exllama_state = ExllamaState.UNINITIALIZED scale_and_zero_size = input_size // group_size scale_and_zero_input_dim = None - if input_size != input_size_per_partition and self.quant_config.group_size != -1: + if (input_size != input_size_per_partition + and self.quant_config.group_size != -1): # For act-order models, we cannot use Exllama for row parallel layer if self.quant_config.desc_act: exllama_state = ExllamaState.UNUSED diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 96800759b4a25..0c4f20d9e3a58 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -5,7 +5,8 @@ from vllm._C import ops from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) class MarlinConfig(QuantizationConfig): @@ -22,8 +23,9 @@ def __init__( self.group_size = group_size if self.group_size != 128 and self.group_size != -1: raise ValueError( - "Currently, only group size 128 and -1 (channelwise) is supported for " - f"Marlin, but got group_size of {self.group_size}") + "Currently, only group size 128 and -1 (channelwise) " + "is supported for Marlin, but got group_size of " + f"{self.group_size}") # 4 Bits packed into 32 bit datatype. self.pack_factor = 32 // 4 @@ -37,7 +39,8 @@ def __init__( # Min in_features dim self.min_k_threads = 128 - # Max parallel problems to solve at once (improves large batch performance) + # Max parallel problems to solve at once (improves large + # batch performance) self.max_parallel = 16 # Permutation length used by the marlin kernels. @@ -52,9 +55,10 @@ def get_name(cls) -> str: @classmethod def get_supported_act_dtypes(cls) -> List[torch.dtype]: - return [torch.float16] + return [torch.half] @classmethod + # Need to figure it out def get_min_capability(cls) -> int: return 80 @@ -92,6 +96,8 @@ def create_weights( output_size: int, params_dtype: torch.dtype, ) -> Dict[str, Any]: + del output_size # Unused. + if params_dtype != torch.float16: raise ValueError( f"The params dtype must be float16, but got {params_dtype}") @@ -99,22 +105,26 @@ def create_weights( # Validate output_size_per_partition if output_size_per_partition % self.quant_config.min_n_threads != 0: raise ValueError( - f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by min_n_threads = {self.quant_config.min_n_threads}." - ) + f"Weight output_size_per_partition = " + f"{output_size_per_partition} is not divisible by " + f"min_n_threads = {self.quant_config.min_n_threads}.") if output_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( - f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by pack_factor = {self.quant_config.pack_factor}." - ) + f"Weight output_size_per_partition = " + f"{output_size_per_partition} is not divisible by " + f"pack_factor = {self.quant_config.pack_factor}.") # Validate input_size_per_partition if input_size_per_partition % self.quant_config.min_k_threads != 0: raise ValueError( - f"Weight input_size_per_partition = {input_size_per_partition} is not divisible by min_k_threads = {self.quant_config.min_k_threads}." 
- ) - if self.quant_config.group_size != -1 and input_size_per_partition % self.quant_config.group_size != 0: - raise ValueError( - f"Weight input_size_per_partition = f{input_size_per_partition} is not divisible by group_size = {self.quant_config.group_size}." - ) + f"Weight input_size_per_partition = " + f"{input_size_per_partition} is not divisible by " + f"min_k_threads = {self.quant_config.min_k_threads}.") + if (self.quant_config.group_size != -1 and + input_size_per_partition % self.quant_config.group_size != 0): + raise ValueError(f"Weight input_size_per_partition = " + f"{input_size_per_partition} is not divisible by " + f"group_size = {self.quant_config.group_size}.") # Check that we have at least 4 tiles horizontally in the shard num_tiles_per_perm = self.quant_config.perm_len // ( @@ -146,7 +156,9 @@ def create_weights( ) # Determine if channelwise or not - input_groups = 1 if self.quant_config.group_size == -1 else input_size_per_partition // self.quant_config.group_size + input_groups = (1 if self.quant_config.group_size == -1 else + input_size_per_partition // + self.quant_config.group_size) scales = Parameter( torch.empty( diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index 9244e88552756..ed25455e6ec1f 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -6,7 +6,8 @@ from vllm._C import ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.utils import is_hip diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 3e1cfc783b8ef..5643454060251 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -21,8 +21,6 @@ def __init__(self, strict_mode: bool = False): nontrivial latency. 
""" super().__init__() - self.probs_dtype = torch.float32 - self.token_id_dtype = torch.int64 self._strict_mode = strict_mode # NOTE: A "bonus token" is accepted iff all proposal tokens are @@ -44,6 +42,14 @@ def init_gpu_tensors(self, rank: int) -> None: dtype=torch.long, device=device) + @property + def probs_dtype(self): + return torch.float32 + + @property + def token_id_dtype(self): + return torch.int64 + def forward( self, target_probs: torch.Tensor, diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index b48dde0318d09..4377b845df628 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -6,7 +6,8 @@ from vllm.model_executor.parallel_utils.communication_op import ( tensor_model_parallel_gather) -from vllm.model_executor.sampling_metadata import SamplingMetadata, SamplingTensors +from vllm.model_executor.sampling_metadata import (SamplingMetadata, + SamplingTensors) from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import (Logprob, PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceData, SequenceGroupOutput, @@ -516,7 +517,6 @@ def _get_logprobs( if (i < sampling_metadata.num_prompts and sampling_params.prompt_logprobs is not None): num_logprobs = sampling_params.prompt_logprobs - prompt_len = sampling_metadata.prompt_lens[i] prompt_tokens = sampling_metadata.seq_data[ seq_ids[0]].prompt_token_ids group_prompt_logprobs: PromptLogprobs = [None] @@ -588,4 +588,4 @@ def _build_sampler_output( SequenceOutput(seq_ids[parent_id], next_token_id, logprobs)) sampler_output.append( SequenceGroupOutput(seq_outputs, group_prompt_logprobs)) - return sampler_output + return SamplerOutput(outputs=sampler_output) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 550dec6487f9e..cbf472750e294 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -27,7 +27,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -151,10 +151,10 @@ def __init__( alibi_slopes = alibi_slopes[head_start:head_end].tolist() scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scaling, - alibi_slopes=alibi_slopes) + self.attn = Attention(self.num_heads, + self.head_dim, + scaling, + alibi_slopes=alibi_slopes) else: self.rotary_emb = get_rope( self.head_dim, @@ -163,8 +163,7 @@ def __init__( base=self.rope_theta, ) self.scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, self.head_dim, - self.scaling) + self.attn = Attention(self.num_heads, self.head_dim, self.scaling) def forward( self, @@ -334,7 +333,8 @@ def load_weights(self, if "rotary_emb.inv_freq" in name: continue if name == "lm_head.weight": - # Unlike Baichuan, Baichuan2 normalizes the head weights. Refer to: + # Unlike Baichuan, Baichuan2 normalizes the head weights. + # Refer to: # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/84603cde5ebffb6084e476cfaeceaf0b8b91fe54/modeling_baichuan.py#L508 # Distinguish between Baichuan and Baichuan2 by checking the # vocab size. 
This is suggested by diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 4adfb6b78102f..0548b2b140b1b 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -107,10 +107,10 @@ def __init__( alibi_slopes = alibi_slopes[head_start:head_end].tolist() scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scaling, - alibi_slopes=alibi_slopes) + self.attn = Attention(self.num_heads, + self.head_dim, + scaling, + alibi_slopes=alibi_slopes) def forward( self, diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index dca8d724f976b..1c5dcfacaff2b 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -10,7 +10,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -87,7 +87,7 @@ def __init__( base=10000 * rope_ratio, is_neox_style=False, ) - self.attn = PagedAttention( + self.attn = Attention( self.num_heads, self.head_dim, self.scaling, diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 6dba952736921..13c080cb02774 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -29,7 +29,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, @@ -119,7 +119,8 @@ def __init__( linear_method=None) if config.n_shared_experts is not None: - intermediate_size = config.moe_intermediate_size * config.n_shared_experts + intermediate_size = (config.moe_intermediate_size * + config.n_shared_experts) self.shared_experts = DeepseekMLP( hidden_size=config.hidden_size, intermediate_size=intermediate_size, @@ -229,10 +230,10 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) def forward( self, @@ -273,8 +274,9 @@ def __init__( max_position_embeddings=max_position_embeddings, linear_method=linear_method, ) - if (config.n_routed_experts is not None and \ - layer_idx >= config.first_k_dense_replace and layer_idx % config.moe_layer_freq == 0): + if (config.n_routed_experts is not None + and layer_idx >= config.first_k_dense_replace + and layer_idx % config.moe_layer_freq == 0): self.mlp = DeepseekMoE(config=config, 
linear_method=linear_method) else: self.mlp = DeepseekMLP( diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 2b5e022312e3b..3c148be5b10f4 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -28,7 +28,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -150,10 +150,10 @@ def __init__( max_position=max_position_embeddings, base=rope_theta, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.inv_norm_factor, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.inv_norm_factor, + num_kv_heads=self.num_kv_heads) elif self.use_alibi: tp_rank = get_tensor_model_parallel_rank() head_start = tp_rank * self.num_heads @@ -161,16 +161,16 @@ def __init__( alibi_slopes = (_get_alibi_slopes(self.total_num_heads) * self.inv_norm_factor) alibi_slopes = alibi_slopes[head_start:head_end].tolist() - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.inv_norm_factor, - num_kv_heads=self.num_kv_heads, - alibi_slopes=alibi_slopes) + self.attn = Attention(self.num_heads, + self.head_dim, + self.inv_norm_factor, + num_kv_heads=self.num_kv_heads, + alibi_slopes=alibi_slopes) else: - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.inv_norm_factor, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + scale=self.inv_norm_factor, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 03948132d32c3..386a36cf492d6 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -23,7 +23,7 @@ from vllm.config import LoRAConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import GeluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -123,10 +123,10 @@ def __init__(self, base=self.rope_theta, is_neox_style=True, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) def forward( self, @@ -325,11 +325,17 @@ def load_weights(self, if shard_name not in name: continue name = name.replace(shard_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue # GemmaRMSNorm is different from Llama's in that it multiplies # (1 + weight) to the output, instead of just weight. 
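The comment above notes that GemmaRMSNorm scales by `(1 + weight)` rather than `weight`, which is why the loader treats norm weights specially. A small sketch of the consequence: a stored Gemma norm weight `w` behaves like a Llama-style RMSNorm weight of `1 + w`. The `rms_norm` helper below is illustrative, not vLLM's implementation.

import torch

def rms_norm(x, weight, eps=1e-6, gemma_style=False):
    # Normalize, then scale by (1 + weight) for Gemma or weight for Llama.
    x = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    return x * (1 + weight) if gemma_style else x * weight

x = torch.randn(2, 8)
w = torch.zeros(8)   # freshly initialized Gemma norm weight
print(torch.allclose(rms_norm(x, w, gemma_style=True), rms_norm(x, w + 1)))
# True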
if "norm.weight" in name: diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 661da0fe0434e..3f7b21e5a4133 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -73,9 +73,7 @@ def __init__( bias=True, linear_method=linear_method, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scale) + self.attn = Attention(self.num_heads, self.head_dim, scale=self.scale) def forward( self, diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index ef4c1d4143c88..5c30d47d93e36 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -26,7 +26,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -85,10 +85,10 @@ def __init__( bias=True, linear_method=linear_method, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scale, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + scale=self.scale, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 5bab30d9d442e..93dce7b67a7a5 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -24,7 +24,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -86,7 +86,7 @@ def __init__( base=rope_theta, is_neox_style=False, ) - self.attn = PagedAttention(self.num_heads, self.head_size, scaling) + self.attn = Attention(self.num_heads, self.head_size, scaling) def forward( self, @@ -143,7 +143,8 @@ def __init__( linear_method: Optional[LinearMethodBase] = None, ): super().__init__() - inner_dim = 4 * config.n_embd if config.n_inner is None else config.n_inner + inner_dim = (4 * config.n_embd + if config.n_inner is None else config.n_inner) self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.attn = GPTJAttention(config, linear_method) self.mlp = GPTJMLP(inner_dim, config, linear_method) diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 8f7e1063e0c1d..98107350e60b9 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -24,7 +24,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from 
vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -87,7 +87,7 @@ def __init__( max_position=max_position_embeddings, base=rope_theta, ) - self.attn = PagedAttention(self.num_heads, self.head_size, scaling) + self.attn = Attention(self.num_heads, self.head_size, scaling) def forward( self, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index ebf1d8a89a022..7b2215ef4bda5 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -7,7 +7,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -114,10 +114,10 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) def forward( self, @@ -305,7 +305,8 @@ def load_weights(self, param = params_dict[name] if "wqkv" in name: config = self.config - kv_groups = config.num_attention_heads // config.num_key_value_heads + kv_groups = (config.num_attention_heads // + config.num_key_value_heads) head_dim = config.hidden_size // config.num_attention_heads loaded_weight = loaded_weight.view(-1, 2 + kv_groups, head_dim, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d35887cc0f6a3..4c163dfdab537 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -30,7 +30,7 @@ from vllm.config import LoRAConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -139,11 +139,11 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=sliding_window) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=sliding_window) def forward( self, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 0100624a44d78..d47834e519697 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -29,7 +29,7 @@ from vllm.config import LoRAConfig from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, @@ -197,7 +197,7 @@ def __init__(self, base=int(self.rope_theta), is_neox_style=True, ) - self.attn = PagedAttention( + self.attn = Attention( self.num_heads, self.head_dim, 
self.scaling, diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index a8dadce24aa1d..25c7f1978c0dc 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -32,7 +32,7 @@ from transformers import MixtralConfig from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, ReplicatedLinear, @@ -214,7 +214,7 @@ def __init__(self, base=int(self.rope_theta), is_neox_style=True, ) - self.attn = PagedAttention( + self.attn = Attention( self.num_heads, self.head_dim, self.scaling, diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 22a876e2ef691..16ecac3d0529a 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -8,7 +8,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -105,11 +105,11 @@ def __init__( self.head_dim = self.d_model // self.total_num_heads scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scaling, - alibi_slopes=alibi_slopes, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + scaling, + alibi_slopes=alibi_slopes, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 9d563039208c8..2b0a420e82faf 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -43,7 +43,7 @@ from torch import nn from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, LinearMethodBase, @@ -52,7 +52,8 @@ ) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_world_size, ) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -81,7 +82,8 @@ def output_multiplier(self) -> float: class OlmoAttention(nn.Module): """ - This is the attention block where the output is computed as ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + This is the attention block where the output is computed as + ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). 
""" @@ -94,11 +96,12 @@ def __init__( self.config = config self.hidden_size = config.d_model assert config.d_model % config.n_heads == 0 - tensor_model_parallel_world_size = get_tensor_model_parallel_world_size( - ) + tensor_model_parallel_world_size = ( + get_tensor_model_parallel_world_size()) self.total_num_heads = self.config.n_heads assert self.total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = self.total_num_heads // tensor_model_parallel_world_size + self.num_heads = (self.total_num_heads // + tensor_model_parallel_world_size) self.head_dim = self.hidden_size // self.total_num_heads # Layer norms. @@ -126,9 +129,9 @@ def __init__( base=rope_theta, ) self.scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scaling) + self.attn = Attention(self.num_heads, + self.head_dim, + scale=self.scaling) # Attention output projection. self.attn_out = RowParallelLinear( @@ -158,7 +161,8 @@ def forward( class OlmoMLP(nn.Module): """ - This is the MLP block where the output is computed as ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + This is the MLP block where the output is computed as + ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). """ @@ -217,7 +221,8 @@ def forward( class OlmoBlock(nn.Module): """ - This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))`` + This is a typical transformer block where the output is + computed as ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). """ diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 393b2dcabcd5a..782f43ce265bd 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -89,9 +89,9 @@ def __init__( bias=bias, linear_method=linear_method, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scaling) + self.attn = Attention(self.num_heads, + self.head_dim, + scale=self.scaling) def forward( self, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 0b067d4fc8802..6039b1cdc3534 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -12,7 +12,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, QKVParallelLinear, @@ -118,10 +118,10 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index d143261968288..039dc7a9b7675 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -43,7 +43,7 @@ from 
vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -108,7 +108,7 @@ def __init__(self, max_position=max_position_embeddings, base=rope_theta, ) - self.attn = PagedAttention(self.num_heads, self.head_size, scaling) + self.attn = Attention(self.num_heads, self.head_size, scaling) def forward( self, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 37af84c7cd53f..d4d5a4e8bb9a5 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -12,7 +12,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -104,7 +104,7 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling) + self.attn = Attention(self.num_heads, self.head_dim, self.scaling) def forward( self, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index e823e6f8c3dbe..3e4f843e649b4 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -30,7 +30,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -46,6 +46,7 @@ from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput +from vllm.config import LoRAConfig KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -135,11 +136,11 @@ def __init__(self, max_position=max_position, base=self.rope_theta, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=self.sliding_window) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=self.sliding_window) def forward( self, @@ -169,7 +170,8 @@ def __init__( self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 1000000) - use_sliding_window = config.use_sliding_window and layer_idx < config.max_window_layers + use_sliding_window = (config.use_sliding_window + and layer_idx < config.max_window_layers) self.self_attn = Qwen2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -264,12 +266,35 @@ def forward( class Qwen2ForCausalLM(nn.Module): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + ] + embedding_modules = {} + 
embedding_padding_modules = [] def __init__( self, config: Qwen2Config, linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, ) -> None: + del lora_config super().__init__() self.config = config self.linear_method = linear_method diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 44c57e5a6d4f9..c66f327beee7a 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -1,5 +1,6 @@ # coding=utf-8 -# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. +# All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +17,8 @@ # This code is based off the following work: # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/modeling_stablelm_epoch.py # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json -"""Inference-only StabeLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights.""" +"""Inference-only StabeLM (https://github.com/Stability-AI/StableLM) +model compatible with HuggingFace weights.""" from typing import List, Optional, Tuple import torch @@ -25,7 +27,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, QKVParallelLinear, @@ -102,9 +104,9 @@ def __init__(self, self.kv_size = self.num_key_value_heads * self.head_dim self.qkv_bias = getattr(config, "use_qkv_bias", False) if (self.head_dim * self.num_heads * tp_size) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads}).") + raise ValueError(f"hidden_size must be divisible by num_heads " + f"(got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads}).") self.qkv_proj = QKVParallelLinear(self.hidden_size, self.head_dim, @@ -122,10 +124,10 @@ def __init__(self, max_position=self.config.max_position_embeddings, base=self.config.rope_theta, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_key_value_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_key_value_heads) def forward( self, @@ -192,7 +194,6 @@ def __init__(self, config: PretrainedConfig, linear_method: Optional[LinearMethodBase] = None) -> None: super().__init__() - # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 1eda07b724cae..cfbb1bdb7909e 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from 
vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -35,7 +35,8 @@ from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) -from vllm.model_executor.parallel_utils.parallel_state import get_tensor_model_parallel_world_size +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput @@ -103,7 +104,7 @@ def __init__(self, base=int(self.rope_theta), is_neox_style=True, ) - self.attn = PagedAttention( + self.attn = Attention( self.num_heads, self.head_dim, self.scaling, diff --git a/vllm/model_executor/neuron_model_loader.py b/vllm/model_executor/neuron_model_loader.py index b8d63d4ff12fc..c434b270a5562 100644 --- a/vllm/model_executor/neuron_model_loader.py +++ b/vllm/model_executor/neuron_model_loader.py @@ -34,7 +34,8 @@ def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: def get_model(model_config: ModelConfig, device_config: DeviceConfig, **kwargs) -> nn.Module: - from transformers_neuronx.config import NeuronConfig, ContinuousBatchingConfig + from transformers_neuronx.config import (NeuronConfig, + ContinuousBatchingConfig) parallel_config = kwargs.get("parallel_config") scheduler_config = kwargs.get("scheduler_config") diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py index cf805df892fdc..521b6b8a383b0 100644 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ b/vllm/model_executor/parallel_utils/communication_op.py @@ -11,7 +11,8 @@ get_tensor_model_parallel_group, is_cupy_nccl_enabled_for_all_reduce, ) -from vllm.model_executor.parallel_utils.custom_all_reduce import custom_all_reduce +from vllm.model_executor.parallel_utils.custom_all_reduce import ( + custom_all_reduce) def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: @@ -24,7 +25,7 @@ def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: and GPU topology. TLDR: always assume this function modifies its input, but use the return - value as the output. + value as the output. """ # Bypass the function if we are using only 1 GPU. 
if get_tensor_model_parallel_world_size() == 1: diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 7deb80801856e..b23f0170a6ca5 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -114,7 +114,8 @@ def from_sampling_metadata( do_penalties = True if (i < sampling_metadata.num_prompts and sampling_params.prompt_logprobs is not None): - # For tokens in the prompt that we only need to get their logprobs + # For tokens in the prompt that we only need to get + # their logprobs prompt_len = sampling_metadata.prompt_lens[i] temperatures += [temperature] * (prompt_len - 1) top_ps += [top_p] * (prompt_len - 1) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index bbb69786020f0..ea63baa5420c1 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -22,6 +22,9 @@ logger = init_logger(__name__) +_xdg_cache_home = os.getenv('XDG_CACHE_HOME', os.path.expanduser('~/.cache')) +_vllm_filelocks_path = os.path.join(_xdg_cache_home, 'vllm/locks/') + class Disabledtqdm(tqdm): @@ -30,7 +33,8 @@ def __init__(self, *args, **kwargs): def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None): - lock_dir = cache_dir if cache_dir is not None else "/tmp" + lock_dir = cache_dir if cache_dir is not None else _vllm_filelocks_path + os.makedirs(os.path.dirname(lock_dir), exist_ok=True) lock_file_name = model_name_or_path.replace("/", "-") + ".lock" lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name)) return lock diff --git a/vllm/outputs.py b/vllm/outputs.py index a6de2a5a2257b..b8173fd7a0638 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -87,32 +87,33 @@ def __init__( @classmethod def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": - # Get the top-n sequences. - n = seq_group.sampling_params.n seqs = seq_group.get_seqs() - if seq_group.sampling_params.use_beam_search: - sorting_key = lambda seq: seq.get_beam_search_score( - seq_group.sampling_params.length_penalty) + if len(seqs) == 1: + top_n_seqs = seqs else: - sorting_key = lambda seq: seq.get_cumulative_logprob() - sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) - top_n_seqs = sorted_seqs[:n] + # Get the top-n sequences. + n = seq_group.sampling_params.n + if seq_group.sampling_params.use_beam_search: + sorting_key = lambda seq: seq.get_beam_search_score( + seq_group.sampling_params.length_penalty) + else: + sorting_key = lambda seq: seq.get_cumulative_logprob() + sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) + top_n_seqs = sorted_seqs[:n] # Create the outputs. - outputs: List[CompletionOutput] = [] - for seq in top_n_seqs: - logprobs = seq.output_logprobs - if seq_group.sampling_params.logprobs is None: - # NOTE: We need to take care of this case because the sequence - # always has the logprobs of the sampled tokens even if the - # logprobs are not requested. - logprobs = None - finshed_reason = SequenceStatus.get_finished_reason(seq.status) - output = CompletionOutput(seqs.index(seq), seq.output_text, - seq.get_output_token_ids(), - seq.get_cumulative_logprob(), logprobs, - finshed_reason) - outputs.append(output) + # NOTE: We need omit logprobs here explicitly because the sequence + # always has the logprobs of the sampled tokens even if the + # logprobs are not requested. 
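As a quick illustration of the `get_lock` change in the `weight_utils.py` hunk above, here is a minimal, self-contained sketch of the new lock layout; the helper name and the example model id are illustrative, not part of the patch.

```python
import os

import filelock

# Illustrative sketch: one lock file per model under $XDG_CACHE_HOME/vllm/locks
# instead of the world-writable /tmp default.
xdg_cache_home = os.getenv("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
lock_dir = os.path.join(xdg_cache_home, "vllm/locks/")


def get_model_lock(model_name_or_path: str) -> filelock.FileLock:
    os.makedirs(lock_dir, exist_ok=True)
    lock_file_name = model_name_or_path.replace("/", "-") + ".lock"
    return filelock.FileLock(os.path.join(lock_dir, lock_file_name))


with get_model_lock("facebook/opt-125m"):
    pass  # only one local process downloads/converts these weights at a time
```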
+ include_logprobs = seq_group.sampling_params.logprobs + outputs = [ + CompletionOutput(seqs.index(seq), seq.output_text, + seq.get_output_token_ids(), + seq.get_cumulative_logprob(), + seq.output_logprobs if include_logprobs else None, + SequenceStatus.get_finished_reason(seq.status)) + for seq in top_n_seqs + ] # Every sequence in the sequence group should have the same prompt. prompt = seq_group.prompt diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 8103f3c2b24bf..4aa158878fb96 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -74,8 +74,8 @@ class SamplingParams: stop_token_ids: List of tokens that stop the generation when they are generated. The returned output will contain the stop tokens unless the stop tokens are special tokens. - include_stop_str_in_output: Whether to include the stop strings in output - text. Defaults to False. + include_stop_str_in_output: Whether to include the stop strings in + output text. Defaults to False. ignore_eos: Whether to ignore the EOS token and continue generating tokens after the EOS token is generated. max_tokens: Maximum number of tokens to generate per output sequence. diff --git a/vllm/sequence.py b/vllm/sequence.py index a110ab6b748f8..4a002edaf580f 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -2,12 +2,16 @@ import copy import enum from dataclasses import dataclass -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, TYPE_CHECKING from vllm.block import LogicalTokenBlock from vllm.sampling_params import SamplingParams from vllm.lora.request import LoRARequest +if TYPE_CHECKING: + import torch + from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics + @dataclass class Logprob: @@ -81,6 +85,8 @@ class SequenceData: Args: prompt_token_ids: The token IDs of the prompt. + output_token_ids: The token IDs of the output. Set to an empty list if + None. Attributes: prompt_token_ids: The token IDs of the prompt. @@ -91,9 +97,13 @@ class SequenceData: def __init__( self, prompt_token_ids: List[int], + output_token_ids: Optional[List[int]] = None, ) -> None: + if output_token_ids is None: + output_token_ids = [] + self.prompt_token_ids = prompt_token_ids - self.output_token_ids: List[int] = [] + self.output_token_ids = output_token_ids self.cumulative_logprob = 0.0 def append_token_id(self, token_id: int, logprob: float) -> None: @@ -117,6 +127,12 @@ def get_last_token_id(self) -> int: return self.prompt_token_ids[-1] return self.output_token_ids[-1] + def get_prompt_token_ids(self) -> int: + return self.prompt_token_ids + + def get_output_token_ids(self) -> int: + return self.output_token_ids + def __repr__(self) -> str: return (f"SequenceData(" f"prompt_token_ids={self.prompt_token_ids}, " @@ -142,11 +158,13 @@ def __init__( prompt: str, prompt_token_ids: List[int], block_size: int, + eos_token_id: Optional[int] = None, lora_request: Optional[LoRARequest] = None, ) -> None: self.seq_id = seq_id self.prompt = prompt self.block_size = block_size + self.eos_token_id = eos_token_id self.lora_request = lora_request self.data = SequenceData(prompt_token_ids) @@ -173,7 +191,8 @@ def hash_of_block(self, logical_idx: int) -> int: # TODO: The current hashing function is O(L^2). We should optimize # this in the future. 
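The `SequenceData` constructor in the `sequence.py` hunk above now accepts an optional `output_token_ids`. The sketch below (with an illustrative class name) shows the default-to-fresh-list pattern it uses, which avoids the shared-mutable-default pitfall.

```python
from typing import List, Optional


class SequenceDataSketch:
    """Toy mirror of the optional output_token_ids constructor argument."""

    def __init__(self,
                 prompt_token_ids: List[int],
                 output_token_ids: Optional[List[int]] = None) -> None:
        if output_token_ids is None:
            output_token_ids = []  # fresh list per instance, not a shared default
        self.prompt_token_ids = prompt_token_ids
        self.output_token_ids = output_token_ids


prefill_seq = SequenceDataSketch([1, 2, 3])
scored_seq = SequenceDataSketch([1, 2, 3], output_token_ids=[7, 8])
prefill_seq.output_token_ids.append(9)
assert scored_seq.output_token_ids == [7, 8]  # no cross-instance leakage
```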
num_tokens = self.num_hashed_tokens_of_block(logical_idx) - return hash(tuple(self.data.get_token_ids()[0:num_tokens])) + return hash( + (tuple(self.data.get_token_ids()[0:num_tokens]), self.lora_int_id)) def num_hashed_tokens_of_block(self, logical_idx: int): return logical_idx * self.block_size + self.block_size @@ -332,7 +351,8 @@ def maybe_set_first_token_time(self, time: float) -> None: self.metrics.first_token_time = time def maybe_set_first_scheduled_time(self, time: float) -> None: - """Sets the first scheduled time and time in queue for Request level timings.""" + """Sets the first scheduled time and time in queue for Request + level timings.""" if self.metrics.first_scheduled_time is None: self.metrics.first_scheduled_time = time self.metrics.time_in_queue = time - self.metrics.arrival_time @@ -362,12 +382,9 @@ def get_seqs( self, status: Optional[SequenceStatus] = None, ) -> List[Sequence]: - if status is None: - return list(self.seqs_dict.values()) - else: - return [ - seq for seq in self.seqs_dict.values() if seq.status == status - ] + return list(self.seqs_dict.values()) if status is None else [ + seq for seq in self.seqs_dict.values() if seq.status == status + ] def get_unfinished_seqs(self) -> List[Sequence]: return [ @@ -506,6 +523,35 @@ def __eq__(self, other: object) -> bool: and self.prompt_logprobs == other.prompt_logprobs) -# For each sequence group, we generate a list of SequenceOutput object, -# each of which contains one possible candidate for the next token. -SamplerOutput = List[SequenceGroupOutput] +@dataclass +class SamplerOutput: + """For each sequence group, we generate a list of SequenceOutput object, + each of which contains one possible candidate for the next token. + + This datastructure implements methods so it can be used like a list, but + also has optional fields for device tensors. + """ + + outputs: List[SequenceGroupOutput] + + # On-device tensor containing probabilities of each token. + sampled_token_probs: Optional["torch.Tensor"] = None + + # On-device tensor containing the sampled token ids. + sampled_token_ids: Optional["torch.Tensor"] = None + + # Spec decode metrics populated by workers. + spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None + + def __getitem__(self, idx: int): + return self.outputs[idx] + + def __setitem__(self, idx: int, value): + self.outputs[idx] = value + + def __len__(self): + return len(self.outputs) + + def __eq__(self, other: object): + return isinstance(other, + self.__class__) and self.outputs == other.outputs diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py new file mode 100644 index 0000000000000..0f698fa346010 --- /dev/null +++ b/vllm/spec_decode/batch_expansion.py @@ -0,0 +1,358 @@ +from typing import Iterator, List, Tuple, Optional, Dict +from itertools import chain, count + +import torch + +from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, SequenceData) +from vllm.worker.worker import Worker +from vllm.spec_decode.util import (nvtx_range, sampler_output_to_torch, + get_all_seq_ids, + split_batch_by_proposal_len) +from vllm.spec_decode.interfaces import (SpeculativeScorer, + SpeculativeProposals, + SpeculativeScores) + +SeqId = int +TargetSeqId = int +TokenId = int + + +class BatchExpansionTop1Scorer(SpeculativeScorer): + """Implements a speculative scorer that uses batch expansion to get + probabilities of speculative tokens according to the scoring model. 
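Since `SamplerOutput` changes above from a plain list alias to a dataclass that still behaves like a list, here is a toy mirror of that shape (names are illustrative), showing why existing indexing and `len()` callers keep working while speculative decoding can attach optional device tensors.

```python
from dataclasses import dataclass
from typing import List, Optional

import torch


@dataclass
class SamplerOutputSketch:
    """Toy mirror of the list-like SamplerOutput wrapper."""
    outputs: List[str]                                 # per-sequence-group outputs
    sampled_token_ids: Optional[torch.Tensor] = None   # optional device tensor

    def __getitem__(self, idx):
        return self.outputs[idx]

    def __len__(self):
        return len(self.outputs)


out = SamplerOutputSketch(outputs=["group0", "group1"])
assert len(out) == 2 and out[1] == "group1"        # list-style callers still work
out.sampled_token_ids = torch.tensor([[7], [9]])   # spec decode can attach tensors
```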
+ + Batch expansion converts a list of sequences and multiple query positions + to a new batch of sequences, each with a single query position. This allows + for MQA-like scoring in speculative decoding without requiring an MQA + kernel. + + It is strictly less efficient than MQA scoring. + + It only supports scoring the top1 proposal tokens of the proposer, instead + of topk/tree. + """ + + def __init__(self, scorer_worker: Worker, device: str, vocab_size: int): + self._scorer_worker = scorer_worker + self._device = device + self._vocab_size = vocab_size + + @nvtx_range("BatchExpansionTop1Scorer.score_proposals") + def score_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + k: int, + proposals: SpeculativeProposals, + ) -> SpeculativeScores: + """Score the proposed tokens via the scorer model. + + This converts each input sequence to a set of k+1 target sequences. The + target sequences have the unique continuations to be scored and a + unique sequence ID that is different from all input sequence ids. + + If a speculative sequence length would exceed the max model length, then + no speculation is produced for that sequence. + + Args: + seq_group_metadata_list: The input sequence group metadata. + blocks_to_swap_in: This is passed to the worker during scoring. + blocks_to_swap_out: This is passed to the worker during scoring. + blocks_to_copy: This is passed to the worker during scoring. + k: The fixed proposal length. + proposals: The speculative proposals to score. + Returns: + SpeculativeScores: The scores of each speculative token, along with + which sequences were ignored during scoring. + """ + + # TODO(cade) perform this on GPU to remove blocking call. + proposal_lens_list = proposals.proposal_lens.tolist() + proposal_token_ids_list = proposals.proposal_token_ids.tolist() + + (spec_indices, non_spec_indices, target_seq_group_metadata_list, + num_scoring_tokens) = self._expand_batch( + seq_group_metadata_list=seq_group_metadata_list, + proposal_token_ids_list=proposal_token_ids_list, + proposal_lens_list=proposal_lens_list, + ) + + target_sampler_output = self._scorer_worker.execute_model( + seq_group_metadata_list=target_seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + return_python_output=False) + + all_tokens, all_probs = self._contract_batch( + original_bs=len(seq_group_metadata_list), + target_sampler_output=target_sampler_output, + proposals=proposals, + num_scoring_tokens=num_scoring_tokens, + non_spec_indices=non_spec_indices, + spec_indices=spec_indices, + k=k, + ) + + return SpeculativeScores( + probs=all_probs, + token_ids=all_tokens, + ) + + def _expand_batch( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_token_ids_list: List[TokenId], + proposal_lens_list: List[int], + ) -> Tuple[List[int], List[int], List[SequenceGroupMetadata], int]: + """Given the input sequences and potentially multiple corresponding + proposal tokens, create a new batch where each sequence has a single + query token. + """ + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. It can be + # done by supporting per-sequence proposal lens. 
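A small sketch of the spec / non-spec split that `_expand_batch` performs via `split_batch_by_proposal_len`; this is a simplified stand-in, not the vLLM helper itself.

```python
from typing import List, Sequence, Tuple


def split_by_proposal_len(
        items: Sequence[str],
        proposal_lens: Sequence[int],
        select_proposal_len_zero: bool) -> Tuple[List[str], List[int]]:
    """Toy version of the spec / non-spec split: return the selected items
    and their original batch indices."""
    want_zero = select_proposal_len_zero
    selected = [(item, i)
                for i, (item, plen) in enumerate(zip(items, proposal_lens))
                if (plen == 0) == want_zero]
    return [item for item, _ in selected], [i for _, i in selected]


seqs, lens = ["seq0", "seq1", "seq2"], [3, 0, 3]
print(split_by_proposal_len(seqs, lens, select_proposal_len_zero=False))
# (['seq0', 'seq2'], [0, 2])
print(split_by_proposal_len(seqs, lens, select_proposal_len_zero=True))
# (['seq1'], [1])
```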
+ spec_seqs, spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=False) + non_spec_seqs, non_spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=True) + + target_seq_group_metadata_list = self._create_scoring_model_input( + spec_seqs, proposal_token_ids_list) + num_scoring_tokens = len(target_seq_group_metadata_list) + target_seq_group_metadata_list.extend(non_spec_seqs) + + return (spec_indices, non_spec_indices, target_seq_group_metadata_list, + num_scoring_tokens) + + def _contract_batch(self, original_bs: int, + target_sampler_output: List[SamplerOutput], + proposals: SpeculativeProposals, + num_scoring_tokens: int, non_spec_indices: List[int], + spec_indices: List[int], + k: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Contract the expanded batch back into its original size. + This maps the scores of speculative tokens back to their original + sequences. + """ + (target_token_ids, target_probs, non_spec_target_token_ids, + non_spec_target_probs) = self._split_scoring_output( + target_sampler_output, num_scoring_tokens) + + # Map distinct sequences used to score each token + # of shape [batch_size * k + 1] back to [batch_size, k + 1]. + batch_size, k = proposals.proposal_token_ids.shape + + target_token_ids = target_token_ids.squeeze().reshape( + batch_size, k + 1) + target_probs = target_probs.squeeze().reshape(batch_size, k + 1, + self._vocab_size) + + all_tokens = torch.full(size=(original_bs, k + 1), + fill_value=-1, + device=self._device, + dtype=torch.long) + all_probs = torch.zeros(original_bs, + k + 1, + self._vocab_size, + device=self._device, + dtype=torch.float32) + + if non_spec_indices: + all_tokens[non_spec_indices, 0] = non_spec_target_token_ids + all_probs[non_spec_indices, :1, :] = non_spec_target_probs + + if spec_indices: + all_tokens[spec_indices] = target_token_ids + all_probs[spec_indices] = target_probs + + return all_tokens, all_probs + + def _create_scoring_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k] + ) -> List[SequenceGroupMetadata]: + """Given the original input sequences and proposed tokens from the draft + model, create a list of target sequences that can be used for scoring. + """ + + if not seq_group_metadata_list: + return [] + + target_seq_ids_iter = self._create_target_seq_id_iterator( + get_all_seq_ids(seq_group_metadata_list)) + + target_seq_group_metadata = list( + chain.from_iterable( + self._create_target_seq_group_metadata( + seq_group_metadata, + proposal_token_ids, + i, + target_seq_ids_iter, + ) for i, seq_group_metadata in enumerate( + seq_group_metadata_list))) + + return target_seq_group_metadata + + def _create_target_seq_group_metadata( + self, + input_seq_group_metadata: SequenceGroupMetadata, + proposal_token_ids: List[TokenId], # shape: [batch_size, k] + batch_index: int, + target_seq_ids_iter: Iterator[TargetSeqId], + ) -> List[SequenceGroupMetadata]: + """Given an input sequence group metadata and a list of draft tokens, + create a list of target SequenceGroupMetadata, one for each + token id that needs to be scored. + + Naive speculative decoding requires K target model scores, one for each + draft model token. However one can add a bonus token such that if each + token is accepted, then a final token may be sampled from the model. 
+ This function creates K+1 target SequenceGroupMetadata to take + advantage of the bonus token. + """ + assert not input_seq_group_metadata.is_prompt, ( + "Speculating on " + "prompts not yet supported") + assert len(input_seq_group_metadata.seq_data) == 1, ( + "Beam search " + "not supported in speculative decoding") + input_seq_id = next(iter(input_seq_group_metadata.seq_data.keys())) + + token_ids_to_score = self._get_token_ids_to_score( + proposal_token_ids[batch_index]) + + target_seq_group_metadata_list: List[SequenceGroupMetadata] = [] + for token_ids in token_ids_to_score: + target_seq_group_metadata_list.append( + self._create_single_target_seq_group_metadata( + input_seq_group_metadata, + input_seq_id, + next(target_seq_ids_iter), + token_ids, + )) + + return target_seq_group_metadata_list + + def _create_single_target_seq_group_metadata( + self, + seq_group_metadata: SequenceGroupMetadata, + seq_id: SeqId, + target_seq_id: TargetSeqId, + token_ids: List[TokenId], + ) -> SequenceGroupMetadata: + """Create a single target SequenceGroupMetadata. + + Args: + seq_group_metadata: The metadata for the input sequence. + seq_id: The input sequence ID. + target_seq_id: The corresponding target sequence ID. + token_ids: The list of token ids that are to be appended to the + input sequence. + """ + seq_data = seq_group_metadata.seq_data[seq_id] + prompt_token_ids = seq_data.get_prompt_token_ids() + new_output_token_ids = [*seq_data.get_output_token_ids(), *token_ids] + + return SequenceGroupMetadata( + request_id=seq_group_metadata.request_id, + is_prompt=seq_group_metadata.is_prompt, + seq_data={ + target_seq_id: + SequenceData( + prompt_token_ids=prompt_token_ids, + output_token_ids=new_output_token_ids, + ), + }, + sampling_params=seq_group_metadata.sampling_params, + block_tables={ + target_seq_id: seq_group_metadata.block_tables[seq_id], + }, + lora_request=None, + ) + + def _split_scoring_output( + self, sampler_output: SamplerOutput, num_scoring_tokens: int + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Split the target model output into speculative and non-speculative + output. + """ + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. It can be + # done by supporting per-sequence proposal lens. + # + # First samples are from speculative scoring, latter samples are non- + # speculative samples. + split_sizes = [ + num_scoring_tokens, + sampler_output.sampled_token_ids.numel() - num_scoring_tokens + ] + (spec_probs, non_spec_probs + ) = sampler_output.sampled_token_probs.split(split_sizes) + (spec_sampled_tokens, non_spec_sampled_tokens + ) = sampler_output.sampled_token_ids.flatten().split(split_sizes) + + # Convert scores to tensors. + sampler_output.sampled_token_probs = spec_probs + sampler_output.sampled_token_ids = spec_sampled_tokens + target_token_ids, target_probs = sampler_output_to_torch( + [sampler_output]) + + # Convert non-speculative output tokens to tensors. 
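A toy illustration of the tensor bookkeeping in `_split_scoring_output` and `_contract_batch` above: speculative rows come first in the flattened sampler output, then the non-speculative rows, and the speculative block is reshaped back to `[batch_size, k + 1]`. The numbers are made up for the sketch.

```python
import torch

# 2 speculative sequences scored at k+1 = 3 positions each -> 6 rows,
# followed by 2 non-speculative sequences -> 8 rows total.
num_scoring_tokens = 6
sampled_token_ids = torch.arange(8)

split_sizes = [num_scoring_tokens,
               sampled_token_ids.numel() - num_scoring_tokens]
spec_tokens, non_spec_tokens = sampled_token_ids.split(split_sizes)

print(spec_tokens.reshape(2, 3).tolist())  # [[0, 1, 2], [3, 4, 5]]
print(non_spec_tokens.tolist())            # [6, 7]
```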
+ sampler_output.sampled_token_probs = non_spec_probs + sampler_output.sampled_token_ids = non_spec_sampled_tokens + non_spec_target_token_ids, non_spec_target_probs = ( + sampler_output_to_torch([sampler_output])) + + return (target_token_ids, target_probs, non_spec_target_token_ids, + non_spec_target_probs) + + def _create_target_seq_id_iterator( + self, seq_ids: List[SeqId]) -> Iterator[TargetSeqId]: + """Create an iterator for creating target sequence ids. + Target sequence ids are distinct from sequence ids because we create a + distinct target sequence id for each proposal token to be scored. + + This implementation increments a counter starting at 1 + max of all + provided input sequence ids. + """ + return count(start=max(seq_ids) + 1) + + def _get_token_ids_to_score( + self, + full_spec_token_ids: List[TokenId] # shape: [k] + ) -> List[List[TokenId]]: + """Given an int tensor of proposal token ids, return a list of + token ids that should be scored. + + Returns k+1 output lists. The additional one is used for generating the + bonus token. + + Example: + Input: [0, 1, 2, 3] (k=4) + Output: (k+1 lists) + [] + [0] + [0, 1] + [0, 1, 2] + [0, 1, 2, 3] + """ + empty_token_ids = [] + + token_ids_to_score = [empty_token_ids] + token_ids_to_score.extend([ + full_spec_token_ids[:i + 1] + for i in range(len(full_spec_token_ids)) + ]) + return token_ids_to_score diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py new file mode 100644 index 0000000000000..9e53ffb60ac32 --- /dev/null +++ b/vllm/spec_decode/interfaces.py @@ -0,0 +1,77 @@ +from typing import List, Tuple, Optional, Dict +from dataclasses import dataclass +from abc import ABC, abstractmethod + +import torch + +from vllm.sequence import SequenceGroupMetadata + + +@dataclass +class SpeculativeProposals: + """Datastructure used to represent proposal tokens from some proposer. It + also tracks how many speculative tokens each sequence has. + """ + + # Speculative proposal tokens. + proposal_token_ids: torch.Tensor + + # Probabilities of the proposal tokens according to the proposer. + proposal_probs: torch.Tensor + + # The valid length of each proposal; can be zero. + proposal_lens: torch.Tensor + + def __repr__(self): + return (f"SpeculativeProposals(" + f"proposal_token_ids={self.proposal_token_ids.shape}, " + f"proposal_probs={self.proposal_probs.shape}, " + f"proposal_lens={self.proposal_lens.shape})") + + +@dataclass +class SpeculativeScores: + """Datastructure used to represent the scores of speculative tokens + according to the scoring model. + """ + + # Probabilities of the speculative tokens according to the scoring model. + probs: torch.Tensor + + # Token ids sampled from the scoring model. Used for speculative bonus + # tokens and also non-speculative normal decoding. 
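A toy construction of a `SpeculativeProposals`-like container (illustrative class name) showing the shapes its fields are expected to carry; a proposal length of zero marks a sequence that skipped speculation and whose token row is just padding.

```python
from dataclasses import dataclass

import torch


@dataclass
class ProposalsSketch:
    """Toy mirror of SpeculativeProposals to illustrate expected shapes."""
    proposal_token_ids: torch.Tensor  # [batch_size, k]
    proposal_probs: torch.Tensor      # [batch_size, k, vocab_size]
    proposal_lens: torch.Tensor       # [batch_size]


batch_size, k, vocab_size = 2, 3, 8
proposals = ProposalsSketch(
    proposal_token_ids=torch.tensor([[5, 9, 2], [-1, -1, -1]]),
    proposal_probs=torch.rand(batch_size, k, vocab_size),
    proposal_lens=torch.tensor([k, 0]),  # second sequence skipped speculation
)
print(proposals.proposal_token_ids.shape)  # torch.Size([2, 3])
```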
+ token_ids: torch.Tensor + + def __repr__(self): + return (f"SpeculativeScores(" + f"probs={self.probs.shape}, " + f"token_ids={self.token_ids.shape})") + + +class SpeculativeProposer(ABC): + + @abstractmethod + def get_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + max_proposal_len: int, + ) -> SpeculativeProposals: + raise NotImplementedError + + +class SpeculativeScorer(ABC): + + @abstractmethod + def score_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + k: int, + proposals: SpeculativeProposals, + ) -> Tuple[torch.Tensor, torch.Tensor]: + raise NotImplementedError diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py new file mode 100644 index 0000000000000..65a2a4a63a98f --- /dev/null +++ b/vllm/spec_decode/metrics.py @@ -0,0 +1,174 @@ +import torch +from dataclasses import dataclass +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from typing import Optional +from vllm.utils import in_wsl +import time +from typing import Callable + + +@dataclass +class SpecDecodeWorkerMetrics: + """Dataclass holding metrics emitted from the spec decode worker. + """ + + # The empirical acceptance rate of the proposal method on a per-token basis. + # This is useful for evaluating how well the proposal method aligns with the + # scoring method. + draft_acceptance_rate: float + + # The empirical efficiency, measured as the number of tokens emitted by the + # system divided by the number of tokens that could be emitted by the system + # if the proposal method were perfect. + system_efficiency: float + + # The number of speculative tokens produced by the proposal method. + draft_tokens: int + + # The number of tokens emitted by the entire system. + emitted_tokens: int + + # The number of tokens accepted by the scoring model and verification + # routine, e.g. Llama2-70B and lossless rejection sampling. + # + # NOTE: Any token accepted by the verification routine is considered + # accepted (regardless of if the speculative prefix is also accepted). The + # user will usually see less accepted tokens. This metric is helpful when + # evaluating alignment of the proposal method with the scoring model. + accepted_tokens: int + + # The number of speculative tokens per sequence. + num_spec_tokens: int + + +Timer = Callable[[], float] + + +class AsyncMetricsCollector: + """Class which copies rejection sampler metrics from the device to CPU on a + non-default Torch stream. + """ + + def __init__(self, + rejection_sampler: RejectionSampler, + timer: Optional[Timer] = None, + collect_interval_s: float = 5.0): + self._rejection_sampler = rejection_sampler + self._timer = time.time if timer is None else timer + + self._rank: Optional[int] = None + + # We don't have a device set yet. 
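Worked toy numbers for the two rates defined in `SpecDecodeWorkerMetrics` above, assuming the "could be emitted" ceiling is k accepted tokens plus one bonus token per speculated sequence per step.

```python
# Toy numbers for one metrics interval.
k = 4
num_spec_seqs = 3                 # (sequence, step) pairs that were speculated
draft_tokens = num_spec_seqs * k  # 12 proposal tokens produced
accepted_tokens = 9               # accepted by the verification routine
emitted_tokens = 12               # accepted + bonus/recovered tokens

draft_acceptance_rate = accepted_tokens / draft_tokens   # 0.75
max_emittable = num_spec_seqs * (k + 1)                  # 15 if proposals were perfect
system_efficiency = emitted_tokens / max_emittable       # 0.8
print(draft_acceptance_rate, system_efficiency)
```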
+ self._copy_stream: Optional[torch.cuda.Stream] = None + + self._in_flight_copy: Optional[torch.cuda.Event] = None + + pin_memory = not in_wsl() + self._aggregate_num_accepted_tokens = torch.tensor( + 0, dtype=torch.long, device="cpu", pin_memory=pin_memory) + self._aggregate_num_emitted_tokens = torch.tensor( + 0, dtype=torch.long, device="cpu", pin_memory=pin_memory) + self._aggregate_num_draft_tokens = 0 + + self._rejsample_metrics_collect_interval_s = collect_interval_s + self._last_metrics_collect_time = self._timer() + + def init_gpu_tensors(self, rank: int) -> None: + self._rank = rank + self._copy_stream = torch.cuda.Stream() + + def maybe_collect_rejsample_metrics( + self, k: int) -> Optional[SpecDecodeWorkerMetrics]: + + # If a copy was initiated in the previous call, collect and return. + if self._in_flight_copy is not None: + ready_event = self._in_flight_copy + self._in_flight_copy = None + return self._collect_rejsample_metrics(k, ready_event) + + # Otherwise, check if we should start a new copy. + if self._should_collect_rejsample_metrics(self._timer()): + assert self._in_flight_copy is None + self._in_flight_copy = self._copy_rejsample_metrics_async() + + return None + + def _should_collect_rejsample_metrics(self, now: float) -> bool: + """Return whether or not this iteration should print rejection sampling + metrics. + """ + if self._rank != 0: + return False + + if (now - self._last_metrics_collect_time < + self._rejsample_metrics_collect_interval_s): + return False + return True + + def _copy_rejsample_metrics_async(self) -> torch.cuda.Event: + """Copy rejection sampling metrics (number of accepted tokens, etc) to + CPU asynchronously. + + Returns a CUDA event recording when the copy is complete. + """ + self._copy_stream.wait_stream(torch.cuda.current_stream()) + + with torch.cuda.stream(self._copy_stream): + self._aggregate_num_accepted_tokens.copy_( + self._rejection_sampler.num_accepted_tokens, non_blocking=True) + self._aggregate_num_emitted_tokens.copy_( + self._rejection_sampler.num_emitted_tokens, non_blocking=True) + # Number of draft tokens is calculated on CPU, so no copy is + # required. + self._aggregate_num_draft_tokens = ( + self._rejection_sampler.num_draft_tokens) + + aggregate_metrics_ready = torch.cuda.Event() + aggregate_metrics_ready.record(self._copy_stream) + + return aggregate_metrics_ready + + def _collect_rejsample_metrics( + self, k: int, + ready_event: torch.cuda.Event) -> SpecDecodeWorkerMetrics: + """Create metrics object from statistics copied asynchronously. + + Args: + k: int. The number of speculative tokens; used to determine system + efficiency. + ready_event: torch.cuda.Event. The CUDA event recording when the + async GPU->CPU copy is complete. 
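A minimal, generic sketch of the copy pattern `AsyncMetricsCollector` uses: stage the device-to-CPU copy on a side stream and gate reads on a CUDA event. This is plain PyTorch rather than the vLLM class, and it only exercises the GPU path when CUDA is available.

```python
from typing import Tuple

import torch


def async_copy_to_cpu(
        gpu_value: torch.Tensor,
        copy_stream: torch.cuda.Stream) -> Tuple[torch.Tensor, torch.cuda.Event]:
    # Pinned CPU buffer so the non-blocking copy can be truly asynchronous.
    cpu_buf = torch.empty(gpu_value.shape, dtype=gpu_value.dtype,
                          device="cpu", pin_memory=True)
    copy_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(copy_stream):
        cpu_buf.copy_(gpu_value, non_blocking=True)
    ready = torch.cuda.Event()
    ready.record(copy_stream)
    return cpu_buf, ready


if torch.cuda.is_available():
    stream = torch.cuda.Stream()
    counter = torch.tensor(42, device="cuda")
    buf, event = async_copy_to_cpu(counter, stream)
    event.synchronize()   # only read the CPU buffer after the event fires
    print(buf.item())     # 42
```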
+ """ + ready_event.synchronize() + accepted_tokens = self._aggregate_num_accepted_tokens.item() + emitted_tokens = self._aggregate_num_emitted_tokens.item() + draft_tokens = self._aggregate_num_draft_tokens + + num_possible_tokens = self.get_max_num_accepted_tokens(draft_tokens, k) + + if draft_tokens > 0: + draft_acceptance_rate = accepted_tokens / draft_tokens + else: + draft_acceptance_rate = float("nan") + + if num_possible_tokens > 0: + system_efficiency = emitted_tokens / num_possible_tokens + else: + system_efficiency = float("nan") + + return SpecDecodeWorkerMetrics( + num_spec_tokens=k, + draft_acceptance_rate=draft_acceptance_rate, + system_efficiency=system_efficiency, + accepted_tokens=accepted_tokens, + draft_tokens=draft_tokens, + emitted_tokens=emitted_tokens, + ) + + @staticmethod + def get_max_num_accepted_tokens(draft_tokens: int, k: int) -> int: + # Divide by k since batch size can be variable. + total_num_spec_seqs = draft_tokens / k + num_accepted_per_seq_if_all_accepted = k + 1 + return int(total_num_spec_seqs * num_accepted_per_seq_if_all_accepted) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py new file mode 100644 index 0000000000000..0915c275b0408 --- /dev/null +++ b/vllm/spec_decode/multi_step_worker.py @@ -0,0 +1,370 @@ +from typing import List, Dict, Optional, Tuple +import copy + +import torch + +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.worker.worker import Worker +from vllm.spec_decode.interfaces import (SpeculativeProposals, + SpeculativeProposer) +from vllm.spec_decode.util import sampler_output_to_torch + + +class MultiStepWorker(Worker): + """The MultiStepWorker is equivalent to a Worker except that it allows + multiple forward passes in a single call, assuming the scheduler has + allocated enough space to store the additional KV. This reduces overhead + by invoking the scheduler less. + + The MultiStepWorker does not support cache swap operations, or beam search. + Cache swap operations do not require large modifications. On the other hand, + beam search requires memory allocations during sequence forks and thus + requires more thought for MultiStepWorker support. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self._proposer: Optional[DraftModelTop1Proposer] = None + + def init_model(self): + super().init_model() + + self._proposer = DraftModelTop1Proposer( + self, + self.device, + self.max_model_len, + self.vocab_size, + ) + + @torch.inference_mode() + def execute_model_multi_step( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + num_steps: int, + ) -> List[SamplerOutput]: + """Run the model forward pass num_steps times. Returns the list of + sampler output, one per model forward pass. + """ + self._raise_if_unsupported(seq_group_metadata_list, blocks_to_swap_in, + blocks_to_swap_out, blocks_to_copy) + + # Shallow copy input data so modifications (such as appending tokens) + # do not cause side-effects. + copied_seq_group_metadata_list = self._shallow_copy_inputs( + seq_group_metadata_list) + + # Assert enough KV space for num_steps tokens per sequence. + self._assert_enough_kv_space(seq_group_metadata_list, num_steps) + + # Run model num_steps times.
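A toy stand-in for the `execute_model_multi_step` control flow introduced above: sample, append the token to the (copied) sequence, and run the next forward pass. The "model" here is just a deterministic placeholder; the real worker operates on full `SequenceGroupMetadata`.

```python
from typing import List


def fake_model_step(token_ids: List[int]) -> int:
    """Placeholder sampler: deterministic function of the current sequence."""
    return (sum(token_ids) + 1) % 100


def multi_step_decode(token_ids: List[int], num_steps: int) -> List[int]:
    token_ids = list(token_ids)      # copy so the caller sees no side-effects
    outputs = []
    for _ in range(num_steps):
        next_token = fake_model_step(token_ids)
        token_ids.append(next_token)  # append before the next forward pass
        outputs.append(next_token)
    return outputs


print(multi_step_decode([3, 7], num_steps=4))  # one sampled token per step
```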
+ model_outputs = [] + for _ in range(num_steps): + model_output = super().execute_model( + seq_group_metadata_list=copied_seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + + self._append_new_tokens(model_output, + copied_seq_group_metadata_list) + model_outputs.append(model_output) + + return model_outputs + + def get_spec_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + max_proposal_len: int, + ) -> SpeculativeProposals: + """Produce speculations given an input batch of sequences. The number of + speculative tokens per sequence is determined by max_proposal_len. + """ + + return self._proposer.get_proposals( + seq_group_metadata_list, + blocks_to_swap_in, + blocks_to_swap_out, + blocks_to_copy, + max_proposal_len, + ) + + def _append_new_tokens( + self, model_output: SamplerOutput, + seq_group_metadata_list: SequenceGroupMetadata) -> None: + """Given model output from a single run, append the tokens to the + sequences. This is normally done outside of the worker, but it is + required if the worker is to perform multiple forward passes. + """ + for seq_group_metadata, sequence_group_outputs in zip( + seq_group_metadata_list, model_output): + seq_group_metadata.is_prompt = False + + for seq_output in sequence_group_outputs.samples: + # NOTE: Beam search is not supported, so we can assume that + # parent_seq_id == seq_id. + seq = seq_group_metadata.seq_data[seq_output.parent_seq_id] + + token_id = seq_output.output_token + token_logprob = seq_output.logprobs[token_id] + + seq.append_token_id(token_id, token_logprob.logprob) + + def _shallow_copy_inputs( + self, seq_group_metadata_list: List[SequenceGroupMetadata] + ) -> List[SequenceGroupMetadata]: + """Copy input data structures to remove side-effects when input data + structures are shared with other modules. + + Helpful when the vLLM scheduler runs in the same process as the worker. + The alternative is deep-copying (or other form of deep copy); this has + performance downsides. + """ + + # Shallow-copy the list of SequenceGroupMetadata. This allows us to + # append tokens and change is_prompt without external side-effects. + new_seq_group_metadata_list = [] + + for old_seq_group_metadata in seq_group_metadata_list: + # We must shallow-copy seq_group_metadata as is_prompt could change. + seq_group_metadata = copy.copy(old_seq_group_metadata) + new_seq_group_metadata_list.append(seq_group_metadata) + + # We must shallow-copy seq_data as we will append token ids + new_seq_data = {} + for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): + new_seq_data[seq_id] = copy.copy(old_seq_data) + new_seq_data[ + seq_id].output_token_ids = old_seq_data.output_token_ids[:] + + seq_group_metadata.seq_data = new_seq_data + + return new_seq_group_metadata_list + + def _assert_enough_kv_space( + self, seq_group_metadata_list: List[SequenceGroupMetadata], + num_steps: int) -> None: + """Assert there are enough physical blocks per sequence to store the + current KV plus additional KV from num_steps tokens. + """ + assert self.model_runner.block_size is not None + for seq_group_metadata in seq_group_metadata_list: + # Only one seq_id is guaranteed because there is no beam search. 
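The copy discipline described in `_shallow_copy_inputs` above, reduced to a toy object: copy the wrapper, then re-point the token list at a sliced copy so in-worker appends never reach the scheduler's objects. The class name is illustrative.

```python
import copy


class SeqDataSketch:
    """Toy stand-in for per-sequence data holding a mutable token list."""

    def __init__(self, output_token_ids):
        self.output_token_ids = output_token_ids


original = SeqDataSketch(output_token_ids=[1, 2, 3])
copied = copy.copy(original)
copied.output_token_ids = original.output_token_ids[:]  # new list, same ints

copied.output_token_ids.append(4)
print(original.output_token_ids)  # [1, 2, 3] -- the scheduler's copy is untouched
```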
+ seq_id = list(seq_group_metadata.seq_data.keys())[0] + seq = seq_group_metadata.seq_data[seq_id] + + # After num_steps, the seq len will be the current seq len + # plus one token per step. + final_seq_len = seq.get_len() + num_steps + + # We will have final_seq_len - 1 KV because vLLM saves KV for a + # token in the iteration after the token was generated. + required_num_kv_slots = final_seq_len - 1 + + # The allocated number of kv slots is the number of allocated blocks + # times the number of slots of block. + number_physical_blocks = len( + seq_group_metadata.block_tables[seq_id]) + allocated_kv_slots = (number_physical_blocks * + self.model_runner.block_size) + + if required_num_kv_slots > allocated_kv_slots: + request_id = seq_group_metadata.request_id + raise ValueError( + "The worker attempted to run " + f"{num_steps} times but found insufficient KV space for " + f"{request_id=} {seq_id=}. ({allocated_kv_slots=} " + f"{required_num_kv_slots=}).") + + def _raise_if_unsupported( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> None: + """MultiStepWorker does not yet implement support for cache swap + operations or beam search. + """ + if any([blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy]): + raise NotImplementedError( + "MultiStepWorker does not support cache operations") + + if any( + len(seq_group_metadata.seq_data.keys()) != 1 + for seq_group_metadata in seq_group_metadata_list): + raise NotImplementedError( + "MultiStepWorker does not support beam search.") + + +class DraftModelTop1Proposer(SpeculativeProposer): + """Helper class which separates out sequences which would exceed the max + model length when speculated upon. + + This allows combinations of models such as JackFram/llama-68m draft with + meta-llama/Llama2-13b-chat-hf, as llama-68m has max_position_embeddings of + 2048 while Llama2-13b has max_position_embeddings of 4096. + + We treat the sequences which exceed the proposal draft model length as + "non-spec sequences". Essentially they skip the draft model and go through + normal decoding in the target model. + + Currently, only proposal_lens of 0 and k are supported, where k is a global + batch proposal length. In the future vLLM should support per-sequence + proposal lengths. + """ + + def __init__( + self, + draft_worker: MultiStepWorker, + device: str, + max_model_len: int, + vocab_size: int, + ): + self._draft_worker = draft_worker + self._device = device + self._max_model_len = max_model_len + self._vocab_size = vocab_size + + def get_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + max_proposal_len: int, + ) -> SpeculativeProposals: + """Get speculative proposals given the input batch. + + Sequences which would exceed the max model length are skipped during + speculation. + """ + + # Split speculative- and non-speculative- sequences. + (proposal_lens, nonzero_proposal_len_seqs, + nonzero_proposal_len_indices) = self._split_by_max_model_len( + seq_group_metadata_list, max_proposal_len) + + if nonzero_proposal_len_seqs: + # Speculate tokens using the draft worker for the speculative + # sequences. 
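A worked example (toy numbers) of the KV-space check above: the sequence needs `final_seq_len - 1` slots because KV for a token is written in the iteration after that token is generated.

```python
# Toy numbers for one sequence.
block_size = 16               # slots per physical block
num_allocated_blocks = 4      # blocks already allocated to this sequence
current_seq_len = 60
num_steps = 5                 # speculative steps to run

final_seq_len = current_seq_len + num_steps
required_kv_slots = final_seq_len - 1                    # 64: KV lags by one token
allocated_kv_slots = num_allocated_blocks * block_size   # 64

print(required_kv_slots, allocated_kv_slots)  # 64 64 -> just enough space
assert required_kv_slots <= allocated_kv_slots
```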
+ maybe_sampler_output = self._draft_worker.execute_model_multi_step( + seq_group_metadata_list=nonzero_proposal_len_seqs, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + num_steps=max_proposal_len, + ) + else: + # If no sequences can be speculated, set sampler output to None. + maybe_sampler_output = None + + # Combine speculative- and non-speculative sequences into the same + # representation. + proposal_tokens, proposal_probs, proposal_lens = self._merge_outputs( + batch_size=len(seq_group_metadata_list), + max_proposal_len=max_proposal_len, + maybe_sampler_output=maybe_sampler_output, + proposal_lens=proposal_lens, + nonzero_proposal_len_indices=nonzero_proposal_len_indices, + ) + + proposals = SpeculativeProposals( + proposal_token_ids=proposal_tokens, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens, + ) + + return proposals + + def _split_by_max_model_len( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + max_proposal_len: int, + ) -> Tuple[List[int], List[SequenceGroupMetadata], List[int]]: + """Determine which sequences would exceed the max model length. + """ + + proposal_lens: List[int] = [] + nonzero_proposal_len_seqs: List[SequenceGroupMetadata] = [] + nonzero_proposal_len_indices: List[int] = [] + for i, seq_group_metadata in enumerate(seq_group_metadata_list): + seq_data = next(iter(seq_group_metadata.seq_data.values())) + seq_len = seq_data.get_len() + + # Currently only proposal lens of 0 or the global batch proposal len + # are supported. + if seq_len + max_proposal_len < self._max_model_len: + proposal_lens.append(max_proposal_len) + nonzero_proposal_len_seqs.append(seq_group_metadata) + nonzero_proposal_len_indices.append(i) + else: + proposal_lens.append(0) + + return (proposal_lens, nonzero_proposal_len_seqs, + nonzero_proposal_len_indices) + + def _merge_outputs( + self, + batch_size: int, + max_proposal_len: int, + maybe_sampler_output: Optional[SamplerOutput], + proposal_lens: List[int], + nonzero_proposal_len_indices: List[int], + ) -> Tuple[torch.Tensor, torch.tensor, torch.Tensor]: + """After speculations are produced, merge the speculation results with + the skipped sequences. + """ + if maybe_sampler_output is None: + # If no speculative tokens, the sampler output will be None. + # In this case we return empty tensors. + proposal_tokens = torch.zeros(0, + max_proposal_len, + dtype=torch.long, + device=self._device) + proposal_probs = torch.zeros(0, + max_proposal_len, + self._vocab_size, + dtype=torch.float32, + device=self._device) + proposal_lens = torch.zeros(len(proposal_lens), + dtype=torch.long, + device=self._device) + return proposal_tokens, proposal_probs, proposal_lens + + sampler_output = maybe_sampler_output + + proposal_tokens, proposal_probs = sampler_output_to_torch( + sampler_output) + + # Now, reformat the output GPU tensors such that each sequence has + # a proposal. the proposal can be empty, e.g. 
[-1, -1, -1] + + entire_proposal_tokens = torch.full(size=(batch_size, + *proposal_tokens.shape[1:]), + fill_value=-1, + dtype=torch.long, + device=self._device) + entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens + entire_proposal_probs = torch.zeros(batch_size, + *proposal_probs.shape[1:], + dtype=torch.float32, + device=self._device) + entire_proposal_probs[nonzero_proposal_len_indices] = proposal_probs + + proposal_tokens, proposal_probs = (entire_proposal_tokens, + entire_proposal_probs) + + proposal_lens = torch.zeros(batch_size, + dtype=torch.long, + device=self._device) + proposal_lens[nonzero_proposal_len_indices] = max_proposal_len + + return proposal_tokens, proposal_probs, proposal_lens diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py new file mode 100644 index 0000000000000..1e56741347008 --- /dev/null +++ b/vllm/spec_decode/spec_decode_worker.py @@ -0,0 +1,375 @@ +from typing import List, Tuple, Optional, Dict +from functools import cached_property + +import torch + +from vllm.spec_decode.metrics import AsyncMetricsCollector +from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, + SequenceGroupOutput, SequenceOutput) +from vllm.worker.worker import Worker +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from vllm.config import CacheConfig +from vllm.spec_decode.util import (nvtx_range, get_all_seq_ids, + split_batch_by_proposal_len) +from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores +from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer +from vllm.spec_decode.interfaces import SpeculativeScorer + + +class SpecDecodeWorker: + """Worker which implements speculative decoding. + + Speculative decoding reduces decoding per-token latency by using a proposal + method, such as a small draft model, to speculate ahead of a larger LLM. The + probabilities of the speculative tokens are then determined by the larger + LLM, after which some verification routine determines which (if any) of the + speculative tokens are accepted by the larger LLM. + + See https://github.com/vllm-project/vllm/pull/2188 and + https://github.com/vllm-project/vllm/pull/3103 for more info. + + The current implementation has the following limitations: + * Only draft-model proposal is implemented (contributions for more forms are + welcome!). + * Only top-1 proposal and scoring are implemented. Tree-attention is left as + future work. + * Only lossless rejection sampling is supported. Contributions adding lossy + verification routines are welcome (e.g. Medusa's typical acceptance). + * All sequences in a batch must have the same proposal length, or zero. This + can be improved by having per-sequence speculation in the future. + * The scoring forward pass is done without an MQA kernel, which is + suboptimal especially as the batch size, proposal length, and sequence + lengths grow. Contributions to add a MQA scoring are welcome once + correctness tests pass. + More info here https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit. + """ + + def __init__( + self, + proposer_worker: MultiStepWorker, + scorer_worker: Worker, + rejection_sampler: RejectionSampler, + metrics_collector: Optional[AsyncMetricsCollector] = None, + ): + """ + Create a SpecDecodeWorker. + + Args: + proposer_worker: A worker that can produce speculative tokens for + sequences. 
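A toy version of the merge step above: scatter the proposals of speculated sequences back into full-batch tensors, leaving skipped sequences with -1 padding and a proposal length of zero.

```python
import torch

batch_size, k = 4, 3
nonzero_idx = [0, 2]  # sequences that were actually speculated
proposal_tokens = torch.tensor([[5, 9, 2], [7, 1, 4]])

all_tokens = torch.full((batch_size, k), -1, dtype=torch.long)
all_tokens[nonzero_idx] = proposal_tokens
proposal_lens = torch.zeros(batch_size, dtype=torch.long)
proposal_lens[nonzero_idx] = k

print(all_tokens.tolist())
# [[5, 9, 2], [-1, -1, -1], [7, 1, 4], [-1, -1, -1]]
print(proposal_lens.tolist())  # [3, 0, 3, 0]
```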
+ scorer_worker: A worker that produces probabilities of speculative + tokens according to some base model. Typically a vanilla vLLM + Worker. + rejection_sampler: A Torch module used to perform modified rejection + sampling for speculative decoding. + metrics_collector: Helper class for collecting metrics; can be set + for testing purposes. + """ + self.proposer_worker = proposer_worker + self.scorer_worker = scorer_worker + self.rejection_sampler = rejection_sampler + + self._metrics = AsyncMetricsCollector( + rejection_sampler + ) if metrics_collector is None else metrics_collector + + self.probs_dtype = self.rejection_sampler.probs_dtype + self.token_id_dtype = self.rejection_sampler.token_id_dtype + + self.scorer: SpeculativeScorer = None + + def init_model(self) -> None: + """Initialize both scorer and proposer models. + """ + # The scorer worker model is initialized first in case the proposer + # model has a smaller TP degree than the target worker. + self.scorer_worker.init_model() + self.proposer_worker.init_model() + + self._metrics.init_gpu_tensors(self.rank) + self.rejection_sampler.init_gpu_tensors(self.rank) + self.scorer = BatchExpansionTop1Scorer( + scorer_worker=self.scorer_worker, + device=self.device, + vocab_size=self._vocab_size) + + def profile_num_available_blocks(self, block_size: int, + gpu_memory_utilization: float, + cpu_swap_space: int, + cache_dtype: str) -> Tuple[int, int]: + """Determine the number of cache blocks to use. + + This is done by profiling the scorer model (which is typically the + larger of the two). Then the total memory which would be used by the + scorer cache is divided evenly between the proposer and scorer model KV, + such that the number of blocks is equal in both KV caches. + """ + num_gpu_blocks, num_cpu_blocks = ( + self.scorer_worker.profile_num_available_blocks( + block_size, gpu_memory_utilization, cpu_swap_space, + cache_dtype)) + + scorer_cache_block_size_bytes = ( + self.scorer_worker.get_cache_block_size_bytes( + block_size, cache_dtype)) + proposer_cache_block_size_bytes = ( + self.proposer_worker.get_cache_block_size_bytes( + block_size, cache_dtype)) + + new_num_gpu_blocks = split_num_cache_blocks_evenly( + scorer_cache_block_size_bytes, proposer_cache_block_size_bytes, + num_gpu_blocks) + return new_num_gpu_blocks, num_cpu_blocks + + def init_cache_engine(self, cache_config: CacheConfig): + """Initialize the cache engine of the scorer and proposer workers. + """ + self.scorer_worker.init_cache_engine(cache_config) + self.proposer_worker.init_cache_engine(cache_config) + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + num_spec_tokens: int, + ) -> List[SamplerOutput]: + """Perform speculative decoding on the input batch. + """ + + assert seq_group_metadata_list is not None, ( + "speculative decoding " + "requires non-None seq_group_metadata_list") + + # If no spec tokens, call the proposer and scorer workers normally. + # Used for prefill. 
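+        # Running prefill through both workers keeps the draft model's KV
+        # cache in sync with the target model before speculation begins.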
+ if num_spec_tokens == 0 or len(seq_group_metadata_list) == 0: + return self._run_no_spec( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + + return self._run_speculative_decoding_step( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + k=num_spec_tokens, + ) + + @nvtx_range("spec_decode_worker._run_no_spec") + def _run_no_spec( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + ) -> List[SamplerOutput]: + """Run a prefill step, without any speculation. The input is sent to the + proposer and scorer model so that the KV cache is consistent between the + two. + """ + + self.proposer_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + return_python_output=False) + + sampler_output = self.scorer_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + + # Clear device tensors from sampler output. This reduces communication + # overhead when the engine runs in a different process than the workers. + sampler_output.probs = None + sampler_output.sampled_tokens = None + return [sampler_output] + + @nvtx_range("spec_decode_worker._run_speculative_decoding_step") + def _run_speculative_decoding_step( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + k: int, + ) -> List[SamplerOutput]: + """Execute a single step of speculative decoding. + + This invokes the proposer worker to get k speculative tokens for each + sequence, then scores each speculative token using the scoring worker. + + Returns a list of SamplerOutput, each containing a single token per + sequence. + """ + + # Generate proposals using draft worker. + proposals = self.proposer_worker.get_spec_proposals( + seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, + blocks_to_copy, k) + + proposal_scores = self.scorer.score_proposals( + seq_group_metadata_list, + blocks_to_swap_in, + blocks_to_swap_out, + blocks_to_copy, + k, + proposals, + ) + + accepted_token_ids = self._verify_tokens(seq_group_metadata_list, + proposal_scores, proposals, k) + + return self._create_output_sampler_list(seq_group_metadata_list, + accepted_token_ids, k) + + @nvtx_range("spec_decode_worker._verify_tokens") + def _verify_tokens( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_scores: SpeculativeScores, + proposals: SpeculativeProposals, + max_proposal_len: int, + ) -> torch.Tensor: + """Determine which speculative tokens are accepted using the + probabilities of each token according to the proposer and scorer models. + """ + proposal_lens_list = proposals.proposal_lens.tolist() + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. 
It can be + # done by supporting per-sequence proposal lens. + _, spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=False) + _, non_spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=True) + original_indices = spec_indices + non_spec_indices + + proposal_probs = proposal_scores.probs[spec_indices, :-1] + bonus_token_ids = proposal_scores.token_ids[spec_indices, -1:] + non_spec_token_ids = proposal_scores.token_ids[non_spec_indices] + + accepted_token_ids = self.rejection_sampler( + proposal_probs, + bonus_token_ids, + proposals.proposal_probs, + proposals.proposal_token_ids, + ) + + # Append output tokens from non-speculative sequences to + # the accepted token ids tensor. + non_spec_token_ids = non_spec_token_ids.expand(-1, max_proposal_len + + 1).clone() + non_spec_token_ids[:, 1:] = -1 + accepted_token_ids = torch.cat( + [accepted_token_ids, non_spec_token_ids]) + + # Rearrange so that results are in the order of the original seq group + # metadata. + accepted_token_ids[original_indices] = accepted_token_ids.clone() + + return accepted_token_ids + + def _create_output_sampler_list( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + accepted_token_ids: torch.Tensor, # shape: [batch_size, k+1] + k: int, + ) -> List[SamplerOutput]: + """Given the accepted token ids, create a list of SamplerOutput. + + The output is padded with -1 tokens such that each sequence has + the same number of outputs. + """ + seq_ids = get_all_seq_ids(seq_group_metadata_list) + + # shape: [k+1, batch_size] + accepted_token_ids_by_step = accepted_token_ids.transpose(0, + 1).tolist() + sampler_output_list = [] + for token_ids_by_step in accepted_token_ids_by_step: + if all(token_id == -1 for token_id in token_ids_by_step): + break + + step_output_token_ids = [] + for token_id, seq_id in zip(token_ids_by_step, seq_ids): + step_output_token_ids.append( + SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq_id, + output_token=token_id, + # TODO Add verifier logprobs. + logprobs={token_id: 0.0}, + ) + ], + prompt_logprobs=None, + )) + sampler_output_list.append( + SamplerOutput(outputs=step_output_token_ids)) + + maybe_rejsample_metrics = ( + self._metrics.maybe_collect_rejsample_metrics(k)) + if maybe_rejsample_metrics is not None: + sampler_output_list[ + 0].spec_decode_worker_metrics = maybe_rejsample_metrics + + return sampler_output_list + + @cached_property + def _vocab_size(self) -> int: + """Get the vocab size of the model and make sure it's consistent between + draft and target workers. + """ + vocab_sizes = [ + worker.vocab_size + for worker in [self.proposer_worker, self.scorer_worker] + ] + assert all(vocab_sizes[0] == vocab_size for vocab_size in vocab_sizes) + return vocab_sizes[0] + + @property + def rank(self): + return self.scorer_worker.rank + + @property + def device(self): + return self.scorer_worker.device + + +def split_num_cache_blocks_evenly(scorer_cache_block_size_bytes: int, + proposer_cache_block_size_bytes: int, + total_num_gpu_blocks: int) -> int: + """Given total_num_gpu_blocks, the number of GPU blocks that could be + allocate to the target model, this function calculates how many blocks + should be given to the draft and target model. + + Note that usually the block size, in bytes, of each model is different, + as it's a function of number of KV/layer, number of heads, and hidden + dimension size. 
+ + Since the target and draft models allocate the same number of blocks, we + simply calculate the number of blocks where if allocated by both models, + the total memory usage from KV cache is no larger than the number of + blocks allocatable by the target model alone. + """ + new_num_gpu_blocks = int( + total_num_gpu_blocks * scorer_cache_block_size_bytes / + (proposer_cache_block_size_bytes + scorer_cache_block_size_bytes)) + + return new_num_gpu_blocks diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py new file mode 100644 index 0000000000000..2c5f954551905 --- /dev/null +++ b/vllm/spec_decode/util.py @@ -0,0 +1,99 @@ +import torch +from typing import List, Tuple +from vllm.sequence import SequenceGroupMetadata, SamplerOutput +from contextlib import contextmanager +from itertools import chain + +SeqId = int + + +def get_all_seq_ids( + seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[SeqId]: + """Given a list of SequenceGroupMetadata, create a list of all + sequence ids. + """ + return list( + chain.from_iterable([ + seq_group_metadata.seq_data.keys() + for seq_group_metadata in seq_group_metadata_list + ])) + + +def split_batch_by_proposal_len( + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_lens: List[int], select_proposal_len_zero: bool +) -> Tuple[List[SequenceGroupMetadata], List[int]]: + """Utility function that splits a batch based on whether the proposal len is + zero or not. We should remove this once vLLM supports per-sequence proposal + lens in a batch. + """ + + if select_proposal_len_zero: + predicate = lambda proposal_len: proposal_len == 0 + else: + predicate = lambda proposal_len: proposal_len != 0 + + indices = [ + i for i, (_, proposal_len + ) in enumerate(zip(seq_group_metadata_list, proposal_lens)) + if predicate(proposal_len) + ] + seq_groups = [ + seq_group for seq_group, proposal_len in zip( + seq_group_metadata_list, proposal_lens) if predicate(proposal_len) + ] + + return seq_groups, indices + + +def sampler_output_to_torch( + sampler_output_list: List[SamplerOutput], +) -> Tuple[torch.Tensor, torch.Tensor]: + """Utility function which converts a list of SamplerOutput to tensors. + + Returns: + sampled_token_ids: torch.Tensor + shape: [batch_size, len(sampler_output_list)] + + sampled_token_probs: torch.Tensor + shape: [batch_size, len(sampler_output_list), vocab_size] + """ + + # shape: [batch_size, num_sampler_output, vocab_size] + sampled_token_probs = torch.stack( + [ + sampler_output.sampled_token_probs + for sampler_output in sampler_output_list + ], + dim=0, + ).transpose(0, 1) + + # shape: [batch_size, num_sampler_output] + sampled_token_ids = torch.stack( + [ + sampler_output.sampled_token_ids.flatten() + for sampler_output in sampler_output_list + ], + dim=0, + ).transpose(0, 1) + + return sampled_token_ids, sampled_token_probs + + +@contextmanager +def nvtx_range(msg, *args, **kwargs): + """ + Context manager / decorator that pushes an NVTX range at the beginning + of its scope, and pops it at the end. If extra arguments are given, + they are passed as arguments to msg.format(). + + If running with cuda graphs, you must enable nsys cuda graph profiling. 
+ + Arguments: + msg (string): message to associate with the range + """ + torch.cuda.nvtx.range_push(msg.format(*args, **kwargs)) + try: + yield + finally: + torch.cuda.nvtx.range_pop() diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py index 5ea0d9122ef11..2c0e45623aa25 100644 --- a/vllm/transformers_utils/configs/mpt.py +++ b/vllm/transformers_utils/configs/mpt.py @@ -62,62 +62,6 @@ def __init__(self, fc_type: str = 'torch', verbose: Optional[int] = None, **kwargs: Any): - """The MPT configuration class. - Args: - d_model (int): The size of the embedding dimension of the model. - n_heads (int): The number of attention heads. - n_layers (int): The number of layers in the model. - expansion_ratio (int): The ratio of the up/down scale in the ffn. - max_seq_len (int): The maximum sequence length of the model. - vocab_size (int): The size of the vocabulary. - resid_pdrop (float): The dropout probability applied to the attention output before combining with residual. - emb_pdrop (float): The dropout probability for the embedding layer. - learned_pos_emb (bool): Whether to use learned positional embeddings - attn_config (Dict): A dictionary used to configure the model's attention module: - attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention - attn_pdrop (float): The dropout probability for the attention layers. - attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'. - qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer. - clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to - this value. - softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None, - use the default scale of ``1/sqrt(d_keys)``. - prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an - extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix - can attend to one another bi-directionally. Tokens outside the prefix use causal attention. - attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id. - When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates - which sub-sequence each token belongs to. - Defaults to ``False`` meaning any provided `sequence_id` will be ignored. - alibi (bool): Whether to use the alibi bias instead of position embeddings. - alibi_bias_max (int): The maximum value of the alibi bias. - kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads. - ffn_config (Dict): A dictionary used to configure the model's ffn module: - ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp - init_device (str): The device to use for parameter initialization. - logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value. - no_bias (bool): Whether to use bias in all layers. - verbose (int): The verbosity level. 0 is silent. - embedding_fraction (float): The fraction to scale the gradients of the embedding layer by. 
- norm_type (str): choose type of norm to use - use_cache (bool): Whether or not the model should return the last key/values attentions - init_config (Dict): A dictionary used to configure the model initialization: - init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_', - 'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or - 'xavier_normal_'. These mimic the parameter initialization methods in PyTorch. - init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True. - emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer. - emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution - used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``. - init_std (float): The standard deviation of the normal distribution used to initialize the model, - if using the baseline_ parameter initialization scheme. - init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes. - fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes. - init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes. - --- - See llmfoundry.models.utils.param_init_fns.py for info on other param init config options - fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs. - """ self.d_model = d_model self.n_heads = n_heads self.n_layers = n_layers @@ -139,8 +83,8 @@ def __init__(self, self.fc_type = fc_type if verbose is not None: warnings.warn(DeprecationWarning( - 'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.' - ), + 'verbose argument for MPTConfig is now ignored and ' + 'will be removed. Use python_log_level instead.'), stacklevel=2) if 'name' in kwargs: del kwargs['name'] @@ -149,7 +93,8 @@ def __init__(self, if self.attn_config.get('alibi', False): self.learned_pos_emb = False warnings.warn( - f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`', + f'alibi is turned on, setting `learned_pos_emb` ' + f'to {self.learned_pos_emb}`', stacklevel=2) super().__init__(**kwargs) self._validate_config() @@ -176,8 +121,8 @@ def _validate_config(self) -> None: [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop] )): raise ValueError( - "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1" # pylint: disable=line-too-long - ) + "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are " + "probabilities and must be between 0 and 1") if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']: raise ValueError( f"Unknown attn_impl={self.attn_config['attn_impl']}") @@ -193,17 +138,17 @@ def _validate_config(self) -> None: if self.attn_config['attn_uses_sequence_id'] and self.attn_config[ 'attn_impl'] not in ['torch', 'triton']: raise NotImplementedError( - 'attn_uses_sequence_id only implemented with torch and triton attention.' # pylint: disable=line-too-long - ) + 'attn_uses_sequence_id only implemented with torch ' + 'and triton attention.') if self.embedding_fraction > 1 or self.embedding_fraction <= 0: raise ValueError( - 'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!' 
# pylint: disable=line-too-long - ) + 'model.embedding_fraction must be between 0 (exclusive) ' + 'and 1 (inclusive)!') if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model': raise ValueError( - f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'." # pylint: disable=line-too-long - ) + f"self.logit_scale={self.logit_scale!r} is not recognized as " + "an option; use numeric value or 'inv_sqrt_d_model'.") if self.init_config.get('name', None) is None: raise ValueError( f"self.init_config={self.init_config!r} 'name' needs to be set." @@ -219,11 +164,11 @@ def _validate_config(self) -> None: del te except Exception as exc: raise ImportError( - # pylint: disable=line-too-long - 'TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. ' - + - 'The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\n' - + 'pip install flash-attn==1.0.6 --no-build-isolation \n' + + 'TransformerEngine import fail. `fc_type: te` requires ' + 'TransformerEngine be installed. ' + 'The required version of transformer_engine also requires ' + 'FlashAttention v1.0.6 is installed:\n' + 'pip install flash-attn==1.0.6 --no-build-isolation \n' 'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156' ) from exc if self.ffn_config['ffn_type'] == 'mptmlp': diff --git a/vllm/transformers_utils/configs/starcoder2.py b/vllm/transformers_utils/configs/starcoder2.py index 4c3b6b8def074..2879cd0445275 100644 --- a/vllm/transformers_utils/configs/starcoder2.py +++ b/vllm/transformers_utils/configs/starcoder2.py @@ -2,78 +2,6 @@ class Starcoder2Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a - Starcoder2 model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the [bigcode/starcoder2-7b_16k](https://huggingface.co/bigcode/starcoder2-7b_16k) model. - - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 49152): - Vocabulary size of the Starcoder2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Starcoder2Model`] - hidden_size (`int`, *optional*, defaults to 3072): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 12288): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 30): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 24): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 2): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. 
For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 4096): - The maximum sequence length that this model might ever be used with. Starcoder2's sliding window attention - allows sequence of up to 4096*32 tokens. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - norm_epsilon (`float`, *optional*, defaults to 1e-05): - Epsilon value for the layer norm - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - bos_token_id (`int`, *optional*, defaults to 50256): - The id of the "beginning-of-sequence" token. - eos_token_id (`int`, *optional*, defaults to 50256): - The id of the "end-of-sequence" token. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - sliding_window (`int`, *optional*): - Sliding window attention window size. If not specified, will default to `None` (no sliding window). - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - residual_dropout (`float`, *optional*, defaults to 0.0): - Residual connection dropout value. - embedding_dropout (`float`, *optional*, defaults to 0.0): - Embedding dropout. - use_bias (`bool`, *optional*, defaults to `True`): - Whether to use bias term on linear layers of the model. 
- - - ```python - >>> from transformers import Starcoder2Model, Starcoder2Config - - >>> # Initializing a Starcoder2 7B style configuration - >>> configuration = Starcoder2Config() - - >>> # Initializing a model from the Starcoder2 7B style configuration - >>> model = Starcoder2Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "starcoder2" keys_to_ignore_at_inference = ["past_key_values"] diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 6edc225cdfc80..2600ea2642da2 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -120,7 +120,8 @@ async def encode_async( def get_lora_tokenizer( self, - lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer": + lora_request: Optional[LoRARequest] = None + ) -> "PreTrainedTokenizer": if not lora_request or not self.enable_lora: return self.tokenizer if lora_request.lora_int_id not in self.lora_tokenizers: @@ -133,7 +134,8 @@ def get_lora_tokenizer( async def get_lora_tokenizer_async( self, - lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer": + lora_request: Optional[LoRARequest] = None + ) -> "PreTrainedTokenizer": if not lora_request or not self.enable_lora: return self.tokenizer if lora_request.lora_int_id not in self.lora_tokenizers: diff --git a/vllm/transformers_utils/tokenizers/baichuan.py b/vllm/transformers_utils/tokenizers/baichuan.py index 1dd241e4a5c4b..02045bdcb2ccf 100644 --- a/vllm/transformers_utils/tokenizers/baichuan.py +++ b/vllm/transformers_utils/tokenizers/baichuan.py @@ -1,4 +1,3 @@ -# yapf: disable # Adapted from # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py # This includes a fix suggested in @@ -13,7 +12,6 @@ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer from transformers.utils import logging - logger = logging.get_logger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} @@ -52,27 +50,16 @@ def __init__( clean_up_tokenization_spaces=False, **kwargs, ): - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - bos_token = ( - AddedToken(bos_token, lstrip=False, rstrip=False) - if isinstance(bos_token, str) - else bos_token - ) - eos_token = ( - AddedToken(eos_token, lstrip=False, rstrip=False) - if isinstance(eos_token, str) - else eos_token - ) - unk_token = ( - AddedToken(unk_token, lstrip=False, rstrip=False) - if isinstance(unk_token, str) - else unk_token - ) - pad_token = ( - AddedToken(pad_token, lstrip=False, rstrip=False) - if isinstance(pad_token, str) - else pad_token - ) + self.sp_model_kwargs = ({} if sp_model_kwargs is None else + sp_model_kwargs) + bos_token = (AddedToken(bos_token, lstrip=False, rstrip=False) + if isinstance(bos_token, str) else bos_token) + eos_token = (AddedToken(eos_token, lstrip=False, rstrip=False) + if isinstance(eos_token, str) else eos_token) + unk_token = (AddedToken(unk_token, lstrip=False, rstrip=False) + if isinstance(unk_token, str) else unk_token) + pad_token = (AddedToken(pad_token, lstrip=False, rstrip=False) + if isinstance(pad_token, str) else pad_token) self.vocab_file = vocab_file self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token @@ -107,7 +94,10 @@ def vocab_size(self): def get_vocab(self): """Returns vocab as a dict""" - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab = { + self.convert_ids_to_tokens(i): 
i + for i in range(self.vocab_size) + } vocab.update(self.added_tokens_encoder) return vocab @@ -130,7 +120,8 @@ def convert_tokens_to_string(self, tokens): out_string = "" prev_is_special = False for i, token in enumerate(tokens): - # make sure that special tokens are not decoded using sentencepiece model + # make sure that special tokens are not decoded using + # sentencepiece model if token in self.all_special_tokens: if not prev_is_special and i != 0: out_string += " " @@ -143,9 +134,9 @@ def convert_tokens_to_string(self, tokens): out_string += self.sp_model.decode(current_sub_tokens) return out_string - def save_vocabulary( - self, save_directory, filename_prefix: Optional[str] = None - ) -> Tuple[str]: + def save_vocabulary(self, + save_directory, + filename_prefix: Optional[str] = None) -> Tuple[str]: """ Save the vocabulary and special tokens file to a directory. @@ -157,24 +148,24 @@ def save_vocabulary( `Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") + logger.error(f"Vocabulary path ({save_directory}) " + "should be a directory") return out_vocab_file = os.path.join( save_directory, - (filename_prefix + "-" if filename_prefix else "") - + VOCAB_FILES_NAMES["vocab_file"], + (filename_prefix + "-" if filename_prefix else "") + + VOCAB_FILES_NAMES["vocab_file"], ) if os.path.abspath(self.vocab_file) != os.path.abspath( - out_vocab_file - ) and os.path.isfile(self.vocab_file): + out_vocab_file) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: content_spiece_model = self.sp_model.serialized_model_proto() fi.write(content_spiece_model) - return (out_vocab_file,) + return (out_vocab_file, ) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): bos_token_id = [self.bos_token_id] if self.add_bos_token else [] @@ -194,7 +185,8 @@ def get_special_tokens_mask( already_has_special_tokens: bool = False, ) -> List[int]: """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + Retrieve sequence ids from a token list that has no special tokens + added. This method is called when adding special tokens using the tokenizer `prepare_for_model` method. Args: @@ -202,11 +194,14 @@ def get_special_tokens_mask( List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. + already_has_special_tokens (`bool`, *optional*, defaults to + `False`): + Whether or not the token list is already formatted with + special tokens for the model. Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + `List[int]`: A list of integers in the range [0, 1]: + 1 for a special token, 0 for a sequence token. 
""" if already_has_special_tokens: return super().get_special_tokens_mask( @@ -220,20 +215,16 @@ def get_special_tokens_mask( if token_ids_1 is None: return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id - return ( - bos_token_id - + ([0] * len(token_ids_0)) - + eos_token_id - + bos_token_id - + ([0] * len(token_ids_1)) - + eos_token_id - ) + return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + + bos_token_id + ([0] * len(token_ids_1)) + eos_token_id) def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: """ - Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT + Creates a mask from the two sequences passed to be used in a + sequence-pair classification task. An ALBERT sequence pair mask has the following format: ``` @@ -250,7 +241,8 @@ def create_token_type_ids_from_sequences( Optional second list of IDs for sequence pairs. Returns: - `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + `List[int]`: List of [token type IDs](../glossary#token-type-ids) + according to the given sequence(s). """ bos_token_id = [self.bos_token_id] if self.add_bos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else [] diff --git a/vllm/utils.py b/vllm/utils.py index 9cdf623379516..fe6fd27962cd3 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -3,6 +3,7 @@ import socket import subprocess import uuid +import gc from platform import uname from typing import List, Tuple, Union from packaging.version import parse, Version @@ -132,9 +133,10 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int: # the Neuron-X backend does not have the `cuda_utils` module. from vllm._C import cuda_utils - max_shared_mem = cuda_utils.get_max_shared_memory_per_block_device_attribute( - gpu) - # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py will fail + max_shared_mem = ( + cuda_utils.get_max_shared_memory_per_block_device_attribute(gpu)) + # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py + # will fail assert max_shared_mem > 0, "max_shared_mem can not be zero" return int(max_shared_mem) @@ -208,9 +210,8 @@ def get_nvcc_cuda_version() -> Optional[Version]: if not cuda_home: cuda_home = '/usr/local/cuda' if os.path.isfile(cuda_home + '/bin/nvcc'): - logger.info( - f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.' - ) + logger.info(f'CUDA_HOME is not found in the environment. ' + f'Using {cuda_home} as CUDA_HOME.') else: logger.warning( f'Not found nvcc in {cuda_home}. Skip cuda version check!') @@ -309,3 +310,27 @@ def create_kv_caches_with_random( f"Does not support value cache of type {cache_dtype}") value_caches.append(value_cache) return key_caches, value_caches + + +class measure_cuda_memory: + + def __init__(self, device=None): + self.device = device + + def current_memory_usage(self) -> float: + # Return the memory usage in bytes. 
+ torch.cuda.reset_peak_memory_stats(self.device) + mem = torch.cuda.max_memory_allocated(self.device) + return mem + + def __enter__(self): + self.initial_memory = self.current_memory_usage() + # This allows us to call methods of the context manager if needed + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.final_memory = self.current_memory_usage() + self.consumed_memory = self.final_memory - self.initial_memory + + # Force garbage collection + gc.collect() diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index aff8ebc903623..7eac576e3f0fe 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -21,7 +21,7 @@ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest -from vllm.utils import in_wsl +from vllm.utils import in_wsl, measure_cuda_memory logger = init_logger(__name__) @@ -85,18 +85,21 @@ def __init__( self.model_config.enforce_eager = True def load_model(self) -> None: - self.model = get_model(self.model_config, - self.device_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config) + with measure_cuda_memory() as m: + self.model = get_model(self.model_config, + self.device_config, + lora_config=self.lora_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config) - vocab_size = self.model.config.vocab_size + self.model_memory_usage = m.consumed_memory + logger.info(f"Loading model weights took " + f"{self.model_memory_usage / float(2**30):.4f} GB") if self.lora_config: - assert hasattr( - self.model, "supported_lora_modules" - ) and self.model.supported_lora_modules, "Model does not support LoRA" + assert hasattr(self.model, "supported_lora_modules" + ) and self.model.supported_lora_modules, ( + "Model does not support LoRA") assert hasattr( self.model, "embedding_modules"), "Model does not have embedding_modules" @@ -105,7 +108,7 @@ def load_model(self) -> None: self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens + - self.scheduler_config.max_paddings, vocab_size, + self.scheduler_config.max_paddings, self.vocab_size, self.lora_config, self.device, self.model.embedding_modules, self.model.embedding_padding_modules) self.model = self.lora_manager.create_lora_manager(self.model) @@ -209,6 +212,7 @@ def _prepare_prompt( slot_mapping[-1].append(slot) max_prompt_len = max(subquery_lens) + assert max_prompt_len > 0 input_tokens = _make_tensor_with_pad(input_tokens, max_prompt_len, pad=0, @@ -600,8 +604,7 @@ def execute_model( @torch.inference_mode() def profile_run(self) -> None: # Enable top-k sampling to reflect the accurate memory usage. 
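+        # Using a top_k close to the full vocab size makes the profiling run
+        # exercise the sampler's largest top-k buffers.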
- vocab_size = self.model_config.get_vocab_size() - sampling_params = SamplingParams(top_p=0.99, top_k=vocab_size - 1) + sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens max_num_seqs = self.scheduler_config.max_num_seqs @@ -767,6 +770,10 @@ def __del__(self) -> None: self.graph_runners.clear() self.cupy_nccl_backend = None + @property + def vocab_size(self) -> int: + return self.model_config.get_vocab_size() + class CUDAGraphRunner: diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 3229a21c11a38..340c079600c78 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -79,7 +79,8 @@ def profile_num_available_blocks( cpu_swap_space: int = 0, cache_dtype: str = "float16", ) -> Tuple[int, int]: - """Simply returns max_num_seqs as num_gpu_blocks, 0 as num_cpu_blocks.""" + """Simply returns max_num_seqs as num_gpu_blocks, 0 as + num_cpu_blocks.""" num_gpu_blocks = self.scheduler_config.max_num_seqs num_cpu_blocks = 0 return num_gpu_blocks, num_cpu_blocks @@ -177,7 +178,8 @@ def _init_distributed_environment( "distributed_init_method must be set if torch.distributed " "is not already initialized") else: - distributed_backend = distributed_backend if distributed_backend else "nccl" + distributed_backend = (distributed_backend + if distributed_backend else "nccl") torch.distributed.init_process_group( backend=distributed_backend, world_size=parallel_config.world_size, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 9df518d155ec2..0dcd4018afa5f 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -19,7 +19,6 @@ from vllm.worker.cache_engine import CacheEngine from vllm.worker.model_runner import ModelRunner from vllm.lora.request import LoRARequest -from vllm.utils import is_hip class Worker: @@ -131,8 +130,8 @@ def profile_num_available_blocks( # GPU did not change their memory usage during the profiling. peak_memory = self.init_gpu_memory - free_gpu_memory - cache_block_size = CacheEngine.get_cache_block_size( - block_size, cache_dtype, self.model_config, self.parallel_config) + cache_block_size = self.get_cache_block_size_bytes( + block_size, cache_dtype) num_gpu_blocks = int( (total_gpu_memory * gpu_memory_utilization - peak_memory) // cache_block_size) @@ -233,6 +232,22 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> Set[int]: return self.model_runner.list_loras() + @property + def max_model_len(self) -> int: + return self.model_config.max_model_len + + @property + def vocab_size(self) -> int: + return self.model_runner.vocab_size + + def get_cache_block_size_bytes(self, block_size: int, + cache_dtype: str) -> int: + """Get the size of the KV cache block size in bytes. + """ + return CacheEngine.get_cache_block_size(block_size, cache_dtype, + self.model_config, + self.parallel_config) + def init_distributed_environment( parallel_config: ParallelConfig, @@ -267,8 +282,7 @@ def init_distributed_environment( "cupy.distributed is already initialized but the cupy world " "size does not match parallel_config.world_size " f"({cupy_world_size} vs. {parallel_config.world_size}).") - elif (parallel_config.world_size > 1 and cupy_port is not None - and not is_hip()): + elif (parallel_config.world_size > 1 and cupy_port is not None): # NOTE(woosuk): We don't initialize CuPy process group when world size # is 1. # TODO(woosuk): Support multi-node connection.
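For readers skimming the diff, the padding/scatter pattern used by _merge_outputs earlier in this patch can be illustrated with a few lines of plain PyTorch. The names below mirror the diff; the batch size, proposal length, and vocab size are arbitrary toy values, not taken from the PR.

import torch

# Toy dimensions; only rows 0 and 2 of the batch produced draft tokens.
batch_size, k, vocab_size = 4, 3, 8
nonzero_proposal_len_indices = torch.tensor([0, 2])
proposal_tokens = torch.randint(vocab_size, (2, k))
proposal_probs = torch.rand(2, k, vocab_size)

# Skipped sequences get -1 tokens, zero probabilities, and proposal_len 0.
entire_tokens = torch.full((batch_size, k), -1, dtype=torch.long)
entire_probs = torch.zeros(batch_size, k, vocab_size)
entire_tokens[nonzero_proposal_len_indices] = proposal_tokens
entire_probs[nonzero_proposal_len_indices] = proposal_probs
proposal_lens = torch.zeros(batch_size, dtype=torch.long)
proposal_lens[nonzero_proposal_len_indices] = k

print(entire_tokens)   # rows 1 and 3 are all -1: no speculation for them
print(proposal_lens)   # tensor([3, 0, 3, 0])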
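Similarly, the block-split formula in split_num_cache_blocks_evenly can be sanity-checked in isolation. This is a minimal sketch: the function restates the one-line formula from the diff so the snippet runs without a vLLM install, and the byte sizes are made-up illustrative values.

def split_num_cache_blocks_evenly(scorer_block_bytes: int,
                                  proposer_block_bytes: int,
                                  total_num_gpu_blocks: int) -> int:
    # Same formula as in the diff: both models get the same block count, and
    # together they use no more memory than the scorer could have used alone.
    return int(total_num_gpu_blocks * scorer_block_bytes /
               (proposer_block_bytes + scorer_block_bytes))


# Illustrative numbers: target (scorer) blocks are 4x the size of draft blocks.
scorer_bytes, proposer_bytes, total_blocks = 4 * 2**20, 1 * 2**20, 1000
shared = split_num_cache_blocks_evenly(scorer_bytes, proposer_bytes,
                                       total_blocks)
print(shared)  # 800
assert shared * (scorer_bytes + proposer_bytes) <= total_blocks * scorer_bytes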