diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c65ab04b8ddda..42a1eacb6de57 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -13,6 +13,9 @@ steps: - label: Basic Correctness Test command: pytest -v -s --forked basic_correctness + +- label: Core Test + command: pytest -v -s core - label: Distributed Comm Ops Test command: pytest -v -s --forked test_comm_ops.py @@ -25,7 +28,7 @@ steps: num_gpus: 2 # only support 1 or 2 for now. - label: Engine Test - command: pytest -v -s engine + command: pytest -v -s engine test_sequence.py - label: Entrypoints Test command: pytest -v -s entrypoints @@ -49,6 +52,9 @@ steps: - label: Worker Test command: pytest -v -s worker +- label: Speculative decoding tests + command: pytest -v -s spec_decode + - label: LoRA Test command: pytest -v -s lora --forked diff --git a/.github/workflows/remote-push.yml b/.github/workflows/remote-push.yml index 5bad8eab365a7..03aeabf5278d7 100644 --- a/.github/workflows/remote-push.yml +++ b/.github/workflows/remote-push.yml @@ -21,7 +21,7 @@ jobs: uses: ./.github/workflows/build-test.yml with: label: aws-avx2-192G-4-a10g-96G - timeout: 180 + timeout: 240 gitref: '${{ github.ref }}' Gi_per_thread: 4 python: ${{ matrix.python }} diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 54ae06be6e101..a45265d79a6ac 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -23,6 +23,9 @@ RUN echo "FA_BRANCH is $FA_BRANCH" # In that case, we need to use the python reference attention implementation in vllm ARG BUILD_FA="1" +# whether to build cupy on rocm +ARG BUILD_CUPY="1" + # Install some basic utilities RUN apt-get update && apt-get install python3 python3-pip -y @@ -70,16 +73,33 @@ RUN if [ "$BUILD_FA" = "1" ]; then \ && cd ..; \ fi -COPY ./ /app/vllm - -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install xformers==0.0.23 --no-deps - # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. # Manually removed it so that later steps of numpy upgrade can continue RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi +# build cupy +RUN if [ "$BUILD_CUPY" = "1" ]; then \ + mkdir -p libs \ + && cd libs \ + && git clone -b hipgraph_enablement --recursive https://github.com/ROCm/cupy.git \ + && cd cupy \ + && pip install mpi4py-mpich \ + && pip install scipy==1.9.3 \ + && pip install cython==0.29.* \ + && env CC=$MPI_HOME/bin/mpicc python -m pip install mpi4py \ + && export CUPY_INSTALL_USE_HIP=1 \ + && export ROCM_HOME=/opt/rocm \ + && export HCC_AMDGPU_TARGET="gfx90a,gfx942,gfx1100" \ + && pip install . \ + && cd ..; \ + fi + +COPY ./ /app/vllm + +RUN python3 -m pip install --upgrade pip +RUN python3 -m pip install xformers==0.0.23 --no-deps + RUN cd /app \ && cd vllm \ && pip install -U -r requirements-rocm.txt \ diff --git a/README.md b/README.md index 22304eb48f4f8..9e4d1e7879216 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ pip install -e . ## Quickstart -Neural Magic maintains a variety of sparse models on our Hugging Face organization profiles, [neuralmagic](https://huggingface.co/neuralmagic) and [nm-testing](https://huggingface.co/nm-testing). +Neural Magic maintains a variety of sparse models on our Hugging Face organization profiles, [neuralmagic](https://huggingface.co/neuralmagic) and [nm-testing](https://huggingface.co/nm-testing). 
A collection of ready-to-use SparseGPT and GPTQ models in inference optimized marlin format are [available on Hugging Face](https://huggingface.co/collections/neuralmagic/compressed-llms-for-nm-vllm-65e73e3d51d3200e34b77431) @@ -63,7 +63,7 @@ For a quick demonstration, here's how to run a small [50% sparse llama2-110M](ht from vllm import LLM, SamplingParams model = LLM( - "neuralmagic/llama2.c-stories110M-pruned50", + "neuralmagic/llama2.c-stories110M-pruned50", sparsity="sparse_w16a16", # If left off, model will be loaded as dense ) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 11e1b6a04e093..26d2c24d5655c 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -277,10 +277,80 @@ async def async_request_openai_completions( return output +async def async_request_openai_chat_completions( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith( + "v1/chat/completions" + ), "OpenAI Chat API URL must end with 'v1/chat/completions'." + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + assert not request_func_input.use_beam_search + payload = { + "model": request_func_input.model, + "messages": [ + { + "role": "user", + "content": request_func_input.prompt, + }, + ], + "temperature": 0.0, + "max_tokens": request_func_input.output_len, + "stream": True, + } + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0 + st = time.perf_counter() + try: + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: + async for chunk in response.content: + if ttft == 0: + ttft = time.perf_counter() - st + output.ttft = ttft + + chunk = chunk.strip() + if not chunk: + continue + + chunk = chunk.decode("utf-8").lstrip("data: ") + if chunk == "[DONE]": + latency = time.perf_counter() - st + else: + body = json.loads(chunk) + if "content" in body["choices"][0]["delta"]: + generated_text += body["choices"][0]["delta"][ + "content"] + + output.generated_text = generated_text + output.success = True + output.latency = latency + else: + output.success = False + except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError): + output.success = False + + if pbar: + pbar.update(1) + return output + + ASYNC_REQUEST_FUNCS = { "tgi": async_request_tgi, "vllm": async_request_vllm, "deepspeed-mii": async_request_deepspeed_mii, "openai": async_request_openai_completions, + "openai-chat": async_request_openai_chat_completions, "tensorrt-llm": async_request_trt_llm, } diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 7d389a9c7d703..3f5e2d9c8f4dc 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -12,7 +12,7 @@ On the client side, run: python benchmarks/benchmark_serving.py \ --backend \ - --tokenizer --dataset \ + --model --dataset \ --request-rate """ import argparse @@ -171,10 +171,10 @@ async def benchmark( else: raise ValueError(f"Unknown backend: {backend}") - pbar = None if disable_tqdm else tqdm(total=len(input_requests)) - print(f"Traffic request rate: {request_rate}") + pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + benchmark_start_time = time.perf_counter() tasks = [] async for request 
in get_request(input_requests, request_rate): diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index b5be3befa07e2..5e61668d5cc1a 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -15,9 +15,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifdef USE_ROCM -#include -#endif #include #include @@ -31,11 +28,6 @@ #include -#ifndef USE_ROCM -#define WARP_SIZE 32 -#else -#define WARP_SIZE warpSize -#endif #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) diff --git a/csrc/cuda_compat.h b/csrc/cuda_compat.h index aa58dd73c148a..c711d8d1b24b9 100644 --- a/csrc/cuda_compat.h +++ b/csrc/cuda_compat.h @@ -1,5 +1,15 @@ #pragma once +#ifdef USE_ROCM +#include +#endif + +#ifndef USE_ROCM + #define WARP_SIZE 32 +#else + #define WARP_SIZE warpSize +#endif + #ifndef USE_ROCM #define VLLM_LDG(arg) __ldg(arg) #else diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index d5fee9c40d00c..4dc90de1ab42a 100644 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -14,6 +14,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 128) \ f(in_T, out_T, W_T, narrow, 256) \ f(in_T, out_T, W_T, narrow, 512) \ + f(in_T, out_T, W_T, narrow, 768) \ f(in_T, out_T, W_T, narrow, 1024) \ f(in_T, out_T, W_T, narrow, 1280) \ f(in_T, out_T, W_T, narrow, 1728) \ @@ -21,6 +22,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 2048) \ f(in_T, out_T, W_T, narrow, 2560) \ f(in_T, out_T, W_T, narrow, 2752) \ + f(in_T, out_T, W_T, narrow, 2816) \ f(in_T, out_T, W_T, narrow, 3072) \ f(in_T, out_T, W_T, narrow, 3456) \ f(in_T, out_T, W_T, narrow, 3584) \ @@ -36,6 +38,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 10240) \ f(in_T, out_T, W_T, narrow, 11008) \ f(in_T, out_T, W_T, narrow, 12288) \ + f(in_T, out_T, W_T, narrow, 13696) \ f(in_T, out_T, W_T, narrow, 13824) \ f(in_T, out_T, W_T, narrow, 14336) \ f(in_T, out_T, W_T, narrow, 16384) \ diff --git a/csrc/reduction_utils.cuh b/csrc/reduction_utils.cuh index b95ccef162072..210bf0b023ab2 100644 --- a/csrc/reduction_utils.cuh +++ b/csrc/reduction_utils.cuh @@ -24,7 +24,7 @@ namespace vllm { template __inline__ __device__ T warpReduceSum(T val) { #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) + for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) val += VLLM_SHFL_XOR_SYNC(val, mask); return val; } @@ -32,7 +32,7 @@ __inline__ __device__ T warpReduceSum(T val) { /* Calculate the sum of all elements in a block */ template __inline__ __device__ T blockReduceSum(T val) { - static __shared__ T shared[32]; + static __shared__ T shared[WARP_SIZE]; int lane = threadIdx.x & 0x1f; int wid = threadIdx.x >> 5; @@ -45,7 +45,7 @@ __inline__ __device__ T blockReduceSum(T val) { // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent // blockDim.x is not divided by 32 - val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f); + val = (threadIdx.x < (blockDim.x / (WARP_SIZE * 1.0f))) ? shared[lane] : (T)(0.0f); val = warpReduceSum(val); return val; } diff --git a/docs/source/conf.py b/docs/source/conf.py index 5a45c6f9d1e0a..61d24e1612128 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -72,8 +72,15 @@ # Mock out external dependencies here. 
autodoc_mock_imports = [ - "torch", "transformers", "psutil", "prometheus_client", "sentencepiece", - "vllm.cuda_utils", "vllm._C" + "torch", + "transformers", + "psutil", + "prometheus_client", + "sentencepiece", + "vllm.cuda_utils", + "vllm._C", + "numpy", + "tqdm", ] for mock_target in autodoc_mock_imports: diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst index 21b18c75fc552..71211189d184f 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/models/lora.rst @@ -90,9 +90,9 @@ Requests can specify the LoRA adapter as if it were any other model via the ``mo processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other LoRA adapter requests if they were provided and ``max_loras`` is set high enough). -The following is an example request +The following is an example request -.. code-block::bash +.. code-block::bash curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 9d4ec663a16e5..3b30dfaf3b47b 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -8,84 +8,109 @@ The following is the list of model architectures that are currently supported by Alongside each architecture, we include some popular models that use it. .. list-table:: - :widths: 25 25 50 + :widths: 25 25 50 5 :header-rows: 1 * - Architecture - Models - Example HuggingFace Models + - :ref:`LoRA ` * - :code:`AquilaForCausalLM` - Aquila - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. + - ✅︎ * - :code:`BaiChuanForCausalLM` - Baichuan - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc. + - * - :code:`ChatGLMModel` - ChatGLM - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. + - * - :code:`DeciLMForCausalLM` - DeciLM - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc. + - * - :code:`BloomForCausalLM` - BLOOM, BLOOMZ, BLOOMChat - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. + - * - :code:`FalconForCausalLM` - Falcon - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. + - * - :code:`GemmaForCausalLM` - Gemma - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. + - ✅︎ * - :code:`GPT2LMHeadModel` - GPT-2 - :code:`gpt2`, :code:`gpt2-xl`, etc. + - * - :code:`GPTBigCodeForCausalLM` - StarCoder, SantaCoder, WizardCoder - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. + - * - :code:`GPTJForCausalLM` - GPT-J - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. + - * - :code:`GPTNeoXForCausalLM` - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. + - * - :code:`InternLMForCausalLM` - InternLM - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. + - ✅︎ * - :code:`InternLM2ForCausalLM` - InternLM2 - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc. + - * - :code:`LlamaForCausalLM` - LLaMA, LLaMA-2, Vicuna, Alpaca, Yi - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc. 
+ - ✅︎ * - :code:`MistralForCausalLM` - Mistral, Mistral-Instruct - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc. + - ✅︎ * - :code:`MixtralForCausalLM` - Mixtral-8x7B, Mixtral-8x7B-Instruct - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, etc. + - ✅︎ * - :code:`MPTForCausalLM` - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. + - * - :code:`OLMoForCausalLM` - OLMo - :code:`allenai/OLMo-1B`, :code:`allenai/OLMo-7B`, etc. + - * - :code:`OPTForCausalLM` - OPT, OPT-IML - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. + - * - :code:`OrionForCausalLM` - Orion - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc. + - * - :code:`PhiForCausalLM` - Phi - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc. + - * - :code:`QWenLMHeadModel` - Qwen - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. + - * - :code:`Qwen2ForCausalLM` - Qwen2 - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc. + - ✅︎ * - :code:`StableLmForCausalLM` - StableLM - :code:`stabilityai/stablelm-3b-4e1t/` , :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc. + - If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. Otherwise, please refer to :ref:`Adding a New Model ` for instructions on how to implement support for your model. diff --git a/pyproject.toml b/pyproject.toml index c5db016cebdb7..f74e50265be24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,10 @@ requires = [ ] build-backend = "setuptools.build_meta" +[tool.ruff] +# Allow lines to be as long as 80. +line-length = 80 + [tool.ruff.lint] select = [ # pycodestyle diff --git a/requirements-dev.txt b/requirements-dev.txt index c5fc6f7f07993..18de4b7420de2 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -23,3 +23,7 @@ einops # required for MPT openai requests ray +peft + +# Benchmarking +aiohttp diff --git a/setup.py b/setup.py index 84c452225a0dc..af78769695811 100644 --- a/setup.py +++ b/setup.py @@ -144,8 +144,8 @@ def get_pytorch_rocm_arch() -> Set[str]: # If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator if env_arch_list is None: command = "rocm_agent_enumerator" - env_arch_list = subprocess.check_output([command]).decode('utf-8')\ - .strip().replace("\n", ";") + env_arch_list = (subprocess.check_output( + [command]).decode('utf-8').strip().replace("\n", ";")) arch_source_str = "rocm_agent_enumerator" else: arch_source_str = "PYTORCH_ROCM_ARCH env variable" diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 1edb19c550010..cb125a7bfec30 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -25,12 +25,8 @@ async def step_async(self): return [RequestOutput( request_id=self.request_id)] if self.request_id else [] - async def encode_request_async( - self, - *args, - **kwargs, - ): - return [1] + async def encode_request_async(self, *args, **kwargs): + pass def generate(self, request_id): self.request_id = request_id @@ -43,13 +39,16 @@ def add_request(self, **kwargs): self.add_request_calls += 1 async def add_request_async(self, **kwargs): - del kwargs # Unused self.add_request_calls += 1 + return def abort_request(self, request_id): del request_id # Unused self.abort_request_calls += 1 + def has_unfinished_requests(self): 
+ return self.request_id is not None + class MockAsyncLLMEngine(AsyncLLMEngine): @@ -72,20 +71,24 @@ async def test_new_requests_event(): await engine.add_request("2", "", None) engine.engine.generate("2") await asyncio.sleep(0) - assert engine.engine.add_request_calls == 2 - assert engine.engine.step_calls == 2 await asyncio.sleep(0) - assert engine.engine.step_calls == 3 + assert engine.engine.add_request_calls == 2 + assert engine.engine.step_calls >= 2 + await asyncio.sleep(0.001) + assert engine.engine.step_calls >= 3 engine.engine.stop_generating() - await asyncio.sleep(0) - assert engine.engine.step_calls == 4 - await asyncio.sleep(0) - assert engine.engine.step_calls == 4 + await asyncio.sleep(0.001) + old_step_calls = engine.engine.step_calls + await asyncio.sleep(0.001) + assert engine.engine.step_calls == old_step_calls await engine.add_request("3", "", None) await asyncio.sleep(0.01) assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == 5 + assert engine.engine.step_calls == old_step_calls + 1 await asyncio.sleep(0.01) assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == 5 + assert engine.engine.step_calls == old_step_calls + 1 + + engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True) + assert engine.get_tokenizer() is not None diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py index 32d110e0f0b47..e98bba8d43b49 100644 --- a/tests/async_engine/test_chat_template.py +++ b/tests/async_engine/test_chat_template.py @@ -73,7 +73,7 @@ def test_load_chat_template(): assert template_content is not None # Hard coded value for template_chatml.jinja assert template_content == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %} -{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" +{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" # noqa: E501 def test_no_load_chat_template(): @@ -117,4 +117,6 @@ async def test_get_gen_prompt(model, template, add_generation_prompt, add_generation_prompt=mock_request.add_generation_prompt) # Test assertion - assert result == expected_output, f"The generated prompt does not match the expected output for model {model} and template {template}" + assert result == expected_output, ( + f"The generated prompt does not match the expected output for " + f"model {model} and template {template}") diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py index 4043558bae919..7b1f4a9e1eb2f 100644 --- a/tests/async_engine/test_request_tracker.py +++ b/tests/async_engine/test_request_tracker.py @@ -4,25 +4,14 @@ from vllm.outputs import RequestOutput -class DummyEvent: - - def __init__(self): - self.flag = False - - def set(self): - self.flag = True - - def clear(self): - self.flag = False - - -def test_request_tracker(): +@pytest.mark.asyncio +async def test_request_tracker(): tracker = RequestTracker() - tracker.new_requests_event = DummyEvent() stream_1 = tracker.add_request("1") - assert tracker.new_requests_event.flag + assert tracker.new_requests_event.is_set() + await tracker.wait_for_new_requests() new, finished = tracker.get_new_and_finished_requests() - assert not tracker.new_requests_event.flag + assert not 
tracker.new_requests_event.is_set() assert len(new) == 1 assert new[0]["request_id"] == "1" assert not finished @@ -30,9 +19,10 @@ def test_request_tracker(): stream_2 = tracker.add_request("2") stream_3 = tracker.add_request("3") - assert tracker.new_requests_event.flag + assert tracker.new_requests_event.is_set() + await tracker.wait_for_new_requests() new, finished = tracker.get_new_and_finished_requests() - assert not tracker.new_requests_event.flag + assert not tracker.new_requests_event.is_set() assert len(new) == 2 assert new[0]["request_id"] == "2" assert new[1]["request_id"] == "3" @@ -43,7 +33,7 @@ def test_request_tracker(): # request_ids must be unique with pytest.raises(KeyError): tracker.add_request("1") - assert not tracker.new_requests_event.flag + assert not tracker.new_requests_event.is_set() tracker.abort_request("1") new, finished = tracker.get_new_and_finished_requests() @@ -54,7 +44,8 @@ def test_request_tracker(): stream_4 = tracker.add_request("4") tracker.abort_request("4") - assert tracker.new_requests_event.flag + assert tracker.new_requests_event.is_set() + await tracker.wait_for_new_requests() new, finished = tracker.get_new_and_finished_requests() assert len(finished) == 1 assert "4" in finished @@ -62,11 +53,12 @@ def test_request_tracker(): assert stream_4.finished stream_5 = tracker.add_request("5") - assert tracker.new_requests_event.flag + assert tracker.new_requests_event.is_set() tracker.process_request_output( - RequestOutput("2", "output", [], [], [], bool(finished))) + RequestOutput("2", "output", [], [], [], finished=True)) + await tracker.wait_for_new_requests() new, finished = tracker.get_new_and_finished_requests() - assert not tracker.new_requests_event.flag + assert not tracker.new_requests_event.is_set() assert len(finished) == 1 assert "2" in finished assert len(new) == 1 diff --git a/tests/worker/spec_decode/__init__.py b/tests/core/__init__.py similarity index 100% rename from tests/worker/spec_decode/__init__.py rename to tests/core/__init__.py diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py new file mode 100644 index 0000000000000..b280fd1d73c2f --- /dev/null +++ b/tests/core/test_block_manager.py @@ -0,0 +1,276 @@ +import pytest +import time +from typing import List + +from vllm import SamplingParams +from vllm.block import PhysicalTokenBlock +from vllm.core.block_manager import (BlockAllocator, BlockSpaceManager, + AllocStatus) +from vllm.utils import Device +from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob + +from .utils import create_dummy_prompt + + +def test_block_allocator_allocate(): + block_size = 4 + num_cpu_blocks = 4 + cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks) + + # Allocate all available cpu blocks. + num_free = num_cpu_blocks + assert cpu_allocator.get_num_free_blocks() == num_free + for _ in range(num_cpu_blocks): + block = cpu_allocator.allocate() + num_free -= 1 + + assert block.block_hash not in cpu_allocator.evictor + assert cpu_allocator.get_num_free_blocks() == num_free + + with pytest.raises(ValueError): + cpu_allocator.allocate() + + +def test_block_allocator_free(): + block_size = 4 + num_cpu_blocks = 4 + cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks) + + # Allocate all available cpu blocks. 
+ blocks: List[PhysicalTokenBlock] = [] + for _ in range(num_cpu_blocks): + block = cpu_allocator.allocate() + blocks.append(block) + assert block.block_hash not in cpu_allocator.evictor + + # Free all allocated cpu blocks. + num_free = 0 + assert cpu_allocator.get_num_free_blocks() == num_free + for block in blocks: + cpu_allocator.free(block) + num_free += 1 + assert block.block_hash in cpu_allocator.evictor + assert cpu_allocator.get_num_free_blocks() == num_free + + with pytest.raises(ValueError): + cpu_allocator.free(block) + + +def test_allocate(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate same sequence group to all available gpu blocks. + for i in range(num_gpu_blocks): + _, seq_group = create_dummy_prompt(str(i), block_size) + assert block_manager.can_allocate(seq_group) + block_manager.allocate(seq_group) + assert block_manager.can_allocate(seq_group) != AllocStatus.OK + + # Allocate same sequence group to all available gpu blocks. + # Use watermark to reserve one gpu block. + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=1 / num_gpu_blocks) + for i in range(num_gpu_blocks - 1): + _, seq_group = create_dummy_prompt(str(i), block_size) + assert block_manager.can_allocate(seq_group) + block_manager.allocate(seq_group) + assert block_manager.can_allocate(seq_group) != AllocStatus.OK + + +def test_append_slot_single_seq(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate single seq to gpu block. + prompt, seq_group = create_dummy_prompt("1", block_size) + block_manager.allocate(seq_group) + + # Nothing to append. Sequence has no new logical blocks. + assert block_manager.can_append_slot(seq_group) + before_blocks = block_manager.get_num_free_gpu_blocks() + assert not block_manager.append_slot(prompt) + after_blocks = block_manager.get_num_free_gpu_blocks() + assert before_blocks == after_blocks + + # Add block_size number of new tokens and append slot. + for i in range(block_size): + token_id = i + 5 + prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) + + assert block_manager.can_append_slot(seq_group) + before_blocks = block_manager.get_num_free_gpu_blocks() + assert not block_manager.append_slot(prompt) + after_blocks = block_manager.get_num_free_gpu_blocks() + assert before_blocks - after_blocks == 1 + + +def test_append_slot_cow(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size=block_size, + num_cpu_blocks=num_cpu_blocks, + num_gpu_blocks=num_gpu_blocks, + watermark=0) + + # Allocate prompt to gpu block. There is one slot left in the block. + prompt = Sequence(seq_id=1, + prompt="one two three", + prompt_token_ids=[1, 2, 3], + block_size=block_size) + + # Fork the sequence, such that a COW will be required when we append a new + # token id. + child = prompt.fork(new_seq_id=2) + + # Allocate space for the sequence group. + seq_group = SequenceGroup("1", [prompt, child], SamplingParams(), + time.time(), time.perf_counter) + block_manager.allocate(seq_group) + + # Fork and append a new token id. We expect a COW to be scheduled. 
+ token_id = 4 + child.append_token_id(token_id, {token_id: Logprob(0.0)}) + block_manager.fork(prompt, child) + + assert block_manager.can_append_slot(seq_group) + before_blocks = block_manager.get_num_free_gpu_blocks() + + maybe_src_dst_block = block_manager.append_slot(child) + assert maybe_src_dst_block is not None + src_block, dst_block = maybe_src_dst_block + assert src_block != dst_block + + after_blocks = block_manager.get_num_free_gpu_blocks() + assert before_blocks - after_blocks == 1 + + +def test_fork(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + prompt, seq_group = create_dummy_prompt("1", + block_size - 1, + block_size=block_size) + block_manager.allocate(seq_group) + + # Fork prompt and copy block tables. + child = prompt.fork(2) + block_manager.fork(prompt, child) + assert block_manager.get_block_table( + prompt) == block_manager.get_block_table(child) + token_id = 4 + # Append token to child. Block is shared so copy on write occurs. + child.append_token_id(token_id, {token_id: Logprob(0.0)}) + block_manager.append_slot(child) + assert block_manager.get_block_table( + prompt) != block_manager.get_block_table(child) + + +def test_swap(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1) + prompt.status = SequenceStatus.WAITING + block_manager.allocate(seq_group) + + # Emulate a forward pass by appending a single token. + # The block manager then knows how many unprocessed + # tokens will be written in the next forward pass. + token_id = 0 + prompt.status = SequenceStatus.RUNNING + prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) + + # Swap seq group from GPU -> CPU. + gpu_blocks = block_manager.get_block_table(prompt) + assert block_manager.can_swap_out(seq_group) + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_out(seq_group) + assert list(mapping.keys()) == gpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks) + assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks + prompt.status = SequenceStatus.SWAPPED + + # Swap seq group from CPU -> GPU. + cpu_blocks = block_manager.get_block_table(prompt) + assert block_manager.can_swap_in(seq_group) + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_in(seq_group) + assert list(mapping.keys()) == cpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks + assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) + + +def test_free(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + prompt, seq_group = create_dummy_prompt("1", block_size) + block_manager.allocate(seq_group) + + # Free allocated seq. 
+ prompt_blocks = len(block_manager.get_block_table(prompt)) + before_blocks = block_manager.get_num_free_gpu_blocks() + block_manager.free(prompt) + after_blocks = block_manager.get_num_free_gpu_blocks() + assert after_blocks == before_blocks + prompt_blocks + + # Block table for freed seq is deleted. + with pytest.raises(KeyError): + block_manager.get_block_table(prompt) + + +def test_reset(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate same seq group on all available gpu blocks. + original_blocks = block_manager.get_num_free_gpu_blocks() + for i in range(num_gpu_blocks): + _, seq_group = create_dummy_prompt(str(i), block_size) + block_manager.allocate(seq_group) + assert block_manager.get_num_free_gpu_blocks() == 0 + + # Resetting block manager frees all allocated blocks. + block_manager.reset() + assert block_manager.get_num_free_gpu_blocks() == original_blocks diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py new file mode 100644 index 0000000000000..ebfeb8ba04812 --- /dev/null +++ b/tests/core/test_scheduler.py @@ -0,0 +1,170 @@ +from typing import List +import pytest # noqa + +from vllm.config import CacheConfig, SchedulerConfig +from vllm.core.scheduler import Scheduler +from vllm.sequence import SequenceGroup, Logprob + +from .utils import create_dummy_prompt + + +def test_scheduler_add_seq_group(): + block_size = 4 + scheduler_config = SchedulerConfig(100, 64, 1, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 4 + cache_config.num_gpu_blocks = 4 + scheduler = Scheduler(scheduler_config, cache_config, None) + + # Add seq group to scheduler. + num_seq_group = 4 + for i in range(num_seq_group): + _, seq_group = create_dummy_prompt(str(i), block_size) + scheduler.add_seq_group(seq_group) + assert scheduler.get_num_unfinished_seq_groups() == i + 1 + + +def test_scheduler_abort_seq_group(): + block_size = 4 + scheduler_config = SchedulerConfig(100, 64, 1, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 4 + cache_config.num_gpu_blocks = 4 + scheduler = Scheduler(scheduler_config, cache_config, None) + + # Add multiple seq groups to scheduler. + num_seq_group = 4 + request_ids = set() + for i in range(num_seq_group): + _, seq_group = create_dummy_prompt(str(i), block_size) + scheduler.add_seq_group(seq_group) + request_ids.add(str(i)) + + # Abort all added seq groups. + assert scheduler.get_num_unfinished_seq_groups() == num_seq_group + scheduler.abort_seq_group(request_ids) + assert scheduler.get_num_unfinished_seq_groups() == 0 + + +def test_scheduler_schedule_simple(): + block_size = 4 + num_seq_group = 4 + max_model_len = 16 + scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 8 + cache_config.num_gpu_blocks = 8 + scheduler = Scheduler(scheduler_config, cache_config, None) + + # Add seq groups to scheduler. + running: List[SequenceGroup] = [] + for i in range(num_seq_group): + _, seq_group = create_dummy_prompt(str(i), prompt_length=block_size) + scheduler.add_seq_group(seq_group) + running.append(seq_group) + + # Schedule seq groups prompts. 
+ seq_group_meta, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set(running) + assert out.num_batched_tokens == num_seq_group * seq_group.get_seqs( + )[0].get_len() + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == num_seq_group + + # Schedule seq groups generation. + seq_group_meta, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set(running) + assert out.num_batched_tokens == num_seq_group + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == num_seq_group + + +def test_scheduler_schedule_preempt_abort(): + block_size = 4 + max_model_len = 16 + scheduler_config = SchedulerConfig(64, 2, max_model_len, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 2 + cache_config.num_gpu_blocks = 2 + scheduler = Scheduler(scheduler_config, cache_config, None) + + # Add seq groups to scheduler. + seq_a, seq_group_a = create_dummy_prompt("1", block_size) + seq_b, seq_group_b = create_dummy_prompt("2", block_size) + scheduler.add_seq_group(seq_group_a) + scheduler.add_seq_group(seq_group_b) + + # Schedule seq groups prompts. + seq_group_meta, out = scheduler.schedule() + assert out.scheduled_seq_groups == [seq_group_a, seq_group_b] + assert out.num_batched_tokens == seq_group_a.get_seqs()[0].get_len() * 2 + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == 2 + assert scheduler.get_num_unfinished_seq_groups() == 2 + + # Append "generated" tokens, allowing the sequence to mark prompt tokens as + # processed. + token_id = 0 + seq_a.append_token_id(token_id, {token_id: Logprob(0.0)}) + seq_b.append_token_id(token_id, {token_id: Logprob(0.0)}) + + # Schedule seq groups generation and preempt seq group b. + seq_group_meta, out = scheduler.schedule() + assert out.scheduled_seq_groups == [seq_group_a] + assert out.num_batched_tokens == 1 + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == 1 + assert scheduler.get_num_unfinished_seq_groups() == 2 + + # Abort seq group a. Re-schedule seq group b prompt with recomputation. + scheduler.abort_seq_group("1") + seq_group_meta, out = scheduler.schedule() + assert out.scheduled_seq_groups == [seq_group_b] + assert out.num_batched_tokens == seq_group_b.get_seqs()[0].get_len() + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == 1 + assert scheduler.get_num_unfinished_seq_groups() == 1 + + +def test_scheduler_max_seqs(): + block_size = 4 + num_seq_group = 4 + max_seq_group = 2 + max_model_len = 16 + scheduler_config = SchedulerConfig(64, max_seq_group, max_model_len, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 8 + cache_config.num_gpu_blocks = 8 + scheduler = Scheduler(scheduler_config, cache_config, None) + + all_seq_groups: List[SequenceGroup] = [] + # Add seq groups to scheduler. + for i in range(num_seq_group): + _, seq_group = create_dummy_prompt(str(i), prompt_length=block_size) + all_seq_groups.append(seq_group) + + # Append 1 seq group + scheduler.add_seq_group(all_seq_groups[0]) + + # Schedule seq groups prompts. + _, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set([all_seq_groups[0]]) + + # Schedule seq groups generation. 
+ _, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set([all_seq_groups[0]]) + + # Append 2 more seq group + scheduler.add_seq_group(all_seq_groups[1]) + scheduler.add_seq_group(all_seq_groups[2]) + + # Schedule seq groups prompts. + # Only 1 seq group should be scheduled since max_seq_group is 2 + # and one is prompting. + _, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set([all_seq_groups[1]]) diff --git a/tests/core/utils.py b/tests/core/utils.py new file mode 100644 index 0000000000000..6469789e89386 --- /dev/null +++ b/tests/core/utils.py @@ -0,0 +1,27 @@ +import time +from typing import Tuple + +from vllm import SamplingParams +from vllm.sequence import Sequence, SequenceGroup + + +def create_dummy_prompt( + request_id: str, + prompt_length: int, + block_size: int = None) -> Tuple[Sequence, SequenceGroup]: + if not block_size: + block_size = prompt_length + + # Create dummy prompt sequence with tokens 0...block_size-1 + # and prompt "0 ... block_size". + prompt_tokens = list(range(prompt_length)) + prompt_str = " ".join([str(t) for t in prompt_tokens]) + prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size) + seq_group = SequenceGroup(request_id, [prompt], SamplingParams(), + time.time(), None) + + return prompt, seq_group + + +def round_up_to_next_block(seq_len: int, block_size: int) -> int: + return (seq_len + block_size - 1) // block_size diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py new file mode 100644 index 0000000000000..ed35212cc3f11 --- /dev/null +++ b/tests/engine/test_computed_prefix_blocks.py @@ -0,0 +1,34 @@ +import pytest + +from vllm.engine.arg_utils import EngineArgs +from vllm.engine.llm_engine import LLMEngine +from vllm.sampling_params import SamplingParams + + +@pytest.mark.parametrize("model", ["facebook/opt-125m"]) +@pytest.mark.parametrize("block_size", [16]) +def test_computed_prefix_blocks(model: str, block_size: int): + # This test checks if we are able to run the engine to completion + # without triggering asserts. + # We are in a scenario where all blocks from the second request's prompt + # are full and already computed when the second request arrives. + prompt = ( + "You are a helpful assistant. How do I build a car from cardboard and " + "paper clips? 
Is there an easy to follow video tutorial available " + "online for free?") + prompt2 = ( + " Please recommend to me some resources where I can learn not only to " + "handle technical difficulties of building a car, but also " + "decoration.") + + engine_args = EngineArgs(model=model, + block_size=block_size, + enable_prefix_caching=True) + + engine = LLMEngine.from_engine_args(engine_args) + sampling_params = SamplingParams() + + engine.add_request("0", prompt + prompt2, sampling_params) + engine.step() + engine.add_request("1", prompt, sampling_params) + engine.step() diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py index 5b39269916f8b..4a0e3e759e25a 100644 --- a/tests/entrypoints/test_guided_processors.py +++ b/tests/entrypoints/test_guided_processors.py @@ -46,8 +46,8 @@ "required": ["name", "age", "skills", "work history"] } -TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \ - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" +TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") def test_guided_logits_processors(): diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 35e83e1c70cc3..4a2b89befd93f 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -7,9 +7,12 @@ import sys import pytest import requests -import ray # using Ray for overall ease of process management, parallel requests, and debugging. +# using Ray for overall ease of process management, parallel requests, +# and debugging. +import ray import openai # use the official client for correctness check -from huggingface_hub import snapshot_download # downloading lora to test lora requests +# downloading lora to test lora requests +from huggingface_hub import snapshot_download # imports for guided decoding tests import json @@ -19,8 +22,11 @@ from vllm.transformers_utils.tokenizer import get_tokenizer MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # any model with a chat template should work here -LORA_NAME = "typeof/zephyr-7b-beta-lora" # technically this needs Mistral-7B-v0.1 as base, but we're not testing generation quality here +# any model with a chat template should work here +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +# technically this needs Mistral-7B-v0.1 as base, but we're not testing +# generation quality here +LORA_NAME = "typeof/zephyr-7b-beta-lora" TEST_SCHEMA = { "type": "object", @@ -61,8 +67,8 @@ "required": ["name", "age", "skills", "work history"] } -TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \ - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" +TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") TEST_CHOICE = [ "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", @@ -122,8 +128,9 @@ def server(zephyr_lora_files): server_runner = ServerRunner.remote([ "--model", MODEL_NAME, + # use half precision for speed and memory savings in CI environment "--dtype", - "bfloat16", # use half precision for speed and memory savings in CI environment + "bfloat16", "--max-model-len", "8192", "--enforce-eager", @@ -394,7 +401,8 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but not necessary for official client. 
+ # NOTE: this has to be true for n > 1 in vLLM, but not necessary + # for official client. use_beam_search=True), ) assert len(batch.choices) == 4 @@ -471,8 +479,8 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI): async def test_guided_json_completion(server, client: openai.AsyncOpenAI): completion = await client.completions.create( model=MODEL_NAME, - prompt= - f"Give an example JSON for an employee profile that fits this schema: {TEST_SCHEMA}", + prompt=f"Give an example JSON for an employee profile " + f"that fits this schema: {TEST_SCHEMA}", n=3, temperature=1.0, max_tokens=500, @@ -491,9 +499,11 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI): "role": "system", "content": "you are a helpful assistant" }, { - "role": "user", - "content": "Give an example JSON for an employee profile that " + \ - f"fits this schema: {TEST_SCHEMA}" + "role": + "user", + "content": + f"Give an example JSON for an employee profile that " + f"fits this schema: {TEST_SCHEMA}" }] chat_completion = await client.chat.completions.create( model=MODEL_NAME, diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index c402fe3e98c7f..6165225d2d819 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -57,7 +57,8 @@ def test_fused_moe( [torch.float32, torch.float16, torch.bfloat16]) @torch.inference_mode() def test_mixtral_moe(dtype: torch.dtype): - "Make sure our Mixtral MoE implementation agrees with the one from huggingface." + """Make sure our Mixtral MoE implementation agrees with the one from + huggingface.""" # Instantiate our and huggingface's MoE blocks config = MixtralConfig() diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 97753d1b659c5..204cc325f7da8 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -3,7 +3,7 @@ import time import torch -from vllm.model_executor.layers.triton_kernel.prefix_prefill import ( +from vllm.model_executor.layers.attention.ops.prefix_prefill import ( context_attention_fwd) from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask @@ -117,7 +117,8 @@ def test_contexted_kv_attention( v_cache = v_cache.view(-1, block_size, num_kv_heads, head_size).permute(0, 2, 3, 1).contiguous() - # Warm up the Triton kernel by calling it once before actually measuring generation time + # Warm up the Triton kernel by calling it once before actually measuring + # generation time context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table, b_start_loc, b_seq_len, b_ctx_len, max_input_len) torch.cuda.synchronize() diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 4e7cba1e68b08..838f56f7fd05c 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -28,7 +28,6 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: @pytest.mark.skip(reason="high likelihood sproadic failure in GHA") -@pytest.mark.flaky(reruns=2) def test_gemma_lora(gemma_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py new file mode 100644 index 0000000000000..95cf0cede8729 --- /dev/null +++ b/tests/lora/test_layer_variation.py @@ -0,0 +1,104 @@ +from typing import List, Optional +import peft +import pytest +from random import sample +import tempfile +from transformers import AutoModelForCausalLM + +import vllm +from vllm.lora.request import LoRARequest +from .conftest 
import cleanup + +MODEL_PATH = "Felladrin/Llama-68M-Chat-v1" +PROMPTS = [ + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", # noqa: E501 +] + + +def get_lora_model(model_id: str, target_modules: List[str], rank: int): + model = AutoModelForCausalLM.from_pretrained(model_id) + lora_config = peft.tuners.lora.LoraConfig(target_modules, rank) + lora_model = peft.PeftModel(model, lora_config) + return lora_model + + +def do_sample(llm, + lora_path: Optional[str] = None, + lora_id: Optional[int] = None, + logprobs: int = 0, + n_tokens: int = 256): + prompts = PROMPTS + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=n_tokens, + logprobs=logprobs, + stop=["[/assistant]"]) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + # Print the outputs. 
+ generated_texts = [] + generated_logprobs = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + generated_logprobs.append([ + list(logprob.keys()) for out in output.outputs + for logprob in out.logprobs + ]) + return generated_logprobs if logprobs else generated_texts + + +SUPPORTED_MODULES = [ + "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens", + "lm_head" +] +TARGET_MODULES_LIST = [] +for length in range(2, 6): + TARGET_MODULES_LIST.extend( + [sample(SUPPORTED_MODULES, length) for _ in range(3)]) + + +# Test the correctness when layer and rank are varied +# step 1: init a base model and serve with LoRA to get the reference results +# step 2: merge the same LoRA to the base model, serve the merged model +# step 3: compare the results from step 1 and step 2 +@pytest.mark.parametrize("tp_size", [1]) +@pytest.mark.parametrize("target_modules", TARGET_MODULES_LIST) +@pytest.mark.parametrize("rank", [8, 16, 32, 64]) +def test_layer_variation_correctness(tp_size, target_modules, rank): + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=tp_size, + worker_use_ray=True) + model = get_lora_model(MODEL_PATH, target_modules, rank) + with tempfile.TemporaryDirectory() as tmpdir: + model.save_pretrained(tmpdir) + merged_probs = do_sample(llm, tmpdir, 1, logprobs=5, n_tokens=32) + del llm + cleanup() + reference_id_sets = [set(prob[0]) for prob in merged_probs] + + model = get_lora_model(MODEL_PATH, target_modules, rank) + with tempfile.TemporaryDirectory() as tmpdir: + merged_model = model.merge_and_unload() + merged_model.save_pretrained(tmpdir) + llm = vllm.LLM(tmpdir, + tokenizer=MODEL_PATH, + enable_lora=False, + max_num_seqs=16, + tensor_parallel_size=tp_size, + worker_use_ray=True) + probs = do_sample(llm, logprobs=5, n_tokens=32) + del llm + cleanup() + # verify the top-5 tokens are identical for each token + id_sets = [set(prob[0]) for prob in probs] + assert id_sets == reference_id_sets diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 1e06274133dd6..675e9136ea823 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -17,14 +17,16 @@ LoRAMapping, BaseLayerWithLoRA, ) -from vllm.lora.models import LoRALayerWeights, convert_mapping, PackedLoRALayerWeights +from vllm.lora.models import (LoRALayerWeights, convert_mapping, + PackedLoRALayerWeights) from vllm.config import LoRAConfig from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, RowParallelLinear, QKVParallelLinear) -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) from vllm.model_executor.utils import set_random_seed from .utils import DummyLoRAManager @@ -260,7 +262,8 @@ def create_random_embedding_layer(): @torch.inference_mode() -# @pytest.mark.skip(reason="Fails when loras are in any slot other than the first.") +# @pytest.mark.skip( +# reason="Fails when loras are in any slot other than the first.") @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None: @@ -684,9 +687,9 @@ class FakeConfig: 
result = linear(input_)[0] subloras = sublora_dict[lora_id] for i, sublora in enumerate(subloras): - result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] * ( - i + 1 - )] += input_ @ sublora.lora_a @ sublora.lora_b * sublora.scaling + result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] * + (i + 1)] += (input_ @ sublora.lora_a @ sublora.lora_b * + sublora.scaling) expected_results.append(result) expected_result = torch.cat(expected_results) diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index dfaf8c700695a..130906c3d584d 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -10,12 +10,12 @@ def do_sample(llm, lora_path: str, lora_id: int): prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. 
[/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256, @@ -48,20 +48,20 @@ def test_llama_lora(sql_lora_files, tp_size): tensor_parallel_size=tp_size) expected_no_lora_output = [ - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", - "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
", - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501 + "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
", # noqa: E501 + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501 ] expected_lora_output = [ - " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", - " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", - " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", - " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", - " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", - " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 + " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 + " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 + " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 + " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 ] print("lora adapter created") @@ -121,7 +121,8 @@ def test_llama_tensor_parallel_equality(sql_lora_files): def test_llama_lora_warmup(sql_lora_files): - """Test that the LLM initialization works with a warmup LORA path and is more conservative""" + """Test that the LLM initialization works with a warmup LORA path and + is more conservative""" @ray.remote(num_gpus=1) def get_num_gpu_blocks_lora(): @@ -132,13 +133,15 @@ def get_num_gpu_blocks_lora(): @ray.remote(num_gpus=1) def get_num_gpu_blocks_no_lora(): llm = vllm.LLM(MODEL_PATH, max_num_seqs=16) - num_gpu_blocks_no_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks + num_gpu_blocks_no_lora_warmup = ( + llm.llm_engine.cache_config.num_gpu_blocks) return num_gpu_blocks_no_lora_warmup num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote()) num_gpu_blocks_no_lora_warmup = ray.get( get_num_gpu_blocks_no_lora.remote()) assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, ( - "The warmup with lora should be more" - " conservative than without lora, therefore the number of memory blocks for the KV cache should be " + "The warmup with lora should be more " + "conservative than without lora, therefore the number of " + "memory blocks for the KV cache should be " "less 
when using lora than when not using lora") diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 07779e1f5b889..47172853a1c66 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -9,9 +9,9 @@ def do_sample(llm, lora_path: str, lora_id: int): prompts = [ - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. 
This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. 
[/user] [assistant]", # noqa: E501 ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256) outputs = llm.generate( @@ -43,9 +43,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size): worker_use_ray=True) expected_lora_output = [ - "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", - "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", - "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", + "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501 + "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501 + "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", # noqa: E501 ] assert do_sample(llm, mixtral_lora_files, diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 410bdfa5c69e2..0ab9c63ce4377 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -21,7 +21,8 @@ def test_metric_counter_prompt_tokens( gpu_memory_utilization=0.4) tokenizer = vllm_model.model.get_tokenizer() prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts] - # This test needs at least 2 prompts in a batch of different lengths to verify their token count is correct despite padding. + # This test needs at least 2 prompts in a batch of different lengths to + # verify their token count is correct despite padding. assert len(example_prompts) > 1, "at least 2 prompts are required" assert prompt_token_counts[0] != prompt_token_counts[1], ( "prompts of different lengths are required") @@ -33,8 +34,8 @@ def test_metric_counter_prompt_tokens( **stat_logger.labels)._value.get() assert vllm_prompt_token_count == metric_count, ( - f"prompt token count: {vllm_prompt_token_count!r}\nmetric: {metric_count!r}" - ) + f"prompt token count: {vllm_prompt_token_count!r}\n" + f"metric: {metric_count!r}") @pytest.mark.parametrize("model", MODELS) @@ -60,9 +61,10 @@ def test_metric_counter_generation_tokens( for i in range(len(example_prompts)): vllm_output_ids, vllm_output_str = vllm_outputs[i] prompt_ids = tokenizer.encode(example_prompts[i]) - # vllm_output_ids contains both prompt tokens and generation tokens. We're interested only in the count of the generation tokens. + # vllm_output_ids contains both prompt tokens and generation tokens. + # We're interested only in the count of the generation tokens. 
vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) assert vllm_generation_count == metric_count, ( - f"generation token count: {vllm_generation_count!r}\nmetric: {metric_count!r}" - ) + f"generation token count: {vllm_generation_count!r}\n" + f"metric: {metric_count!r}") diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 7accb425cd666..bd9b75b1c1540 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -45,6 +45,7 @@ class ModelPair: ] +@pytest.mark.skip(reason="out of memory") @pytest.mark.flaky(reruns=2) @pytest.mark.skipif(marlin_not_supported, reason="Marlin is not supported on this GPU type.") @@ -66,7 +67,9 @@ def test_models( marlin_outputs = marlin_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - # Note: deleting just the model does not always free the GPU memory, not sure why. + # Note: not sure why, but deleting just the model on Ada Lovelace + # does not free the GPU memory. On Ampere, deleting just the model + # frees the memory. del marlin_model.model.llm_engine.driver_worker del marlin_model @@ -77,11 +80,14 @@ def test_models( max_tokens, num_logprobs) - # Note: deleting just the model does not always free the GPU memory, not sure why. + # Note: not sure why, but deleting just the model on Ada Lovelace + # does not free the GPU memory. On Ampere, deleting just the model + # frees the memory. del gptq_model.model.llm_engine.driver_worker del gptq_model # loop through the prompts + # use logprobs or else this will consistently run out of memory check_logprobs_close( outputs_0_lst=gptq_outputs, outputs_1_lst=marlin_outputs, diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 7ef8dde7bb8f6..c83551c36ef10 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -20,20 +20,23 @@ def test_block_allocator( num_blocks, enable_caching=True) - # Allocate two PysicalTokenBlocks with the same hash and check that they are the same PhysicalTokenBlock + # Allocate two PhysicalTokenBlocks with the same hash and check + # that they are the same PhysicalTokenBlock first_block = block_allocator.allocate(block_hash, 0) second_block = block_allocator.allocate(block_hash, 0) assert (first_block == second_block) assert (second_block.ref_count == 2) - # Free the first_block and confirm that the ref_count is correctly decremented on the second block + # Free the first_block and confirm that the ref_count is correctly + # decremented on the second block block_allocator.free(first_block) assert (second_block.ref_count == 1) # Free the second block block_allocator.free(second_block) - # Reallocate the first block and confirm that, even after the block had its ref_count go to 0, we still get the same block back + # Reallocate the first block and confirm that, even after the block + # had its ref_count go to 0, we still get the same block back first_block = block_allocator.allocate(block_hash, 0) assert (first_block == second_block) assert (first_block.block_hash == block_hash) @@ -56,7 +59,8 @@ def test_eviction(num_blocks: int, ): for block in blocks: block_allocator.free(block) - # Allocate a new block and confirm that it's the first block freed. I.E The Least Recently Used block + # Allocate a new block and confirm that it's the first block freed.
+ # i.e. the Least Recently Used block new_block_hash = block_size new_block = block_allocator.allocate(new_block_hash, 0) assert (new_block == blocks[0]) @@ -68,7+72,8 @@ def test_eviction(num_blocks: int, ): assert (realloc_block == blocks[realloc_block_hash]) assert (realloc_block.block_hash == realloc_block_hash) - # Allocate a new block and confirm that it's not the realloc_block, since the realloc_block shouldn't be in the free list + # Allocate a new block and confirm that it's not the realloc_block, + # since the realloc_block shouldn't be in the free list new_block_hash = block_size + 1 new_block = block_allocator.allocate(new_block_hash, 0) assert (realloc_block != new_block) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 1abb55f021214..14f1872c45258 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -70,8 +70,8 @@ def test_get_prompt_logprobs( hf_logprob[i][-1][token_id].item(), atol=1e-2, rtol=1e-2) - assert isinstance(sample_logprob.decoded_token, str), \ - ("The token should be decoded by the time it is returned " + assert isinstance(sample_logprob.decoded_token, str), ( + "The token should be decoded by the time it is returned " " to the user.") diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index eb384dd1a73c5..31c43454d47ac 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -257,9 +257,10 @@ def test_sampling(model_runner: ModelRunner): if metadata.sampling_params.use_beam_search: continue - if metadata.sampling_params.seed is not None \ - and expected_tokens[i] is None: - # Record seeded random result to compare with results of second invocation + if (metadata.sampling_params.seed is not None + and expected_tokens[i] is None): + # Record seeded random result to compare with results of + # second invocation expected_tokens[i] = [ nth_output.output_token for nth_output in sequence_output.samples @@ -267,11 +268,13 @@ def test_sampling(model_runner: ModelRunner): continue for n, nth_output in enumerate(sequence_output.samples): - if metadata.sampling_params.temperature == 0 or metadata.sampling_params.seed is not None: + if (metadata.sampling_params.temperature == 0 + or metadata.sampling_params.seed is not None): # Ensure exact matches for greedy or random with seed assert nth_output.output_token == expected_tokens[i][n] else: - # For non-seeded random check that one of the high-logit tokens were chosen + # For non-seeded random, check that one of the high-logit + # tokens was chosen assert nth_output.output_token in expected_tokens[i] # Test batch @@ -286,8 +289,8 @@ def test_sampling(model_runner: ModelRunner): input_tensor.data = input_tensor.index_select(0, target_index) fake_logits.data = fake_logits.index_select(0, target_index) - # This time, results of seeded random samples will be compared with the corresponding - # sample in the pre-shuffled batch + # This time, results of seeded random samples will be compared with + # the corresponding sample in the pre-shuffled batch test_sampling(model_runner) del model_runner diff --git a/vllm/model_executor/layers/triton_kernel/__init__.py b/tests/spec_decode/__init__.py similarity index 100% rename from vllm/model_executor/layers/triton_kernel/__init__.py rename to tests/spec_decode/__init__.py diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py new file mode 100644 index 0000000000000..fddc3995452cc --- /dev/null +++ 
b/tests/spec_decode/test_batch_expansion.py @@ -0,0 +1,95 @@ +import torch +import pytest + +from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer + +from .utils import mock_worker, create_seq_group_metadata_from_prompts + + +@pytest.mark.parametrize('num_target_seq_ids', [100]) +def test_create_target_seq_id_iterator(num_target_seq_ids: int): + """Verify all new sequence ids are greater than all input + seq ids. + """ + scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) + + all_seq_ids = [ + [1, 3, 5, 7], + list(range(100)) + [0], + [100], + ] + + for seq_ids in all_seq_ids: + max_seq_id = max(seq_ids) + iterator = scorer._create_target_seq_id_iterator(seq_ids) # pylint: disable=protected-access + for _ in range(num_target_seq_ids): + assert next(iterator) > max_seq_id + + +@pytest.mark.parametrize('k', [1, 2, 6]) +def test_get_token_ids_to_score(k: int): + """Verify correct tokens are selected for scoring. + """ + proposal_token_ids = torch.tensor( + list(range(k)), + dtype=torch.int64, + device='cuda', + ) + + expected_output = [ + [], + ] + for i in range(proposal_token_ids.shape[0]): + expected_output.append(proposal_token_ids[:i + 1].tolist()) + + scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) + actual_output = scorer._get_token_ids_to_score(proposal_token_ids) # pylint: disable=protected-access + + actual_output = [ + x.tolist() if isinstance(x, torch.Tensor) else x for x in actual_output + ] + + assert actual_output == expected_output + + +@pytest.mark.parametrize('k', [1, 2, 6]) +def test_create_single_target_seq_group_metadata(k: int): + """Verify correct creation of a batch-expanded seq group metadata. + """ + + prompt_tokens = [1, 2, 3] + prev_output_tokens = [4, 5, 6] + + token_ids = list(range(k)) + + num_tokens_processed = len(prompt_tokens) + len(prev_output_tokens) - 1 + + final_seq_len = len(prompt_tokens) + len(prev_output_tokens) + len( + token_ids) + + block_size = 32 + input_seq_group_metadata = create_seq_group_metadata_from_prompts( + [prompt_tokens], 2048 // block_size, block_size, [final_seq_len], + [prev_output_tokens], [num_tokens_processed])[0] + + input_seq_id = list(input_seq_group_metadata.seq_data.keys())[0] + target_seq_id = 100 + + scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) + output = scorer._create_single_target_seq_group_metadata( # pylint: disable=protected-access + input_seq_group_metadata, + input_seq_id, + target_seq_id, + token_ids, + ) + + assert output.request_id == input_seq_group_metadata.request_id + assert len(output.seq_data) == 1 + assert output.seq_data[target_seq_id].get_prompt_token_ids( + ) == prompt_tokens + assert output.seq_data[target_seq_id].get_output_token_ids( + ) == prev_output_tokens + token_ids + + assert len(output.block_tables) == 1 + assert output.block_tables[ + target_seq_id] == input_seq_group_metadata.block_tables[input_seq_id] diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py new file mode 100644 index 0000000000000..09847136d13e9 --- /dev/null +++ b/tests/spec_decode/test_metrics.py @@ -0,0 +1,159 @@ +import torch +import math +import pytest + +from unittest.mock import MagicMock + +from vllm.spec_decode.metrics import AsyncMetricsCollector + + +def test_initial_call_returns_none(): + """Expect first call to get metrics to return None. 
+ """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collector = AsyncMetricsCollector(rej_sampler) + collector.init_gpu_tensors(rank=0) + maybe_metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert maybe_metrics is None + + +def test_second_call_returns_metrics(): + """Expect second call to not return None. + """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collect_interval_s = 5.0 + timer = MagicMock() + timer.side_effect = [ + 0.0, collect_interval_s + 0.1, collect_interval_s + 0.2 + ] + + collector = AsyncMetricsCollector(rejection_sampler=rej_sampler, + timer=timer, + collect_interval_s=collect_interval_s) + collector.init_gpu_tensors(rank=0) + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is not None + + +@pytest.mark.parametrize("rank", [1, 2, 3, 4]) +def test_nonzero_rank_noop(rank): + """Verify nonzero ranks don't collect metrics. + """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collector = AsyncMetricsCollector(rej_sampler) + collector.init_gpu_tensors(rank=rank) + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is None + + +def test_noop_until_time(): + """Verify metrics aren't collected until enough time passes. + """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collect_interval_s = 5.0 + timer = MagicMock() + timer.side_effect = [ + 0.0, collect_interval_s - 0.1, collect_interval_s - 0.1, + collect_interval_s + 0.1, collect_interval_s + 0.1 + ] + + collector = AsyncMetricsCollector(rejection_sampler=rej_sampler, + timer=timer, + collect_interval_s=collect_interval_s) + collector.init_gpu_tensors(rank=0) + + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is None + + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is not None + + +@pytest.mark.parametrize("has_data", [True, False]) +def test_initial_metrics_has_correct_values(has_data: bool): + """Test correctness of metrics data. 
+ """ + if has_data: + num_accepted_tokens = 103 + num_emitted_tokens = 104 + num_draft_tokens = 105 + else: + num_accepted_tokens = 0 + num_emitted_tokens = 0 + num_draft_tokens = 0 + k = 5 + + num_possible_tokens = AsyncMetricsCollector.get_max_num_accepted_tokens( + num_draft_tokens, k) + + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(num_accepted_tokens, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(num_emitted_tokens, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = num_draft_tokens + + collect_interval_s = 5.0 + timer = MagicMock() + timer.side_effect = [ + 0.0, collect_interval_s + 0.1, collect_interval_s + 0.2 + ] + + collector = AsyncMetricsCollector(rejection_sampler=rej_sampler, + timer=timer, + collect_interval_s=collect_interval_s) + collector.init_gpu_tensors(rank=0) + _ = collector.maybe_collect_rejsample_metrics(k) + metrics = collector.maybe_collect_rejsample_metrics(k) + + assert metrics.num_spec_tokens == k + assert metrics.accepted_tokens == num_accepted_tokens + assert metrics.draft_tokens == num_draft_tokens + assert metrics.emitted_tokens == num_emitted_tokens + + if has_data: + assert (metrics.draft_acceptance_rate == num_accepted_tokens / + num_draft_tokens) + assert (metrics.system_efficiency == num_emitted_tokens / + num_possible_tokens) + else: + assert math.isnan(metrics.draft_acceptance_rate) + assert math.isnan(metrics.system_efficiency) diff --git a/tests/worker/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py similarity index 61% rename from tests/worker/spec_decode/test_multi_step_worker.py rename to tests/spec_decode/test_multi_step_worker.py index ea54802903578..45b43ec59ee8f 100644 --- a/tests/worker/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -3,14 +3,16 @@ import pytest from unittest.mock import MagicMock -from vllm.worker.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.multi_step_worker import (MultiStepWorker, + DraftModelTop1Proposer) from vllm.worker.worker import Worker from vllm.model_executor.utils import set_random_seed +from vllm.sequence import SamplerOutput from .utils import (create_execute_model_data, create_worker, create_seq_group_metadata_from_prompts, zero_kv_cache, patch_execute_model_with_seeds, - assert_logprobs_dict_allclose) + assert_logprobs_dict_allclose, create_batch) @pytest.mark.parametrize('num_steps', list(range(1, 17))) @@ -259,3 +261,160 @@ def test_same_output_for_multi_step(): multi_step_output_logprobs, single_step_output_logprobs): assert_logprobs_dict_allclose(multi_step_logprobs, single_step_logprobs) + + +@torch.inference_mode() +def test_draft_proposals_full_speculation_len(): + """Verify DraftModelTop1Proposer correctly handles case where all sequences + can speculate. 
+ """ + k = 10 + batch_size = 32 + vocab_size = 32_000 + device = 'cuda:0' + + draft_worker = MagicMock() + proposer = DraftModelTop1Proposer( + draft_worker=draft_worker, + device=device, + max_model_len=2048, + vocab_size=vocab_size, + ) + draft_worker.execute_model_multi_step.return_value = [ + SamplerOutput( + outputs=[], + sampled_token_probs=torch.rand(batch_size, + vocab_size, + device=device, + dtype=torch.float32), + sampled_token_ids=torch.randint(low=0, + high=vocab_size, + size=(batch_size, ), + device=device, + dtype=torch.long), + ) for _ in range(k) + ] + + execute_model_data, _, _ = create_batch(batch_size, k) + + proposals = proposer.get_proposals( + **execute_model_data.to_dict(), + max_proposal_len=k, + ) + + assert torch.is_tensor(proposals.proposal_token_ids) + assert torch.is_tensor(proposals.proposal_probs) + + assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k]) + assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k]) + + assert proposals.proposal_lens.shape == torch.Size([batch_size]) + assert proposals.proposal_lens.tolist() == [k for _ in range(batch_size)] + + +@torch.inference_mode() +def test_draft_proposals_no_speculations(): + """Verify DraftModelTop1Proposer correctly handles case where no sequences + can speculate. + """ + k = 10 + batch_size = 32 + vocab_size = 32_000 + device = 'cuda:0' + prompt_len = 10 + + draft_worker = MagicMock() + proposer = DraftModelTop1Proposer( + draft_worker=draft_worker, + device=device, + max_model_len=prompt_len + k - 1, + vocab_size=vocab_size, + ) + + execute_model_data, _, _ = create_batch(batch_size, + k, + prompt_len=prompt_len) + + proposals = proposer.get_proposals( + **execute_model_data.to_dict(), + max_proposal_len=k, + ) + + assert torch.is_tensor(proposals.proposal_token_ids) + assert torch.is_tensor(proposals.proposal_probs) + + assert proposals.proposal_token_ids.shape == torch.Size([0, k]) + assert proposals.proposal_probs.shape[:-1] == torch.Size([0, k]) + + assert proposals.proposal_lens.shape == torch.Size([batch_size]) + assert proposals.proposal_lens.tolist() == [0 for _ in range(batch_size)] + + +@torch.inference_mode() +def test_draft_proposals_mixed_k(): + """Verify DraftModelTop1Proposer correctly handles case some sequences can + speculate and some can't. 
+ """ + k = 10 + batch_size = 32 + vocab_size = 32_000 + device = 'cuda:0' + + small_prompt_len = 5 + long_prompt_len = 10 + prev_output_token_len = 20 + + expected_num_proposal_seqs = 6 + expected_num_no_proposal_seqs = batch_size - expected_num_proposal_seqs + + prompt_len = [ + small_prompt_len for _ in range(expected_num_proposal_seqs - 1) + ] + [long_prompt_len + for _ in range(expected_num_no_proposal_seqs)] + [small_prompt_len] + + draft_worker = MagicMock() + proposer = DraftModelTop1Proposer( + draft_worker=draft_worker, + device=device, + max_model_len=long_prompt_len + prev_output_token_len + k - 1, + vocab_size=vocab_size, + ) + + draft_worker.execute_model_multi_step.return_value = [ + SamplerOutput( + outputs=[], + sampled_token_probs=torch.rand(expected_num_proposal_seqs, + vocab_size, + device=device, + dtype=torch.float32), + sampled_token_ids=torch.randint( + low=0, + high=vocab_size, + size=(expected_num_proposal_seqs, ), + device=device, + dtype=torch.long), + ) for _ in range(k) + ] + + execute_model_data, _, _ = create_batch( + batch_size, + k, + prompt_len=prompt_len, + prev_output_token_len=prev_output_token_len, + ) + + proposals = proposer.get_proposals( + **execute_model_data.to_dict(), + max_proposal_len=k, + ) + + assert torch.is_tensor(proposals.proposal_token_ids) + assert torch.is_tensor(proposals.proposal_probs) + + assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k]) + assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k]) + + assert proposals.proposal_lens.shape == torch.Size([batch_size]) + assert proposals.proposal_lens.tolist() == [ + k for _ in range(expected_num_proposal_seqs - 1) + ] + [0 for _ in range(expected_num_no_proposal_seqs)] + [k] diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py new file mode 100644 index 0000000000000..bfc69e01e3eb9 --- /dev/null +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -0,0 +1,597 @@ +import torch +import random +import pytest +from unittest.mock import MagicMock + +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, + split_num_cache_blocks_evenly) +from vllm.spec_decode.interfaces import SpeculativeProposals +from vllm.model_executor.utils import set_random_seed +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from .utils import (mock_worker, create_batch, ExecuteModelData, + create_sampler_output_list) +from vllm.spec_decode.metrics import (SpecDecodeWorkerMetrics, + AsyncMetricsCollector) + + +@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_calls_draft_model(k: int, batch_size: int): + """Verify SpecDecodeWorker calls the draft worker with correct + inputs. Everything else is mocked out. 
+ """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + exception_secret = 'artificial stop' + draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret) + + execute_model_data, _, _ = create_batch(batch_size, k) + + with pytest.raises(ValueError, match=exception_secret): + worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + + call_args_list = draft_worker.get_spec_proposals.call_args_list + assert len(call_args_list) == 1 + + for args, _ in call_args_list: + (seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, + blocks_to_copy, actual_k) = args + actual_execute_model_data = ExecuteModelData(seq_group_metadata_list, + blocks_to_swap_in, + blocks_to_swap_out, + blocks_to_copy) + assert actual_execute_model_data == execute_model_data + assert actual_k == k + + +@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_calls_target_model(k: int, batch_size: int): + """Verify SpecDecodeWorker calls the target model with correct + inputs. Everything else is mocked out. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + vocab_size = 32_000 + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, prompts, prev_output_tokens = create_batch( + batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + exception_secret = 'artificial stop' + target_worker.execute_model.side_effect = ValueError(exception_secret) + + with pytest.raises(ValueError, match=exception_secret): + worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + + seen_contexts = [] + + call_args_list = target_worker.execute_model.call_args_list + assert len(call_args_list) == 1 + for args, kwargs in call_args_list: + target_execute_model_data = ExecuteModelData.from_dict(kwargs) + + assert len(target_execute_model_data.seq_group_metadata_list) == ( + k + 1) * batch_size + for seq_group_metadata in ( + target_execute_model_data.seq_group_metadata_list): + for seq_data in seq_group_metadata.seq_data.values(): + seen_contexts.append(seq_data.get_token_ids()) + + expected_seen_contexts = [] + + for prompt, prev_generated, draft_tokens in zip( + prompts, prev_output_tokens, proposal_token_ids.tolist()): + + for i in range(len(draft_tokens) + 1): + expected_seen_contexts.append(prompt + prev_generated + + draft_tokens[:i]) + + seen_contexts.sort() + expected_seen_contexts.sort() + assert expected_seen_contexts == seen_contexts + + 
+@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_calls_rejection_sampler(k: int, batch_size: int): + """Verify SpecDecodeWorker calls the rejection sampler with + correct inputs. Everything else is mocked out. + """ + vocab_size = 32_000 + + draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) + target_worker = mock_worker(vocab_size=vocab_size) + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, _, _ = create_batch(batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + target_token_ids = torch.randint(low=0, + high=vocab_size, + size=(1, batch_size * (k + 1)), + dtype=torch.int64, + device='cuda') + target_token_probs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_output = create_sampler_output_list(target_token_ids, + target_token_probs) + + target_worker.execute_model.return_value = target_output[0] + + exception_secret = 'artificial stop' + rejection_sampler.side_effect = ValueError(exception_secret) + + with pytest.raises(ValueError, match=exception_secret): + worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + + assert len(rejection_sampler.call_args_list) == 1 + args, _ = rejection_sampler.call_args_list[0] + (actual_proposal_scores, actual_bonus_token_ids, actual_proposal_probs, + actual_proposal_token_ids) = args + + assert torch.equal(actual_bonus_token_ids, + target_token_ids.reshape(batch_size, k + 1)[:, -1:]) + assert torch.equal( + actual_proposal_scores, + target_token_probs.reshape(batch_size, k + 1, -1)[:, :-1]) + assert torch.equal(actual_proposal_token_ids, proposal_token_ids) + assert torch.equal(actual_proposal_probs, proposal_probs) + + +@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_formats_output(k: int, batch_size: int): + """Verify SpecDecodeWorker formats sampler output correctly. + Everything else is mocked out. 
+ """ + vocab_size = 32_000 + + draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) + target_worker = mock_worker(vocab_size=vocab_size) + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, _, _ = create_batch(batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + target_token_ids = torch.randint(low=0, + high=vocab_size, + size=(1, batch_size * (k + 1)), + dtype=torch.int64, + device='cuda') + target_token_probs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_output = create_sampler_output_list(target_token_ids, + target_token_probs) + + target_worker.execute_model.return_value = target_output[0] + + rejection_sampler_output = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k + 1), + dtype=torch.int64, + device='cuda') + for i in range(batch_size): + minimum_accepted_tokens = 1 + rejection_sampler_output[i][ + -random.randint(minimum_accepted_tokens, k + 1):] = -1 + + rejection_sampler.return_value = rejection_sampler_output + + output = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + + expected_output = create_sampler_output_list( + rejection_sampler_output.transpose(0, 1), [None for _ in range(k + 1)]) + + seq_ids = [ + next(iter(seq_group_metadata.seq_data.keys())) + for seq_group_metadata in execute_model_data.seq_group_metadata_list + ] + actual_output_by_seq = {seq_id: [] for seq_id in seq_ids} + expected_output_by_seq = {seq_id: [] for seq_id in seq_ids} + + for step in output: + for seq_group in step: + for sample in seq_group.samples: + seq_id = sample.parent_seq_id + actual_output_by_seq[seq_id].append(sample) + + for step in expected_output: + for seq_group in step: + for sample in seq_group.samples: + seq_id = sample.parent_seq_id + expected_output_by_seq[seq_id].append(sample) + + all_seen_seq_ids = set( + list(actual_output_by_seq.keys()) + + list(expected_output_by_seq.keys())) + for seq_id in all_seen_seq_ids: + actual_by_step = actual_output_by_seq[seq_id] + expected_by_step = expected_output_by_seq[seq_id] + + for i in range(k + 1): + if i >= len(actual_by_step): + assert expected_by_step[i].output_token == -1 + continue + assert actual_by_step[i].output_token == expected_by_step[ + i].output_token + assert actual_by_step[i].logprobs == expected_by_step[i].logprobs + + +@pytest.mark.parametrize('k', [1, 2]) +@pytest.mark.parametrize('batch_size', [1]) +@pytest.mark.parametrize('returns_metrics', [True, False]) +@torch.inference_mode() +def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): + """Verify SpecDecodeWorker collects metrics. 
+ """ + vocab_size = 32_000 + + draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) + target_worker = mock_worker(vocab_size=vocab_size) + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, _, _ = create_batch(batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + target_token_ids = torch.randint(low=0, + high=vocab_size, + size=(1, batch_size * (k + 1)), + dtype=torch.int64, + device='cuda') + target_token_probs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_output = create_sampler_output_list(target_token_ids, + target_token_probs) + + target_worker.execute_model.return_value = target_output[0] + + rejection_sampler_output = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k + 1), + dtype=torch.int64, + device='cuda') + for i in range(batch_size): + minimum_accepted_tokens = 1 + rejection_sampler_output[i][ + -random.randint(minimum_accepted_tokens, k + 1):] = -1 + + rejection_sampler.return_value = rejection_sampler_output + + mock_rejsample_metrics = MagicMock( + spec=SpecDecodeWorkerMetrics) if returns_metrics else None + metrics_collector.maybe_collect_rejsample_metrics.return_value = ( + mock_rejsample_metrics) + + output = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + assert output[0].spec_decode_worker_metrics == mock_rejsample_metrics + + call_args_list = ( + metrics_collector.maybe_collect_rejsample_metrics.call_args_list) + assert len(call_args_list) == 1 + args, kwargs = call_args_list[0] + assert args[0] == k or kwargs.get('k', -1) == k + + +@pytest.mark.parametrize('k', [0]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_k_equals_zero(k: int, batch_size: int): + """Verify that the SpecDecodeWorker calls the draft and target workers + when k is zero. This happens during prefill. 
+ """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + execute_model_data, prompts, prev_output_tokens = create_batch( + batch_size, k, prev_output_token_len=0) + + out = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + + assert len(out) == 1, f"expected only one token output when {k=}" + assert out[0].probs is None, "expect gpu tensor references to be None" + assert out[ + 0].sampled_tokens is None, "expect gpu tensor references to be None" + + draft_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict(), return_python_output=False) + target_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict()) + + +@pytest.mark.parametrize('k', [0, 5]) +@pytest.mark.parametrize('batch_size', [0]) +@torch.inference_mode() +def test_empty_input_batch(k: int, batch_size: int): + """Verify that the SpecDecodeWorker calls the draft and target workers + when the input batch is empty. This can happen if the engine communicates + to the workers information without scheduling a batch. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + execute_model_data, prompts, prev_output_tokens = create_batch( + batch_size, k, prev_output_token_len=0) + + out = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + + assert len(out) == 1, f"expected only one token output when {k=}" + assert out[0].probs is None, "expect gpu tensor references to be None" + assert out[ + 0].sampled_tokens is None, "expect gpu tensor references to be None" + + draft_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict(), return_python_output=False) + target_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict()) + + +@torch.inference_mode() +def test_init_model(): + """Verify SpecDecodeWorker invokes proposer/scorer worker init_model, as + well as other GPU initialization. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + worker.init_model() + + draft_worker.init_model.assert_called_once() + + target_worker.init_model.assert_called_once() + + metrics_collector.init_gpu_tensors.assert_called_once() + rejection_sampler.init_gpu_tensors.assert_called_once() + + +@torch.inference_mode() +def test_init_cache_engine(): + """Verify SpecDecodeWorker invokes init_cache_engine on proposer/scorer + workers. 
+ """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + cache_config = MagicMock() + + worker.init_cache_engine(cache_config) + + draft_worker.init_cache_engine.assert_called_once_with(cache_config) + target_worker.init_cache_engine.assert_called_once_with(cache_config) + + +@pytest.mark.parametrize('available_gpu_blocks', [1, 1024]) +@pytest.mark.parametrize('available_cpu_blocks', [500]) +@pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096]) +@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) +@torch.inference_mode() +def test_profile_num_available_blocks(available_gpu_blocks: int, + available_cpu_blocks: int, + target_cache_block_size_bytes: int, + draft_kv_size_bytes: int): + """Verify SpecDecodeWorker correctly profiles num available GPU blocks. + Specifically, it should run profiling in the scorer worker, and then evenly + split the blocks between proposer and scorer worker. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + target_worker.profile_num_available_blocks.return_value = ( + available_gpu_blocks, available_cpu_blocks) + target_worker.get_cache_block_size_bytes.return_value = ( + target_cache_block_size_bytes) + draft_worker.get_cache_block_size_bytes.return_value = draft_kv_size_bytes + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + # These values do not directly impact the adjusted block size calculation, + # so they can be fixed. + gpu_memory_utilization = 0.9 + cpu_swap_space = 100 + block_size = 16 + + num_gpu_blocks, num_cpu_blocks = worker.profile_num_available_blocks( + block_size, gpu_memory_utilization, cpu_swap_space, cache_dtype="auto") + + target_worker.profile_num_available_blocks.assert_called_once_with( + block_size, gpu_memory_utilization, cpu_swap_space, "auto") + assert num_cpu_blocks == available_cpu_blocks + + assert num_gpu_blocks == split_num_cache_blocks_evenly( + target_cache_block_size_bytes, draft_kv_size_bytes, + available_gpu_blocks) + + +@pytest.mark.parametrize('available_gpu_blocks', + list(range(20)) + [1024, 1024**2]) +@pytest.mark.parametrize('target_cache_block_size_bytes', + [2 * 2 * 4096, 2 * 2 * 8192]) +@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) +@torch.inference_mode() +def test_split_num_cache_blocks_evenly(available_gpu_blocks: int, + target_cache_block_size_bytes: int, + draft_kv_size_bytes: int): + """Verify split_num_cache_blocks_evenly does not exceed original memory + allocation in bytes. 
+ """ + num_blocks = split_num_cache_blocks_evenly(target_cache_block_size_bytes, + draft_kv_size_bytes, + available_gpu_blocks) + assert (num_blocks * target_cache_block_size_bytes) + ( + num_blocks * draft_kv_size_bytes) <= (available_gpu_blocks * + target_cache_block_size_bytes) diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py new file mode 100644 index 0000000000000..19833ddb06154 --- /dev/null +++ b/tests/spec_decode/test_utils.py @@ -0,0 +1,111 @@ +from vllm.spec_decode.util import get_all_seq_ids +from vllm.sequence import SequenceGroupMetadata +from vllm.spec_decode.util import split_batch_by_proposal_len + +import pytest +from unittest.mock import MagicMock + + +def test_get_all_seq_ids(): + """Verify get_all_seq_ids extracts all seq ids. + """ + expected_seq_ids = list(range(10)) + list(range(100, 110)) + + seq_group_metadata_list = [ + SequenceGroupMetadata( + request_id=str(seq_id), + is_prompt=True, + seq_data={ + seq_id: MagicMock(), + }, + sampling_params=MagicMock(), + block_tables={ + seq_id: MagicMock(), + }, + lora_request=None, + ) for seq_id in expected_seq_ids + ] + + actual_seq_ids = get_all_seq_ids(seq_group_metadata_list) + assert actual_seq_ids == expected_seq_ids + + +@pytest.fixture +def fake_sequence_group_metadata(): + seq_ids = list(range(3)) + return [ + SequenceGroupMetadata( + request_id=str(i), + is_prompt=True, + seq_data={ + i: MagicMock(), + }, + sampling_params=MagicMock(), + block_tables={ + i: MagicMock(), + }, + lora_request=None, + ) for i in seq_ids + ] + + +def test_filter_zero_length_proposals(fake_sequence_group_metadata): + proposal_lens = [0, 1, 0] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=True) + + expected_groups = [ + fake_sequence_group_metadata[0], fake_sequence_group_metadata[2] + ] + expected_indices = [0, 2] + + assert filtered_groups == expected_groups + assert indices == expected_indices + + +def test_filter_non_zero_length_proposals(fake_sequence_group_metadata): + proposal_lens = [0, 1, 2] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=False) + + expected_groups = [ + fake_sequence_group_metadata[1], fake_sequence_group_metadata[2] + ] + expected_indices = [1, 2] + + assert filtered_groups == expected_groups + assert indices == expected_indices + + +def test_empty_inputs(): + filtered_groups, indices = split_batch_by_proposal_len( + [], [], select_proposal_len_zero=True) + + assert filtered_groups == [] + assert indices == [] + + +def test_all_zero_with_non_zero_filter(fake_sequence_group_metadata): + proposal_lens = [0, 0, 0] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=False) + + assert filtered_groups == [] + assert indices == [] + + +def test_all_non_zero_with_zero_filter(fake_sequence_group_metadata): + proposal_lens = [1, 1, 1] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=True) + + assert filtered_groups == [] + assert indices == [] diff --git a/tests/worker/spec_decode/utils.py b/tests/spec_decode/utils.py similarity index 60% rename from tests/worker/spec_decode/utils.py rename to tests/spec_decode/utils.py index fa8767cf898aa..997093988c0eb 100644 --- a/tests/worker/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -1,13 +1,16 @@ 
import torch -from typing import List, Optional, Dict +from typing import List, Optional, Dict, Iterable, Union +from unittest.mock import MagicMock from vllm.worker.worker import Worker from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.engine.arg_utils import EngineArgs -from vllm.sequence import Logprob, SequenceGroupMetadata, SequenceData +from vllm.sequence import (Logprob, SequenceGroupMetadata, SequenceData, + SamplerOutput, SequenceGroupOutput, SequenceOutput) from vllm.sampling_params import SamplingParams from vllm.worker.cache_engine import CacheEngine from vllm.model_executor.utils import set_random_seed +from itertools import count from dataclasses import dataclass, fields @@ -24,6 +27,11 @@ def to_dict(self): return dict( (field.name, getattr(self, field.name)) for field in fields(self)) + @classmethod + def from_dict(cls, d): + cleaned = dict((field.name, d[field.name]) for field in fields(cls)) + return cls(**cleaned) + def round_up_to_next_block(seq_len: int, block_size: int) -> int: return (seq_len + block_size - 1) // block_size @@ -50,6 +58,21 @@ def create_execute_model_data( ) +def mock_worker(cls=None, + vocab_size: int = 30_000, + max_model_len: int = 2048, + rank: int = 0) -> MagicMock: + if cls is None: + cls = Worker + + worker = MagicMock(spec=cls) + worker.vocab_size = vocab_size + worker.max_model_len = max_model_len + worker.rank = rank + worker.device = 'cuda:0' + return worker + + def patch_execute_model_with_seeds(worker: Worker, rand_seeds: List[int]): seed_iter = iter(rand_seeds) original_execute_model = worker.execute_model @@ -117,25 +140,12 @@ def create_seq_group_metadata_from_prompts( block_size: int, final_seq_lens: List[int], continuations: Optional[List[List[int]]] = None, - num_tokens_processed: Optional[List[int]] = None, seq_ids: Optional[List[int]] = None, ) -> List[SequenceGroupMetadata]: if continuations is None: continuations = [[] for _ in prompts] - if num_tokens_processed is None: - # Default to 1 token missing from kv cache for generation sequences. - num_tokens_processed = [] - for continuation, prompt in zip(continuations, prompts): - # If prefill, then default to zero tokens processed. - if not continuation: - num_tokens_processed.append(0) - else: - # If generation, then default to all but one tokens processed. 
- num_tokens_processed.append( - len(continuation) + len(prompt) - 1) - if seq_ids is None: seq_ids = list(i for i, _ in enumerate(prompts)) @@ -155,13 +165,15 @@ def create_seq_group_metadata_from_prompts( is_prompt=len(cont_token_ids) == 0, seq_data={ i: - SequenceData(prompt_token_ids=prompt_token_ids[:] + - cont_token_ids[:]) + SequenceData( + prompt_token_ids=prompt_token_ids[:], + output_token_ids=cont_token_ids[:], + ), }, sampling_params=SamplingParams(temperature=0.0, ), block_tables={i: block_allocations[i][:]}, - ) for i, (prompt_token_ids, cont_token_ids, num_tokens_saved) in - enumerate(zip(prompts, continuations, num_tokens_processed)) + ) for i, (prompt_token_ids, + cont_token_ids) in enumerate(zip(prompts, continuations)) ] @@ -178,3 +190,68 @@ def assert_logprobs_dict_allclose( expected = torch.tensor( single_step_expected_logprobs[token_id].logprob) assert torch.allclose(actual, expected) + + +def create_sampler_output_list( + token_ids: torch.Tensor, + probs: Iterable[Optional[torch.Tensor]], + seq_ids: Optional[List[int]] = None) -> List[SamplerOutput]: + num_steps, batch_size = token_ids.shape + token_ids_by_step = token_ids.tolist() + + if seq_ids is None: + seq_ids = list(range(batch_size)) + + return [ + SamplerOutput(outputs=[ + SequenceGroupOutput( + samples=[ + SequenceOutput( + output_token=token_id, + parent_seq_id=seq_ids[seq_index], + logprobs={token_id: 0}, + ) + ], + prompt_logprobs=None, + ) for seq_index, token_id in enumerate(token_ids_by_step[step]) + ], + sampled_token_probs=probs[step], + sampled_token_ids=token_ids[step]) + for step in range(num_steps) + ] + + +def create_batch(batch_size, + k, + prompt_len: Union[int, List[int]] = 10, + prev_output_token_len: int = 10, + seq_ids: Optional[List[int]] = None, + num_gpu_blocks: Optional[int] = None, + block_size: Optional[int] = None): + if block_size is None: + block_size = 8 + + if num_gpu_blocks is None: + num_gpu_blocks = 2048 // block_size + + iterator = count() + + if isinstance(prompt_len, int): + prompt_lens = [prompt_len for _ in range(batch_size)] + else: + prompt_lens = prompt_len + + prompts = [[next(iterator) for _ in range(p_len)] for p_len in prompt_lens] + prev_output_tokens = [[ + next(iterator) for _ in range(prev_output_token_len) + ] for _ in range(batch_size)] + final_seq_lens = [ + len(prompt) + len(prev_output_token) + k + 1 + for prompt, prev_output_token in zip(prompts, prev_output_tokens) + ] + + execute_model_data = create_execute_model_data( + create_seq_group_metadata_from_prompts(prompts, num_gpu_blocks, + block_size, final_seq_lens, + prev_output_tokens, seq_ids), ) + return execute_model_data, prompts, prev_output_tokens diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index 7c4ade7f8c8ed..fb541f38f3489 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -2,8 +2,11 @@ Run `pytest tests/test_cache_block_hashing.py`. 
""" +from typing import List, Optional + import pytest +from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer import TokenizerGroup from vllm.sequence import Sequence @@ -36,7 +39,10 @@ def flatten_2d(li): @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("max_num_seqs", [256]) -def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): +@pytest.mark.parametrize("concurrent_lora_int_ids", + [[None], [1], [None, 1], [None, 1, 2], [1, 2]]) +def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, + concurrent_lora_int_ids: List[Optional[int]]): tokenizer = TokenizerGroup( tokenizer_id="facebook/opt-125m", @@ -48,19 +54,30 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): hashes = [] for prefix in prefixes: - hashes.append([]) - prompts = [prefix + prompt for prompt in sample_prompts] - seq_id = 0 - for prompt in prompts: - hashes[-1].append([]) - prompt_token_ids = tokenizer.encode(prompt) - seq = Sequence(seq_id, prompt, prompt_token_ids, block_size) - - num_blocks = len(prompt_token_ids) // block_size - for idx in range(num_blocks): - hashes[-1][-1].append(seq.hash_of_block(idx)) - - seq_id += 1 + for lora_int_id in concurrent_lora_int_ids: + lora_request = None + + if lora_int_id is not None: + lora_request = LoRARequest( + f"example_lora_{lora_int_id}", + lora_int_id, + f"example/path/to/lora_{lora_int_id}", + ) + + hashes.append([]) + prompts = [prefix + prompt for prompt in sample_prompts] + seq_id = 0 + for prompt in prompts: + hashes[-1].append([]) + prompt_token_ids = tokenizer.encode(prompt) + seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, + tokenizer.tokenizer.eos_token_id, lora_request) + + num_blocks = len(prompt_token_ids) // block_size + for idx in range(num_blocks): + hashes[-1][-1].append(seq.hash_of_block(idx)) + + seq_id += 1 # Check that hashes made with two prefixes with different first blocks are # different everywhere. 
diff --git a/tests/test_sequence.py b/tests/test_sequence.py new file mode 100644 index 0000000000000..e18df059d770f --- /dev/null +++ b/tests/test_sequence.py @@ -0,0 +1,50 @@ +import pytest + +from vllm.sequence import SequenceGroupOutput, SamplerOutput, SequenceOutput + + +@pytest.fixture +def sample_outputs(): + return [ + SequenceGroupOutput(samples=[ + SequenceOutput(parent_seq_id=0, output_token=i, logprobs={}) + ], + prompt_logprobs=None) for i in range(5) + ] + + +@pytest.fixture +def sampler_output(sample_outputs): + return SamplerOutput(outputs=sample_outputs) + + +def test_sampler_output_initialization(sampler_output, sample_outputs): + assert len(sampler_output) == len(sample_outputs) + assert sampler_output.sampled_token_probs is None + assert sampler_output.sampled_token_ids is None + assert sampler_output.spec_decode_worker_metrics is None + + +def test_sampler_output_getitem(sampler_output, sample_outputs): + assert sampler_output[2] == sample_outputs[2] + + +def test_sampler_output_setitem(sampler_output): + new_output = SequenceGroupOutput(samples=[ + SequenceOutput(parent_seq_id=0, output_token=99, logprobs={}) + ], + prompt_logprobs=None) + sampler_output[2] = new_output + assert sampler_output[2] == new_output + + +def test_sampler_output_len(sampler_output, sample_outputs): + assert len(sampler_output) == len(sample_outputs) + + +def test_sampler_output_eq(sample_outputs): + sampler_output1 = SamplerOutput(outputs=sample_outputs) + sampler_output2 = SamplerOutput(outputs=sample_outputs.copy()) + sampler_output3 = SamplerOutput(outputs=sample_outputs[:-1]) + assert sampler_output1 == sampler_output2 + assert sampler_output1 != sampler_output3 diff --git a/vllm/config.py b/vllm/config.py index 0f3c064a90386..c8e261afc8377 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -219,8 +219,8 @@ def _verify_quantization(self) -> None: if is_hip( ) and self.quantization in rocm_not_supported_quantization: raise ValueError( - f"{self.quantization} quantization is currently not supported " - f"in ROCm.") + f"{self.quantization} quantization is currently not " + f"supported in ROCm.") if self.quantization != "marlin": logger.warning( f"{self.quantization} quantization is not fully " @@ -351,7 +351,8 @@ def __init__( self.num_cpu_blocks = None def metrics_info(self): - # convert cache_config to dict(key: str, value: str) for prometheus metrics info + # convert cache_config to dict(key: str, value: str) for prometheus + # metrics info return {key: str(value) for key, value in self.__dict__.items()} def _verify_args(self) -> None: @@ -429,8 +430,9 @@ def __init__( ) -> None: self.pipeline_parallel_size = pipeline_parallel_size if is_neuron(): - # For Neuron device support, here we assign TP=1 to avoid sharding within vLLM directly. - # Transformer-neuronx would take neuron_tp_degree attribute, and distribute the workload + # For Neuron device support, here we assign TP=1 to avoid sharding + # within vLLM directly. Transformer-neuronx would take + # neuron_tp_degree attribute, and distribute the workload # to multiple NeuronCores. 
self.tensor_parallel_size = 1 self.neuron_tp_degree = tensor_parallel_size diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index daf83827a7e52..8bfc14999f0a7 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,6 +1,6 @@ """A block manager that manages token blocks.""" import enum -from itertools import count +from itertools import count, takewhile from os.path import commonprefix from typing import Dict, List, Optional, Set, Tuple @@ -95,13 +95,15 @@ def free(self, block: PhysicalTokenBlock) -> None: del self.cached_blocks[block.block_hash] def get_num_free_blocks(self) -> int: - return self.num_blocks - self.current_num_blocks + self.evictor.num_blocks + return (self.num_blocks - self.current_num_blocks + + self.evictor.num_blocks) def contains_block(self, block_hash: int) -> bool: return block_hash in self.cached_blocks or block_hash in self.evictor def update_hash(self, block_hash: int, block: PhysicalTokenBlock): - # If caching is enabled, update the hash of block and the cached_blocks dictionary. + # If caching is enabled, update the hash of block and the + # cached_blocks dictionary. if self.enable_caching: assert not self.contains_block(block_hash) old_hash = block.block_hash @@ -218,10 +220,12 @@ def _promote_last_block( seq: Sequence, last_block: PhysicalTokenBlock, ) -> PhysicalTokenBlock: - # Compute a new hash for the block so that it can be shared by other Sequences + # Compute a new hash for the block so that it can be shared by + # other Sequences new_hash = seq.hash_of_block(len(seq.logical_token_blocks) - 1) - # if new_hash is already in the cached table, then free last_block and return the cached version + # if new_hash is already in the cached table, then free last_block + # and return the cached version if self.gpu_allocator.contains_block(new_hash): self.gpu_allocator.free(last_block) return self.gpu_allocator.allocate(new_hash) @@ -289,7 +293,8 @@ def append_slot( assert last_block.device == Device.GPU if last_block.ref_count == 1: # Not shared with other sequences. Appendable. - # If the last block is now complete, promote it to a full block so that it can be shared + # If the last block is now complete, promote it to a full block so + # that it can be shared new_block = self._maybe_promote_last_block(seq, last_block) block_table[-1] = new_block return None @@ -426,23 +431,29 @@ def access_all_blocks_in_seq( for block in block_table: block.last_accessed = access_time - def compute_last_full_block_in_seq(self, seq: Sequence): + def compute_full_blocks_in_seq(self, seq: Sequence): if seq.seq_id not in self.block_tables: return max_full_block = seq.get_len() // self.block_size - 1 block_table = self.block_tables[seq.seq_id] if max_full_block == -1: return - block_table[max_full_block].computed = True + for i in reversed(range(max_full_block)): + if block_table[i].computed: + break + block_table[i].computed = True - def get_all_block_ids_till_computed(self, seq: Sequence) -> List[int]: + def get_all_computed_blocks(self, seq: Sequence) -> List[int]: if seq.seq_id not in self.block_tables: return [] block_table = self.block_tables[seq.seq_id] - for block_idx in reversed(range(len(block_table))): - if block_table[block_idx].computed: - return [b.block_number for b in block_table[:block_idx + 1]] - return [] + # NOTE We exclude the last block to avoid the case where the entire + # prompt is cached. This would cause erroneous behavior in model + # runner. 
+ return [ + b.block_number + for b in takewhile(lambda b: b.computed, block_table[:-1]) + ] def get_common_computed_block_ids(self, seq_group: SequenceGroup) -> List[int]: @@ -451,14 +462,12 @@ def get_common_computed_block_ids(self, return [] ids_list = [ - self.get_all_block_ids_till_computed(seq) + self.get_all_computed_blocks(seq) for seq in iter(seq_group.seqs_dict.values()) ] return commonprefix([ids for ids in ids_list if ids != []]) def mark_blocks_as_computed(self, seq_group: SequenceGroup): - # NOTE: We only mark the last full block because with prefix caching, - # all blocks until the marked one are guaranteed to be computed. if self.enable_caching: for seq in seq_group.seqs_dict.values(): - self.compute_last_full_block_in_seq(seq) + self.compute_full_blocks_in_seq(seq) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index b538ea574b604..1d81f5a97d71c 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -39,9 +39,9 @@ def add(self, block: PhysicalTokenBlock): @abstractmethod def remove(self, block_hash: int) -> PhysicalTokenBlock: """Simply removes the block with the hash value block_hash from the - evictor. Caller is responsible for making sure that block_hash is contained - in the evictor before calling remove. Should be used to "bring back" blocks - that have been freed but not evicted yet. + evictor. Caller is responsible for making sure that block_hash is + contained in the evictor before calling remove. Should be used to + "bring back" blocks that have been freed but not evicted yet. """ pass diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 1ae58f525b0fb..9255f91be55cb 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -59,10 +59,9 @@ def is_empty(self) -> bool: and not self.blocks_to_swap_out and not self.blocks_to_copy) def _sort_by_lora_ids(self) -> bool: - self.scheduled_seq_groups = sorted( - self.scheduled_seq_groups, - key=lambda g: (g.lora_request.lora_int_id - if g.lora_request else 0, g.request_id)) + self.scheduled_seq_groups = sorted(self.scheduled_seq_groups, + key=lambda g: + (g.lora_int_id, g.request_id)) @property def lora_requests(self) -> Set[LoRARequest]: @@ -215,8 +214,8 @@ def _schedule(self) -> SchedulerOutputs: lora_int_id = 0 if self.lora_enabled: lora_int_id = seq_group.lora_int_id - if lora_int_id > 0 and lora_int_id not in curr_loras and len( - curr_loras) >= self.lora_config.max_loras: + if (lora_int_id > 0 and lora_int_id not in curr_loras + and len(curr_loras) >= self.lora_config.max_loras): # We don't have a space for another LoRA, so # we ignore this request for now. leftover_waiting_sequences.appendleft(seq_group) @@ -310,8 +309,8 @@ def _schedule(self) -> SchedulerOutputs: lora_int_id = 0 if self.lora_enabled: lora_int_id = seq_group.lora_int_id - if lora_int_id > 0 and lora_int_id not in curr_loras and len( - curr_loras) >= self.lora_config.max_loras: + if (lora_int_id > 0 and lora_int_id not in curr_loras + and len(curr_loras) >= self.lora_config.max_loras): # We don't have a space for another LoRA, so # we ignore this request for now. 
leftover_swapped.appendleft(seq_group) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index df66139fddcd1..5629d1a863d04 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1,8 +1,11 @@ import asyncio +import os import time from functools import partial from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type, - Union, AsyncIterator) + Union, AsyncIterator, Callable) + +from transformers import PreTrainedTokenizer from vllm.lora.request import LoRARequest from vllm.config import ModelConfig @@ -14,28 +17,31 @@ from vllm.sampling_params import SamplingParams logger = init_logger(__name__) +ENGINE_ITERATION_TIMEOUT_S = int( + os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")) class AsyncEngineDeadError(RuntimeError): pass -def _raise_exception_on_finish(task: asyncio.Task, - request_tracker: "RequestTracker") -> None: +def _raise_exception_on_finish( + task: asyncio.Task, error_callback: Callable[[Exception], + None]) -> None: msg = ("Task finished unexpectedly. This should never happen! " "Please open an issue on Github.") + + exception = None try: - try: - task.result() - except asyncio.CancelledError: - return - except Exception as exc: - raise AsyncEngineDeadError( - msg + " See stack trace above for the actual cause.") from exc + task.result() + # NOTE: This will be thrown if task exits normally (which it should not) raise AsyncEngineDeadError(msg) - except Exception as exc: - request_tracker.propagate_exception(exc) - raise exc + except Exception as e: + exception = e + logger.error("Engine background task failed", exc_info=e) + error_callback(exception) + raise AsyncEngineDeadError( + msg + " See stack trace above for the actual cause.") from e class AsyncStream: @@ -78,13 +84,13 @@ def __init__(self) -> None: self._finished_requests: asyncio.Queue[str] = asyncio.Queue() self._new_requests: asyncio.Queue[Tuple[AsyncStream, dict]] = asyncio.Queue() - self.new_requests_event = None + self.new_requests_event = asyncio.Event() def __contains__(self, item): return item in self._request_streams - def init_event(self): - self.new_requests_event = asyncio.Event() + def __len__(self) -> int: + return len(self._request_streams) def propagate_exception(self, exc: Exception, @@ -93,9 +99,11 @@ def propagate_exception(self, (all if request_id is None).""" if request_id is not None: self._request_streams[request_id].put(exc) + self.abort_request(request_id) else: - for stream in self._request_streams.values(): + for rid, stream in self._request_streams.items(): stream.put(exc) + self.abort_request(rid) def process_request_output(self, request_output: RequestOutput, @@ -172,12 +180,15 @@ def get_new_and_finished_requests(self) -> Tuple[List[Dict], Set[str]]: self._request_streams[stream.request_id] = stream new_requests.append(new_request) - self.new_requests_event.clear() - return new_requests, finished_requests async def wait_for_new_requests(self): - await self.new_requests_event.wait() + if not self.has_new_requests(): + await self.new_requests_event.wait() + self.new_requests_event.clear() + + def has_new_requests(self): + return not self._new_requests.empty() class _AsyncLLMEngine(LLMEngine): @@ -285,6 +296,10 @@ async def _run_workers_async( all_outputs = await asyncio.gather(*coros) return all_outputs + async def check_health_async(self): + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + class AsyncLLMEngine: """An asynchronous wrapper for LLMEngine. 
@@ -335,27 +350,51 @@ def __init__(self, # collected self._background_loop_unshielded = None self.start_engine_loop = start_engine_loop - self._request_tracker = RequestTracker() + self._request_tracker: Optional[RequestTracker] = None + self._errored_with: Optional[BaseException] = None @property def is_running(self) -> bool: return (self.background_loop is not None - and not self.background_loop.done()) + and not self._background_loop_unshielded.done()) + + @property + def is_stopped(self) -> bool: + return self.errored or (self.background_loop is not None + and self._background_loop_unshielded.done()) - def get_tokenizer(self): - return self.engine.tokenizer.tokenizer + @property + def errored(self) -> bool: + return self._errored_with is not None + + def set_errored(self, exc: Exception) -> None: + self._errored_with = exc + + def _error_callback(self, exc: Exception) -> None: + self.set_errored(exc) + self._request_tracker.propagate_exception(exc) + + async def get_tokenizer(self) -> "PreTrainedTokenizer": + if self.engine_use_ray: + return await self.engine.get_tokenizer.remote() + else: + return self.engine.get_tokenizer() def start_background_loop(self) -> None: """Start the background loop.""" + if self.errored: + raise AsyncEngineDeadError( + "Background loop has errored already.") from self._errored_with if self.is_running: raise RuntimeError("Background loop is already running.") - self._request_tracker.init_event() + # Initialize the RequestTracker here so it uses the right event loop. + self._request_tracker = RequestTracker() self._background_loop_unshielded = asyncio.get_event_loop( ).create_task(self.run_engine_loop()) self._background_loop_unshielded.add_done_callback( partial(_raise_exception_on_finish, - request_tracker=self._request_tracker)) + error_callback=self._error_callback)) self.background_loop = asyncio.shield(self._background_loop_unshielded) def _init_engine(self, *args, @@ -423,12 +462,23 @@ async def _engine_abort(self, request_ids: Iterable[str]): self.engine.abort_request(request_ids) async def run_engine_loop(self): - # Initialize the RequestTracker here so it uses the right event loop. has_requests_in_progress = False while True: if not has_requests_in_progress: + logger.debug("Waiting for new requests...") await self._request_tracker.wait_for_new_requests() - has_requests_in_progress = await self.engine_step() + logger.debug("Got new requests!") + + # Abort if iteration takes too long due to unrecoverable errors + # (eg. NCCL timeouts). + try: + has_requests_in_progress = await asyncio.wait_for( + self.engine_step(), ENGINE_ITERATION_TIMEOUT_S) + except asyncio.TimeoutError as exc: + logger.error( + "Engine iteration timed out. 
This should never happen!") + self.set_errored(exc) + raise await asyncio.sleep(0) async def add_request( @@ -647,3 +697,19 @@ async def do_log_stats(self) -> None: await self.engine.do_log_stats.remote() else: self.engine.do_log_stats() + + async def check_health(self): + """Raises an error if engine is unhealthy.""" + t = time.perf_counter() + logger.debug("Starting health check...") + if self.is_stopped: + raise AsyncEngineDeadError("Background loop is stopped.") + + if self.engine_use_ray: + try: + await self.engine.check_health.remote() + except ray.exceptions.RayActorError as e: + raise RuntimeError("Engine is dead.") from e + else: + await self.engine.check_health_async() + logger.debug(f"Health check took {time.perf_counter()-t}s") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 936bff652ee2b..3e35e04107fa2 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -9,6 +9,8 @@ from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union) +from transformers import PreTrainedTokenizer + import vllm from vllm.lora.request import LoRARequest from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, @@ -100,7 +102,8 @@ def __init__( f"download_dir={model_config.download_dir!r}, " f"load_format={model_config.load_format}, " f"tensor_parallel_size={parallel_config.tensor_parallel_size}, " - f"disable_custom_all_reduce={parallel_config.disable_custom_all_reduce}, " + f"disable_custom_all_reduce=" + f"{parallel_config.disable_custom_all_reduce}, " f"quantization={model_config.quantization}, " f"sparsity={model_config.sparsity}, " f"enforce_eager={model_config.enforce_eager}, " @@ -161,7 +164,16 @@ def __init__( if USE_RAY_COMPILED_DAG: self.forward_dag = self._compiled_ray_dag() - def get_tokenizer_for_seq(self, sequence: Sequence): + def __reduce__(self): + # This is to ensure that the LLMEngine is not referenced in + # the closure used to initialize Ray worker actors + raise RuntimeError("LLMEngine should not be pickled!") + + def get_tokenizer(self) -> "PreTrainedTokenizer": + return self.tokenizer.get_lora_tokenizer() + + def get_tokenizer_for_seq(self, + sequence: Sequence) -> "PreTrainedTokenizer": return self.tokenizer.get_lora_tokenizer(sequence.lora_request) def _dispatch_worker(self): @@ -283,6 +295,8 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", parallel_config = copy.deepcopy(self.parallel_config) scheduler_config = copy.deepcopy(self.scheduler_config) device_config = copy.deepcopy(self.device_config) + lora_config = copy.deepcopy(self.lora_config) + kv_cache_dtype = self.cache_config.cache_dtype for rank, (worker, (node_id, _)) in enumerate(zip(self.workers, @@ -298,22 +312,22 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", local_rank, rank, distributed_init_method, - lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, + lora_config=lora_config, + kv_cache_dtype=kv_cache_dtype, )) driver_rank = 0 driver_local_rank = node_workers[driver_node_id].index(driver_rank) self.driver_worker = Worker( - model_config, - parallel_config, - scheduler_config, - device_config, + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, driver_local_rank, driver_rank, distributed_init_method, lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, + kv_cache_dtype=kv_cache_dtype, is_driver_worker=True, ) @@ -494,8 +508,10 @@ def add_request( # Create the sequences. 
block_size = self.cache_config.block_size seq_id = next(self.seq_counter) + eos_token_id = self.tokenizer.get_lora_tokenizer( + lora_request).eos_token_id seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, - lora_request) + eos_token_id, lora_request) # Defensive copy of SamplingParams, which are used by the sampler, # this doesn't deep-copy LogitsProcessor objects @@ -551,15 +567,13 @@ def _check_beam_search_early_stopping( if early_stopping is True: return True - current_worst_score = (current_worst_seq.get_beam_search_score( + current_worst_score = current_worst_seq.get_beam_search_score( length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - current_worst_seq).eos_token_id)) + eos_token_id=current_worst_seq.eos_token_id) if early_stopping is False: - highest_attainable_score = (best_running_seq.get_beam_search_score( + highest_attainable_score = best_running_seq.get_beam_search_score( length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - best_running_seq).eos_token_id)) + eos_token_id=best_running_seq.eos_token_id) else: assert early_stopping == "never" if length_penalty > 0.0: @@ -573,8 +587,7 @@ def _check_beam_search_early_stopping( highest_attainable_score = ( best_running_seq.get_beam_search_score( length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - best_running_seq).eos_token_id, + eos_token_id=best_running_seq.eos_token_id, seq_len=max_possible_length)) else: # Otherwise, beam search will prefer shorter sequences. The @@ -583,8 +596,7 @@ def _check_beam_search_early_stopping( highest_attainable_score = ( best_running_seq.get_beam_search_score( length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - best_running_seq).eos_token_id)) + eos_token_id=best_running_seq.eos_token_id)) return current_worst_score >= highest_attainable_score def _process_sequence_group_outputs(self, seq_group: SequenceGroup, @@ -682,8 +694,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, all_finished_seqs = existing_finished_seqs + new_finished_seqs # Sort the finished sequences by their scores. all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id), + length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), reverse=True) for seq, parent, is_new in all_finished_seqs[:beam_width]: if is_new: @@ -710,8 +721,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, if not seq.is_finished()] # Sort the running sequences by their scores. running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id), + length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), reverse=True) # Check if we can stop the beam search. @@ -923,7 +933,8 @@ def _get_stats(self, # Latency Timings. time_last_iters = [] for seq_group in scheduler_outputs.scheduled_seq_groups: - # Time since last token. (n.b. updates seq_group.metrics.last_token_time) + # Time since last token. + # (n.b. updates seq_group.metrics.last_token_time) time_last_iters.append(seq_group.get_last_latency(now)) # Time since arrival for all finished requests. 
if seq_group.is_finished(): @@ -955,16 +966,17 @@ def _decode_logprobs(self, seq: Sequence, prms: SamplingParams, for token_id, sample_logprob in logprobs.items(): if (sample_logprob.decoded_token is None and token_id != -1): all_input_ids_with_logprob = all_input_ids[:-1] + [token_id] - _, new_text, prefix_offset, read_offset = detokenize_incrementally( - self.get_tokenizer_for_seq(seq), - all_input_ids=all_input_ids_with_logprob, - prev_tokens=seq.tokens, - prefix_offset=seq.prefix_offset, - read_offset=seq.read_offset, - skip_special_tokens=prms.skip_special_tokens, - spaces_between_special_tokens=prms. - spaces_between_special_tokens, - ) + (_, new_text, prefix_offset, + read_offset) = detokenize_incrementally( + self.get_tokenizer_for_seq(seq), + all_input_ids=all_input_ids_with_logprob, + prev_tokens=seq.tokens, + prefix_offset=seq.prefix_offset, + read_offset=seq.read_offset, + skip_special_tokens=prms.skip_special_tokens, + spaces_between_special_tokens=prms. + spaces_between_special_tokens, + ) sample_logprob.decoded_token = new_text def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None: @@ -1017,8 +1029,8 @@ def _check_stop(self, seq: Sequence, return # Check if the sequence has generated the EOS token. - if ((not sampling_params.ignore_eos) and seq.get_last_token_id() - == self.get_tokenizer_for_seq(seq).eos_token_id): + if ((not sampling_params.ignore_eos) + and seq.get_last_token_id() == seq.eos_token_id): seq.status = SequenceStatus.FINISHED_STOPPED return @@ -1122,3 +1134,23 @@ def _compiled_ray_dag(self): for worker in self.workers ]) return forward_dag.experimental_compile() + + def check_health(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + + def _check_if_any_actor_is_dead(self): + if not self.parallel_config.worker_use_ray: + return + + if not self.workers: + return + + dead_actors = [] + for actor in self.workers: + actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access + if actor_state["State"] == "DEAD": + dead_actors.append(actor) + if dead_actors: + raise RuntimeError("At least one Worker is dead. " + f"Dead Workers: {dead_actors}. ") diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index d31542159e4a4..17b1852f5b0a3 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,5 +1,6 @@ from vllm.logger import init_logger -from prometheus_client import Counter, Gauge, Histogram, Info, REGISTRY, disable_created_metrics +from prometheus_client import (Counter, Gauge, Histogram, Info, REGISTRY, + disable_created_metrics) import time import numpy as np @@ -177,10 +178,12 @@ def _log_prometheus(self, stats: Stats) -> None: def _log_prometheus_interval(self, prompt_throughput: float, generation_throughput: float) -> None: # Logs metrics to prometheus that are computed every logging_interval. - # Support legacy gauge metrics that make throughput calculations on the vLLM side. - # Moving forward, we should use counters like counter_prompt_tokens, counter_generation_tokens - # Which log raw data and calculate summaries using rate() on the grafana/prometheus side. - # See https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 + # Support legacy gauge metrics that make throughput calculations on + # the vLLM side. Moving forward, we should use counters like + # counter_prompt_tokens, counter_generation_tokens + # Which log raw data and calculate summaries using rate() on the + # grafana/prometheus side. 
See + # https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 self.metrics.gauge_avg_prompt_throughput.labels( **self.labels).set(prompt_throughput) self.metrics.gauge_avg_generation_throughput.labels( @@ -188,7 +191,7 @@ def _log_prometheus_interval(self, prompt_throughput: float, def log(self, stats: Stats) -> None: """Called by LLMEngine. - Logs to prometheus and tracked stats every iteration. + Logs to prometheus and tracked stats every iteration. Logs to Stdout every self.local_interval seconds.""" # Log to prometheus. @@ -200,8 +203,8 @@ def log(self, stats: Stats) -> None: # Log locally every local_interval seconds. if self._local_interval_elapsed(stats.now): - - # Compute summary metrics for tracked stats (and log them to promethus if applicable). + # Compute summary metrics for tracked stats (and log them + # to promethus if applicable). prompt_throughput = self._get_throughput(self.num_prompt_tokens, now=stats.now) generation_throughput = self._get_throughput( @@ -213,7 +216,8 @@ def log(self, stats: Stats) -> None: # Log to stdout. logger.info( f"Avg prompt throughput: {prompt_throughput:.1f} tokens/s, " - f"Avg generation throughput: {generation_throughput:.1f} tokens/s, " + f"Avg generation throughput: " + f"{generation_throughput:.1f} tokens/s, " f"Running: {stats.num_running} reqs, " f"Swapped: {stats.num_swapped} reqs, " f"Pending: {stats.num_waiting} reqs, " diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 1eb4ab8b06b64..86b6c4c67cfa4 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -1,7 +1,9 @@ """ -NOTE: This API server is used only for demonstrating usage of AsyncEngine and simple performance benchmarks. -It is not intended for production use. For production use, we recommend using our OpenAI compatible server. -We are also not going to accept PRs modifying this file, please change `vllm/entrypoints/openai/api_server.py` instead. +NOTE: This API server is used only for demonstrating usage of AsyncEngine +and simple performance benchmarks. It is not intended for production use. +For production use, we recommend using our OpenAI compatible server. +We are also not going to accept PRs modifying this file, please +change `vllm/entrypoints/openai/api_server.py` instead. """ import argparse diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 2d46e918ddc23..8938f655d75da 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -200,7 +200,9 @@ def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: # Initialize tqdm. if use_tqdm: num_requests = self.llm_engine.get_num_unfinished_requests() - pbar = tqdm(total=num_requests, desc="Processed prompts") + pbar = tqdm(total=num_requests, + desc="Processed prompts", + dynamic_ncols=True) # Run the engine. 
outputs: List[RequestOutput] = [] while self.llm_engine.has_unfinished_requests(): diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 993a834e5a720..00407bc0e809c 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -18,7 +18,9 @@ import vllm from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest, ErrorResponse +from vllm.entrypoints.openai.protocol import (CompletionRequest, + ChatCompletionRequest, + ErrorResponse) from vllm.logger import init_logger from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion @@ -84,13 +86,11 @@ def parse_args(): type=json.loads, default=["*"], help="allowed headers") - parser.add_argument( - "--api-key", - type=str, - default=None, - help= - "If provided, the server will require this key to be presented in the header." - ) + parser.add_argument("--api-key", + type=str, + default=None, + help="If provided, the server will require this key " + "to be presented in the header.") parser.add_argument("--served-model-name", type=str, default=None, @@ -103,9 +103,8 @@ def parse_args(): default=None, nargs='+', action=LoRAParserAction, - help= - "LoRA module configurations in the format name=path. Multiple modules can be specified." - ) + help="LoRA module configurations in the format name=path. " + "Multiple modules can be specified.") parser.add_argument("--chat-template", type=str, default=None, @@ -138,9 +137,10 @@ def parse_args(): help="Additional ASGI middleware to apply to the app. " "We accept multiple --middleware arguments. " "The value should be an import path. " - "If a function is provided, vLLM will add it to the server using @app.middleware('http'). " - "If a class is provided, vLLM will add it to the server using app.add_middleware(). " - ) + "If a function is provided, vLLM will add it to the server " + "using @app.middleware('http'). " + "If a class is provided, vLLM will add it to the server " + "using app.add_middleware(). ") parser = AsyncEngineArgs.add_cli_args(parser) return parser.parse_args() @@ -160,6 +160,7 @@ async def validation_exception_handler(_, exc): @app.get("/health") async def health() -> Response: """Health check.""" + await openai_serving_chat.engine.check_health() return Response(status_code=200) @@ -234,9 +235,8 @@ async def authentication(request: Request, call_next): elif inspect.iscoroutinefunction(imported): app.middleware("http")(imported) else: - raise ValueError( - f"Invalid middleware {middleware}. Must be a function or a class." - ) + raise ValueError(f"Invalid middleware {middleware}. 
" + f"Must be a function or a class.") logger.info(f"vLLM API server version {vllm.__version__}") logger.info(f"args: {args}") diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index ba352f18f6454..d2fb9ca001b15 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -12,7 +12,8 @@ UsageInfo) from vllm.outputs import RequestOutput from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA -from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor +from vllm.model_executor.guided_decoding import ( + get_guided_decoding_logits_processor) logger = init_logger(__name__) @@ -37,8 +38,9 @@ async def create_chat_completion( ChatCompletionResponse]: """Completion API similar to OpenAI's API. - See https://platform.openai.com/docs/api-reference/chat/create - for the API specification. This API mimics the OpenAI ChatCompletion API. + See https://platform.openai.com/docs/api-reference/chat/create + for the API specification. This API mimics the OpenAI + ChatCompletion API. NOTE: Currently we do not support the following feature: - function_call (Users should implement this by themselves) @@ -65,7 +67,7 @@ async def create_chat_completion( lora_request = self._maybe_get_lora(request) guided_decode_logits_processor = ( await get_guided_decoding_logits_processor( - request, self.engine.get_tokenizer())) + request, await self.engine.get_tokenizer())) if guided_decode_logits_processor: if sampling_params.logits_processors is None: sampling_params.logits_processors = [] @@ -116,7 +118,8 @@ async def chat_completion_stream_generator( # the result_generator, it needs to be sent as the FIRST # response (by the try...catch). if first_iteration: - # Send first response for each request.n (index) with the role + # Send first response for each request.n (index) with + # the role role = self.get_chat_request_role(request) for i in range(request.n): choice_data = ChatCompletionResponseStreamChoice( @@ -133,7 +136,8 @@ async def chat_completion_stream_generator( data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" - # Send response to echo the input portion of the last message + # Send response to echo the input portion of the + # last message if request.echo: last_msg_content = "" if request.messages and isinstance( @@ -145,11 +149,12 @@ async def chat_completion_stream_generator( if last_msg_content: for i in range(request.n): - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage( - content=last_msg_content), - finish_reason=None) + choice_data = ( + ChatCompletionResponseStreamChoice( + index=i, + delta=DeltaMessage( + content=last_msg_content), + finish_reason=None)) chunk = ChatCompletionStreamResponse( id=request_id, object=chunk_object_type, diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index a8244fd150753..b78f053800f3c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -1,7 +1,8 @@ import asyncio import time from fastapi import Request -from typing import AsyncGenerator, AsyncIterator, Callable, List, Optional, Dict, Tuple +from typing import (AsyncGenerator, AsyncIterator, Callable, List, Optional, + Dict, Tuple) from vllm.logger import init_logger from vllm.utils import random_uuid from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -16,7 +17,8 @@ ) from vllm.outputs import RequestOutput from 
vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA -from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor +from vllm.model_executor.guided_decoding import ( + get_guided_decoding_logits_processor) logger = init_logger(__name__) @@ -44,9 +46,8 @@ def parse_prompt_format(prompt) -> Tuple[bool, list]: prompt_is_tokens = True prompts = prompt # case 4: array of token arrays else: - raise ValueError( - "prompt must be a string, array of strings, array of tokens, or array of token arrays" - ) + raise ValueError("prompt must be a string, array of strings, " + "array of tokens, or array of token arrays") return prompt_is_tokens, prompts @@ -126,7 +127,7 @@ async def create_completion(self, request: CompletionRequest, lora_request = self._maybe_get_lora(request) guided_decode_logit_processor = ( await get_guided_decoding_logits_processor( - request, self.engine.get_tokenizer())) + request, await self.engine.get_tokenizer())) if guided_decode_logit_processor is not None: if sampling_params.logits_processors is None: sampling_params.logits_processors = [] @@ -156,7 +157,8 @@ async def create_completion(self, request: CompletionRequest, int, RequestOutput]] = merge_async_iterators(*generators) # Similar to the OpenAI API, when n != best_of, we do not stream the - # results. In addition, we do not stream the results when use beam search. + # results. In addition, we do not stream the results when use + # beam search. stream = (request.stream and (request.best_of is None or request.n == request.best_of) and not request.use_beam_search) @@ -223,7 +225,8 @@ async def completion_stream_generator( for output in res.outputs: i = output.index + prompt_idx * request.n - # TODO(simon): optimize the performance by avoiding full text O(n^2) sending. + # TODO(simon): optimize the performance by avoiding full + # text O(n^2) sending. 
if request.echo and request.max_tokens == 0: # only return the prompt @@ -231,11 +234,12 @@ async def completion_stream_generator( delta_token_ids = res.prompt_token_ids top_logprobs = res.prompt_logprobs has_echoed[i] = True - elif request.echo and request.max_tokens > 0 and not has_echoed[ - i]: + elif (request.echo and request.max_tokens > 0 + and not has_echoed[i]): # echo the prompt and first token delta_text = res.prompt + output.text - delta_token_ids = res.prompt_token_ids + output.token_ids + delta_token_ids = (res.prompt_token_ids + + output.token_ids) top_logprobs = res.prompt_logprobs + (output.logprobs or []) has_echoed[i] = True @@ -248,7 +252,9 @@ async def completion_stream_generator( i]:] if output.logprobs else None if request.logprobs is not None: - assert top_logprobs is not None, "top_logprobs must be provided when logprobs is requested" + assert top_logprobs is not None, ( + "top_logprobs must be provided when logprobs " + "is requested") logprobs = self._create_logprobs( token_ids=delta_token_ids, top_logprobs=top_logprobs, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 230d13d97dbba..2db884945c491 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -50,10 +50,12 @@ def __init__(self, except RuntimeError: event_loop = None - if event_loop is not None and event_loop.is_running( - ): # If the current is instanced by Ray Serve, there is already a running event loop + if event_loop is not None and event_loop.is_running(): + # If the current is instanced by Ray Serve, + # there is already a running event loop event_loop.create_task(self._post_init()) - else: # When using single vLLM without engine_use_ray + else: + # When using single vLLM without engine_use_ray asyncio.run(self._post_init()) async def _post_init(self): @@ -178,8 +180,9 @@ def _validate_prompt_and_tokenize( if token_num + request.max_tokens > self.max_model_len: raise ValueError( - f"This model's maximum context length is {self.max_model_len} tokens. " - f"However, you requested {request.max_tokens + token_num} tokens " + f"This model's maximum context length is " + f"{self.max_model_len} tokens. However, you requested " + f"{request.max_tokens + token_num} tokens " f"({token_num} in the messages, " f"{request.max_tokens} in the completion). 
" f"Please reduce the length of the messages or completion.", ) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e667d70f71e39..99e6cdeee6364 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -20,10 +20,12 @@ RowParallelLinear, QKVParallelLinear, MergedColumnParallelLinear) -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.parallel_utils.utils import split_tensor_along_last_dim +from vllm.model_executor.parallel_utils.utils import ( + split_tensor_along_last_dim) if TYPE_CHECKING: pass @@ -84,7 +86,8 @@ def _apply_lora_packed_nslice( lora_b_stacked: 3 element tuple of (num_loras, output_dim, lora_rank) indices: (batch_size) output: (batch_size, q_slice_size + 2*kv_slice_size) - output_slices: n-1 element tuple of (slice_size...), where n is number of slices + output_slices: n-1 element tuple of (slice_size...), + where n is number of slices """ org_output = output x = x.view(-1, x.shape[-1]) @@ -819,9 +822,8 @@ def create_lora_weights( ) -> None: # Keep this in sync with csrc/punica/bgmv/bgmv_config.h if 32000 < self.base_layer.vocab_size > 33024: - raise ValueError( - "When using LoRA, vocab size must be 32000 >= vocab_size <= 33024" - ) + raise ValueError("When using LoRA, vocab size must be " + "32000 >= vocab_size <= 33024") self.lora_a_stacked = torch.zeros( ( max_loras, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 7386d21c58e4e..238da256b7cdc 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -13,7 +13,8 @@ from vllm.config import LoRAConfig from vllm.utils import LRUCache, in_wsl -from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping, from_layer, from_layer_sampler +from vllm.lora.layers import (BaseLayerWithLoRA, LoRAMapping, from_layer, + from_layer_sampler) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 7e92bc93ab472..911115d63a639 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -154,10 +154,9 @@ def _load_lora(self, lora_request: LoRARequest) -> LoRAModel: f"LoRA rank {lora.rank} is greater than max_lora_rank " f"{self.lora_config.max_lora_rank}.") if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size: - raise ValueError( - f"LoRA added vocab size {lora.extra_vocab_size} is greater than " - f"lora_extra_vocab_size {self.lora_config.lora_extra_vocab_size}." 
- ) + raise ValueError(f"LoRA added vocab size {lora.extra_vocab_size} " + f"is greater than lora_extra_vocab_size " + f"{self.lora_config.lora_extra_vocab_size}.") return lora def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool: diff --git a/vllm/model_executor/guided_decoding.py b/vllm/model_executor/guided_decoding.py index a8573f8bdc6c8..00984460d79a6 100644 --- a/vllm/model_executor/guided_decoding.py +++ b/vllm/model_executor/guided_decoding.py @@ -8,8 +8,10 @@ from typing import Union, Tuple from pydantic import BaseModel -from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest -from vllm.model_executor.guided_logits_processors import JSONLogitsProcessor, RegexLogitsProcessor +from vllm.entrypoints.openai.protocol import (CompletionRequest, + ChatCompletionRequest) +from vllm.model_executor.guided_logits_processors import (JSONLogitsProcessor, + RegexLogitsProcessor) class GuidedDecodingMode(Enum): diff --git a/vllm/model_executor/guided_logits_processors.py b/vllm/model_executor/guided_logits_processors.py index 1b3e5e71a5911..76d41aa37dd7b 100644 --- a/vllm/model_executor/guided_logits_processors.py +++ b/vllm/model_executor/guided_logits_processors.py @@ -107,12 +107,15 @@ def __init__(self, Parameters ---------- schema - A JSON schema that encodes the structure we want the model to generate + A JSON schema that encodes the structure we want the model to + generate tokenizer The model's tokenizer whitespace_pattern - Pattern to use for JSON syntactic whitespace (doesn't impact string literals) - Example: allow only a single space or newline with `whitespace_pattern=r"[\n ]?"` + Pattern to use for JSON syntactic whitespace (doesn't impact + string literals) + Example: allow only a single space or newline with + `whitespace_pattern=r"[\n ]?"` """ if isinstance(schema, type(BaseModel)): schema_str = json.dumps(schema.model_json_schema()) @@ -122,8 +125,8 @@ def __init__(self, schema_str = schema else: raise ValueError( - f"Cannot parse schema {schema}. The schema must be either " + - "a Pydantic object, a dictionary or a string that contains the JSON " - + "Schema specification") + f"Cannot parse schema {schema}. The schema must be either " + f"a Pydantic object, a dictionary or a string that contains " + f"the JSON Schema specification") regex_string = build_regex_from_schema(schema_str, whitespace_pattern) super().__init__(regex_string, tokenizer) diff --git a/vllm/model_executor/layers/attention/__init__.py b/vllm/model_executor/layers/attention/__init__.py new file mode 100644 index 0000000000000..1c42a3d28f976 --- /dev/null +++ b/vllm/model_executor/layers/attention/__init__.py @@ -0,0 +1,5 @@ +from vllm.model_executor.layers.attention.attention import Attention + +__all__ = [ + "Attention", +] diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py new file mode 100644 index 0000000000000..4b63b9eaf59a7 --- /dev/null +++ b/vllm/model_executor/layers/attention/attention.py @@ -0,0 +1,84 @@ +"""Attention layer.""" +from functools import lru_cache +from typing import List, Optional + +import torch +import torch.nn as nn + +from vllm.logger import init_logger +from vllm.model_executor.input_metadata import InputMetadata +from vllm.utils import is_hip + +logger = init_logger(__name__) + + +class Attention(nn.Module): + """Attention layer. + + This class takes query, key, and value tensors as input. The input tensors + can either contain prompt tokens or generation tokens. 
+ The class does the following: + + 1. Store the input key and value tensors in the KV cache. + 2. Perform (multi-head/multi-query/grouped-query) attention. + 3. Return the output tensor. + """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: Optional[int] = None, + alibi_slopes: Optional[List[float]] = None, + sliding_window: Optional[int] = None, + ) -> None: + super().__init__() + if _use_flash_attn(): + from vllm.model_executor.layers.attention.backends.flash_attn import FlashAttentionBackend # noqa: E501 + self.backend = FlashAttentionBackend(num_heads, head_size, scale, + num_kv_heads, alibi_slopes, + sliding_window) + else: + from vllm.model_executor.layers.attention.backends.xformers import XFormersBackend # noqa: E501 + self.backend = XFormersBackend(num_heads, head_size, scale, + num_kv_heads, alibi_slopes, + sliding_window) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: Optional[torch.Tensor], + value_cache: Optional[torch.Tensor], + input_metadata: InputMetadata, + ) -> torch.Tensor: + return self.backend.forward(query, key, value, key_cache, value_cache, + input_metadata) + + +@lru_cache(maxsize=1) +def _use_flash_attn() -> bool: + try: + import flash_attn # noqa: F401 + except ImportError: + logger.info("flash_attn is not found. Using xformers backend.") + return False + + if is_hip(): + # AMD GPUs. + return False + if torch.cuda.get_device_capability()[0] < 8: + # Volta and Turing NVIDIA GPUs. + logger.info("flash_attn is not supported on Turing or older GPUs. " + "Using xformers backend.") + return False + if torch.get_default_dtype() not in (torch.float16, torch.bfloat16): + logger.info( + "flash_attn only supports torch.float16 or torch.bfloat16. 
" + "Using xformers backend.") + return False + + logger.info("Using flash_attn backend.") + return True diff --git a/vllm/model_executor/layers/attention/backends/__init__.py b/vllm/model_executor/layers/attention/backends/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py new file mode 100644 index 0000000000000..58ccd461b993e --- /dev/null +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -0,0 +1,121 @@ +"""Attention layer with Flash and PagedAttention.""" +from typing import List, Optional + +from flash_attn import flash_attn_func +import torch + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.attention.ops.paged_attn import ( + PagedAttentionImpl) + + +class FlashAttentionBackend: + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: Optional[int] = None, + alibi_slopes: Optional[List[float]] = None, + sliding_window: Optional[int] = None, + ) -> None: + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.sliding_window = sliding_window + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + suppored_head_sizes = PagedAttentionImpl.get_supported_head_sizes() + if head_size not in suppored_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {suppored_head_sizes}.") + + self.sliding_window = ((self.sliding_window, self.sliding_window) if + self.sliding_window is not None else (-1, -1)) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: Optional[torch.Tensor], + value_cache: Optional[torch.Tensor], + input_metadata: InputMetadata, + ) -> torch.Tensor: + """Forward pass with FlashAttention and PagedAttention. + + Args: + query: shape = [batch_size, seq_len, num_heads * head_size] + key: shape = [batch_size, seq_len, num_kv_heads * head_size] + value: shape = [batch_size, seq_len, num_kv_heads * head_size] + key_cache: shape = [num_blocks, num_kv_heads, head_size/x, + block_size, x] + value_cache: shape = [num_blocks, num_kv_heads, head_size, + block_size] + input_metadata: metadata for the inputs. + Returns: + shape = [batch_size, seq_len, num_heads * head_size] + """ + batch_size, seq_len, hidden_size = query.shape + # Reshape the query, key, and value tensors. + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + + # Reshape the keys and values and store them in the cache. + # If key_cache and value_cache are not provided, the new key and value + # vectors will not be cached. This happens during the initial memory + # profiling run. + if key_cache is not None and value_cache is not None: + PagedAttentionImpl.reshape_and_cache(key, value, key_cache, + value_cache, input_metadata) + + if input_metadata.is_prompt: + # Prompt run. 
+ if (key_cache is None or value_cache is None + or input_metadata.block_tables.numel() == 0): + # normal attention + query = query.unflatten(0, (batch_size, seq_len)) + key = key.unflatten(0, (batch_size, seq_len)) + value = value.unflatten(0, (batch_size, seq_len)) + output = flash_attn_func( + query, + key, + value, + softmax_scale=self.scale, + causal=True, + window_size=self.sliding_window, + alibi_slopes=self.alibi_slopes, + ) + else: + # prefix-enabled attention + output = PagedAttentionImpl.forward_prefix( + query, + key, + value, + key_cache, + value_cache, + input_metadata, + self.alibi_slopes, + ) + else: + # Decoding run. + output = PagedAttentionImpl.forward_decode( + query, + key_cache, + value_cache, + input_metadata, + self.num_kv_heads, + self.scale, + self.alibi_slopes, + ) + + # Reshape the output tensor. + return output.view(batch_size, seq_len, hidden_size) diff --git a/vllm/model_executor/layers/attention/backends/xformers.py b/vllm/model_executor/layers/attention/backends/xformers.py new file mode 100644 index 0000000000000..bad2a648b6703 --- /dev/null +++ b/vllm/model_executor/layers/attention/backends/xformers.py @@ -0,0 +1,255 @@ +"""Attention layer with xFormers and PagedAttention.""" +import importlib +from typing import List, Optional + +import torch +from xformers import ops as xops +from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, + LowerTriangularMaskWithTensorBias) + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.attention.ops.paged_attn import ( + PagedAttentionImpl) +from vllm.utils import is_hip + + +class XFormersBackend: + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: Optional[int] = None, + alibi_slopes: Optional[List[float]] = None, + sliding_window: Optional[int] = None, + ) -> None: + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.sliding_window = sliding_window + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + suppored_head_sizes = PagedAttentionImpl.get_supported_head_sizes() + if head_size not in suppored_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {suppored_head_sizes}.") + + self.use_ref_attention = _check_use_ref_attention() + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: Optional[torch.Tensor], + value_cache: Optional[torch.Tensor], + input_metadata: InputMetadata, + ) -> torch.Tensor: + """Forward pass with xFormers and PagedAttention. + + Args: + query: shape = [batch_size, seq_len, num_heads * head_size] + key: shape = [batch_size, seq_len, num_kv_heads * head_size] + value: shape = [batch_size, seq_len, num_kv_heads * head_size] + key_cache: shape = [num_blocks, num_kv_heads, head_size/x, + block_size, x] + value_cache: shape = [num_blocks, num_kv_heads, head_size, + block_size] + input_metadata: metadata for the inputs. + Returns: + shape = [batch_size, seq_len, num_heads * head_size] + """ + batch_size, seq_len, hidden_size = query.shape + # Reshape the query, key, and value tensors. 
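As a rough sanity check on the cache layouts documented in the docstrings above, the per-layer KV-cache footprint follows directly from those shapes. The numbers below are illustrative only, and `x` is assumed to be 16 bytes divided by the element size, matching the `head_size/x ... x` split in the key-cache shape.

# Rough per-layer KV-cache sizing for the documented cache layouts.
num_blocks, block_size = 1024, 16
num_kv_heads, head_size = 8, 128
bytes_per_elem = 2                        # fp16
x = 16 // bytes_per_elem                  # assumed vectorization width
key_cache_elems = num_blocks * num_kv_heads * (head_size // x) * block_size * x
value_cache_elems = num_blocks * num_kv_heads * head_size * block_size
total_gib = (key_cache_elems + value_cache_elems) * bytes_per_elem / 2**30
print(f"{total_gib:.3f} GiB per layer")   # ~0.06 GiB with these numbers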
+ query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + + # Reshape the keys and values and store them in the cache. + # If key_cache and value_cache are not provided, the new key and value + # vectors will not be cached. This happens during the initial memory + # profiling run. + if key_cache is not None and value_cache is not None: + PagedAttentionImpl.reshape_and_cache(key, value, key_cache, + value_cache, input_metadata) + + if input_metadata.is_prompt: + # Prompt run. + if (key_cache is None or value_cache is None + or input_metadata.block_tables.numel() == 0): + # normal attention + if self.num_kv_heads != self.num_heads: + # As of Nov 2023, xformers only supports MHA. For MQA/GQA, + # project the key and value tensors to the desired number of + # heads. + # TODO(woosuk): Use MQA/GQA kernels for higher performance. + query = query.view(query.shape[0], self.num_kv_heads, + self.num_queries_per_kv, + query.shape[-1]) + key = key[:, :, + None, :].expand(key.shape[0], self.num_kv_heads, + self.num_queries_per_kv, + key.shape[-1]) + value = value[:, :, + None, :].expand(value.shape[0], + self.num_kv_heads, + self.num_queries_per_kv, + value.shape[-1]) + + # Set attention bias if not provided. This typically happens at + # the very attention layer of every iteration. + # FIXME(woosuk): This is a hack. + if input_metadata.attn_bias is None: + if self.alibi_slopes is None: + attn_bias = BlockDiagonalCausalMask.from_seqlens( + [seq_len] * batch_size) + if self.sliding_window is not None: + attn_bias = attn_bias.make_local_attention( + self.sliding_window) + input_metadata.attn_bias = attn_bias + else: + input_metadata.attn_bias = _make_alibi_bias( + self.alibi_slopes, self.num_kv_heads, batch_size, + seq_len, query.dtype) + + if self.use_ref_attention: + output = _ref_masked_attention( + query, + key, + value, + self.num_heads, + self.num_kv_heads, + self.head_size, + self.scale, + ) + # Using view got RuntimeError: view size is not compatible + # with input tensor's size and stride (at least one + # dimension spans across two contiguous subspaces). + # Use reshape instead. + return output.reshape(batch_size, seq_len, hidden_size) + + # TODO(woosuk): Too many view operations. Let's try to reduce + # them in the future for code readability. + if self.alibi_slopes is None: + query = query.unsqueeze(0) + key = key.unsqueeze(0) + value = value.unsqueeze(0) + else: + query = query.unflatten(0, (batch_size, seq_len)) + key = key.unflatten(0, (batch_size, seq_len)) + value = value.unflatten(0, (batch_size, seq_len)) + + out = xops.memory_efficient_attention_forward( + query, + key, + value, + attn_bias=input_metadata.attn_bias, + p=0.0, + scale=self.scale, + op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if + (is_hip()) else None, + ) + output = out.view_as(query) + + else: + # prefix-enabled attention + output = PagedAttentionImpl.forward_prefix( + query, + key, + value, + key_cache, + value_cache, + input_metadata, + self.alibi_slopes, + ) + else: + # Decoding run. + output = PagedAttentionImpl.forward_decode( + query, + key_cache, + value_cache, + input_metadata, + self.num_kv_heads, + self.scale, + self.alibi_slopes, + ) + + # Reshape the output tensor. 
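The MQA/GQA handling above expands each key/value head so xformers sees matching query and key/value head counts. A small CPU-only sketch of that expansion, with illustrative shapes:

import torch

# GQA -> MHA expansion as in the xformers path above: each KV head is repeated
# num_queries_per_kv times so query and key/value head counts line up.
num_tokens, num_heads, num_kv_heads, head_size = 3, 8, 2, 4
num_queries_per_kv = num_heads // num_kv_heads

query = torch.randn(num_tokens, num_heads, head_size)
key = torch.randn(num_tokens, num_kv_heads, head_size)

query = query.view(num_tokens, num_kv_heads, num_queries_per_kv, head_size)
key = key[:, :, None, :].expand(num_tokens, num_kv_heads, num_queries_per_kv,
                                head_size)
print(query.shape, key.shape)   # both torch.Size([3, 2, 4, 4])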
+ return output.view(batch_size, seq_len, hidden_size) + + +def _make_alibi_bias( + alibi_slopes: torch.Tensor, + num_kv_heads: int, + batch_size: int, + seq_len: int, + dtype: torch.dtype, +) -> LowerTriangularMaskWithTensorBias: + bias = torch.arange(seq_len, dtype=dtype) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(prompt_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. + bias = bias[None, :] - bias[:, None] + + # When using custom attention bias, xformers requires the bias to + # be sliced from a tensor whose length is a multiple of 8. + padded_len = (seq_len + 7) // 8 * 8 + num_heads = alibi_slopes.shape[0] + bias = torch.empty( + batch_size, + num_heads, + seq_len, + padded_len, + device=alibi_slopes.device, + dtype=dtype, + )[:, :, :, :seq_len].copy_(bias) + bias.mul_(alibi_slopes[:, None, None]) + if num_heads != num_kv_heads: + bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) + attn_bias = LowerTriangularMaskWithTensorBias(bias) + return attn_bias + + +def _check_use_ref_attention() -> bool: + if not is_hip(): + return False + # For ROCm, check whether flash attention is installed or not. + # if not, use_ref_attention needs to be True + return importlib.util.find_spec("flash_attn") is None + + +def _ref_masked_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + num_heads: int, + num_kv_heads: int, + head_size: int, + scale: float, +) -> torch.Tensor: + query = query.view(-1, num_heads, head_size) + key = key.view(-1, num_kv_heads, head_size) + value = value.view(-1, num_kv_heads, head_size) + + seq_len, _, _ = query.shape + attn_mask = torch.triu(torch.ones(seq_len, + seq_len, + dtype=query.dtype, + device=query.device), + diagonal=1) + attn_mask = attn_mask * torch.finfo(query.dtype).min + + attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float() + attn_weights = attn_weights + attn_mask.float() + attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) + out = torch.einsum("hqk,khd->qhd", attn_weights, value) + return out diff --git a/vllm/model_executor/layers/attention/ops/__init__.py b/vllm/model_executor/layers/attention/ops/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/model_executor/layers/attention/ops/paged_attn.py b/vllm/model_executor/layers/attention/ops/paged_attn.py new file mode 100644 index 0000000000000..c5a9618c2395b --- /dev/null +++ b/vllm/model_executor/layers/attention/ops/paged_attn.py @@ -0,0 +1,138 @@ +from typing import List, Optional + +import torch + +from vllm._C import cache_ops +from vllm._C import ops +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.attention.ops.prefix_prefill import ( + context_attention_fwd) + +# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. 
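The `_make_alibi_bias` helper above builds a per-head bias from relative positions scaled by the ALiBi slopes, plus padding so xformers can slice from a length that is a multiple of 8. A minimal sketch of the core computation, leaving out the padding step; the slope values are arbitrary:

import torch

# Relative-position bias as in _make_alibi_bias, without the pad-to-8 step.
seq_len = 4
slopes = torch.tensor([0.5, 0.25])                # two heads
pos = torch.arange(seq_len, dtype=torch.float32)
bias = pos[None, :] - pos[:, None]                # (seq_len, seq_len), j - i
bias = bias[None, :, :] * slopes[:, None, None]   # (num_heads, seq_len, seq_len)
print(bias[0])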
+_PARTITION_SIZE = 512 + + +class PagedAttentionImpl: + + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [64, 80, 96, 112, 128, 256] + + @staticmethod + def reshape_and_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + input_metadata: InputMetadata, + ) -> None: + cache_ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + input_metadata.slot_mapping.flatten(), + input_metadata.kv_cache_dtype, + ) + + @staticmethod + def forward_decode( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + input_metadata: InputMetadata, + num_kv_heads: int, + scale: float, + alibi_slopes: Optional[torch.Tensor], + ) -> torch.Tensor: + output = torch.empty_like(query) + + block_size = value_cache.shape[3] + num_seqs, num_heads, head_size = query.shape + max_num_partitions = ( + (input_metadata.max_context_len + _PARTITION_SIZE - 1) // + _PARTITION_SIZE) + # NOTE(woosuk): We use a simple heuristic to decide whether to use + # PagedAttention V1 or V2. If the number of partitions is 1, we use + # V1 to avoid the overhead of reduction. Also, if the number of + # sequences or heads is large, we use V1 since there is enough work + # to parallelize. + # TODO(woosuk): Tune this heuristic. + # For context len > 8192, use V2 kernel to avoid shared memory shortage. + use_v1 = input_metadata.max_context_len <= 8192 and ( + max_num_partitions == 1 or num_seqs * num_heads > 512) + if use_v1: + # Run PagedAttention V1. + ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + input_metadata.block_tables, + input_metadata.context_lens, + block_size, + input_metadata.max_context_len, + alibi_slopes, + input_metadata.kv_cache_dtype, + ) + else: + # Run PagedAttention V2. 
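The heuristic above picks PagedAttention V1 when a single partition suffices or there is already enough parallel work across sequences and heads, and forces V2 for very long contexts. Restated as a standalone check; the helper name and sample values are illustrative.

# The V1/V2 dispatch above as a standalone check.
_PARTITION_SIZE = 512

def use_paged_attention_v1(max_context_len: int, num_seqs: int,
                           num_heads: int) -> bool:
    max_num_partitions = ((max_context_len + _PARTITION_SIZE - 1)
                          // _PARTITION_SIZE)
    return max_context_len <= 8192 and (max_num_partitions == 1
                                        or num_seqs * num_heads > 512)

print(use_paged_attention_v1(400, num_seqs=8, num_heads=32))    # True
print(use_paged_attention_v1(9000, num_seqs=8, num_heads=32))   # False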
+ assert _PARTITION_SIZE % block_size == 0 + tmp_output = torch.empty( + size=(num_seqs, num_heads, max_num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, max_num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + input_metadata.block_tables, + input_metadata.context_lens, + block_size, + input_metadata.max_context_len, + alibi_slopes, + input_metadata.kv_cache_dtype, + ) + return output + + @staticmethod + def forward_prefix( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + input_metadata: InputMetadata, + alibi_slopes: Optional[torch.Tensor], + ) -> torch.Tensor: + output = torch.empty_like(query) + context_attention_fwd( + query, + key, + value, + output, + key_cache, + value_cache, + input_metadata.block_tables, # [BS, max_block_per_request] + input_metadata.start_loc, + input_metadata.prompt_lens, + input_metadata.context_lens, + input_metadata.max_seq_len, + alibi_slopes, + ) + return output diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/attention/ops/prefix_prefill.py similarity index 100% rename from vllm/model_executor/layers/triton_kernel/prefix_prefill.py rename to vllm/model_executor/layers/attention/ops/prefix_prefill.py diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 08e3c2d5b706e..3e6dd0dfe2eb3 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -30,9 +30,10 @@ def fused_moe_kernel( K, EM, num_valid_tokens, - # The stride variables represent how much to increase the ptr by when moving by 1 - # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr` - # by to get the element one row down (A has M rows). + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). stride_am, stride_ak, stride_be, @@ -50,17 +51,30 @@ def fused_moe_kernel( compute_type: tl.constexpr, ): """ - Implements the fused computation for a Mixture of Experts (MOE) using token and expert matrices. + Implements the fused computation for a Mixture of Experts (MOE) using + token and expert matrices. Key Parameters: - - A: The input tensor representing tokens with shape (*, K), where '*' can be any shape representing batches and K is the feature dimension of each token. - - B: The stacked MOE weight tensor with shape (E, N, K), where E is the number of experts, K is the input feature dimension, and N is the output feature dimension. - - C: The output cache tensor with shape (M, topk, N), where M is the total number of tokens post padding, topk is the number of times each token is repeated, - and N is the output feature dimension. - - sorted_token_ids: A tensor containing the sorted indices of tokens, repeated topk times and arranged by the expert index they are assigned to. - - expert_ids: A tensor containing the indices of the expert for each block. It determines which expert matrix from B should be used for each block in A. 
- This kernel performs the multiplication of a token by its corresponding expert matrix as determined by `expert_ids`. The sorting of `sorted_token_ids` - by expert index and padding ensures divisibility by BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix multiplication across different blocks processed by the same expert. + - A: The input tensor representing tokens with shape (*, K), where '*' can + be any shape representing batches and K is the feature dimension of + each token. + - B: The stacked MOE weight tensor with shape (E, N, K), where E is + the number of experts, K is the input feature dimension, and N is + the output feature dimension. + - C: The output cache tensor with shape (M, topk, N), where M is the + total number of tokens post padding, topk is the number of times + each token is repeated, and N is the output feature dimension. + - sorted_token_ids: A tensor containing the sorted indices of tokens, + repeated topk times and arranged by the expert index they are + assigned to. + - expert_ids: A tensor containing the indices of the expert for each + block. It determines which expert matrix from B should be used for + each block in A. + This kernel performs the multiplication of a token by its corresponding + expert matrix as determined by `expert_ids`. The sorting of + `sorted_token_ids` by expert index and padding ensures divisibility by + BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix + multiplication across different blocks processed by the same expert. """ # ----------------------------------------------------------- # Map program ids `pid` to the block of C it should compute. @@ -105,7 +119,8 @@ def fused_moe_kernel( accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): - # Load the next block of A and B, generate a mask by checking the K dimension. + # Load the next block of A and B, generate a mask by checking the + # K dimension. a = tl.load(a_ptrs, mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K), @@ -139,30 +154,41 @@ def moe_align_block_size( topk_ids: torch.Tensor, block_size: int, num_experts: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ - Aligns the token distribution across experts to be compatible with block size for matrix multiplication. + Aligns the token distribution across experts to be compatible with block + size for matrix multiplication. Parameters: - - topk_ids: A tensor of shape [total_tokens, top_k] representing the top-k expert indices for each token. + - topk_ids: A tensor of shape [total_tokens, top_k] representing the + top-k expert indices for each token. - block_size: The block size used in block matrix multiplication. - num_experts: The total number of experts. Returns: - - sorted_token_ids: A tensor containing the sorted token indices according to their allocated expert. + - sorted_token_ids: A tensor containing the sorted token indices according + to their allocated expert. - expert_ids: A tensor indicating the assigned expert index for each block. - - num_tokens_post_padded: The total number of tokens after padding, ensuring divisibility by block_size. + - num_tokens_post_padded: The total number of tokens after padding, + ensuring divisibility by block_size. - This function pads the number of tokens that each expert needs to process so that it is divisible by block_size. - Padding ensures that during block matrix multiplication, the dimensions align correctly. 
+ This function pads the number of tokens that each expert needs to process + so that it is divisible by block_size. + Padding ensures that during block matrix multiplication, the dimensions + align correctly. Example: - Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], block_size = 4, and num_experts = 4: - - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts, with each expert needing to process 3 tokens. + Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], + block_size = 4, and num_experts = 4: + - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts, + with each expert needing to process 3 tokens. - As block_size is 4, we pad 1 token for each expert. - First, flatten topk_ids to [2, 3, 4, 1, 2, 4, 1, 3, 4, 1, 2, 3]. - Then append padding tokens [12, 12, 12, 12] for each block. - - After sorting by expert index, we obtain token_ids [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]. - Tokens 12 are non-existent (padding) and are ignored in the subsequent matrix multiplication. - - The padding ensures that the total number of tokens is now divisible by block_size for proper block matrix operations. + - After sorting by expert index, we obtain token_ids + [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]. + Tokens 12 are non-existent (padding) and are ignored in + the subsequent matrix multiplication. + - The padding ensures that the total number of tokens is now divisible + by block_size for proper block matrix operations. """ sorted_ids = torch.empty( (topk_ids.numel() + num_experts * (block_size - 1), ), @@ -224,13 +250,14 @@ def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: """ Return optimized configurations for the fused MoE kernel. - The return value will be a dictionary that maps an irregular grid of batch sizes - to configurations of the fused_moe kernel. To evaluate the kernel on a given batch - size bs, the closest batch size in the grid should be picked and the associated - configuration chosen to invoke the kernel. + The return value will be a dictionary that maps an irregular grid of + batch sizes to configurations of the fused_moe kernel. To evaluate the + kernel on a given batch size bs, the closest batch size in the grid should + be picked and the associated configuration chosen to invoke the kernel. """ - # First look up if an optimized configuration is available in the configs directory + # First look up if an optimized configuration is available in the configs + # directory device_name = torch.cuda.get_device_name().replace(" ", "_") config_file_path = os.path.join( @@ -243,7 +270,8 @@ def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: # If a configuration has been found, return it return {int(key): val for key, val in json.load(f).items()} - # If no optimized configuration is available, we will use the default configuration + # If no optimized configuration is available, we will use the default + # configuration return None @@ -258,18 +286,22 @@ def fused_moe( override_config: Optional[Dict[str, Any]] = None, ) -> torch.Tensor: """ - This function computes a Mixture of Experts (MoE) layer using two sets of weights, w1 and w2, and top-k gating mechanism. - + This function computes a Mixture of Experts (MoE) layer using two sets of + weights, w1 and w2, and top-k gating mechanism. + Parameters: - hidden_states (torch.Tensor): The input tensor to the MoE layer. - w1 (torch.Tensor): The first set of expert weights. 
- w2 (torch.Tensor): The second set of expert weights. - - gating_output (torch.Tensor): The output of the gating operation (before softmax). + - gating_output (torch.Tensor): The output of the gating operation + (before softmax). - topk (int): The number of top-k experts to select. - renormalize (bool): If True, renormalize the top-k weights to sum to 1. - - inplace (bool): If True, perform the operation in-place. Defaults to False. - - override_config (Optional[Dict[str, Any]]): Optional override for the kernel configuration. - + - inplace (bool): If True, perform the operation in-place. + Defaults to False. + - override_config (Optional[Dict[str, Any]]): Optional override + for the kernel configuration. + Returns: - torch.Tensor: The output tensor after applying the MoE layer. """ @@ -325,7 +357,8 @@ def fused_moe( configs = get_moe_configs(E, w2.shape[2]) if configs: - # If an optimal configuration map has been found, look up the optimal config + # If an optimal configuration map has been found, look up the + # optimal config config = configs[min(configs.keys(), key=lambda x: abs(x - M))] else: # Else use the default config diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 45f6ea8438043..8d26271bd60a5 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -293,7 +293,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -315,7 +316,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -431,7 +433,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -460,7 +463,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. 
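Back in the fused-MoE hunk above, the `moe_align_block_size` docstring walks through an example with `topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]]` and `block_size = 4`. That example can be reproduced with a short pure-Python sketch; the `align` helper is hypothetical, and the real function returns tensors plus the padded token count, with 0-indexed experts.

# Reproducing the moe_align_block_size docstring example in pure Python.
def align(topk_ids, block_size):
    flat = [e for row in topk_ids for e in row]
    sentinel = len(flat)                  # id of the non-existent pad token
    sorted_ids, expert_ids = [], []
    for expert in sorted(set(flat)):
        idxs = [i for i, e in enumerate(flat) if e == expert]
        idxs += [sentinel] * ((-len(idxs)) % block_size)
        sorted_ids += idxs
        expert_ids += [expert] * (len(idxs) // block_size)
    return sorted_ids, expert_ids

print(align([[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], block_size=4))
# ([3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12], [1, 2, 3, 4])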
shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index b05790b339243..1891ed4c2b8ee 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -2,7 +2,8 @@ from typing import Type -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 3e1c814dd233c..2caef5f1ebf50 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -6,7 +6,8 @@ from vllm._C import ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) class AWQConfig(QuantizationConfig): @@ -50,7 +51,8 @@ def get_min_capability(self) -> int: def get_config_filenames() -> List[str]: return [ "quant_config.json", # E.g., casperhansen/vicuna-7b-v1.5-awq - "quantize_config.json", # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq + # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq + "quantize_config.json", ] @classmethod diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 2e6aabb232673..bb69c7235a133 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -31,8 +31,8 @@ def __init__( self.pack_factor = Fraction(32, self.weight_bits) if self.weight_bits not in [2, 3, 4, 8]: raise ValueError( - "Currently, only 2/3/4/8-bit weight quantization is supported for " - f"GPTQ, but got {self.weight_bits} bits.") + "Currently, only 2/3/4/8-bit weight quantization is " + f"supported for GPTQ, but got {self.weight_bits} bits.") def __repr__(self) -> str: return (f"GPTQConfig(weight_bits={self.weight_bits}, " @@ -101,7 +101,8 @@ def create_weights( "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " "tensor parallel size.") - if output_size_per_partition % self.quant_config.pack_factor.numerator != 0: + if (output_size_per_partition % self.quant_config.pack_factor.numerator + != 0): raise ValueError( "The output size is not aligned with the quantized " "weight shape. 
This can be caused by too large " @@ -114,7 +115,8 @@ def create_weights( exllama_state = ExllamaState.UNINITIALIZED scale_and_zero_size = input_size // group_size scale_and_zero_input_dim = None - if input_size != input_size_per_partition and self.quant_config.group_size != -1: + if (input_size != input_size_per_partition + and self.quant_config.group_size != -1): # For act-order models, we cannot use Exllama for row parallel layer if self.quant_config.desc_act: exllama_state = ExllamaState.UNUSED diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 96800759b4a25..0c4f20d9e3a58 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -5,7 +5,8 @@ from vllm._C import ops from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) class MarlinConfig(QuantizationConfig): @@ -22,8 +23,9 @@ def __init__( self.group_size = group_size if self.group_size != 128 and self.group_size != -1: raise ValueError( - "Currently, only group size 128 and -1 (channelwise) is supported for " - f"Marlin, but got group_size of {self.group_size}") + "Currently, only group size 128 and -1 (channelwise) " + "is supported for Marlin, but got group_size of " + f"{self.group_size}") # 4 Bits packed into 32 bit datatype. self.pack_factor = 32 // 4 @@ -37,7 +39,8 @@ def __init__( # Min in_features dim self.min_k_threads = 128 - # Max parallel problems to solve at once (improves large batch performance) + # Max parallel problems to solve at once (improves large + # batch performance) self.max_parallel = 16 # Permutation length used by the marlin kernels. @@ -52,9 +55,10 @@ def get_name(cls) -> str: @classmethod def get_supported_act_dtypes(cls) -> List[torch.dtype]: - return [torch.float16] + return [torch.half] @classmethod + # Need to figure it out def get_min_capability(cls) -> int: return 80 @@ -92,6 +96,8 @@ def create_weights( output_size: int, params_dtype: torch.dtype, ) -> Dict[str, Any]: + del output_size # Unused. + if params_dtype != torch.float16: raise ValueError( f"The params dtype must be float16, but got {params_dtype}") @@ -99,22 +105,26 @@ def create_weights( # Validate output_size_per_partition if output_size_per_partition % self.quant_config.min_n_threads != 0: raise ValueError( - f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by min_n_threads = {self.quant_config.min_n_threads}." - ) + f"Weight output_size_per_partition = " + f"{output_size_per_partition} is not divisible by " + f"min_n_threads = {self.quant_config.min_n_threads}.") if output_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( - f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by pack_factor = {self.quant_config.pack_factor}." - ) + f"Weight output_size_per_partition = " + f"{output_size_per_partition} is not divisible by " + f"pack_factor = {self.quant_config.pack_factor}.") # Validate input_size_per_partition if input_size_per_partition % self.quant_config.min_k_threads != 0: raise ValueError( - f"Weight input_size_per_partition = {input_size_per_partition} is not divisible by min_k_threads = {self.quant_config.min_k_threads}." 
- ) - if self.quant_config.group_size != -1 and input_size_per_partition % self.quant_config.group_size != 0: - raise ValueError( - f"Weight input_size_per_partition = f{input_size_per_partition} is not divisible by group_size = {self.quant_config.group_size}." - ) + f"Weight input_size_per_partition = " + f"{input_size_per_partition} is not divisible by " + f"min_k_threads = {self.quant_config.min_k_threads}.") + if (self.quant_config.group_size != -1 and + input_size_per_partition % self.quant_config.group_size != 0): + raise ValueError(f"Weight input_size_per_partition = " + f"{input_size_per_partition} is not divisible by " + f"group_size = {self.quant_config.group_size}.") # Check that we have at least 4 tiles horizontally in the shard num_tiles_per_perm = self.quant_config.perm_len // ( @@ -146,7 +156,9 @@ def create_weights( ) # Determine if channelwise or not - input_groups = 1 if self.quant_config.group_size == -1 else input_size_per_partition // self.quant_config.group_size + input_groups = (1 if self.quant_config.group_size == -1 else + input_size_per_partition // + self.quant_config.group_size) scales = Parameter( torch.empty( diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index 9244e88552756..ed25455e6ec1f 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -6,7 +6,8 @@ from vllm._C import ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.utils import is_hip diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 3e1cfc783b8ef..5643454060251 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -21,8 +21,6 @@ def __init__(self, strict_mode: bool = False): nontrivial latency. 
""" super().__init__() - self.probs_dtype = torch.float32 - self.token_id_dtype = torch.int64 self._strict_mode = strict_mode # NOTE: A "bonus token" is accepted iff all proposal tokens are @@ -44,6 +42,14 @@ def init_gpu_tensors(self, rank: int) -> None: dtype=torch.long, device=device) + @property + def probs_dtype(self): + return torch.float32 + + @property + def token_id_dtype(self): + return torch.int64 + def forward( self, target_probs: torch.Tensor, diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index b48dde0318d09..4377b845df628 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -6,7 +6,8 @@ from vllm.model_executor.parallel_utils.communication_op import ( tensor_model_parallel_gather) -from vllm.model_executor.sampling_metadata import SamplingMetadata, SamplingTensors +from vllm.model_executor.sampling_metadata import (SamplingMetadata, + SamplingTensors) from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import (Logprob, PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceData, SequenceGroupOutput, @@ -516,7 +517,6 @@ def _get_logprobs( if (i < sampling_metadata.num_prompts and sampling_params.prompt_logprobs is not None): num_logprobs = sampling_params.prompt_logprobs - prompt_len = sampling_metadata.prompt_lens[i] prompt_tokens = sampling_metadata.seq_data[ seq_ids[0]].prompt_token_ids group_prompt_logprobs: PromptLogprobs = [None] @@ -588,4 +588,4 @@ def _build_sampler_output( SequenceOutput(seq_ids[parent_id], next_token_id, logprobs)) sampler_output.append( SequenceGroupOutput(seq_outputs, group_prompt_logprobs)) - return sampler_output + return SamplerOutput(outputs=sampler_output) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 550dec6487f9e..cbf472750e294 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -27,7 +27,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -151,10 +151,10 @@ def __init__( alibi_slopes = alibi_slopes[head_start:head_end].tolist() scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scaling, - alibi_slopes=alibi_slopes) + self.attn = Attention(self.num_heads, + self.head_dim, + scaling, + alibi_slopes=alibi_slopes) else: self.rotary_emb = get_rope( self.head_dim, @@ -163,8 +163,7 @@ def __init__( base=self.rope_theta, ) self.scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, self.head_dim, - self.scaling) + self.attn = Attention(self.num_heads, self.head_dim, self.scaling) def forward( self, @@ -334,7 +333,8 @@ def load_weights(self, if "rotary_emb.inv_freq" in name: continue if name == "lm_head.weight": - # Unlike Baichuan, Baichuan2 normalizes the head weights. Refer to: + # Unlike Baichuan, Baichuan2 normalizes the head weights. + # Refer to: # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/84603cde5ebffb6084e476cfaeceaf0b8b91fe54/modeling_baichuan.py#L508 # Distinguish between Baichuan and Baichuan2 by checking the # vocab size. 
This is suggested by diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 4adfb6b78102f..0548b2b140b1b 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -107,10 +107,10 @@ def __init__( alibi_slopes = alibi_slopes[head_start:head_end].tolist() scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scaling, - alibi_slopes=alibi_slopes) + self.attn = Attention(self.num_heads, + self.head_dim, + scaling, + alibi_slopes=alibi_slopes) def forward( self, diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index dca8d724f976b..1c5dcfacaff2b 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -10,7 +10,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -87,7 +87,7 @@ def __init__( base=10000 * rope_ratio, is_neox_style=False, ) - self.attn = PagedAttention( + self.attn = Attention( self.num_heads, self.head_dim, self.scaling, diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 6dba952736921..13c080cb02774 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -29,7 +29,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, @@ -119,7 +119,8 @@ def __init__( linear_method=None) if config.n_shared_experts is not None: - intermediate_size = config.moe_intermediate_size * config.n_shared_experts + intermediate_size = (config.moe_intermediate_size * + config.n_shared_experts) self.shared_experts = DeepseekMLP( hidden_size=config.hidden_size, intermediate_size=intermediate_size, @@ -229,10 +230,10 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) def forward( self, @@ -273,8 +274,9 @@ def __init__( max_position_embeddings=max_position_embeddings, linear_method=linear_method, ) - if (config.n_routed_experts is not None and \ - layer_idx >= config.first_k_dense_replace and layer_idx % config.moe_layer_freq == 0): + if (config.n_routed_experts is not None + and layer_idx >= config.first_k_dense_replace + and layer_idx % config.moe_layer_freq == 0): self.mlp = DeepseekMoE(config=config, 
linear_method=linear_method) else: self.mlp = DeepseekMLP( diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 2b5e022312e3b..3c148be5b10f4 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -28,7 +28,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -150,10 +150,10 @@ def __init__( max_position=max_position_embeddings, base=rope_theta, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.inv_norm_factor, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.inv_norm_factor, + num_kv_heads=self.num_kv_heads) elif self.use_alibi: tp_rank = get_tensor_model_parallel_rank() head_start = tp_rank * self.num_heads @@ -161,16 +161,16 @@ def __init__( alibi_slopes = (_get_alibi_slopes(self.total_num_heads) * self.inv_norm_factor) alibi_slopes = alibi_slopes[head_start:head_end].tolist() - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.inv_norm_factor, - num_kv_heads=self.num_kv_heads, - alibi_slopes=alibi_slopes) + self.attn = Attention(self.num_heads, + self.head_dim, + self.inv_norm_factor, + num_kv_heads=self.num_kv_heads, + alibi_slopes=alibi_slopes) else: - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.inv_norm_factor, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + scale=self.inv_norm_factor, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 03948132d32c3..386a36cf492d6 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -23,7 +23,7 @@ from vllm.config import LoRAConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import GeluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -123,10 +123,10 @@ def __init__(self, base=self.rope_theta, is_neox_style=True, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) def forward( self, @@ -325,11 +325,17 @@ def load_weights(self, if shard_name not in name: continue name = name.replace(shard_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue # GemmaRMSNorm is different from Llama's in that it multiplies # (1 + weight) to the output, instead of just weight. 
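The comment above notes that GemmaRMSNorm scales by `(1 + weight)` rather than `weight`, which is why the loader treats norm weights specially. A small sketch of the consequence: a stored Gemma norm weight `w` behaves like a Llama-style RMSNorm weight of `1 + w`. The `rms_norm` helper below is illustrative, not vLLM's implementation.

import torch

def rms_norm(x, weight, eps=1e-6, gemma_style=False):
    # Normalize, then scale by (1 + weight) for Gemma or weight for Llama.
    x = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    return x * (1 + weight) if gemma_style else x * weight

x = torch.randn(2, 8)
w = torch.zeros(8)   # freshly initialized Gemma norm weight
print(torch.allclose(rms_norm(x, w, gemma_style=True), rms_norm(x, w + 1)))
# True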
if "norm.weight" in name: diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 661da0fe0434e..3f7b21e5a4133 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -73,9 +73,7 @@ def __init__( bias=True, linear_method=linear_method, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scale) + self.attn = Attention(self.num_heads, self.head_dim, scale=self.scale) def forward( self, diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index ef4c1d4143c88..5c30d47d93e36 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -26,7 +26,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -85,10 +85,10 @@ def __init__( bias=True, linear_method=linear_method, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scale, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + scale=self.scale, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 5bab30d9d442e..93dce7b67a7a5 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -24,7 +24,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -86,7 +86,7 @@ def __init__( base=rope_theta, is_neox_style=False, ) - self.attn = PagedAttention(self.num_heads, self.head_size, scaling) + self.attn = Attention(self.num_heads, self.head_size, scaling) def forward( self, @@ -143,7 +143,8 @@ def __init__( linear_method: Optional[LinearMethodBase] = None, ): super().__init__() - inner_dim = 4 * config.n_embd if config.n_inner is None else config.n_inner + inner_dim = (4 * config.n_embd + if config.n_inner is None else config.n_inner) self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.attn = GPTJAttention(config, linear_method) self.mlp = GPTJMLP(inner_dim, config, linear_method) diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 8f7e1063e0c1d..98107350e60b9 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -24,7 +24,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from 
vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -87,7 +87,7 @@ def __init__( max_position=max_position_embeddings, base=rope_theta, ) - self.attn = PagedAttention(self.num_heads, self.head_size, scaling) + self.attn = Attention(self.num_heads, self.head_size, scaling) def forward( self, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index ebf1d8a89a022..7b2215ef4bda5 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -7,7 +7,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -114,10 +114,10 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) def forward( self, @@ -305,7 +305,8 @@ def load_weights(self, param = params_dict[name] if "wqkv" in name: config = self.config - kv_groups = config.num_attention_heads // config.num_key_value_heads + kv_groups = (config.num_attention_heads // + config.num_key_value_heads) head_dim = config.hidden_size // config.num_attention_heads loaded_weight = loaded_weight.view(-1, 2 + kv_groups, head_dim, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d35887cc0f6a3..4c163dfdab537 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -30,7 +30,7 @@ from vllm.config import LoRAConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -139,11 +139,11 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=sliding_window) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=sliding_window) def forward( self, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 0100624a44d78..d47834e519697 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -29,7 +29,7 @@ from vllm.config import LoRAConfig from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, @@ -197,7 +197,7 @@ def __init__(self, base=int(self.rope_theta), is_neox_style=True, ) - self.attn = PagedAttention( + self.attn = Attention( self.num_heads, self.head_dim, 
self.scaling, diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index a8dadce24aa1d..25c7f1978c0dc 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -32,7 +32,7 @@ from transformers import MixtralConfig from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, ReplicatedLinear, @@ -214,7 +214,7 @@ def __init__(self, base=int(self.rope_theta), is_neox_style=True, ) - self.attn = PagedAttention( + self.attn = Attention( self.num_heads, self.head_dim, self.scaling, diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 22a876e2ef691..16ecac3d0529a 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -8,7 +8,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -105,11 +105,11 @@ def __init__( self.head_dim = self.d_model // self.total_num_heads scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scaling, - alibi_slopes=alibi_slopes, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + scaling, + alibi_slopes=alibi_slopes, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 9d563039208c8..2b0a420e82faf 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -43,7 +43,7 @@ from torch import nn from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, LinearMethodBase, @@ -52,7 +52,8 @@ ) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_world_size, ) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -81,7 +82,8 @@ def output_multiplier(self) -> float: class OlmoAttention(nn.Module): """ - This is the attention block where the output is computed as ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + This is the attention block where the output is computed as + ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). 
""" @@ -94,11 +96,12 @@ def __init__( self.config = config self.hidden_size = config.d_model assert config.d_model % config.n_heads == 0 - tensor_model_parallel_world_size = get_tensor_model_parallel_world_size( - ) + tensor_model_parallel_world_size = ( + get_tensor_model_parallel_world_size()) self.total_num_heads = self.config.n_heads assert self.total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = self.total_num_heads // tensor_model_parallel_world_size + self.num_heads = (self.total_num_heads // + tensor_model_parallel_world_size) self.head_dim = self.hidden_size // self.total_num_heads # Layer norms. @@ -126,9 +129,9 @@ def __init__( base=rope_theta, ) self.scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scaling) + self.attn = Attention(self.num_heads, + self.head_dim, + scale=self.scaling) # Attention output projection. self.attn_out = RowParallelLinear( @@ -158,7 +161,8 @@ def forward( class OlmoMLP(nn.Module): """ - This is the MLP block where the output is computed as ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + This is the MLP block where the output is computed as + ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). """ @@ -217,7 +221,8 @@ def forward( class OlmoBlock(nn.Module): """ - This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))`` + This is a typical transformer block where the output is + computed as ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). """ diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 393b2dcabcd5a..782f43ce265bd 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -89,9 +89,9 @@ def __init__( bias=bias, linear_method=linear_method, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scaling) + self.attn = Attention(self.num_heads, + self.head_dim, + scale=self.scaling) def forward( self, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 0b067d4fc8802..6039b1cdc3534 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -12,7 +12,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, QKVParallelLinear, @@ -118,10 +118,10 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index d143261968288..039dc7a9b7675 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -43,7 +43,7 @@ from 
vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -108,7 +108,7 @@ def __init__(self, max_position=max_position_embeddings, base=rope_theta, ) - self.attn = PagedAttention(self.num_heads, self.head_size, scaling) + self.attn = Attention(self.num_heads, self.head_size, scaling) def forward( self, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 37af84c7cd53f..d4d5a4e8bb9a5 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -12,7 +12,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -104,7 +104,7 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling) + self.attn = Attention(self.num_heads, self.head_dim, self.scaling) def forward( self, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index e823e6f8c3dbe..3e4f843e649b4 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -30,7 +30,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -46,6 +46,7 @@ from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput +from vllm.config import LoRAConfig KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -135,11 +136,11 @@ def __init__(self, max_position=max_position, base=self.rope_theta, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=self.sliding_window) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=self.sliding_window) def forward( self, @@ -169,7 +170,8 @@ def __init__( self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 1000000) - use_sliding_window = config.use_sliding_window and layer_idx < config.max_window_layers + use_sliding_window = (config.use_sliding_window + and layer_idx < config.max_window_layers) self.self_attn = Qwen2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -264,12 +266,35 @@ def forward( class Qwen2ForCausalLM(nn.Module): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + ] + embedding_modules = {} + 
embedding_padding_modules = [] def __init__( self, config: Qwen2Config, linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, ) -> None: + del lora_config super().__init__() self.config = config self.linear_method = linear_method diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 44c57e5a6d4f9..c66f327beee7a 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -1,5 +1,6 @@ # coding=utf-8 -# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. +# All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +17,8 @@ # This code is based off the following work: # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/modeling_stablelm_epoch.py # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json -"""Inference-only StabeLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights.""" +"""Inference-only StabeLM (https://github.com/Stability-AI/StableLM) +model compatible with HuggingFace weights.""" from typing import List, Optional, Tuple import torch @@ -25,7 +27,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, QKVParallelLinear, @@ -102,9 +104,9 @@ def __init__(self, self.kv_size = self.num_key_value_heads * self.head_dim self.qkv_bias = getattr(config, "use_qkv_bias", False) if (self.head_dim * self.num_heads * tp_size) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads}).") + raise ValueError(f"hidden_size must be divisible by num_heads " + f"(got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads}).") self.qkv_proj = QKVParallelLinear(self.hidden_size, self.head_dim, @@ -122,10 +124,10 @@ def __init__(self, max_position=self.config.max_position_embeddings, base=self.config.rope_theta, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_key_value_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_key_value_heads) def forward( self, @@ -192,7 +194,6 @@ def __init__(self, config: PretrainedConfig, linear_method: Optional[LinearMethodBase] = None) -> None: super().__init__() - # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 1eda07b724cae..cfbb1bdb7909e 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from 
vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -35,7 +35,8 @@ from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) -from vllm.model_executor.parallel_utils.parallel_state import get_tensor_model_parallel_world_size +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput @@ -103,7 +104,7 @@ def __init__(self, base=int(self.rope_theta), is_neox_style=True, ) - self.attn = PagedAttention( + self.attn = Attention( self.num_heads, self.head_dim, self.scaling, diff --git a/vllm/model_executor/neuron_model_loader.py b/vllm/model_executor/neuron_model_loader.py index b8d63d4ff12fc..c434b270a5562 100644 --- a/vllm/model_executor/neuron_model_loader.py +++ b/vllm/model_executor/neuron_model_loader.py @@ -34,7 +34,8 @@ def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: def get_model(model_config: ModelConfig, device_config: DeviceConfig, **kwargs) -> nn.Module: - from transformers_neuronx.config import NeuronConfig, ContinuousBatchingConfig + from transformers_neuronx.config import (NeuronConfig, + ContinuousBatchingConfig) parallel_config = kwargs.get("parallel_config") scheduler_config = kwargs.get("scheduler_config") diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py index cf805df892fdc..521b6b8a383b0 100644 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ b/vllm/model_executor/parallel_utils/communication_op.py @@ -11,7 +11,8 @@ get_tensor_model_parallel_group, is_cupy_nccl_enabled_for_all_reduce, ) -from vllm.model_executor.parallel_utils.custom_all_reduce import custom_all_reduce +from vllm.model_executor.parallel_utils.custom_all_reduce import ( + custom_all_reduce) def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: @@ -24,7 +25,7 @@ def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: and GPU topology. TLDR: always assume this function modifies its input, but use the return - value as the output. + value as the output. """ # Bypass the function if we are using only 1 GPU. 
if get_tensor_model_parallel_world_size() == 1: diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 7deb80801856e..b23f0170a6ca5 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -114,7 +114,8 @@ def from_sampling_metadata( do_penalties = True if (i < sampling_metadata.num_prompts and sampling_params.prompt_logprobs is not None): - # For tokens in the prompt that we only need to get their logprobs + # For tokens in the prompt that we only need to get + # their logprobs prompt_len = sampling_metadata.prompt_lens[i] temperatures += [temperature] * (prompt_len - 1) top_ps += [top_p] * (prompt_len - 1) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index bbb69786020f0..ea63baa5420c1 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -22,6 +22,9 @@ logger = init_logger(__name__) +_xdg_cache_home = os.getenv('XDG_CACHE_HOME', os.path.expanduser('~/.cache')) +_vllm_filelocks_path = os.path.join(_xdg_cache_home, 'vllm/locks/') + class Disabledtqdm(tqdm): @@ -30,7 +33,8 @@ def __init__(self, *args, **kwargs): def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None): - lock_dir = cache_dir if cache_dir is not None else "/tmp" + lock_dir = cache_dir if cache_dir is not None else _vllm_filelocks_path + os.makedirs(os.path.dirname(lock_dir), exist_ok=True) lock_file_name = model_name_or_path.replace("/", "-") + ".lock" lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name)) return lock diff --git a/vllm/outputs.py b/vllm/outputs.py index a6de2a5a2257b..b8173fd7a0638 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -87,32 +87,33 @@ def __init__( @classmethod def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": - # Get the top-n sequences. - n = seq_group.sampling_params.n seqs = seq_group.get_seqs() - if seq_group.sampling_params.use_beam_search: - sorting_key = lambda seq: seq.get_beam_search_score( - seq_group.sampling_params.length_penalty) + if len(seqs) == 1: + top_n_seqs = seqs else: - sorting_key = lambda seq: seq.get_cumulative_logprob() - sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) - top_n_seqs = sorted_seqs[:n] + # Get the top-n sequences. + n = seq_group.sampling_params.n + if seq_group.sampling_params.use_beam_search: + sorting_key = lambda seq: seq.get_beam_search_score( + seq_group.sampling_params.length_penalty) + else: + sorting_key = lambda seq: seq.get_cumulative_logprob() + sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) + top_n_seqs = sorted_seqs[:n] # Create the outputs. - outputs: List[CompletionOutput] = [] - for seq in top_n_seqs: - logprobs = seq.output_logprobs - if seq_group.sampling_params.logprobs is None: - # NOTE: We need to take care of this case because the sequence - # always has the logprobs of the sampled tokens even if the - # logprobs are not requested. - logprobs = None - finshed_reason = SequenceStatus.get_finished_reason(seq.status) - output = CompletionOutput(seqs.index(seq), seq.output_text, - seq.get_output_token_ids(), - seq.get_cumulative_logprob(), logprobs, - finshed_reason) - outputs.append(output) + # NOTE: We need omit logprobs here explicitly because the sequence + # always has the logprobs of the sampled tokens even if the + # logprobs are not requested. 
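As a quick illustration of the `get_lock` change in the `weight_utils.py` hunk above, here is a minimal, self-contained sketch of the new lock layout; the helper name and the example model id are illustrative, not part of the patch.

```python
import os

import filelock

# Illustrative sketch: one lock file per model under $XDG_CACHE_HOME/vllm/locks
# instead of the world-writable /tmp default.
xdg_cache_home = os.getenv("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
lock_dir = os.path.join(xdg_cache_home, "vllm/locks/")


def get_model_lock(model_name_or_path: str) -> filelock.FileLock:
    os.makedirs(lock_dir, exist_ok=True)
    lock_file_name = model_name_or_path.replace("/", "-") + ".lock"
    return filelock.FileLock(os.path.join(lock_dir, lock_file_name))


with get_model_lock("facebook/opt-125m"):
    pass  # only one local process downloads/converts these weights at a time
```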
+ include_logprobs = seq_group.sampling_params.logprobs + outputs = [ + CompletionOutput(seqs.index(seq), seq.output_text, + seq.get_output_token_ids(), + seq.get_cumulative_logprob(), + seq.output_logprobs if include_logprobs else None, + SequenceStatus.get_finished_reason(seq.status)) + for seq in top_n_seqs + ] # Every sequence in the sequence group should have the same prompt. prompt = seq_group.prompt diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 8103f3c2b24bf..4aa158878fb96 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -74,8 +74,8 @@ class SamplingParams: stop_token_ids: List of tokens that stop the generation when they are generated. The returned output will contain the stop tokens unless the stop tokens are special tokens. - include_stop_str_in_output: Whether to include the stop strings in output - text. Defaults to False. + include_stop_str_in_output: Whether to include the stop strings in + output text. Defaults to False. ignore_eos: Whether to ignore the EOS token and continue generating tokens after the EOS token is generated. max_tokens: Maximum number of tokens to generate per output sequence. diff --git a/vllm/sequence.py b/vllm/sequence.py index a110ab6b748f8..4a002edaf580f 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -2,12 +2,16 @@ import copy import enum from dataclasses import dataclass -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, TYPE_CHECKING from vllm.block import LogicalTokenBlock from vllm.sampling_params import SamplingParams from vllm.lora.request import LoRARequest +if TYPE_CHECKING: + import torch + from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics + @dataclass class Logprob: @@ -81,6 +85,8 @@ class SequenceData: Args: prompt_token_ids: The token IDs of the prompt. + output_token_ids: The token IDs of the output. Set to an empty list if + None. Attributes: prompt_token_ids: The token IDs of the prompt. @@ -91,9 +97,13 @@ class SequenceData: def __init__( self, prompt_token_ids: List[int], + output_token_ids: Optional[List[int]] = None, ) -> None: + if output_token_ids is None: + output_token_ids = [] + self.prompt_token_ids = prompt_token_ids - self.output_token_ids: List[int] = [] + self.output_token_ids = output_token_ids self.cumulative_logprob = 0.0 def append_token_id(self, token_id: int, logprob: float) -> None: @@ -117,6 +127,12 @@ def get_last_token_id(self) -> int: return self.prompt_token_ids[-1] return self.output_token_ids[-1] + def get_prompt_token_ids(self) -> int: + return self.prompt_token_ids + + def get_output_token_ids(self) -> int: + return self.output_token_ids + def __repr__(self) -> str: return (f"SequenceData(" f"prompt_token_ids={self.prompt_token_ids}, " @@ -142,11 +158,13 @@ def __init__( prompt: str, prompt_token_ids: List[int], block_size: int, + eos_token_id: Optional[int] = None, lora_request: Optional[LoRARequest] = None, ) -> None: self.seq_id = seq_id self.prompt = prompt self.block_size = block_size + self.eos_token_id = eos_token_id self.lora_request = lora_request self.data = SequenceData(prompt_token_ids) @@ -173,7 +191,8 @@ def hash_of_block(self, logical_idx: int) -> int: # TODO: The current hashing function is O(L^2). We should optimize # this in the future. 
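The `SequenceData` constructor in the `sequence.py` hunk above now accepts an optional `output_token_ids`. The sketch below (with an illustrative class name) shows the default-to-fresh-list pattern it uses, which avoids the shared-mutable-default pitfall.

```python
from typing import List, Optional


class SequenceDataSketch:
    """Toy mirror of the optional output_token_ids constructor argument."""

    def __init__(self,
                 prompt_token_ids: List[int],
                 output_token_ids: Optional[List[int]] = None) -> None:
        if output_token_ids is None:
            output_token_ids = []  # fresh list per instance, not a shared default
        self.prompt_token_ids = prompt_token_ids
        self.output_token_ids = output_token_ids


prefill_seq = SequenceDataSketch([1, 2, 3])
scored_seq = SequenceDataSketch([1, 2, 3], output_token_ids=[7, 8])
prefill_seq.output_token_ids.append(9)
assert scored_seq.output_token_ids == [7, 8]  # no cross-instance leakage
```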
num_tokens = self.num_hashed_tokens_of_block(logical_idx) - return hash(tuple(self.data.get_token_ids()[0:num_tokens])) + return hash( + (tuple(self.data.get_token_ids()[0:num_tokens]), self.lora_int_id)) def num_hashed_tokens_of_block(self, logical_idx: int): return logical_idx * self.block_size + self.block_size @@ -332,7 +351,8 @@ def maybe_set_first_token_time(self, time: float) -> None: self.metrics.first_token_time = time def maybe_set_first_scheduled_time(self, time: float) -> None: - """Sets the first scheduled time and time in queue for Request level timings.""" + """Sets the first scheduled time and time in queue for Request + level timings.""" if self.metrics.first_scheduled_time is None: self.metrics.first_scheduled_time = time self.metrics.time_in_queue = time - self.metrics.arrival_time @@ -362,12 +382,9 @@ def get_seqs( self, status: Optional[SequenceStatus] = None, ) -> List[Sequence]: - if status is None: - return list(self.seqs_dict.values()) - else: - return [ - seq for seq in self.seqs_dict.values() if seq.status == status - ] + return list(self.seqs_dict.values()) if status is None else [ + seq for seq in self.seqs_dict.values() if seq.status == status + ] def get_unfinished_seqs(self) -> List[Sequence]: return [ @@ -506,6 +523,35 @@ def __eq__(self, other: object) -> bool: and self.prompt_logprobs == other.prompt_logprobs) -# For each sequence group, we generate a list of SequenceOutput object, -# each of which contains one possible candidate for the next token. -SamplerOutput = List[SequenceGroupOutput] +@dataclass +class SamplerOutput: + """For each sequence group, we generate a list of SequenceOutput object, + each of which contains one possible candidate for the next token. + + This datastructure implements methods so it can be used like a list, but + also has optional fields for device tensors. + """ + + outputs: List[SequenceGroupOutput] + + # On-device tensor containing probabilities of each token. + sampled_token_probs: Optional["torch.Tensor"] = None + + # On-device tensor containing the sampled token ids. + sampled_token_ids: Optional["torch.Tensor"] = None + + # Spec decode metrics populated by workers. + spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None + + def __getitem__(self, idx: int): + return self.outputs[idx] + + def __setitem__(self, idx: int, value): + self.outputs[idx] = value + + def __len__(self): + return len(self.outputs) + + def __eq__(self, other: object): + return isinstance(other, + self.__class__) and self.outputs == other.outputs diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py new file mode 100644 index 0000000000000..0f698fa346010 --- /dev/null +++ b/vllm/spec_decode/batch_expansion.py @@ -0,0 +1,358 @@ +from typing import Iterator, List, Tuple, Optional, Dict +from itertools import chain, count + +import torch + +from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, SequenceData) +from vllm.worker.worker import Worker +from vllm.spec_decode.util import (nvtx_range, sampler_output_to_torch, + get_all_seq_ids, + split_batch_by_proposal_len) +from vllm.spec_decode.interfaces import (SpeculativeScorer, + SpeculativeProposals, + SpeculativeScores) + +SeqId = int +TargetSeqId = int +TokenId = int + + +class BatchExpansionTop1Scorer(SpeculativeScorer): + """Implements a speculative scorer that uses batch expansion to get + probabilities of speculative tokens according to the scoring model. 
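Since `SamplerOutput` changes above from a plain list alias to a dataclass that still behaves like a list, here is a toy mirror of that shape (names are illustrative), showing why existing indexing and `len()` callers keep working while speculative decoding can attach optional device tensors.

```python
from dataclasses import dataclass
from typing import List, Optional

import torch


@dataclass
class SamplerOutputSketch:
    """Toy mirror of the list-like SamplerOutput wrapper."""
    outputs: List[str]                                 # per-sequence-group outputs
    sampled_token_ids: Optional[torch.Tensor] = None   # optional device tensor

    def __getitem__(self, idx):
        return self.outputs[idx]

    def __len__(self):
        return len(self.outputs)


out = SamplerOutputSketch(outputs=["group0", "group1"])
assert len(out) == 2 and out[1] == "group1"        # list-style callers still work
out.sampled_token_ids = torch.tensor([[7], [9]])   # spec decode can attach tensors
```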
+ + Batch expansion converts a list of sequences and multiple query positions + to a new batch of sequences, each with a single query position. This allows + for MQA-like scoring in speculative decoding without requiring an MQA + kernel. + + It is strictly less efficient than MQA scoring. + + It only supports scoring the top1 proposal tokens of the proposer, instead + of topk/tree. + """ + + def __init__(self, scorer_worker: Worker, device: str, vocab_size: int): + self._scorer_worker = scorer_worker + self._device = device + self._vocab_size = vocab_size + + @nvtx_range("BatchExpansionTop1Scorer.score_proposals") + def score_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + k: int, + proposals: SpeculativeProposals, + ) -> SpeculativeScores: + """Score the proposed tokens via the scorer model. + + This converts each input sequence to a set of k+1 target sequences. The + target sequences have the unique continuations to be scored and a + unique sequence ID that is different from all input sequence ids. + + If a speculative sequence length would exceed the max model length, then + no speculation is produced for that sequence. + + Args: + seq_group_metadata_list: The input sequence group metadata. + blocks_to_swap_in: This is passed to the worker during scoring. + blocks_to_swap_out: This is passed to the worker during scoring. + blocks_to_copy: This is passed to the worker during scoring. + k: The fixed proposal length. + proposals: The speculative proposals to score. + Returns: + SpeculativeScores: The scores of each speculative token, along with + which sequences were ignored during scoring. + """ + + # TODO(cade) perform this on GPU to remove blocking call. + proposal_lens_list = proposals.proposal_lens.tolist() + proposal_token_ids_list = proposals.proposal_token_ids.tolist() + + (spec_indices, non_spec_indices, target_seq_group_metadata_list, + num_scoring_tokens) = self._expand_batch( + seq_group_metadata_list=seq_group_metadata_list, + proposal_token_ids_list=proposal_token_ids_list, + proposal_lens_list=proposal_lens_list, + ) + + target_sampler_output = self._scorer_worker.execute_model( + seq_group_metadata_list=target_seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + return_python_output=False) + + all_tokens, all_probs = self._contract_batch( + original_bs=len(seq_group_metadata_list), + target_sampler_output=target_sampler_output, + proposals=proposals, + num_scoring_tokens=num_scoring_tokens, + non_spec_indices=non_spec_indices, + spec_indices=spec_indices, + k=k, + ) + + return SpeculativeScores( + probs=all_probs, + token_ids=all_tokens, + ) + + def _expand_batch( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_token_ids_list: List[TokenId], + proposal_lens_list: List[int], + ) -> Tuple[List[int], List[int], List[SequenceGroupMetadata], int]: + """Given the input sequences and potentially multiple corresponding + proposal tokens, create a new batch where each sequence has a single + query token. + """ + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. It can be + # done by supporting per-sequence proposal lens. 
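A small sketch of the spec / non-spec split that `_expand_batch` performs via `split_batch_by_proposal_len`; this is a simplified stand-in, not the vLLM helper itself.

```python
from typing import List, Sequence, Tuple


def split_by_proposal_len(
        items: Sequence[str],
        proposal_lens: Sequence[int],
        select_proposal_len_zero: bool) -> Tuple[List[str], List[int]]:
    """Toy version of the spec / non-spec split: return the selected items
    and their original batch indices."""
    want_zero = select_proposal_len_zero
    selected = [(item, i)
                for i, (item, plen) in enumerate(zip(items, proposal_lens))
                if (plen == 0) == want_zero]
    return [item for item, _ in selected], [i for _, i in selected]


seqs, lens = ["seq0", "seq1", "seq2"], [3, 0, 3]
print(split_by_proposal_len(seqs, lens, select_proposal_len_zero=False))
# (['seq0', 'seq2'], [0, 2])
print(split_by_proposal_len(seqs, lens, select_proposal_len_zero=True))
# (['seq1'], [1])
```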
+ spec_seqs, spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=False) + non_spec_seqs, non_spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=True) + + target_seq_group_metadata_list = self._create_scoring_model_input( + spec_seqs, proposal_token_ids_list) + num_scoring_tokens = len(target_seq_group_metadata_list) + target_seq_group_metadata_list.extend(non_spec_seqs) + + return (spec_indices, non_spec_indices, target_seq_group_metadata_list, + num_scoring_tokens) + + def _contract_batch(self, original_bs: int, + target_sampler_output: List[SamplerOutput], + proposals: SpeculativeProposals, + num_scoring_tokens: int, non_spec_indices: List[int], + spec_indices: List[int], + k: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Contract the expanded batch back into its original size. + This maps the scores of speculative tokens back to their original + sequences. + """ + (target_token_ids, target_probs, non_spec_target_token_ids, + non_spec_target_probs) = self._split_scoring_output( + target_sampler_output, num_scoring_tokens) + + # Map distinct sequences used to score each token + # of shape [batch_size * k + 1] back to [batch_size, k + 1]. + batch_size, k = proposals.proposal_token_ids.shape + + target_token_ids = target_token_ids.squeeze().reshape( + batch_size, k + 1) + target_probs = target_probs.squeeze().reshape(batch_size, k + 1, + self._vocab_size) + + all_tokens = torch.full(size=(original_bs, k + 1), + fill_value=-1, + device=self._device, + dtype=torch.long) + all_probs = torch.zeros(original_bs, + k + 1, + self._vocab_size, + device=self._device, + dtype=torch.float32) + + if non_spec_indices: + all_tokens[non_spec_indices, 0] = non_spec_target_token_ids + all_probs[non_spec_indices, :1, :] = non_spec_target_probs + + if spec_indices: + all_tokens[spec_indices] = target_token_ids + all_probs[spec_indices] = target_probs + + return all_tokens, all_probs + + def _create_scoring_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k] + ) -> List[SequenceGroupMetadata]: + """Given the original input sequences and proposed tokens from the draft + model, create a list of target sequences that can be used for scoring. + """ + + if not seq_group_metadata_list: + return [] + + target_seq_ids_iter = self._create_target_seq_id_iterator( + get_all_seq_ids(seq_group_metadata_list)) + + target_seq_group_metadata = list( + chain.from_iterable( + self._create_target_seq_group_metadata( + seq_group_metadata, + proposal_token_ids, + i, + target_seq_ids_iter, + ) for i, seq_group_metadata in enumerate( + seq_group_metadata_list))) + + return target_seq_group_metadata + + def _create_target_seq_group_metadata( + self, + input_seq_group_metadata: SequenceGroupMetadata, + proposal_token_ids: List[TokenId], # shape: [batch_size, k] + batch_index: int, + target_seq_ids_iter: Iterator[TargetSeqId], + ) -> List[SequenceGroupMetadata]: + """Given an input sequence group metadata and a list of draft tokens, + create a list of target SequenceGroupMetadata, one for each + token id that needs to be scored. + + Naive speculative decoding requires K target model scores, one for each + draft model token. However one can add a bonus token such that if each + token is accepted, then a final token may be sampled from the model. 
+ This function creates K+1 target SequenceGroupMetadata to take + advantage of the bonus token. + """ + assert not input_seq_group_metadata.is_prompt, ( + "Speculating on " + "prompts not yet supported") + assert len(input_seq_group_metadata.seq_data) == 1, ( + "Beam search " + "not supported in speculative decoding") + input_seq_id = next(iter(input_seq_group_metadata.seq_data.keys())) + + token_ids_to_score = self._get_token_ids_to_score( + proposal_token_ids[batch_index]) + + target_seq_group_metadata_list: List[SequenceGroupMetadata] = [] + for token_ids in token_ids_to_score: + target_seq_group_metadata_list.append( + self._create_single_target_seq_group_metadata( + input_seq_group_metadata, + input_seq_id, + next(target_seq_ids_iter), + token_ids, + )) + + return target_seq_group_metadata_list + + def _create_single_target_seq_group_metadata( + self, + seq_group_metadata: SequenceGroupMetadata, + seq_id: SeqId, + target_seq_id: TargetSeqId, + token_ids: List[TokenId], + ) -> SequenceGroupMetadata: + """Create a single target SequenceGroupMetadata. + + Args: + seq_group_metadata: The metadata for the input sequence. + seq_id: The input sequence ID. + target_seq_id: The corresponding target sequence ID. + token_ids: The list of token ids that are to be appended to the + input sequence. + """ + seq_data = seq_group_metadata.seq_data[seq_id] + prompt_token_ids = seq_data.get_prompt_token_ids() + new_output_token_ids = [*seq_data.get_output_token_ids(), *token_ids] + + return SequenceGroupMetadata( + request_id=seq_group_metadata.request_id, + is_prompt=seq_group_metadata.is_prompt, + seq_data={ + target_seq_id: + SequenceData( + prompt_token_ids=prompt_token_ids, + output_token_ids=new_output_token_ids, + ), + }, + sampling_params=seq_group_metadata.sampling_params, + block_tables={ + target_seq_id: seq_group_metadata.block_tables[seq_id], + }, + lora_request=None, + ) + + def _split_scoring_output( + self, sampler_output: SamplerOutput, num_scoring_tokens: int + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Split the target model output into speculative and non-speculative + output. + """ + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. It can be + # done by supporting per-sequence proposal lens. + # + # First samples are from speculative scoring, latter samples are non- + # speculative samples. + split_sizes = [ + num_scoring_tokens, + sampler_output.sampled_token_ids.numel() - num_scoring_tokens + ] + (spec_probs, non_spec_probs + ) = sampler_output.sampled_token_probs.split(split_sizes) + (spec_sampled_tokens, non_spec_sampled_tokens + ) = sampler_output.sampled_token_ids.flatten().split(split_sizes) + + # Convert scores to tensors. + sampler_output.sampled_token_probs = spec_probs + sampler_output.sampled_token_ids = spec_sampled_tokens + target_token_ids, target_probs = sampler_output_to_torch( + [sampler_output]) + + # Convert non-speculative output tokens to tensors. 
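A toy illustration of the tensor bookkeeping in `_split_scoring_output` and `_contract_batch` above: speculative rows come first in the flattened sampler output, then the non-speculative rows, and the speculative block is reshaped back to `[batch_size, k + 1]`. The numbers are made up for the sketch.

```python
import torch

# 2 speculative sequences scored at k+1 = 3 positions each -> 6 rows,
# followed by 2 non-speculative sequences -> 8 rows total.
num_scoring_tokens = 6
sampled_token_ids = torch.arange(8)

split_sizes = [num_scoring_tokens,
               sampled_token_ids.numel() - num_scoring_tokens]
spec_tokens, non_spec_tokens = sampled_token_ids.split(split_sizes)

print(spec_tokens.reshape(2, 3).tolist())  # [[0, 1, 2], [3, 4, 5]]
print(non_spec_tokens.tolist())            # [6, 7]
```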
+ sampler_output.sampled_token_probs = non_spec_probs + sampler_output.sampled_token_ids = non_spec_sampled_tokens + non_spec_target_token_ids, non_spec_target_probs = ( + sampler_output_to_torch([sampler_output])) + + return (target_token_ids, target_probs, non_spec_target_token_ids, + non_spec_target_probs) + + def _create_target_seq_id_iterator( + self, seq_ids: List[SeqId]) -> Iterator[TargetSeqId]: + """Create an iterator for creating target sequence ids. + Target sequence ids are distinct from sequence ids because we create a + distinct target sequence id for each proposal token to be scored. + + This implementation increments a counter starting at 1 + max of all + provided input sequence ids. + """ + return count(start=max(seq_ids) + 1) + + def _get_token_ids_to_score( + self, + full_spec_token_ids: List[TokenId] # shape: [k] + ) -> List[List[TokenId]]: + """Given an int tensor of proposal token ids, return a list of + token ids that should be scored. + + Returns k+1 output lists. The additional one is used for generating the + bonus token. + + Example: + Input: [0, 1, 2, 3] (k=4) + Output: (k+1 lists) + [] + [0] + [0, 1] + [0, 1, 2] + [0, 1, 2, 3] + """ + empty_token_ids = [] + + token_ids_to_score = [empty_token_ids] + token_ids_to_score.extend([ + full_spec_token_ids[:i + 1] + for i in range(len(full_spec_token_ids)) + ]) + return token_ids_to_score diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py new file mode 100644 index 0000000000000..9e53ffb60ac32 --- /dev/null +++ b/vllm/spec_decode/interfaces.py @@ -0,0 +1,77 @@ +from typing import List, Tuple, Optional, Dict +from dataclasses import dataclass +from abc import ABC, abstractmethod + +import torch + +from vllm.sequence import SequenceGroupMetadata + + +@dataclass +class SpeculativeProposals: + """Datastructure used to represent proposal tokens from some proposer. It + also tracks how many speculative tokens each sequence has. + """ + + # Speculative proposal tokens. + proposal_token_ids: torch.Tensor + + # Probabilities of the proposal tokens according to the proposer. + proposal_probs: torch.Tensor + + # The valid length of each proposal; can be zero. + proposal_lens: torch.Tensor + + def __repr__(self): + return (f"SpeculativeProposals(" + f"proposal_token_ids={self.proposal_token_ids.shape}, " + f"proposal_probs={self.proposal_probs.shape}, " + f"proposal_lens={self.proposal_lens.shape})") + + +@dataclass +class SpeculativeScores: + """Datastructure used to represent the scores of speculative tokens + according to the scoring model. + """ + + # Probabilities of the speculative tokens according to the scoring model. + probs: torch.Tensor + + # Token ids sampled from the scoring model. Used for speculative bonus + # tokens and also non-speculative normal decoding. 
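A toy construction of a `SpeculativeProposals`-like container (illustrative class name) showing the shapes its fields are expected to carry; a proposal length of zero marks a sequence that skipped speculation and whose token row is just padding.

```python
from dataclasses import dataclass

import torch


@dataclass
class ProposalsSketch:
    """Toy mirror of SpeculativeProposals to illustrate expected shapes."""
    proposal_token_ids: torch.Tensor  # [batch_size, k]
    proposal_probs: torch.Tensor      # [batch_size, k, vocab_size]
    proposal_lens: torch.Tensor       # [batch_size]


batch_size, k, vocab_size = 2, 3, 8
proposals = ProposalsSketch(
    proposal_token_ids=torch.tensor([[5, 9, 2], [-1, -1, -1]]),
    proposal_probs=torch.rand(batch_size, k, vocab_size),
    proposal_lens=torch.tensor([k, 0]),  # second sequence skipped speculation
)
print(proposals.proposal_token_ids.shape)  # torch.Size([2, 3])
```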
+ token_ids: torch.Tensor + + def __repr__(self): + return (f"SpeculativeScores(" + f"probs={self.probs.shape}, " + f"token_ids={self.token_ids.shape})") + + +class SpeculativeProposer(ABC): + + @abstractmethod + def get_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + max_proposal_len: int, + ) -> SpeculativeProposals: + raise NotImplementedError + + +class SpeculativeScorer(ABC): + + @abstractmethod + def score_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + k: int, + proposals: SpeculativeProposals, + ) -> Tuple[torch.Tensor, torch.Tensor]: + raise NotImplementedError diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py new file mode 100644 index 0000000000000..65a2a4a63a98f --- /dev/null +++ b/vllm/spec_decode/metrics.py @@ -0,0 +1,174 @@ +import torch +from dataclasses import dataclass +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from typing import Optional +from vllm.utils import in_wsl +import time +from typing import Callable + + +@dataclass +class SpecDecodeWorkerMetrics: + """Dataclass holding metrics emitted from the spec decode worker. + """ + + # The empirical acceptance rate of the proposal method on a per-token basis. + # This is useful for evaluating how well the proposal method aligns with the + # scoring method. + draft_acceptance_rate: float + + # The empirical efficiency, measured as the number of tokens emitted by the + # system divided by the number of tokens that could be emitted by the system + # if the proposal method were perfect. + system_efficiency: float + + # The number of speculative tokens produced by the proposal method. + draft_tokens: int + + # The number of tokens emitted by the entire system. + emitted_tokens: int + + # The number of tokens accepted by the scoring model and verification + # routine, e.g. Llama2-70B and lossless rejection sampling. + # + # NOTE: Any token accepted by the verification routine is considered + # accepted (regardless of if the speculative prefix is also accepted). The + # user will usually see less accepted tokens. This metric is helpful when + # evaluating alignment of the proposal method with the scoring model. + accepted_tokens: int + + # The number of speculative tokens per sequence. + num_spec_tokens: int + + +Timer = Callable[[], float] + + +class AsyncMetricsCollector: + """Class which copies rejection sampler metrics from the device to CPU on a + non-default Torch stream. + """ + + def __init__(self, + rejection_sampler: RejectionSampler, + timer: Optional[Timer] = None, + collect_interval_s: float = 5.0): + self._rejection_sampler = rejection_sampler + self._timer = time.time if timer is None else timer + + self._rank: Optional[int] = None + + # We don't have a device set yet. 
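Worked toy numbers for the two rates defined in `SpecDecodeWorkerMetrics` above, assuming the "could be emitted" ceiling is k accepted tokens plus one bonus token per speculated sequence per step.

```python
# Toy numbers for one metrics interval.
k = 4
num_spec_seqs = 3                 # (sequence, step) pairs that were speculated
draft_tokens = num_spec_seqs * k  # 12 proposal tokens produced
accepted_tokens = 9               # accepted by the verification routine
emitted_tokens = 12               # accepted + bonus/recovered tokens

draft_acceptance_rate = accepted_tokens / draft_tokens   # 0.75
max_emittable = num_spec_seqs * (k + 1)                  # 15 if proposals were perfect
system_efficiency = emitted_tokens / max_emittable       # 0.8
print(draft_acceptance_rate, system_efficiency)
```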
+ self._copy_stream: Optional[torch.cuda.Stream] = None + + self._in_flight_copy: Optional[torch.cuda.Event] = None + + pin_memory = not in_wsl() + self._aggregate_num_accepted_tokens = torch.tensor( + 0, dtype=torch.long, device="cpu", pin_memory=pin_memory) + self._aggregate_num_emitted_tokens = torch.tensor( + 0, dtype=torch.long, device="cpu", pin_memory=pin_memory) + self._aggregate_num_draft_tokens = 0 + + self._rejsample_metrics_collect_interval_s = collect_interval_s + self._last_metrics_collect_time = self._timer() + + def init_gpu_tensors(self, rank: int) -> None: + self._rank = rank + self._copy_stream = torch.cuda.Stream() + + def maybe_collect_rejsample_metrics( + self, k: int) -> Optional[SpecDecodeWorkerMetrics]: + + # If a copy was initiated in the previous call, collect and return. + if self._in_flight_copy is not None: + ready_event = self._in_flight_copy + self._in_flight_copy = None + return self._collect_rejsample_metrics(k, ready_event) + + # Otherwise, check if we should start a new copy. + if self._should_collect_rejsample_metrics(self._timer()): + assert self._in_flight_copy is None + self._in_flight_copy = self._copy_rejsample_metrics_async() + + return None + + def _should_collect_rejsample_metrics(self, now: float) -> bool: + """Return whether or not this iteration should print rejection sampling + metrics. + """ + if self._rank != 0: + return False + + if (now - self._last_metrics_collect_time < + self._rejsample_metrics_collect_interval_s): + return False + return True + + def _copy_rejsample_metrics_async(self) -> torch.cuda.Event: + """Copy rejection sampling metrics (number of accepted tokens, etc) to + CPU asynchronously. + + Returns a CUDA event recording when the copy is complete. + """ + self._copy_stream.wait_stream(torch.cuda.current_stream()) + + with torch.cuda.stream(self._copy_stream): + self._aggregate_num_accepted_tokens.copy_( + self._rejection_sampler.num_accepted_tokens, non_blocking=True) + self._aggregate_num_emitted_tokens.copy_( + self._rejection_sampler.num_emitted_tokens, non_blocking=True) + # Number of draft tokens is calculated on CPU, so no copy is + # required. + self._aggregate_num_draft_tokens = ( + self._rejection_sampler.num_draft_tokens) + + aggregate_metrics_ready = torch.cuda.Event() + aggregate_metrics_ready.record(self._copy_stream) + + return aggregate_metrics_ready + + def _collect_rejsample_metrics( + self, k: int, + ready_event: torch.cuda.Event) -> SpecDecodeWorkerMetrics: + """Create metrics object from statistics copied asynchronously. + + Args: + k: int. The number of speculative tokens; used to determine system + efficiency. + ready_event: torch.cuda.Event. The CUDA event recording when the + async GPU->CPU copy is complete. 
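A minimal, generic sketch of the copy pattern `AsyncMetricsCollector` uses: stage the device-to-CPU copy on a side stream and gate reads on a CUDA event. This is plain PyTorch rather than the vLLM class, and it only exercises the GPU path when CUDA is available.

```python
from typing import Tuple

import torch


def async_copy_to_cpu(
        gpu_value: torch.Tensor,
        copy_stream: torch.cuda.Stream) -> Tuple[torch.Tensor, torch.cuda.Event]:
    # Pinned CPU buffer so the non-blocking copy can be truly asynchronous.
    cpu_buf = torch.empty(gpu_value.shape, dtype=gpu_value.dtype,
                          device="cpu", pin_memory=True)
    copy_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(copy_stream):
        cpu_buf.copy_(gpu_value, non_blocking=True)
    ready = torch.cuda.Event()
    ready.record(copy_stream)
    return cpu_buf, ready


if torch.cuda.is_available():
    stream = torch.cuda.Stream()
    counter = torch.tensor(42, device="cuda")
    buf, event = async_copy_to_cpu(counter, stream)
    event.synchronize()   # only read the CPU buffer after the event fires
    print(buf.item())     # 42
```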
+ """ + ready_event.synchronize() + accepted_tokens = self._aggregate_num_accepted_tokens.item() + emitted_tokens = self._aggregate_num_emitted_tokens.item() + draft_tokens = self._aggregate_num_draft_tokens + + num_possible_tokens = self.get_max_num_accepted_tokens(draft_tokens, k) + + if draft_tokens > 0: + draft_acceptance_rate = accepted_tokens / draft_tokens + else: + draft_acceptance_rate = float("nan") + + if num_possible_tokens > 0: + system_efficiency = emitted_tokens / num_possible_tokens + else: + system_efficiency = float("nan") + + return SpecDecodeWorkerMetrics( + num_spec_tokens=k, + draft_acceptance_rate=draft_acceptance_rate, + system_efficiency=system_efficiency, + accepted_tokens=accepted_tokens, + draft_tokens=draft_tokens, + emitted_tokens=emitted_tokens, + ) + + @staticmethod + def get_max_num_accepted_tokens(draft_tokens: int, k: int) -> int: + # Divide by k since batch size can be variable. + total_num_spec_seqs = draft_tokens / k + num_accepted_per_seq_if_all_accepted = k + 1 + return int(total_num_spec_seqs * num_accepted_per_seq_if_all_accepted) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py new file mode 100644 index 0000000000000..0915c275b0408 --- /dev/null +++ b/vllm/spec_decode/multi_step_worker.py @@ -0,0 +1,370 @@ +from typing import List, Dict, Optional, Tuple +import copy + +import torch + +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.worker.worker import Worker +from vllm.spec_decode.interfaces import (SpeculativeProposals, + SpeculativeProposer) +from vllm.spec_decode.util import sampler_output_to_torch + + +class MultiStepWorker(Worker): + """The MultiStepWorker is equivalent to a Worker except that it allows + multiple forward passes in a single call, assuming the scheduler has + allocated enough space to store the additional KV. This reduces overhead + by invoking the scheduler less. + + The MultiStepWorker does not support cache swap operations, or beam search. + Cache swap operations do not require large modifications. On the other hand, + beam search requires memory allocations during sequence forks and thus + requires more thought for MultiStepWorker support. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self._proposer: Optional[DraftModelTop1Proposer] = None + + def init_model(self): + super().init_model() + + self._proposer = DraftModelTop1Proposer( + self, + self.device, + self.max_model_len, + self.vocab_size, + ) + + @torch.inference_mode() + def execute_model_multi_step( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + num_steps: int, + ) -> List[SamplerOutput]: + """Run the model forward pass num_steps times. Returns the list of + sampler output, one per model forward pass. + """ + self._raise_if_unsupported(seq_group_metadata_list, blocks_to_swap_in, + blocks_to_swap_out, blocks_to_copy) + + # Shallow copy input data so modifications (such as appending tokens) + # do not cause side-effects. + copied_seq_group_metadata_list = self._shallow_copy_inputs( + seq_group_metadata_list) + + # Assert enough KV space for num_steps tokens per sequence. + self._assert_enough_kv_space(seq_group_metadata_list, num_steps) + + # Run model num_steps times.
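A toy stand-in for the `execute_model_multi_step` control flow introduced above: sample, append the token to the (copied) sequence, and run the next forward pass. The "model" here is just a deterministic placeholder; the real worker operates on full `SequenceGroupMetadata`.

```python
from typing import List


def fake_model_step(token_ids: List[int]) -> int:
    """Placeholder sampler: deterministic function of the current sequence."""
    return (sum(token_ids) + 1) % 100


def multi_step_decode(token_ids: List[int], num_steps: int) -> List[int]:
    token_ids = list(token_ids)      # copy so the caller sees no side-effects
    outputs = []
    for _ in range(num_steps):
        next_token = fake_model_step(token_ids)
        token_ids.append(next_token)  # append before the next forward pass
        outputs.append(next_token)
    return outputs


print(multi_step_decode([3, 7], num_steps=4))  # one sampled token per step
```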
+ model_outputs = [] + for _ in range(num_steps): + model_output = super().execute_model( + seq_group_metadata_list=copied_seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + + self._append_new_tokens(model_output, + copied_seq_group_metadata_list) + model_outputs.append(model_output) + + return model_outputs + + def get_spec_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + max_proposal_len: int, + ) -> SpeculativeProposals: + """Produce speculations given an input batch of sequences. The number of + speculative tokens per sequence is determined by max_proposal_len. + """ + + return self._proposer.get_proposals( + seq_group_metadata_list, + blocks_to_swap_in, + blocks_to_swap_out, + blocks_to_copy, + max_proposal_len, + ) + + def _append_new_tokens( + self, model_output: SamplerOutput, + seq_group_metadata_list: SequenceGroupMetadata) -> None: + """Given model output from a single run, append the tokens to the + sequences. This is normally done outside of the worker, but it is + required if the worker is to perform multiple forward passes. + """ + for seq_group_metadata, sequence_group_outputs in zip( + seq_group_metadata_list, model_output): + seq_group_metadata.is_prompt = False + + for seq_output in sequence_group_outputs.samples: + # NOTE: Beam search is not supported, so we can assume that + # parent_seq_id == seq_id. + seq = seq_group_metadata.seq_data[seq_output.parent_seq_id] + + token_id = seq_output.output_token + token_logprob = seq_output.logprobs[token_id] + + seq.append_token_id(token_id, token_logprob.logprob) + + def _shallow_copy_inputs( + self, seq_group_metadata_list: List[SequenceGroupMetadata] + ) -> List[SequenceGroupMetadata]: + """Copy input data structures to remove side-effects when input data + structures are shared with other modules. + + Helpful when the vLLM scheduler runs in the same process as the worker. + The alternative is deep-copying (or other form of deep copy); this has + performance downsides. + """ + + # Shallow-copy the list of SequenceGroupMetadata. This allows us to + # append tokens and change is_prompt without external side-effects. + new_seq_group_metadata_list = [] + + for old_seq_group_metadata in seq_group_metadata_list: + # We must shallow-copy seq_group_metadata as is_prompt could change. + seq_group_metadata = copy.copy(old_seq_group_metadata) + new_seq_group_metadata_list.append(seq_group_metadata) + + # We must shallow-copy seq_data as we will append token ids + new_seq_data = {} + for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): + new_seq_data[seq_id] = copy.copy(old_seq_data) + new_seq_data[ + seq_id].output_token_ids = old_seq_data.output_token_ids[:] + + seq_group_metadata.seq_data = new_seq_data + + return new_seq_group_metadata_list + + def _assert_enough_kv_space( + self, seq_group_metadata_list: List[SequenceGroupMetadata], + num_steps: int) -> None: + """Assert there are enough physical blocks per sequence to store the + current KV plus additional KV from num_steps tokens. + """ + assert self.model_runner.block_size is not None + for seq_group_metadata in seq_group_metadata_list: + # Only one seq_id is guaranteed because there is no beam search. 
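The copy discipline described in `_shallow_copy_inputs` above, reduced to a toy object: copy the wrapper, then re-point the token list at a sliced copy so in-worker appends never reach the scheduler's objects. The class name is illustrative.

```python
import copy


class SeqDataSketch:
    """Toy stand-in for per-sequence data holding a mutable token list."""

    def __init__(self, output_token_ids):
        self.output_token_ids = output_token_ids


original = SeqDataSketch(output_token_ids=[1, 2, 3])
copied = copy.copy(original)
copied.output_token_ids = original.output_token_ids[:]  # new list, same ints

copied.output_token_ids.append(4)
print(original.output_token_ids)  # [1, 2, 3] -- the scheduler's copy is untouched
```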
+ seq_id = list(seq_group_metadata.seq_data.keys())[0] + seq = seq_group_metadata.seq_data[seq_id] + + # After num_steps, the seq len will be the current seq len + # plus one token per step. + final_seq_len = seq.get_len() + num_steps + + # We will have final_seq_len - 1 KV because vLLM saves KV for a + # token in the iteration after the token was generated. + required_num_kv_slots = final_seq_len - 1 + + # The allocated number of kv slots is the number of allocated blocks + # times the number of slots of block. + number_physical_blocks = len( + seq_group_metadata.block_tables[seq_id]) + allocated_kv_slots = (number_physical_blocks * + self.model_runner.block_size) + + if required_num_kv_slots > allocated_kv_slots: + request_id = seq_group_metadata.request_id + raise ValueError( + "The worker attempted to run " + f"{num_steps} times but found insufficient KV space for " + f"{request_id=} {seq_id=}. ({allocated_kv_slots=} " + f"{required_num_kv_slots=}).") + + def _raise_if_unsupported( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> None: + """MultiStepWorker does not yet implement support for cache swap + operations or beam search. + """ + if any([blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy]): + raise NotImplementedError( + "MultiStepWorker does not support cache operations") + + if any( + len(seq_group_metadata.seq_data.keys()) != 1 + for seq_group_metadata in seq_group_metadata_list): + raise NotImplementedError( + "MultiStepWorker does not support beam search.") + + +class DraftModelTop1Proposer(SpeculativeProposer): + """Helper class which separates out sequences which would exceed the max + model length when speculated upon. + + This allows combinations of models such as JackFram/llama-68m draft with + meta-llama/Llama2-13b-chat-hf, as llama-68m has max_position_embeddings of + 2048 while Llama2-13b has max_position_embeddings of 4096. + + We treat the sequences which exceed the proposal draft model length as + "non-spec sequences". Essentially they skip the draft model and go through + normal decoding in the target model. + + Currently, only proposal_lens of 0 and k are supported, where k is a global + batch proposal length. In the future vLLM should support per-sequence + proposal lengths. + """ + + def __init__( + self, + draft_worker: MultiStepWorker, + device: str, + max_model_len: int, + vocab_size: int, + ): + self._draft_worker = draft_worker + self._device = device + self._max_model_len = max_model_len + self._vocab_size = vocab_size + + def get_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + max_proposal_len: int, + ) -> SpeculativeProposals: + """Get speculative proposals given the input batch. + + Sequences which would exceed the max model length are skipped during + speculation. + """ + + # Split speculative- and non-speculative- sequences. + (proposal_lens, nonzero_proposal_len_seqs, + nonzero_proposal_len_indices) = self._split_by_max_model_len( + seq_group_metadata_list, max_proposal_len) + + if nonzero_proposal_len_seqs: + # Speculate tokens using the draft worker for the speculative + # sequences. 
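A worked example (toy numbers) of the KV-space check above: the sequence needs `final_seq_len - 1` slots because KV for a token is written in the iteration after that token is generated.

```python
# Toy numbers for one sequence.
block_size = 16               # slots per physical block
num_allocated_blocks = 4      # blocks already allocated to this sequence
current_seq_len = 60
num_steps = 5                 # speculative steps to run

final_seq_len = current_seq_len + num_steps
required_kv_slots = final_seq_len - 1                    # 64: KV lags by one token
allocated_kv_slots = num_allocated_blocks * block_size   # 64

print(required_kv_slots, allocated_kv_slots)  # 64 64 -> just enough space
assert required_kv_slots <= allocated_kv_slots
```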
+ maybe_sampler_output = self._draft_worker.execute_model_multi_step( + seq_group_metadata_list=nonzero_proposal_len_seqs, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + num_steps=max_proposal_len, + ) + else: + # If no sequences can be speculated, set sampler output to None. + maybe_sampler_output = None + + # Combine speculative- and non-speculative sequences into the same + # representation. + proposal_tokens, proposal_probs, proposal_lens = self._merge_outputs( + batch_size=len(seq_group_metadata_list), + max_proposal_len=max_proposal_len, + maybe_sampler_output=maybe_sampler_output, + proposal_lens=proposal_lens, + nonzero_proposal_len_indices=nonzero_proposal_len_indices, + ) + + proposals = SpeculativeProposals( + proposal_token_ids=proposal_tokens, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens, + ) + + return proposals + + def _split_by_max_model_len( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + max_proposal_len: int, + ) -> Tuple[List[int], List[SequenceGroupMetadata], List[int]]: + """Determine which sequences would exceed the max model length. + """ + + proposal_lens: List[int] = [] + nonzero_proposal_len_seqs: List[SequenceGroupMetadata] = [] + nonzero_proposal_len_indices: List[int] = [] + for i, seq_group_metadata in enumerate(seq_group_metadata_list): + seq_data = next(iter(seq_group_metadata.seq_data.values())) + seq_len = seq_data.get_len() + + # Currently only proposal lens of 0 or the global batch proposal len + # are supported. + if seq_len + max_proposal_len < self._max_model_len: + proposal_lens.append(max_proposal_len) + nonzero_proposal_len_seqs.append(seq_group_metadata) + nonzero_proposal_len_indices.append(i) + else: + proposal_lens.append(0) + + return (proposal_lens, nonzero_proposal_len_seqs, + nonzero_proposal_len_indices) + + def _merge_outputs( + self, + batch_size: int, + max_proposal_len: int, + maybe_sampler_output: Optional[SamplerOutput], + proposal_lens: List[int], + nonzero_proposal_len_indices: List[int], + ) -> Tuple[torch.Tensor, torch.tensor, torch.Tensor]: + """After speculations are produced, merge the speculation results with + the skipped sequences. + """ + if maybe_sampler_output is None: + # If no speculative tokens, the sampler output will be None. + # In this case we return empty tensors. + proposal_tokens = torch.zeros(0, + max_proposal_len, + dtype=torch.long, + device=self._device) + proposal_probs = torch.zeros(0, + max_proposal_len, + self._vocab_size, + dtype=torch.float32, + device=self._device) + proposal_lens = torch.zeros(len(proposal_lens), + dtype=torch.long, + device=self._device) + return proposal_tokens, proposal_probs, proposal_lens + + sampler_output = maybe_sampler_output + + proposal_tokens, proposal_probs = sampler_output_to_torch( + sampler_output) + + # Now, reformat the output GPU tensors such that each sequence has + # a proposal. the proposal can be empty, e.g. 
[-1, -1, -1] + + entire_proposal_tokens = torch.full(size=(batch_size, + *proposal_tokens.shape[1:]), + fill_value=-1, + dtype=torch.long, + device=self._device) + entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens + entire_proposal_probs = torch.zeros(batch_size, + *proposal_probs.shape[1:], + dtype=torch.float32, + device=self._device) + entire_proposal_probs[nonzero_proposal_len_indices] = proposal_probs + + proposal_tokens, proposal_probs = (entire_proposal_tokens, + entire_proposal_probs) + + proposal_lens = torch.zeros(batch_size, + dtype=torch.long, + device=self._device) + proposal_lens[nonzero_proposal_len_indices] = max_proposal_len + + return proposal_tokens, proposal_probs, proposal_lens diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py new file mode 100644 index 0000000000000..1e56741347008 --- /dev/null +++ b/vllm/spec_decode/spec_decode_worker.py @@ -0,0 +1,375 @@ +from typing import List, Tuple, Optional, Dict +from functools import cached_property + +import torch + +from vllm.spec_decode.metrics import AsyncMetricsCollector +from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, + SequenceGroupOutput, SequenceOutput) +from vllm.worker.worker import Worker +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from vllm.config import CacheConfig +from vllm.spec_decode.util import (nvtx_range, get_all_seq_ids, + split_batch_by_proposal_len) +from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores +from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer +from vllm.spec_decode.interfaces import SpeculativeScorer + + +class SpecDecodeWorker: + """Worker which implements speculative decoding. + + Speculative decoding reduces decoding per-token latency by using a proposal + method, such as a small draft model, to speculate ahead of a larger LLM. The + probabilities of the speculative tokens are then determined by the larger + LLM, after which some verification routine determines which (if any) of the + speculative tokens are accepted by the larger LLM. + + See https://github.com/vllm-project/vllm/pull/2188 and + https://github.com/vllm-project/vllm/pull/3103 for more info. + + The current implementation has the following limitations: + * Only draft-model proposal is implemented (contributions for more forms are + welcome!). + * Only top-1 proposal and scoring are implemented. Tree-attention is left as + future work. + * Only lossless rejection sampling is supported. Contributions adding lossy + verification routines are welcome (e.g. Medusa's typical acceptance). + * All sequences in a batch must have the same proposal length, or zero. This + can be improved by having per-sequence speculation in the future. + * The scoring forward pass is done without an MQA kernel, which is + suboptimal especially as the batch size, proposal length, and sequence + lengths grow. Contributions to add a MQA scoring are welcome once + correctness tests pass. + More info here https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit. + """ + + def __init__( + self, + proposer_worker: MultiStepWorker, + scorer_worker: Worker, + rejection_sampler: RejectionSampler, + metrics_collector: Optional[AsyncMetricsCollector] = None, + ): + """ + Create a SpecDecodeWorker. + + Args: + proposer_worker: A worker that can produce speculative tokens for + sequences. 
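A toy version of the merge step above: scatter the proposals of speculated sequences back into full-batch tensors, leaving skipped sequences with -1 padding and a proposal length of zero.

```python
import torch

batch_size, k = 4, 3
nonzero_idx = [0, 2]  # sequences that were actually speculated
proposal_tokens = torch.tensor([[5, 9, 2], [7, 1, 4]])

all_tokens = torch.full((batch_size, k), -1, dtype=torch.long)
all_tokens[nonzero_idx] = proposal_tokens
proposal_lens = torch.zeros(batch_size, dtype=torch.long)
proposal_lens[nonzero_idx] = k

print(all_tokens.tolist())
# [[5, 9, 2], [-1, -1, -1], [7, 1, 4], [-1, -1, -1]]
print(proposal_lens.tolist())  # [3, 0, 3, 0]
```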
+ scorer_worker: A worker that produces probabilities of speculative + tokens according to some base model. Typically a vanilla vLLM + Worker. + rejection_sampler: A Torch module used to perform modified rejection + sampling for speculative decoding. + metrics_collector: Helper class for collecting metrics; can be set + for testing purposes. + """ + self.proposer_worker = proposer_worker + self.scorer_worker = scorer_worker + self.rejection_sampler = rejection_sampler + + self._metrics = AsyncMetricsCollector( + rejection_sampler + ) if metrics_collector is None else metrics_collector + + self.probs_dtype = self.rejection_sampler.probs_dtype + self.token_id_dtype = self.rejection_sampler.token_id_dtype + + self.scorer: SpeculativeScorer = None + + def init_model(self) -> None: + """Initialize both scorer and proposer models. + """ + # The scorer worker model is initialized first in case the proposer + # model has a smaller TP degree than the target worker. + self.scorer_worker.init_model() + self.proposer_worker.init_model() + + self._metrics.init_gpu_tensors(self.rank) + self.rejection_sampler.init_gpu_tensors(self.rank) + self.scorer = BatchExpansionTop1Scorer( + scorer_worker=self.scorer_worker, + device=self.device, + vocab_size=self._vocab_size) + + def profile_num_available_blocks(self, block_size: int, + gpu_memory_utilization: float, + cpu_swap_space: int, + cache_dtype: str) -> Tuple[int, int]: + """Determine the number of cache blocks to use. + + This is done by profiling the scorer model (which is typically the + larger of the two). Then the total memory which would be used by the + scorer cache is divided evenly between the proposer and scorer model KV, + such that the number of blocks is equal in both KV caches. + """ + num_gpu_blocks, num_cpu_blocks = ( + self.scorer_worker.profile_num_available_blocks( + block_size, gpu_memory_utilization, cpu_swap_space, + cache_dtype)) + + scorer_cache_block_size_bytes = ( + self.scorer_worker.get_cache_block_size_bytes( + block_size, cache_dtype)) + proposer_cache_block_size_bytes = ( + self.proposer_worker.get_cache_block_size_bytes( + block_size, cache_dtype)) + + new_num_gpu_blocks = split_num_cache_blocks_evenly( + scorer_cache_block_size_bytes, proposer_cache_block_size_bytes, + num_gpu_blocks) + return new_num_gpu_blocks, num_cpu_blocks + + def init_cache_engine(self, cache_config: CacheConfig): + """Initialize the cache engine of the scorer and proposer workers. + """ + self.scorer_worker.init_cache_engine(cache_config) + self.proposer_worker.init_cache_engine(cache_config) + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + num_spec_tokens: int, + ) -> List[SamplerOutput]: + """Perform speculative decoding on the input batch. + """ + + assert seq_group_metadata_list is not None, ( + "speculative decoding " + "requires non-None seq_group_metadata_list") + + # If no spec tokens, call the proposer and scorer workers normally. + # Used for prefill. 
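+        # Running prefill through both workers keeps the draft model's KV
+        # cache in sync with the target model before speculation begins.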
+ if num_spec_tokens == 0 or len(seq_group_metadata_list) == 0: + return self._run_no_spec( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + + return self._run_speculative_decoding_step( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + k=num_spec_tokens, + ) + + @nvtx_range("spec_decode_worker._run_no_spec") + def _run_no_spec( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + ) -> List[SamplerOutput]: + """Run a prefill step, without any speculation. The input is sent to the + proposer and scorer model so that the KV cache is consistent between the + two. + """ + + self.proposer_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + return_python_output=False) + + sampler_output = self.scorer_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + + # Clear device tensors from sampler output. This reduces communication + # overhead when the engine runs in a different process than the workers. + sampler_output.probs = None + sampler_output.sampled_tokens = None + return [sampler_output] + + @nvtx_range("spec_decode_worker._run_speculative_decoding_step") + def _run_speculative_decoding_step( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + k: int, + ) -> List[SamplerOutput]: + """Execute a single step of speculative decoding. + + This invokes the proposer worker to get k speculative tokens for each + sequence, then scores each speculative token using the scoring worker. + + Returns a list of SamplerOutput, each containing a single token per + sequence. + """ + + # Generate proposals using draft worker. + proposals = self.proposer_worker.get_spec_proposals( + seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, + blocks_to_copy, k) + + proposal_scores = self.scorer.score_proposals( + seq_group_metadata_list, + blocks_to_swap_in, + blocks_to_swap_out, + blocks_to_copy, + k, + proposals, + ) + + accepted_token_ids = self._verify_tokens(seq_group_metadata_list, + proposal_scores, proposals, k) + + return self._create_output_sampler_list(seq_group_metadata_list, + accepted_token_ids, k) + + @nvtx_range("spec_decode_worker._verify_tokens") + def _verify_tokens( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_scores: SpeculativeScores, + proposals: SpeculativeProposals, + max_proposal_len: int, + ) -> torch.Tensor: + """Determine which speculative tokens are accepted using the + probabilities of each token according to the proposer and scorer models. + """ + proposal_lens_list = proposals.proposal_lens.tolist() + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. 
It can be + # done by supporting per-sequence proposal lens. + _, spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=False) + _, non_spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=True) + original_indices = spec_indices + non_spec_indices + + proposal_probs = proposal_scores.probs[spec_indices, :-1] + bonus_token_ids = proposal_scores.token_ids[spec_indices, -1:] + non_spec_token_ids = proposal_scores.token_ids[non_spec_indices] + + accepted_token_ids = self.rejection_sampler( + proposal_probs, + bonus_token_ids, + proposals.proposal_probs, + proposals.proposal_token_ids, + ) + + # Append output tokens from non-speculative sequences to + # the accepted token ids tensor. + non_spec_token_ids = non_spec_token_ids.expand(-1, max_proposal_len + + 1).clone() + non_spec_token_ids[:, 1:] = -1 + accepted_token_ids = torch.cat( + [accepted_token_ids, non_spec_token_ids]) + + # Rearrange so that results are in the order of the original seq group + # metadata. + accepted_token_ids[original_indices] = accepted_token_ids.clone() + + return accepted_token_ids + + def _create_output_sampler_list( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + accepted_token_ids: torch.Tensor, # shape: [batch_size, k+1] + k: int, + ) -> List[SamplerOutput]: + """Given the accepted token ids, create a list of SamplerOutput. + + The output is padded with -1 tokens such that each sequence has + the same number of outputs. + """ + seq_ids = get_all_seq_ids(seq_group_metadata_list) + + # shape: [k+1, batch_size] + accepted_token_ids_by_step = accepted_token_ids.transpose(0, + 1).tolist() + sampler_output_list = [] + for token_ids_by_step in accepted_token_ids_by_step: + if all(token_id == -1 for token_id in token_ids_by_step): + break + + step_output_token_ids = [] + for token_id, seq_id in zip(token_ids_by_step, seq_ids): + step_output_token_ids.append( + SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq_id, + output_token=token_id, + # TODO Add verifier logprobs. + logprobs={token_id: 0.0}, + ) + ], + prompt_logprobs=None, + )) + sampler_output_list.append( + SamplerOutput(outputs=step_output_token_ids)) + + maybe_rejsample_metrics = ( + self._metrics.maybe_collect_rejsample_metrics(k)) + if maybe_rejsample_metrics is not None: + sampler_output_list[ + 0].spec_decode_worker_metrics = maybe_rejsample_metrics + + return sampler_output_list + + @cached_property + def _vocab_size(self) -> int: + """Get the vocab size of the model and make sure it's consistent between + draft and target workers. + """ + vocab_sizes = [ + worker.vocab_size + for worker in [self.proposer_worker, self.scorer_worker] + ] + assert all(vocab_sizes[0] == vocab_size for vocab_size in vocab_sizes) + return vocab_sizes[0] + + @property + def rank(self): + return self.scorer_worker.rank + + @property + def device(self): + return self.scorer_worker.device + + +def split_num_cache_blocks_evenly(scorer_cache_block_size_bytes: int, + proposer_cache_block_size_bytes: int, + total_num_gpu_blocks: int) -> int: + """Given total_num_gpu_blocks, the number of GPU blocks that could be + allocate to the target model, this function calculates how many blocks + should be given to the draft and target model. + + Note that usually the block size, in bytes, of each model is different, + as it's a function of number of KV/layer, number of heads, and hidden + dimension size. 
+ + Since the target and draft models allocate the same number of blocks, we + simply calculate the number of blocks where if allocated by both models, + the total memory usage from KV cache is no larger than the number of + blocks allocatable by the target model alone. + """ + new_num_gpu_blocks = int( + total_num_gpu_blocks * scorer_cache_block_size_bytes / + (proposer_cache_block_size_bytes + scorer_cache_block_size_bytes)) + + return new_num_gpu_blocks diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py new file mode 100644 index 0000000000000..2c5f954551905 --- /dev/null +++ b/vllm/spec_decode/util.py @@ -0,0 +1,99 @@ +import torch +from typing import List, Tuple +from vllm.sequence import SequenceGroupMetadata, SamplerOutput +from contextlib import contextmanager +from itertools import chain + +SeqId = int + + +def get_all_seq_ids( + seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[SeqId]: + """Given a list of SequenceGroupMetadata, create a list of all + sequence ids. + """ + return list( + chain.from_iterable([ + seq_group_metadata.seq_data.keys() + for seq_group_metadata in seq_group_metadata_list + ])) + + +def split_batch_by_proposal_len( + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_lens: List[int], select_proposal_len_zero: bool +) -> Tuple[List[SequenceGroupMetadata], List[int]]: + """Utility function that splits a batch based on whether the proposal len is + zero or not. We should remove this once vLLM supports per-sequence proposal + lens in a batch. + """ + + if select_proposal_len_zero: + predicate = lambda proposal_len: proposal_len == 0 + else: + predicate = lambda proposal_len: proposal_len != 0 + + indices = [ + i for i, (_, proposal_len + ) in enumerate(zip(seq_group_metadata_list, proposal_lens)) + if predicate(proposal_len) + ] + seq_groups = [ + seq_group for seq_group, proposal_len in zip( + seq_group_metadata_list, proposal_lens) if predicate(proposal_len) + ] + + return seq_groups, indices + + +def sampler_output_to_torch( + sampler_output_list: List[SamplerOutput], +) -> Tuple[torch.Tensor, torch.Tensor]: + """Utility function which converts a list of SamplerOutput to tensors. + + Returns: + sampled_token_ids: torch.Tensor + shape: [batch_size, len(sampler_output_list)] + + sampled_token_probs: torch.Tensor + shape: [batch_size, len(sampler_output_list), vocab_size] + """ + + # shape: [batch_size, num_sampler_output, vocab_size] + sampled_token_probs = torch.stack( + [ + sampler_output.sampled_token_probs + for sampler_output in sampler_output_list + ], + dim=0, + ).transpose(0, 1) + + # shape: [batch_size, num_sampler_output] + sampled_token_ids = torch.stack( + [ + sampler_output.sampled_token_ids.flatten() + for sampler_output in sampler_output_list + ], + dim=0, + ).transpose(0, 1) + + return sampled_token_ids, sampled_token_probs + + +@contextmanager +def nvtx_range(msg, *args, **kwargs): + """ + Context manager / decorator that pushes an NVTX range at the beginning + of its scope, and pops it at the end. If extra arguments are given, + they are passed as arguments to msg.format(). + + If running with cuda graphs, you must enable nsys cuda graph profiling. 
+ + Arguments: + msg (string): message to associate with the range + """ + torch.cuda.nvtx.range_push(msg.format(*args, **kwargs)) + try: + yield + finally: + torch.cuda.nvtx.range_pop() diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py index 5ea0d9122ef11..2c0e45623aa25 100644 --- a/vllm/transformers_utils/configs/mpt.py +++ b/vllm/transformers_utils/configs/mpt.py @@ -62,62 +62,6 @@ def __init__(self, fc_type: str = 'torch', verbose: Optional[int] = None, **kwargs: Any): - """The MPT configuration class. - Args: - d_model (int): The size of the embedding dimension of the model. - n_heads (int): The number of attention heads. - n_layers (int): The number of layers in the model. - expansion_ratio (int): The ratio of the up/down scale in the ffn. - max_seq_len (int): The maximum sequence length of the model. - vocab_size (int): The size of the vocabulary. - resid_pdrop (float): The dropout probability applied to the attention output before combining with residual. - emb_pdrop (float): The dropout probability for the embedding layer. - learned_pos_emb (bool): Whether to use learned positional embeddings - attn_config (Dict): A dictionary used to configure the model's attention module: - attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention - attn_pdrop (float): The dropout probability for the attention layers. - attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'. - qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer. - clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to - this value. - softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None, - use the default scale of ``1/sqrt(d_keys)``. - prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an - extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix - can attend to one another bi-directionally. Tokens outside the prefix use causal attention. - attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id. - When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates - which sub-sequence each token belongs to. - Defaults to ``False`` meaning any provided `sequence_id` will be ignored. - alibi (bool): Whether to use the alibi bias instead of position embeddings. - alibi_bias_max (int): The maximum value of the alibi bias. - kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads. - ffn_config (Dict): A dictionary used to configure the model's ffn module: - ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp - init_device (str): The device to use for parameter initialization. - logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value. - no_bias (bool): Whether to use bias in all layers. - verbose (int): The verbosity level. 0 is silent. - embedding_fraction (float): The fraction to scale the gradients of the embedding layer by. 
- norm_type (str): choose type of norm to use - use_cache (bool): Whether or not the model should return the last key/values attentions - init_config (Dict): A dictionary used to configure the model initialization: - init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_', - 'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or - 'xavier_normal_'. These mimic the parameter initialization methods in PyTorch. - init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True. - emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer. - emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution - used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``. - init_std (float): The standard deviation of the normal distribution used to initialize the model, - if using the baseline_ parameter initialization scheme. - init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes. - fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes. - init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes. - --- - See llmfoundry.models.utils.param_init_fns.py for info on other param init config options - fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs. - """ self.d_model = d_model self.n_heads = n_heads self.n_layers = n_layers @@ -139,8 +83,8 @@ def __init__(self, self.fc_type = fc_type if verbose is not None: warnings.warn(DeprecationWarning( - 'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.' - ), + 'verbose argument for MPTConfig is now ignored and ' + 'will be removed. Use python_log_level instead.'), stacklevel=2) if 'name' in kwargs: del kwargs['name'] @@ -149,7 +93,8 @@ def __init__(self, if self.attn_config.get('alibi', False): self.learned_pos_emb = False warnings.warn( - f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`', + f'alibi is turned on, setting `learned_pos_emb` ' + f'to {self.learned_pos_emb}`', stacklevel=2) super().__init__(**kwargs) self._validate_config() @@ -176,8 +121,8 @@ def _validate_config(self) -> None: [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop] )): raise ValueError( - "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1" # pylint: disable=line-too-long - ) + "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are " + "probabilities and must be between 0 and 1") if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']: raise ValueError( f"Unknown attn_impl={self.attn_config['attn_impl']}") @@ -193,17 +138,17 @@ def _validate_config(self) -> None: if self.attn_config['attn_uses_sequence_id'] and self.attn_config[ 'attn_impl'] not in ['torch', 'triton']: raise NotImplementedError( - 'attn_uses_sequence_id only implemented with torch and triton attention.' # pylint: disable=line-too-long - ) + 'attn_uses_sequence_id only implemented with torch ' + 'and triton attention.') if self.embedding_fraction > 1 or self.embedding_fraction <= 0: raise ValueError( - 'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!' 
# pylint: disable=line-too-long - ) + 'model.embedding_fraction must be between 0 (exclusive) ' + 'and 1 (inclusive)!') if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model': raise ValueError( - f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'." # pylint: disable=line-too-long - ) + f"self.logit_scale={self.logit_scale!r} is not recognized as " + "an option; use numeric value or 'inv_sqrt_d_model'.") if self.init_config.get('name', None) is None: raise ValueError( f"self.init_config={self.init_config!r} 'name' needs to be set." @@ -219,11 +164,11 @@ def _validate_config(self) -> None: del te except Exception as exc: raise ImportError( - # pylint: disable=line-too-long - 'TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. ' - + - 'The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\n' - + 'pip install flash-attn==1.0.6 --no-build-isolation \n' + + 'TransformerEngine import fail. `fc_type: te` requires ' + 'TransformerEngine be installed. ' + 'The required version of transformer_engine also requires ' + 'FlashAttention v1.0.6 is installed:\n' + 'pip install flash-attn==1.0.6 --no-build-isolation \n' 'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156' ) from exc if self.ffn_config['ffn_type'] == 'mptmlp': diff --git a/vllm/transformers_utils/configs/starcoder2.py b/vllm/transformers_utils/configs/starcoder2.py index 4c3b6b8def074..2879cd0445275 100644 --- a/vllm/transformers_utils/configs/starcoder2.py +++ b/vllm/transformers_utils/configs/starcoder2.py @@ -2,78 +2,6 @@ class Starcoder2Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a - Starcoder2 model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the [bigcode/starcoder2-7b_16k](https://huggingface.co/bigcode/starcoder2-7b_16k) model. - - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 49152): - Vocabulary size of the Starcoder2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Starcoder2Model`] - hidden_size (`int`, *optional*, defaults to 3072): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 12288): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 30): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 24): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 2): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. 
For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 4096): - The maximum sequence length that this model might ever be used with. Starcoder2's sliding window attention - allows sequence of up to 4096*32 tokens. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - norm_epsilon (`float`, *optional*, defaults to 1e-05): - Epsilon value for the layer norm - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - bos_token_id (`int`, *optional*, defaults to 50256): - The id of the "beginning-of-sequence" token. - eos_token_id (`int`, *optional*, defaults to 50256): - The id of the "end-of-sequence" token. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - sliding_window (`int`, *optional*): - Sliding window attention window size. If not specified, will default to `None` (no sliding window). - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - residual_dropout (`float`, *optional*, defaults to 0.0): - Residual connection dropout value. - embedding_dropout (`float`, *optional*, defaults to 0.0): - Embedding dropout. - use_bias (`bool`, *optional*, defaults to `True`): - Whether to use bias term on linear layers of the model. 
- - - ```python - >>> from transformers import Starcoder2Model, Starcoder2Config - - >>> # Initializing a Starcoder2 7B style configuration - >>> configuration = Starcoder2Config() - - >>> # Initializing a model from the Starcoder2 7B style configuration - >>> model = Starcoder2Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "starcoder2" keys_to_ignore_at_inference = ["past_key_values"] diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 6edc225cdfc80..2600ea2642da2 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -120,7 +120,8 @@ async def encode_async( def get_lora_tokenizer( self, - lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer": + lora_request: Optional[LoRARequest] = None + ) -> "PreTrainedTokenizer": if not lora_request or not self.enable_lora: return self.tokenizer if lora_request.lora_int_id not in self.lora_tokenizers: @@ -133,7 +134,8 @@ def get_lora_tokenizer( async def get_lora_tokenizer_async( self, - lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer": + lora_request: Optional[LoRARequest] = None + ) -> "PreTrainedTokenizer": if not lora_request or not self.enable_lora: return self.tokenizer if lora_request.lora_int_id not in self.lora_tokenizers: diff --git a/vllm/transformers_utils/tokenizers/baichuan.py b/vllm/transformers_utils/tokenizers/baichuan.py index 1dd241e4a5c4b..02045bdcb2ccf 100644 --- a/vllm/transformers_utils/tokenizers/baichuan.py +++ b/vllm/transformers_utils/tokenizers/baichuan.py @@ -1,4 +1,3 @@ -# yapf: disable # Adapted from # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py # This includes a fix suggested in @@ -13,7 +12,6 @@ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer from transformers.utils import logging - logger = logging.get_logger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} @@ -52,27 +50,16 @@ def __init__( clean_up_tokenization_spaces=False, **kwargs, ): - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - bos_token = ( - AddedToken(bos_token, lstrip=False, rstrip=False) - if isinstance(bos_token, str) - else bos_token - ) - eos_token = ( - AddedToken(eos_token, lstrip=False, rstrip=False) - if isinstance(eos_token, str) - else eos_token - ) - unk_token = ( - AddedToken(unk_token, lstrip=False, rstrip=False) - if isinstance(unk_token, str) - else unk_token - ) - pad_token = ( - AddedToken(pad_token, lstrip=False, rstrip=False) - if isinstance(pad_token, str) - else pad_token - ) + self.sp_model_kwargs = ({} if sp_model_kwargs is None else + sp_model_kwargs) + bos_token = (AddedToken(bos_token, lstrip=False, rstrip=False) + if isinstance(bos_token, str) else bos_token) + eos_token = (AddedToken(eos_token, lstrip=False, rstrip=False) + if isinstance(eos_token, str) else eos_token) + unk_token = (AddedToken(unk_token, lstrip=False, rstrip=False) + if isinstance(unk_token, str) else unk_token) + pad_token = (AddedToken(pad_token, lstrip=False, rstrip=False) + if isinstance(pad_token, str) else pad_token) self.vocab_file = vocab_file self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token @@ -107,7 +94,10 @@ def vocab_size(self): def get_vocab(self): """Returns vocab as a dict""" - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab = { + self.convert_ids_to_tokens(i): 
i + for i in range(self.vocab_size) + } vocab.update(self.added_tokens_encoder) return vocab @@ -130,7 +120,8 @@ def convert_tokens_to_string(self, tokens): out_string = "" prev_is_special = False for i, token in enumerate(tokens): - # make sure that special tokens are not decoded using sentencepiece model + # make sure that special tokens are not decoded using + # sentencepiece model if token in self.all_special_tokens: if not prev_is_special and i != 0: out_string += " " @@ -143,9 +134,9 @@ def convert_tokens_to_string(self, tokens): out_string += self.sp_model.decode(current_sub_tokens) return out_string - def save_vocabulary( - self, save_directory, filename_prefix: Optional[str] = None - ) -> Tuple[str]: + def save_vocabulary(self, + save_directory, + filename_prefix: Optional[str] = None) -> Tuple[str]: """ Save the vocabulary and special tokens file to a directory. @@ -157,24 +148,24 @@ def save_vocabulary( `Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") + logger.error(f"Vocabulary path ({save_directory}) " + "should be a directory") return out_vocab_file = os.path.join( save_directory, - (filename_prefix + "-" if filename_prefix else "") - + VOCAB_FILES_NAMES["vocab_file"], + (filename_prefix + "-" if filename_prefix else "") + + VOCAB_FILES_NAMES["vocab_file"], ) if os.path.abspath(self.vocab_file) != os.path.abspath( - out_vocab_file - ) and os.path.isfile(self.vocab_file): + out_vocab_file) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: content_spiece_model = self.sp_model.serialized_model_proto() fi.write(content_spiece_model) - return (out_vocab_file,) + return (out_vocab_file, ) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): bos_token_id = [self.bos_token_id] if self.add_bos_token else [] @@ -194,7 +185,8 @@ def get_special_tokens_mask( already_has_special_tokens: bool = False, ) -> List[int]: """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + Retrieve sequence ids from a token list that has no special tokens + added. This method is called when adding special tokens using the tokenizer `prepare_for_model` method. Args: @@ -202,11 +194,14 @@ def get_special_tokens_mask( List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. + already_has_special_tokens (`bool`, *optional*, defaults to + `False`): + Whether or not the token list is already formatted with + special tokens for the model. Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + `List[int]`: A list of integers in the range [0, 1]: + 1 for a special token, 0 for a sequence token. 
""" if already_has_special_tokens: return super().get_special_tokens_mask( @@ -220,20 +215,16 @@ def get_special_tokens_mask( if token_ids_1 is None: return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id - return ( - bos_token_id - + ([0] * len(token_ids_0)) - + eos_token_id - + bos_token_id - + ([0] * len(token_ids_1)) - + eos_token_id - ) + return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + + bos_token_id + ([0] * len(token_ids_1)) + eos_token_id) def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: """ - Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT + Creates a mask from the two sequences passed to be used in a + sequence-pair classification task. An ALBERT sequence pair mask has the following format: ``` @@ -250,7 +241,8 @@ def create_token_type_ids_from_sequences( Optional second list of IDs for sequence pairs. Returns: - `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + `List[int]`: List of [token type IDs](../glossary#token-type-ids) + according to the given sequence(s). """ bos_token_id = [self.bos_token_id] if self.add_bos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else [] diff --git a/vllm/utils.py b/vllm/utils.py index 9cdf623379516..fe6fd27962cd3 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -3,6 +3,7 @@ import socket import subprocess import uuid +import gc from platform import uname from typing import List, Tuple, Union from packaging.version import parse, Version @@ -132,9 +133,10 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int: # the Neuron-X backend does not have the `cuda_utils` module. from vllm._C import cuda_utils - max_shared_mem = cuda_utils.get_max_shared_memory_per_block_device_attribute( - gpu) - # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py will fail + max_shared_mem = ( + cuda_utils.get_max_shared_memory_per_block_device_attribute(gpu)) + # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py + # will fail assert max_shared_mem > 0, "max_shared_mem can not be zero" return int(max_shared_mem) @@ -208,9 +210,8 @@ def get_nvcc_cuda_version() -> Optional[Version]: if not cuda_home: cuda_home = '/usr/local/cuda' if os.path.isfile(cuda_home + '/bin/nvcc'): - logger.info( - f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.' - ) + logger.info(f'CUDA_HOME is not found in the environment. ' + f'Using {cuda_home} as CUDA_HOME.') else: logger.warning( f'Not found nvcc in {cuda_home}. Skip cuda version check!') @@ -309,3 +310,27 @@ def create_kv_caches_with_random( f"Does not support value cache of type {cache_dtype}") value_caches.append(value_cache) return key_caches, value_caches + + +class measure_cuda_memory: + + def __init__(self, device=None): + self.device = device + + def current_memory_usage(self) -> float: + # Return the memory usage in bytes. 
+ torch.cuda.reset_peak_memory_stats(self.device) + mem = torch.cuda.max_memory_allocated(self.device) + return mem + + def __enter__(self): + self.initial_memory = self.current_memory_usage() + # This allows us to call methods of the context manager if needed + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.final_memory = self.current_memory_usage() + self.consumed_memory = self.final_memory - self.initial_memory + + # Force garbage collection + gc.collect() diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index aff8ebc903623..7eac576e3f0fe 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -21,7 +21,7 @@ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest -from vllm.utils import in_wsl +from vllm.utils import in_wsl, measure_cuda_memory logger = init_logger(__name__) @@ -85,18 +85,21 @@ def __init__( self.model_config.enforce_eager = True def load_model(self) -> None: - self.model = get_model(self.model_config, - self.device_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config) + with measure_cuda_memory() as m: + self.model = get_model(self.model_config, + self.device_config, + lora_config=self.lora_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config) - vocab_size = self.model.config.vocab_size + self.model_memory_usage = m.consumed_memory + logger.info(f"Loading model weights took " + f"{self.model_memory_usage / float(2**30):.4f} GB") if self.lora_config: - assert hasattr( - self.model, "supported_lora_modules" - ) and self.model.supported_lora_modules, "Model does not support LoRA" + assert hasattr(self.model, "supported_lora_modules" + ) and self.model.supported_lora_modules, ( + "Model does not support LoRA") assert hasattr( self.model, "embedding_modules"), "Model does not have embedding_modules" @@ -105,7 +108,7 @@ def load_model(self) -> None: self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens + - self.scheduler_config.max_paddings, vocab_size, + self.scheduler_config.max_paddings, self.vocab_size, self.lora_config, self.device, self.model.embedding_modules, self.model.embedding_padding_modules) self.model = self.lora_manager.create_lora_manager(self.model) @@ -209,6 +212,7 @@ def _prepare_prompt( slot_mapping[-1].append(slot) max_prompt_len = max(subquery_lens) + assert max_prompt_len > 0 input_tokens = _make_tensor_with_pad(input_tokens, max_prompt_len, pad=0, @@ -600,8 +604,7 @@ def execute_model( @torch.inference_mode() def profile_run(self) -> None: # Enable top-k sampling to reflect the accurate memory usage. 
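+        # Using a top_k close to the full vocab size makes the profiling run
+        # exercise the sampler's largest top-k buffers.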
- vocab_size = self.model_config.get_vocab_size() - sampling_params = SamplingParams(top_p=0.99, top_k=vocab_size - 1) + sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens max_num_seqs = self.scheduler_config.max_num_seqs @@ -767,6 +770,10 @@ def __del__(self) -> None: self.graph_runners.clear() self.cupy_nccl_backend = None + @property + def vocab_size(self) -> int: + return self.model_config.get_vocab_size() + class CUDAGraphRunner: diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 3229a21c11a38..340c079600c78 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -79,7 +79,8 @@ def profile_num_available_blocks( cpu_swap_space: int = 0, cache_dtype: str = "float16", ) -> Tuple[int, int]: - """Simply returns max_num_seqs as num_gpu_blocks, 0 as num_cpu_blocks.""" + """Simply returns max_num_seqs as num_gpu_blocks, 0 as + num_cpu_blocks.""" num_gpu_blocks = self.scheduler_config.max_num_seqs num_cpu_blocks = 0 return num_gpu_blocks, num_cpu_blocks @@ -177,7 +178,8 @@ def _init_distributed_environment( "distributed_init_method must be set if torch.distributed " "is not already initialized") else: - distributed_backend = distributed_backend if distributed_backend else "nccl" + distributed_backend = (distributed_backend + if distributed_backend else "nccl") torch.distributed.init_process_group( backend=distributed_backend, world_size=parallel_config.world_size, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 9df518d155ec2..0dcd4018afa5f 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -19,7 +19,6 @@ from vllm.worker.cache_engine import CacheEngine from vllm.worker.model_runner import ModelRunner from vllm.lora.request import LoRARequest -from vllm.utils import is_hip class Worker: @@ -131,8 +130,8 @@ def profile_num_available_blocks( # GPU did not change their memory usage during the profiling. peak_memory = self.init_gpu_memory - free_gpu_memory - cache_block_size = CacheEngine.get_cache_block_size( - block_size, cache_dtype, self.model_config, self.parallel_config) + cache_block_size = self.get_cache_block_size_bytes( + block_size, cache_dtype) num_gpu_blocks = int( (total_gpu_memory * gpu_memory_utilization - peak_memory) // cache_block_size) @@ -233,6 +232,22 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> Set[int]: return self.model_runner.list_loras() + @property + def max_model_len(self) -> int: + return self.model_config.max_model_len + + @property + def vocab_size(self) -> int: + return self.model_runner.vocab_size + + def get_cache_block_size_bytes(self, block_size: int, + cache_dtype: str) -> int: + """Get the size of the KV cache block size in bytes. + """ + return CacheEngine.get_cache_block_size(block_size, cache_dtype, + self.model_config, + self.parallel_config) + def init_distributed_environment( parallel_config: ParallelConfig, @@ -267,8 +282,7 @@ def init_distributed_environment( "cupy.distributed is already initialized but the cupy world " "size does not match parallel_config.world_size " f"({cupy_world_size} vs. {parallel_config.world_size}).") - elif (parallel_config.world_size > 1 and cupy_port is not None - and not is_hip()): + elif (parallel_config.world_size > 1 and cupy_port is not None): # NOTE(woosuk): We don't initialize CuPy process group when world size # is 1. # TODO(woosuk): Support multi-node connection.
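For readers skimming the diff, the padding/scatter pattern used by _merge_outputs earlier in this patch can be illustrated with a few lines of plain PyTorch. The names below mirror the diff; the batch size, proposal length, and vocab size are arbitrary toy values, not taken from the PR.

import torch

# Toy dimensions; only rows 0 and 2 of the batch produced draft tokens.
batch_size, k, vocab_size = 4, 3, 8
nonzero_proposal_len_indices = torch.tensor([0, 2])
proposal_tokens = torch.randint(vocab_size, (2, k))
proposal_probs = torch.rand(2, k, vocab_size)

# Skipped sequences get -1 tokens, zero probabilities, and proposal_len 0.
entire_tokens = torch.full((batch_size, k), -1, dtype=torch.long)
entire_probs = torch.zeros(batch_size, k, vocab_size)
entire_tokens[nonzero_proposal_len_indices] = proposal_tokens
entire_probs[nonzero_proposal_len_indices] = proposal_probs
proposal_lens = torch.zeros(batch_size, dtype=torch.long)
proposal_lens[nonzero_proposal_len_indices] = k

print(entire_tokens)   # rows 1 and 3 are all -1: no speculation for them
print(proposal_lens)   # tensor([3, 0, 3, 0])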
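Similarly, the block-split formula in split_num_cache_blocks_evenly can be sanity-checked in isolation. This is a minimal sketch: the function restates the one-line formula from the diff so the snippet runs without a vLLM install, and the byte sizes are made-up illustrative values.

def split_num_cache_blocks_evenly(scorer_block_bytes: int,
                                  proposer_block_bytes: int,
                                  total_num_gpu_blocks: int) -> int:
    # Same formula as in the diff: both models get the same block count, and
    # together they use no more memory than the scorer could have used alone.
    return int(total_num_gpu_blocks * scorer_block_bytes /
               (proposer_block_bytes + scorer_block_bytes))


# Illustrative numbers: target (scorer) blocks are 4x the size of draft blocks.
scorer_bytes, proposer_bytes, total_blocks = 4 * 2**20, 1 * 2**20, 1000
shared = split_num_cache_blocks_evenly(scorer_bytes, proposer_bytes,
                                       total_blocks)
print(shared)  # 800
assert shared * (scorer_bytes + proposer_bytes) <= total_blocks * scorer_bytes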