
[WIP][1/N] Chunked Prefill #3106

Closed · wants to merge 38 commits

Changes from 1 commit (of 38 total)
06fe872
[1/n] Support efficient reshape caching.
rkooo567 Feb 28, 2024
9a0b6be
[2/n] support flash attention kernel
rkooo567 Feb 28, 2024
6947167
oss flash attention works
rkooo567 Feb 28, 2024
4769a26
in progress
rkooo567 Feb 28, 2024
963db44
flash attn enabled.
rkooo567 Feb 29, 2024
2b9c36b
ip
rkooo567 Feb 29, 2024
2c1bb6c
support every model
rkooo567 Feb 29, 2024
2bb5e62
Fixed broken tests.
rkooo567 Feb 29, 2024
78bb887
ip
rkooo567 Feb 29, 2024
74ac900
seems to work.
rkooo567 Mar 1, 2024
71bdada
.
rkooo567 Mar 1, 2024
d4c3b5d
ip?
rkooo567 Mar 1, 2024
baef7c6
block tables updated correctly
rkooo567 Mar 1, 2024
a12ec68
hopefully tests pass
rkooo567 Mar 1, 2024
0d8785f
Merge branch 'main' into chunked-prefill-3
rkooo567 Mar 3, 2024
08c8541
.
rkooo567 Mar 3, 2024
3bac9af
ip
rkooo567 Mar 3, 2024
31aa920
ip
rkooo567 Mar 4, 2024
2049b35
.
rkooo567 Mar 4, 2024
ef679d7
.
rkooo567 Mar 4, 2024
71bda97
.
rkooo567 Mar 4, 2024
4e00e7f
done?
rkooo567 Mar 4, 2024
7fd70f2
Merge branch 'main' into chunked-prefill-3
rkooo567 Mar 5, 2024
9177d54
Merge branch 'main' into chunked-prefill-3
rkooo567 Mar 6, 2024
c0384a4
Refactor 2d query to 1d query
rkooo567 Mar 6, 2024
6032edf
.,
rkooo567 Mar 6, 2024
c1ab0b0
done
rkooo567 Mar 6, 2024
f48dc72
Addressed code review.
rkooo567 Mar 7, 2024
769b2b4
working
rkooo567 Mar 7, 2024
4a20f4a
Merge branch 'main' into 1dquery
rkooo567 Mar 7, 2024
f7347b8
working
rkooo567 Mar 7, 2024
d931725
Merge branch 'main' into 1dquery
rkooo567 Mar 7, 2024
f91d73e
fix lora
rkooo567 Mar 8, 2024
f7d79da
fixed
rkooo567 Mar 8, 2024
851c018
Merge branch 'main' into 1dquery
rkooo567 Mar 8, 2024
406f1d4
fix
rkooo567 Mar 8, 2024
9442e8f
Merge branch 'main' into chunked-prefill-3
rkooo567 Mar 8, 2024
3da31eb
Merge branch '1dquery' into chunked-prefill-3
rkooo567 Mar 8, 2024
in progress
rkooo567 committed Feb 28, 2024
commit 4769a2636392d4ac1f25b2af758d008de4533f88
4 changes: 4 additions & 0 deletions .buildkite/test-pipeline.yaml
@@ -43,6 +43,10 @@ steps:
commands:
- pytest -v -s prefix_caching

- label: Chunked Prefill Test
commands:
- pytest -v -s chunked_prefill

- label: Samplers Test
command: pytest -v -s samplers --forked

40 changes: 37 additions & 3 deletions benchmarks/benchmark_latency.py
@@ -10,6 +10,13 @@

from vllm import LLM, SamplingParams

SAMPLE_PROMPTS = [
"The president of the United States is",
"Hello, my name is",
"The capital of France is",
"The future of AI is",
]


def main(args: argparse.Namespace):
print(args)
@@ -57,10 +64,24 @@ def run_to_completion(profile_dir: Optional[str] = None):
print(p.key_averages())
else:
start_time = time.perf_counter()
llm.generate(prompt_token_ids=dummy_prompt_token_ids,
sampling_params=sampling_params,
use_tqdm=False)
if args.use_sample:
batch = (
SAMPLE_PROMPTS *
(args.batch_size // len(SAMPLE_PROMPTS) + 1))[:args.batch_size]
outputs = llm.generate(prompts=batch,
sampling_params=sampling_params,
use_tqdm=False)
else:
outputs = llm.generate(prompt_token_ids=dummy_prompt_token_ids,
sampling_params=sampling_params,
use_tqdm=False)
end_time = time.perf_counter()
if args.verbose:
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(
f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
latency = end_time - start_time
return latency

@@ -145,5 +166,18 @@ def run_to_completion(profile_dir: Optional[str] = None):
default="cuda",
choices=["cuda"],
help='device type for vLLM execution, supporting CUDA only currently.')
parser.add_argument('--flash-style',
action='store_true',
help='enable flash attention')
parser.add_argument('--block-size',
type=int,
default=16,
help='block size of key/value cache')
parser.add_argument('--use-sample',
action='store_true',
help='use sample input instead of dummy input')
parser.add_argument('--verbose',
action='store_true',
help='print generated text')
args = parser.parse_args()
main(args)
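To make the new `--use-sample` path above easier to follow, here is a small standalone sketch of the prompt-tiling expression: `SAMPLE_PROMPTS` is repeated until it covers the requested batch size and then truncated. The `build_sample_batch` helper name is illustrative only and not part of this diff.

```python
SAMPLE_PROMPTS = [
    "The president of the United States is",
    "Hello, my name is",
    "The capital of France is",
    "The future of AI is",
]


def build_sample_batch(batch_size: int) -> list:
    # Repeat the sample prompts enough times to cover batch_size, then truncate.
    repeats = batch_size // len(SAMPLE_PROMPTS) + 1
    return (SAMPLE_PROMPTS * repeats)[:batch_size]


assert build_sample_batch(6) == SAMPLE_PROMPTS + SAMPLE_PROMPTS[:2]
assert len(build_sample_batch(1)) == 1
```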
9 changes: 1 addition & 8 deletions csrc/cache_kernels.cu
@@ -278,17 +278,12 @@ __global__ void reshape_and_cache_flash_kernel(
scalar_t* __restrict__ key_cache, // [num_blocks, block_size, num_heads, head_size]
scalar_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads, head_size]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int64_t* __restrict__ num_tokens, // [1]
const int key_stride,
const int value_stride,
const int num_heads,
const int head_size,
const int block_size) {
const int64_t num_tokens_ = num_tokens[0];
const int64_t token_idx = blockIdx.x;
if (token_idx >= num_tokens_) {
return;
}
const int64_t slot_idx = slot_mapping[token_idx];
const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size;
@@ -323,8 +318,7 @@ void reshape_and_cache_flash(
torch::Tensor& value, // [num_tokens, num_heads, head_size]
torch::Tensor& key_cache, // [num_blocks, block_size, num_heads, head_size]
torch::Tensor& value_cache, // [num_blocks, block_size, num_heads, head_size]
torch::Tensor& slot_mapping, // [num_tokens]
torch::Tensor& num_tokens) // [1]
torch::Tensor& slot_mapping) // [num_tokens]
{
int num_tokens_padded = key.size(0);
int num_heads = key.size(1);
@@ -347,7 +341,6 @@
key_cache.data_ptr<scalar_t>(),
value_cache.data_ptr<scalar_t>(),
slot_mapping.data_ptr<int64_t>(),
num_tokens.data_ptr<int64_t>(),
key_stride,
value_stride,
num_heads,
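For reference, a hedged sketch of the Python-side call after this change: the kernel now takes the token count from `key.size(0)`, so no separate `num_tokens` tensor is passed. The `vllm._C` import path and the concrete shapes below are assumptions based on the comments in the kernel signature and on `tests/kernels/test_cache.py`, and the snippet needs a CUDA device with the vLLM extension built.

```python
import torch

from vllm._C import cache_ops  # assumed extension module path for the cache kernels

num_tokens, num_heads, head_size = 16, 8, 64
num_blocks, block_size = 4, 256

# Shapes follow the comments in the kernel signature above.
key = torch.randn(num_tokens, num_heads, head_size, dtype=torch.half, device="cuda")
value = torch.randn_like(key)
key_cache = torch.zeros(num_blocks, block_size, num_heads, head_size,
                        dtype=torch.half, device="cuda")
value_cache = torch.zeros_like(key_cache)
slot_mapping = torch.arange(num_tokens, dtype=torch.int64, device="cuda")

# num_tokens is now inferred from key.size(0); only the tensors are passed.
cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache, slot_mapping)
```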
1 change: 1 addition & 0 deletions requirements.txt
@@ -13,3 +13,4 @@ prometheus_client >= 0.18.0
pynvml == 11.5.0
triton >= 2.1.0
cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead.
flash-attn >= 2.5.0 # Required for chunked prefill.
82 changes: 82 additions & 0 deletions tests/chunked_prefill/test_correctness.py
@@ -0,0 +1,82 @@
import gc

from typing import List

import pytest
import torch

from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel

MODELS = [
"JackFram/llama-68m",
]

# SANG-TODO Read it from example.txt
TEST_PROMPTS = [
# pylint: disable=line-too-long
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
"Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
"Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
# Output differs between paged attention and flash attention.
# "Describe the basic components of a neural network and how it can be trained.",
"Write a short story about a robot that dreams for the first time.",
"Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.",
"Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.",
"Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'",
]


# TODO(sang): Add chunked prefill parameters.
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
def test_models(
vllm_runner,
model: str,
dtype: str,
max_tokens: int,
) -> None:
""" verify the flash attention has the same output
as page attention """
print("loading page attention models..")
pg_model = vllm_runner(model, dtype=dtype)
expected_outputs = []

print("generating tokens...")
expected_outputs.extend(pg_model.generate_greedy(TEST_PROMPTS, max_tokens))
print("generating tokens finished")

del pg_model

destroy_model_parallel()
gc.collect()
torch.cuda.empty_cache()

flash_attn_model = vllm_runner(
model,
dtype=dtype,
enable_cuda_graph=False,
flash_style=True,
)
flash_attn_outputs_by_batch = []
for i in range(10):
prompts = [TEST_PROMPTS[j % len(TEST_PROMPTS)] for j in range(i)]
flash_attn_outputs_by_batch.append(
flash_attn_model.generate_greedy(prompts, max_tokens))

del flash_attn_model

destroy_model_parallel()
gc.collect()
torch.cuda.empty_cache()

for flash_attn_outputs in flash_attn_outputs_by_batch:
for i in range(len(flash_attn_outputs)):
fa_output_ids, fa_output_str = flash_attn_outputs[i]
vllm_output_ids, vllm_output_str = expected_outputs[
i % len(expected_outputs)]
assert fa_output_ids == vllm_output_ids, (
f"Test {i}:\nflash ids: {fa_output_ids}\nvLLM ids: {vllm_output_ids}\n"
f"Test {i}:\nflash output: {fa_output_str!r}\nvLLM output: {vllm_output_str!r}"
)
3 changes: 3 additions & 0 deletions tests/conftest.py
@@ -165,6 +165,7 @@ def __init__(
dtype: str = "half",
disable_log_stats: bool = True,
tensor_parallel_size: int = 1,
flash_style: bool = False,
**kwargs,
) -> None:
self.model = LLM(
@@ -175,6 +176,8 @@ def __init__(
swap_space=0,
disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size,
flash_style=flash_style,
block_size=32,
**kwargs,
)

2 changes: 1 addition & 1 deletion tests/kernels/test_cache.py
@@ -294,7 +294,7 @@ def pad_key_value(key: torch.Tensor, value: torch.Tensor,
padded_key, padded_value = pad_key_value(key, value, padding)
# Call the reshape_and_cache kernel.
cache_ops.reshape_and_cache_flash(padded_key, padded_value, key_cache,
value_cache, slot_mapping, num_tokens)
value_cache, slot_mapping)

# Run the reference implementation.
block_indicies = torch.div(slot_mapping, block_size, rounding_mode='floor')
1 change: 0 additions & 1 deletion tests/kernels/test_flash_attention.py
@@ -221,7 +221,6 @@ def test_flash_paged_attention(
scale,
padded_block_table,
padded_context_lens,
block_size,
alibi_slopes,
)

14 changes: 14 additions & 0 deletions vllm/config.py
@@ -60,6 +60,7 @@ class ModelConfig:
max_context_len_to_capture: Maximum context len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
to eager mode.
flash_style: Enable flash-style paged attention.
"""

def __init__(
@@ -79,6 +80,7 @@ def __init__(
quantization: Optional[str] = None,
enforce_eager: bool = False,
max_context_len_to_capture: Optional[int] = None,
flash_style: bool = False,
) -> None:
self.model = model
self.tokenizer = tokenizer
@@ -93,6 +95,7 @@ def __init__(
self.quantization = quantization
self.enforce_eager = enforce_eager
self.max_context_len_to_capture = max_context_len_to_capture
self.flash_style = flash_style

if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true":
# download model from ModelScope hub,
@@ -295,12 +298,14 @@ def __init__(
swap_space: int,
cache_dtype: str,
sliding_window: Optional[int] = None,
flash_style: bool = False,
) -> None:
self.block_size = block_size
self.gpu_memory_utilization = gpu_memory_utilization
self.swap_space_bytes = swap_space * _GB
self.cache_dtype = cache_dtype
self.sliding_window = sliding_window
self.flash_style = flash_style
self._verify_args()
self._verify_cache_dtype()

@@ -314,6 +319,15 @@ def _verify_args(self) -> None:
"GPU memory utilization must be less than 1.0. Got "
f"{self.gpu_memory_utilization}.")

if self.flash_style:
logger.info("Flash attention enabled.")
if self.block_size < 256:
# Flash-style attention only supports block size >= 256 for now.
# https://github.com/Dao-AILab/flash-attention/pull/824 will fix it.
raise ValueError(
"Flash-style attention only supports block size >= 256. Got "
f"{self.block_size}.")

def _verify_cache_dtype(self) -> None:
if self.cache_dtype == "auto":
pass
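A hedged sketch of what the new validation implies for callers: with `flash_style=True`, the block size must be at least 256 until the upstream flash-attention fix lands. The constructor argument order follows this diff; the concrete values are illustrative only.

```python
from vllm.config import CacheConfig

cache_config = CacheConfig(
    block_size=256,               # anything smaller raises ValueError when flash_style=True
    gpu_memory_utilization=0.9,
    swap_space=4,                 # GiB of CPU swap space
    cache_dtype="auto",
    flash_style=True,
)
```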
9 changes: 7 additions & 2 deletions vllm/engine/arg_utils.py
@@ -45,6 +45,7 @@ class EngineArgs:
lora_dtype = 'auto'
max_cpu_loras: Optional[int] = None
device: str = 'cuda'
flash_style: bool = False

def __post_init__(self):
if self.tokenizer is None:
@@ -271,6 +272,9 @@ def add_cli_args(
choices=["cuda"],
help=('Device type for vLLM execution. '
'Currently, only CUDA-compatible devices are supported.'))
parser.add_argument('--flash-style',
action='store_true',
help='Use flash-style attention.')
return parser

@classmethod
@@ -291,11 +295,12 @@ def create_engine_configs(
self.trust_remote_code, self.download_dir, self.load_format,
self.dtype, self.seed, self.revision, self.code_revision,
self.tokenizer_revision, self.max_model_len, self.quantization,
self.enforce_eager, self.max_context_len_to_capture)
self.enforce_eager, self.max_context_len_to_capture, self.flash_style)
cache_config = CacheConfig(self.block_size,
self.gpu_memory_utilization,
self.swap_space, self.kv_cache_dtype,
model_config.get_sliding_window())
model_config.get_sliding_window(),
self.flash_style)
parallel_config = ParallelConfig(self.pipeline_parallel_size,
self.tensor_parallel_size,
self.worker_use_ray,
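A hedged sketch of how the new flag is threaded from `EngineArgs` into both configs. The tuple unpacking assumes `create_engine_configs` returns the model and cache configs first, as shown in this diff; the model name is just an example.

```python
from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(model="JackFram/llama-68m",
                         flash_style=True,
                         block_size=256)  # flash-style requires block_size >= 256
model_config, cache_config, *_ = engine_args.create_engine_configs()
assert model_config.flash_style and cache_config.flash_style
```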
46 changes: 46 additions & 0 deletions vllm/model_executor/input_metadata.py
@@ -13,6 +13,10 @@ class InputMetadata:
context_lens: the length of attention context for each sequence.
block_tables: The block tables. (Seq id -> list of physical block)
kv_cache_dtype: Data type to store kv cache.
num_prompt_tokens: The number of tokens in the prompts. This might
include padding.
num_generation_tokens: The number of tokens in the generation sequences.
This might include padding.
"""

def __init__(
@@ -27,6 +31,9 @@ def __init__(
block_tables: Optional[torch.Tensor],
use_cuda_graph: bool,
kv_cache_dtype: str,
# SANG-TODO
# num_prompt_tokens: int,
# num_generation_tokens: int,
) -> None:
self.is_prompt = is_prompt
self.prompt_lens = prompt_lens
@@ -43,6 +50,45 @@ def __init__(
# FIXME(woosuk): This is a hack.
self.attn_bias = None

# SANG-TODO
# # Prompt related metadata
# # This value might include padding if CudaGraph is enabled.
# self.num_prompts = len(prompt_lens)
# # This value is the source of truth.
# self.num_prompts_tensor = torch.cuda.IntTensor([self.num_prompts])
# # This value might include padding if CudaGraph is enabled.
# self.num_prompt_tokens = num_prompt_tokens
# self.prompt_lens_tensor = torch.cuda.IntTensor(self.prompt_lens)
# self.max_prompt_len = max(prompt_lens) if prompt_lens else 0

# # Cumulative prompt lengths for each prompt in the input
# # tensor.
# self.cum_prompt_query_lens = torch.zeros(
# self.num_prompts + 1,
# device=self.prompt_lens_tensor.device,
# dtype=torch.int32)
# # Cumulative context lengths.
# self.cum_prompt_context_lens = torch.zeros(
# self.num_prompts + 1,
# device=self.prompt_lens_tensor.device,
# dtype=torch.int32)

# torch.cumsum(self.prompt_lens_tensor,
# dim=0,
# dtype=self.cum_prompt_query_lens.dtype,
# out=self.cum_prompt_query_lens[1:])

# # TODO: this will be different once we support chunked prefills.
# self.cum_prompt_context_lens = self.cum_prompt_query_lens
# self.max_context_len = max(self.max_context_len, self.max_prompt_len)

# # Generation related metadata
# # This value might include padding if CudaGraph is enabled.
# self.num_generation_tokens = num_generation_tokens
# # This is the source of truth for the number of generation tokens.
# self.num_generation_tokens_tensor = torch.cuda.IntTensor(
# [num_generation_tokens])

def __repr__(self) -> str:
return ("InputMetadata("
f"is_prompt={self.is_prompt}, "
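For context, a hedged sketch of the cumulative-length bookkeeping the commented-out block above is preparing for: the prefix sums give each prompt's start offset in a flattened 1-D token tensor, which is the layout flash-attention's variable-length kernels consume. The example values are illustrative only.

```python
import torch

prompt_lens = torch.tensor([5, 3, 7], dtype=torch.int32)

# cu_prompt_lens[i] is the start offset of prompt i in the flattened token tensor.
cu_prompt_lens = torch.zeros(len(prompt_lens) + 1, dtype=torch.int32)
torch.cumsum(prompt_lens, dim=0, dtype=torch.int32, out=cu_prompt_lens[1:])

print(cu_prompt_lens)  # tensor([ 0,  5,  8, 15], dtype=torch.int32)
```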