Change default values for decode bucket flags (vllm-project#316)
iboiko-habana authored Sep 25, 2024
1 parent cef2f54 commit 45ee586
Showing 3 changed files with 17 additions and 18 deletions.
12 changes: 6 additions & 6 deletions README_GAUDI.md
@@ -321,7 +321,7 @@ for graph capture (later referred to as "usable graph memory"), and
the remaining 90% will be utilized for KV cache. Environment variable
`VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory
reserved for prefill and decode graphs. By default
-(`VLLM_GRAPH_PROMPT_RATIO=0.5`), both stages have equal memory
+(`VLLM_GRAPH_PROMPT_RATIO=0.3`), prefill and decode graphs are subject to 30% and 70% memory
constraints. Lower value corresponds to less usable graph memory
reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will
reserve 20% of usable graph memory for prefill graphs, and 80% of usable
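
For reference, the ratio is applied as a simple linear split of the usable graph memory, as in the model-runner change later in this commit. The sketch below is illustrative only; the 15.85 GiB figure is taken from the example warmup log further down.

```python
import os

# Illustrative only: how VLLM_GRAPH_PROMPT_RATIO divides usable graph memory
# between prompt (prefill) and decode HPU Graphs. 15.85 GiB matches the
# example warmup log shown below.
graph_free_mem_gib = 15.85

prompt_graph_mem_ratio = float(os.environ.get("VLLM_GRAPH_PROMPT_RATIO", "0.3"))
prompt_available_gib = prompt_graph_mem_ratio * graph_free_mem_gib   # 4.755 GiB
decode_available_gib = graph_free_mem_gib - prompt_available_gib     # 11.095 GiB

print(f"{prompt_available_gib:.3f} GiB for prompt, "
      f"{decode_available_gib:.3f} GiB for decode "
      f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})")
```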
@@ -388,7 +388,7 @@ INFO 08-02 17:37:54 habana_worker.py:190] Initializing cache engine took 23.73 G
INFO 08-02 17:37:54 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB
...
INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
-INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.5)
+INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 4.755 GiB for prompt and 11.095 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3)
INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
...
INFO 08-02 17:38:26 habana_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB
@@ -448,7 +448,7 @@ Environment variables
- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for
HPUGraph capture, `0.1` by default
- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory
-dedicated for prompt graphs, `0.5` by default
+dedicated for prompt graphs, `0.3` by default
- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt
graph capture, `min_tokens` or `max_bs`, `min_tokens` by default
- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode
@@ -472,15 +472,15 @@ Environment variables
`max_model_len`

- Decode:
-- batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `min(max_num_seqs, 32)`
+- batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1`
- batch size step (`VLLM_DECODE_BS_BUCKET_STEP`):
`min(max_num_seqs, 32)`
- batch size max (`VLLM_DECODE_BS_BUCKET_MAX`):
`max_num_seqs`
- block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`):
-`128`
+`block_size`
- block size step
-(`VLLM_DECODE_BLOCK_BUCKET_STEP`): `128`
+(`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size`
- block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`):
`max(128, (max_num_seqs*max_model_len)/block_size)`
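
The decode bounds above can all be overridden through the `VLLM_{phase}_{dim}_BUCKET_{param}` environment variables listed earlier. The sketch below shows how the new decode defaults resolve when nothing is overridden; `read_env_default` is a hypothetical stand-in for the repository's `read_bucket_settings` helper, and the `block_size`/`max_num_seqs`/`max_model_len` values are examples, not fixed constants.

```python
import os

def read_env_default(phase: str, dim: str, param: str, default: int) -> int:
    # Hypothetical helper mirroring the VLLM_{phase}_{dim}_BUCKET_{param}
    # naming scheme; the real logic lives in read_bucket_settings() in
    # vllm/worker/habana_model_runner.py.
    env_var = f"VLLM_{phase}_{dim}_BUCKET_{param}".upper()
    return int(os.environ.get(env_var, default))

# Example values only.
block_size, max_num_seqs, max_model_len = 128, 256, 2048

decode_bs_min = read_env_default("decode", "bs", "min", 1)
decode_block_min = read_env_default("decode", "block", "min", block_size)
decode_block_step = read_env_default("decode", "block", "step", block_size)
decode_block_max = read_env_default(
    "decode", "block", "max",
    max(128, max_num_seqs * max_model_len // block_size))

print(decode_bs_min, decode_block_min, decode_block_step, decode_block_max)
# With no overrides and the example values above: 1 128 128 4096
```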

12 changes: 6 additions & 6 deletions docs/source/getting_started/gaudi-installation.rst
@@ -245,7 +245,7 @@ Only after that, ``gpu_memory_utilization`` flag is utilized - at its default va
Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured.
Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture.
With its default value (``VLLM_GRAPH_RESERVED_MEM=0.1``), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache.
-Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.5``), both stages have equal memory constraints.
+Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.3``), prefill and decode graphs are subject to 30% and 70% memory constraints, respectively.
Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs.

.. note::
@@ -280,7 +280,7 @@ Each described step is logged by vLLM server, as follows (negative values corres
INFO 08-02 17:37:54 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB
...
INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
-INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.5)
+INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 4.755 GiB for prompt and 11.095 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3)
INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
...
INFO 08-02 17:38:26 habana_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB
@@ -324,7 +324,7 @@ Environment variables

- ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default
- ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.1`` by default
-- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.5`` by default
+- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.3`` by default
- ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default
- ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default
- ``VLLM_{phase}_{dim}_BUCKET_{param}`` - collection of 12 environment variables configuring ranges of bucketing mechanism
@@ -343,11 +343,11 @@ Environment variables
- sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``max_model_len``

- Decode:
-- batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``min(max_num_seqs, 32)``
+- batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``1``
- batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)``
- batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs``
-- sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``128``
-- sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``128``
+- sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``block_size``
+- sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``block_size``
- sequence length max (``VLLM_DECODE_BLOCK_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)``
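
For intuition: each (min, step, max) triple above is expanded into a set of warm-up buckets, and incoming batches and block counts are padded up to the nearest bucket. The sketch below is a simplified illustration of such an expansion (exponential ramp-up to the step, then linear increments), not the exact vLLM-fork algorithm; with the new ``VLLM_DECODE_BS_BUCKET_MIN`` of ``1``, a single decode sequence no longer has to be padded up to a batch of 32.

```python
def bucket_range(bmin: int, bstep: int, bmax: int) -> list:
    # Simplified illustration, not the exact vLLM implementation: double from
    # bmin until bstep is reached, then grow linearly in bstep increments.
    buckets = []
    value = bmin
    while value < bstep:
        buckets.append(value)
        value *= 2
    value = bstep
    while value <= bmax:
        buckets.append(value)
        value += bstep
    return buckets

# Decode batch-size buckets with the new default min of 1 (step 32, max 256):
print(bucket_range(1, 32, 256))
# [1, 2, 4, 8, 16, 32, 64, 96, 128, 160, 192, 224, 256]
```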


11 changes: 5 additions & 6 deletions vllm/worker/habana_model_runner.py
@@ -670,7 +670,6 @@ def _setup_buckets(self) -> None:
if self.lora_config and \
max_bucket_cfg > self.max_num_batched_tokens // self.block_size:
max_bucket_cfg = self.max_num_batched_tokens // self.block_size
-blocks_step = 128
#FIXME: The default values should be max_model_len
max_prompt_seq = 1024
max_decode_seq = 2048
@@ -682,7 +681,7 @@ def _setup_buckets(self) -> None:
max=align_bs(max_bucket_cfg))
self.decode_bs_bucket_cfg = read_bucket_settings('decode',
'bs',
-min=align_bs(32),
+min=1,
step=align_bs(32),
max=self.max_num_seqs)
self.prompt_seq_bucket_cfg = read_bucket_settings('prompt',
@@ -693,9 +692,9 @@ def _setup_buckets(self) -> None:
self.decode_block_bucket_cfg = read_bucket_settings(
'decode',
'block',
-min=blocks_step,
-step=blocks_step,
-max=max(blocks_step,
+min=self.block_size,
+step=self.block_size,
+max=max(self.block_size,
self.max_num_seqs * max_decode_seq // self.block_size))
self.graphed_buckets: Set[Any] = set()
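
A quick worked example of the new decode block-bucket bounds, using hypothetical values for the quantities involved:

```python
# Worked example (hypothetical values) of the new decode block-bucket bounds:
block_size = 128       # self.block_size
max_num_seqs = 256     # self.max_num_seqs
max_decode_seq = 2048  # hard-coded above, pending the max_model_len FIXME

decode_block_min = block_size                                        # 128
decode_block_step = block_size                                       # 128
decode_block_max = max(block_size,
                       max_num_seqs * max_decode_seq // block_size)  # 4096
```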

@@ -1594,7 +1593,7 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None:
graph_free_mem = align_workers(graph_free_mem,
torch.distributed.ReduceOp.MIN)
prompt_graph_mem_ratio = float(
-os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5'))
+os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.3'))
prompt_available_memory = (prompt_graph_mem_ratio *
graph_free_mem)
decode_available_memory = (graph_free_mem -
