Update default max_num_batch_tokens for chunked prefill to 2048 #10544

Merged 2 commits on Nov 23, 2024
Merge branch 'main' into default-chunked-prefill-2048
mgoin authored Nov 22, 2024
commit e7483c5ce562ae8effe9350b1a3185668f294804
2 changes: 1 addition & 1 deletion vllm/config.py
@@ -1124,7 +1124,7 @@ def __post_init__(self) -> None:
     else:
         # This value is chosen to have a balance between ITL
         # and TTFT. Note it is not optimized for throughput.
-        max_num_batched_tokens = 2048
+        self.max_num_batched_tokens = 2048
     else:
         # If max_model_len is too short, use 2048 as the default value
         # for higher throughput.
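For context, the line touched here sets the chunked-prefill default for the per-step token budget: 2048 trades peak throughput for lower inter-token latency (ITL) and time-to-first-token (TTFT), per the code comment above. The snippet below is a minimal sketch of overriding that default from the offline entrypoint; it assumes the LLM constructor forwards enable_chunked_prefill and max_num_batched_tokens to the engine config, as in recent vLLM releases, and the model name and token budget are illustrative rather than recommended values.

# Minimal sketch: opting in to chunked prefill and raising the default
# max_num_batched_tokens when throughput matters more than ITL/TTFT.
# Assumes LLM(...) forwards these engine arguments; values are illustrative.
from vllm import LLM, SamplingParams

llm = LLM(
    model="facebook/opt-125m",        # illustrative model
    enable_chunked_prefill=True,      # turn on chunked prefill
    max_num_batched_tokens=8192,      # override the 2048 default
)

outputs = llm.generate(
    ["Chunked prefill splits long prompts across scheduler steps."],
    SamplingParams(max_tokens=32),
)
print(outputs[0].outputs[0].text)

Leaving max_num_batched_tokens unset keeps the 2048 default from this PR; larger values batch more prompt tokens per scheduler step, which generally improves throughput at the cost of ITL and TTFT.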