vllm-project · houseroad · Jun 11, 2025 · Jun 10, 2025
@@ -125,4 +125,7 @@ def test_max_model_len():
     for output in outputs:
         num_total_tokens = len(output.prompt_token_ids) + len(
             output.outputs[0].token_ids)
-        assert num_total_tokens == max_model_len
+        # Total tokens must not exceed max_model_len.
+        # It can be less if generation finishes due to other reasons (e.g., EOS)
+        # before reaching the absolute model length limit.
+        assert num_total_tokens <= max_model_len