huggingface · eldarkurtic · May 22, 2025 · May 22, 2025
diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
@@ -104,9 +104,10 @@ class VLLMModelConfig(ModelConfig):
     max_num_seqs: PositiveInt = 128  # maximum number of sequences per iteration; This variable and `max_num_batched_tokens` effectively control the batch size at prefill stage. See https://github.com/vllm-project/vllm/issues/2492 for detailed explanation.
     max_num_batched_tokens: PositiveInt = 2048  # maximum number of tokens per batch
     subfolder: str | None = None
+    kv_cache_dtype: str = "auto"  # copied unchanged into the model-arguments dict built by `_create_auto_model` under the key "kv_cache_dtype". NOTE(review): presumably selects the storage dtype of vLLM's KV cache, with "auto" following the model dtype — confirm against the vLLM engine-argument docs.
+    calculate_kv_scales: bool = False  # copied unchanged into the same dict under the key "calculate_kv_scales". NOTE(review): presumably only has an effect when `kv_cache_dtype` is an fp8 variant — confirm against the vLLM engine-argument docs.
     is_async: bool = False  # Whether to use the async version or sync version of the model
 
-
 class VLLMModel(LightevalModel):
     def __init__(
         self,
@@ -187,6 +188,8 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]:
             "seed": int(config.seed),
             "max_num_seqs": int(config.max_num_seqs),
             "max_num_batched_tokens": int(config.max_num_batched_tokens),
+            "kv_cache_dtype": config.kv_cache_dtype,
+            "calculate_kv_scales": config.calculate_kv_scales,
         }
 
         if config.quantization is not None: