File tree Expand file tree Collapse file tree 4 files changed +5
-5
lines changed Expand file tree Collapse file tree 4 files changed +5
-5
lines changed Original file line number Diff line number Diff line change @@ -203,8 +203,8 @@ class VLLMSetting(BaseModel):
203
203
default = get_bool_env ("ENFORCE_EAGER" ),
204
204
description = "Always use eager-mode PyTorch. If False, will use eager mode and CUDA graph in hybrid for maximal performance and flexibility."
205
205
)
206
- max_context_len_to_capture : Optional [int ] = Field (
207
- default = int (get_env ("MAX_CONTEXT_LEN_TO_CAPTURE " , 8192 )),
206
+ max_seq_len_to_capture : Optional [int ] = Field (
207
+ default = int (get_env ("MAX_SEQ_LEN_TO_CAPTURE " , 8192 )),
208
208
description = "Maximum context length covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode."
209
209
)
210
210
max_loras : Optional [int ] = Field (
Original file line number Diff line number Diff line change @@ -107,7 +107,7 @@ def create_vllm_engine():
107
107
"gpu_memory_utilization" ,
108
108
"max_num_seqs" ,
109
109
"enforce_eager" ,
110
- "max_context_len_to_capture " ,
110
+ "max_seq_len_to_capture " ,
111
111
"max_loras" ,
112
112
"max_lora_rank" ,
113
113
"lora_extra_vocab_size" ,
Original file line number Diff line number Diff line change @@ -6,6 +6,6 @@ COPY requirements.txt /workspace/
6
6
7
7
RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \
8
8
pip install bitsandbytes --upgrade && \
9
- pip install vllm==0.4.0 && \
9
+ pip install vllm==0.4.2 && \
10
10
pip install --no-cache-dir -r /workspace/requirements.txt && \
11
11
pip uninstall transformer-engine -y
Original file line number Diff line number Diff line change @@ -16,7 +16,7 @@ docker build -f docker/Dockerfile.vllm -t llm-api:vllm .
16
16
17
17
``` shell
18
18
pip install torch==2.1.0
19
- pip install vllm==0.4.0
19
+ pip install vllm==0.4.2
20
20
pip install -r requirements.txt
21
21
pip uninstall transformer-engine -y
22
22
```
You can’t perform that action at this time.
0 commit comments