[Frontend] [Core] Support for sharded tensorized models #4990

Merged: 15 commits, Jun 12, 2024

Changes from 1 commit:
Merge remote-tracking branch 'upstream/main' into sharded-tensorizer
* upstream/main: (126 commits)
  [Bugfix][Frontend] Cleanup "fix chat logprobs" (#5026)
  [Bugfix] OpenAI entrypoint limits logprobs while ignoring server defined --max-logprobs (#5312)
  [Misc] Various simplifications and typing fixes (#5368)
  [ci] Fix Buildkite agent path (#5392)
  [Doc] Add documentation for FP8 W8A8 (#5388)
  Bump version to v0.5.0 (#5384)
  [Docs] Alphabetically sort sponsors (#5386)
  [Docs] Add Docs on Limitations of VLM Support (#5383)
  [ci] Mount buildkite agent on Docker container to upload benchmark results (#5330)
  [ci] Use small_cpu_queue for doc build (#5331)
  [Bugfix] Fix LLaVA-NeXT (#5380)
  [Feature][Frontend]:  Continued `stream_options` implementation also in CompletionRequest (#5319)
  [Model] Initial support for LLaVA-NeXT (#4199)
  [Misc] Improve error message when LoRA parsing fails (#5194)
  [misc][typo] fix typo (#5372)
  [Frontend][Misc] Enforce Pixel Values as Input Type for VLMs in API Server (#5374)
  [Misc] Update to comply with the new `compressed-tensors` config (#5350)
  [Bugfix] Fix KeyError: 1 When Using LoRA adapters (#5164)
  [Kernel][Misc] Use TORCH_LIBRARY instead of PYBIND11_MODULE for custom ops (#5047)
  [mis][ci/test] fix flaky test in test_sharded_state_loader.py (#5361)
  ...
tjohnson31415 committed Jun 11, 2024
commit 12adc56eb8a1d427881fa21ee924ff5bea9e8152
tests/tensorizer_loader/test_tensorizer.py (18 additions, 21 deletions)

@@ -105,23 +105,20 @@ def test_can_deserialize_s3(vllm_runner):
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_deserialized_encrypted_vllm_model_has_same_outputs(
         vllm_runner, tmp_path):
-    vllm_model = vllm_runner(model_ref)
-    model_path = tmp_path / (model_ref + ".tensors")
-    key_path = tmp_path / (model_ref + ".key")
-    write_keyfile(key_path)
+    with vllm_runner(model_ref) as vllm_model:
+        model_path = tmp_path / (model_ref + ".tensors")
+        key_path = tmp_path / (model_ref + ".key")
+        write_keyfile(key_path)

-    outputs = vllm_model.generate(prompts, sampling_params)
+        outputs = vllm_model.generate(prompts, sampling_params)

-    config_for_serializing = TensorizerConfig(
-        tensorizer_uri=model_path,
-        encryption_keyfile=key_path
-    )
-    serialize_vllm_model(get_torch_model(vllm_model),
-                         config_for_serializing)
+        config_for_serializing = TensorizerConfig(
+            tensorizer_uri=model_path,
+            encryption_keyfile=key_path
+        )
+        serialize_vllm_model(get_torch_model(vllm_model),
+                             config_for_serializing)

-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()

     config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
                                                 encryption_keyfile=key_path)
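This hunk, and the ones that follow, pick up an upstream change to the tensorizer tests: rather than deleting the model and emptying the CUDA cache by hand, the tests use `vllm_runner` as a context manager so cleanup runs when the `with` block exits. Below is a minimal sketch of that pattern, assuming a hypothetical `runner()` helper and `FakeModel` class in place of the real fixture and engine:

```python
# Minimal sketch (not vLLM's implementation) of the context-manager pattern
# the tests adopt: GPU cleanup is tied to leaving the `with` block instead of
# being repeated by hand after every use.
import contextlib
import gc

import torch


class FakeModel:
    """Hypothetical stand-in for the object vllm_runner yields."""

    def generate(self, prompts, sampling_params=None):
        return [f"<output for {p!r}>" for p in prompts]


@contextlib.contextmanager
def runner(model_ref: str):
    model = FakeModel()  # real code would load `model_ref` onto the GPU here
    try:
        yield model
    finally:
        # Equivalent of the removed `del vllm_model; gc.collect();
        # torch.cuda.empty_cache()` lines, now run automatically on exit.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


with runner("facebook/opt-125m") as vllm_model:
    print(vllm_model.generate(["Hello, my name is"]))
```

The real `vllm_runner` is a pytest fixture in the vLLM test suite; the sketch only illustrates why the explicit `del` / `gc.collect()` / `torch.cuda.empty_cache()` lines disappear from the diff.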
@@ -173,8 +170,8 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
     with vllm_runner(model_ref, ) as vllm_model:
         model_path = tmp_path / (model_ref + ".tensors")

-        serialize_vllm_model(get_torch_model(vllm_model),
-                             TensorizerConfig(tensorizer_uri=model_path))
+        serialize_vllm_model(get_torch_model(vllm_model),
+                             TensorizerConfig(tensorizer_uri=model_path))

     with vllm_runner(
             model_ref,
@@ -208,8 +205,8 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
     with vllm_runner(model_ref, ) as vllm_model:
         model_path = tmp_path / (model_ref + ".tensors")

-        serialize_vllm_model(get_torch_model(vllm_model),
-                             TensorizerConfig(tensorizer_uri=model_path))
+        serialize_vllm_model(get_torch_model(vllm_model),
+                             TensorizerConfig(tensorizer_uri=model_path))

     model_loader_extra_config = {
         "tensorizer_uri": str(model_path),
@@ -330,9 +327,9 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
     model_path = tmp_path / (model_ref + ".tensors")
     config = TensorizerConfig(tensorizer_uri=str(model_path))

-    vllm_model = vllm_runner(model_ref)
-    outputs = vllm_model.generate(prompts, sampling_params)
-    serialize_vllm_model(get_torch_model(vllm_model), config)
+    with vllm_runner(model_ref) as vllm_model:
+        outputs = vllm_model.generate(prompts, sampling_params)
+        serialize_vllm_model(get_torch_model(vllm_model), config)

     assert is_vllm_tensorized(config)

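The last hunk asserts `is_vllm_tensorized(config)` after serialization. For orientation, here is a hedged sketch of that serialize-then-verify round trip outside the test harness. The attribute chain used to reach the underlying `torch.nn.Module` mirrors what the suite's `get_torch_model` helper appears to do and is an internal, version-specific detail; the model name and path are illustrative:

```python
# Hedged sketch of the serialize -> verify round trip the last hunk tests.
# The attribute chain below reflects vLLM internals around v0.5.0 and is not
# a stable API; treat it as illustrative only.
from vllm import LLM
from vllm.model_executor.model_loader.tensorizer import (
    TensorizerConfig, is_vllm_tensorized, serialize_vllm_model)

model_ref = "facebook/opt-125m"        # illustrative model
model_path = "/tmp/opt-125m.tensors"   # illustrative output path

llm = LLM(model=model_ref)
torch_model = (llm.llm_engine.model_executor
               .driver_worker.model_runner.model)  # internal attribute chain

config = TensorizerConfig(tensorizer_uri=model_path)
serialize_vllm_model(torch_model, config)

assert is_vllm_tensorized(config)
```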
vllm/worker/model_runner.py (1 addition, 0 deletions)

@@ -21,6 +21,7 @@
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
 from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip,
This is a condensed view of this merge commit; the full changes are available on the pull request.