[Frontend] [Core] Support for sharded tensorized models #4990

Merged: 15 commits, Jun 12, 2024

Changes from 1 commit:
Merge remote-tracking branch 'upstream/main' into sharded-tensorizer
* upstream/main: (126 commits)
  [Bugfix][Frontend] Cleanup "fix chat logprobs" (#5026)
  [Bugfix] OpenAI entrypoint limits logprobs while ignoring server defined --max-logprobs (#5312)
  [Misc] Various simplifications and typing fixes (#5368)
  [ci] Fix Buildkite agent path (#5392)
  [Doc] Add documentation for FP8 W8A8 (#5388)
  Bump version to v0.5.0 (#5384)
  [Docs] Alphabetically sort sponsors (#5386)
  [Docs] Add Docs on Limitations of VLM Support (#5383)
  [ci] Mount buildkite agent on Docker container to upload benchmark results (#5330)
  [ci] Use small_cpu_queue for doc build (#5331)
  [Bugfix] Fix LLaVA-NeXT (#5380)
  [Feature][Frontend]:  Continued `stream_options` implementation also in CompletionRequest (#5319)
  [Model] Initial support for LLaVA-NeXT (#4199)
  [Misc] Improve error message when LoRA parsing fails (#5194)
  [misc][typo] fix typo (#5372)
  [Frontend][Misc] Enforce Pixel Values as Input Type for VLMs in API Server (#5374)
  [Misc] Update to comply with the new `compressed-tensors` config (#5350)
  [Bugfix] Fix KeyError: 1 When Using LoRA adapters (#5164)
  [Kernel][Misc] Use TORCH_LIBRARY instead of PYBIND11_MODULE for custom ops (#5047)
  [mis][ci/test] fix flaky test in test_sharded_state_loader.py (#5361)
  ...
tjohnson31415 committed Jun 11, 2024
commit 12adc56eb8a1d427881fa21ee924ff5bea9e8152
tests/tensorizer_loader/test_tensorizer.py (18 additions, 21 deletions)

@@ -105,23 +105,20 @@ def test_can_deserialize_s3(vllm_runner):
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_deserialized_encrypted_vllm_model_has_same_outputs(
         vllm_runner, tmp_path):
-    vllm_model = vllm_runner(model_ref)
-    model_path = tmp_path / (model_ref + ".tensors")
-    key_path = tmp_path / (model_ref + ".key")
-    write_keyfile(key_path)
+    with vllm_runner(model_ref) as vllm_model:
+        model_path = tmp_path / (model_ref + ".tensors")
+        key_path = tmp_path / (model_ref + ".key")
+        write_keyfile(key_path)

-    outputs = vllm_model.generate(prompts, sampling_params)
+        outputs = vllm_model.generate(prompts, sampling_params)

-    config_for_serializing = TensorizerConfig(
-        tensorizer_uri=model_path,
-        encryption_keyfile=key_path
-    )
-    serialize_vllm_model(get_torch_model(vllm_model),
-                         config_for_serializing)
+        config_for_serializing = TensorizerConfig(
+            tensorizer_uri=model_path,
+            encryption_keyfile=key_path
+        )
+        serialize_vllm_model(get_torch_model(vllm_model),
+                             config_for_serializing)

-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()

     config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
                                                 encryption_keyfile=key_path)
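This hunk, and the ones that follow, pick up an upstream change to the tensorizer tests: rather than deleting the model and emptying the CUDA cache by hand, the tests use `vllm_runner` as a context manager so cleanup runs when the `with` block exits. Below is a minimal sketch of that pattern, assuming a hypothetical `runner()` helper and `FakeModel` class in place of the real fixture and engine:

```python
# Minimal sketch (not vLLM's implementation) of the context-manager pattern
# the tests adopt: GPU cleanup is tied to leaving the `with` block instead of
# being repeated by hand after every use.
import contextlib
import gc

import torch


class FakeModel:
    """Hypothetical stand-in for the object vllm_runner yields."""

    def generate(self, prompts, sampling_params=None):
        return [f"<output for {p!r}>" for p in prompts]


@contextlib.contextmanager
def runner(model_ref: str):
    model = FakeModel()  # real code would load `model_ref` onto the GPU here
    try:
        yield model
    finally:
        # Equivalent of the removed `del vllm_model; gc.collect();
        # torch.cuda.empty_cache()` lines, now run automatically on exit.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


with runner("facebook/opt-125m") as vllm_model:
    print(vllm_model.generate(["Hello, my name is"]))
```

The real `vllm_runner` is a pytest fixture in the vLLM test suite; the sketch only illustrates why the explicit `del` / `gc.collect()` / `torch.cuda.empty_cache()` lines disappear from the diff.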
@@ -173,8 +170,8 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
     with vllm_runner(model_ref, ) as vllm_model:
         model_path = tmp_path / (model_ref + ".tensors")

-        serialize_vllm_model(get_torch_model(vllm_model),
-                             TensorizerConfig(tensorizer_uri=model_path))
+        serialize_vllm_model(get_torch_model(vllm_model),
+                             TensorizerConfig(tensorizer_uri=model_path))

     with vllm_runner(
             model_ref,
@@ -208,8 +205,8 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
     with vllm_runner(model_ref, ) as vllm_model:
         model_path = tmp_path / (model_ref + ".tensors")

-        serialize_vllm_model(get_torch_model(vllm_model),
-                             TensorizerConfig(tensorizer_uri=model_path))
+        serialize_vllm_model(get_torch_model(vllm_model),
+                             TensorizerConfig(tensorizer_uri=model_path))

     model_loader_extra_config = {
         "tensorizer_uri": str(model_path),
@@ -330,9 +327,9 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
     model_path = tmp_path / (model_ref + ".tensors")
     config = TensorizerConfig(tensorizer_uri=str(model_path))

-    vllm_model = vllm_runner(model_ref)
-    outputs = vllm_model.generate(prompts, sampling_params)
-    serialize_vllm_model(get_torch_model(vllm_model), config)
+    with vllm_runner(model_ref) as vllm_model:
+        outputs = vllm_model.generate(prompts, sampling_params)
+        serialize_vllm_model(get_torch_model(vllm_model), config)

     assert is_vllm_tensorized(config)

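The last hunk asserts `is_vllm_tensorized(config)` after serialization. For orientation, here is a hedged sketch of that serialize-then-verify round trip outside the test harness. The attribute chain used to reach the underlying `torch.nn.Module` mirrors what the suite's `get_torch_model` helper appears to do and is an internal, version-specific detail; the model name and path are illustrative:

```python
# Hedged sketch of the serialize -> verify round trip the last hunk tests.
# The attribute chain below reflects vLLM internals around v0.5.0 and is not
# a stable API; treat it as illustrative only.
from vllm import LLM
from vllm.model_executor.model_loader.tensorizer import (
    TensorizerConfig, is_vllm_tensorized, serialize_vllm_model)

model_ref = "facebook/opt-125m"        # illustrative model
model_path = "/tmp/opt-125m.tensors"   # illustrative output path

llm = LLM(model=model_ref)
torch_model = (llm.llm_engine.model_executor
               .driver_worker.model_runner.model)  # internal attribute chain

config = TensorizerConfig(tensorizer_uri=model_path)
serialize_vllm_model(torch_model, config)

assert is_vllm_tensorized(config)
```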
vllm/worker/model_runner.py (1 addition, 0 deletions)

@@ -21,6 +21,7 @@
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
 from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip,
This is a condensed view of this merge commit; the full changes are available on the pull request.