[Model] Add UltravoxModel and UltravoxConfig (vllm-project#7615)
Signed-off-by: Alvant <alvasian@yandex.ru>
Commit 1af67c6 (parent: a3d7058)
Showing 33 changed files with 1,090 additions and 264 deletions.
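Two of the new files, both example scripts for the Ultravox audio language model, are reproduced below: an offline inference demo and an OpenAI-compatible API client demo.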
@@ -0,0 +1,97 @@

"""
This example shows how to use vLLM for running offline inference
with the correct prompt format on audio language models.
For most models, the prompt format should follow the corresponding examples
in the HuggingFace model repository.
"""
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.utils import FlexibleArgumentParser

# Input audio and question
audio_and_sample_rate = AudioAsset("mary_had_lamb").audio_and_sample_rate
question = "What is recited in the audio?"


# Ultravox 0.3
def run_ultravox(question):
    model_name = "fixie-ai/ultravox-v0_3"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    messages = [{
        'role': 'user',
        'content': f"<|reserved_special_token_0|>\n{question}"
    }]
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)

    llm = LLM(model=model_name)
    stop_token_ids = None
    return llm, prompt, stop_token_ids


model_example_map = {
    "ultravox": run_ultravox,
}


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    llm, prompt, stop_token_ids = model_example_map[model](question)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=stop_token_ids)

    assert args.num_prompts > 0
    if args.num_prompts == 1:
        # Single inference
        inputs = {
            "prompt": prompt,
            "multi_modal_data": {
                "audio": audio_and_sample_rate
            },
        }
    else:
        # Batch inference
        inputs = [{
            "prompt": prompt,
            "multi_modal_data": {
                "audio": audio_and_sample_rate
            },
        } for _ in range(args.num_prompts)]

    outputs = llm.generate(inputs, sampling_params=sampling_params)

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'audio language models')
    parser.add_argument('--model-type',
                        '-m',
                        type=str,
                        default="ultravox",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')
    parser.add_argument('--num-prompts',
                        type=int,
                        default=1,
                        help='Number of prompts to run.')

    args = parser.parse_args()
    main(args)
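For reference, `AudioAsset(...).audio_and_sample_rate` yields a `(numpy array, sample rate)` tuple, which is the shape of input the `"audio"` key of `multi_modal_data` takes. A minimal sketch of running the same model on your own recording, assuming librosa is installed; `my_clip.wav` is a hypothetical local file, and this snippet is not part of the commit:

import librosa
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams

model_name = "fixie-ai/ultravox-v0_3"

# librosa.load returns (np.ndarray, sample_rate); sr=None keeps the
# file's native sample rate. "my_clip.wav" is a hypothetical path.
audio, sample_rate = librosa.load("my_clip.wav", sr=None)

# Build the prompt with the same chat template as run_ultravox above.
tokenizer = AutoTokenizer.from_pretrained(model_name)
messages = [{
    'role': 'user',
    'content': "<|reserved_special_token_0|>\nWhat is said in this clip?"
}]
prompt = tokenizer.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True)

llm = LLM(model=model_name)
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {
            "audio": (audio, sample_rate)
        },
    },
    sampling_params=SamplingParams(temperature=0.2, max_tokens=64))
print(outputs[0].outputs[0].text)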
@@ -0,0 +1,90 @@

"""An example showing how to use vLLM to serve audio language models.

Launch the vLLM server with the following command:
vllm serve fixie-ai/ultravox-v0_3
"""
import base64

import requests
from openai import OpenAI

from vllm.assets.audio import AudioAsset

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

# Any format supported by librosa is supported
audio_url = AudioAsset("winning_call").url

# Use audio url in the payload
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this audio?"
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": audio_url
                },
            },
        ],
    }],
    model=model,
    max_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print(f"Chat completion output: {result}")


# Use base64 encoded audio in the payload
def encode_audio_base64_from_url(audio_url: str) -> str:
    """Encode an audio retrieved from a remote url to base64 format."""
    with requests.get(audio_url) as response:
        response.raise_for_status()
        result = base64.b64encode(response.content).decode('utf-8')

    return result


audio_base64 = encode_audio_base64_from_url(audio_url=audio_url)
chat_completion_from_base64 = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this audio?"
            },
            {
                "type": "audio_url",
                "audio_url": {
                    # Any format supported by librosa is supported
                    "url": f"data:audio/ogg;base64,{audio_base64}"
                },
            },
        ],
    }],
    model=model,
    max_tokens=64,
)

result = chat_completion_from_base64.choices[0].message.content
print(f"Chat completion output: {result}")