[Model] Aya Vision #15441

Merged (51 commits, Apr 1, 2025)
Commits
- aca681b update (JenZhao, Mar 24, 2025)
- a2a58cf update (JenZhao, Mar 24, 2025)
- 5b137d8 hf processor (JenZhao, Mar 25, 2025)
- 3e915d3 debug (JenZhao, Mar 25, 2025)
- 17a6d45 Merge branch 'vllm-project:main' into aya (JenZhao, Mar 25, 2025)
- 3a9f3ec Merge branch 'vllm-project:main' into aya (JenZhao, Mar 26, 2025)
- 2fa39c8 update (JenZhao, Mar 27, 2025)
- 2f6c79d debug (JenZhao, Mar 27, 2025)
- 6e97107 debug works now (JenZhao, Mar 27, 2025)
- cf2ac47 Merge branch 'vllm-project:main' into aya (JenZhao, Mar 27, 2025)
- 2088eea remove flatten_2d_lists (JenZhao, Mar 27, 2025)
- 62a9bb2 matched get num patches with huggingface transformer (JenZhao, Mar 27, 2025)
- 534bbc3 clean image processor (JenZhao, Mar 27, 2025)
- 0d42ae0 revert benchmark change (JenZhao, Mar 27, 2025)
- 4295136 update (JenZhao, Mar 27, 2025)
- 1f6dae5 add tests (JenZhao, Mar 27, 2025)
- bdd8bd4 update test (JenZhao, Mar 27, 2025)
- 8d4680d fix (JenZhao, Mar 28, 2025)
- 105af39 Merge branch 'vllm-project:main' into aya (JenZhao, Mar 28, 2025)
- fd488fa address comments (JenZhao, Mar 28, 2025)
- 86a7977 address comment update registry (JenZhao, Mar 28, 2025)
- 674c71c fix (JenZhao, Mar 28, 2025)
- 2ff5c96 fix (JenZhao, Mar 28, 2025)
- 8cf6bfe add aya to offline example (JenZhao, Mar 28, 2025)
- d1efab8 Update supported_models.md (JenZhao, Mar 28, 2025)
- 7e166a6 avoid fork sync conflict (JenZhao, Mar 29, 2025)
- 015d74b address comments (JenZhao, Mar 29, 2025)
- 88d6700 Merge branch 'main' into aya (JenZhao, Mar 29, 2025)
- 9742a5b update (JenZhao, Mar 29, 2025)
- bcd7473 switch back to ayavision from aya_vision for now (JenZhao, Mar 29, 2025)
- b0d328d Merge branch 'vllm-project:main' into aya (JenZhao, Mar 29, 2025)
- f1bcd6f update offline example (JenZhao, Mar 30, 2025)
- d28ab22 ayavision -> aya_vision (JenZhao, Mar 30, 2025)
- e54a9ad rename to aya_vision (JenZhao, Mar 30, 2025)
- c2696b0 type hint (JenZhao, Mar 30, 2025)
- 8e4ac30 comment in example (ywang96, Mar 30, 2025)
- 2afc5bf add PP + v0 compatibility (ywang96, Mar 30, 2025)
- c10b995 add multiimage example (JenZhao, Mar 30, 2025)
- a9e28e1 update test (JenZhao, Mar 30, 2025)
- 41426f4 modify test to add assistant token (ywang96, Mar 30, 2025)
- 60a1431 Merge remote-tracking branch 'upstream/main' into aya (ywang96, Mar 30, 2025)
- 59219eb Merge branch 'vllm-project:main' into aya (JenZhao, Mar 30, 2025)
- f0c2b0e update (JenZhao, Mar 31, 2025)
- 9d9428d Merge branch 'main' into aya (JenZhao, Mar 31, 2025)
- c709d3d integrate latest main change on embed_is_patch (JenZhao, Mar 31, 2025)
- 0282b07 update (JenZhao, Mar 31, 2025)
- a624ddc update (JenZhao, Mar 31, 2025)
- fe24d94 update (JenZhao, Mar 31, 2025)
- ae95b36 update cohere model max model length setting (JenZhao, Apr 1, 2025)
- 4285830 Merge branch 'vllm-project:main' into aya (JenZhao, Apr 1, 2025)
- ab5a09a address comments (JenZhao, Apr 1, 2025)
7 changes: 7 additions & 0 deletions docs/source/models/supported_models.md
@@ -732,6 +732,13 @@ See [this page](#generative-models) for more information on how to use generative models.
  *
  * ✅︎
  * ✅︎
- * `AyaVisionForConditionalGeneration`
  * Aya Vision
  * T + I<sup>+</sup>
  * `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc.
  *
  * ✅︎
  * ✅︎
- * `Blip2ForConditionalGeneration`
  * BLIP-2
  * T + I<sup>E</sup>
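For context on how this new entry is exercised, here is a minimal offline-inference sketch (not part of the diff) using vLLM's `LLM` API. It mirrors the prompt format and `crop_to_patches` setting from the example added below; the bundled `stop_sign` image asset is used purely for illustration.

```python
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

# Build the engine the same way as the example added in this PR.
llm = LLM(
    model="CohereForAI/aya-vision-8b",
    max_model_len=2048,
    max_num_seqs=2,
    mm_processor_kwargs={"crop_to_patches": True},
)

question = "What's the content of the image?"
prompt = (
    "<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>"
    f"{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
)

# Any PIL image works; a bundled test asset is used here for convenience.
image = ImageAsset("stop_sign").pil_image

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```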
23 changes: 23 additions & 0 deletions examples/offline_inference/vision_language.py
@@ -60,6 +60,28 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    )


# Aya Vision
def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "CohereForAI/aya-vision-8b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"crop_to_patches": True},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    prompts = [
        f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
        for question in questions
    ]
    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# BLIP-2
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -842,6 +864,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:

model_example_map = {
    "aria": run_aria,
    "aya_vision": run_aya_vision,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
    "deepseek_vl_v2": run_deepseek_vl2,
36 changes: 36 additions & 0 deletions examples/offline_inference/vision_language_multi_image.py
@@ -61,6 +61,41 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
    )


def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "CohereForAI/aya-vision-8b"

    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [{
        "role":
        "user",
        "content": [
            *placeholders,
            {
                "type": "text",
                "text": question
            },
        ],
    }]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


def load_deepseek_vl2(question: str,
                      image_urls: list[str]) -> ModelRequestData:
    model_name = "deepseek-ai/deepseek-vl2-tiny"
@@ -504,6 +539,7 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:

model_example_map = {
    "aria": load_aria,
    "aya_vision": load_aya_vision,
    "deepseek_vl_v2": load_deepseek_vl2,
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
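As a rough sketch of how the returned `ModelRequestData` could be consumed (assuming it runs inside this example module so `load_aya_vision` is in scope; the image URLs are placeholders and the real script's `main()` wiring may differ):

```python
from dataclasses import asdict

from vllm import LLM, SamplingParams

req_data = load_aya_vision(
    "Compare the two images.",
    image_urls=[
        "https://example.com/a.jpg",  # placeholder URLs, for illustration only
        "https://example.com/b.jpg",
    ],
)

# EngineArgs is a dataclass, so it can be expanded into LLM's constructor.
llm = LLM(**asdict(req_data.engine_args))

outputs = llm.generate(
    {
        "prompt": req_data.prompt,
        # Multi-image inputs are passed as a list of PIL images.
        "multi_modal_data": {"image": req_data.image_data},
    },
    SamplingParams(max_tokens=128),
)
print(outputs[0].outputs[0].text)
```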
14 changes: 14 additions & 0 deletions tests/models/decoder_only/vision_language/test_models.py
@@ -158,6 +158,20 @@
        max_tokens=64,
        marks=[large_gpu_mark(min_gb=64)],
    ),
    "aya_vision": VLMTestInfo(
        models=["CohereForAI/aya-vision-8b"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>What is the season?",  # noqa: E501
        }),
        multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}}
    ),
    "blip2": VLMTestInfo(
        # TODO: Change back to 2.7b once head_dim = 80 is supported
        models=["Salesforce/blip2-opt-6.7b"],
1 change: 1 addition & 0 deletions tests/models/multimodal/processing/test_common.py
@@ -246,6 +246,7 @@ def _test_processing_correctness_mistral(
# yapf: disable
@pytest.mark.parametrize("model_id", [
    "rhymes-ai/Aria",
    "CohereForAI/aya-vision-8b",
    "Salesforce/blip2-opt-2.7b",
    "facebook/chameleon-7b",
    "deepseek-ai/deepseek-vl2-tiny",
1 change: 1 addition & 0 deletions tests/models/registry.py
@@ -259,6 +259,7 @@ def check_available_online(
_MULTIMODAL_EXAMPLE_MODELS = {
    # [Decoder-only]
    "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
    "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"),  # noqa: E501
    "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b",  # noqa: E501
                                                     extras={"6b": "Salesforce/blip2-opt-6.7b"}),  # noqa: E501
    "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"),  # noqa: E501
4 changes: 4 additions & 0 deletions vllm/config.py
@@ -2717,6 +2717,10 @@ def _get_and_verify_max_len(
            max_len_key = key if max_len < derived_max_model_len \
                else max_len_key
            derived_max_model_len = min(derived_max_model_len, max_len)
    # For Command-R / Cohere, Cohere2 / Aya Vision models
    if tmp_max_len := getattr(hf_config, "model_max_length", None):
        max_len_key = "model_max_length"
        derived_max_model_len = tmp_max_len

    # If sliding window is manually disabled, max_length should be less
    # than the sliding window length in the model config.
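A toy illustration of the new branch (hypothetical values, not vLLM's actual code): whatever the Cohere / Aya Vision config exposes as `model_max_length` takes precedence over the limit derived from the other length keys.

```python
# Hypothetical config values, chosen only to illustrate the precedence.
derived_max_model_len = 8192              # e.g. derived from max_position_embeddings
max_len_key = "max_position_embeddings"

# Stand-in for an hf_config object that carries model_max_length.
hf_config = type("Cfg", (), {"model_max_length": 16384})()

if tmp_max_len := getattr(hf_config, "model_max_length", None):
    max_len_key = "model_max_length"
    derived_max_model_len = tmp_max_len

# model_max_length wins over the previously derived value.
assert (max_len_key, derived_max_model_len) == ("model_max_length", 16384)
```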
5 changes: 3 additions & 2 deletions vllm/entrypoints/chat_utils.py
@@ -495,8 +495,9 @@ def _placeholder_str(self, modality: ModalityStr,
            if model_type.startswith("llava"):
                return self._cached_token_str(self._tokenizer,
                                              hf_config.image_token_index)
            if model_type in ("chameleon", "deepseek_vl_v2", "internvl_chat",
                              "skywork_chat", "NVLM_D", "h2ovl_chat"):
            if model_type in ("aya_vision", "chameleon", "deepseek_vl_v2",
                              "internvl_chat", "skywork_chat", "NVLM_D",
                              "h2ovl_chat"):
                return "<image>"
            if model_type == "mllama":
                return "<|image|>"
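A condensed sketch of what the widened branch means in practice (a hypothetical standalone helper, not vLLM's chat parser): image parts in an OpenAI-style message for an `aya_vision` model are now mapped to the literal `<image>` placeholder.

```python
def placeholder_str(model_type: str) -> str:
    # Mirrors the branch above for the model types relevant here.
    if model_type in ("aya_vision", "chameleon", "deepseek_vl_v2",
                      "internvl_chat", "skywork_chat", "NVLM_D", "h2ovl_chat"):
        return "<image>"
    if model_type == "mllama":
        return "<|image|>"
    raise ValueError(f"no image placeholder known for {model_type!r}")

content = [
    {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
    {"type": "text", "text": "What animal is this?"},
]
prompt_text = "".join(
    placeholder_str("aya_vision") if part["type"] == "image_url" else part["text"]
    for part in content
)
assert prompt_text == "<image>What animal is this?"
```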