Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support different image sizes in prefill in VLMs #2065

Merged
merged 1 commit into from
Jun 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "eos_token",
"generated_tokens": 8,
"prefill": [],
"seed": null,
"tokens": [
{
"id": 2502,
"logprob": -1.734375,
"special": false,
"text": "image"
},
{
"id": 2196,
"logprob": -0.5756836,
"special": false,
"text": " result"
},
{
"id": 604,
"logprob": -0.007843018,
"special": false,
"text": " for"
},
{
"id": 12254,
"logprob": -1.7167969,
"special": false,
"text": " chicken"
},
{
"id": 611,
"logprob": -0.17053223,
"special": false,
"text": " on"
},
{
"id": 573,
"logprob": -0.7626953,
"special": false,
"text": " the"
},
{
"id": 8318,
"logprob": -0.02709961,
"special": false,
"text": " beach"
},
{
"id": 1,
"logprob": -0.20739746,
"special": true,
"text": "<eos>"
}
],
"top_tokens": null
},
"generated_text": "image result for chicken on the beach"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "eos_token",
"generated_tokens": 12,
"prefill": [],
"seed": null,
"tokens": [
{
"id": 450,
"logprob": -0.26342773,
"special": false,
"text": " The"
},
{
"id": 21282,
"logprob": -0.01838684,
"special": false,
"text": " cow"
},
{
"id": 322,
"logprob": -0.18041992,
"special": false,
"text": " and"
},
{
"id": 521,
"logprob": -0.62841797,
"special": false,
"text": " ch"
},
{
"id": 21475,
"logprob": -0.0037956238,
"special": false,
"text": "icken"
},
{
"id": 526,
"logprob": -0.018737793,
"special": false,
"text": " are"
},
{
"id": 373,
"logprob": -1.0820312,
"special": false,
"text": " on"
},
{
"id": 263,
"logprob": -0.5083008,
"special": false,
"text": " a"
},
{
"id": 25695,
"logprob": -0.07128906,
"special": false,
"text": " beach"
},
{
"id": 29889,
"logprob": -0.12573242,
"special": false,
"text": "."
},
{
"id": 32002,
"logprob": -0.0029792786,
"special": true,
"text": "<end_of_utterance>"
},
{
"id": 2,
"logprob": -0.00024962425,
"special": true,
"text": "</s>"
}
],
"top_tokens": null
},
"generated_text": " The cow and chicken are on a beach."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 20,
"prefill": [],
"seed": null,
"tokens": [
{
"id": 415,
"logprob": -0.04421997,
"special": false,
"text": " The"
},
{
"id": 12072,
"logprob": -0.13500977,
"special": false,
"text": " cow"
},
{
"id": 349,
"logprob": -0.06750488,
"special": false,
"text": " is"
},
{
"id": 6328,
"logprob": -0.6352539,
"special": false,
"text": " standing"
},
{
"id": 356,
"logprob": -0.16186523,
"special": false,
"text": " on"
},
{
"id": 272,
"logprob": -0.5078125,
"special": false,
"text": " the"
},
{
"id": 10305,
"logprob": -0.017913818,
"special": false,
"text": " beach"
},
{
"id": 304,
"logprob": -1.5205078,
"special": false,
"text": " and"
},
{
"id": 272,
"logprob": -0.029174805,
"special": false,
"text": " the"
},
{
"id": 13088,
"logprob": -0.003479004,
"special": false,
"text": " chicken"
},
{
"id": 349,
"logprob": -0.0035095215,
"special": false,
"text": " is"
},
{
"id": 6398,
"logprob": -0.3088379,
"special": false,
"text": " sitting"
},
{
"id": 356,
"logprob": -0.027755737,
"special": false,
"text": " on"
},
{
"id": 264,
"logprob": -0.31884766,
"special": false,
"text": " a"
},
{
"id": 17972,
"logprob": -0.047943115,
"special": false,
"text": " pile"
},
{
"id": 302,
"logprob": -0.0002925396,
"special": false,
"text": " of"
},
{
"id": 2445,
"logprob": -0.02935791,
"special": false,
"text": " money"
},
{
"id": 28723,
"logprob": -0.031219482,
"special": false,
"text": "."
},
{
"id": 32002,
"logprob": -0.00034475327,
"special": true,
"text": "<end_of_utterance>"
},
{
"id": 2,
"logprob": -1.1920929e-07,
"special": true,
"text": "</s>"
}
],
"top_tokens": null
},
"generated_text": " The cow is standing on the beach and the chicken is sitting on a pile of money."
}
23 changes: 23 additions & 0 deletions integration-tests/models/test_flash_pali_gemma.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ async def flash_pali_gemma(flash_pali_gemma_handle):
return flash_pali_gemma_handle.client


def get_chicken():
with open("integration-tests/images/chicken_on_money.png", "rb") as image_file:
encoded_string = base64.b64encode(image_file.read())
return f"data:image/png;base64,{encoded_string.decode('utf-8')}"


def get_cow_beach():
with open("integration-tests/images/cow_beach.png", "rb") as image_file:
encoded_string = base64.b64encode(image_file.read())
Expand All @@ -37,3 +43,20 @@ async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot):

assert response.generated_text == "beach"
assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_pali_gemma_two_images(flash_pali_gemma, response_snapshot):
chicken = get_chicken()
cow_beach = get_cow_beach()
response = await flash_pali_gemma.generate(
f"caption![]({chicken})![]({cow_beach})\n",
max_new_tokens=20,
)
# Is PaliGemma not able to handle two separate images? At least we
# get output showing that both images are used.
assert (
response.generated_text == "image result for chicken on the beach"
), f"{repr(response.generated_text)}"
Comment on lines +53 to +61
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ha this is interesting!

I wonder if we should identify and throw an error for models that limit the number of images in the future?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sounds like a useful validation. Not sure which models are supposed to handle multiple images in one prompt.

assert response == response_snapshot
21 changes: 21 additions & 0 deletions integration-tests/models/test_idefics.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ def get_chicken():
return f"data:image/png;base64,{encoded_string.decode('utf-8')}"


def get_cow_beach():
with open("integration-tests/images/cow_beach.png", "rb") as image_file:
encoded_string = base64.b64encode(image_file.read())
return f"data:image/png;base64,{encoded_string.decode('utf-8')}"


@pytest.mark.asyncio
async def test_idefics(idefics, response_snapshot):
chicken = get_chicken()
Expand All @@ -39,6 +45,21 @@ async def test_idefics(idefics, response_snapshot):
assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_idefics_two_images(idefics, response_snapshot):
chicken = get_chicken()
cow_beach = get_cow_beach()
response = await idefics.generate(
f"User:![]({chicken})![]({cow_beach})Where are the cow and chicken?<end_of_utterance> \nAssistant:",
max_new_tokens=20,
)
assert (
response.generated_text == " The cow and chicken are on a beach."
), f"{repr(response.generated_text)}"
assert response == response_snapshot


@pytest.mark.asyncio
async def test_idefics_load(idefics, generate_load, response_snapshot):
chicken = get_chicken()
Expand Down
23 changes: 23 additions & 0 deletions integration-tests/models/test_idefics2.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ def get_chicken():
return f"data:image/png;base64,{encoded_string.decode('utf-8')}"


def get_cow_beach():
with open("integration-tests/images/cow_beach.png", "rb") as image_file:
encoded_string = base64.b64encode(image_file.read())
return f"data:image/png;base64,{encoded_string.decode('utf-8')}"


@pytest.fixture(scope="module")
def flash_idefics2_next_handle(launcher):
with launcher(
Expand Down Expand Up @@ -38,6 +44,23 @@ async def test_flash_idefics2_next_simple(flash_idefics2_next, response_snapshot
assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_idefics2_two_images(flash_idefics2_next, response_snapshot):
chicken = get_chicken()
cow_beach = get_cow_beach()
response = await flash_idefics2_next.generate(
f"User:![]({chicken})![]({cow_beach})Where are the cow and chicken?<end_of_utterance> \nAssistant:",
max_new_tokens=20,
)
assert (
response.generated_text
== " The cow is standing on the beach and the chicken is sitting on a pile of money."
), f"{repr(response.generated_text)}"
assert response.details.generated_tokens == 20
assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_idefics2_next_all_params(flash_idefics2_next, response_snapshot):
Expand Down
Loading
Loading