2 changes: 2 additions & 0 deletions tests/conftest.py
@@ -559,6 +559,7 @@ def _make_metadata(
state_columns: list[str] = ["foo"],
path_to_save: Path = Path("test.jsonl"),
tools: list[Tool] | None = None,
save_image_mode: str = "base64",
) -> GenerateMetadata:
if version_info is None:
version_info = {
@@ -584,6 +585,7 @@ def _make_metadata(
state_columns=state_columns,
path_to_save=path_to_save,
tools=tools,
save_image_mode=save_image_mode,
)

return _make_metadata
90 changes: 90 additions & 0 deletions tests/test_environment_extra.py
@@ -313,6 +313,70 @@ async def test_generate_inside_running_loop(mock_client, make_dummy_env, make_input
assert states[0].get("completion") is not None


@pytest.mark.asyncio
async def test_generate_uses_env_image_mode_setting_for_https_image_urls(
mock_openai_client, make_dummy_env, make_input
):
env = make_dummy_env(mock_openai_client)
env.set_kwargs(image_mode="placeholder")
image_prompt: vf.Messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "describe this image"},
{
"type": "image_url",
"image_url": {"url": "https://example.com/sample.png"},
},
],
}
]

outputs = await env.generate(
[make_input(example_id=0, prompt=image_prompt)],
client=mock_openai_client,
model="test-model",
)

prompt = outputs["outputs"][0]["prompt"]
assert isinstance(prompt, list)
assert prompt[0]["content"] == "describe this image\n\n[image]"
assert "images" not in prompt[0]


@pytest.mark.asyncio
async def test_generate_explicit_image_mode_overrides_env_setting(
mock_openai_client, make_dummy_env, make_input
):
env = make_dummy_env(mock_openai_client)
env.set_kwargs(image_mode="placeholder")
image_prompt: vf.Messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "describe this image"},
{
"type": "image_url",
"image_url": {"url": "data:image/png;base64,QUJDRA=="},
},
],
}
]

outputs = await env.generate(
[make_input(example_id=0, prompt=image_prompt)],
client=mock_openai_client,
model="test-model",
image_mode="base64",
)

prompt = outputs["outputs"][0]["prompt"]
assert isinstance(prompt, list)
assert prompt[0]["content"] == "describe this image\n\n[image]"
assert prompt[0]["images"][0]["media_type"] == "image/png"
assert prompt[0]["images"][0]["base64"] == "QUJDRA=="


@pytest.mark.asyncio
async def test_generate_grouped_scoring_distributes_per_group(
mock_client, make_dummy_env, make_input
@@ -329,6 +393,8 @@ async def run_group(
sampling_args,
max_retries,
state_columns,
image_mode="base64",
max_image_base64_chars=None,
):
assert isinstance(client_config, ClientConfig)
self.client_urls_per_group.append(str(client_config.api_base_url))
@@ -424,6 +490,8 @@ async def run_group(
sampling_args,
max_retries,
state_columns,
image_mode="base64",
max_image_base64_chars=None,
):
assert isinstance(client_config, ClientConfig)
self.client_url = str(client_config.api_base_url)
@@ -483,6 +551,8 @@ async def run_rollout(
sampling_args,
max_retries,
state_columns,
image_mode="base64",
max_image_base64_chars=None,
):
assert isinstance(client_config, ClientConfig)
self.client_url = str(client_config.api_base_url)
@@ -635,6 +705,26 @@ def model_dump(self, **kwargs):
assert isinstance(sanitized[0]["tool_calls"][0], str)


def test_sanitize_tool_calls_preserves_serialized_strings_and_extra_fields():
serialized_tool_call = (
'{"id":"x","type":"function","function":{"name":"echo","arguments":"{}"}}'
)
msgs = [
{
"role": "assistant",
"content": "",
"tool_calls": [serialized_tool_call],
"images": [{"media_type": "image/png", "base64": "QUJDRA=="}],
"custom_field": "kept",
}
]

sanitized = sanitize_tool_calls(msgs)
assert sanitized[0]["tool_calls"][0] == serialized_tool_call
assert sanitized[0]["images"] == [{"media_type": "image/png", "base64": "QUJDRA=="}]
assert sanitized[0]["custom_field"] == "kept"
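
A minimal sketch of the contract the two `sanitize_tool_calls` tests pin down, assuming the helper copies each message, JSON-serializes only non-string `tool_calls` entries, and leaves every other field untouched; the function name and dict-copy approach here are illustrative assumptions, not the library's internals:

```python
import json


def sanitize_tool_calls_sketch(messages: list[dict]) -> list[dict]:
    """Illustrative stand-in for sanitize_tool_calls, inferred from the
    assertions above (assumption, not the actual implementation)."""
    sanitized = []
    for msg in messages:
        out = dict(msg)  # shallow copy keeps extras like "images" and "custom_field"
        if "tool_calls" in out:
            out["tool_calls"] = [
                # already-serialized strings pass through byte-for-byte;
                # structured tool calls get dumped to JSON strings
                tc if isinstance(tc, str) else json.dumps(tc)
                for tc in out["tool_calls"]
            ]
        sanitized.append(out)
    return sanitized
```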


def test_make_dataset_basic_without_tools(make_metadata, make_output):
results = GenerateOutputs(outputs=[make_output()], metadata=make_metadata())
ds = build_dataset(results)
116 changes: 115 additions & 1 deletion tests/test_eval_cli.py
@@ -1,4 +1,5 @@
import argparse
import json
import os
import tempfile
import time
@@ -11,7 +12,7 @@
import verifiers.utils.eval_utils
from verifiers.types import GenerateOutputs
from verifiers.utils.eval_utils import load_toml_config
from verifiers.utils.save_utils import states_to_outputs
from verifiers.utils.save_utils import save_metadata, save_outputs, states_to_outputs


@pytest.fixture
@@ -21,6 +22,7 @@ def _run_cli(
overrides,
capture_all_configs: bool = False,
endpoints: dict | None = None,
run_evaluation_impl=None,
):
"""Run CLI with mocked arguments and capture config(s).

@@ -49,6 +51,7 @@
"no_interleave_scoring": False,
"state_columns": [],
"save_results": False,
"save_image_mode": "base64",
"resume": None,
"save_every": -1,
"save_to_hf_hub": False,
@@ -73,6 +76,12 @@
monkeypatch.setattr(vf_eval, "load_endpoints", lambda *_: endpoints or {})

async def fake_run_evaluation(config, **kwargs):
if run_evaluation_impl is not None:
result = await run_evaluation_impl(config, **kwargs)
captured["sampling_args"] = dict(config.sampling_args)
captured["configs"].append(config)
return result

captured["sampling_args"] = dict(config.sampling_args)
captured["configs"].append(config)
_make_metadata = make_metadata
@@ -858,3 +867,108 @@ def test_cli_toml_resume_false_disables_global_resume(monkeypatch, run_cli):
assert configs[0].resume_path is None
assert configs[1].env_id == "env-b"
assert configs[1].resume_path is None


def test_cli_save_dataset_with_base64_images(
monkeypatch, run_cli, make_metadata, make_state, tmp_path: Path
):
saved_results_path: Path | None = None

async def fake_run_evaluation(config, **kwargs):
nonlocal saved_results_path
state = make_state(
prompt=[
{
"role": "user",
"content": [
{"type": "text", "text": "question"},
{
"type": "image_url",
"image_url": {"url": "data:image/png;base64,QUJDRA=="},
},
],
}
],
completion=[{"role": "assistant", "content": "ok"}],
reward=1.0,
)

outputs = states_to_outputs(
[state],
image_mode=config.save_image_mode,
max_image_base64_chars=config.max_image_base64_chars,
)
saved_results_path = tmp_path / "results"
metadata = make_metadata(
env_id=config.env_id,
model=config.model,
sampling_args=config.sampling_args,
num_examples=config.num_examples,
rollouts_per_example=config.rollouts_per_example,
path_to_save=saved_results_path,
save_image_mode=config.save_image_mode,
)
if config.save_results:
save_outputs(outputs, saved_results_path)
save_metadata(metadata, saved_results_path)
return GenerateOutputs(outputs=outputs, metadata=metadata)

run_cli(
monkeypatch,
{
"save_results": True,
"save_image_mode": "base64",
"debug": True,
},
run_evaluation_impl=fake_run_evaluation,
)

assert saved_results_path is not None
results_file = saved_results_path / "results.jsonl"
assert results_file.exists()
row = json.loads(results_file.read_text(encoding="utf-8").splitlines()[0])
assert row["prompt"][0]["content"] == "question\n\n[image]"
assert row["prompt"][0]["images"][0]["media_type"] == "image/png"
assert row["prompt"][0]["images"][0]["base64"] == "QUJDRA=="


def test_cli_save_dataset_base64_limit_enforced(
monkeypatch, run_cli, make_metadata, make_state
):
monkeypatch.setattr(vf_eval, "MAX_IMAGE_BASE64_CHARS", 4)

async def fake_run_evaluation(config, **kwargs):
state = make_state(
prompt=[
{
"role": "user",
"content": [
{"type": "text", "text": "question"},
{
"type": "image_url",
"image_url": {"url": "data:image/png;base64,QUJDRA=="},
},
],
}
],
completion=[{"role": "assistant", "content": "ok"}],
reward=1.0,
)
outputs = states_to_outputs(
[state],
image_mode=config.save_image_mode,
max_image_base64_chars=config.max_image_base64_chars,
)
metadata = make_metadata(save_image_mode=config.save_image_mode)
return GenerateOutputs(outputs=outputs, metadata=metadata)

with pytest.raises(ValueError, match="exceeds max_image_base64_chars"):
run_cli(
monkeypatch,
{
"save_results": True,
"save_image_mode": "base64",
"debug": True,
},
run_evaluation_impl=fake_run_evaluation,
)
37 changes: 37 additions & 0 deletions tests/test_message_utils_audio.py
@@ -1,5 +1,6 @@
# tests/test_message_utils_audio.py
from verifiers.utils.message_utils import (
ImageMode,
message_to_printable,
messages_to_printable,
)
@@ -108,3 +109,39 @@ def format_prompt(example):
"type": "image_url",
"image_url": {"url": "data:image/png;base64,abc123"},
}


def test_message_to_printable_base64_mode_extracts_images():
msg = {
"role": "user",
"content": [
{"type": "text", "text": "question"},
{
"type": "image_url",
"image_url": {"url": "data:image/png;base64,QUJDRA=="},
},
],
}

out = message_to_printable(msg, image_mode=ImageMode.BASE64)
assert out["content"] == "question\n\n[image]"
assert out["images"][0]["media_type"] == "image/png"
assert out["images"][0]["base64"] == "QUJDRA=="
assert out["images"][0]["base64_chars"] == 8


def test_message_to_printable_base64_mode_enforces_limit():
msg = {
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": "data:image/png;base64,QUJDRA=="},
}
],
}

import pytest

with pytest.raises(ValueError, match="exceeds max_image_base64_chars"):
message_to_printable(msg, image_mode="base64", max_image_base64_chars=4)
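
A self-contained sketch of the behavior these two tests encode, assuming `message_to_printable` parses data URLs of the form `data:<media_type>;base64,<payload>`; the parsing logic and output structure are inferred from the assertions, not from the verifiers source:

```python
def printable_sketch(msg: dict, max_image_base64_chars: int | None = None) -> dict:
    """Hypothetical reconstruction of base64-mode printing (assumption)."""
    texts, images = [], []
    for part in msg["content"]:
        if part.get("type") == "text":
            texts.append(part["text"])
        elif part.get("type") == "image_url":
            url = part["image_url"]["url"]
            header, payload = url.split(",", 1)  # e.g. "data:image/png;base64"
            media_type = header.removeprefix("data:").split(";", 1)[0]
            if max_image_base64_chars is not None and len(payload) > max_image_base64_chars:
                raise ValueError(
                    f"image payload of {len(payload)} chars "
                    f"exceeds max_image_base64_chars={max_image_base64_chars}"
                )
            texts.append("[image]")  # placeholder replaces the inline image
            images.append(
                {"media_type": media_type, "base64": payload, "base64_chars": len(payload)}
            )
    out = {"role": msg["role"], "content": "\n\n".join(texts)}
    if images:
        out["images"] = images
    return out
```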