Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/inference_endpoint/metrics/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -1075,7 +1075,12 @@ def derive_TPOT(
output_sequence, reasoning_sequence = output_sequence_from_data(
data_bytes, join_chunks=False
)
if isinstance(output_sequence, str):
output_sequence = [output_sequence]
Comment on lines +1078 to +1079
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same string-wrapping logic should be applied to reasoning_sequence. When output_sequence_from_data is called with join_chunks=False and the data dict contains a string "reasoning" value, reasoning_sequence will also be a string. The isinstance(reasoning_sequence, list) check on line 1085 will silently drop it, causing reasoning tokens to be excluded from the TPOT calculation. Add a similar isinstance(reasoning_sequence, str) check to wrap it in a list, consistent with the fix for output_sequence.

Copilot uses AI. Check for mistakes.
Comment on lines +1078 to +1079
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This bug fix lacks a unit test covering the string response case. The existing test_derive_tpot (line 57 in tests/unit/metrics/test_reporter.py) only uses list-based outputs from fake_outputs. A test should be added that stores a plain string (e.g., orjson.dumps("Hello, world")) as the COMPLETE event data to verify that derive_TPOT handles it gracefully without crashing. Since the string case has only one chunk and will be skipped at the len(all_chunks) < 2 check, the test should verify that derive_TPOT returns None (or skips that sample) rather than raising an error.

Copilot uses AI. Check for mistakes.
if not isinstance(output_sequence, list):
logging.warning(
f"Output sequence for sample {sample_uuid} is not a list but {type(output_sequence)}: {output_sequence}"
)
continue

all_chunks = output_sequence
Expand Down
82 changes: 82 additions & 0 deletions tests/unit/metrics/test_reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,88 @@ def test_derive_tpot(events_db, sample_uuids, fake_outputs, tokenizer):
assert all(tpot == expected_tpot2 for tpot in tpot2)


def test_derive_tpot_with_string_output(tmp_path, sample_uuids, tokenizer):
"""Test that derive_TPOT handles a plain string output gracefully.

A single-string output has only one chunk, so TPOT cannot be computed.
The reporter should not raise an exception and should return None.
"""
test_db = str(tmp_path / "test_string_output.db")
uuid1 = sample_uuids(1)

with sqlite3_cursor(test_db) as (cursor, conn):
cursor.execute(
"CREATE TABLE IF NOT EXISTS events (sample_uuid VARCHAR(32), event_type VARCHAR(32), timestamp_ns INTEGER, data BLOB)"
)
cursor.executemany(
"INSERT INTO events (sample_uuid, event_type, timestamp_ns, data) VALUES (?, ?, ?, ?)",
[
("", SessionEvent.TEST_STARTED.value, 5000, b""),
(uuid1, SessionEvent.LOADGEN_ISSUE_CALLED.value, 10000, b""),
(uuid1, SampleEvent.FIRST_CHUNK.value, 10010, b""),
(
uuid1,
SampleEvent.COMPLETE.value,
10211,
orjson.dumps({"output": "the final answer"}),
),
("", SessionEvent.TEST_ENDED.value, 10300, b""),
],
)
conn.commit()

with MetricsReporter(test_db) as reporter:
tpot_rows = reporter.derive_TPOT(tokenizer)

# A single-string output produces only 1 chunk — TPOT requires at least 2
assert tpot_rows is None


def test_derive_tpot_string_output_with_list_reasoning(
tmp_path, sample_uuids, tokenizer
):
"""Test that derive_TPOT computes TPOT when string output is paired with a list reasoning sequence.

The fix wraps string outputs into a single-element list so they can be combined with
reasoning chunks. Without the fix, the string output causes the sample to be silently
skipped before reasoning is considered, so TPOT returns None even though there are
enough chunks (output + reasoning) to compute it.
"""
test_db = str(tmp_path / "test_string_output_with_reasoning.db")
uuid1 = sample_uuids(1)

with sqlite3_cursor(test_db) as (cursor, conn):
cursor.execute(
"CREATE TABLE IF NOT EXISTS events (sample_uuid VARCHAR(32), event_type VARCHAR(32), timestamp_ns INTEGER, data BLOB)"
)
cursor.executemany(
"INSERT INTO events (sample_uuid, event_type, timestamp_ns, data) VALUES (?, ?, ?, ?)",
[
("", SessionEvent.TEST_STARTED.value, 5000, b""),
(uuid1, SessionEvent.LOADGEN_ISSUE_CALLED.value, 10000, b""),
(uuid1, SampleEvent.FIRST_CHUNK.value, 10010, b""),
(
uuid1,
SampleEvent.COMPLETE.value,
10211,
orjson.dumps(
{"output": "the answer", "reasoning": ["thought step"]}
),
),
("", SessionEvent.TEST_ENDED.value, 10300, b""),
],
)
conn.commit()

with MetricsReporter(test_db) as reporter:
tpot_rows = reporter.derive_TPOT(tokenizer)

# String output ("the answer") + list reasoning (["thought step"]) = 2 chunks total,
# which is enough for TPOT computation.
assert tpot_rows is not None
assert len(tpot_rows) == 1


def test_derive_sample_latency(events_db, sample_uuids):
uuid1 = sample_uuids(1)
uuid2 = sample_uuids(2)
Expand Down