
Commit 7ae013e

vertex-sdk-bot authored and copybara-github committed

feat: GenAI Client(evals) - Register hallucination and tool_usage_quality metrics for agent eval

PiperOrigin-RevId: 821797229

1 parent e2aa3eb · commit 7ae013e

File tree

4 files changed: +89 −5 lines changed

tests/unit/vertexai/genai/test_evals.py

Lines changed: 22 additions & 0 deletions

@@ -4266,6 +4266,28 @@ def test_execute_evaluation_llm_metric(
         call_args = mock_eval_dependencies["mock_evaluate_instances"].call_args
         assert "pointwise_metric_input" in call_args[1]["metric_config"]

+    def test_execute_evaluation_hallucination_metric(self, mock_api_client_fixture):
+        dataset_df = pd.DataFrame(
+            [{"prompt": "Test prompt", "response": "Test response"}]
+        )
+        input_dataset = vertexai_genai_types.EvaluationDataset(
+            eval_dataset_df=dataset_df
+        )
+
+        result = _evals_common._execute_evaluation(
+            api_client=mock_api_client_fixture,
+            dataset=input_dataset,
+            metrics=[
+                vertexai_genai_types.RubricMetric.HALLUCINATION,
+                vertexai_genai_types.RubricMetric.TOOL_USE_QUALITY,
+            ],
+        )
+        assert isinstance(result, vertexai_genai_types.EvaluationResult)
+        assert result.evaluation_dataset == [input_dataset]
+        assert len(result.summary_metrics) == 2
+        assert result.summary_metrics[0].metric_name == "hallucination_v1"
+        assert result.summary_metrics[1].metric_name == "tool_use_quality_v1"
+
     @mock.patch.object(_evals_data_converters, "get_dataset_converter")
     def test_execute_evaluation_with_openai_schema(
         self,
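The test drives the private _evals_common._execute_evaluation helper directly. For orientation, a minimal public-facing sketch of the same call might look like the following; this is an illustration assuming the vertexai.Client(...).evals.evaluate entry point and a configured GCP project, not code from this commit:

import pandas as pd

import vertexai
from vertexai import types

# Hypothetical project/location; substitute your own.
client = vertexai.Client(project="my-project", location="us-central1")

dataset = types.EvaluationDataset(
    eval_dataset_df=pd.DataFrame(
        [{"prompt": "Test prompt", "response": "Test response"}]
    )
)

result = client.evals.evaluate(
    dataset=dataset,
    metrics=[
        types.RubricMetric.HALLUCINATION,     # resolves to "hallucination_v1"
        types.RubricMetric.TOOL_USE_QUALITY,  # resolves to "tool_use_quality_v1"
    ],
)

# Per the test's assertions, one summary entry is produced per metric.
for summary in result.summary_metrics:
    print(summary.metric_name)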

vertexai/_genai/_evals_constant.py

Lines changed: 2 additions & 0 deletions

@@ -26,6 +26,8 @@
         "final_response_match_v2",
         "final_response_reference_free_v1",
         "final_response_quality_v1",
+        "hallucination_v1",
+        "tool_use_quality_v1",
     }
 )
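Membership in this frozenset is what registers "hallucination_v1" and "tool_use_quality_v1" as recognized agent-eval metric names. As a toy illustration of that gating pattern (the set and validator names below are hypothetical, not the SDK's):

# Hypothetical sketch of a name-registry gate like the frozenset above;
# the SDK's actual validation lives elsewhere and may differ.
SUPPORTED_AGENT_METRICS = frozenset(
    {
        "final_response_quality_v1",
        "hallucination_v1",
        "tool_use_quality_v1",
    }
)

def require_supported_metric(name: str) -> str:
    if name not in SUPPORTED_AGENT_METRICS:
        raise ValueError(f"Unsupported agent metric: {name!r}")
    return name

require_supported_metric("hallucination_v1")  # passes; unknown names raise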

vertexai/_genai/_evals_utils.py

Lines changed: 8 additions & 0 deletions

@@ -595,6 +595,14 @@ def MULTI_TURN_SAFETY(self) -> LazyLoadedPrebuiltMetric:
     def FINAL_RESPONSE_QUALITY(self) -> LazyLoadedPrebuiltMetric:
         return self.__getattr__("FINAL_RESPONSE_QUALITY")

+    @property
+    def HALLUCINATION(self) -> LazyLoadedPrebuiltMetric:
+        return self.__getattr__("HALLUCINATION")
+
+    @property
+    def TOOL_USE_QUALITY(self) -> LazyLoadedPrebuiltMetric:
+        return self.__getattr__("TOOL_USE_QUALITY")
+

 PrebuiltMetric = PrebuiltMetricLoader()
 RubricMetric = PrebuiltMetric
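Each new property simply defers to __getattr__, the hook Python invokes only when normal attribute lookup fails; the explicit @property wrappers presumably exist so the metric names are discoverable and type-annotated while resolution stays centralized in one place. A minimal sketch of that lazy-resolution shape, with the cache and fetch step as illustrative stand-ins for whatever PrebuiltMetricLoader actually does:

# Sketch of the lazy-attribute pattern suggested by PrebuiltMetricLoader.
# _fetch is a stand-in for the SDK's real (versioned) metric resolution.
class LazyMetricLoader:
    def __init__(self):
        self._cache: dict[str, str] = {}

    def __getattr__(self, name: str):
        # Invoked only for attributes not found normally; resolve and cache.
        if name not in self._cache:
            self._cache[name] = self._fetch(name.lower())
        return self._cache[name]

    def _fetch(self, metric_name: str) -> str:
        # Stand-in for the registry/network lookup the SDK performs.
        return f"<prebuilt metric: {metric_name}>"

metrics = LazyMetricLoader()
print(metrics.HALLUCINATION)     # <prebuilt metric: hallucination>
print(metrics.TOOL_USE_QUALITY)  # <prebuilt metric: tool_use_quality>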

vertexai/_genai/_evals_visualization.py

Lines changed: 57 additions & 5 deletions

@@ -450,22 +450,74 @@ def _get_evaluation_html(eval_result_json: str) -> str:
         const candidateMetrics = (caseResult.response_candidate_results && caseResult.response_candidate_results[0] && caseResult.response_candidate_results[0].metric_results) || {{}};
         Object.entries(candidateMetrics).forEach(([name, val]) => {{
             let metricNameCell = name;
-            if (val.rubric_verdicts && val.rubric_verdicts.length > 0) {{
-                metricNameCell += '<div class="rubric-bubble-container" style="margin-top: 8px;">';
+            let explanationHandled = false;
+            let bubbles = '';
+
+            if (name.startsWith('hallucination') && val.explanation) {{
+                try {{
+                    const explanationData = JSON.parse(val.explanation);
+                    if (Array.isArray(explanationData) && explanationData.length > 0 && explanationData[0].sentence) {{
+                        bubbles += '<div class="rubric-bubble-container" style="margin-top: 8px;">';
+                        explanationData.forEach(item => {{
+                            const sentence = item.sentence || 'N/A';
+                            const label = item.label ? item.label.toLowerCase() : '';
+                            const verdictText = label === 'no_rad' ? '<span class="pass">Pass</span>' : '<span class="fail">Fail</span>';
+                            const rationale = item.rationale || 'N/A';
+                            const itemJson = JSON.stringify(item, null, 2);
+                            bubbles += `
+                                <details class="rubric-details">
+                                    <summary class="rubric-bubble">${{verdictText}}: ${{DOMPurify.sanitize(sentence)}}</summary>
+                                    <div class="explanation" style="padding: 10px 0 0 20px;">${{DOMPurify.sanitize(rationale)}}</div>
+                                    <pre class="raw-json-container">${{DOMPurify.sanitize(itemJson)}}</pre>
+                                </details>`;
+                        }});
+                        bubbles += '</div>';
+                        explanationHandled = true;
+                    }}
+                }} catch (e) {{
+                    console.error("Failed to parse hallucination explanation:", e);
+                }}
+            }} else if (name.startsWith('safety') && val.score != null) {{
+                try {{
+                    bubbles += '<div class="rubric-bubble-container" style="margin-top: 8px;">';
+                    const verdictText = val.score >= 1.0 ? '<span class="pass">Pass</span>' : '<span class="fail">Fail</span>';
+                    const explanation = val.explanation || (val.score >= 1.0 ? 'Safety check passed' : 'Safety check failed');
+                    const itemJson = JSON.stringify(val, null, 2);
+                    bubbles += `
+                        <details class="rubric-details">
+                            <summary class="rubric-bubble">${{verdictText}}: ${{DOMPurify.sanitize(explanation)}}</summary>
+                            <pre class="raw-json-container">${{DOMPurify.sanitize(itemJson)}}</pre>
+                        </details>`;
+                    bubbles += '</div>';
+                    explanationHandled = true;
+                }} catch (e) {{
+                    console.error("Failed to process safety metric:", e);
+                }}
+            }}
+
+            if (!bubbles && val.rubric_verdicts && val.rubric_verdicts.length > 0) {{
+                bubbles += '<div class="rubric-bubble-container" style="margin-top: 8px;">';
                 val.rubric_verdicts.forEach(verdict => {{
                     const rubricDescription = verdict.evaluated_rubric && verdict.evaluated_rubric.content && verdict.evaluated_rubric.content.property ? verdict.evaluated_rubric.content.property.description : 'N/A';
                     const verdictText = verdict.verdict ? '<span class="pass">Pass</span>' : '<span class="fail">Fail</span>';
                     const verdictJson = JSON.stringify(verdict, null, 2);
-                    metricNameCell += `
+                    bubbles += `
                         <details class="rubric-details">
                             <summary class="rubric-bubble">${{verdictText}}: ${{DOMPurify.sanitize(rubricDescription)}}</summary>
                             <pre class="raw-json-container">${{DOMPurify.sanitize(verdictJson)}}</pre>
                         </details>`;
                 }});
-                metricNameCell += '</div>';
+                bubbles += '</div>';
+            }}
+
+            if (bubbles) {{
+                metricNameCell += bubbles;
             }}
+
             metricTable += `<tr><td>${{metricNameCell}}</td><td><b>${{val.score != null ? val.score.toFixed(2) : 'N/A'}}</b></td></tr>`;
-            if (val.explanation) {{ metricTable += `<tr><td colspan="2"><div class="explanation">${{DOMPurify.sanitize(marked.parse(String(val.explanation)))}}</div></td></tr>`; }}
+            if (val.explanation && !explanationHandled) {{
+                metricTable += `<tr><td colspan="2"><div class="explanation">${{DOMPurify.sanitize(marked.parse(String(val.explanation)))}}</div></td></tr>`;
+            }}
         }});
         card += metricTable + '</tbody></table>';
         container.innerHTML += card + '</details>';
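The hallucination branch above implies a contract for that metric's explanation field: a JSON-encoded array of per-sentence verdicts, each with sentence, label, and rationale keys, where a label of "no_rad" renders as a Pass bubble and anything else as Fail; if parsing fails or the shape does not match, rendering falls back to the generic rubric-verdict and explanation rows. An illustrative payload matching what the JavaScript parses (field names taken from the diff, values invented):

import json

# Invented example of the per-sentence verdict array the visualization expects.
explanation = json.dumps(
    [
        {
            "sentence": "The Eiffel Tower is in Paris.",
            "label": "no_rad",  # rendered as a Pass bubble
            "rationale": "Supported by the provided context.",
        },
        {
            "sentence": "It was completed in 1921.",
            "label": "rad",  # any label other than "no_rad" renders as Fail
            "rationale": "Contradicts the context, which says 1889.",
        },
    ]
)
print(explanation)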
