Commit fa606a6 (parent: 1a03340)

fix(evaluate): evaluate function add evalset (#204)

This change lets evaluate() and build_eval_set() accept an optional in-memory EvalSet in addition to an eval-set file path, across the ADK evaluator, the DeepEval evaluator, and the shared BaseEvaluator.

File tree: 3 files changed, +46 -30 lines


veadk/evaluation/adk_evaluator/adk_evaluator.py

Lines changed: 5 additions & 2 deletions

@@ -24,6 +24,8 @@
 )
 from google.adk.evaluation.eval_case import IntermediateData, Invocation
 from google.adk.evaluation.evaluator import EvalStatus
+from google.adk.evaluation.eval_set import EvalSet
+from typing import Optional
 from typing_extensions import override
 from veadk.evaluation.base_evaluator import BaseEvaluator
 from types import SimpleNamespace

@@ -52,7 +54,8 @@ def __init__(
     @override
     async def evaluate(
         self,
-        eval_set_file_path: str,
+        eval_set: Optional[EvalSet] = None,
+        eval_set_file_path: Optional[str] = None,
         eval_id: str = f"test_{formatted_timestamp()}",
         tool_score_threshold: float = 1.0,
         response_match_score_threshold: float = 0.8,

@@ -104,7 +107,7 @@ async def evaluate(
         # Iterate each test file and evaluate per-case, per-metric
         for test_file in test_files:
             # Build in-memory evaluation cases via BaseEvaluator from the provided file
-            self.build_eval_set(test_file)
+            self.build_eval_set(eval_set, test_file)

         evaluation_result_list = []
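With this change, the ADK evaluator accepts an in-memory EvalSet as an alternative to an eval-set JSON file. A minimal usage sketch (the ADKEvaluator class name, its constructor arguments, and the import path are assumptions for illustration; only the evaluate keyword arguments come from this diff):

    import asyncio

    from google.adk.evaluation.eval_set import EvalSet
    from veadk.evaluation.adk_evaluator.adk_evaluator import ADKEvaluator  # assumed name/path

    my_agent = ...  # stand-in for a real agent instance

    async def main() -> None:
        evaluator = ADKEvaluator(agent=my_agent)  # assumed constructor

        # As before: load and evaluate an eval-set JSON file from disk.
        await evaluator.evaluate(eval_set_file_path="tests/cases.evalset.json")

        # New in this commit: hand over an already-built EvalSet, no file I/O.
        in_memory_set = EvalSet(eval_set_id="demo", eval_cases=[])  # fields assumed from ADK
        await evaluator.evaluate(eval_set=in_memory_set)

    asyncio.run(main())
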
veadk/evaluation/base_evaluator.py

Lines changed: 36 additions & 25 deletions

@@ -17,7 +17,7 @@
 import time
 import uuid
 from abc import abstractmethod
-from typing import Any
+from typing import Any, Optional

 from google.adk import Runner
 from google.adk.evaluation.eval_set import EvalSet

@@ -210,33 +210,43 @@ def _build_eval_set_from_tracing_json(self, tracing_json_path: str) -> EvalSet:

         return evalset

-    def build_eval_set(self, file_path: str):
+    def build_eval_set(
+        self, eval_set: Optional[EvalSet] = None, file_path: Optional[str] = None
+    ):
         """Generate evaluation data from a given file and assign it to the class attribute `invocation_list`."""
-        eval_case_data_list: list[EvalTestCase] = []

-        try:
-            with open(file_path, "r", encoding="utf-8") as f:
-                file_content = json.load(f)
-        except json.JSONDecodeError as e:
-            raise ValueError(f"Invalid JSON format in file {file_path}: {e}")
-        except Exception as e:
-            raise ValueError(f"Error reading file {file_path}: {e}")
-
-        if isinstance(file_content, dict) and "eval_cases" in file_content:
-            eval_cases = self._build_eval_set_from_eval_json(file_path).eval_cases
-        elif (
-            isinstance(file_content, list)
-            and len(file_content) > 0
-            and all(
-                isinstance(span, dict) and "trace_id" in span for span in file_content
-            )
-        ):
-            eval_cases = self._build_eval_set_from_tracing_json(file_path).eval_cases
+        if eval_set is None and file_path is None:
+            raise ValueError("eval_set or file_path is required")
+        if eval_set:
+            eval_cases = eval_set.eval_cases
         else:
-            raise ValueError(
-                f"Unsupported file format in {file_path}. Please provide a valid file."
-            )
+            try:
+                with open(file_path, "r", encoding="utf-8") as f:
+                    file_content = json.load(f)
+            except json.JSONDecodeError as e:
+                raise ValueError(f"Invalid JSON format in file {file_path}: {e}")
+            except Exception as e:
+                raise ValueError(f"Error reading file {file_path}: {e}")
+
+            if isinstance(file_content, dict) and "eval_cases" in file_content:
+                eval_cases = self._build_eval_set_from_eval_json(file_path).eval_cases
+            elif (
+                isinstance(file_content, list)
+                and len(file_content) > 0
+                and all(
+                    isinstance(span, dict) and "trace_id" in span
+                    for span in file_content
+                )
+            ):
+                eval_cases = self._build_eval_set_from_tracing_json(
+                    file_path
+                ).eval_cases
+            else:
+                raise ValueError(
+                    f"Unsupported file format in {file_path}. Please provide a valid file."
+                )

+        eval_case_data_list: list[EvalTestCase] = []
         for eval_case in eval_cases:
             eval_case_data = EvalTestCase(invocations=[])
             if eval_case.session_input:

@@ -384,8 +394,9 @@ def get_eval_set_information(self) -> list[list[dict[str, Any]]]:
     @abstractmethod
     async def evaluate(
         self,
-        eval_set_file_path: str,
         metrics: list[Any],
+        eval_set: Optional[EvalSet],
+        eval_set_file_path: Optional[str],
         eval_id: str,
     ):
         """An abstract method for evaluation based on metrics。"""

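The heart of the change is the dispatch at the top of build_eval_set: an explicitly passed eval_set wins, a file path is the fallback, and passing neither raises. A standalone sketch of that precedence, simplified to plain dicts so it runs on its own (the real method builds ADK EvalCase objects and fills invocation_list):

    import json
    from typing import Any, Optional

    def resolve_eval_cases(
        eval_set: Optional[dict] = None, file_path: Optional[str] = None
    ) -> list[Any]:
        if eval_set is None and file_path is None:
            raise ValueError("eval_set or file_path is required")
        if eval_set:
            # An in-memory set wins; no file I/O happens at all.
            return eval_set["eval_cases"]
        with open(file_path, "r", encoding="utf-8") as f:
            content = json.load(f)
        if isinstance(content, dict) and "eval_cases" in content:
            return content["eval_cases"]  # ADK eval-set JSON layout
        if isinstance(content, list) and content and all(
            isinstance(span, dict) and "trace_id" in span for span in content
        ):
            return content  # tracing-span export layout
        raise ValueError(f"Unsupported file format in {file_path}.")

    # When both arguments are supplied, the file path is ignored:
    assert resolve_eval_cases({"eval_cases": [1]}, "ignored.json") == [1]

That both-supplied behavior is what adk_evaluator.py relies on above: it calls self.build_eval_set(eval_set, test_file) inside its per-file loop, so a provided eval_set short-circuits the file parsing on every iteration.
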
veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py

Lines changed: 5 additions & 3 deletions

@@ -22,7 +22,8 @@
 from deepeval.test_case import LLMTestCase
 from deepeval.test_case.llm_test_case import ToolCall
 from typing_extensions import override
-
+from typing import Optional
+from google.adk.evaluation.eval_set import EvalSet
 from veadk.config import getenv
 from veadk.evaluation.base_evaluator import BaseEvaluator, EvalResultData, MetricResult
 from veadk.evaluation.types import EvalResultCaseData, EvalResultMetadata

@@ -77,13 +78,14 @@ def __init__(
     @override
     async def evaluate(
         self,
-        eval_set_file_path: str,
         metrics: list[BaseMetric],
+        eval_set: Optional[EvalSet] = None,
+        eval_set_file_path: Optional[str] = None,
         eval_id: str = f"test_{formatted_timestamp()}",
     ):
         """Target to Google ADK, we will use the same evaluation case format as Google ADK."""
         # Get evaluation data by parsing eval set file
-        self.build_eval_set(eval_set_file_path)
+        self.build_eval_set(eval_set, eval_set_file_path)

         # Get actual data by running agent
         logger.info("Start to run agent for actual data.")
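Because eval_set_file_path now sits behind metrics in this evaluate signature, a caller that used to pass the file path positionally would now bind it to metrics; keyword arguments avoid the trap. A sketch (the DeepevalEvaluator name and constructor are assumptions; AnswerRelevancyMetric is a stock deepeval metric):

    from deepeval.metrics import AnswerRelevancyMetric

    my_agent = ...  # stand-in for a real agent instance
    evaluator = DeepevalEvaluator(agent=my_agent)  # assumed name and constructor

    # Inside an async context; keywords keep the call unambiguous:
    await evaluator.evaluate(
        metrics=[AnswerRelevancyMetric(threshold=0.8)],
        eval_set_file_path="tests/cases.evalset.json",
    )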
