3 changes: 3 additions & 0 deletions README.md
@@ -110,6 +110,9 @@ core:
# 50 is OK on a typical laptop. Check your Judge-LLM service for max requests per minute
max_threads: 50

# If false, don't fail on invalid conversations (e.g., missing context for some metrics)
fail_on_invalid_data: true

# Judge-LLM Configuration
llm:
provider: openai # openai, watsonx, azure, gemini etc.
Expand Down
3 changes: 2 additions & 1 deletion config/system.yaml
@@ -3,6 +3,7 @@
# Core evaluation parameters
core:
max_threads: 50 # Maximum number of threads, set to null for Python default. 50 is OK for bigger datasets
fail_on_invalid_data: true # If false, don't fail on invalid conversations (e.g., missing context for some metrics)

# LLM as a judge configuration
llm:
@@ -28,7 +29,7 @@ embedding:
# To get real time data. Currently it supports lightspeed-stack API.
# But can be easily integrated with other APIs with minimal change.
api:
enabled: true # Enable API calls instead of using pre-filled data
enabled: true # Enable API calls instead of using pre-filled data
api_base: http://localhost:8080 # Base API URL
endpoint_type: streaming # Use "streaming" or "query" endpoint
timeout: 300 # API request timeout in seconds
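For reference, a minimal sketch of reading the new flag straight from config/system.yaml — it assumes PyYAML is available; the repo itself parses these values into the Pydantic SystemConfig model shown further down:

# Hedged sketch: read core.fail_on_invalid_data from system.yaml.
# The real loader builds a SystemConfig model rather than doing raw dict access.
import yaml

with open("config/system.yaml", encoding="utf-8") as f:
    raw = yaml.safe_load(f)

fail_on_invalid_data = raw.get("core", {}).get("fail_on_invalid_data", True)
print(f"fail_on_invalid_data: {fail_on_invalid_data}")  # True unless overridden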
22 changes: 22 additions & 0 deletions src/lightspeed_evaluation/core/models/data.py
@@ -85,6 +85,17 @@ class TurnData(BaseModel):
default=None, description="Path to verify script for script-based evaluation"
)

# Turn metrics that failed validation; they are skipped during evaluation
_invalid_metrics: set[str] = set()

def add_invalid_metric(self, metric: str) -> None:
"""Add metric to the invalid turn metrics."""
self._invalid_metrics.add(metric)

def is_metric_invalid(self, metric: str) -> bool:
"""Returns True if the metric didn't pass the validation."""
return metric in self._invalid_metrics

@field_validator("turn_metrics")
@classmethod
def validate_turn_metrics(cls, v: Optional[list[str]]) -> Optional[list[str]]:
@@ -334,6 +345,17 @@ class EvaluationData(BaseModel):
description="Path to cleanup script to run after conversation ends",
)

# Conversation metrics that failed validation; they are skipped during evaluation
_invalid_metrics: set[str] = set()

def add_invalid_metric(self, metric: str) -> None:
"""Add metric to the invalid turn metrics."""
self._invalid_metrics.add(metric)

def is_metric_invalid(self, metric: str) -> bool:
"""Returns True if the metric didn't pass the validation."""
return metric in self._invalid_metrics

@field_validator("conversation_metrics")
@classmethod
def validate_conversation_metrics(
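A usage sketch of the invalid-metric tracking added above — the model here is a made-up stand-in with only the relevant pieces, and it assumes Pydantic v2 semantics, where an underscore-prefixed attribute becomes a per-instance private attribute:

from pydantic import BaseModel

class TurnDataSketch(BaseModel):  # hypothetical stand-in for TurnData
    turn_id: str
    _invalid_metrics: set[str] = set()  # private attr; per-instance in Pydantic v2

    def add_invalid_metric(self, metric: str) -> None:
        self._invalid_metrics.add(metric)

    def is_metric_invalid(self, metric: str) -> bool:
        return metric in self._invalid_metrics

turn = TurnDataSketch(turn_id="turn_1")
turn.add_invalid_metric("no_such_metric")
assert turn.is_metric_invalid("no_such_metric")
assert not turn.is_metric_invalid("some_valid_metric")

Keeping the set private means it stays out of serialization and the model schema, so flagged metrics travel with the instance without changing the data contract.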
4 changes: 4 additions & 0 deletions src/lightspeed_evaluation/core/models/system.py
@@ -262,6 +262,10 @@ class CoreConfig(BaseModel):
description="Maximum threads for multithreading eval",
gt=0,
)
fail_on_invalid_data: bool = Field(
default=True,
description="If False don't fail on invalid conversations",
)


class SystemConfig(BaseModel):
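How the new CoreConfig field behaves under Pydantic validation, condensed to a sketch (CoreConfigSketch is illustrative and trimmed to the two fields shown above):

from pydantic import BaseModel, Field

class CoreConfigSketch(BaseModel):  # hypothetical, trimmed version of CoreConfig
    max_threads: int = Field(default=50, gt=0)
    fail_on_invalid_data: bool = Field(
        default=True,
        description="If False, don't fail on invalid conversations",
    )

assert CoreConfigSketch().fail_on_invalid_data is True  # default keeps old behavior
cfg = CoreConfigSketch.model_validate({"fail_on_invalid_data": False})
assert cfg.fail_on_invalid_data is False  # opt-in lenient mode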
20 changes: 18 additions & 2 deletions src/lightspeed_evaluation/core/system/validator.py
@@ -79,12 +79,15 @@ def format_pydantic_error(error: ValidationError) -> str:
class DataValidator:
"""Data validator for evaluation data."""

def __init__(self, api_enabled: bool = False) -> None:
def __init__(
self, api_enabled: bool = False, fail_on_invalid_data: bool = True
) -> None:
"""Initialize validator."""
self.validation_errors: list[str] = []
self.evaluation_data: Optional[list[EvaluationData]] = None
self.api_enabled = api_enabled
self.original_data_path: Optional[str] = None
self.fail_on_invalid_data = fail_on_invalid_data

def load_evaluation_data(self, data_path: str) -> list[EvaluationData]:
"""Load and validate evaluation data from YAML file."""
@@ -152,7 +155,12 @@ def validate_evaluation_data(self, evaluation_data: list[EvaluationData]) -> bool:
print("❌ Validation Errors:")
for error in self.validation_errors:
print(f" • {error}")
return False

if self.fail_on_invalid_data:
return False

print("❌ Validation Errors!, ignoring as instructed")
return True

validation_msg = "✅ All data validation passed"
if self.api_enabled:
@@ -169,6 +177,7 @@ def _validate_metrics_availability(self, data: EvaluationData) -> None:
if turn_data.turn_metrics:
for metric in turn_data.turn_metrics:
if metric not in TURN_LEVEL_METRICS:
turn_data.add_invalid_metric(metric)
self.validation_errors.append(
f"Conversation {conversation_id}, Turn {turn_data.turn_id}: "
f"Unknown turn metric '{metric}'"
@@ -178,6 +187,7 @@ def _validate_metric_requirements(self, data: EvaluationData) -> None:
if data.conversation_metrics:
for metric in data.conversation_metrics:
if metric not in CONVERSATION_LEVEL_METRICS:
data.add_invalid_metric(metric)
self.validation_errors.append(
f"Conversation {conversation_id}: Unknown conversation metric '{metric}'"
)
@@ -188,6 +198,10 @@ def _validate_metric_requirements(self, data: EvaluationData) -> None:

field_errors = self._check_metric_requirements(data, self.api_enabled)

# No errors
if not field_errors:
return

# Add conversation group ID prefix to errors
for error in field_errors:
self.validation_errors.append(
@@ -237,6 +251,8 @@ def _check_metric_requirements(
or (isinstance(field_value, str) and not field_value.strip())
or (isinstance(field_value, list) and not field_value)
):
turn_data.add_invalid_metric(metric)

api_context = (
" when API is disabled"
if field_name in api_populated_fields and not api_enabled
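Condensed, the validator's new control flow looks roughly like this — a sketch, not the full method; the real code also prints API context and a success message:

# Hedged sketch: errors are always printed; the flag only decides pass/fail.
def validate_sketch(validation_errors: list[str], fail_on_invalid_data: bool) -> bool:
    if validation_errors:
        for error in validation_errors:
            print(f" • {error}")
        if fail_on_invalid_data:
            return False  # default: abort the run
        return True  # lenient: continue, with offending metrics flagged
    return True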
5 changes: 4 additions & 1 deletion src/lightspeed_evaluation/pipeline/evaluation/pipeline.py
@@ -62,7 +62,10 @@ def _initialize_components(self) -> None:
raise ValueError(
"SystemConfig must be loaded before initializing components"
)
self.data_validator = DataValidator(api_enabled=config.api.enabled)
self.data_validator = DataValidator(
api_enabled=config.api.enabled,
fail_on_invalid_data=config.core.fail_on_invalid_data,
)

# Metric manager
metric_manager = MetricManager(config)
32 changes: 32 additions & 0 deletions src/lightspeed_evaluation/pipeline/evaluation/processor.py
@@ -189,6 +189,22 @@ def _evaluate_turn(
results = []

for metric_identifier in turn_metrics:
if turn_data.is_metric_invalid(metric_identifier):
# Metric failed validation; emit an ERROR result instead of evaluating it
error_reason = f"Invalid turn metric '{metric_identifier}'; see validation errors"
logger.error(error_reason)

error_result = EvaluationResult( # pylint: disable=duplicate-code
conversation_group_id=conv_data.conversation_group_id,
turn_id=turn_data.turn_id,
metric_identifier=metric_identifier,
result="ERROR",
reason=error_reason,
query=turn_data.query,
)
results.append(error_result)
continue

request = EvaluationRequest.for_turn(
conv_data, metric_identifier, turn_idx, turn_data
)
@@ -204,6 +220,22 @@ def _evaluate_conversation(
results = []

for metric_identifier in conversation_metrics:
if conv_data.is_metric_invalid(metric_identifier):
# Metric failed validation; emit an ERROR result instead of evaluating it
error_reason = (
f"Invalid conversation metric '{metric_identifier}'; see validation errors"
)
logger.error(error_reason)

error_result = EvaluationResult(
conversation_group_id=conv_data.conversation_group_id,
metric_identifier=metric_identifier,
result="ERROR",
reason=error_reason,
)
results.append(error_result)
continue

request = EvaluationRequest.for_conversation(conv_data, metric_identifier)
result = self.components.metrics_evaluator.evaluate_metric(request)
if result:
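End to end, the skip path reduces to the sketch below (names are hypothetical; evaluate stands in for metrics_evaluator.evaluate_metric). A metric flagged during validation short-circuits to an ERROR result and never reaches the judge LLM:

from typing import Any, Callable

def evaluate_turn_sketch(
    turn: Any, metrics: list[str], evaluate: Callable[[Any, str], dict]
) -> list[dict]:
    """Hedged sketch of the per-metric skip logic in _evaluate_turn."""
    results = []
    for metric in metrics:
        if turn.is_metric_invalid(metric):  # flagged earlier by DataValidator
            results.append({
                "metric": metric,
                "result": "ERROR",
                "reason": f"Invalid turn metric '{metric}'",
            })
            continue  # invalid metrics never reach the judge LLM
        results.append(evaluate(turn, metric))
    return results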
5 changes: 4 additions & 1 deletion src/lightspeed_evaluation/runner/evaluation.py
@@ -48,7 +48,10 @@ def run_evaluation( # pylint: disable=too-many-locals
output_config = system_config.output

# Step 2: Load and validate evaluation data
data_validator = DataValidator(api_enabled=system_config.api.enabled)
data_validator = DataValidator(
api_enabled=system_config.api.enabled,
fail_on_invalid_data=system_config.core.fail_on_invalid_data,
)
evaluation_data = data_validator.load_evaluation_data(evaluation_data_path)

print(f"✅ System config: {llm_config.provider}/{llm_config.model}")