README.md (9 additions, 0 deletions)
@@ -85,6 +85,7 @@ lightspeed-eval --system-config config/system_api_disabled.yaml --eval-data conf
- **Custom**
- Response Evaluation
- [`answer_correctness`](src/lightspeed_evaluation/core/metrics/custom.py)
- [`intent_eval`](src/lightspeed_evaluation/core/metrics/custom.py) - Evaluates whether the response demonstrates the expected intent or purpose
- Tool Evaluation
- [`tool_eval`](src/lightspeed_evaluation/core/metrics/custom.py) - Validates tool calls and arguments with regex pattern matching
- **Script-based**
@@ -136,6 +137,10 @@ metrics_metadata:
description: "How faithful the response is to the provided context"
default: false # Only used when explicitly specified

"custom:intent_eval":
threshold: 1 # Binary evaluation (0 or 1)
description: "Intent alignment evaluation using custom LLM evaluation"

"custom:tool_eval":
description: "Tool call evaluation comparing expected vs actual tool calls (regex for arguments)"

@@ -216,11 +221,13 @@ embedding:
- OpenShift Virtualization is an extension of the OpenShift ...
attachments: [] # Attachments (Optional)
expected_response: OpenShift Virtualization is an extension of the OpenShift Container Platform that allows running virtual machines alongside containers
expected_intent: "explain a concept" # Expected intent for intent evaluation

# Per-turn metrics (overrides system defaults)
turn_metrics:
- "ragas:faithfulness"
- "custom:answer_correctness"
- "custom:intent_eval"

# Per-turn metric configuration
turn_metrics_metadata:
@@ -277,6 +284,7 @@ embedding:
| `contexts` | list[string] | 📋 | Context information for evaluation | ✅ (if API enabled) |
| `attachments` | list[string] | ❌ | Attachments | ❌ |
| `expected_response` | string | 📋 | Expected response for comparison | ❌ |
| `expected_intent` | string | 📋 | Expected intent for intent evaluation | ❌ |
| `expected_tool_calls` | list[list[dict]] | 📋 | Expected tool call sequences | ❌ |
| `tool_calls` | list[list[dict]] | ❌ | Actual tool calls from API | ✅ (if API enabled) |
| `verify_script` | string | 📋 | Path to verification script | ❌ |
@@ -287,6 +295,7 @@ embedding:

Examples
> - `expected_response`: Required for `custom:answer_correctness`
> - `expected_intent`: Required for `custom:intent_eval`
> - `expected_tool_calls`: Required for `custom:tool_eval`
> - `verify_script`: Required for `script:action_eval` (used when API is enabled)
> - `response`: Required for most metrics (auto-populated if API enabled)
config/system.yaml (4 additions, 0 deletions)
@@ -70,6 +70,10 @@ metrics_metadata:
threshold: 0.75
description: "Correctness vs expected answer using custom LLM evaluation"

"custom:intent_eval":
threshold: 1 # boolean eval (either 0 or 1)
description: "Intent alignment evaluation using custom LLM evaluation"

"custom:tool_eval":
description: "Tool call evaluation comparing expected vs actual tool calls"

src/lightspeed_evaluation/core/metrics/custom/__init__.py (7 additions, 2 deletions)
@@ -1,11 +1,16 @@
"""Custom metrics components package."""

from lightspeed_evaluation.core.metrics.custom.custom import CustomMetrics
from lightspeed_evaluation.core.metrics.custom.prompts import ANSWER_CORRECTNESS_PROMPT
from lightspeed_evaluation.core.metrics.custom.prompts import (
ANSWER_CORRECTNESS_PROMPT,
INTENT_EVALUATION_PROMPT,
)
from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls

__all__ = [
"CustomMetrics",
"ANSWER_CORRECTNESS_PROMPT",
"evaluate_tool_calls",
# Prompts
"ANSWER_CORRECTNESS_PROMPT",
"INTENT_EVALUATION_PROMPT",
]
src/lightspeed_evaluation/core/metrics/custom/custom.py (47 additions, 1 deletion)
@@ -5,7 +5,10 @@

from lightspeed_evaluation.core.llm.custom import BaseCustomLLM
from lightspeed_evaluation.core.llm.manager import LLMManager
from lightspeed_evaluation.core.metrics.custom.prompts import ANSWER_CORRECTNESS_PROMPT
from lightspeed_evaluation.core.metrics.custom.prompts import (
ANSWER_CORRECTNESS_PROMPT,
INTENT_EVALUATION_PROMPT,
)
from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls
from lightspeed_evaluation.core.models import EvaluationScope, TurnData
from lightspeed_evaluation.core.system.exceptions import LLMError
@@ -26,6 +29,7 @@ def __init__(self, llm_manager: LLMManager):

self.supported_metrics = {
"answer_correctness": self._evaluate_answer_correctness,
"intent_eval": self._evaluate_intent,
"tool_eval": self._evaluate_tool_calls,
}

@@ -195,3 +199,45 @@ def _evaluate_tool_calls(
score = 1.0 if success else 0.0

return score, details

def _evaluate_intent(
self,
_conv_data: Any,
_turn_idx: Optional[int],
turn_data: Optional[TurnData],
is_conversation: bool,
) -> tuple[Optional[float], str]:
"""Evaluate intent alignment using custom prompt."""
if is_conversation:
return None, "Intent evaluation is a turn-level metric"

if turn_data is None:
return None, "TurnData is required for intent evaluation"

if not turn_data.expected_intent:
return None, "No expected intent provided for intent evaluation"

query = turn_data.query
response = turn_data.response
expected_intent = turn_data.expected_intent

prompt = INTENT_EVALUATION_PROMPT.format(
query=query,
response=response,
expected_intent=expected_intent,
)

# Make LLM call and parse response
try:
llm_response = self._call_llm(prompt)
score, reason = self._parse_score_response(llm_response)

if score is None:
return (
None,
f"Could not parse score from LLM response: {llm_response[:100]}...",
)

return score, reason
except LLMError as e:
return None, f"Intent evaluation failed: {str(e)}"
src/lightspeed_evaluation/core/metrics/custom/prompts.py (26 additions, 0 deletions)
@@ -1,5 +1,7 @@
"""Prompts for custom metrics evaluation."""

# pylint: disable=line-too-long

# Answer Correctness Evaluation Prompt
ANSWER_CORRECTNESS_PROMPT = """Evaluate the answer correctness of the given response.

@@ -18,3 +20,27 @@
Format your response as:
Score: [your score on a scale of 0.0 to 1.0]
Reason: [your detailed explanation]"""

# Intent Evaluation Prompt
INTENT_EVALUATION_PROMPT = """Evaluate whether the response demonstrates the expected intent or purpose.

Question: {query}
Response: {response}
Expected Intent: {expected_intent}

Consider:
- What is the intent/purpose of the actual response?
- Does the response's intent match the expected intent?
- Is the response trying to achieve what is described in the expected intent?

Examples of intent evaluation:
- If expected intent is "provide instructions", check if the response is instructional
- If expected intent is "explain a concept", check if the response is explanatory
- If expected intent is "refuse or decline", check if the response is declining to help
- If expected intent is "ask for clarification", check if the response is asking questions

Rate the intent alignment and provide your reasoning. Use binary scoring: 1 for match, 0 for no match.

Format your response as:
Score: [0 or 1]
Reason: [your detailed explanation]"""
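To see what the judge model actually receives, the template can be filled in the same way `_evaluate_intent` does. A minimal sketch, assuming the package-level export added in `__init__.py` above; the query and response strings are made-up sample values in the spirit of the README example:

```python
from lightspeed_evaluation.core.metrics.custom import INTENT_EVALUATION_PROMPT

# Hypothetical turn values; only expected_intent mirrors the README sample.
prompt = INTENT_EVALUATION_PROMPT.format(
    query="What is OpenShift Virtualization?",
    response="OpenShift Virtualization is an extension of the OpenShift Container Platform ...",
    expected_intent="explain a concept",
)
print(prompt)  # the filled-in template sent to the judge LLM
```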
src/lightspeed_evaluation/core/models/data.py (3 additions, 0 deletions)
@@ -59,6 +59,9 @@ class TurnData(BaseModel):
expected_tool_calls: Optional[list[list[dict[str, Any]]]] = Field(
default=None, description="Expected tool call sequences"
)
expected_intent: Optional[str] = Field(
default=None, min_length=1, description="Expected intent for intent evaluation"
)
conversation_id: Optional[str] = Field(
default=None, description="Conversation ID - populated by API if enabled"
)
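The new field is optional but, when supplied, must be a non-empty string. A minimal sketch of that behavior using a stand-in Pydantic model that mirrors only this field (illustrative, not the real `TurnData`, whose other fields are omitted here):

```python
from typing import Optional

from pydantic import BaseModel, Field, ValidationError


class TurnDataSketch(BaseModel):
    """Stand-in model mirroring only the new expected_intent field."""

    expected_intent: Optional[str] = Field(
        default=None, min_length=1, description="Expected intent for intent evaluation"
    )


TurnDataSketch()                                     # ok: defaults to None
TurnDataSketch(expected_intent="explain a concept")  # ok
try:
    TurnDataSketch(expected_intent="")               # rejected by min_length=1
except ValidationError as exc:
    print(exc)
```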
src/lightspeed_evaluation/core/system/validator.py (4 additions, 0 deletions)
@@ -44,6 +44,10 @@
"required_fields": ["response", "expected_response"],
"description": "requires 'response' and 'expected_response' fields",
},
"custom:intent_eval": {
"required_fields": ["response", "expected_intent"],
"description": "requires 'response' and 'expected_intent' fields",
},
"custom:tool_eval": {
"required_fields": ["tool_calls", "expected_tool_calls"],
"description": (
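For context, a requirements mapping like this is typically consumed by checking each metric's required fields before the metric runs. A rough sketch of that idea (illustrative only, not the validator's actual logic; the names below are invented):

```python
from typing import Any

REQUIREMENTS: dict[str, dict[str, Any]] = {
    "custom:intent_eval": {
        "required_fields": ["response", "expected_intent"],
        "description": "requires 'response' and 'expected_intent' fields",
    },
}


def missing_fields(metric: str, turn: dict[str, Any]) -> list[str]:
    """Return the required fields that are absent or empty for the given metric."""
    required = REQUIREMENTS.get(metric, {}).get("required_fields", [])
    return [field for field in required if not turn.get(field)]


print(missing_fields("custom:intent_eval", {"response": "...", "expected_intent": ""}))
# -> ['expected_intent']
```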