README.md (9 additions, 0 deletions)
@@ -85,6 +85,7 @@ lightspeed-eval --system-config config/system_api_disabled.yaml --eval-data conf
- **Custom**
- Response Evaluation
- [`answer_correctness`](src/lightspeed_evaluation/core/metrics/custom.py)
- [`intent_eval`](src/lightspeed_evaluation/core/metrics/custom.py) - Evaluates whether the response demonstrates the expected intent or purpose
- Tool Evaluation
- [`tool_eval`](src/lightspeed_evaluation/core/metrics/custom.py) - Validates tool calls and arguments with regex pattern matching
- **Script-based**
@@ -136,6 +137,10 @@ metrics_metadata:
description: "How faithful the response is to the provided context"
default: false # Only used when explicitly specified

"custom:intent_eval":
threshold: 1 # Binary evaluation (0 or 1)
description: "Intent alignment evaluation using custom LLM evaluation"

"custom:tool_eval":
description: "Tool call evaluation comparing expected vs actual tool calls (regex for arguments)"

@@ -216,11 +221,13 @@ embedding:
- OpenShift Virtualization is an extension of the OpenShift ...
attachments: [] # Attachments (Optional)
expected_response: OpenShift Virtualization is an extension of the OpenShift Container Platform that allows running virtual machines alongside containers
expected_intent: "explain a concept" # Expected intent for intent evaluation

# Per-turn metrics (overrides system defaults)
turn_metrics:
- "ragas:faithfulness"
- "custom:answer_correctness"
- "custom:intent_eval"

# Per-turn metric configuration
turn_metrics_metadata:
@@ -277,6 +284,7 @@ embedding:
| `contexts` | list[string] | 📋 | Context information for evaluation | ✅ (if API enabled) |
| `attachments` | list[string] | ❌ | Attachments | ❌ |
| `expected_response` | string | 📋 | Expected response for comparison | ❌ |
| `expected_intent` | string | 📋 | Expected intent for intent evaluation | ❌ |
| `expected_tool_calls` | list[list[dict]] | 📋 | Expected tool call sequences | ❌ |
| `tool_calls` | list[list[dict]] | ❌ | Actual tool calls from API | ✅ (if API enabled) |
| `verify_script` | string | 📋 | Path to verification script | ❌ |
@@ -287,6 +295,7 @@ embedding:

Examples
> - `expected_response`: Required for `custom:answer_correctness`
> - `expected_intent`: Required for `custom:intent_eval`
> - `expected_tool_calls`: Required for `custom:tool_eval`
> - `verify_script`: Required for `script:action_eval` (used when API is enabled)
> - `response`: Required for most metrics (auto-populated if API enabled)
config/system.yaml (4 additions, 0 deletions)
@@ -70,6 +70,10 @@ metrics_metadata:
threshold: 0.75
description: "Correctness vs expected answer using custom LLM evaluation"

"custom:intent_eval":
threshold: 1 # boolean eval (either 0 or 1)
description: "Intent alignment evaluation using custom LLM evaluation"

"custom:tool_eval":
description: "Tool call evaluation comparing expected vs actual tool calls"

src/lightspeed_evaluation/core/metrics/custom/__init__.py (7 additions, 2 deletions)
@@ -1,11 +1,16 @@
"""Custom metrics components package."""

from lightspeed_evaluation.core.metrics.custom.custom import CustomMetrics
from lightspeed_evaluation.core.metrics.custom.prompts import ANSWER_CORRECTNESS_PROMPT
from lightspeed_evaluation.core.metrics.custom.prompts import (
ANSWER_CORRECTNESS_PROMPT,
INTENT_EVALUATION_PROMPT,
)
from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls

__all__ = [
"CustomMetrics",
"ANSWER_CORRECTNESS_PROMPT",
"evaluate_tool_calls",
# Prompts
"ANSWER_CORRECTNESS_PROMPT",
"INTENT_EVALUATION_PROMPT",
]
src/lightspeed_evaluation/core/metrics/custom/custom.py (47 additions, 1 deletion)
@@ -5,7 +5,10 @@

from lightspeed_evaluation.core.llm.custom import BaseCustomLLM
from lightspeed_evaluation.core.llm.manager import LLMManager
from lightspeed_evaluation.core.metrics.custom.prompts import ANSWER_CORRECTNESS_PROMPT
from lightspeed_evaluation.core.metrics.custom.prompts import (
ANSWER_CORRECTNESS_PROMPT,
INTENT_EVALUATION_PROMPT,
)
from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls
from lightspeed_evaluation.core.models import EvaluationScope, TurnData
from lightspeed_evaluation.core.system.exceptions import LLMError
@@ -26,6 +29,7 @@ def __init__(self, llm_manager: LLMManager):

self.supported_metrics = {
"answer_correctness": self._evaluate_answer_correctness,
"intent_eval": self._evaluate_intent,
"tool_eval": self._evaluate_tool_calls,
}

@@ -195,3 +199,45 @@ def _evaluate_tool_calls(
score = 1.0 if success else 0.0

return score, details

def _evaluate_intent(
self,
_conv_data: Any,
_turn_idx: Optional[int],
turn_data: Optional[TurnData],
is_conversation: bool,
) -> tuple[Optional[float], str]:
"""Evaluate intent alignment using custom prompt."""
if is_conversation:
return None, "Intent evaluation is a turn-level metric"

if turn_data is None:
return None, "TurnData is required for intent evaluation"

if not turn_data.expected_intent:
return None, "No expected intent provided for intent evaluation"

query = turn_data.query
response = turn_data.response
expected_intent = turn_data.expected_intent

prompt = INTENT_EVALUATION_PROMPT.format(
query=query,
response=response,
expected_intent=expected_intent,
)

# Make LLM call and parse response
try:
llm_response = self._call_llm(prompt)
score, reason = self._parse_score_response(llm_response)

if score is None:
return (
None,
f"Could not parse score from LLM response: {llm_response[:100]}...",
)

return score, reason
except LLMError as e:
return None, f"Intent evaluation failed: {str(e)}"
src/lightspeed_evaluation/core/metrics/custom/prompts.py (26 additions, 0 deletions)
@@ -1,5 +1,7 @@
"""Prompts for custom metrics evaluation."""

# pylint: disable=line-too-long

# Answer Correctness Evaluation Prompt
ANSWER_CORRECTNESS_PROMPT = """Evaluate the answer correctness of the given response.

@@ -18,3 +20,27 @@
Format your response as:
Score: [your score on a scale of 0.0 to 1.0]
Reason: [your detailed explanation]"""

# Intent Evaluation Prompt
INTENT_EVALUATION_PROMPT = """Evaluate whether the response demonstrates the expected intent or purpose.

Question: {query}
Response: {response}
Expected Intent: {expected_intent}

Consider:
- What is the intent/purpose of the actual response?
- Does the response's intent match the expected intent?
- Is the response trying to achieve what is described in the expected intent?

Examples of intent evaluation:
- If expected intent is "provide instructions", check if the response is instructional
- If expected intent is "explain a concept", check if the response is explanatory
- If expected intent is "refuse or decline", check if the response is declining to help
- If expected intent is "ask for clarification", check if the response is asking questions

Rate the intent alignment and provide your reasoning. Use binary scoring: 1 for match, 0 for no match.

Format your response as:
Score: [0 or 1]
Reason: [your detailed explanation]"""
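To see what the judge model actually receives, the template can be filled in the same way `_evaluate_intent` does. A minimal sketch, assuming the package-level export added in `__init__.py` above; the query and response strings are made-up sample values in the spirit of the README example:

```python
from lightspeed_evaluation.core.metrics.custom import INTENT_EVALUATION_PROMPT

# Hypothetical turn values; only expected_intent mirrors the README sample.
prompt = INTENT_EVALUATION_PROMPT.format(
    query="What is OpenShift Virtualization?",
    response="OpenShift Virtualization is an extension of the OpenShift Container Platform ...",
    expected_intent="explain a concept",
)
print(prompt)  # the filled-in template sent to the judge LLM
```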
src/lightspeed_evaluation/core/models/data.py (3 additions, 0 deletions)
@@ -59,6 +59,9 @@ class TurnData(BaseModel):
expected_tool_calls: Optional[list[list[dict[str, Any]]]] = Field(
default=None, description="Expected tool call sequences"
)
expected_intent: Optional[str] = Field(
default=None, min_length=1, description="Expected intent for intent evaluation"
)
conversation_id: Optional[str] = Field(
default=None, description="Conversation ID - populated by API if enabled"
)
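The new field is optional but, when supplied, must be a non-empty string. A minimal sketch of that behavior using a stand-in Pydantic model that mirrors only this field (illustrative, not the real `TurnData`, whose other fields are omitted here):

```python
from typing import Optional

from pydantic import BaseModel, Field, ValidationError


class TurnDataSketch(BaseModel):
    """Stand-in model mirroring only the new expected_intent field."""

    expected_intent: Optional[str] = Field(
        default=None, min_length=1, description="Expected intent for intent evaluation"
    )


TurnDataSketch()                                     # ok: defaults to None
TurnDataSketch(expected_intent="explain a concept")  # ok
try:
    TurnDataSketch(expected_intent="")               # rejected by min_length=1
except ValidationError as exc:
    print(exc)
```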
src/lightspeed_evaluation/core/system/validator.py (4 additions, 0 deletions)
@@ -44,6 +44,10 @@
"required_fields": ["response", "expected_response"],
"description": "requires 'response' and 'expected_response' fields",
},
"custom:intent_eval": {
"required_fields": ["response", "expected_intent"],
"description": "requires 'response' and 'expected_intent' fields",
},
"custom:tool_eval": {
"required_fields": ["tool_calls", "expected_tool_calls"],
"description": (
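For context, a requirements mapping like this is typically consumed by checking each metric's required fields before the metric runs. A rough sketch of that idea (illustrative only, not the validator's actual logic; the names below are invented):

```python
from typing import Any

REQUIREMENTS: dict[str, dict[str, Any]] = {
    "custom:intent_eval": {
        "required_fields": ["response", "expected_intent"],
        "description": "requires 'response' and 'expected_intent' fields",
    },
}


def missing_fields(metric: str, turn: dict[str, Any]) -> list[str]:
    """Return the required fields that are absent or empty for the given metric."""
    required = REQUIREMENTS.get(metric, {}).get("required_fields", [])
    return [field for field in required if not turn.get(field)]


print(missing_fields("custom:intent_eval", {"response": "...", "expected_intent": ""}))
# -> ['expected_intent']
```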