91 changes: 61 additions & 30 deletions lsc_agent_eval/README.md
@@ -6,10 +6,11 @@ A framework for evaluating AI agent performance.

- **Agent Goal Evaluation**: Evaluate whether an agent successfully achieves specified goals
- **Multi-turn Evaluation**: Organize evaluations into conversation groups for multi-turn testing
- **Multi-type Evaluation**: Support for different evaluation types:
- `judge-llm`: LLM-based evaluation using a judge model
- `script`: Script-based evaluation using verification scripts (similar to [k8s-bench](https://github.com/GoogleCloudPlatform/kubectl-ai/tree/main/k8s-bench))
- `sub-string`: Simple substring matching evaluation (ALL keywords must be present in response)
- **Multi-type Evaluation**: Support for multiple evaluation types per query:
- `action_eval`: Script-based evaluation using verification script (similar to [k8s-bench](https://github.com/GoogleCloudPlatform/kubectl-ai/tree/main/k8s-bench))
- `response_eval:sub-string`: Simple substring matching evaluation (ALL keywords must be present in the response; case-insensitive; see the sketch after this list)
- `response_eval:accuracy`: LLM-based evaluation using a judge model. The response is judged accurate or not against the expected response
- `tool_eval`: Tool call evaluation comparing expected vs actual tool calls with arguments
- **Setup/Cleanup Scripts**: Support for running setup and cleanup scripts before/after evaluation
- **Result Tracking**: Result tracking with CSV output and JSON statistics
- **Standalone Package**: Can be installed and used independently of the main lightspeed-core-evaluation package
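
As a rough illustration, `response_eval:sub-string` passes only when every expected keyword appears in the response, ignoring case. A minimal sketch (the function name is illustrative, not part of the package's API):

```python
def substring_eval(response: str, expected_keywords: list[str]) -> bool:
    """Pass only if ALL expected keywords appear in the response (case-insensitive)."""
    response_lower = response.lower()
    return all(keyword.lower() in response_lower for keyword in expected_keywords)


# Both keywords are present, so this check passes:
substring_eval("Yes, the openshift-lightspeed namespace exists.", ["yes", "openshift-lightspeed"])
```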
@@ -22,6 +23,9 @@ A framework for evaluating AI agent performance.
- Python 3.11 or 3.12
- Package manager: `pdm` or `pip`

- The Agent API must be running. Any change to the API response format may impact the evaluation processing logic.
- For the judge model, a model inference server must be up

### Install from Git

```bash
@@ -59,52 +63,76 @@ The evaluation is configured using a YAML file that defines conversations. Each
Each evaluation within a conversation can include:
- `eval_id`: Unique identifier for the evaluation
- `eval_query`: The query/task to send to the agent
- `eval_type`: Type of evaluation (judge-llm, script, sub-string)
- `expected_response`: Expected response (for judge-llm evaluation)
- `expected_keywords`: Keywords to look for (for sub-string evaluation)
- `eval_verify_script`: Verification script (for script evaluation)
- `eval_types`: List of evaluation types to run (action_eval, tool_eval, response_eval:sub-string, response_eval:accuracy)
- `expected_response`: Expected response (for response_eval:accuracy evaluation)
- `expected_keywords`: Keywords to look for (for response_eval:sub-string evaluation)
- `expected_tool_calls`: Expected tool call sequences (list of lists) with tool_name and arguments (for tool_eval)
- `eval_verify_script`: Verification script (for action_eval evaluation)
- `description`: Description of the evaluation (Optional)

Note: `eval_id` values must be unique within a conversation group. Duplicates across conversation groups are allowed (a warning is logged for awareness).

### Example Data Configuration

```yaml
# Multi-turn Conversations
# Multi-turn Conversations with Multiple Evaluation Types
- conversation_group: conv1
description: Basic conversation flow testing cluster operations
conversation:
- eval_id: eval1
eval_query: Hi!
eval_type: judge-llm
eval_types: [response_eval:accuracy]
expected_response: Hello! I'm an AI assistant for the Installer.
description: Initial greeting to start conversation
- eval_id: eval2
eval_query: Get me active clusters
eval_type: judge-llm
eval_types: [response_eval:accuracy, response_eval:sub-string]
expected_response: Active clusters are x1, x2.
description: Request for cluster information
expected_keywords: [clusters, active]
description: Request for cluster information with multiple validations

- conversation_group: conv2
description: Multi-turn conversation with setup/cleanup
description: Multi-turn conversation with setup/cleanup and action evaluation
setup_script: sample_data/script/setup_environment.sh
cleanup_script: sample_data/script/cleanup_environment.sh
conversation:
- eval_id: eval1
eval_query: Hi! Can you help me manage pods?
eval_type: judge-llm
eval_types: [response_eval:accuracy]
expected_response: Hello! I can help you manage pods.
description: Initial greeting
- eval_id: eval2
eval_query: Create a pod named test-pod
eval_type: script
eval_verify_script: sample_data/script/verify_pod.sh
description: Create pod and verify
- eval_id: eval3
eval_query: List all pods
eval_type: sub-string
expected_keywords: ['test-pod']
description: Verify pod is listed
eval_types:
- action_eval
- response_eval:sub-string
eval_verify_script: sample_data/script/verify_pod_creation.sh
expected_keywords:
- pod
- created
- test-pod
description: Pod creation with script verification and keyword matching

# Tool Call Evaluation
- conversation_group: conv_tools
description: Tool call validation
conversation:
- eval_id: eval1
eval_query: List available OpenShift versions
eval_types: [tool_eval]
expected_tool_calls:
- - tool_name: list_versions
arguments: {}
description: Verify correct tool call for listing versions
- eval_id: eval2
eval_query: is there an openshift-lightspeed namespace
eval_types: [tool_eval, response_eval:sub-string]
expected_tool_calls:
- - tool_name: oc_get
arguments:
oc_get_args: [namespaces, openshift-lightspeed]
expected_keywords: ["yes", "openshift-lightspeed"]
description: Tool call with argument validation and response verification

# Single-turn Conversations
- conversation_group: conv3
@@ -113,11 +141,9 @@ Note: `eval_id` can't contain duplicate values within a conversation group. But
cleanup_script: sample_data/script/conv3/cleanup.sh
conversation:
- eval_id: eval1
eval_query: is there a openshift-lightspeed namespace ?
eval_type: sub-string
expected_keywords:
- 'yes'
- 'lightspeed'
eval_query: is there an openshift-lightspeed namespace?
eval_types: [response_eval:sub-string]
expected_keywords: ["yes", lightspeed]
description: Check for openshift-lightspeed namespace after setup
```
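
The `expected_tool_calls` entries above are lists of lists: the outer list orders the steps, and each inner list holds the calls expected in that step, each with a `tool_name` and `arguments`. A minimal sketch of how such a comparison could work, assuming exact, order-sensitive matching (the package's actual matching rules may differ):

```python
def tool_calls_match(expected: list[list[dict]], actual: list[list[dict]]) -> bool:
    """Compare expected vs actual tool-call sequences step by step."""
    if len(expected) != len(actual):
        return False
    for expected_step, actual_step in zip(expected, actual):
        if len(expected_step) != len(actual_step):
            return False
        for exp_call, act_call in zip(expected_step, actual_step):
            if exp_call["tool_name"] != act_call.get("tool_name"):
                return False
            if exp_call.get("arguments", {}) != act_call.get("arguments", {}):
                return False
    return True


# Mirrors the conv_tools example above:
expected = [[{"tool_name": "oc_get",
              "arguments": {"oc_get_args": ["namespaces", "openshift-lightspeed"]}}]]
print(tool_calls_match(expected, expected))  # True
```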

@@ -144,12 +170,15 @@ Expectation is that, either a third-party inference provider access is there or
lsc_agent_eval \
--eval_data_yaml agent_goal_eval.yaml \
--agent_endpoint http://localhost:8080 \
--endpoint_type streaming \
--agent_provider watsonx \
--agent_model ibm/granite-3-2-8b-instruct \
--agent_auth_token_file agent_api_token.txt \
--judge_provider openai \
--judge_model gpt-4o-mini \
--result_dir ./eval_output
```
Pass a token text file or set the `AGENT_API_TOKEN` env var.

```python
from lsc_agent_eval import AgentGoalEval
@@ -159,11 +188,12 @@ class EvalArgs:
def __init__(self):
self.eval_data_yaml = 'data/example_eval.yaml'
self.agent_endpoint = 'http://localhost:8080'
self.endpoint_type = 'query' # Non-streaming
self.agent_provider = 'watsonx'
self.agent_model = 'ibm/granite-3-2-8b-instruct'
self.judge_provider = 'openai'
self.judge_model = 'gpt-4o-mini'
self.agent_auth_token_file = None # Or set `AGENT_API_TOKEN` env var
self.agent_auth_token_file = None # set `AGENT_API_TOKEN` env var
self.result_dir = None

# Run evaluation
@@ -176,6 +206,7 @@ evaluator.run_evaluation()

- `--eval_data_yaml`: Path to the YAML file containing evaluation data
- `--agent_endpoint`: Endpoint URL for the agent API (default: <http://localhost:8080>)
- `--endpoint_type`: Endpoint type to use for agent queries (default: streaming). Options: 'streaming' or 'query'
- `--agent_auth_token_file`: Path to .txt file containing API token (if required). Or set `AGENT_API_TOKEN` env var without using a .txt file
- `--agent_provider`: Provider for the agent API
- `--agent_model`: Model for the agent API
@@ -194,7 +225,7 @@ evaluator.run_evaluation()
- Run all evaluations sequentially:
- For the first evaluation: Send query without conversation ID, receive new conversation ID from API
- For subsequent evaluations: Use the conversation ID from the first evaluation to maintain context
- Execute evaluation based on eval_type (either sub-string, judge-llm or script)
- Execute evaluations based on `eval_types` (any combination of valid evaluation types)
- Run cleanup script (if provided)
3. **Save Results**: Export to CSV and JSON with statistics
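
The conversation-ID handling in step 2 can be pictured with a short sketch (the `send_query` callable is a stand-in for the agent API request, assumed to return the response text and a conversation ID):

```python
from typing import Callable, Optional


def run_conversation(
    send_query: Callable[[str, Optional[str]], tuple[str, str]],
    eval_queries: list[str],
) -> list[str]:
    """Run all queries of one conversation group against the agent."""
    conversation_id: Optional[str] = None
    responses = []
    for query in eval_queries:
        # First query: no conversation ID, the API returns a new one.
        # Later queries: reuse that ID so the agent keeps the conversation context.
        response, conversation_id = send_query(query, conversation_id)
        responses.append(response)
    return responses
```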

@@ -237,7 +268,7 @@ Contains detailed results with columns:
Result statistics:
- **Overall Summary**: Total evaluations, pass/fail/error counts, success rate
- **By Conversation**: Breakdown of results for each conversation group
- **By Evaluation Type**: Performance metrics for each evaluation type (judge-llm, script, sub-string)
- **By Evaluation Type**: Performance metrics for each evaluation type (action_eval, response_eval:sub-string, response_eval:accuracy, tool_eval)
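
The per-type breakdown can be computed with a simple aggregation; a sketch assuming each result row records its `eval_type` and a `result` of pass/fail/error (the real column names may differ):

```python
from collections import Counter


def summarize_by_eval_type(results: list[dict]) -> dict[str, dict[str, int]]:
    """Count pass/fail/error results per evaluation type."""
    summary: dict[str, Counter] = {}
    for row in results:
        summary.setdefault(row["eval_type"], Counter())[row["result"]] += 1
    return {eval_type: dict(counts) for eval_type, counts in summary.items()}
```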

## Development

103 changes: 91 additions & 12 deletions lsc_agent_eval/sample_data/agent_goal_eval_example.yaml
@@ -1,11 +1,13 @@
# Agent evaluation sample data (Only for reference)

- conversation_group: conv1
description: Test namespace detection using substring matching
conversation:
- eval_id: eval1
eval_query: is there a openshift-monitoring namespace ?
eval_type: sub-string
eval_types: [response_eval:sub-string]
expected_keywords:
- 'yes'
- "yes"
- openshift-monitoring
description: Check for openshift-monitoring namespace existence

@@ -14,7 +16,7 @@
conversation:
- eval_id: eval1
eval_query: is there a openshift-monitoring namespace ?
eval_type: judge-llm
eval_types: [response_eval:accuracy]
expected_response: there is a openshift-monitoring namespace.
description: Verify openshift-monitoring namespace with LLM evaluation

@@ -24,10 +26,9 @@
cleanup_script: sample_data/script/conv3/cleanup.sh
conversation:
- eval_id: eval1
eval_query: is there a openshift-lightspeed namespace ?
eval_type: sub-string
expected_keywords:
- 'yes'
eval_query: is there an openshift-lightspeed namespace?
eval_types: [response_eval:sub-string]
expected_keywords: ["yes"]
description: Check for openshift-lightspeed namespace after setup

- conversation_group: conv4
@@ -37,32 +38,110 @@
conversation:
- eval_id: eval1
eval_query: create a namespace called openshift-lightspeed
eval_type: script
eval_types: [action_eval]
eval_verify_script: sample_data/script/conv4/eval1/verify.sh
description: Create namespace and verify with script

- conversation_group: conv5
description: Test tool calls with simple structure
conversation:
- eval_id: eval1
eval_query: List available openshift versions
eval_types: [tool_eval]
expected_tool_calls:
- - tool_name: list_versions
arguments: {}
description: Verify correct tool call for listing versions

- conversation_group: conv6
description: Test tool calls with argument
conversation:
- eval_id: eval1
eval_query: does the openshift-lightspeed namespace exist?
eval_types: [tool_eval, response_eval:sub-string]
expected_tool_calls:
- - tool_name: oc_get
arguments:
oc_get_args: [namespaces, openshift-lightspeed]
expected_keywords: [openshift-lightspeed]
description: Tool call with argument

- conversation_group: conv7
description: Test multiple tool call sequences
conversation:
- eval_id: eval1
eval_query: List versions and then create a pod
eval_types: [tool_eval]
expected_tool_calls:
- - tool_name: list_versions
arguments: {}
- - tool_name: create_pod
arguments:
name: my-pod
image: httpd
description: Multiple sequential tool calls

- conversation_group: conv8
description: Multiple evaluations per query - response_eval only
conversation:
- eval_id: multi_response_eval
eval_query: What is Openshift Virtualization?
eval_types:
- response_eval:sub-string
- response_eval:accuracy
expected_keywords:
- deploy
- application
- OpenShift
expected_response: OpenShift Virtualization is an extension of the OpenShift Container Platform
description: Test multiple response evaluations

- conversation_group: conv9
description: Comprehensive evaluation with all types
setup_script: sample_data/script/conv4/setup.sh
cleanup_script: sample_data/script/conv4/cleanup.sh
conversation:
- eval_id: comprehensive_eval
eval_query: create openshift-lightspeed namespace
eval_types:
- tool_eval
- action_eval
- response_eval:sub-string
- response_eval:accuracy
expected_tool_calls:
- - tool_name: oc_create
arguments:
kind: namespace
name: openshift-lightspeed
eval_verify_script: sample_data/script/conv4/eval1/verify.sh
expected_keywords:
- created
- openshift-lightspeed
expected_response: openshift-lightspeed namespace is created successfully
description: Comprehensive evaluation using all evaluation types

- conversation_group: conv10
description: Test conversation retention - multi turn success
conversation:
- eval_id: eval1
eval_query: what is openshift virtualization ?
eval_type: sub-string
eval_types: [response_eval:sub-string]
expected_keywords:
- virtualization
description: Test first conversation
- eval_id: eval2
eval_query: what was my previous query ?
eval_type: sub-string
eval_types: [response_eval:sub-string]
expected_keywords:
- virtualization
description: Test second conversation

- conversation_group: conv6
- conversation_group: conv11
description: Test conversation retention - new conversation
conversation:
- eval_id: eval1
eval_query: what was my previous query ?
eval_type: sub-string
eval_types: [response_eval:sub-string]
expected_keywords:
- virtualization
description: new conversation (failure)
7 changes: 7 additions & 0 deletions lsc_agent_eval/src/lsc_agent_eval/agent_eval.py
@@ -35,6 +35,13 @@ def _args_parser(args: list[str]) -> argparse.Namespace:
help="Agent API endpoint URL",
)

parser.add_argument(
"--endpoint_type",
choices=["streaming", "query"],
default="streaming",
help="Endpoint type to use for agent queries (default: streaming)",
)

parser.add_argument(
"--agent_provider", type=str, required=True, help="Agent provider name"
)