Commit cbaae91

Query Embedding for Tool Rag

1 parent 4c12645
9 files changed: +169 −25 lines changed

evaluator/algorithms/tool_rag_algorithm.py

Lines changed: 42 additions & 4 deletions
@@ -84,6 +84,8 @@ class ToolRagAlgorithm(Algorithm):
     - max_document_size: the maximal size, in characters, of a single indexed document, or None to disable the size limit.
     - indexed_tool_def_parts: the parts of the MCP tool definition to be used for index construction, such as 'name',
       'description', 'args', etc.
+      You can also include 'additional_queries' to append example queries for each tool if provided
+      via the 'additional_queries' setting (see defaults below).
     - hybrid_mode: True to enable hybrid (sparse + dense) search and False to only enable dense search.
     - analyzer_params: parameters for the Milvus BM25 analyzer.
     - fusion_type: the algorithm for combining the dense and the sparse scores if hybrid mode is activated. Milvus only
@@ -128,7 +130,8 @@ def get_default_settings(self) -> Dict[str, Any]:
             "embedding_model_id": "all-MiniLM-L6-v2",
             "similarity_metric": "COSINE",
             "index_type": "FLAT",
-            "indexed_tool_def_parts": ["name", "description"],
+            "indexed_tool_def_parts": ["name", "description", "additional_queries"],
+
 
             # preprocessing
             "text_preprocessing_operations": None,
@@ -232,6 +235,14 @@ def _compose_tool_text(self, tool: BaseTool) -> str:
                 tags = tool.tags or []
                 if tags:
                     segments.append(f"tags: {' '.join(tags)}")
+            elif p.lower() == "additional_queries":
+                # Append example queries supplied via settings["additional_queries"][tool.name]
+                examples_map = self._settings.get("additional_queries") or {}
+                examples_list = examples_map.get(tool.name) or []
+                if examples_list:
+                    rendered = self._render_examples(examples_list)
+                    if rendered:
+                        segments.append(f"ex: {rendered}")
 
         if not segments:
             raise ValueError(f"The following tool contains none of the fields listed in indexed_tool_def_parts:\n{tool}")
@@ -249,7 +260,7 @@ def _create_docs_from_tools(self, tools: List[BaseTool]) -> List[Document]:
             documents.append(Document(page_content=page_content, metadata={"name": tool.name}))
         return documents
 
-    def _index_tools(self, tools: List[BaseTool]) -> None:
+    def _index_tools(self, tools: List[BaseTool], queries: List[QuerySpecification]) -> None:
         self.tool_name_to_base_tool = {tool.name: tool for tool in tools}
 
         self.embeddings = HuggingFaceEmbeddings(model_name=self._settings["embedding_model_id"])
@@ -308,7 +319,7 @@ def _index_tools(self, tools: List[BaseTool]) -> None:
             search_params=search_params,
         )
 
-    def set_up(self, model: BaseChatModel, tools: List[BaseTool]) -> None:
+    def set_up(self, model: BaseChatModel, tools: List[BaseTool], queries: List[QuerySpecification]) -> None:
         super().set_up(model, tools)
 
         if self._settings["cross_encoder_model_name"]:
@@ -320,7 +331,34 @@ def set_up(self, model: BaseChatModel, tools: List[BaseTool]) -> None:
         if self._settings["enable_query_decomposition"] or self._settings["enable_query_rewriting"]:
             self.query_rewriting_model = self._get_llm(self._settings["query_rewriting_model_id"])
 
-        self._index_tools(tools)
+        # Build the additional_queries mapping from the provided QuerySpecifications so YAML is not required.
+        try:
+            tool_examples: Dict[str, List[str]] = {}
+            for spec in (queries or []):
+                add_q = getattr(spec, "additional_queries", None) or {}
+                # Flatten a wrapper {"additional_queries": {...}} if present
+                if isinstance(add_q, dict) and "additional_queries" in add_q and len(add_q) == 1:
+                    add_q = add_q["additional_queries"]
+                for tool_name, qmap in add_q.items():
+                    if isinstance(qmap, dict):
+                        for qtext in qmap.values():
+                            if isinstance(qtext, str) and qtext.strip():
+                                tool_examples.setdefault(tool_name, []).append(qtext.strip())
+            # Dedupe while preserving order
+            for k, v in list(tool_examples.items()):
+                seen = set()
+                deduped = []
+                for s in v:
+                    if s not in seen:
+                        seen.add(s)
+                        deduped.append(s)
+                tool_examples[k] = deduped
+            if tool_examples:
+                self._settings["additional_queries"] = tool_examples
+        except Exception:
+            pass
+
+        self._index_tools(tools, queries)
 
     def _threshold_results(self, docs_and_scores: List[Tuple[Document, float]]) -> List[Document]:
         """

evaluator/components/data_provider.py

Lines changed: 18 additions & 7 deletions
@@ -27,6 +27,8 @@ class QuerySpecification(BaseModel):
     """
     id: int
     query: str
+    additional_queries: Optional[Dict[str, Any]] = None
+    path: Optional[str] = None
     reference_answer: Optional[str] = None
     golden_tools: ToolSet = Field(default_factory=dict)
     additional_tools: Optional[ToolSet] = None
@@ -313,7 +315,7 @@ def _load_queries_from_single_file(
         root_dataset_path: str or Path,
         experiment_environment: EnvironmentConfig,
         dataset_config: DatasetConfig,
-) -> List[QuerySpecification]:
+) -> Tuple[List[QuerySpecification], List[Dict[str, Any]]]:
     with open(query_file_path, 'r') as f:
         data = json.load(f)
@@ -332,6 +334,13 @@ def _load_queries_from_single_file(
                 log(f"Invalid query spec, skipping this query.")
             else:
                 query = raw_query_spec.get("query")
+                if raw_query_spec.get("additional_queries"):
+                    additional_queries = raw_query_spec.get("additional_queries")
+                    print(f"Additional queries provided: {additional_queries}")
+
+                else:
+                    print(f"No additional queries provided")
+                    additional_queries = None
                 query_id = int(raw_query_spec.get("query_id"))
                 golden_tools, additional_tools = (
                     _parse_raw_query_tool_definitions(raw_query_spec, experiment_environment, dataset_config))
@@ -345,6 +354,8 @@ def _load_queries_from_single_file(
                     QuerySpecification(
                         id=query_id,
                         query=query,
+                        path=str(query_file_path),
+                        additional_queries=additional_queries,
                         reference_answer=reference_answer,
                         golden_tools=golden_tools,
                         additional_tools=additional_tools or None
@@ -362,7 +373,7 @@ def get_queries(
         experiment_environment: EnvironmentConfig,
         dataset_config: DatasetConfig,
         fine_tuning_mode=False
-) -> List[QuerySpecification]:
+) -> Tuple[List[QuerySpecification], List[Dict[str, Any]]]:
    """Load queries from the dataset."""
    root_dataset_path = Path(os.getenv("ROOT_DATASET_PATH"))
    if not root_dataset_path:
@@ -379,14 +390,14 @@ def get_queries(
     queries_num = None if fine_tuning_mode else dataset_config.queries_num
     queries = []
     for path in local_paths:
+        print(f"\n\n")
+        print(f"--------------------------------")
+        print(f"Loading queries from file: {path}")
+        print(f"\n\n")
         remaining_queries_num = None if queries_num is None else queries_num - len(queries)
         if remaining_queries_num == 0:
             break
-        new_queries = _load_queries_from_single_file(path,
-                                                     remaining_queries_num,
-                                                     root_dataset_path,
-                                                     experiment_environment,
-                                                     dataset_config)
+        new_queries = _load_queries_from_single_file(path, remaining_queries_num, root_dataset_path, experiment_environment, dataset_config)
         queries.extend(new_queries)
 
     return queries
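For reference, the shape that _load_queries_from_single_file now reads and that generate_and_save_additional_queries (see evaluator/utils/parsing_tools.py below) writes back is a per-tool dict of numbered paraphrases. A hypothetical raw query spec with that field filled in (the tool name and query texts are illustrative only):

raw_query_spec = {
    "query_id": 17,
    "query": "What is the weather in Paris tomorrow?",
    # Written back by generate_and_save_additional_queries:
    # one dict of numbered paraphrases per golden tool.
    "additional_queries": {
        "get_weather_forecast": {
            "query1": "Give me tomorrow's forecast for Paris.",
            "query2": "Will it rain in Paris tomorrow?",
            "query3": "Paris weather for the next day, please.",
            "query4": "How warm will it get in Paris tomorrow?",
            "query5": "Forecast for Paris, one day ahead."
        }
    }
}

Note also that the return annotations of _load_queries_from_single_file and get_queries now promise Tuple[List[QuerySpecification], List[Dict[str, Any]]], yet both code paths still build and return a plain list (queries.extend(new_queries); return queries), so the annotations do not match the returned value.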

evaluator/components/llm_provider.py

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ def get_llm(model_id: str, model_config: List[ModelConfig], **kwargs) -> BaseChatModel:
 
     log_verbose(f"Connecting to {config.provider_id} server on {config.url} serving {model_id}...")
     stripped_url = str(config.url).strip('/')
+    print(f"\n \n stripped_url: {stripped_url} \n \n")
     if config.provider_id == ProviderId.OLLAMA:
         from langchain_ollama import ChatOllama
         return ChatOllama(model=model_id, base_url=stripped_url, **kwargs)

evaluator/components/mcp_proxy.py

Lines changed: 1 addition & 0 deletions
@@ -110,6 +110,7 @@ def _make_param(entry: dict, required_flag: bool) -> Parameter:
         parameters.append(_make_param(e, required_flag=False))
 
     signature = Signature(parameters)
+    print(f"\n \n doc_lines: {doc_lines} \n \n")
     docstring = "\n".join(doc_lines)
 
     def tool_func(*args, **kwargs):

evaluator/config/yaml/tool_rag_experiments.yaml

Lines changed: 4 additions & 0 deletions
@@ -67,6 +67,10 @@ algorithms:
     module_name: "tool_rag"
     settings:
       indexed_tool_def_parts: ["description"]
+  - label: "Index With Additional Queries"
+    module_name: "tool_rag"
+    settings:
+      indexed_tool_def_parts: ["additional_queries"]
   - label: "Index Tools By Name and Args"
     module_name: "tool_rag"
     settings:
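Note that indexing with indexed_tool_def_parts: ["additional_queries"] alone means a tool with no generated example queries contributes no segments in _compose_tool_text, which raises the ValueError shown in the first file; this experiment configuration therefore presumes that query generation has already populated examples for every tool in the dataset.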

evaluator/evaluator.py

Lines changed: 34 additions & 8 deletions
@@ -3,7 +3,7 @@
 import time
 import traceback
 from typing import List, Tuple
-
+from pathlib import Path
 import openai
 from langgraph.errors import GraphRecursionError
 from pydantic import ValidationError
@@ -17,6 +17,8 @@
 from evaluator.interfaces.algorithm import Algorithm
 from evaluator.utils.csv_logger import CSVLogger
 from evaluator.components.llm_provider import get_llm
+from evaluator.utils.parsing_tools import generate_and_save_additional_queries
+import json as _json
 from dotenv import load_dotenv
 
 from evaluator.utils.tool_logger import ToolLogger
@@ -35,13 +37,13 @@ class Evaluator(object):
 
     config: EvaluationConfig
 
-    def __init__(self, config_path: str | None, use_defaults: bool):
+    def __init__(self, config_path: str | None, use_defaults: bool, test_with_additional_queries: bool = False):
         try:
             self.config = load_config(config_path, use_defaults=use_defaults)
         except ConfigError as ce:
             log(f"Configuration error: {ce}")
             raise SystemExit(2)
-
+        self.test_with_additional_queries = test_with_additional_queries
     async def run(self) -> None:
 
         # Set up the necessary components for the experiments:
@@ -112,15 +114,13 @@ async def _run_experiment(self,
         Runs the specified experiment and returns the number of evaluated queries.
         """
         processed_queries_num = 0
-
         try:
             queries = await self._set_up_experiment(spec, metric_collectors, mcp_proxy_manager)
             algorithm, environment = spec
 
             try:
                 for i, query_spec in enumerate(queries):
                     log(f"Processing query #{query_spec.id} (Experiment {exp_index} of {total_exp_num}, query {i+1} of {len(queries)})...")
-
                     for mc in metric_collectors:
                         mc.prepare_for_measurement(query_spec)
 
@@ -199,22 +199,48 @@ async def _set_up_experiment(self,
         log(f"Initializing LLM connection: {environment.model_id}")
         llm = get_llm(model_id=environment.model_id, model_config=self.config.models)
         log("Connection established successfully.\n")
-
         log("Fetching queries for the current experiment...")
         queries = get_queries(environment, self.config.data)
         log(f"Successfully loaded {len(queries)} queries.\n")
         print_iterable_verbose("The following queries will be executed:\n", queries)
-
+        log(f"Generating additional queries.\n")
+        generate_and_save_additional_queries(llm, queries)
+        queries = get_queries(environment, self.config.data)
         log("Retrieving tool definitions for the current experiment...")
         tool_specs = get_tools_from_queries(queries)
         tools = await mcp_proxy_manager.run_mcp_proxy(tool_specs, init_client=True).get_tools()
         print_iterable_verbose("The following tools will be available during evaluation:\n", tools)
         log(f"The experiment will proceed with {len(tools)} tool(s).\n")
 
         log("Setting up the algorithm and the metric collectors...")
-        algorithm.set_up(llm, tools)
+
+        algorithm.set_up(llm, tools, queries)
         for mc in metric_collectors:
             mc.set_up()
         log("All set!\n")
 
         return queries
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="Run the Evaluator experiments.")
+    parser.add_argument("--config", type=str, default=None, help="Path to evaluation config YAML file")
+    parser.add_argument("--defaults", action="store_true", help="Use default config options if set")
+    parser.add_argument("--test-with-additional-queries", action="store_true", help="Test with additional queries")
+    args = parser.parse_args()
+
+    from evaluator.utils.utils import log
+
+    log("Starting Evaluator main...")
+    evaluator = Evaluator(
+        config_path=args.config,
+        use_defaults=args.defaults,
+        test_with_additional_queries=args.test_with_additional_queries
+    )
+    try:
+        import asyncio
+        asyncio.run(evaluator.run())
+        log("Evaluator finished successfully!")
+    except Exception as e:
+        log(f"Evaluator failed: {e}")
+        raise
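With the new entry point, the evaluator can be launched directly, e.g. python -m evaluator.evaluator --defaults --test-with-additional-queries (assuming the repository layout makes the module runnable this way). Note the generate-then-reload pattern in _set_up_experiment: additional queries are generated and persisted into the dataset files first, and get_queries is then called a second time so that the freshly written additional_queries fields are visible to the indexing path.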

evaluator/metric_collectors/fac_metric_collector.py

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ def __init__(self, settings: Dict, model_config: List[ModelConfig]):
         super().__init__(settings, model_config)
 
         # Metrics storage
-        self.query_results = None
+        self.query_results = []
 
         # judge model configuration
         self.judge_model_url = os.getenv('FAC_JUDGE_MODEL_URL')

evaluator/metric_collectors/tool_selection_metric_collector.py

Lines changed: 8 additions & 5 deletions
@@ -16,10 +16,10 @@ class ToolSelectionMetricCollector(MetricCollector):
     def __init__(self, settings: Dict, model_config: List[ModelConfig]):
         super().__init__(settings, model_config)
 
-        self.total_queries = None
-        self.exact_matches = None
-        self.precision_sum = None
-        self.recall_sum = None
+        self.total_queries = 0
+        self.exact_matches = 0
+        self.precision_sum = 0.0
+        self.recall_sum = 0.0
 
     def get_collected_metrics_names(self) -> List[str]:
         return ["Exact Tool Selection Match Rate",
@@ -96,7 +96,10 @@ def report_results(self) -> Dict[str, Any] or None:
             raise RuntimeError("No measurements registered, cannot produce results.")
 
         results = {
-            "Exact Tool Selection Match Rate": self.exact_matches / self.total_queries,
+            "Exact Tool Selection Match Rate": (
+                (self.exact_matches or 0) / (self.total_queries or 1)
+                if self.total_queries else 0.0
+            ),
             "Tool Selection Precision": self.precision_sum / self.total_queries,
             "Tool Selection Recall": self.recall_sum / self.total_queries,
             "Spurious Tool Calling Rate": 1.0 - (self.precision_sum / self.total_queries),

evaluator/utils/parsing_tools.py

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+from evaluator.components.llm_provider import query_llm
+from pathlib import Path
+import re
+import json
+from evaluator.utils.utils import print_iterable_verbose, log
+
+def generate_and_save_additional_queries(llm, queries):
+    """
+    For each query in queries, use the provided LLM to generate additional_queries if not present,
+    and save them to the appropriate JSON file for that query (matching by query_id).
+    """
+
+    system_prompt = '''You create 5 additional queries for each tool and only return the additional queries information, given the query implemented, return in the following format as a JSON string:
+    {tool_name: {"query1": "", "query2": "", "query3": "", "query4": "", "query5": ""}} '''
+    curr_file = None
+    for i, query_spec in enumerate(queries):
+        # Skip generation if additional_queries is already present or this file was just processed
+        path = Path(query_spec.path)
+        if getattr(query_spec, 'additional_queries', None) or curr_file == path:
+            log(f"Skipping query_id {getattr(query_spec, 'id', '<N/A>')}: additional queries already present or file already processed.")
+            continue
+        user_prompt = f"tool_name = {getattr(query_spec, 'golden_tools', {}).keys()}, Query= {getattr(query_spec, 'query', None)}"
+        result = query_llm(llm, system_prompt, user_prompt)
+        # Parse the LLM response into a dict of additional queries
+        additional = qwen_model_parsing(result)
+        query_spec.additional_queries = additional
+        # Save the additional queries back into the original query JSON file
+        if path and additional is not None:
+            if path.exists():
+                import json as _json
+                with path.open('r', encoding='utf-8') as f:
+                    orig_queries = _json.load(f)
+                for item in orig_queries:
+                    if (
+                        (item.get("query_id") == query_spec.id)
+                        or (str(item.get("query_id")) == str(query_spec.id))
+                    ):
+                        item["additional_queries"] = additional
+                with path.open('w', encoding='utf-8') as f:
+                    _json.dump(orig_queries, f, indent=2, ensure_ascii=False)
+                log(f"Successfully added additional queries to original file {path}")
+        curr_file = path
+
+def qwen_model_parsing(response: str):
+    """
+    Parse the response from the Qwen model and return the additional queries.
+    """
+    # Strip any leading <think>...</think> reasoning block emitted by the model
+    match = re.search(r"</think>\s*(.*)", response, re.DOTALL)
+    response_text = match.group(1).strip() if match else response
+    # Try to parse the remaining text as a JSON dict of additional queries
+    additional = None
+    response_text = response_text.strip()
+    try:
+        additional = json.loads(response_text)
+    except Exception:
+        additional = None
+    return additional
+
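qwen_model_parsing only strips a leading <think>...</think> block; if the model wraps its JSON answer in a markdown code fence, json.loads fails and the function returns None, silently discarding the generated queries. A hedged sketch of a more forgiving extraction step (the fence-stripping behavior is an assumption about the model's output, not something this commit implements):

import json
import re

def extract_json_payload(response_text: str):
    """Parse JSON from an LLM reply, tolerating an optional markdown code fence."""
    # Drop a ```json ... ``` (or bare ```) fence if the model added one.
    fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", response_text, re.DOTALL)
    candidate = fenced.group(1) if fenced else response_text
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        return None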
