     _VALID_TLM_QUALITY_PRESETS_RAG,
 )
 from cleanlab_tlm.internal.exception_handling import handle_tlm_exceptions
-from cleanlab_tlm.internal.validation import tlm_score_process_response_and_kwargs, validate_rag_inputs
+from cleanlab_tlm.internal.validation import (
+    tlm_score_process_response_and_kwargs,
+    validate_rag_inputs,
+)


 if TYPE_CHECKING:
     from collections.abc import Coroutine
@@ -115,8 +118,12 @@ def __init__(
                 name=cast(str, eval_config[_TLM_EVAL_NAME_KEY]),
                 criteria=cast(str, eval_config[_TLM_EVAL_CRITERIA_KEY]),
                 query_identifier=eval_config.get(_TLM_EVAL_QUERY_IDENTIFIER_KEY),
-                context_identifier=eval_config.get(_TLM_EVAL_CONTEXT_IDENTIFIER_KEY),
-                response_identifier=eval_config.get(_TLM_EVAL_RESPONSE_IDENTIFIER_KEY),
+                context_identifier=eval_config.get(
+                    _TLM_EVAL_CONTEXT_IDENTIFIER_KEY
+                ),
+                response_identifier=eval_config.get(
+                    _TLM_EVAL_RESPONSE_IDENTIFIER_KEY
+                ),
             )
             for eval_config in _DEFAULT_EVALS
         ]
@@ -164,10 +171,16 @@ def score(
         )

         # Support constrain_outputs later
-        processed_responses = tlm_score_process_response_and_kwargs(formatted_prompts, response, None, {})
+        processed_responses = tlm_score_process_response_and_kwargs(
+            formatted_prompts, response, None, {}
+        )

         # Check if we're handling a batch or a single item
-        if isinstance(query, str) and isinstance(context, str) and isinstance(processed_responses, dict):
+        if (
+            isinstance(query, str)
+            and isinstance(context, str)
+            and isinstance(processed_responses, dict)
+        ):
             return self._event_loop.run_until_complete(
                 self._score_async(
                     response=processed_responses,
@@ -189,6 +202,74 @@ def score(
             )
         )

+    async def score_async(
+        self,
+        *,
+        response: Union[str, Sequence[str]],
+        query: Union[str, Sequence[str]],
+        context: Union[str, Sequence[str]],
+        prompt: Optional[Union[str, Sequence[str]]] = None,
+        form_prompt: Optional[Callable[[str, str], str]] = None,
+    ) -> Union[TrustworthyRAGScore, list[TrustworthyRAGScore]]:
+        """
+        Evaluate an existing RAG system's response to a given user query and retrieved context.
+
+        Args:
+            response (str | Sequence[str]): A response (or list of multiple responses) from your LLM/RAG system.
+            query (str | Sequence[str]): The user query (or list of multiple queries) that was used to generate the response.
+            context (str | Sequence[str]): The context (or list of multiple contexts) that was retrieved from the RAG Knowledge Base and used to generate the response.
+            prompt (str | Sequence[str], optional): Optional prompt (or list of multiple prompts) representing the actual inputs (combining query, context, and system instructions into one string) to the LLM that generated the response.
+            form_prompt (Callable[[str, str], str], optional): Optional function to format the prompt based on query and context. Cannot be provided together with prompt; provide one or the other.
+                This function should take query and context as parameters and return a formatted prompt string.
+                If not provided, a default prompt formatter will be used.
+                To include a system prompt or any other special instructions for your LLM,
+                incorporate them directly in your custom `form_prompt()` function definition.
+
+        Returns:
+            TrustworthyRAGScore | list[TrustworthyRAGScore]: [TrustworthyRAGScore](#class-trustworthyragscore) object containing evaluation metrics.
+                If multiple inputs were provided in lists, a list of TrustworthyRAGScore objects is returned, one for each set of inputs.
+        """
+        if prompt is None and form_prompt is None:
+            form_prompt = TrustworthyRAG._default_prompt_formatter
+
+        formatted_prompts = validate_rag_inputs(
+            query=query,
+            context=context,
+            response=response,
+            prompt=prompt,
+            form_prompt=form_prompt,
+            evals=self._evals,
+            is_generate=False,
+        )
+
+        # Support constrain_outputs later
+        processed_responses = tlm_score_process_response_and_kwargs(
+            formatted_prompts, response, None, {}
+        )
+
+        # Check if we're handling a batch or a single item
+        if (
+            isinstance(query, str)
+            and isinstance(context, str)
+            and isinstance(processed_responses, dict)
+        ):
+            return await self._score_async(
+                response=processed_responses,
+                prompt=formatted_prompts,
+                query=query,
+                context=context,
+                timeout=self._timeout,
+            )
+
+        # Batch processing
+        return await self._batch_score(
+            responses=cast(Sequence[dict[str, Any]], processed_responses),
+            prompts=formatted_prompts,
+            queries=query,
+            contexts=context,
+            capture_exceptions=False,
+        )
+
     def generate(
         self,
         *,
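For readers skimming the diff: the new `score_async` method mirrors the synchronous `score`, but can be awaited directly from a running event loop instead of going through `run_until_complete`. Below is a minimal usage sketch, assuming `TrustworthyRAG` is importable from the package's top level and constructible with default arguments (neither detail is shown in this diff):

```python
# Hedged sketch only; import path and constructor arguments are assumptions, not from this diff.
import asyncio

from cleanlab_tlm import TrustworthyRAG  # import path assumed


async def main() -> None:
    rag_evaluator = TrustworthyRAG()  # constructor arguments assumed/defaulted (e.g. API key via environment)
    score = await rag_evaluator.score_async(
        query="What is the return policy?",
        context="Items may be returned for a full refund within 30 days of purchase.",
        response="You can return items within 30 days for a full refund.",
    )
    print(score)


asyncio.run(main())
```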
@@ -212,11 +293,20 @@ def generate(
             form_prompt = TrustworthyRAG._default_prompt_formatter

         formatted_prompts = validate_rag_inputs(
-            query=query, context=context, prompt=prompt, form_prompt=form_prompt, evals=self._evals, is_generate=True
+            query=query,
+            context=context,
+            prompt=prompt,
+            form_prompt=form_prompt,
+            evals=self._evals,
+            is_generate=True,
         )

         # Check if we're handling a batch or a single item
-        if isinstance(query, str) and isinstance(context, str) and isinstance(formatted_prompts, str):
+        if (
+            isinstance(query, str)
+            and isinstance(context, str)
+            and isinstance(formatted_prompts, str)
+        ):
             return self._event_loop.run_until_complete(
                 self._generate_async(
                     prompt=formatted_prompts,
@@ -287,7 +377,9 @@ async def _batch_generate(
                     capture_exceptions=capture_exceptions,
                     batch_index=batch_index,
                 )
-                for batch_index, (prompt, query, context) in enumerate(zip(prompts, queries, contexts))
+                for batch_index, (prompt, query, context) in enumerate(
+                    zip(prompts, queries, contexts)
+                )
             ],
             per_batch_timeout,
         )
@@ -344,7 +436,9 @@ async def _batch_score(

     async def _batch_async(
         self,
-        rag_coroutines: Sequence[Coroutine[None, None, Union[TrustworthyRAGResponse, TrustworthyRAGScore]]],
+        rag_coroutines: Sequence[
+            Coroutine[None, None, Union[TrustworthyRAGResponse, TrustworthyRAGScore]]
+        ],
         batch_timeout: Optional[float] = None,
     ) -> Sequence[Union[TrustworthyRAGResponse, TrustworthyRAGScore]]:
         """Runs batch of TrustworthyRAG operations.
@@ -516,7 +610,9 @@ def _default_prompt_formatter(query: str, context: str) -> str:
         prompt_parts.append("---------------------\n")

         # Add instruction to use context
-        prompt_parts.append("Using the context information provided above, please answer the following question:\n")
+        prompt_parts.append(
+            "Using the context information provided above, please answer the following question:\n"
+        )

         # Add user query
         prompt_parts.append(f"User: {query.strip()}\n")
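Related note: per the `score`/`score_async` docstrings above, a custom `form_prompt(query, context)` callable can be supplied instead of relying on `_default_prompt_formatter`. A hedged sketch of such a formatter follows; the system-instruction wording is illustrative and not taken from the library:

```python
def form_prompt_with_system_instructions(query: str, context: str) -> str:
    # Loosely mirrors the default formatter's shape: instructions, a delimited context block, then the user query.
    return (
        "You are a customer-support assistant. Answer using only the provided context.\n\n"
        "Context:\n"
        "---------------------\n"
        f"{context.strip()}\n"
        "---------------------\n"
        "Using the context information provided above, please answer the following question:\n"
        f"User: {query.strip()}\n"
    )


# Passed via the form_prompt keyword, e.g.:
# rag_evaluator.score(..., form_prompt=form_prompt_with_system_instructions)
```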
@@ -557,7 +653,11 @@ def __init__(
         lazydocs: ignore
         """
         # Validate that at least one identifier is specified
-        if query_identifier is None and context_identifier is None and response_identifier is None:
+        if (
+            query_identifier is None
+            and context_identifier is None
+            and response_identifier is None
+        ):
             raise ValueError(
                 "At least one of query_identifier, context_identifier, or response_identifier must be specified."
             )
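For context on the validation above: at least one of the three identifiers must be set when defining a custom evaluation. A hedged sketch, assuming the keyword arguments visible in the `Eval`-construction hunk near the top of this diff and an assumed import path:

```python
from cleanlab_tlm.utils.rag import Eval  # import path assumed

# Would now raise ValueError: no identifier specified.
# Eval(name="conciseness", criteria="Determine whether the Response is concise.")

# Passes the check: the criteria references the response, so response_identifier is set.
conciseness_eval = Eval(
    name="conciseness",
    criteria="Determine whether the Response is concise and avoids unnecessary detail.",
    response_identifier="Response",
)
```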