Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 69 additions & 10 deletions src/draive/evaluation/evaluator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import re
from asyncio import gather
from collections.abc import Callable, Collection
from collections.abc import Callable, Collection, Sequence
from typing import Annotated, Protocol, Self, cast, overload, runtime_checkable

from haiway import (
Expand All @@ -10,8 +9,10 @@
Immutable,
Meta,
MetaValues,
ObservabilityLevel,
State,
Validator,
concurrently,
ctx,
)

Expand Down Expand Up @@ -288,16 +289,19 @@ def lowest(
evaluator: PreparedEvaluator[Value],
/,
*evaluators: PreparedEvaluator[Value],
concurrent_tasks: int = 2,
) -> PreparedEvaluator[Value]:
"""
Create an evaluator that returns the lowest scoring result.
Create an evaluator that returns the lowest performing result.

Parameters
----------
evaluator : PreparedEvaluator[Value]
First evaluator to run
*evaluators : PreparedEvaluator[Value]
Additional evaluators to run
concurrent_tasks: int
Number of concurrently executed evaluators

Returns
-------
Expand All @@ -316,9 +320,10 @@ async def evaluate(
meta=META_EMPTY,
)

for result in await gather(
evaluator(value),
*(evaluator(value) for evaluator in evaluators),
for result in await concurrently(
(evaluator(value), *(evaluator(value) for evaluator in evaluators)),
concurrent_tasks=concurrent_tasks,
return_exceptions=False,
):
if result.performance <= lowest.performance:
lowest = result
Expand All @@ -332,16 +337,19 @@ def highest(
evaluator: PreparedEvaluator[Value],
/,
*evaluators: PreparedEvaluator[Value],
concurrent_tasks: int = 2,
) -> PreparedEvaluator[Value]:
"""
Create an evaluator that returns the highest scoring result.
Create an evaluator that returns the highest performing result.

Parameters
----------
evaluator : PreparedEvaluator[Value]
First evaluator to run
*evaluators : PreparedEvaluator[Value]
Additional evaluators to run
concurrent_tasks: int
Number of concurrently executed evaluators

Returns
-------
Expand All @@ -360,9 +368,10 @@ async def evaluate(
meta=META_EMPTY,
)

for result in await gather(
evaluator(value),
*(evaluator(value) for evaluator in evaluators),
for result in await concurrently(
(evaluator(value), *(evaluator(value) for evaluator in evaluators)),
concurrent_tasks=concurrent_tasks,
return_exceptions=False,
):
if result.performance >= highest.performance:
highest = result
Expand All @@ -371,6 +380,55 @@ async def evaluate(

return evaluate

@staticmethod
def average(
    evaluator: PreparedEvaluator[Value],
    /,
    *evaluators: PreparedEvaluator[Value],
    threshold: EvaluationScoreValue,
    concurrent_tasks: int = 2,
) -> PreparedEvaluator[Value]:
    """
    Create an evaluator whose result is the mean score of all evaluators.

    Parameters
    ----------
    evaluator : PreparedEvaluator[Value]
        First evaluator to run
    *evaluators : PreparedEvaluator[Value]
        Additional evaluators to run
    threshold: EvaluationScoreValue
        Threshold applied to the combined result, overriding the
        individual evaluators' thresholds
    concurrent_tasks: int
        Number of concurrently executed evaluators

    Returns
    -------
    PreparedEvaluator[Value]
        Evaluator that returns the result with average score value
    """

    async def evaluate(
        value: Value,
    ) -> EvaluatorResult:
        # Run all evaluators with bounded concurrency; propagate failures.
        results = await concurrently(
            (evaluator(value), *(evaluator(value) for evaluator in evaluators)),
            concurrent_tasks=concurrent_tasks,
            return_exceptions=False,
        )
        # At least one evaluator is required by the signature, so the
        # list is never empty and the division below is safe.
        score_values: list[float] = [result.score for result in results]

        return EvaluatorResult(
            evaluator="average",
            score=sum(score_values) / len(score_values),
            threshold=evaluation_score_value(threshold),
            meta=META_EMPTY,
        )

    return evaluate

name: str
threshold: float
meta: Meta
Expand Down Expand Up @@ -618,6 +676,7 @@ async def __call__(
)

ctx.record(
ObservabilityLevel.INFO,
metric=f"evaluator.{result.evaluator}.performance",
value=result.performance,
unit="%",
Expand Down
2 changes: 1 addition & 1 deletion src/draive/evaluation/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ async def generate_case_parameters[Parameters: DataModel](
parameters,
instructions=INSTRUCTION.format(
guidelines=f"\n<GUIDELINES>\n{guidelines}\n</GUIDELINES>\n"
if guidelines is not None
if guidelines
else ""
),
input=INPUT,
Expand Down
65 changes: 24 additions & 41 deletions src/draive/evaluators/coherence.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,38 @@
from typing import cast

from draive.evaluation import EvaluationScore, EvaluationScoreValue, evaluator
from draive.evaluation import EvaluationScore, evaluator
from draive.evaluators.utils import FORMAT_INSTRUCTION, extract_evaluation_result
from draive.multimodal import Multimodal, MultimodalContent
from draive.stages import Stage

__all__ = ("coherence_evaluator",)


INSTRUCTION: str = """\
INSTRUCTION: str = f"""\
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Mark INSTRUCTION as Final

The INSTRUCTION constant should be annotated with Final to prevent accidental reassignment and align with strict typing guidelines.

Apply this diff:

+from typing import Final
+
 from draive.evaluation import EvaluationScore, evaluator
-INSTRUCTION: str = f"""\
+INSTRUCTION: Final[str] = f"""\
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
INSTRUCTION: str = f"""\
from typing import Final
from draive.evaluation import EvaluationScore, evaluator
INSTRUCTION: Final[str] = f"""\
🤖 Prompt for AI Agents
In src/draive/evaluators/coherence.py around line 9, the INSTRUCTION constant is
currently declared without a Final type annotation; update its declaration to
use typing.Final (e.g., INSTRUCTION: Final[str] = ...) and add an import for
Final from typing at the top of the file if not already present so the constant
is protected from reassignment and the type checker recognizes it as final.

You are evaluating the provided content according to the defined criteria.

<INSTRUCTION>
Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate\
the EVALUATED content using solely a coherence metric according to the EVALUATION_CRITERIA.
Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate the EVALUATED content using solely a coherence metric according to the EVALUATION_CRITERIA.
Think step by step and provide explanation of the score before the final score.
Use the explained RATING scale and the requested FORMAT to provide the result.
</INSTRUCTION>

<EVALUATION_CRITERIA>
Evaluated metric is coherence - a collective quality of the content.
We align this dimension with the DUC (Document Understanding Conference) quality question of\
structure and coherence, whereby the content should be well-structured and well-organized.
EVALUATED content should not just be a heap of related information, but should build from part
to part into a coherent body of information about the topic.
We align this dimension with the DUC (Document Understanding Conference) quality question of structure and coherence, whereby the content should be well-structured and well-organized.
EVALUATED content should not just be a heap of related information, but should build from part to part into a coherent body of information about the topic.
</EVALUATION_CRITERIA>
{guidelines}
{{guidelines}}
<RATING>
Assign a coherence score using exact name of one of the following values:
- "poor" is very low coherence, the content is chaotic, lacking logical connections between parts.
- "fair" is low coherence, some connections are visible, but the overall structure is weak.
- "good" is moderate coherence, the content has a noticeable structure, but with some shortcomings.
- "excellent" is high coherence, the content is well-organized with minor imperfections.
- "perfect" is very high coherence, the content is exemplarily structured, with smooth transitions\
between ideas.
- "perfect" is very high coherence, the content is exemplarily structured, with smooth transitions between ideas.
Use the "none" value for content that cannot be rated at all.
</RATING>

<FORMAT>
The final result containing only the rating value, HAVE to be put inside a `RESULT`\
xml tag within the result i.e. `<RESULT>good</RESULT>`.
</FORMAT>
"""
{FORMAT_INSTRUCTION}
""" # noqa: E501


@evaluator(name="coherence")
Expand All @@ -63,26 +55,17 @@ async def coherence_evaluator(
meta={"comment": "Reference was empty!"},
)

completion: MultimodalContent = await Stage.completion(
MultimodalContent.of(
"<REFERENCE>",
reference,
"</REFERENCE>\n<EVALUATED>",
evaluated,
"</EVALUATED>",
),
instructions=INSTRUCTION.format(
guidelines=f"\n<GUIDELINES>\n{guidelines}\n</GUIDELINES>\n"
if guidelines is not None
else ""
),
).execute()

if result := completion.tag("RESULT"):
return EvaluationScore.of(
cast(EvaluationScoreValue, result.content.to_str().strip().lower()),
meta={"comment": completion.to_str()},
)

else:
raise ValueError(f"Invalid evaluator result:\n{completion}")
return extract_evaluation_result(
await Stage.completion(
MultimodalContent.of(
"<REFERENCE>",
reference,
"</REFERENCE>\n<EVALUATED>",
evaluated,
"</EVALUATED>",
),
instructions=INSTRUCTION.format(
guidelines=f"\n<GUIDELINES>\n{guidelines}\n</GUIDELINES>\n" if guidelines else "",
),
).execute()
)
76 changes: 27 additions & 49 deletions src/draive/evaluators/completeness.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,36 @@
from typing import cast

from draive.evaluation import EvaluationScore, EvaluationScoreValue, evaluator
from draive.evaluation import EvaluationScore, evaluator
from draive.evaluators.utils import FORMAT_INSTRUCTION, extract_evaluation_result
from draive.multimodal import Multimodal, MultimodalContent
from draive.stages import Stage

__all__ = ("completeness_evaluator",)


INSTRUCTION: str = """\
INSTRUCTION: str = f"""\
You are evaluating the provided content according to the defined criteria.

<INSTRUCTION>
Compare the USER_QUERY and the EVALUATED content by carefully examining them, then rate\
the EVALUATED content using solely a completeness metric according to the EVALUATION_CRITERIA.
Compare the USER_QUERY and the EVALUATED content by carefully examining them, then rate the EVALUATED content using solely a completeness metric according to the EVALUATION_CRITERIA.
Think step by step and provide explanation of the score before the final score.
Use the explained RATING scale and the requested FORMAT to provide the result.
</INSTRUCTION>

<EVALUATION_CRITERIA>
Evaluated metric is completeness - the extent to which the EVALUATED content fully\
addresses and answers all aspects of the USER_QUERY. Complete content should address\
all parts of multi-part questions, provide comprehensive responses to complex queries,\
and not leave important aspects of the user's request unanswered.
Evaluated metric is completeness - the extent to which the EVALUATED content fully addresses and answers all aspects of the USER_QUERY. Complete content should address all parts of multi-part questions, provide comprehensive responses to complex queries, and not leave important aspects of the user's request unanswered.
</EVALUATION_CRITERIA>
{guidelines}
{{guidelines}}
<RATING>
Assign a completeness score using exact name of one of the following values:
- "poor" is very low completeness, the content addresses very few aspects of the\
user's query, leaving most questions unanswered.
- "fair" is low completeness, the content addresses some aspects of the user's query\
but leaves several important parts unanswered or incomplete.
- "good" is moderate completeness, the content addresses most aspects of the user's\
query but may miss some details or minor components.
- "excellent" is high completeness, the content addresses nearly all aspects of the\
user's query with only minor gaps or omissions.
- "perfect" is very high completeness, the content fully and comprehensively addresses\
all aspects of the user's query without any significant omissions.
- "poor" is very low completeness, the content addresses very few aspects of the user's query, leaving most questions unanswered.
- "fair" is low completeness, the content addresses some aspects of the user's query but leaves several important parts unanswered or incomplete.
- "good" is moderate completeness, the content addresses most aspects of the user's query but may miss some details or minor components.
- "excellent" is high completeness, the content addresses nearly all aspects of the user's query with only minor gaps or omissions.
- "perfect" is very high completeness, the content fully and comprehensively addresses all aspects of the user's query without any significant omissions.
Use the "none" value for content that cannot be rated at all.
</RATING>

<FORMAT>
The final result containing only the rating value, HAVE to be put inside a `RESULT`\
xml tag within the result i.e. `<RESULT>good</RESULT>`.
</FORMAT>
"""
{FORMAT_INSTRUCTION}
""" # noqa: E501


@evaluator(name="completeness")
Expand Down Expand Up @@ -92,26 +79,17 @@ async def completeness_evaluator(
meta={"comment": "User query was empty!"},
)

completion: MultimodalContent = await Stage.completion(
MultimodalContent.of(
"<USER_QUERY>",
user_query,
"</USER_QUERY>\n<EVALUATED>",
evaluated,
"</EVALUATED>",
),
instructions=INSTRUCTION.format(
guidelines=f"\n<GUIDELINES>\n{guidelines}\n</GUIDELINES>\n"
if guidelines is not None
else ""
),
).execute()

if result := completion.tag("RESULT"):
return EvaluationScore.of(
cast(EvaluationScoreValue, result.content.to_str().strip().lower()),
meta={"comment": completion.to_str()},
)

else:
raise ValueError(f"Invalid evaluator result:\n{completion}")
return extract_evaluation_result(
await Stage.completion(
MultimodalContent.of(
"<USER_QUERY>",
user_query,
"</USER_QUERY>\n<EVALUATED>",
evaluated,
"</EVALUATED>",
),
instructions=INSTRUCTION.format(
guidelines=f"\n<GUIDELINES>\n{guidelines}\n</GUIDELINES>\n" if guidelines else "",
),
).execute()
)
Loading