Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 69 additions & 10 deletions src/draive/evaluation/evaluator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import re
from asyncio import gather
from collections.abc import Callable, Collection
from collections.abc import Callable, Collection, Sequence
from typing import Annotated, Protocol, Self, cast, overload, runtime_checkable

from haiway import (
Expand All @@ -10,8 +9,10 @@
Immutable,
Meta,
MetaValues,
ObservabilityLevel,
State,
Validator,
concurrently,
ctx,
)

Expand Down Expand Up @@ -288,16 +289,19 @@ def lowest(
evaluator: PreparedEvaluator[Value],
/,
*evaluators: PreparedEvaluator[Value],
concurrent_tasks: int = 2,
) -> PreparedEvaluator[Value]:
"""
Create an evaluator that returns the lowest scoring result.
Create an evaluator that returns the lowest performing result.

Parameters
----------
evaluator : PreparedEvaluator[Value]
First evaluator to run
*evaluators : PreparedEvaluator[Value]
Additional evaluators to run
concurrent_tasks: int
Number of concurrently executed evaluators

Returns
-------
Expand All @@ -316,9 +320,10 @@ async def evaluate(
meta=META_EMPTY,
)

for result in await gather(
evaluator(value),
*(evaluator(value) for evaluator in evaluators),
for result in await concurrently(
(evaluator(value), *(evaluator(value) for evaluator in evaluators)),
concurrent_tasks=concurrent_tasks,
return_exceptions=False,
):
if result.performance <= lowest.performance:
lowest = result
Expand All @@ -332,16 +337,19 @@ def highest(
evaluator: PreparedEvaluator[Value],
/,
*evaluators: PreparedEvaluator[Value],
concurrent_tasks: int = 2,
) -> PreparedEvaluator[Value]:
"""
Create an evaluator that returns the highest scoring result.
Create an evaluator that returns the highest performing result.

Parameters
----------
evaluator : PreparedEvaluator[Value]
First evaluator to run
*evaluators : PreparedEvaluator[Value]
Additional evaluators to run
concurrent_tasks: int
Number of concurrently executed evaluators

Returns
-------
Expand All @@ -360,9 +368,10 @@ async def evaluate(
meta=META_EMPTY,
)

for result in await gather(
evaluator(value),
*(evaluator(value) for evaluator in evaluators),
for result in await concurrently(
(evaluator(value), *(evaluator(value) for evaluator in evaluators)),
concurrent_tasks=concurrent_tasks,
return_exceptions=False,
):
if result.performance >= highest.performance:
highest = result
Expand All @@ -371,6 +380,55 @@ async def evaluate(

return evaluate

@staticmethod
def average(
    evaluator: PreparedEvaluator[Value],
    /,
    *evaluators: PreparedEvaluator[Value],
    threshold: EvaluationScoreValue,
    concurrent_tasks: int = 2,
) -> PreparedEvaluator[Value]:
    """
    Create an evaluator whose result is the mean score of all evaluators.

    Parameters
    ----------
    evaluator : PreparedEvaluator[Value]
        First evaluator to run
    *evaluators : PreparedEvaluator[Value]
        Additional evaluators to run
    threshold: EvaluationScoreValue
        Threshold applied to the combined result, overriding the
        individual evaluators' thresholds
    concurrent_tasks: int
        Number of concurrently executed evaluators

    Returns
    -------
    PreparedEvaluator[Value]
        Evaluator that returns the result with average score value
    """

    async def evaluate(
        value: Value,
    ) -> EvaluatorResult:
        # Run all evaluators with bounded concurrency; propagate failures.
        results = await concurrently(
            (evaluator(value), *(evaluator(value) for evaluator in evaluators)),
            concurrent_tasks=concurrent_tasks,
            return_exceptions=False,
        )
        # At least one evaluator is required by the signature, so the
        # list is never empty and the division below is safe.
        score_values: list[float] = [result.score for result in results]

        return EvaluatorResult(
            evaluator="average",
            score=sum(score_values) / len(score_values),
            threshold=evaluation_score_value(threshold),
            meta=META_EMPTY,
        )

    return evaluate

name: str
threshold: float
meta: Meta
Expand Down Expand Up @@ -618,6 +676,7 @@ async def __call__(
)

ctx.record(
ObservabilityLevel.INFO,
metric=f"evaluator.{result.evaluator}.performance",
value=result.performance,
unit="%",
Expand Down
2 changes: 1 addition & 1 deletion src/draive/evaluation/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ async def generate_case_parameters[Parameters: DataModel](
parameters,
instructions=INSTRUCTION.format(
guidelines=f"\n<GUIDELINES>\n{guidelines}\n</GUIDELINES>\n"
if guidelines is not None
if guidelines
else ""
),
input=INPUT,
Expand Down
65 changes: 24 additions & 41 deletions src/draive/evaluators/coherence.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,38 @@
from typing import cast

from draive.evaluation import EvaluationScore, EvaluationScoreValue, evaluator
from draive.evaluation import EvaluationScore, evaluator
from draive.evaluators.utils import FORMAT_INSTRUCTION, extract_evaluation_result
from draive.multimodal import Multimodal, MultimodalContent
from draive.stages import Stage

__all__ = ("coherence_evaluator",)


INSTRUCTION: str = """\
INSTRUCTION: str = f"""\
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Mark INSTRUCTION as Final

The INSTRUCTION constant should be annotated with Final to prevent accidental reassignment and align with strict typing guidelines.

Apply this diff:

+from typing import Final
+
 from draive.evaluation import EvaluationScore, evaluator
-INSTRUCTION: str = f"""\
+INSTRUCTION: Final[str] = f"""\
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
INSTRUCTION: str = f"""\
from typing import Final
from draive.evaluation import EvaluationScore, evaluator
INSTRUCTION: Final[str] = f"""\
🤖 Prompt for AI Agents
In src/draive/evaluators/coherence.py around line 9, the INSTRUCTION constant is
currently declared without a Final type annotation; update its declaration to
use typing.Final (e.g., INSTRUCTION: Final[str] = ...) and add an import for
Final from typing at the top of the file if not already present so the constant
is protected from reassignment and the type checker recognizes it as final.

You are evaluating the provided content according to the defined criteria.

<INSTRUCTION>
Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate\
the EVALUATED content using solely a coherence metric according to the EVALUATION_CRITERIA.
Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate the EVALUATED content using solely a coherence metric according to the EVALUATION_CRITERIA.
Think step by step and provide explanation of the score before the final score.
Use the explained RATING scale and the requested FORMAT to provide the result.
</INSTRUCTION>

<EVALUATION_CRITERIA>
Evaluated metric is coherence - a collective quality of the content.
We align this dimension with the DUC (Document Understanding Conference) quality question of\
structure and coherence, whereby the content should be well-structured and well-organized.
EVALUATED content should not just be a heap of related information, but should build from part
to part into a coherent body of information about the topic.
We align this dimension with the DUC (Document Understanding Conference) quality question of structure and coherence, whereby the content should be well-structured and well-organized.
EVALUATED content should not just be a heap of related information, but should build from part to part into a coherent body of information about the topic.
</EVALUATION_CRITERIA>
{guidelines}
{{guidelines}}
<RATING>
Assign a coherence score using exact name of one of the following values:
- "poor" is very low coherence, the content is chaotic, lacking logical connections between parts.
- "fair" is low coherence, some connections are visible, but the overall structure is weak.
- "good" is moderate coherence, the content has a noticeable structure, but with some shortcomings.
- "excellent" is high coherence, the content is well-organized with minor imperfections.
- "perfect" is very high coherence, the content is exemplarily structured, with smooth transitions\
between ideas.
- "perfect" is very high coherence, the content is exemplarily structured, with smooth transitions between ideas.
Use the "none" value for content that cannot be rated at all.
</RATING>

<FORMAT>
The final result containing only the rating value, HAVE to be put inside a `RESULT`\
xml tag within the result i.e. `<RESULT>good</RESULT>`.
</FORMAT>
"""
{FORMAT_INSTRUCTION}
""" # noqa: E501


@evaluator(name="coherence")
Expand All @@ -63,26 +55,17 @@ async def coherence_evaluator(
meta={"comment": "Reference was empty!"},
)

completion: MultimodalContent = await Stage.completion(
MultimodalContent.of(
"<REFERENCE>",
reference,
"</REFERENCE>\n<EVALUATED>",
evaluated,
"</EVALUATED>",
),
instructions=INSTRUCTION.format(
guidelines=f"\n<GUIDELINES>\n{guidelines}\n</GUIDELINES>\n"
if guidelines is not None
else ""
),
).execute()

if result := completion.tag("RESULT"):
return EvaluationScore.of(
cast(EvaluationScoreValue, result.content.to_str().strip().lower()),
meta={"comment": completion.to_str()},
)

else:
raise ValueError(f"Invalid evaluator result:\n{completion}")
return extract_evaluation_result(
await Stage.completion(
MultimodalContent.of(
"<REFERENCE>",
reference,
"</REFERENCE>\n<EVALUATED>",
evaluated,
"</EVALUATED>",
),
instructions=INSTRUCTION.format(
guidelines=f"\n<GUIDELINES>\n{guidelines}\n</GUIDELINES>\n" if guidelines else "",
),
).execute()
)
76 changes: 27 additions & 49 deletions src/draive/evaluators/completeness.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,36 @@
from typing import cast

from draive.evaluation import EvaluationScore, EvaluationScoreValue, evaluator
from draive.evaluation import EvaluationScore, evaluator
from draive.evaluators.utils import FORMAT_INSTRUCTION, extract_evaluation_result
from draive.multimodal import Multimodal, MultimodalContent
from draive.stages import Stage

__all__ = ("completeness_evaluator",)


INSTRUCTION: str = """\
INSTRUCTION: str = f"""\
You are evaluating the provided content according to the defined criteria.

<INSTRUCTION>
Compare the USER_QUERY and the EVALUATED content by carefully examining them, then rate\
the EVALUATED content using solely a completeness metric according to the EVALUATION_CRITERIA.
Compare the USER_QUERY and the EVALUATED content by carefully examining them, then rate the EVALUATED content using solely a completeness metric according to the EVALUATION_CRITERIA.
Think step by step and provide explanation of the score before the final score.
Use the explained RATING scale and the requested FORMAT to provide the result.
</INSTRUCTION>

<EVALUATION_CRITERIA>
Evaluated metric is completeness - the extent to which the EVALUATED content fully\
addresses and answers all aspects of the USER_QUERY. Complete content should address\
all parts of multi-part questions, provide comprehensive responses to complex queries,\
and not leave important aspects of the user's request unanswered.
Evaluated metric is completeness - the extent to which the EVALUATED content fully addresses and answers all aspects of the USER_QUERY. Complete content should address all parts of multi-part questions, provide comprehensive responses to complex queries, and not leave important aspects of the user's request unanswered.
</EVALUATION_CRITERIA>
{guidelines}
{{guidelines}}
<RATING>
Assign a completeness score using exact name of one of the following values:
- "poor" is very low completeness, the content addresses very few aspects of the\
user's query, leaving most questions unanswered.
- "fair" is low completeness, the content addresses some aspects of the user's query\
but leaves several important parts unanswered or incomplete.
- "good" is moderate completeness, the content addresses most aspects of the user's\
query but may miss some details or minor components.
- "excellent" is high completeness, the content addresses nearly all aspects of the\
user's query with only minor gaps or omissions.
- "perfect" is very high completeness, the content fully and comprehensively addresses\
all aspects of the user's query without any significant omissions.
- "poor" is very low completeness, the content addresses very few aspects of the user's query, leaving most questions unanswered.
- "fair" is low completeness, the content addresses some aspects of the user's query but leaves several important parts unanswered or incomplete.
- "good" is moderate completeness, the content addresses most aspects of the user's query but may miss some details or minor components.
- "excellent" is high completeness, the content addresses nearly all aspects of the user's query with only minor gaps or omissions.
- "perfect" is very high completeness, the content fully and comprehensively addresses all aspects of the user's query without any significant omissions.
Use the "none" value for content that cannot be rated at all.
</RATING>

<FORMAT>
The final result containing only the rating value, HAVE to be put inside a `RESULT`\
xml tag within the result i.e. `<RESULT>good</RESULT>`.
</FORMAT>
"""
{FORMAT_INSTRUCTION}
""" # noqa: E501


@evaluator(name="completeness")
Expand Down Expand Up @@ -92,26 +79,17 @@ async def completeness_evaluator(
meta={"comment": "User query was empty!"},
)

completion: MultimodalContent = await Stage.completion(
MultimodalContent.of(
"<USER_QUERY>",
user_query,
"</USER_QUERY>\n<EVALUATED>",
evaluated,
"</EVALUATED>",
),
instructions=INSTRUCTION.format(
guidelines=f"\n<GUIDELINES>\n{guidelines}\n</GUIDELINES>\n"
if guidelines is not None
else ""
),
).execute()

if result := completion.tag("RESULT"):
return EvaluationScore.of(
cast(EvaluationScoreValue, result.content.to_str().strip().lower()),
meta={"comment": completion.to_str()},
)

else:
raise ValueError(f"Invalid evaluator result:\n{completion}")
return extract_evaluation_result(
await Stage.completion(
MultimodalContent.of(
"<USER_QUERY>",
user_query,
"</USER_QUERY>\n<EVALUATED>",
evaluated,
"</EVALUATED>",
),
instructions=INSTRUCTION.format(
guidelines=f"\n<GUIDELINES>\n{guidelines}\n</GUIDELINES>\n" if guidelines else "",
),
).execute()
)
Loading