Commit: Merge branch 'main' into feature/xpia-sim-and-eval-fixes
Showing 17 changed files with 529 additions and 3 deletions.
9 changes: 9 additions & 0 deletions
src/promptflow-evals/promptflow/evals/evaluators/_bleu/__init__.py
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from ._bleu import BleuScoreEvaluator

__all__ = [
    "BleuScoreEvaluator",
]
72 changes: 72 additions & 0 deletions
src/promptflow-evals/promptflow/evals/evaluators/_bleu/_bleu.py
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

from promptflow._utils.async_utils import async_run_allowing_running_loop
from promptflow.evals._common.utils import nltk_tokenize


class _AsyncBleuScoreEvaluator:
    def __init__(self):
        pass

    async def __call__(self, *, answer: str, ground_truth: str, **kwargs):
        reference_tokens = nltk_tokenize(ground_truth)
        hypothesis_tokens = nltk_tokenize(answer)

        smoothing_function = SmoothingFunction().method4
        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)

        return {
            "bleu_score": score,
        }


class BleuScoreEvaluator:
    """
    Evaluator that computes the BLEU Score between two strings.

    BLEU (Bilingual Evaluation Understudy) score is commonly used in natural language processing (NLP) and machine
    translation. It is widely used in text summarization and text generation use cases. It evaluates how closely the
    generated text matches the reference text. The BLEU score ranges from 0 to 1, with higher scores indicating
    better quality.

    **Usage**

    .. code-block:: python

        eval_fn = BleuScoreEvaluator()
        result = eval_fn(
            answer="Tokyo is the capital of Japan.",
            ground_truth="The capital of Japan is Tokyo.")

    **Output format**

    .. code-block:: python

        {
            "bleu_score": 0.22
        }
    """

    def __init__(self):
        self._async_evaluator = _AsyncBleuScoreEvaluator()

    def __call__(self, *, answer: str, ground_truth: str, **kwargs):
        """
        Evaluate the BLEU score between the answer and the ground truth.

        :keyword answer: The answer to be evaluated.
        :paramtype answer: str
        :keyword ground_truth: The ground truth to be compared against.
        :paramtype ground_truth: str
        :return: The BLEU score.
        :rtype: dict
        """
        return async_run_allowing_running_loop(
            self._async_evaluator, answer=answer, ground_truth=ground_truth, **kwargs
        )

    def _to_async(self):
        return self._async_evaluator
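The SmoothingFunction().method4 choice above is what keeps sentence-level BLEU informative on short texts. A minimal sketch of the effect, assuming only that nltk is installed; the tokens are illustrative and not part of this diff:

# Illustrative only: why the evaluator passes SmoothingFunction().method4.
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

reference = "the capital of japan is tokyo".split()
hypothesis = "japan's capital is tokyo".split()

# With the default 1- to 4-gram weights, this hypothesis has no matching
# 4-gram, so the unsmoothed geometric mean collapses to 0 (nltk warns).
unsmoothed = sentence_bleu([reference], hypothesis)

# method4 backs off on absent higher-order n-grams, so near-misses still
# receive a graded, nonzero score.
smoothed = sentence_bleu(
    [reference], hypothesis, smoothing_function=SmoothingFunction().method4
)

print(unsmoothed, smoothed)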
9 changes: 9 additions & 0 deletions
src/promptflow-evals/promptflow/evals/evaluators/_gleu/__init__.py
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from ._gleu import GleuScoreEvaluator

__all__ = [
    "GleuScoreEvaluator",
]
71 changes: 71 additions & 0 deletions
src/promptflow-evals/promptflow/evals/evaluators/_gleu/_gleu.py
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from nltk.translate.gleu_score import sentence_gleu

from promptflow._utils.async_utils import async_run_allowing_running_loop
from promptflow.evals._common.utils import nltk_tokenize


class _AsyncGleuScoreEvaluator:
    def __init__(self):
        pass

    async def __call__(self, *, ground_truth: str, answer: str, **kwargs):
        reference_tokens = nltk_tokenize(ground_truth)
        hypothesis_tokens = nltk_tokenize(answer)

        score = sentence_gleu([reference_tokens], hypothesis_tokens)

        return {
            "gleu_score": score,
        }


class GleuScoreEvaluator:
    """
    Evaluator that computes the GLEU Score between two strings.

    The GLEU (Google-BLEU) score evaluator measures the similarity between generated and reference texts by
    evaluating n-gram overlap, considering both precision and recall. This balanced evaluation, designed for
    sentence-level assessment, makes it ideal for detailed analysis of translation quality. GLEU is well-suited for
    use cases such as machine translation, text summarization, and text generation.

    **Usage**

    .. code-block:: python

        eval_fn = GleuScoreEvaluator()
        result = eval_fn(
            answer="Tokyo is the capital of Japan.",
            ground_truth="The capital of Japan is Tokyo.")

    **Output format**

    .. code-block:: python

        {
            "gleu_score": 0.41
        }
    """

    def __init__(self):
        self._async_evaluator = _AsyncGleuScoreEvaluator()

    def __call__(self, *, ground_truth: str, answer: str, **kwargs):
        """
        Evaluate the GLEU score between the answer and the ground truth.

        :keyword answer: The answer to be evaluated.
        :paramtype answer: str
        :keyword ground_truth: The ground truth to be compared against.
        :paramtype ground_truth: str
        :return: The GLEU score.
        :rtype: dict
        """
        return async_run_allowing_running_loop(
            self._async_evaluator, ground_truth=ground_truth, answer=answer, **kwargs
        )

    def _to_async(self):
        return self._async_evaluator
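Unlike the BLEU evaluator above, this one needs no smoothing: sentence_gleu pools every n-gram order into a single precision/recall pair and reports their minimum, rather than multiplying per-order precisions the way BLEU does. A small sketch under the same assumptions (nltk installed, illustrative tokens):

# Illustrative only: the n-gram range is tunable via min_len/max_len.
from nltk.translate.gleu_score import sentence_gleu

reference = "the capital of japan is tokyo".split()
hypothesis = "japan's capital is tokyo".split()

default_score = sentence_gleu([reference], hypothesis)             # 1- to 4-grams
unigram_score = sentence_gleu([reference], hypothesis, max_len=1)  # order-insensitive

print(default_score, unigram_score)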
9 changes: 9 additions & 0 deletions
src/promptflow-evals/promptflow/evals/evaluators/_meteor/__init__.py
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from ._meteor import MeteorScoreEvaluator

__all__ = [
    "MeteorScoreEvaluator",
]
91 changes: 91 additions & 0 deletions
src/promptflow-evals/promptflow/evals/evaluators/_meteor/_meteor.py
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from nltk.translate.meteor_score import single_meteor_score

from promptflow._utils.async_utils import async_run_allowing_running_loop
from promptflow.evals._common.utils import nltk_tokenize


class _AsyncMeteorScoreEvaluator:
    def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
        self._alpha = alpha
        self._beta = beta
        self._gamma = gamma

    async def __call__(self, *, ground_truth: str, answer: str, **kwargs):
        reference_tokens = nltk_tokenize(ground_truth)
        hypothesis_tokens = nltk_tokenize(answer)

        score = single_meteor_score(
            reference_tokens,
            hypothesis_tokens,
            alpha=self._alpha,
            beta=self._beta,
            gamma=self._gamma,
        )

        return {
            "meteor_score": score,
        }


class MeteorScoreEvaluator:
    """
    Evaluator that computes the METEOR Score between two strings.

    The METEOR (Metric for Evaluation of Translation with Explicit Ordering) score evaluates generated text by
    comparing it to reference texts, focusing on precision, recall, and content alignment. It addresses limitations
    of other metrics like BLEU by considering synonyms, stemming, and paraphrasing, which more accurately captures
    meaning and language variations. In addition to machine translation and text summarization, paraphrase
    detection is an optimal use case for the METEOR score.

    :param alpha: The METEOR score alpha parameter. Default is 0.9.
    :type alpha: float
    :param beta: The METEOR score beta parameter. Default is 3.0.
    :type beta: float
    :param gamma: The METEOR score gamma parameter. Default is 0.5.
    :type gamma: float

    **Usage**

    .. code-block:: python

        eval_fn = MeteorScoreEvaluator(
            alpha=0.9,
            beta=3.0,
            gamma=0.5
        )
        result = eval_fn(
            answer="Tokyo is the capital of Japan.",
            ground_truth="The capital of Japan is Tokyo.")

    **Output format**

    .. code-block:: python

        {
            "meteor_score": 0.62
        }
    """

    def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
        self._async_evaluator = _AsyncMeteorScoreEvaluator(alpha=alpha, beta=beta, gamma=gamma)

    def __call__(self, *, ground_truth: str, answer: str, **kwargs):
        """
        Evaluate the METEOR score between the answer and the ground truth.

        :keyword answer: The answer to be evaluated.
        :paramtype answer: str
        :keyword ground_truth: The ground truth to be compared against.
        :paramtype ground_truth: str
        :return: The METEOR score.
        :rtype: dict
        """
        return async_run_allowing_running_loop(
            self._async_evaluator, ground_truth=ground_truth, answer=answer, **kwargs
        )

    def _to_async(self):
        return self._async_evaluator
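In METEOR, alpha weights precision against recall in the harmonic mean, while beta and gamma shape the fragmentation penalty that punishes out-of-order matches. A sketch of gamma's effect, assuming nltk plus its WordNet data (nltk.download("wordnet")); tokens are illustrative:

# Illustrative only: gamma scales the fragmentation penalty, so gamma=0
# makes the score insensitive to word order.
from nltk.translate.meteor_score import single_meteor_score

reference = "the capital of japan is tokyo".split()
hypothesis = "tokyo is the capital of japan".split()

with_penalty = single_meteor_score(reference, hypothesis, alpha=0.9, beta=3.0, gamma=0.5)
no_penalty = single_meteor_score(reference, hypothesis, alpha=0.9, beta=3.0, gamma=0.0)

print(with_penalty, no_penalty)  # the gamma=0.0 score is at least as high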
10 changes: 10 additions & 0 deletions
src/promptflow-evals/promptflow/evals/evaluators/_rouge/__init__.py
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from ._rouge import RougeScoreEvaluator, RougeType

__all__ = [
    "RougeScoreEvaluator",
    "RougeType",
]
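The _rouge.py implementation did not load in this view, so only the exported names are visible. Purely as a hypothetical sketch, patterned on the sibling evaluators in this commit — the rouge_type parameter and the RougeType member below are assumptions, not confirmed by the diff:

# Hypothetical usage only: RougeType.ROUGE_L and the rouge_type keyword are
# assumed from the export names, not from the (unloaded) _rouge.py diff.
from promptflow.evals.evaluators._rouge import RougeScoreEvaluator, RougeType

eval_fn = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_L)  # assumed signature
result = eval_fn(
    answer="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
print(result)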