33 changes: 2 additions & 31 deletions src/autora/doc/pipelines/main.py
@@ -3,13 +3,11 @@
from timeit import default_timer as timer
from typing import Dict, List, Tuple

import nltk
import torch
import typer
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from nltk.translate.meteor_score import single_meteor_score

from autora.doc.classes.EvalResult import EvalResult
from autora.doc.pipelines.metrics import eval_bleu_meteor
from autora.doc.runtime.predict_hf import Predictor
from autora.doc.runtime.prompts import PROMPTS, PromptIds
from autora.doc.util import get_prompts_from_file
@@ -22,33 +20,6 @@
logger = logging.getLogger(__name__)


def evaluate_documentation(predictions: List[str], references: List[str]) -> Tuple[float, float]:
nltk.download("wordnet")

# Tokenize references
tokenized_references = [ref.split() for ref in references]
# Currently there is exactly one prediction per reference; averaging over multiple predictions may be needed in the future
tokenized_predictions = [pred.split() if pred else [] for pred in predictions]

# Calculate BLEU score with smoothing function
# SmoothingFunction().method1 is used to avoid zero scores for n-grams not found in the reference.
bleu = corpus_bleu(
# Wrap each reference list in another list
[[tokenized_ref] for tokenized_ref in tokenized_references],
tokenized_predictions,
smoothing_function=SmoothingFunction().method1,
)

# Calculate METEOR scores
meteor_scores = [
single_meteor_score(tokenized_ref, tokenized_pred)
for tokenized_ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
]
meteor = sum(meteor_scores) / len(predictions) if predictions else 0

return (bleu, meteor)


@app.command(help="Evaluate a model for code-to-documentation generation for all prompts in the prompts_file")
def eval_prompts(
data_file: str = typer.Argument(..., help="JSONL Data file to evaluate on"),
@@ -143,7 +114,7 @@ def eval_prompt(
timer_start = timer()
predictions = pred.predict(prompt, inputs, **param_dict)
timer_end = timer()
bleu, meteor = evaluate_documentation(predictions, labels)
bleu, meteor = eval_bleu_meteor(predictions, labels)
pred_time = timer_end - timer_start
mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
for i in range(len(inputs)):
32 changes: 32 additions & 0 deletions src/autora/doc/pipelines/metrics.py
@@ -0,0 +1,32 @@
from typing import List, Tuple

import nltk
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from nltk.translate.meteor_score import single_meteor_score


def eval_bleu_meteor(predictions: List[str], references: List[str]) -> Tuple[float, float]:
nltk.download("wordnet")

# Tokenize references
tokenized_references = [ref.split() for ref in references]
# Currently there is exactly one prediction per reference; averaging over multiple predictions may be needed in the future
tokenized_predictions = [pred.split() if pred else [] for pred in predictions]

# Calculate BLEU score with smoothing function
# SmoothingFunction().method1 is used to avoid zero scores for n-grams not found in the reference.
bleu = corpus_bleu(
# Wrap each reference list in another list
[[tokenized_ref] for tokenized_ref in tokenized_references],
tokenized_predictions,
smoothing_function=SmoothingFunction().method1,
)

# Calculate METEOR scores
meteor_scores = [
single_meteor_score(tokenized_ref, tokenized_pred)
for tokenized_ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
]
meteor = sum(meteor_scores) / len(predictions) if predictions else 0

return (bleu, meteor)
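
A minimal usage sketch of the new helper (not part of the diff above), assuming the package is importable and that NLTK can download the WordNet data required by METEOR:

# Hypothetical usage example for the relocated metrics module.
from autora.doc.pipelines.metrics import eval_bleu_meteor

predictions = ["this is a test"]
references = ["this is a test"]
# Returns (bleu, meteor); both should be close to 1.0 for identical text.
bleu, meteor = eval_bleu_meteor(predictions, references)
print(f"BLEU={bleu:.2f}, METEOR={meteor:.2f}")
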
56 changes: 2 additions & 54 deletions tests/test_main.py
@@ -1,11 +1,8 @@
from pathlib import Path
from typing import Dict, List

import jsonlines
import pytest
from typing import List

from autora.doc.classes.EvalResult import EvalResult
from autora.doc.pipelines.main import eval, eval_prompts, evaluate_documentation, generate, import_data
from autora.doc.pipelines.main import eval, eval_prompts, generate, import_data
from autora.doc.runtime.prompts import PromptIds

# dummy HF model for testing
@@ -20,55 +17,6 @@ def test_predict() -> None:
assert len(output) > 0, "Expected non-empty output"


def test_evaluation() -> None:
# Test case: METEOR and BLEU scores should be close to 1 when predictions match the references exactly
data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
with jsonlines.open(data) as reader:
items = [item for item in reader]
labels = [item["output"] for item in items]
predictions = [item["output"] for item in items]

bleu, meteor = evaluate_documentation(predictions, labels)
assert bleu == pytest.approx(1, 0.01), f"BLEU Score is {bleu}"
assert meteor == pytest.approx(1, 0.01), f"METEOR Score is {meteor}"


def test_extra_token_in_prediction() -> None:
# Test case: the extra token lowers n-gram precision, so BLEU drops; METEOR is robust to small mistakes
labels = ["this is a test"]
predictions = ["this is a test extra"]
bleu, meteor = evaluate_documentation(predictions, labels)
assert 0.6 <= bleu <= 0.8, f"BLEU Score is {bleu}"
assert 0.8 <= meteor <= 1, f"METEOR Score is {meteor}"


def test_missing_token_in_prediction() -> None:
# Test case: the missing token lowers BLEU (brevity penalty and lost n-grams); METEOR remains higher
labels = ["this is a test"]
predictions = ["this is a"]
bleu, meteor = evaluate_documentation(predictions, labels)
assert 0.4 <= bleu <= 0.6, f"BLEU Score is {bleu}"
assert 0.6 <= meteor <= 0.8, f"METEOR Score is {meteor}"


def test_completely_different_tokens() -> None:
# Test case: both scores are near zero, since the prediction shares no tokens with the reference
labels = ["this is a test"]
predictions = ["completely different sentence"]
bleu, meteor = evaluate_documentation(predictions, labels)
assert bleu <= 0.1, f"BLEU Score is {bleu}"
assert meteor <= 0.1, f"METEOR Score is {meteor}"


def test_partially_matching_tokens() -> None:
# Test case: the inserted token breaks higher-order n-gram matches, so BLEU is low; METEOR still scores the overlap well
labels = ["this is a test"]
predictions = ["this is a different test"]
bleu, meteor = evaluate_documentation(predictions, labels)
assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"


def test_generate() -> None:
python_file = __file__
output = Path("output.txt")
55 changes: 55 additions & 0 deletions tests/test_metrics.py
@@ -0,0 +1,55 @@
from pathlib import Path

import jsonlines
import pytest

from autora.doc.pipelines.metrics import eval_bleu_meteor


def test_evaluation() -> None:
# Test case: METEOR and BLEU scores should be close to 1 when predictions match the references exactly
data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
with jsonlines.open(data) as reader:
items = [item for item in reader]
labels = [item["output"] for item in items]
predictions = [item["output"] for item in items]

bleu, meteor = eval_bleu_meteor(predictions, labels)
assert bleu == pytest.approx(1, 0.01), f"BLEU Score is {bleu}"
assert meteor == pytest.approx(1, 0.01), f"METEOR Score is {meteor}"


def test_extra_token_in_prediction() -> None:
# Test case: the extra token lowers n-gram precision, so BLEU drops; METEOR is robust to small mistakes
labels = ["this is a test"]
predictions = ["this is a test extra"]
bleu, meteor = eval_bleu_meteor(predictions, labels)
assert 0.6 <= bleu <= 0.8, f"BLEU Score is {bleu}"
assert 0.8 <= meteor <= 1, f"METEOR Score is {meteor}"


def test_missing_token_in_prediction() -> None:
# Test case: the missing token lowers BLEU (brevity penalty and lost n-grams); METEOR remains higher
labels = ["this is a test"]
predictions = ["this is a"]
bleu, meteor = eval_bleu_meteor(predictions, labels)
assert 0.4 <= bleu <= 0.6, f"BLEU Score is {bleu}"
assert 0.6 <= meteor <= 0.8, f"METEOR Score is {meteor}"


def test_completely_different_tokens() -> None:
# Test case: both scores are near zero, since the prediction shares no tokens with the reference
labels = ["this is a test"]
predictions = ["completely different sentence"]
bleu, meteor = eval_bleu_meteor(predictions, labels)
assert bleu <= 0.1, f"BLEU Score is {bleu}"
assert meteor <= 0.1, f"METEOR Score is {meteor}"


def test_partially_matching_tokens() -> None:
# Test case: the inserted token breaks higher-order n-gram matches, so BLEU is low; METEOR still scores the overlap well
labels = ["this is a test"]
predictions = ["this is a different test"]
bleu, meteor = eval_bleu_meteor(predictions, labels)
assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"