 from pathlib import Path

-from autora.doc.pipelines.main import eval, generate, import_data
+import jsonlines
+import pytest
+
+from autora.doc.pipelines.main import eval, evaluate_documentation, generate, import_data
 from autora.doc.runtime.prompts import InstructionPrompts, SystemPrompts

 # dummy HF model for testing
@@ -15,6 +18,55 @@ def test_predict() -> None:
     assert len(output[0]) > 0, "Expected non-empty output"


+def test_evaluation() -> None:
+    # Test case: METEOR and BLEU scores are close to 1 when predictions match the labels exactly
+    data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
+    with jsonlines.open(data) as reader:
+        items = [item for item in reader]
+        labels = [item["output"] for item in items]
+        predictions = [[item["output"]] for item in items]
+
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert bleu == pytest.approx(1, 0.01), f"BLEU Score is {bleu}"
+    assert meteor == pytest.approx(1, 0.01), f"METEOR Score is {meteor}"
+
+
+def test_extra_token_in_prediction() -> None:
+    # Test case: the extra token lowers BLEU's n-gram precision, while METEOR is robust to small mistakes
+    labels = ["this is a test"]
+    predictions = [["this is a test extra"]]
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert 0.6 <= bleu <= 0.8, f"BLEU Score is {bleu}"
+    assert 0.8 <= meteor <= 1, f"METEOR Score is {meteor}"
+
+
+def test_missing_token_in_prediction() -> None:
+    # Test case: the missing token triggers BLEU's brevity penalty; METEOR stays somewhat higher
+    labels = ["this is a test"]
+    predictions = [["this is a"]]
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert 0.4 <= bleu <= 0.6, f"BLEU Score is {bleu}"
+    assert 0.6 <= meteor <= 0.8, f"METEOR Score is {meteor}"
+
+
+def test_completely_different_tokens() -> None:
+    # Test case: with no tokens in common, both scores are close to zero
+    labels = ["this is a test"]
+    predictions = [["completely different sentence"]]
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert bleu <= 0.1, f"BLEU Score is {bleu}"
+    assert meteor <= 0.1, f"METEOR Score is {meteor}"
+
+
+def test_partially_matching_tokens() -> None:
+    # Test case: the inserted token breaks most higher-order n-gram matches, so BLEU drops sharply; METEOR still scores well
+    labels = ["this is a test"]
+    predictions = [["this is a different test"]]
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
+    assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"
+
+
 def test_generate() -> None:
     python_file = __file__
     output = Path("output.txt")