Commit fb36435

feat: Implement SemScore metric (#38)
1 parent 2a26151 · commit fb36435

11 files changed: +70 −31 lines

.github/actions/deps/action.yaml

Lines changed: 1 addition & 1 deletion
@@ -18,4 +18,4 @@ runs:
         python3 -m pip install --upgrade pip
     - name: Install project
       shell: sh
-      run: pip install ".[dev,train,cuda]"
+      run: pip install ".[dev,pipelines,cuda]"

.mypy.ini

Lines changed: 3 additions & 0 deletions
@@ -11,3 +11,6 @@ ignore_missing_imports = True
 
 [mypy-nltk.*]
 ignore_missing_imports = True
+
+[mypy-sentence_transformers.*]
+ignore_missing_imports = True

README.md

Lines changed: 2 additions & 2 deletions
@@ -31,14 +31,14 @@ Once you have created a new environment, you can install this project for local
 development using the following commands:
 
 ```
->> pip install -e .'[dev,train]'
+>> pip install -e .'[dev,pipelines]'
 >> pre-commit install
 >> conda install pandoc
 ```
 
 Notes:
 1) The single quotes around `'[dev]'` may not be required for your operating system.
-3) Look at `pyproject.toml` for other optional dependencies, e.g. you can do `pip install -e ."[dev,train,cuda]"` if you want to use CUDA.
+3) Look at `pyproject.toml` for other optional dependencies, e.g. you can do `pip install -e ."[dev,pipelines,cuda]"` if you want to use CUDA.
 2) `pre-commit install` will initialize pre-commit for this local repository, so
 that a set of tests will be run prior to completing a local commit. For more
 information, see the Python Project Template documentation on

azureml/conda.yml

Lines changed: 1 addition & 0 deletions
@@ -17,3 +17,4 @@ dependencies:
   - nltk
   # This works, while installing from pytorch and cuda from conda does not
   - torch==2.0.1
+  - sentence_transformers>=2.3.1

notebooks/generate.ipynb

Lines changed: 12 additions & 6 deletions
@@ -25,7 +25,7 @@
     "# Uncomment to clone and install autodoc from GitHub\n",
     "# !pip uninstall -y autora-doc\n",
     "# !git clone https://github.com/AutoResearch/autodoc.git\n",
-    "# !pip install \"./autodoc[cuda,train]\"\n",
+    "# !pip install \"./autodoc[cuda,pipelines]\"\n",
     "\n",
     "# IMPORTANT: Please restart the runtime after running the above commands"
    ]
@@ -42,7 +42,7 @@
     "%autoreload 2\n",
     "from autora.doc.runtime.predict_hf import Predictor, preprocess_code\n",
     "from autora.doc.runtime.prompts import PROMPTS, PromptIds, PromptBuilder, SYS_GUIDES\n",
-    "from autora.doc.pipelines.main import evaluate_documentation\n",
+    "from autora.doc.pipelines.metrics import eval_bleu_meteor, eval_semscore\n",
     "from autora.doc.pipelines.main import eval_prompt, load_data"
    ]
   },
@@ -111,9 +111,13 @@
     "        top_k=10,\n",
     "        num_ret_seq=1,\n",
     "    )\n",
-    "    bleu, meteor = evaluate_documentation(output, [label])\n",
+    "    bleu, meteor = eval_bleu_meteor(output, [label])\n",
+    "    sem_score = eval_semscore(output, [label])\n",
+    "\n",
     "    for i, o in enumerate(output):\n",
-    "        print(f\"{promptid}\\n******* Output {i} ********. bleu={bleu}, meteor={meteor}\\n{o}\\n*************\\n\")"
+    "        print(\n",
+    "            f\"{promptid}\\n******* Output {i} ********. bleu={bleu}, meteor={meteor}, sem_score={sem_score}\\n{o}\\n*************\\n\"\n",
+    "        )"
    ]
   },
   {
@@ -176,8 +180,10 @@
   },
   "outputs": [],
   "source": [
-    "out, bleu, meteor = eval_prompt(data_file, pred, prompt, {\"max_new_tokens\": 800.0})\n",
-    "print(f\"bleu={bleu}, meteor={meteor}\\n{out[0]}\\n*************\\n\")"
+    "eval_result = eval_prompt(data_file, pred, prompt, {\"max_new_tokens\": 800.0})\n",
+    "print(\n",
+    "    f\"bleu={eval_result.bleu_score}, meteor={eval_result.meteor_score}, sem_score={eval_result.sem_score}\\n{eval_result.predictions[0]}\\n*************\\n\"\n",
+    ")"
    ]
   },
   {

pyproject.toml

Lines changed: 1 addition & 2 deletions
@@ -19,7 +19,6 @@ dependencies = [
     # This works, while installing from pytorch and cuda from conda does not",
     "torch==2.0.1",
     "transformers>=4.37.2",
-    "nltk",
 ]
 
 # On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes)
@@ -44,7 +43,7 @@ dev = [
     "ipykernel",
     "hf_transfer",
 ]
-train = ["jsonlines", "mlflow"]
+pipelines = ["jsonlines", "mlflow", "nltk", "sentence-transformers>=2.3.1"]
 azure = ["azureml-core", "azureml-mlflow"]
 cuda = ["bitsandbytes>=0.42.0", "accelerate>=0.24.1", "xformers"]
 

src/autora/doc/classes/EvalResult.py

Lines changed: 2 additions & 1 deletion
@@ -6,7 +6,8 @@
 class EvalResult:
     """Class for storing LLM evaluation results"""
 
-    prediction: List[str]
+    predictions: List[str]
     prompt: str
     bleu_score: Optional[float] = None
     meteor_score: Optional[float] = None
+    sem_score: Optional[float] = None
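
For orientation, here is a minimal sketch of how the updated class might be used once this change lands, assuming EvalResult is a dataclass (as the defaulted fields above suggest); the metric values are made up purely for illustration:

from autora.doc.classes.EvalResult import EvalResult

# Hypothetical values, for illustration only.
result = EvalResult(
    predictions=["Generated documentation for the experiment."],
    prompt="Describe the following AutoRA experiment code.",
    bleu_score=0.12,
    meteor_score=0.34,
    sem_score=0.81,
)
print(result.sem_score)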

src/autora/doc/pipelines/main.py

Lines changed: 7 additions & 13 deletions
@@ -7,7 +7,7 @@
 import typer
 
 from autora.doc.classes.EvalResult import EvalResult
-from autora.doc.pipelines.metrics import eval_bleu_meteor
+from autora.doc.pipelines.metrics import eval_bleu_meteor, eval_semscore
 from autora.doc.runtime.predict_hf import Predictor
 from autora.doc.runtime.prompts import PROMPTS, PromptIds
 from autora.doc.util import get_prompts_from_file
@@ -52,14 +52,8 @@ def eval_prompts(
     predictor = Predictor(model_path)
     for i in range(len(prompts_list)):
         logger.info(f"Starting to run model on prompt {i}")
-        prediction_with_scores = eval_prompt(data_file, predictor, prompts_list[i], param_dict)
+        eval_result = eval_prompt(data_file, predictor, prompts_list[i], param_dict)
         logger.info(f"Model run completed on prompt {i}: {prompts_list[i]}")
-        eval_result = EvalResult(
-            prediction_with_scores[0],
-            prompts_list[i],
-            prediction_with_scores[1],
-            prediction_with_scores[2],
-        )
         results_list.append(eval_result)
     return results_list
 
@@ -72,7 +66,7 @@ def eval(
     param: List[str] = typer.Option(
         [], help="Additional float parameters to pass to the model as name=float pairs"
     ),
-) -> Tuple[List[str], float, float]:
+) -> EvalResult:
     import mlflow
 
     mlflow.autolog()
@@ -104,9 +98,7 @@ def load_data(data_file: str) -> Tuple[List[str], List[str]]:
     return inputs, labels
 
 
-def eval_prompt(
-    data_file: str, pred: Predictor, prompt: str, param_dict: Dict[str, float]
-) -> Tuple[List[str], float, float]:
+def eval_prompt(data_file: str, pred: Predictor, prompt: str, param_dict: Dict[str, float]) -> EvalResult:
     import mlflow
 
     inputs, labels = load_data(data_file)
@@ -115,6 +107,7 @@ def eval_prompt(
     predictions = pred.predict(prompt, inputs, **param_dict)
     timer_end = timer()
     bleu, meteor = eval_bleu_meteor(predictions, labels)
+    semscore = eval_semscore(predictions, labels)
     pred_time = timer_end - timer_start
     mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
     for i in range(len(inputs)):
@@ -133,7 +126,8 @@ def eval_prompt(
     mlflow.log_metric("tokens/sec", total_tokens / pred_time)
     mlflow.log_metric("bleu_score", round(bleu, 5))
     mlflow.log_metric("meteor_score", round(meteor, 5))
-    return predictions, bleu, meteor
+    mlflow.log_metric("semscore", round(semscore, 5))
+    return EvalResult(predictions, prompt, bleu, meteor, semscore)
 
 
 @app.command()
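
A hedged sketch of how a caller might consume the new return type of eval_prompt; the model path and data file below are placeholders, and the assumption that PROMPTS maps a PromptIds value to its prompt text is not shown in this diff:

from autora.doc.pipelines.main import eval_prompt
from autora.doc.runtime.predict_hf import Predictor
from autora.doc.runtime.prompts import PROMPTS, PromptIds

# Placeholder model path and data file; substitute real ones.
pred = Predictor("some-org/some-hf-model")
prompt = PROMPTS[PromptIds.SWEETP_1]  # assumes PROMPTS is keyed by PromptIds
eval_result = eval_prompt("data/sweetpea/data.jsonl", pred, prompt, {"max_new_tokens": 800.0})

# The three metrics now travel together on one EvalResult instead of a bare tuple.
print(eval_result.bleu_score, eval_result.meteor_score, eval_result.sem_score)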

src/autora/doc/pipelines/metrics.py

Lines changed: 22 additions & 1 deletion
@@ -3,6 +3,9 @@
 import nltk
 from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
 from nltk.translate.meteor_score import single_meteor_score
+from numpy import dot, mean, nan_to_num
+from numpy.linalg import norm
+from sentence_transformers import SentenceTransformer
 
 
 def eval_bleu_meteor(predictions: List[str], references: List[str]) -> Tuple[float, float]:
@@ -27,6 +30,24 @@ def eval_bleu_meteor(predictions: List[str], references: List[str]) -> Tuple[flo
         single_meteor_score(tokenized_ref, tokenized_pred)
         for tokenized_ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
     ]
-    meteor = sum(meteor_scores) / len(predictions) if predictions else 0
+    meteor: float = nan_to_num(mean(meteor_scores), nan=0)
 
     return (bleu, meteor)
+
+
+def eval_semscore(predictions: List[str], references: List[str]) -> float:
+    """
+    Calculate sentence embedding similarity score.
+    https://arxiv.org/pdf/2401.17072.pdf
+    """
+    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
+
+    def score(pred: str, ref: str) -> float:
+        encodings = model.encode([pred, ref])
+        assert len(encodings) == 2
+        cos_dist: float = dot(encodings[0], encodings[1]) / norm(encodings[0]) * norm(encodings[1])
+        return cos_dist
+
+    scores = [score(pred, ref) for pred, ref in zip(predictions, references)]
+    semscore: float = nan_to_num(mean(scores), nan=0)
+    return semscore
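
The eval_semscore added above embeds each prediction/reference pair with all-mpnet-base-v2 and averages the pairwise cosine similarities. As a standalone illustration of the same idea (not the committed implementation), here is a sketch that uses sentence_transformers' util.cos_sim helper in place of the manual dot/norm arithmetic:

from typing import List

from sentence_transformers import SentenceTransformer, util


def semscore_sketch(predictions: List[str], references: List[str]) -> float:
    # Same embedding model as in the diff above.
    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    pred_emb = model.encode(predictions)
    ref_emb = model.encode(references)
    # Cosine similarity of each prediction with its own reference, then the mean.
    sims = [float(util.cos_sim(p, r)) for p, r in zip(pred_emb, ref_emb)]
    return sum(sims) / len(sims) if sims else 0.0


print(semscore_sketch(["The sky is blue."], ["The sky appears blue."]))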

tests/test_main.py

Lines changed: 4 additions & 4 deletions
@@ -11,9 +11,9 @@
 
 def test_predict() -> None:
     data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
-    outputs, _, _ = eval(str(data), TEST_HF_MODEL, PromptIds.SWEETP_1, [])
-    assert len(outputs) == 3, "Expected 3 outputs"
-    for output in outputs:
+    eval_result = eval(str(data), TEST_HF_MODEL, PromptIds.SWEETP_1, [])
+    assert len(eval_result.predictions) == 3, "Expected 3 outputs"
+    for output in eval_result.predictions:
         assert len(output) > 0, "Expected non-empty output"
 
 
@@ -42,5 +42,5 @@ def test_eval_prompts() -> None:
     results: List[EvalResult] = eval_prompts(str(data_file), TEST_HF_MODEL, str(prompts_file), [])
     assert len(results) == 3, "Expected 3 outputs"
     for result in results:
-        assert result.prediction is not None, "The prediction should not be None"
+        assert result.predictions is not None, "The prediction should not be None"
         assert result.prompt is not None, "The prompt should not be None"
