
Commit fa3ef62

Commit message: debug

1 parent: 213eda8

2 files changed (+7, -7)

docs/source/adding-a-new-metric.mdx (2 additions, 2 deletions)

````diff
@@ -58,7 +58,7 @@ boolean.
 
 ```python
 def custom_metric(doc: Doc, model_response: ModelResponse) -> bool:
-    response = model_response.text[0]
+    response = model_response.final_text[0]
     return response == doc.choices[doc.gold_index]
 ```
 
@@ -68,7 +68,7 @@ If you want to return multiple metrics per sample, you need to return a dictionary
 
 ```python
 def custom_metric(doc: Doc, model_response: ModelResponse) -> dict:
-    response = model_response.text[0]
+    response = model_response.final_text[0]
     return {"accuracy": response == doc.choices[doc.gold_index], "other_metric": 0.5}
 ```
 
````
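For context, here is a minimal, hedged sketch of how a `custom_metric` function like the one documented above might be wired into a sample-level metric object. The `SampleLevelMetric` class, its field names, and the import paths below are assumptions drawn from the surrounding "adding a new metric" guide, not part of this commit; depending on the lighteval version, additional fields (for example a metric category) may also be required.

```python
# Hedged sketch, not part of this commit: wrapping the per-sample scoring
# function from the docs into a metric object. Class name, field names, and
# import paths are assumptions and may differ between lighteval versions.
import numpy as np

from lighteval.metrics.utils.metric_utils import SampleLevelMetric  # assumed import path
from lighteval.models.model_output import ModelResponse  # assumed import path
from lighteval.tasks.requests import Doc  # assumed import path


def custom_metric(doc: Doc, model_response: ModelResponse) -> bool:
    # Read the (post-processed) model output via final_text, as this commit does.
    response = model_response.final_text[0]
    return response == doc.choices[doc.gold_index]


my_custom_metric = SampleLevelMetric(
    metric_name="my_custom_metric",  # name under which results would be reported
    sample_level_fn=custom_metric,   # per-sample scoring function shown above
    corpus_level_fn=np.mean,         # aggregate the per-sample booleans into one score
    higher_is_better=True,
)
```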
src/lighteval/metrics/metrics_sample.py (5 additions, 5 deletions)

````diff
@@ -1027,7 +1027,7 @@ def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs) ->
         questions = [formatted_doc.query for formatted_doc in docs]
         options = [formatted_doc.choices for formatted_doc in docs]
         golds = [formatted_doc.get_golds()[0] for formatted_doc in docs]
-        predictions = [response.text[0] for response in responses]
+        predictions = [response.final_text[0] for response in responses]
 
         scores, messages, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds)
 
@@ -1059,7 +1059,7 @@ def compute(self, model_response: list[ModelResponse], doc: list[Doc], **kwargs)
         # If we are evaluating a multiturn task, we need to have specific field in the formatted doc
         questions = [doc.specific["multi_turn_queries"] for doc in docs]
         golds = [doc.specific.get("reference", None) for doc in docs]
-        predictions = [response.text[0] for response in model_responses]
+        predictions = [response.final_text[0] for response in model_responses]
 
         query_context_1 = {"query": questions[0], "context": ""}
         query_context_2 = {"query": questions[1], "context": predictions[0]}
@@ -1089,7 +1089,7 @@ def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs):
         questions = [doc.specific["question"] for doc in docs]
         options = [doc.choices for doc in docs]
         golds = [doc.get_golds()[0] for doc in docs]
-        predictions = [response.text[0] for response in responses]
+        predictions = [response.final_text[0] for response in responses]
 
         scores, messages, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds)
 
@@ -1098,8 +1098,8 @@ def compute(self, responses: list[ModelResponse], docs: list[Doc], **kwargs):
             metrics.append(
                 {
                     f"judge_score_{self.short_judge_name}": scores[i],
-                    f"user_prompt_{self.short_judge_name}": messages[i],
-                    f"judgement_{self.short_judge_name}": judgements[i],
+                    # f"user_prompt_{self.short_judge_name}": messages[i],
+                    # f"judgement_{self.short_judge_name}": judgements[i],
                 }
             )
 
````