Commit 092a0c8

feat: aspect critic with reference (#1638)

Parent: 1e7121b

2 files changed: +176 -7 lines
docs/concepts/metrics/available_metrics/general_purpose.md (+29 -1)
@@ -6,6 +6,8 @@ General purpose evaluation metrics are used to evaluate any given task.
 
 `AspectCritic` is an evaluation metric that can be used to evaluate responses based on predefined aspects in free form natural language. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not.
 
+**Without reference**
+
 ### Example
 
 ```python
@@ -15,7 +17,6 @@ from ragas.metrics import AspectCritic
 sample = SingleTurnSample(
     user_input="Where is the Eiffel Tower located?",
     response="The Eiffel Tower is located in Paris.",
-    reference="The Eiffel Tower is located in Paris.",
 )
 
 scorer = AspectCritic(
@@ -25,6 +26,31 @@ scorer = AspectCritic(
 scorer.llm = openai_model
 await scorer.single_turn_ascore(sample)
 ```
+
+**With reference**
+
+### Example
+
+```python
+from ragas.dataset_schema import SingleTurnSample
+from ragas.metrics import AspectCriticWithReference
+
+
+sample = SingleTurnSample(
+    user_input="Where is the Eiffel Tower located?",
+    response="The Eiffel Tower is located in Paris.",
+    reference="The Eiffel Tower is located in Paris.",
+)
+
+scorer = AspectCriticWithReference(
+    name="correctness",
+    definition="Is the response factually similar to the reference?",
+)
+scorer.llm = openai_model
+await scorer.single_turn_ascore(sample)
+```
+
 ### How it works
 
 Critics are essentially basic LLM calls using the defined criteria. For example, let's see how the harmfulness critic works:
@@ -39,6 +65,8 @@ Critics are essentially basic LLM calls using the defined criteria. For example,
 - Step 2: The majority vote from the returned verdicts determines the binary output.
 - Output: Yes
 
+
+
 ## Simple Criteria Scoring
 
 Coarse-grained evaluation is an evaluation metric that can be used to score (integer) responses based on a predefined single free-form scoring criterion. The output of coarse-grained evaluation is an integer score within the range specified in the criteria.
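
The majority-vote step described under "How it works" above boils down to counting the binary verdicts returned by `strictness` repeated LLM calls. A minimal illustrative sketch; the helper name and the hard-coded verdict list are hypothetical, not part of this commit:

```python
# Illustrative sketch of the majority-vote step: `verdicts` stands in for the
# 0/1 answers returned by `strictness` independent LLM calls.
from collections import Counter


def majority_verdict(verdicts: list[int]) -> int:
    """Return the most common binary verdict (1 = submission aligns with the aspect)."""
    return Counter(verdicts).most_common(1)[0][0]


print(majority_verdict([1, 1, 0]))  # -> 1
```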

src/ragas/metrics/_aspect_critic.py (+147 -6)
@@ -105,7 +105,10 @@ class AspectCritic(MetricWithLLM, SingleTurnMetric, MultiTurnMetric):
                 "user_input",
                 "response",
                 "retrieved_contexts:optional",
-            }
+            },
+            MetricType.MULTI_TURN: {
+                "user_input",
+            },
         }
     )
     single_turn_prompt: PydanticPrompt = field(
@@ -114,7 +117,9 @@ class AspectCritic(MetricWithLLM, SingleTurnMetric, MultiTurnMetric):
     multi_turn_prompt: PydanticPrompt = field(
         default_factory=lambda: MultiTurnAspectCriticPrompt()
     )
-    definition: str = field(default="", repr=True)
+    definition: str = field(
+        default="check if the response to the user input is correct", repr=True
+    )
     strictness: int = field(default=1, repr=False)
     max_retries: int = 1
 
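One effect of the hunk above is that `definition` now defaults to a generic correctness check instead of an empty string, so an `AspectCritic` built with only a name carries a usable criterion. A small sketch under that assumption; the metric name is illustrative:

```python
from ragas.metrics import AspectCritic

# With the new default, omitting `definition` no longer leaves it empty.
scorer = AspectCritic(name="response_correctness")
assert scorer.definition == "check if the response to the user input is correct"
```
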
@@ -183,13 +188,149 @@ async def _multi_turn_ascore(
         self, sample: MultiTurnSample, callbacks: Callbacks
     ) -> float:
         assert self.llm is not None, "LLM is not set"
-        assert sample.reference is not None, "Reference is not set"
 
         interaction = sample.pretty_repr()
-        reference = sample.reference
-        prompt_input = AspectCriticInput(
+        prompt_input = MultiTurnAspectCriticInput(
+            user_input=interaction,
+            criteria=self.definition,
+        )
+        response = await self.multi_turn_prompt.generate(
+            data=prompt_input,
+            llm=self.llm,
+            callbacks=callbacks,
+        )
+        return self._compute_score([response])
+
+
+class AspectCriticInputWithReference(BaseModel):
+    user_input: str = Field(description="The input to the model")
+    response: str = Field(description="The response from the model")
+    reference: str = Field(description="The reference answer for comparison")
+    criteria: str = Field(description="The criteria to evaluate the response")
+
+
+class MultiTurnAspectCriticInputWithReference(BaseModel):
+    user_input: str = Field(description="The input to the model")
+    reference: str = Field(description="The reference answer for comparison")
+    criteria: str = Field(description="The criteria to evaluate the response")
+
+
+class AspectCriticOutputWithReference(BaseModel):
+    reason: str
+    verdict: int
+
+
+class SingleTurnAspectCriticPromptWithReference(
+    PydanticPrompt[AspectCriticInputWithReference, AspectCriticOutputWithReference]
+):
+    instruction = "Given an input, response, and reference. Evaluate the submission only using the given criteria. Use only 'Yes' (1) and 'No' (0) as verdict."
+    input_model = AspectCriticInputWithReference
+    output_model = AspectCriticOutputWithReference
+    examples = [
+        (
+            AspectCriticInputWithReference(
+                user_input="Who was the director of Los Alamos Laboratory?",
+                response="Einstein was the director of Los Alamos Laboratory.",
+                reference="J. Robert Oppenheimer was the director of Los Alamos Laboratory.",
+                criteria="Is the output written in perfect grammar",
+            ),
+            AspectCriticOutputWithReference(
+                reason="The criteria for evaluation is whether the output is written in perfect grammar. In this case, the output is grammatically correct.",
+                verdict=1,
+            ),
+        )
+    ]
+
+
+@dataclass
+class AspectCriticWithReference(AspectCritic):
+    """
+    AspectCriticWithReference judges the submission to give binary results using the specified criteria.
+    It uses user_input, response and reference to evaluate the submission.
+
+    Attributes
+    ----------
+    name: str
+        name of the metric
+    definition: str
+        criteria to judge the submission, example "Is the submission spreading
+        fake information?"
+    strictness: int
+        The number of times self-consistency checks are made. The final judgement is
+        made using a majority vote.
+    """
+
+    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
+        default_factory=lambda: {
+            MetricType.SINGLE_TURN: {
+                "user_input",
+                "response",
+                "reference",
+                "retrieved_contexts:optional",
+            },
+            MetricType.MULTI_TURN: {
+                "user_input",
+                "reference",
+            },
+        }
+    )
+    definition: str = field(
+        default="check if response is similar to reference", repr=True
+    )
+    single_turn_prompt: PydanticPrompt = field(
+        default_factory=lambda: SingleTurnAspectCriticPromptWithReference()
+    )
+
+    multi_turn_prompt: PydanticPrompt = field(
+        default_factory=lambda: MultiTurnAspectCriticPrompt()
+    )
+
+    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
+
+        if self.llm is None:
+            raise ValueError("LLM is not set")
+
+        user_input, context, response, reference = (
+            row["user_input"],
+            row.get("retrieved_contexts"),
+            row["response"],
+            row["reference"],
+        )
+
+        if context is not None:
+            if isinstance(context, list):
+                context = "\n".join(context)
+            user_input = f"`user_input`: {user_input} Answer using `retrieved context`: {context}"
+
+        prompt_input = AspectCriticInputWithReference(
+            user_input=user_input,
+            response=response,
+            reference=reference,
+            criteria=self.definition,
+        )
+
+        response = await self.single_turn_prompt.generate(
+            data=prompt_input,
+            llm=self.llm,
+            callbacks=callbacks,
+        )
+
+        return self._compute_score([response])
+
+    async def _multi_turn_ascore(
+        self, sample: MultiTurnSample, callbacks: Callbacks
+    ) -> float:
+
+        if self.llm is None:
+            raise ValueError("LLM is not set")
+
+        if sample.reference is None:
+            raise ValueError("Reference is not set")
+
+        interaction = sample.pretty_repr()
+        prompt_input = MultiTurnAspectCriticInputWithReference(
             user_input=interaction,
-            response=reference,
+            reference=sample.reference,
             criteria=self.definition,
         )
         response = await self.multi_turn_prompt.generate(
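
For orientation, a short usage sketch of the `AspectCriticWithReference` metric this commit introduces. The LLM wiring via `LangchainLLMWrapper`/`ChatOpenAI` and the model name are assumptions about the caller's setup, not something the diff prescribes; the bare `await` assumes a notebook-style async context, as in the docs example above.

```python
from langchain_openai import ChatOpenAI
from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import AspectCriticWithReference

# Assumed setup: wrap any LangChain chat model with ragas' LangchainLLMWrapper.
scorer = AspectCriticWithReference(
    name="correctness",
    definition="Is the response factually similar to the reference?",
)
scorer.llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))

sample = SingleTurnSample(
    user_input="Where is the Eiffel Tower located?",
    # retrieved_contexts is optional; when present, _ascore folds it into user_input.
    retrieved_contexts=["The Eiffel Tower is a landmark in Paris, France."],
    response="The Eiffel Tower is located in Paris.",
    reference="The Eiffel Tower is located in Paris.",
)

await scorer.single_turn_ascore(sample)  # binary verdict: 1 or 0
```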
