from typing import List, TypedDict

from langchain_community.chat_models import ChatOpenAI
from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
from ragas.metrics import RubricsScore

from .evaluator import Evaluator


class Sample(TypedDict):
    # question
    user_input: str

    # model answer
    response: str

    # golden answer
    reference: str


DEFAULT_RUBRICS = {
    "score1_description": "The response is completely incorrect, inaccurate, and/or not factual.",
    "score2_description": "The response is mostly incorrect, inaccurate, and/or not factual.",
    "score3_description": "The response is somewhat correct, accurate, and/or factual.",
    "score4_description": "The response is mostly correct, accurate, and factual.",
    "score5_description": "The response is completely correct, accurate, and factual.",
}


class RagasEvaluator(Evaluator):
    # most basic implementation: we assume the user brings pre-generated model responses
    name = "ragas"

    def __init__(self):
        pass

    def run(
        self, dataset: List[Sample], run_config: RunConfig | None = None
    ) -> EvaluationResult:
        """
        Evaluates the quality of model responses against a graded rubric.

        Args:
            dataset (List[Sample]): List of samples, each holding a user question,
                the model's response, and a golden reference answer.
            run_config (RunConfig | None, optional): Ragas configuration controlling
                timeouts, retries, and seeding for the evaluation run. Defaults to None,
                in which case a high-retry, long-timeout config is used.

        Returns:
            EvaluationResult: Ragas results object containing the rubric score for
                each evaluated sample.
        """
        if not run_config:
            # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
            # are horrible and will result in half of our evaluation results being NaN or 0
            run_config = RunConfig(
                max_retries=120,
                max_wait=7200,
                seed=42,
                timeout=3600,
            )

        # convert the raw samples into a Ragas EvaluationDataset
        input_ds = EvaluationDataset.from_list(dataset)

        # default set of metrics
        metrics = [
            RubricsScore(
                rubrics=DEFAULT_RUBRICS,
            )
        ]

        # we will be using gpt-4o for the foreseeable future; we hardcode this
        # for consistency of answers
        critic_lm = ChatOpenAI(model="gpt-4o")
        results = evaluate(
            dataset=input_ds,
            batch_size=4,
            run_config=run_config,
            llm=critic_lm,
            metrics=metrics,
            show_progress=True,
        )
        return results
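
A minimal usage sketch: it assumes OPENAI_API_KEY is set in the environment and that Sample and RagasEvaluator are importable from this module; the question, response, and reference values below are illustrative placeholders, not data shipped with the code.

samples: List[Sample] = [
    {
        "user_input": "What is the capital of France?",
        "response": "The capital of France is Paris.",
        "reference": "Paris",
    }
]

result = RagasEvaluator().run(samples)

# per-sample rubric scores can be inspected as a pandas DataFrame
print(result.to_pandas())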