 # Standard
+from pathlib import Path
 from typing import List, TypedDict

 # Third Party
 from langchain_community.chat_models import ChatOpenAI
 from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
-from ragas.metrics import RubricsScore
-from ragas.metrics._domain_specific_rubrics import DEFAULT_WITH_REFERENCE_RUBRICS
+from ragas.metrics._domain_specific_rubrics import (  # the rubrics we must instantiate are located inside of a file marked as private
+    DEFAULT_WITH_REFERENCE_RUBRICS,
+    RubricsScore,
+)
+import pandas as pd

 # Local
 from .evaluator import Evaluator
@@ -30,13 +34,13 @@ def __init__(self):
         pass

     def run(
-        self, dataset: List[Sample], run_config: RunConfig | None = None
+        self, dataset: List[Sample] | Path = None, run_config: RunConfig | None = None
     ) -> EvaluationResult:
         """
         Evaluates the quality of model responses against a graded rubric.

         Args:
-            dataset (List[Sample]):
+            dataset (List[Sample] | Path):
                 List of model questions and answers
             run_config (RunConfig | None, optional):
                 Configuration to use when running evaluations. If none is provided, then
@@ -47,6 +51,19 @@ def run(
         Returns:
             EvaluationResult: The results of all evaluations performed by Ragas
         """
+        if not dataset:
+            raise ValueError(
+                "no dataset was provided, please specify the `dataset` argument"
+            )
+        if isinstance(dataset, Path):
+            input_ds = EvaluationDataset.from_pandas(
+                pd.read_json(dataset, lines=True, orient="records")
+            )
+        elif isinstance(dataset, list):
+            input_ds = EvaluationDataset.from_list(dataset)
+        else:
+            raise TypeError(f"invalid type passed for dataset: {type(dataset)}")
+
         if not run_config:
             # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
             # are horrible and will result in half of our evaluation results being NaN or 0
@@ -57,17 +74,15 @@ def run(
                 timeout=3600,
             )

-        # we will be using gpt-4o for the foreseeable future, we hardcode this
-        # for consistency of answers
-        input_ds = EvaluationDataset.from_list(dataset)
-
         # default set of metrics
         metrics = [
             RubricsScore(
                 rubrics=DEFAULT_WITH_REFERENCE_RUBRICS,
            )
        ]

+        # we will be using gpt-4o for the foreseeable future, we hardcode this
+        # for consistency of answers
         critic_lm = ChatOpenAI(model="gpt-4o")
         results = evaluate(
             dataset=input_ds,
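For context, a minimal usage sketch of the widened run() signature follows; it is not part of the diff. The package path, the RagasEvaluator class name, and the sample field names are assumptions (based on Ragas' single-turn sample schema), not taken from this change, and the sketch assumes OPENAI_API_KEY is set since the critic model is ChatOpenAI.

# Usage sketch only -- class name, import path, and field names below are
# assumptions, not part of this change.
from pathlib import Path

from eval_pkg.ragas import RagasEvaluator  # hypothetical package/module path

evaluator = RagasEvaluator()

# Option 1: pass the samples in memory. The keys follow Ragas' SingleTurnSample
# schema (user_input / response / reference), which RubricsScore with
# reference-based rubrics evaluates against.
samples = [
    {
        "user_input": "What temperature does water boil at, at sea level?",
        "response": "Water boils at 100 degrees Celsius at sea level.",
        "reference": "100 degrees Celsius (212 degrees Fahrenheit).",
    }
]
result = evaluator.run(dataset=samples)

# Option 2: pass a Path to a JSONL file with one record per line using the same
# keys; run() loads it via pd.read_json(..., lines=True) and
# EvaluationDataset.from_pandas().
result = evaluator.run(dataset=Path("eval_dataset.jsonl"))

# The returned EvaluationResult can be inspected as a DataFrame.
print(result.to_pandas())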