# Standard
from pathlib import Path
- from typing import List, TypedDict
+ from typing import List, Optional, TypedDict

# Third Party
from langchain_community.chat_models import ChatOpenAI
+ from pandas import DataFrame, read_json
+ from pydantic import BaseModel, ConfigDict, field_validator
from ragas.evaluation import EvaluationDataset, EvaluationResult, RunConfig, evaluate
from ragas.metrics._domain_specific_rubrics import (  # the rubrics we must instantiate are located inside of a file marked as private
    DEFAULT_WITH_REFERENCE_RUBRICS,
    RubricsScore,
)
- import pandas as pd

# Local
from .evaluator import Evaluator
+ from .mt_bench_common import get_openai_client


class Sample(TypedDict):
+     """
+     TypedDict of a sample that we accept when doing eval with Ragas.
+     We specifically use TypedDict here to be flexible with the input data we accept.
+     """
+
    # question
    user_input: str

    # model answer
-     response: str
+     response: Optional[str]

    # golden answer
    reference: str


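+ # NOTE (editorial sketch, not part of the original change): a minimal record
+ # satisfying `Sample`. In a jsonl dataset file, one such JSON object would sit
+ # on each line; `response` may be omitted when a student model will generate it.
+ _example_sample: Sample = {
+     "user_input": "What is the capital of Canada?",
+     "response": "The capital of Canada is Ottawa.",
+     "reference": "Ottawa",
+ }
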
+ # default system prompt we'll use when none is provided; kept private since we
+ # don't intend this to be a public object
+ _DEFAULT_SYSTEM_PROMPT = """You are an advanced AI assistant designed to provide precise and accurate information.
+ Your primary goal is to answer queries with the most up-to-date and factual information available.
+ Focus on delivering clear, concise, and correct responses.
+ If you're uncertain about any aspect of the query, state your level of confidence and provide the most accurate information you can.
+ Your responses should prioritize accuracy over all other considerations."""
+
+ DEFAULT_SEED = 1337
+ DEFAULT_JUDGE_MODEL = "gpt-4o"
+
+
+ class ModelConfig(BaseModel):
+     model_config = ConfigDict(protected_namespaces=())
+
+     # URL of the OpenAI server where the model shall be hosted
+     base_url: str
+
+     # name of the model to use
+     model_name: str
+     system_prompt: str = _DEFAULT_SYSTEM_PROMPT
+
+     # We do NOT read from OPENAI_API_KEY for the student model for security
+     # reasons (e.g. accidentally sending the API key to another client).
+     # To provide an OpenAI key, you must set it here; otherwise the default is used.
+     api_key: str = "no-api-key"
+
+     # "model randomness", i.e. the likelihood of sampling something other than the likeliest token
+     temperature: float = 0.0
+
+     max_tokens: int = 768
+
+     # Random seed for reproducibility. This is not supported everywhere and is therefore unreliable.
+     seed: int = DEFAULT_SEED
+
+     @field_validator("temperature")
+     @classmethod
+     def check_temperature(cls, v: float) -> float:
+         if not 0.0 <= v <= 1.0:
+             raise ValueError("temperature must be between 0.0 and 1.0")
+         return v
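+
+ # NOTE (editorial sketch, not part of the original change): constructing a
+ # config pointing at a hypothetical locally-hosted OpenAI-compatible endpoint;
+ # the URL and model name are placeholders, all other fields keep their defaults.
+ _example_student_model = ModelConfig(
+     base_url="http://localhost:8000/v1",
+     model_name="my-student-model",
+ )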
+
+
class RagasEvaluator(Evaluator):
    # most basic implementation; we just assume that the user will bring the existing model responses
    name = "ragas"

-     def __init__(self):
-         pass
+     def __init__(
+         self,
+         student_model: ModelConfig | None = None,
+         run_config: RunConfig | None = None,
+     ):
+         self.student_model = student_model
+         self.run_config = run_config

    def run(
-         self, dataset: List[Sample] | Path = None, run_config: RunConfig | None = None
+         self,
+         dataset: List[Sample] | Path,
+         student_model: ModelConfig | None = None,
+         run_config: RunConfig | None = None,
    ) -> EvaluationResult:
        """
        Evaluates the quality of model responses against a graded rubric.

+         When the `dataset` lacks the `response` field, a `student_model` must be
+         provided in order to generate the answers.
+
        Args:
            dataset (List[Sample] | Path):
-                 List of model questions and answers
+                 Either a list of `Sample` objects or a path to a jsonl file containing
+                 records matching `Sample`.
+             student_model (ModelConfig | None, optional):
+                 When provided, we'll use this model to generate responses for the
+                 given questions.
            run_config (RunConfig | None, optional):
                Configuration to use when running evaluations. If none is provided, then
                a default one is created containing extremely permissive settings when handling
@@ -51,26 +114,44 @@ def run(
        Returns:
            EvaluationResult: The results of all evaluations performed by Ragas
        """
+         student_model = student_model if student_model else self.student_model
+         run_config = run_config if run_config else self.run_config
+
        if not dataset:
            raise ValueError(
                "no dataset was provided, please specify the `dataset` argument"
            )
-         if isinstance(dataset, Path):
-             input_ds = EvaluationDataset.from_pandas(
-                 pd.read_json(dataset, lines=True, orient="records")
+
+         if type(dataset) not in (list, Path):
+             raise TypeError(f"invalid type of dataset: {type(dataset)}")
+
+         # ensure we are in the dataframe format
+         input_df = None
+         if isinstance(dataset, list):
+             input_df = DataFrame(dataset)
+         elif isinstance(dataset, Path):
+             input_df = read_json(dataset, orient="records", lines=True)
+
+         # this should never happen, but pylint is not smart enough to detect it
+         assert input_df is not None
+
+         need_to_generate_responses = "response" not in input_df.columns
+         if need_to_generate_responses and not student_model:
+             raise ValueError(
+                 "provided dataset doesn't contain the model `response`, but no `student_model` was provided for inference"
            )
-         elif isinstance(dataset, list):
-             input_ds = EvaluationDataset.from_list(dataset)
-         else:
-             raise TypeError(f"invalid type passed for dataset: {type(dataset)}")
+
+         # if a student model was provided, we always regenerate the responses,
+         # even when the dataset already contains them
+         if student_model:
+             input_df = self._generate_answers_from_model(input_df, student_model)

        if not run_config:
            # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
            # are horrible and will result in half of our evaluation results being NaN or 0
            run_config = RunConfig(
                max_retries=120,
                max_wait=7200,
-                 seed=42,
+                 seed=DEFAULT_SEED,
                timeout=3600,
            )

@@ -81,15 +162,48 @@ def run(
            )
        ]

+         evaluation_ds = EvaluationDataset.from_pandas(input_df)
+
        # we will be using gpt-4o for the foreseeable future; we hardcode this
        # for consistency of answers
-         critic_lm = ChatOpenAI(model="gpt-4o")
+         critic_lm = ChatOpenAI(model=DEFAULT_JUDGE_MODEL)
        results = evaluate(
-             dataset=input_ds,
+             dataset=evaluation_ds,
            batch_size=4,
            run_config=run_config,
            llm=critic_lm,
            metrics=metrics,
            show_progress=True,
        )
        return results
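+
+     # NOTE (editorial sketch, not part of the original change): typical usage of
+     # this evaluator, with hypothetical file paths and endpoint URL:
+     #
+     #     evaluator = RagasEvaluator()
+     #     result = evaluator.run(dataset=Path("qna_with_responses.jsonl"))
+     #
+     #     student = ModelConfig(base_url="http://localhost:8000/v1", model_name="my-student-model")
+     #     result = RagasEvaluator(student_model=student).run(dataset=Path("questions_only.jsonl"))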
+
+     def _generate_answers_from_model(
+         self, questions: DataFrame, student_model: ModelConfig
+     ) -> DataFrame:
+         """
+         Given a DataFrame containing a `user_input` column, generates responses
+         from the given model and returns a new DataFrame containing its answers
+         in the `response` column.
+         """
+         client = get_openai_client(
+             model_api_base=student_model.base_url, api_key=student_model.api_key
+         )
+
+         # initialize a response column to write into
+         updated_df = questions.copy()
+         updated_df["response"] = ""
+
+         for i, qna in updated_df.iterrows():
+             # the chat completions API expects role-tagged message dicts, not bare strings
+             messages = [
+                 {"role": "system", "content": student_model.system_prompt},
+                 {"role": "user", "content": qna["user_input"]},
+             ]
+             response = client.chat.completions.create(
+                 messages=messages,
+                 model=student_model.model_name,
+                 # specify the seed so we can at least try to have some reproducibility when the clients support it
+                 seed=student_model.seed,
+                 max_tokens=student_model.max_tokens,
+                 temperature=student_model.temperature,
+             )
+             updated_df.at[i, "response"] = response.choices[0].message.content
+         return updated_df