
AspectCritic not working with openai o3 #2067

Open

@taborzbislaw

Description

[ ] I have checked the documentation and related resources and couldn't resolve my bug.

Describe the bug
I am using the AspectCritic metric with custom evaluator-model settings. When switching the evaluator from gpt-4o to o3, I get a runtime error.

Ragas version: 0.2.13
Python version: 3.11.2

Code to Reproduce
from ragas import SingleTurnSample
from ragas.metrics import AspectCritic
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI

prompt = 'How much is 2+2?'

##########################################################

##################################

# this openai_llm works correctly as evaluator_llm
openai_llm = ChatOpenAI(model="gpt-4o", temperature=1, max_tokens=10000)

##################################

# this openai_llm does not work as evaluator_llm
openai_llm = ChatOpenAI(model="o3", temperature=1, max_tokens=10000)

########################

response = openai_llm.invoke(prompt)
print(response.content)

##########################################################

evaluator_llm = LangchainLLMWrapper(openai_llm)

critic = AspectCritic(
    name="math_accuracy",
    definition="Is the mathematical operation done correctly?",
    llm=evaluator_llm,
    strictness=3,
)

sample = SingleTurnSample(
    user_input=prompt,
    response=response.content,
    retrieved_contexts=[],
)

aspect_critic_score = critic.single_turn_score(sample)
print(aspect_critic_score)

Error trace

BadRequestError Traceback (most recent call last)
Cell In[7], line 28
15 critic = AspectCritic(
16 name="math_accuracy",
17 definition="Is the mathematical operation done correctly?",
18 llm=evaluator_llm,
19 strictness = 3
20 )
22 sample = SingleTurnSample(
23 user_input = prompt,
24 response = response.content,
25 retrieved_contexts=[]
26 )
---> 28 aspect_critic_score = critic.single_turn_score(sample)
29 print(aspect_critic_score)

File ~/env-nlp/lib/python3.11/site-packages/ragas/metrics/base.py:497, in SingleTurnMetric.single_turn_score(self, sample, callbacks)
495 if not group_cm.ended:
496 rm.on_chain_error(e)
--> 497 raise e
498 else:
499 if not group_cm.ended:

File ~/env-nlp/lib/python3.11/site-packages/ragas/metrics/base.py:491, in SingleTurnMetric.single_turn_score(self, sample, callbacks)
487 raise ImportError(
488 "It seems like your running this in a jupyter-like environment. Please install nest_asyncio with pip install nest_asyncio to make it work."
489 )
490 loop = asyncio.get_event_loop()
--> 491 score = loop.run_until_complete(
492 self._single_turn_ascore(sample=sample, callbacks=group_cm)
493 )
494 except Exception as e:
495 if not group_cm.ended:

File ~/env-nlp/lib/python3.11/site-packages/nest_asyncio.py:98, in _patch_loop.<locals>.run_until_complete(self, future)
95 if not f.done():
96 raise RuntimeError(
97 'Event loop stopped before Future completed.')
---> 98 return f.result()

File /usr/lib/python3.11/asyncio/futures.py:203, in Future.result(self)
201 self.__log_traceback = False
202 if self._exception is not None:
--> 203 raise self._exception.with_traceback(self._exception_tb)
204 return self._result

File /usr/lib/python3.11/asyncio/tasks.py:267, in Task.__step(failed resolving arguments)
263 try:
264 if exc is None:
265 # We use the send method directly, because coroutines
266 # don't have __iter__ and __next__ methods.
--> 267 result = coro.send(None)
268 else:
269 result = coro.throw(exc)

File ~/env-nlp/lib/python3.11/site-packages/ragas/metrics/_aspect_critic.py:171, in AspectCritic._single_turn_ascore(self, sample, callbacks)
167 async def _single_turn_ascore(
168 self, sample: SingleTurnSample, callbacks: Callbacks
169 ) -> float:
170 row = sample.to_dict()
--> 171 return await self._ascore(row, callbacks)

File ~/env-nlp/lib/python3.11/site-packages/ragas/metrics/_aspect_critic.py:190, in AspectCritic._ascore(self, row, callbacks)
180 reference_contexts = row.get("reference_contexts")
182 prompt_input = AspectCriticInput(
183 user_input=user_input,
184 response=response,
(...)
187 reference_contexts=reference_contexts,
188 )
--> 190 response = await self.single_turn_prompt.generate(
191 data=prompt_input,
192 llm=self.llm,
193 callbacks=callbacks,
194 )
196 return self._compute_score([response])

File ~/env-nlp/lib/python3.11/site-packages/ragas/prompt/pydantic_prompt.py:129, in PydanticPrompt.generate(self, llm, data, temperature, stop, callbacks, retries_left)
126 callbacks = callbacks or []
128 # this is just a special case of generate_multiple
--> 129 output_single = await self.generate_multiple(
130 llm=llm,
131 data=data,
132 n=1,
133 temperature=temperature,
134 stop=stop,
135 callbacks=callbacks,
136 retries_left=retries_left,
137 )
138 return output_single[0]

File ~/env-nlp/lib/python3.11/site-packages/ragas/prompt/pydantic_prompt.py:190, in PydanticPrompt.generate_multiple(self, llm, data, n, temperature, stop, callbacks, retries_left)
183 prompt_rm, prompt_cb = new_group(
184 name=self.name,
185 inputs={"data": processed_data},
186 callbacks=callbacks,
187 metadata={"type": ChainType.RAGAS_PROMPT},
188 )
189 prompt_value = PromptValue(text=self.to_string(processed_data))
--> 190 resp = await llm.generate(
191 prompt_value,
192 n=n,
193 temperature=temperature,
194 stop=stop,
195 callbacks=prompt_cb,
196 )
198 output_models = []
199 parser = RagasOutputParser(pydantic_object=self.output_model)

File ~/env-nlp/lib/python3.11/site-packages/ragas/llms/base.py:108, in BaseRagasLLM.generate(self, prompt, n, temperature, stop, callbacks)
103 temperature = self.get_temperature(n)
105 agenerate_text_with_retry = add_async_retry(
106 self.agenerate_text, self.run_config
107 )
--> 108 result = await agenerate_text_with_retry(
109 prompt=prompt,
110 n=n,
111 temperature=temperature,
112 stop=stop,
113 callbacks=callbacks,
114 )
116 # check there are no max_token issues
117 if not self.is_finished(result):

File ~/env-nlp/lib/python3.11/site-packages/tenacity/asyncio/__init__.py:189, in AsyncRetrying.wraps.<locals>.async_wrapped(*args, **kwargs)
187 copy = self.copy()
188 async_wrapped.statistics = copy.statistics # type: ignore[attr-defined]
--> 189 return await copy(fn, *args, **kwargs)

File ~/env-nlp/lib/python3.11/site-packages/tenacity/asyncio/__init__.py:111, in AsyncRetrying.__call__(self, fn, *args, **kwargs)
109 retry_state = RetryCallState(retry_object=self, fn=fn, args=args, kwargs=kwargs)
110 while True:
--> 111 do = await self.iter(retry_state=retry_state)
112 if isinstance(do, DoAttempt):
113 try:

File ~/env-nlp/lib/python3.11/site-packages/tenacity/asyncio/__init__.py:153, in AsyncRetrying.iter(self, retry_state)
151 result = None
152 for action in self.iter_state.actions:
--> 153 result = await action(retry_state)
154 return result

File ~/env-nlp/lib/python3.11/site-packages/tenacity/_utils.py:99, in wrap_to_async_func.<locals>.inner(*args, **kwargs)
98 async def inner(*args: typing.Any, **kwargs: typing.Any) -> typing.Any:
---> 99 return call(*args, **kwargs)

File ~/env-nlp/lib/python3.11/site-packages/tenacity/__init__.py:398, in BaseRetrying._post_retry_check_actions.<locals>.<lambda>(rs)
396 def _post_retry_check_actions(self, retry_state: "RetryCallState") -> None:
397 if not (self.iter_state.is_explicit_retry or self.iter_state.retry_run_result):
--> 398 self._add_action_func(lambda rs: rs.outcome.result())
399 return
401 if self.after is not None:

File /usr/lib/python3.11/concurrent/futures/_base.py:449, in Future.result(self, timeout)
447 raise CancelledError()
448 elif self._state == FINISHED:
--> 449 return self.__get_result()
451 self._condition.wait(timeout)
453 if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:

File /usr/lib/python3.11/concurrent/futures/_base.py:401, in Future.__get_result(self)
399 if self._exception:
400 try:
--> 401 raise self._exception
402 finally:
403 # Break a reference cycle with the exception in self._exception
404 self = None

File ~/env-nlp/lib/python3.11/site-packages/tenacity/asyncio/__init__.py:114, in AsyncRetrying.__call__(self, fn, *args, **kwargs)
112 if isinstance(do, DoAttempt):
113 try:
--> 114 result = await fn(*args, **kwargs)
115 except BaseException: # noqa: B902
116 retry_state.set_exception(sys.exc_info()) # type: ignore[arg-type]

File ~/env-nlp/lib/python3.11/site-packages/ragas/llms/base.py:253, in LangchainLLMWrapper.agenerate_text(self, prompt, n, temperature, stop, callbacks)
251 if hasattr(self.langchain_llm, "n"):
252 self.langchain_llm.n = n # type: ignore
--> 253 result = await self.langchain_llm.agenerate_prompt(
254 prompts=[prompt],
255 stop=stop,
256 callbacks=callbacks,
257 )
258 else:
259 result = await self.langchain_llm.agenerate_prompt(
260 prompts=[prompt] * n,
261 stop=stop,
262 callbacks=callbacks,
263 )

File ~/env-nlp/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py:905, in BaseChatModel.agenerate_prompt(self, prompts, stop, callbacks, **kwargs)
896 @override
897 async def agenerate_prompt(
898 self,
(...)
902 **kwargs: Any,
903 ) -> LLMResult:
904 prompt_messages = [p.to_messages() for p in prompts]
--> 905 return await self.agenerate(
906 prompt_messages, stop=stop, callbacks=callbacks, **kwargs
907 )

File ~/env-nlp/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py:863, in BaseChatModel.agenerate(self, messages, stop, callbacks, tags, metadata, run_name, run_id, **kwargs)
850 if run_managers:
851 await asyncio.gather(
852 *[
853 run_manager.on_llm_end(
(...)
861 ]
862 )
--> 863 raise exceptions[0]
864 flattened_outputs = [
865 LLMResult(generations=[res.generations], llm_output=res.llm_output) # type: ignore[list-item, union-attr]
866 for res in results
867 ]
868 llm_output = self._combine_llm_outputs([res.llm_output for res in results]) # type: ignore[union-attr]

File /usr/lib/python3.11/asyncio/tasks.py:267, in Task.__step(failed resolving arguments)
263 try:
264 if exc is None:
265 # We use the send method directly, because coroutines
266 # don't have __iter__ and __next__ methods.
--> 267 result = coro.send(None)
268 else:
269 result = coro.throw(exc)

File ~/env-nlp/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py:1033, in BaseChatModel._agenerate_with_cache(self, messages, stop, run_manager, **kwargs)
1031 else:
1032 if inspect.signature(self._agenerate).parameters.get("run_manager"):
-> 1033 result = await self._agenerate(
1034 messages, stop=stop, run_manager=run_manager, **kwargs
1035 )
1036 else:
1037 result = await self._agenerate(messages, stop=stop, **kwargs)

File ~/env-nlp/lib/python3.11/site-packages/langchain_openai/chat_models/base.py:960, in BaseChatOpenAI._agenerate(self, messages, stop, run_manager, **kwargs)
958 generation_info = {"headers": dict(raw_response.headers)}
959 else:
--> 960 response = await self.async_client.create(**payload)
961 return await run_in_executor(
962 None, self._create_chat_result, response, generation_info
963 )

File ~/env-nlp/lib/python3.11/site-packages/openai/resources/chat/completions/completions.py:2028, in AsyncCompletions.create(self, messages, model, audio, frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, reasoning_effort, response_format, seed, service_tier, stop, store, stream, stream_options, temperature, tool_choice, tools, top_logprobs, top_p, user, web_search_options, extra_headers, extra_query, extra_body, timeout)
1985 @required_args(["messages", "model"], ["messages", "model", "stream"])
1986 async def create(
1987 self,
(...)
2025 timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
2026 ) -> ChatCompletion | AsyncStream[ChatCompletionChunk]:
2027 validate_response_format(response_format)
-> 2028 return await self._post(
2029 "/chat/completions",
2030 body=await async_maybe_transform(
2031 {
2032 "messages": messages,
2033 "model": model,
2034 "audio": audio,
2035 "frequency_penalty": frequency_penalty,
2036 "function_call": function_call,
2037 "functions": functions,
2038 "logit_bias": logit_bias,
2039 "logprobs": logprobs,
2040 "max_completion_tokens": max_completion_tokens,
2041 "max_tokens": max_tokens,
2042 "metadata": metadata,
2043 "modalities": modalities,
2044 "n": n,
2045 "parallel_tool_calls": parallel_tool_calls,
2046 "prediction": prediction,
2047 "presence_penalty": presence_penalty,
2048 "reasoning_effort": reasoning_effort,
2049 "response_format": response_format,
2050 "seed": seed,
2051 "service_tier": service_tier,
2052 "stop": stop,
2053 "store": store,
2054 "stream": stream,
2055 "stream_options": stream_options,
2056 "temperature": temperature,
2057 "tool_choice": tool_choice,
2058 "tools": tools,
2059 "top_logprobs": top_logprobs,
2060 "top_p": top_p,
2061 "user": user,
2062 "web_search_options": web_search_options,
2063 },
2064 completion_create_params.CompletionCreateParamsStreaming
2065 if stream
2066 else completion_create_params.CompletionCreateParamsNonStreaming,
2067 ),
2068 options=make_request_options(
2069 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
2070 ),
2071 cast_to=ChatCompletion,
2072 stream=stream or False,
2073 stream_cls=AsyncStream[ChatCompletionChunk],
2074 )

File ~/env-nlp/lib/python3.11/site-packages/openai/_base_client.py:1742, in AsyncAPIClient.post(self, path, cast_to, body, files, options, stream, stream_cls)
1728 async def post(
1729 self,
1730 path: str,
(...)
1737 stream_cls: type[_AsyncStreamT] | None = None,
1738 ) -> ResponseT | _AsyncStreamT:
1739 opts = FinalRequestOptions.construct(
1740 method="post", url=path, json_data=body, files=await async_to_httpx_files(files), **options
1741 )
-> 1742 return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)

File ~/env-nlp/lib/python3.11/site-packages/openai/_base_client.py:1549, in AsyncAPIClient.request(self, cast_to, options, stream, stream_cls)
1546 await err.response.aread()
1548 log.debug("Re-raising status error")
-> 1549 raise self._make_status_error_from_response(err.response) from None
1551 break
1553 assert response is not None, "could not resolve response (should never happen)"

BadRequestError: Error code: 400 - {'error': {'message': "Unsupported value: 'temperature' does not support 1E-8 with this model. Only the default (1) value is supported.", 'type': 'invalid_request_error', 'param': 'temperature', 'code': 'unsupported_value'}}
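For what it's worth, the traceback points at BaseRagasLLM.generate in ragas/llms/base.py, which calls self.get_temperature(n) before dispatching to the wrapped LangChain model, and the request that reaches OpenAI carries temperature=1e-8 rather than the value configured on ChatOpenAI. A minimal sketch (my own check, not part of Ragas; assumes the same langchain_openai version as above and an OPENAI_API_KEY in the environment) that reproduces the same 400 independently of Ragas:

from langchain_openai import ChatOpenAI

# Hypothetical check: send the near-zero temperature shown in the error
# directly to o3, bypassing Ragas entirely.
probe_llm = ChatOpenAI(model="o3", temperature=1e-8, max_tokens=10000)
probe_llm.invoke("How much is 2+2?")  # expected: openai.BadRequestError with code 'unsupported_value'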

Expected behavior
I have explicitly set the temperature to 1, which is the only value accepted by o3:

openai_llm = ChatOpenAI(model="o3", temperature=1, max_tokens=10000)

However, Ragas internally overrides the model's temperature to 1e-8, ignoring my setting, which causes the runtime error shown above.
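A possible workaround until this is fixed: override the wrapper's get_temperature() (the call visible in the traceback at ragas/llms/base.py) so that o-series models never receive a near-zero temperature. This is a hypothetical sketch against ragas 0.2.13, not an official API, and the subclass name is my own:

from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper

class DefaultTemperatureWrapper(LangchainLLMWrapper):
    """Hypothetical workaround: keep the temperature at the default accepted by o3."""

    def get_temperature(self, n: int) -> float:
        # o3 only supports the default temperature of 1.
        return 1.0

openai_llm = ChatOpenAI(model="o3", temperature=1, max_tokens=10000)
evaluator_llm = DefaultTemperatureWrapper(openai_llm)

If this is not enough on a given version (the wrapper may also assign langchain_llm.temperature directly), switching the evaluator back to gpt-4o, as in the working line above, avoids the error.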

Metadata

Assignees

No one assigned

Labels

bug (Something isn't working), module-metrics (this is part of metrics module)
