
AspectCritic not working with openai o3 #2067

Open

@taborzbislaw

Description

[ ] I have checked the documentation and related resources and couldn't resolve my bug.

Describe the bug
I am using the AspectCritic metric with custom evaluator-model settings. When switching the evaluator from gpt-4o to o3, I get a runtime error.

Ragas version: 0.2.13
Python version: 3.11.2

Code to Reproduce
from ragas import SingleTurnSample
from ragas.metrics import AspectCritic
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI

prompt = 'How much is 2+2?'

##########################################################

##################################

# this openai_llm works correctly as evaluator_llm
openai_llm = ChatOpenAI(model="gpt-4o", temperature=1, max_tokens=10000)

##################################

# this openai_llm does not work as evaluator_llm
openai_llm = ChatOpenAI(model="o3", temperature=1, max_tokens=10000)

########################

response = openai_llm.invoke(prompt)
print(response.content)

##########################################################

evaluator_llm = LangchainLLMWrapper(openai_llm)

critic = AspectCritic(
    name="math_accuracy",
    definition="Is the mathematical operation done correctly?",
    llm=evaluator_llm,
    strictness=3,
)

sample = SingleTurnSample(
    user_input=prompt,
    response=response.content,
    retrieved_contexts=[],
)

aspect_critic_score = critic.single_turn_score(sample)
print(aspect_critic_score)

Error trace

BadRequestError Traceback (most recent call last)
Cell In[7], line 28
15 critic = AspectCritic(
16 name="math_accuracy",
17 definition="Is the mathematical operation done correctly?",
18 llm=evaluator_llm,
19 strictness = 3
20 )
22 sample = SingleTurnSample(
23 user_input = prompt,
24 response = response.content,
25 retrieved_contexts=[]
26 )
---> 28 aspect_critic_score = critic.single_turn_score(sample)
29 print(aspect_critic_score)

File ~/env-nlp/lib/python3.11/site-packages/ragas/metrics/base.py:497, in SingleTurnMetric.single_turn_score(self, sample, callbacks)
495 if not group_cm.ended:
496 rm.on_chain_error(e)
--> 497 raise e
498 else:
499 if not group_cm.ended:

File ~/env-nlp/lib/python3.11/site-packages/ragas/metrics/base.py:491, in SingleTurnMetric.single_turn_score(self, sample, callbacks)
487 raise ImportError(
488 "It seems like your running this in a jupyter-like environment. Please install nest_asyncio with pip install nest_asyncio to make it work."
489 )
490 loop = asyncio.get_event_loop()
--> 491 score = loop.run_until_complete(
492 self._single_turn_ascore(sample=sample, callbacks=group_cm)
493 )
494 except Exception as e:
495 if not group_cm.ended:

File ~/env-nlp/lib/python3.11/site-packages/nest_asyncio.py:98, in _patch_loop.<locals>.run_until_complete(self, future)
95 if not f.done():
96 raise RuntimeError(
97 'Event loop stopped before Future completed.')
---> 98 return f.result()

File /usr/lib/python3.11/asyncio/futures.py:203, in Future.result(self)
201 self.__log_traceback = False
202 if self._exception is not None:
--> 203 raise self._exception.with_traceback(self._exception_tb)
204 return self._result

File /usr/lib/python3.11/asyncio/tasks.py:267, in Task.__step(failed resolving arguments)
263 try:
264 if exc is None:
265 # We use the send method directly, because coroutines
266 # don't have __iter__ and __next__ methods.
--> 267 result = coro.send(None)
268 else:
269 result = coro.throw(exc)

File ~/env-nlp/lib/python3.11/site-packages/ragas/metrics/_aspect_critic.py:171, in AspectCritic._single_turn_ascore(self, sample, callbacks)
167 async def _single_turn_ascore(
168 self, sample: SingleTurnSample, callbacks: Callbacks
169 ) -> float:
170 row = sample.to_dict()
--> 171 return await self._ascore(row, callbacks)

File ~/env-nlp/lib/python3.11/site-packages/ragas/metrics/_aspect_critic.py:190, in AspectCritic._ascore(self, row, callbacks)
180 reference_contexts = row.get("reference_contexts")
182 prompt_input = AspectCriticInput(
183 user_input=user_input,
184 response=response,
(...)
187 reference_contexts=reference_contexts,
188 )
--> 190 response = await self.single_turn_prompt.generate(
191 data=prompt_input,
192 llm=self.llm,
193 callbacks=callbacks,
194 )
196 return self._compute_score([response])

File ~/env-nlp/lib/python3.11/site-packages/ragas/prompt/pydantic_prompt.py:129, in PydanticPrompt.generate(self, llm, data, temperature, stop, callbacks, retries_left)
126 callbacks = callbacks or []
128 # this is just a special case of generate_multiple
--> 129 output_single = await self.generate_multiple(
130 llm=llm,
131 data=data,
132 n=1,
133 temperature=temperature,
134 stop=stop,
135 callbacks=callbacks,
136 retries_left=retries_left,
137 )
138 return output_single[0]

File ~/env-nlp/lib/python3.11/site-packages/ragas/prompt/pydantic_prompt.py:190, in PydanticPrompt.generate_multiple(self, llm, data, n, temperature, stop, callbacks, retries_left)
183 prompt_rm, prompt_cb = new_group(
184 name=self.name,
185 inputs={"data": processed_data},
186 callbacks=callbacks,
187 metadata={"type": ChainType.RAGAS_PROMPT},
188 )
189 prompt_value = PromptValue(text=self.to_string(processed_data))
--> 190 resp = await llm.generate(
191 prompt_value,
192 n=n,
193 temperature=temperature,
194 stop=stop,
195 callbacks=prompt_cb,
196 )
198 output_models = []
199 parser = RagasOutputParser(pydantic_object=self.output_model)

File ~/env-nlp/lib/python3.11/site-packages/ragas/llms/base.py:108, in BaseRagasLLM.generate(self, prompt, n, temperature, stop, callbacks)
103 temperature = self.get_temperature(n)
105 agenerate_text_with_retry = add_async_retry(
106 self.agenerate_text, self.run_config
107 )
--> 108 result = await agenerate_text_with_retry(
109 prompt=prompt,
110 n=n,
111 temperature=temperature,
112 stop=stop,
113 callbacks=callbacks,
114 )
116 # check there are no max_token issues
117 if not self.is_finished(result):

File ~/env-nlp/lib/python3.11/site-packages/tenacity/asyncio/__init__.py:189, in AsyncRetrying.wraps.<locals>.async_wrapped(*args, **kwargs)
187 copy = self.copy()
188 async_wrapped.statistics = copy.statistics # type: ignore[attr-defined]
--> 189 return await copy(fn, *args, **kwargs)

File ~/env-nlp/lib/python3.11/site-packages/tenacity/asyncio/__init__.py:111, in AsyncRetrying.__call__(self, fn, *args, **kwargs)
109 retry_state = RetryCallState(retry_object=self, fn=fn, args=args, kwargs=kwargs)
110 while True:
--> 111 do = await self.iter(retry_state=retry_state)
112 if isinstance(do, DoAttempt):
113 try:

File ~/env-nlp/lib/python3.11/site-packages/tenacity/asyncio/__init__.py:153, in AsyncRetrying.iter(self, retry_state)
151 result = None
152 for action in self.iter_state.actions:
--> 153 result = await action(retry_state)
154 return result

File ~/env-nlp/lib/python3.11/site-packages/tenacity/_utils.py:99, in wrap_to_async_func.<locals>.inner(*args, **kwargs)
98 async def inner(*args: typing.Any, **kwargs: typing.Any) -> typing.Any:
---> 99 return call(*args, **kwargs)

File ~/env-nlp/lib/python3.11/site-packages/tenacity/__init__.py:398, in BaseRetrying._post_retry_check_actions.<locals>.<lambda>(rs)
396 def _post_retry_check_actions(self, retry_state: "RetryCallState") -> None:
397 if not (self.iter_state.is_explicit_retry or self.iter_state.retry_run_result):
--> 398 self._add_action_func(lambda rs: rs.outcome.result())
399 return
401 if self.after is not None:

File /usr/lib/python3.11/concurrent/futures/_base.py:449, in Future.result(self, timeout)
447 raise CancelledError()
448 elif self._state == FINISHED:
--> 449 return self.__get_result()
451 self._condition.wait(timeout)
453 if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:

File /usr/lib/python3.11/concurrent/futures/_base.py:401, in Future.__get_result(self)
399 if self._exception:
400 try:
--> 401 raise self._exception
402 finally:
403 # Break a reference cycle with the exception in self._exception
404 self = None

File ~/env-nlp/lib/python3.11/site-packages/tenacity/asyncio/__init__.py:114, in AsyncRetrying.__call__(self, fn, *args, **kwargs)
112 if isinstance(do, DoAttempt):
113 try:
--> 114 result = await fn(*args, **kwargs)
115 except BaseException: # noqa: B902
116 retry_state.set_exception(sys.exc_info()) # type: ignore[arg-type]

File ~/env-nlp/lib/python3.11/site-packages/ragas/llms/base.py:253, in LangchainLLMWrapper.agenerate_text(self, prompt, n, temperature, stop, callbacks)
251 if hasattr(self.langchain_llm, "n"):
252 self.langchain_llm.n = n # type: ignore
--> 253 result = await self.langchain_llm.agenerate_prompt(
254 prompts=[prompt],
255 stop=stop,
256 callbacks=callbacks,
257 )
258 else:
259 result = await self.langchain_llm.agenerate_prompt(
260 prompts=[prompt] * n,
261 stop=stop,
262 callbacks=callbacks,
263 )

File ~/env-nlp/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py:905, in BaseChatModel.agenerate_prompt(self, prompts, stop, callbacks, **kwargs)
896 @override
897 async def agenerate_prompt(
898 self,
(...)
902 **kwargs: Any,
903 ) -> LLMResult:
904 prompt_messages = [p.to_messages() for p in prompts]
--> 905 return await self.agenerate(
906 prompt_messages, stop=stop, callbacks=callbacks, **kwargs
907 )

File ~/env-nlp/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py:863, in BaseChatModel.agenerate(self, messages, stop, callbacks, tags, metadata, run_name, run_id, **kwargs)
850 if run_managers:
851 await asyncio.gather(
852 *[
853 run_manager.on_llm_end(
(...)
861 ]
862 )
--> 863 raise exceptions[0]
864 flattened_outputs = [
865 LLMResult(generations=[res.generations], llm_output=res.llm_output) # type: ignore[list-item, union-attr]
866 for res in results
867 ]
868 llm_output = self._combine_llm_outputs([res.llm_output for res in results]) # type: ignore[union-attr]

File /usr/lib/python3.11/asyncio/tasks.py:267, in Task.__step(failed resolving arguments)
263 try:
264 if exc is None:
265 # We use the send method directly, because coroutines
266 # don't have __iter__ and __next__ methods.
--> 267 result = coro.send(None)
268 else:
269 result = coro.throw(exc)

File ~/env-nlp/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py:1033, in BaseChatModel._agenerate_with_cache(self, messages, stop, run_manager, **kwargs)
1031 else:
1032 if inspect.signature(self._agenerate).parameters.get("run_manager"):
-> 1033 result = await self._agenerate(
1034 messages, stop=stop, run_manager=run_manager, **kwargs
1035 )
1036 else:
1037 result = await self._agenerate(messages, stop=stop, **kwargs)

File ~/env-nlp/lib/python3.11/site-packages/langchain_openai/chat_models/base.py:960, in BaseChatOpenAI._agenerate(self, messages, stop, run_manager, **kwargs)
958 generation_info = {"headers": dict(raw_response.headers)}
959 else:
--> 960 response = await self.async_client.create(**payload)
961 return await run_in_executor(
962 None, self._create_chat_result, response, generation_info
963 )

File ~/env-nlp/lib/python3.11/site-packages/openai/resources/chat/completions/completions.py:2028, in AsyncCompletions.create(self, messages, model, audio, frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, reasoning_effort, response_format, seed, service_tier, stop, store, stream, stream_options, temperature, tool_choice, tools, top_logprobs, top_p, user, web_search_options, extra_headers, extra_query, extra_body, timeout)
1985 @required_args(["messages", "model"], ["messages", "model", "stream"])
1986 async def create(
1987 self,
(...)
2025 timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
2026 ) -> ChatCompletion | AsyncStream[ChatCompletionChunk]:
2027 validate_response_format(response_format)
-> 2028 return await self._post(
2029 "/chat/completions",
2030 body=await async_maybe_transform(
2031 {
2032 "messages": messages,
2033 "model": model,
2034 "audio": audio,
2035 "frequency_penalty": frequency_penalty,
2036 "function_call": function_call,
2037 "functions": functions,
2038 "logit_bias": logit_bias,
2039 "logprobs": logprobs,
2040 "max_completion_tokens": max_completion_tokens,
2041 "max_tokens": max_tokens,
2042 "metadata": metadata,
2043 "modalities": modalities,
2044 "n": n,
2045 "parallel_tool_calls": parallel_tool_calls,
2046 "prediction": prediction,
2047 "presence_penalty": presence_penalty,
2048 "reasoning_effort": reasoning_effort,
2049 "response_format": response_format,
2050 "seed": seed,
2051 "service_tier": service_tier,
2052 "stop": stop,
2053 "store": store,
2054 "stream": stream,
2055 "stream_options": stream_options,
2056 "temperature": temperature,
2057 "tool_choice": tool_choice,
2058 "tools": tools,
2059 "top_logprobs": top_logprobs,
2060 "top_p": top_p,
2061 "user": user,
2062 "web_search_options": web_search_options,
2063 },
2064 completion_create_params.CompletionCreateParamsStreaming
2065 if stream
2066 else completion_create_params.CompletionCreateParamsNonStreaming,
2067 ),
2068 options=make_request_options(
2069 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
2070 ),
2071 cast_to=ChatCompletion,
2072 stream=stream or False,
2073 stream_cls=AsyncStream[ChatCompletionChunk],
2074 )

File ~/env-nlp/lib/python3.11/site-packages/openai/_base_client.py:1742, in AsyncAPIClient.post(self, path, cast_to, body, files, options, stream, stream_cls)
1728 async def post(
1729 self,
1730 path: str,
(...)
1737 stream_cls: type[_AsyncStreamT] | None = None,
1738 ) -> ResponseT | _AsyncStreamT:
1739 opts = FinalRequestOptions.construct(
1740 method="post", url=path, json_data=body, files=await async_to_httpx_files(files), **options
1741 )
-> 1742 return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)

File ~/env-nlp/lib/python3.11/site-packages/openai/_base_client.py:1549, in AsyncAPIClient.request(self, cast_to, options, stream, stream_cls)
1546 await err.response.aread()
1548 log.debug("Re-raising status error")
-> 1549 raise self._make_status_error_from_response(err.response) from None
1551 break
1553 assert response is not None, "could not resolve response (should never happen)"

BadRequestError: Error code: 400 - {'error': {'message': "Unsupported value: 'temperature' does not support 1E-8 with this model. Only the default (1) value is supported.", 'type': 'invalid_request_error', 'param': 'temperature', 'code': 'unsupported_value'}}
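For what it's worth, the traceback points at BaseRagasLLM.generate in ragas/llms/base.py, which calls self.get_temperature(n) before dispatching to the wrapped LangChain model, and the request that reaches OpenAI carries temperature=1e-8 rather than the value configured on ChatOpenAI. A minimal sketch (my own check, not part of Ragas; assumes the same langchain_openai version as above and an OPENAI_API_KEY in the environment) that reproduces the same 400 independently of Ragas:

from langchain_openai import ChatOpenAI

# Hypothetical check: send the near-zero temperature shown in the error
# directly to o3, bypassing Ragas entirely.
probe_llm = ChatOpenAI(model="o3", temperature=1e-8, max_tokens=10000)
probe_llm.invoke("How much is 2+2?")  # expected: openai.BadRequestError with code 'unsupported_value'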

Expected behavior
I have explicitly set the temperature to 1, which is the only value accepted by o3:

openai_llm = ChatOpenAI(model="o3", temperature=1, max_tokens=10000)

However, Ragas internally overrides the model's temperature to 1e-8, ignoring my setting, which causes the runtime error shown above.
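A possible workaround until this is fixed: override the wrapper's get_temperature() (the call visible in the traceback at ragas/llms/base.py) so that o-series models never receive a near-zero temperature. This is a hypothetical sketch against ragas 0.2.13, not an official API, and the subclass name is my own:

from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper

class DefaultTemperatureWrapper(LangchainLLMWrapper):
    """Hypothetical workaround: keep the temperature at the default accepted by o3."""

    def get_temperature(self, n: int) -> float:
        # o3 only supports the default temperature of 1.
        return 1.0

openai_llm = ChatOpenAI(model="o3", temperature=1, max_tokens=10000)
evaluator_llm = DefaultTemperatureWrapper(openai_llm)

If this is not enough on a given version (the wrapper may also assign langchain_llm.temperature directly), switching the evaluator back to gpt-4o, as in the working line above, avoids the error.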

Metadata

Assignees

No one assigned

Labels

bug (Something isn't working), module-metrics (this is part of metrics module)
