Moved to extract_answer from #148 and back to gpt-4o-mini #161

Merged 3 commits on Dec 18, 2024

Changes from 1 commit
Exported #148 contents
jamesbraza committed Dec 18, 2024
commit b594d83d581b6d66314cb841a07dea060a02122b
2 changes: 2 additions & 0 deletions src/aviary/core.py
@@ -40,6 +40,7 @@
     EvalAnswerMode,
     encode_image_to_base64,
     eval_answer,
+    extract_answer,
     is_coroutine_callable,
     partial_format,
 )
@@ -82,6 +83,7 @@
     "encode_image_to_base64",
     "eval_answer",
     "eval_answer",
+    "extract_answer",
     "fenv",
     "is_coroutine_callable",
     "join",
39 changes: 38 additions & 1 deletion src/aviary/utils.py
@@ -22,7 +22,7 @@
 
 
 DEFAULT_EVAL_MODEL_NAME = "gpt-4o"
-LLM_BOOL_EVAL_CONFIG = {
+LLM_BOOL_EVAL_CONFIG: dict[str, Any] = {
     "prompt": (
         "Here is a question, the correct answer to the question, and a proposed answer"
         " to the question. Please tell me if the proposed answer is correct, given the"
@@ -35,6 +35,18 @@
     "temperature": 0,
 }
 
+LLM_EXTRACT_CONFIG = LLM_BOOL_EVAL_CONFIG | {
+    "prompt": (
+        "You are evaluating answers for a test which has fixed options. "
+        "Repeat back which option the proposed answer matches. "
+        "GIVE ONLY THE VERBATIM TEXT OF A FIXED OPTION. "
+        "If the proposed answer is empty, invalid, or ambiguous, "
+        "return an empty string."
+        "\n\nOptions:\n{options}"
+        "\n\nProposed answer: {proposed_answer}"
+    )
+}
+
 LLM_SCORE_EVAL_CONFIG = LLM_BOOL_EVAL_CONFIG | {
     "prompt": (
         "Here is a question, the correct answer to the question, and a rubric for"
@@ -175,6 +187,31 @@ async def eval_answer(
     raise RuntimeError(f"Invalid evaluation mode: {eval_mode}")
 
 
+async def extract_answer(
+    proposed_answer: str, options: Sequence[str], llm_eval_config: dict | None = None
+) -> str | None:
+    """Extract the answer matching a proposal from a list of options using an LLM."""
+    for option in options:
+        if proposed_answer.strip().casefold() == option.strip().casefold():
+            return option
+
+    default_config = LLM_EXTRACT_CONFIG
+    config = llm_eval_config or default_config
+    response_msg = await run_prompt(
+        prompt=config.get("prompt", default_config["prompt"]).format(
+            options="\n".join(options),
+            proposed_answer=proposed_answer,
+        ),
+        model=config.get("model", default_config["model"]),
+        temperature=config.get("temperature", default_config["temperature"]),
+    )
+    answer = response_msg.strip().casefold()  # noqa: FURB184
+    for option in options:
+        if answer == option.strip().casefold():
+            return option
+    return None
+
+
 _CAPITAL_A_INDEX = ord("A")
 
 
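For orientation, a minimal usage sketch of the new helper follows (not part of this diff). It assumes the aviary.core re-export added above and a configured OpenAI API key for the LLM fallback path; the proposal strings are illustrative.

import asyncio

from aviary.core import extract_answer


async def main() -> None:
    # An exact (case-insensitive, whitespace-stripped) match short-circuits
    # without any LLM call.
    print(await extract_answer("b", options=["A", "B", "C"]))  # -> "B"

    # Otherwise LLM_EXTRACT_CONFIG's prompt asks the model to repeat back the
    # matching option verbatim; an empty or unmatched reply maps to None.
    print(
        await extract_answer(
            "I am fairly confident the answer is B.",
            options=["A", "B", "C"],
        )
    )  # -> likely "B", or None if the reply matches no option


if __name__ == "__main__":
    asyncio.run(main())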
109 changes: 109 additions & 0 deletions tests/cassettes/test_extract_answer[complex].yaml
@@ -0,0 +1,109 @@
interactions:
  - request:
      body:
        '{"messages": [{"content": "You are evaluating answers for a test which
        has fixed options. Repeat back which option the proposed answer matches. GIVE
        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
        or ambiguous, return an empty string.\n\nOptions:\nEconomic factors\nSocial
        unrest\nPolitical corruption\n\nProposed answer: Based on the context given,
        Serif et al. (2026) claim that the overwhelming cause of regime collapse arises
        from economic factors. Yet, most other scholars (Gerald and Robinson for example)
        believe the collapse was due to social unrest because of the prolonged epidemic
        of 2025. I tend to agree with the majority - although I can see both sides.
        Thus my response is that the social unrest was the significant factor in the
        collapse of the regime.", "role": "user"}], "model": "gpt-4o", "temperature":
        0}'
      headers:
        accept:
          - application/json
        accept-encoding:
          - gzip, deflate
        connection:
          - keep-alive
        content-length:
          - "861"
        content-type:
          - application/json
        host:
          - api.openai.com
        user-agent:
          - AsyncOpenAI/Python 1.57.4
        x-stainless-arch:
          - arm64
        x-stainless-async:
          - async:asyncio
        x-stainless-lang:
          - python
        x-stainless-os:
          - MacOS
        x-stainless-package-version:
          - 1.57.4
        x-stainless-raw-response:
          - "true"
        x-stainless-retry-count:
          - "1"
        x-stainless-runtime:
          - CPython
        x-stainless-runtime-version:
          - 3.12.7
      method: POST
      uri: https://api.openai.com/v1/chat/completions
    response:
      body:
        string: !!binary |
          H4sIAAAAAAAAAwAAAP//jJI/T8MwEMX3fArLc4Oa9E9KtrJ1REyAUOTal8TF8RnbgaKq3x05TZNW
          FInFw/3uPb87+xARQqWgOaG8Zp43RsXr8nP/9VCtNKzbGuYvJSab3cdj+rySiw2dBAVud8D9WXXH
          sTEKvER9wtwC8xBck2w2Xyyy5SztQIMCVJBVxsdzjNNpOo+nq3i67IU1Sg6O5uQ1IoSQQ3eGiFrA
          nuZkOjlXGnCOVUDzoYkQalGFCmXOSeeZ9nQyQo7ag+5SPyGXTJFWW3BXPRbK1rEQUbdK9fXjcKnC
          yljcup4P9VJq6erCAnOowwXOo6EdPUaEvHXDtVd5qbHYGF94fAcdDJPl4uRHx3WONO2ZR8/UpSib
          3LArBHgmlbvYDuWM1yBG6bhK1gqJFyC6GPp3mFvep8Glrv5jPwLOwXgQhbEgJL8eeGyzED7bX23D
          krvA1H07D01RSl2BNVae3rs0BcvuxUownpQ0OkY/AAAA//8DAOEzla34AgAA
      headers:
        CF-Cache-Status:
          - DYNAMIC
        CF-RAY:
          - 8f42461018c5eb29-SJC
        Connection:
          - keep-alive
        Content-Encoding:
          - gzip
        Content-Type:
          - application/json
        Date:
          - Wed, 18 Dec 2024 21:33:52 GMT
        Server:
          - cloudflare
        Transfer-Encoding:
          - chunked
        X-Content-Type-Options:
          - nosniff
        access-control-expose-headers:
          - X-Request-ID
        alt-svc:
          - h3=":443"; ma=86400
        openai-organization:
          - future-house-xr4tdh
        openai-processing-ms:
          - "235"
        openai-version:
          - "2020-10-01"
        strict-transport-security:
          - max-age=31536000; includeSubDomains; preload
        x-ratelimit-limit-requests:
          - "10000"
        x-ratelimit-limit-tokens:
          - "30000000"
        x-ratelimit-remaining-requests:
          - "9999"
        x-ratelimit-remaining-tokens:
          - "29999790"
        x-ratelimit-reset-requests:
          - 6ms
        x-ratelimit-reset-tokens:
          - 0s
        x-request-id:
          - req_366dfd5f505d08facd0f7d10e64a9f5e
      status:
        code: 200
        message: OK
version: 1
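Aside (not part of the PR): the response body above is a YAML !!binary scalar holding a gzip-compressed JSON payload, per the recorded Content-Encoding: gzip header. A small inspection sketch, assuming PyYAML is installed:

import gzip
import json

import yaml

with open("tests/cassettes/test_extract_answer[complex].yaml") as f:
    cassette = yaml.safe_load(f)

# yaml.safe_load decodes the !!binary scalar to bytes; gunzip, then parse JSON.
raw = cassette["interactions"][0]["response"]["body"]["string"]
payload = json.loads(gzip.decompress(raw))
print(payload["choices"][0]["message"]["content"])  # the model's reply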
102 changes: 102 additions & 0 deletions tests/cassettes/test_extract_answer[empty-proposal].yaml
@@ -0,0 +1,102 @@
interactions:
  - request:
      body:
        '{"messages": [{"content": "You are evaluating answers for a test which
        has fixed options. Repeat back which option the proposed answer matches. GIVE
        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
        or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer:
        ", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
      headers:
        accept:
          - application/json
        accept-encoding:
          - gzip, deflate
        connection:
          - keep-alive
        content-length:
          - "369"
        content-type:
          - application/json
        host:
          - api.openai.com
        user-agent:
          - AsyncOpenAI/Python 1.57.4
        x-stainless-arch:
          - arm64
        x-stainless-async:
          - async:asyncio
        x-stainless-lang:
          - python
        x-stainless-os:
          - MacOS
        x-stainless-package-version:
          - 1.57.4
        x-stainless-raw-response:
          - "true"
        x-stainless-retry-count:
          - "1"
        x-stainless-runtime:
          - CPython
        x-stainless-runtime-version:
          - 3.12.7
      method: POST
      uri: https://api.openai.com/v1/chat/completions
    response:
      body:
        string: !!binary |
          H4sIAAAAAAAAAwAAAP//jFLLTsMwELznK6w9Nyht0pb2xokLIEDiAkKRa29Sg+O1bBcFqv47cpq+
          1CJx8WFmZzyz9jphDJSEOQOx5EE0Vqc31Vfb3mXF/fOoeDG3Ol+0D4WZPD79uOoVBlFBiw8UYae6
          EtRYjUGR2dLCIQ8YXYfTvBiPp5M874iGJOooq21IC0pH2ahIs+s0m/TCJSmBHubsLWGMsXV3xohG
          Yogtzlg12SIPe8xphvh9iDBzpiAD3XvnATYDBgRRkApou9THssFp5HlOZldY9vtnfo6m2jha+5/d4
          pYzyy9Ih92Sipw9koWM3CWPvXZ/VSUSwjhobykCfaKLhdLi1g8MCD2RfFQIFri9oTsxKiYEr7Y/W
          AYKLEuWFIWPAa6moRSStypdZrnnvaytT/Mf+RAiBNqDMrEOpxNW+jXn8XX+NHVfcBAa/8QGrLFem
          QGed2j5wbjPZny4Fx8m0D8ku+QEAAP//AwDuI5Ei6QIAAA=
      headers:
        CF-Cache-Status:
          - DYNAMIC
        CF-RAY:
          - 8f424615ca81eb32-SJC
        Connection:
          - keep-alive
        Content-Encoding:
          - gzip
        Content-Type:
          - application/json
        Date:
          - Wed, 18 Dec 2024 21:33:53 GMT
        Server:
          - cloudflare
        Transfer-Encoding:
          - chunked
        X-Content-Type-Options:
          - nosniff
        access-control-expose-headers:
          - X-Request-ID
        alt-svc:
          - h3=":443"; ma=86400
        openai-organization:
          - future-house-xr4tdh
        openai-processing-ms:
          - "171"
        openai-version:
          - "2020-10-01"
        strict-transport-security:
          - max-age=31536000; includeSubDomains; preload
        x-ratelimit-limit-requests:
          - "10000"
        x-ratelimit-limit-tokens:
          - "30000000"
        x-ratelimit-remaining-requests:
          - "9999"
        x-ratelimit-remaining-tokens:
          - "29999912"
        x-ratelimit-reset-requests:
          - 6ms
        x-ratelimit-reset-tokens:
          - 0s
        x-request-id:
          - req_de2070d3e02afd584ac618042c22382d
      status:
        code: 200
        message: OK
version: 1
102 changes: 102 additions & 0 deletions tests/cassettes/test_extract_answer[gave-two].yaml
@@ -0,0 +1,102 @@
interactions:
  - request:
      body:
        '{"messages": [{"content": "You are evaluating answers for a test which
        has fixed options. Repeat back which option the proposed answer matches. GIVE
        ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
        or ambiguous, return an empty string.\n\nOptions:\nA\nB\nC\n\nProposed answer:
        A or B", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
      headers:
        accept:
          - application/json
        accept-encoding:
          - gzip, deflate
        connection:
          - keep-alive
        content-length:
          - "375"
        content-type:
          - application/json
        host:
          - api.openai.com
        user-agent:
          - AsyncOpenAI/Python 1.57.4
        x-stainless-arch:
          - arm64
        x-stainless-async:
          - async:asyncio
        x-stainless-lang:
          - python
        x-stainless-os:
          - MacOS
        x-stainless-package-version:
          - 1.57.4
        x-stainless-raw-response:
          - "true"
        x-stainless-retry-count:
          - "1"
        x-stainless-runtime:
          - CPython
        x-stainless-runtime-version:
          - 3.12.7
      method: POST
      uri: https://api.openai.com/v1/chat/completions
    response:
      body:
        string: !!binary |
          H4sIAAAAAAAAA4xSTWsCMRS8768I7+wWv7XeChVKoR4KPbSlLDF5u5s2m5cmWVHE/16yWl3RQi85
          zLyZzLxkmzAGSsKMgSh5EJXV6V2+Wq82z+Ut0n3xstBv8+H8cWFK9/r08A2dqKDlJ4rwq7oRVFmN
          QZHZ08IhDxhde5PBcDSajAe9hqhIoo6ywoZ0SGm/2x+m3WnaHR+EJSmBHmbsPWGMsW1zxohG4hpm
          rNv5RSr0nhcIs+MQY+BIRwS498oHbgJ0TqQgE9A0qduww7z2PKYytdYHfHe8R1NhHS39gT/iuTLK
          l5lD7slETx/IQsPuEsY+mj71WUSwjiobskBfaKLhZLC3g9MCT+ShKgQKXF/RnJllEgNX2rfWAYKL
          EuWFIWPAa6moRSStypdZrnnvaytT/Mf+RAiBNqDMrEOpxNW+jXn8XX+NHVfcBAa/8QGrLFemQGed
          2j5wbjPZny4Fx8m0D8ku+QEAAP//AwDYqi3B6QIAAA==
      headers:
        CF-Cache-Status:
          - DYNAMIC
        CF-RAY:
          - 8f42460a68a467f1-SJC
        Connection:
          - keep-alive
        Content-Encoding:
          - gzip
        Content-Type:
          - application/json
        Date:
          - Wed, 18 Dec 2024 21:33:51 GMT
        Server:
          - cloudflare
        Transfer-Encoding:
          - chunked
        X-Content-Type-Options:
          - nosniff
        access-control-expose-headers:
          - X-Request-ID
        alt-svc:
          - h3=":443"; ma=86400
        openai-organization:
          - future-house-xr4tdh
        openai-processing-ms:
          - "241"
        openai-version:
          - "2020-10-01"
        strict-transport-security:
          - max-age=31536000; includeSubDomains; preload
        x-ratelimit-limit-requests:
          - "10000"
        x-ratelimit-limit-tokens:
          - "30000000"
        x-ratelimit-remaining-requests:
          - "9999"
        x-ratelimit-remaining-tokens:
          - "29999911"
        x-ratelimit-reset-requests:
          - 6ms
        x-ratelimit-reset-tokens:
          - 0s
        x-request-id:
          - req_83d07d0983e1d4d1995bfa068db503dd
      status:
        code: 200
        message: OK
version: 1
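For completeness, a hedged sketch of how these cassettes might be replayed in tests, assuming the suite uses pytest-recording's vcr marker plus pytest-asyncio, with parametrize ids matching the cassette names. The PR does not show the test file, so the names and expected values below are inferred from the prompt contract rather than confirmed:

import pytest

from aviary.utils import extract_answer


@pytest.mark.asyncio
@pytest.mark.vcr
@pytest.mark.parametrize(
    ("proposed_answer", "expected"),
    [
        # An empty proposal should elicit an empty reply, which matches no
        # option, so extract_answer presumably returns None.
        pytest.param("", None, id="empty-proposal"),
        # An ambiguous proposal naming two options should likewise yield None.
        pytest.param("A or B", None, id="gave-two"),
    ],
)
async def test_extract_answer(proposed_answer: str, expected: str | None) -> None:
    assert await extract_answer(proposed_answer, ["A", "B", "C"]) == expected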