Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

# 🦜💪 LangChain Benchmarks

[📖 Documentation](https://langchain-ai.github.io/langchain-benchmarks/index.html)

A package to help benchmark various LLM-related tasks.

The benchmarks are organized by end-to-end use cases, and
Expand Down
114 changes: 74 additions & 40 deletions docs/source/notebooks/extraction/email.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,25 @@
"source": [
"# Email Extraction\n",
"\n",
"Let's examine how to evaluate an email extraction task"
"Let's evaluate an LLM on its ability to extract structured information from email texts."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "c401de19-814e-4bd7-bb9c-7ea6e217985c",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Get your API key from https://smith.langchain.com/settings\n",
"# os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc",
"metadata": {
"tags": []
Expand All @@ -32,7 +45,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"id": "60f22779-a948-4833-8e8c-ace9ef17f56f",
"metadata": {
"tags": []
Expand All @@ -45,7 +58,7 @@
"<tbody>\n",
"<tr><td>Name </td><td>Email Extraction </td></tr>\n",
"<tr><td>Type </td><td>ExtractionTask </td></tr>\n",
"<tr><td>Dataset ID </td><td><a href=\"https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d\" target=\"_blank\" rel=\"noopener\">36bdfe7d-3cd1-4b36-b957-d12d95810a2b</a></td></tr>\n",
"<tr><td>Dataset ID </td><td><a href=\"https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d\" target=\"_blank\" rel=\"noopener\">a1742786-bde5-4f51-a1d8-e148e5251ddb</a></td></tr>\n",
"<tr><td>Description</td><td>A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\n",
"\n",
"Some additional cleanup of the data was done by hand after the initial pass.\n",
Expand All @@ -55,10 +68,10 @@
"</table>"
],
"text/plain": [
"ExtractionTask(name='Email Extraction', dataset_id='https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d', description='A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\\n\\nSome additional cleanup of the data was done by hand after the initial pass.\\n\\nSee https://github.com/jacoblee93/oss-model-extraction-evals.\\n ', schema=<class 'langchain_benchmarks.extraction.tasks.email_task.Email'>, instructions=ChatPromptTemplate(input_variables=['email'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an expert researcher.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['email'], template='What can you tell me about the following email? Make sure to extract the question in the correct format. Here is the email:\\n ```\\n{email}\\n```'))]))"
"ExtractionTask(name='Email Extraction', dataset_id='https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d', description='A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\\n\\nSome additional cleanup of the data was done by hand after the initial pass.\\n\\nSee https://github.com/jacoblee93/oss-model-extraction-evals.\\n ', schema=<class 'langchain_benchmarks.extraction.tasks.email_task.Email'>, instructions=ChatPromptTemplate(input_variables=['input'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an expert researcher.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='What can you tell me about the following email? Make sure to extract the question in the correct format. Here is the email:\\n ```\\n{input}\\n```'))]))"
]
},
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -70,7 +83,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"id": "49be36d2-343e-49df-8369-dd5bac405d5e",
"metadata": {
"tags": []
Expand Down Expand Up @@ -103,33 +116,18 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 5,
"id": "70369f67-deb4-467a-801a-6d38c3d0460d",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8c3c62904d1841b391f3ad33161cb294",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/42 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finished fetching examples. Creating dataset...\n",
"New dataset created you can access it at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/309a2fce-ce68-43aa-befb-67f94d0c3570.\n",
"Done creating dataset.\n"
"Dataset Email Extraction already exists. Skipping.\n",
"You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/309a2fce-ce68-43aa-befb-67f94d0c3570.\n"
]
}
],
Expand All @@ -139,7 +137,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 6,
"id": "12e302e6-9b3d-42a4-b612-d672c591e8f0",
"metadata": {
"tags": []
Expand Down Expand Up @@ -195,12 +193,12 @@
"source": [
"## Define an extraction chain\n",
"\n",
"Let's build an agent that we can use for evaluation."
"Let's build the extraction chain that we can use to get structured information from the emails."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"id": "b7536a5b-0140-4c38-88c6-50921307677d",
"metadata": {
"tags": []
Expand All @@ -216,7 +214,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"id": "ade7077c-4602-4e5b-ad6d-3eb43cbd0247",
"metadata": {
"tags": []
Expand All @@ -230,16 +228,38 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"id": "f66ed218-e1db-49b5-bde3-40ebec961723",
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"{'output': {'sender': 'Unknown',\n",
" 'sender_phone_number': '000-1212-1111',\n",
" 'sender_address': '12345 My Gold Way',\n",
" 'action_items': ['Buy an envelope',\n",
" 'Put gold inside',\n",
" 'Close the envelope',\n",
" \"Mail it to sender's address\"],\n",
" 'topic': 'Request to send gold',\n",
" 'tone': 'positive'}}"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extraction_chain.invoke(\n",
" {\n",
" \"email\": \"Hello Dear MR. I want you to send me gold to get rich. First buy an envelope. Then open it and put some gold inside. Then close it and finally mail it to my address at 12345 My Gold Way. You can call me any time at 000-1212-1111.\"\n",
" \"input\": \"Hello Dear MR. I want you to send me gold to get rich.\"\n",
" \" First buy an envelope. Then open it and put some gold inside. \"\n",
" \"Then close it and finally mail it to my address at 12345 My Gold Way.\"\n",
" \" You can call me any time at 000-1212-1111.\"\n",
" }\n",
")"
]
Expand All @@ -249,22 +269,22 @@
"id": "87a64f76-65ae-4367-b43f-f2be3431e7af",
"metadata": {},
"source": [
"Let's test that our agent works"
"Now it's time to measure our chain's effectiveness!"
]
},
{
"cell_type": "markdown",
"id": "3821e4b0-8e67-418a-840c-470fcde42df0",
"metadata": {},
"source": [
"## Eval\n",
"## Evaluate\n",
"\n",
"Let's evaluate an agent now"
"Let's evaluate the chain now."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"id": "513042fe-2878-44f8-ae84-05b9d521c1de",
"metadata": {
"tags": []
Expand All @@ -278,7 +298,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"id": "2bedd9d1-fc06-4066-9f89-b874ae818d82",
"metadata": {
"tags": []
Expand All @@ -290,14 +310,15 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 17,
"id": "6826a2c6-8443-4215-9e15-b6f4bb570405",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"eval_config = get_eval_config(ChatOpenAI(model=\"gpt-4\"))"
"eval_llm = ChatOpenAI(model=\"gpt-4\", model_kwargs={\"seed\": 42})\n",
"eval_config = get_eval_config(eval_llm)"
]
},
{
Expand All @@ -307,7 +328,20 @@
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"View the evaluation results for project 'test-notable-cake-39' at:\n",
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/9950f779-8f98-4ca0-90ab-30e4f9f7af6c?eval=true\n",
"\n",
"View all tests for Dataset Email Extraction at:\n",
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/309a2fce-ce68-43aa-befb-67f94d0c3570\n",
"[------------------------------------------------->] 42/42"
]
}
],
"source": [
"test_run = client.run_on_dataset(\n",
" dataset_name=task.name,\n",
Expand All @@ -323,7 +357,7 @@
"id": "1b039225-01cf-481a-87a6-4e880e9b1dcd",
"metadata": {},
"source": [
"# Inspect\n",
"## Inspect\n",
"\n",
"Here, we'll take a closer look at the underlying results.\n",
"\n",
Expand Down
5 changes: 2 additions & 3 deletions docs/source/notebooks/getting_started.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,8 @@
"source": [
"import os\n",
"\n",
"os.environ[\n",
" \"LANGCHAIN_API_KEY\"\n",
"] = \"sk-...\" # Get from https://smith.langchain.com/settings"
"# Get your API key from https://smith.langchain.com/settings\n",
"os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\""
]
},
{
Expand Down
4 changes: 2 additions & 2 deletions langchain_benchmarks/extraction/tasks/email_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ class Email(BaseModel):
"human",
"What can you tell me about the following email? Make sure to "
"extract the question in the correct format. "
"Here is the email:\n ```\n{email}\n```",
"Here is the email:\n ```\n{input}\n```",
),
]
)

EMAIL_EXTRACTION_TASK = ExtractionTask(
name="Email Extraction",
dataset_id="https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d",
dataset_id="https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d",
schema=Email,
description="""\
A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, \
Expand Down