langchain-ai · hinthornw · Nov 22, 2023 · Nov 22, 2023 · Nov 22, 2023 · Nov 22, 2023
diff --git a/README.md b/README.md
@@ -2,6 +2,8 @@
 
 # 🦜💪 LangChain Benchmarks
 
+[📖 Documentation](https://langchain-ai.github.io/langchain-benchmarks/index.html)
+
 A package to help benchmark various LLM related tasks.
 
 The benchmarks are organized by end-to-end use cases, and

diff --git a/docs/source/notebooks/extraction/email.ipynb b/docs/source/notebooks/extraction/email.ipynb
@@ -7,12 +7,25 @@
    "source": [
     "# Email Extraction\n",
     "\n",
-    "Let's examine how to evaluate an email extraction task"
+    "Let's evaluate an LLM on its ability to extract structured information from email texts."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 1,
+   "id": "c401de19-814e-4bd7-bb9c-7ea6e217985c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# Get your API key from https://smith.langchain.com/settings\n",
+    "# os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
    "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc",
    "metadata": {
     "tags": []
@@ -32,7 +45,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "60f22779-a948-4833-8e8c-ace9ef17f56f",
    "metadata": {
     "tags": []
@@ -45,7 +58,7 @@
        "<tbody>\n",
        "<tr><td>Name       </td><td>Email Extraction                                                                                                                                           </td></tr>\n",
        "<tr><td>Type       </td><td>ExtractionTask                                                                                                                                             </td></tr>\n",
-       "<tr><td>Dataset ID </td><td><a href=\"https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d\" target=\"_blank\" rel=\"noopener\">36bdfe7d-3cd1-4b36-b957-d12d95810a2b</a></td></tr>\n",
+       "<tr><td>Dataset ID </td><td><a href=\"https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d\" target=\"_blank\" rel=\"noopener\">a1742786-bde5-4f51-a1d8-e148e5251ddb</a></td></tr>\n",
        "<tr><td>Description</td><td>A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\n",
        "\n",
        "Some additional cleanup of the data was done by hand after the initial pass.\n",
@@ -55,10 +68,10 @@
        "</table>"
       ],
       "text/plain": [
-       "ExtractionTask(name='Email Extraction', dataset_id='https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d', description='A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\\n\\nSome additional cleanup of the data was done by hand after the initial pass.\\n\\nSee https://github.com/jacoblee93/oss-model-extraction-evals.\\n    ', schema=<class 'langchain_benchmarks.extraction.tasks.email_task.Email'>, instructions=ChatPromptTemplate(input_variables=['email'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an expert researcher.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['email'], template='What can you tell me about the following email? Make sure to extract the question in the correct format. Here is the email:\\n ```\\n{email}\\n```'))]))"
+       "ExtractionTask(name='Email Extraction', dataset_id='https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d', description='A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\\n\\nSome additional cleanup of the data was done by hand after the initial pass.\\n\\nSee https://github.com/jacoblee93/oss-model-extraction-evals.\\n    ', schema=<class 'langchain_benchmarks.extraction.tasks.email_task.Email'>, instructions=ChatPromptTemplate(input_variables=['input'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an expert researcher.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='What can you tell me about the following email? Make sure to extract the question in the correct format. Here is the email:\\n ```\\n{input}\\n```'))]))"
       ]
      },
-     "execution_count": 2,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -70,7 +83,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "49be36d2-343e-49df-8369-dd5bac405d5e",
    "metadata": {
     "tags": []
@@ -103,33 +116,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 5,
    "id": "70369f67-deb4-467a-801a-6d38c3d0460d",
    "metadata": {
     "tags": []
    },
    "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "8c3c62904d1841b391f3ad33161cb294",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/42 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Finished fetching examples. Creating dataset...\n",
-      "New dataset created you can access it at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/309a2fce-ce68-43aa-befb-67f94d0c3570.\n",
-      "Done creating dataset.\n"
+      "Dataset Email Extraction already exists. Skipping.\n",
+      "You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/309a2fce-ce68-43aa-befb-67f94d0c3570.\n"
      ]
     }
    ],
@@ -139,7 +137,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "id": "12e302e6-9b3d-42a4-b612-d672c591e8f0",
    "metadata": {
     "tags": []
@@ -195,12 +193,12 @@
    "source": [
     "## Define an extraction chain\n",
     "\n",
-    "Let's build an agent that we can use for evaluation."
+    "Let's build the extraction chain that we can use to get structured information from the emails."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
    "id": "b7536a5b-0140-4c38-88c6-50921307677d",
    "metadata": {
     "tags": []
@@ -216,7 +214,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
    "id": "ade7077c-4602-4e5b-ad6d-3eb43cbd0247",
    "metadata": {
     "tags": []
@@ -230,16 +228,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "id": "f66ed218-e1db-49b5-bde3-40ebec961723",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'output': {'sender': 'Unknown',\n",
+       "  'sender_phone_number': '000-1212-1111',\n",
+       "  'sender_address': '12345 My Gold Way',\n",
+       "  'action_items': ['Buy an envelope',\n",
+       "   'Put gold inside',\n",
+       "   'Close the envelope',\n",
+       "   \"Mail it to sender's address\"],\n",
+       "  'topic': 'Request to send gold',\n",
+       "  'tone': 'positive'}}"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "extraction_chain.invoke(\n",
     "    {\n",
-    "        \"email\": \"Hello Dear MR. I want you to send me gold to get rich. First buy an envelope. Then open it and put some gold inside. Then close it and finally mail it to my address at 12345 My Gold Way. You can call me any time at 000-1212-1111.\"\n",
+    "        \"input\": \"Hello Dear MR. I want you to send me gold to get rich.\"\n",
+    "        \" First buy an envelope. Then open it and put some gold inside. \"\n",
+    "        \"Then close it and finally mail it to my address at 12345 My Gold Way.\"\n",
+    "        \" You can call me any time at 000-1212-1111.\"\n",
     "    }\n",
     ")"
    ]
@@ -249,22 +269,22 @@
    "id": "87a64f76-65ae-4367-b43f-f2be3431e7af",
    "metadata": {},
    "source": [
-    "Let's test that our agent works"
+    "Now it's time to measure our chain's effectiveness!"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "3821e4b0-8e67-418a-840c-470fcde42df0",
    "metadata": {},
    "source": [
-    "## Eval\n",
+    "## Evaluate\n",
     "\n",
-    "Let's evaluate an agent now"
+    "Let's evaluate the chain now."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
    "id": "513042fe-2878-44f8-ae84-05b9d521c1de",
    "metadata": {
     "tags": []
@@ -278,7 +298,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "id": "2bedd9d1-fc06-4066-9f89-b874ae818d82",
    "metadata": {
     "tags": []
@@ -290,14 +310,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
    "id": "6826a2c6-8443-4215-9e15-b6f4bb570405",
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "eval_config = get_eval_config(ChatOpenAI(model=\"gpt-4\"))"
+    "eval_llm = ChatOpenAI(model=\"gpt-4\", model_kwargs={\"seed\": 42})\n",
+    "eval_config = get_eval_config(eval_llm)"
    ]
   },
   {
@@ -307,7 +328,20 @@
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "View the evaluation results for project 'test-notable-cake-39' at:\n",
+      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/9950f779-8f98-4ca0-90ab-30e4f9f7af6c?eval=true\n",
+      "\n",
+      "View all tests for Dataset Email Extraction at:\n",
+      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/309a2fce-ce68-43aa-befb-67f94d0c3570\n",
+      "[------------------------------------------------->] 42/42"
+     ]
+    }
+   ],
    "source": [
     "test_run = client.run_on_dataset(\n",
     "    dataset_name=task.name,\n",
@@ -323,7 +357,7 @@
    "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd",
    "metadata": {},
    "source": [
-    "# Inspect\n",
+    "## Inspect\n",
     "\n",
     "Here, we'll take a look at the underlying results a little bit.\n",
     "\n",

diff --git a/docs/source/notebooks/getting_started.ipynb b/docs/source/notebooks/getting_started.ipynb
@@ -43,9 +43,8 @@
    "source": [
     "import os\n",
     "\n",
-    "os.environ[\n",
-    "    \"LANGCHAIN_API_KEY\"\n",
-    "] = \"sk-...\"  # Get from https://smith.langchain.com/settings"
+    "# Get your API key from https://smith.langchain.com/settings\n",
+    "os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\""
    ]
   },
   {

diff --git a/langchain_benchmarks/extraction/tasks/email_task.py b/langchain_benchmarks/extraction/tasks/email_task.py
@@ -41,14 +41,14 @@ class Email(BaseModel):
             "human",
             "What can you tell me about the following email? Make sure to "
             "extract the question in the correct format. "
-            "Here is the email:\n ```\n{email}\n```",
+            "Here is the email:\n ```\n{input}\n```",
         ),
     ]
 )
 
 EMAIL_EXTRACTION_TASK = ExtractionTask(
     name="Email Extraction",
-    dataset_id="https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d",
+    dataset_id="https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d",
     schema=Email,
     description="""\
 A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, \