langchain-ai · eyurtsev · Nov 29, 2023 · Nov 29, 2023
diff --git a/docs/source/notebooks/tool_usage/multiverse_math.ipynb b/docs/source/notebooks/tool_usage/multiverse_math.ipynb
@@ -526,11 +526,192 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 14,
    "id": "1798f587-38a1-439e-8c1e-f9eeb3a23c8d",
    "metadata": {
     "tags": []
    },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>input.question</th>\n",
+       "      <th>model</th>\n",
+       "      <th>actual_steps</th>\n",
+       "      <th>reference.expected_steps</th>\n",
+       "      <th>outputs.output</th>\n",
+       "      <th>reference.reference</th>\n",
+       "      <th>feedback.correctness</th>\n",
+       "      <th>num_expected_steps</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>example_id</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>20ea2f0e-b306-474a-8daa-f4386cc16599</th>\n",
+       "      <td>Add 2 and 3</td>\n",
+       "      <td>gpt-3.5-turbo-0613</td>\n",
+       "      <td>[add]</td>\n",
+       "      <td>[add]</td>\n",
+       "      <td>The sum of 2 and 3 in this alternate mathemati...</td>\n",
+       "      <td>6.20</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20ea2f0e-b306-474a-8daa-f4386cc16599</th>\n",
+       "      <td>Add 2 and 3</td>\n",
+       "      <td>gpt-3.5-turbo-1106</td>\n",
+       "      <td>[add]</td>\n",
+       "      <td>[add]</td>\n",
+       "      <td>The result of adding 2 and 3 is 6.2.</td>\n",
+       "      <td>6.20</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20ea2f0e-b306-474a-8daa-f4386cc16599</th>\n",
+       "      <td>Add 2 and 3</td>\n",
+       "      <td>gpt-4-0613</td>\n",
+       "      <td>[add]</td>\n",
+       "      <td>[add]</td>\n",
+       "      <td>6.2</td>\n",
+       "      <td>6.20</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2d3e1665-7b3f-4013-b010-6af30ed62ab2</th>\n",
+       "      <td>I ate 1 apple and 2 oranges every day for 7 da...</td>\n",
+       "      <td>gpt-3.5-turbo-0613</td>\n",
+       "      <td>[add, multiply]</td>\n",
+       "      <td>[add, multiply]</td>\n",
+       "      <td>You ate a total of 32.34 fruits.</td>\n",
+       "      <td>32.34</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2d3e1665-7b3f-4013-b010-6af30ed62ab2</th>\n",
+       "      <td>I ate 1 apple and 2 oranges every day for 7 da...</td>\n",
+       "      <td>gpt-3.5-turbo-1106</td>\n",
+       "      <td>[add]</td>\n",
+       "      <td>[add, multiply]</td>\n",
+       "      <td>You ate 16.2 fruits.</td>\n",
+       "      <td>32.34</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                                         input.question  \\\n",
+       "example_id                                                                                \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599                                        Add 2 and 3   \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599                                        Add 2 and 3   \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599                                        Add 2 and 3   \n",
+       "2d3e1665-7b3f-4013-b010-6af30ed62ab2  I ate 1 apple and 2 oranges every day for 7 da...   \n",
+       "2d3e1665-7b3f-4013-b010-6af30ed62ab2  I ate 1 apple and 2 oranges every day for 7 da...   \n",
+       "\n",
+       "                                                   model     actual_steps  \\\n",
+       "example_id                                                                  \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599  gpt-3.5-turbo-0613            [add]   \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599  gpt-3.5-turbo-1106            [add]   \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599          gpt-4-0613            [add]   \n",
+       "2d3e1665-7b3f-4013-b010-6af30ed62ab2  gpt-3.5-turbo-0613  [add, multiply]   \n",
+       "2d3e1665-7b3f-4013-b010-6af30ed62ab2  gpt-3.5-turbo-1106            [add]   \n",
+       "\n",
+       "                                     reference.expected_steps  \\\n",
+       "example_id                                                      \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599                    [add]   \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599                    [add]   \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599                    [add]   \n",
+       "2d3e1665-7b3f-4013-b010-6af30ed62ab2          [add, multiply]   \n",
+       "2d3e1665-7b3f-4013-b010-6af30ed62ab2          [add, multiply]   \n",
+       "\n",
+       "                                                                         outputs.output  \\\n",
+       "example_id                                                                                \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599  The sum of 2 and 3 in this alternate mathemati...   \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599               The result of adding 2 and 3 is 6.2.   \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599                                                6.2   \n",
+       "2d3e1665-7b3f-4013-b010-6af30ed62ab2                   You ate a total of 32.34 fruits.   \n",
+       "2d3e1665-7b3f-4013-b010-6af30ed62ab2                               You ate 16.2 fruits.   \n",
+       "\n",
+       "                                      reference.reference  \\\n",
+       "example_id                                                  \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599                 6.20   \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599                 6.20   \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599                 6.20   \n",
+       "2d3e1665-7b3f-4013-b010-6af30ed62ab2                32.34   \n",
+       "2d3e1665-7b3f-4013-b010-6af30ed62ab2                32.34   \n",
+       "\n",
+       "                                      feedback.correctness  num_expected_steps  \n",
+       "example_id                                                                      \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599                   1.0                   1  \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599                   1.0                   1  \n",
+       "20ea2f0e-b306-474a-8daa-f4386cc16599                   1.0                   1  \n",
+       "2d3e1665-7b3f-4013-b010-6af30ed62ab2                   1.0                   2  \n",
+       "2d3e1665-7b3f-4013-b010-6af30ed62ab2                   0.0                   2  "
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "columns = [\n",
+    "    \"input.question\",\n",
+    "    \"model\",\n",
+    "    \"actual_steps\",\n",
+    "    \"reference.expected_steps\",\n",
+    "    \"outputs.output\",\n",
+    "    \"reference.reference\",\n",
+    "    \"feedback.correctness\",\n",
+    "    \"num_expected_steps\",\n",
+    "]\n",
+    "df[columns].sort_values(by=[\"input.question\", \"model\"]).head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "016fbe05-a993-492c-95db-69d3ba756495",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
    "outputs": [
     {
      "data": {
@@ -1142,22 +1323,12 @@
        "dd079541-c0da-4d94-85b7-50f0516a9ca1                   1.0                   1  "
       ]
      },
-     "execution_count": 12,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "columns = [\n",
-    "    \"input.question\",\n",
-    "    \"model\",\n",
-    "    \"actual_steps\",\n",
-    "    \"reference.expected_steps\",\n",
-    "    \"outputs.output\",\n",
-    "    \"reference.reference\",\n",
-    "    \"feedback.correctness\",\n",
-    "    \"num_expected_steps\",\n",
-    "]\n",
     "df[columns].sort_values(by=[\"input.question\", \"model\"])"
    ]
   }

diff --git a/docs/source/notebooks/tool_usage/relational_data.ipynb b/docs/source/notebooks/tool_usage/relational_data.ipynb
@@ -444,146 +444,6 @@
     "Here, we'll take a look at the underlying results a little bit."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "id": "3cd3bb99-9078-43d0-8b22-6e4d6917929c",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "from langsmith.client import Client\n",
-    "\n",
-    "client = Client()\n",
-    "projects = list(client.list_projects(reference_dataset_name=\"Multiverse Math\"))\n",
-    "\n",
-    "dfs = []\n",
-    "for project in projects:\n",
-    "    first_root_run = next(\n",
-    "        client.list_runs(project_name=project.name, execution_order=1)\n",
-    "    )\n",
-    "    # Temporary way to get tag information\n",
-    "    tags = first_root_run.tags\n",
-    "    test_results = client.get_test_results(project_name=project.name)\n",
-    "    test_results[\"model\"] = tags[0]\n",
-    "    dfs.append(test_results)\n",
-    "\n",
-    "\n",
-    "df = pd.concat(dfs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "id": "ca74c84c-8d48-4cc1-bb37-65ba95e082b2",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>feedback.correctness</th>\n",
-       "      <th>feedback.Intermediate steps correctness</th>\n",
-       "      <th>execution_time</th>\n",
-       "      <th>feedback.# steps / # expected steps</th>\n",
-       "      <th>n</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>model</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>gpt-3.5-turbo-0613</th>\n",
-       "      <td>0.3</td>\n",
-       "      <td>0.8</td>\n",
-       "      <td>8.308014</td>\n",
-       "      <td>1.03333</td>\n",
-       "      <td>10</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>gpt-3.5-turbo-1106</th>\n",
-       "      <td>0.1</td>\n",
-       "      <td>0.7</td>\n",
-       "      <td>54.992810</td>\n",
-       "      <td>0.93332</td>\n",
-       "      <td>10</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>gpt-4-0613</th>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.6</td>\n",
-       "      <td>8.554704</td>\n",
-       "      <td>0.79999</td>\n",
-       "      <td>10</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                    feedback.correctness  \\\n",
-       "model                                      \n",
-       "gpt-3.5-turbo-0613                   0.3   \n",
-       "gpt-3.5-turbo-1106                   0.1   \n",
-       "gpt-4-0613                           0.0   \n",
-       "\n",
-       "                    feedback.Intermediate steps correctness  execution_time  \\\n",
-       "model                                                                         \n",
-       "gpt-3.5-turbo-0613                                      0.8        8.308014   \n",
-       "gpt-3.5-turbo-1106                                      0.7       54.992810   \n",
-       "gpt-4-0613                                              0.6        8.554704   \n",
-       "\n",
-       "                    feedback.# steps / # expected steps   n  \n",
-       "model                                                        \n",
-       "gpt-3.5-turbo-0613                              1.03333  10  \n",
-       "gpt-3.5-turbo-1106                              0.93332  10  \n",
-       "gpt-4-0613                                      0.79999  10  "
-      ]
-     },
-     "execution_count": 30,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "count_df = df.groupby(\"model\").size().to_frame(\"n\")\n",
-    "df.groupby(\"model\")[\n",
-    "    [\n",
-    "        \"feedback.correctness\",\n",
-    "        \"feedback.Intermediate steps correctness\",\n",
-    "        \"execution_time\",\n",
-    "        \"feedback.# steps / # expected steps\",\n",
-    "    ]\n",
-    "].mean().join(count_df)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 24,
@@ -616,6 +476,14 @@
     "df = pd.concat(dfs)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "da6962a1-81f2-445f-8547-513a105a3847",
+   "metadata": {},
+   "source": [
+    "### Stats"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "4b7d366a-8754-417a-a654-956528f134e2",