Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 183 additions & 12 deletions docs/source/notebooks/tool_usage/multiverse_math.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -526,11 +526,192 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 14,
"id": "1798f587-38a1-439e-8c1e-f9eeb3a23c8d",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>input.question</th>\n",
" <th>model</th>\n",
" <th>actual_steps</th>\n",
" <th>reference.expected_steps</th>\n",
" <th>outputs.output</th>\n",
" <th>reference.reference</th>\n",
" <th>feedback.correctness</th>\n",
" <th>num_expected_steps</th>\n",
" </tr>\n",
" <tr>\n",
" <th>example_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>20ea2f0e-b306-474a-8daa-f4386cc16599</th>\n",
" <td>Add 2 and 3</td>\n",
" <td>gpt-3.5-turbo-0613</td>\n",
" <td>[add]</td>\n",
" <td>[add]</td>\n",
" <td>The sum of 2 and 3 in this alternate mathemati...</td>\n",
" <td>6.20</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20ea2f0e-b306-474a-8daa-f4386cc16599</th>\n",
" <td>Add 2 and 3</td>\n",
" <td>gpt-3.5-turbo-1106</td>\n",
" <td>[add]</td>\n",
" <td>[add]</td>\n",
" <td>The result of adding 2 and 3 is 6.2.</td>\n",
" <td>6.20</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20ea2f0e-b306-474a-8daa-f4386cc16599</th>\n",
" <td>Add 2 and 3</td>\n",
" <td>gpt-4-0613</td>\n",
" <td>[add]</td>\n",
" <td>[add]</td>\n",
" <td>6.2</td>\n",
" <td>6.20</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2d3e1665-7b3f-4013-b010-6af30ed62ab2</th>\n",
" <td>I ate 1 apple and 2 oranges every day for 7 da...</td>\n",
" <td>gpt-3.5-turbo-0613</td>\n",
" <td>[add, multiply]</td>\n",
" <td>[add, multiply]</td>\n",
" <td>You ate a total of 32.34 fruits.</td>\n",
" <td>32.34</td>\n",
" <td>1.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2d3e1665-7b3f-4013-b010-6af30ed62ab2</th>\n",
" <td>I ate 1 apple and 2 oranges every day for 7 da...</td>\n",
" <td>gpt-3.5-turbo-1106</td>\n",
" <td>[add]</td>\n",
" <td>[add, multiply]</td>\n",
" <td>You ate 16.2 fruits.</td>\n",
" <td>32.34</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" input.question \\\n",
"example_id \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 Add 2 and 3 \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 Add 2 and 3 \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 Add 2 and 3 \n",
"2d3e1665-7b3f-4013-b010-6af30ed62ab2 I ate 1 apple and 2 oranges every day for 7 da... \n",
"2d3e1665-7b3f-4013-b010-6af30ed62ab2 I ate 1 apple and 2 oranges every day for 7 da... \n",
"\n",
" model actual_steps \\\n",
"example_id \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 gpt-3.5-turbo-0613 [add] \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 gpt-3.5-turbo-1106 [add] \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 gpt-4-0613 [add] \n",
"2d3e1665-7b3f-4013-b010-6af30ed62ab2 gpt-3.5-turbo-0613 [add, multiply] \n",
"2d3e1665-7b3f-4013-b010-6af30ed62ab2 gpt-3.5-turbo-1106 [add] \n",
"\n",
" reference.expected_steps \\\n",
"example_id \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 [add] \n",
"2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add, multiply] \n",
"2d3e1665-7b3f-4013-b010-6af30ed62ab2 [add, multiply] \n",
"\n",
" outputs.output \\\n",
"example_id \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 The sum of 2 and 3 in this alternate mathemati... \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 The result of adding 2 and 3 is 6.2. \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 6.2 \n",
"2d3e1665-7b3f-4013-b010-6af30ed62ab2 You ate a total of 32.34 fruits. \n",
"2d3e1665-7b3f-4013-b010-6af30ed62ab2 You ate 16.2 fruits. \n",
"\n",
" reference.reference \\\n",
"example_id \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 6.20 \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 6.20 \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 6.20 \n",
"2d3e1665-7b3f-4013-b010-6af30ed62ab2 32.34 \n",
"2d3e1665-7b3f-4013-b010-6af30ed62ab2 32.34 \n",
"\n",
" feedback.correctness num_expected_steps \n",
"example_id \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 1.0 1 \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 1.0 1 \n",
"20ea2f0e-b306-474a-8daa-f4386cc16599 1.0 1 \n",
"2d3e1665-7b3f-4013-b010-6af30ed62ab2 1.0 2 \n",
"2d3e1665-7b3f-4013-b010-6af30ed62ab2 0.0 2 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"columns = [\n",
" \"input.question\",\n",
" \"model\",\n",
" \"actual_steps\",\n",
" \"reference.expected_steps\",\n",
" \"outputs.output\",\n",
" \"reference.reference\",\n",
" \"feedback.correctness\",\n",
" \"num_expected_steps\",\n",
"]\n",
"df[columns].sort_values(by=[\"input.question\", \"model\"]).head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "016fbe05-a993-492c-95db-69d3ba756495",
"metadata": {
"tags": [
"remove-cell"
]
},
"outputs": [
{
"data": {
Expand Down Expand Up @@ -1142,22 +1323,12 @@
"dd079541-c0da-4d94-85b7-50f0516a9ca1 1.0 1 "
]
},
"execution_count": 12,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"columns = [\n",
" \"input.question\",\n",
" \"model\",\n",
" \"actual_steps\",\n",
" \"reference.expected_steps\",\n",
" \"outputs.output\",\n",
" \"reference.reference\",\n",
" \"feedback.correctness\",\n",
" \"num_expected_steps\",\n",
"]\n",
"df[columns].sort_values(by=[\"input.question\", \"model\"])"
]
}
Expand Down
148 changes: 8 additions & 140 deletions docs/source/notebooks/tool_usage/relational_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -444,146 +444,6 @@
"Here, we'll take a look at the underlying results a little bit."
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "3cd3bb99-9078-43d0-8b22-6e4d6917929c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from langsmith.client import Client\n",
"\n",
"client = Client()\n",
"projects = list(client.list_projects(reference_dataset_name=\"Multiverse Math\"))\n",
"\n",
"dfs = []\n",
"for project in projects:\n",
" first_root_run = next(\n",
" client.list_runs(project_name=project.name, execution_order=1)\n",
" )\n",
" # Temporary way to get tag information\n",
" tags = first_root_run.tags\n",
" test_results = client.get_test_results(project_name=project.name)\n",
" test_results[\"model\"] = tags[0]\n",
" dfs.append(test_results)\n",
"\n",
"\n",
"df = pd.concat(dfs)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "ca74c84c-8d48-4cc1-bb37-65ba95e082b2",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>feedback.correctness</th>\n",
" <th>feedback.Intermediate steps correctness</th>\n",
" <th>execution_time</th>\n",
" <th>feedback.# steps / # expected steps</th>\n",
" <th>n</th>\n",
" </tr>\n",
" <tr>\n",
" <th>model</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>gpt-3.5-turbo-0613</th>\n",
" <td>0.3</td>\n",
" <td>0.8</td>\n",
" <td>8.308014</td>\n",
" <td>1.03333</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>gpt-3.5-turbo-1106</th>\n",
" <td>0.1</td>\n",
" <td>0.7</td>\n",
" <td>54.992810</td>\n",
" <td>0.93332</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>gpt-4-0613</th>\n",
" <td>0.0</td>\n",
" <td>0.6</td>\n",
" <td>8.554704</td>\n",
" <td>0.79999</td>\n",
" <td>10</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" feedback.correctness \\\n",
"model \n",
"gpt-3.5-turbo-0613 0.3 \n",
"gpt-3.5-turbo-1106 0.1 \n",
"gpt-4-0613 0.0 \n",
"\n",
" feedback.Intermediate steps correctness execution_time \\\n",
"model \n",
"gpt-3.5-turbo-0613 0.8 8.308014 \n",
"gpt-3.5-turbo-1106 0.7 54.992810 \n",
"gpt-4-0613 0.6 8.554704 \n",
"\n",
" feedback.# steps / # expected steps n \n",
"model \n",
"gpt-3.5-turbo-0613 1.03333 10 \n",
"gpt-3.5-turbo-1106 0.93332 10 \n",
"gpt-4-0613 0.79999 10 "
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"count_df = df.groupby(\"model\").size().to_frame(\"n\")\n",
"df.groupby(\"model\")[\n",
" [\n",
" \"feedback.correctness\",\n",
" \"feedback.Intermediate steps correctness\",\n",
" \"execution_time\",\n",
" \"feedback.# steps / # expected steps\",\n",
" ]\n",
"].mean().join(count_df)"
]
},
{
"cell_type": "code",
"execution_count": 24,
Expand Down Expand Up @@ -616,6 +476,14 @@
"df = pd.concat(dfs)"
]
},
{
"cell_type": "markdown",
"id": "da6962a1-81f2-445f-8547-513a105a3847",
"metadata": {},
"source": [
"### Stats"
]
},
{
"cell_type": "markdown",
"id": "4b7d366a-8754-417a-a654-956528f134e2",
Expand Down
Loading