Skip to content

Commit c0e87a5

Browse files
committed
Update Unstructured, multi-modal
1 parent 99a7d76 commit c0e87a5

File tree

4 files changed

+1599
-263
lines changed

4 files changed

+1599
-263
lines changed

docs/source/notebooks/retrieval/semi_structured_benchmarking/ss_eval_baseline.ipynb

Lines changed: 222 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -65,20 +65,20 @@
6565
},
6666
{
6767
"cell_type": "code",
68-
"execution_count": 11,
68+
"execution_count": 2,
6969
"id": "7eb9e333-77e6-48f9-b221-9bded023b978",
7070
"metadata": {},
7171
"outputs": [
7272
{
7373
"name": "stdout",
7474
"output_type": "stream",
7575
"text": [
76-
"There are 3 text elements\n",
76+
"There are 5 text elements\n",
7777
"There are 14 text elements\n",
78-
"There are 4 text elements\n",
79-
"There are 18 text elements\n",
80-
"There are 11 text elements\n",
81-
"There are 11 text elements\n"
78+
"There are 5 text elements\n",
79+
"There are 23 text elements\n",
80+
"There are 13 text elements\n",
81+
"There are 13 text elements\n"
8282
]
8383
}
8484
],
@@ -107,7 +107,7 @@
107107
"\n",
108108
"\n",
109109
"texts = []\n",
110-
"token_count = 1500\n",
110+
"token_count = 1000\n",
111111
"for fi in files:\n",
112112
" texts.extend(load_and_split(dir + fi,token_count))"
113113
]
@@ -122,7 +122,7 @@
122122
},
123123
{
124124
"cell_type": "code",
125-
"execution_count": 12,
125+
"execution_count": 3,
126126
"id": "50d01cd3-3c14-4777-90fa-166681d32331",
127127
"metadata": {},
128128
"outputs": [],
@@ -131,7 +131,7 @@
131131
"from langchain.vectorstores import Chroma\n",
132132
"\n",
133133
"vectorstore_baseline = Chroma.from_texts(\n",
134-
" texts=texts, collection_name=\"baseline-1500-token\", embedding=OpenAIEmbeddings()\n",
134+
" texts=texts, collection_name=\"baseline-1000-token\", embedding=OpenAIEmbeddings()\n",
135135
")\n",
136136
"\n",
137137
"retriever_baseline = vectorstore_baseline.as_retriever()"
@@ -147,7 +147,7 @@
147147
},
148148
{
149149
"cell_type": "code",
150-
"execution_count": 13,
150+
"execution_count": 4,
151151
"id": "665986d2-8e7a-4b68-8bc0-7a65a3ed0c14",
152152
"metadata": {},
153153
"outputs": [],
@@ -200,124 +200,249 @@
200200
},
201201
{
202202
"cell_type": "code",
203-
"execution_count": 15,
204-
"id": "eccf4b1d-2551-4938-9c40-6b6b7c7eb722",
205-
"metadata": {},
206-
"outputs": [],
207-
"source": [
208-
"### TODO: Replace with public dataset\n",
209-
"\n",
210-
"import uuid\n",
211-
"\n",
212-
"import pandas as pd\n",
213-
"from langsmith import Client\n",
214-
"\n",
215-
"# Read\n",
216-
"df = pd.read_csv(dir + \"semi_structured_reports.csv\")\n",
217-
"\n",
218-
"# Dataset\n",
219-
"client = Client()\n",
220-
"dataset_name = \"Semi-Structured-Eval-v6\"\n",
221-
"dataset = client.create_dataset(dataset_name=dataset_name)\n",
222-
"\n",
223-
"# Populate dataset\n",
224-
"for _, row in df.iterrows():\n",
225-
" # Get Q, A\n",
226-
" q = row[\"Question\"]\n",
227-
" a = row[\"Answer\"]\n",
228-
"\n",
229-
" # Use the values in your function\n",
230-
" client.create_example(\n",
231-
" inputs={\"question\": q}, outputs={\"answer\": a}, dataset_id=dataset.id\n",
232-
" )"
233-
]
234-
},
235-
{
236-
"cell_type": "code",
237-
"execution_count": 14,
203+
"execution_count": 5,
238204
"id": "edd2e7f9-b3f6-4885-bf05-96f1c1758b20",
239205
"metadata": {},
240206
"outputs": [
241207
{
242208
"name": "stdout",
243209
"output_type": "stream",
244210
"text": [
245-
"View the evaluation results for project 'baseline-1500-tok_438c7954-9395-47e2-8563-19ffc76c8df7' at:\n",
246-
"https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/projects/p/b46c6385-45df-4c7d-bfae-51220c85ce60?eval=true\n",
211+
"View the evaluation results for project '6d3c-baseline-1000-tok' at:\n",
212+
"https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/8cff6883-909b-4014-86e2-7a5445ebdff5/compare?selectedSessions=6a5183de-6ae9-4cca-b2ee-8c9520416820\n",
247213
"\n",
248-
"View all tests for Dataset Semi-Structured-Eval-v5 at:\n",
249-
"https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/2759f13d-c0c0-4d60-a8cf-0ce204750642\n",
250-
"[------------------------------------------------->] 25/25\n",
251-
" Eval quantiles:\n",
252-
" inputs.question \\\n",
253-
"count 25 \n",
254-
"unique 25 \n",
255-
"top What is Datadog's Non-GAAP gross margin for th... \n",
256-
"freq 1 \n",
257-
"mean NaN \n",
258-
"std NaN \n",
259-
"min NaN \n",
260-
"25% NaN \n",
261-
"50% NaN \n",
262-
"75% NaN \n",
263-
"max NaN \n",
264-
"\n",
265-
" feedback.COT Contextual Accuracy error execution_time \n",
266-
"count 25.00000 0 25.000000 \n",
267-
"unique NaN 0 NaN \n",
268-
"top NaN NaN NaN \n",
269-
"freq NaN NaN NaN \n",
270-
"mean 0.76000 NaN 5.927943 \n",
271-
"std 0.43589 NaN 1.992108 \n",
272-
"min 0.00000 NaN 3.842005 \n",
273-
"25% 1.00000 NaN 4.801416 \n",
274-
"50% 1.00000 NaN 5.026568 \n",
275-
"75% 1.00000 NaN 6.440968 \n",
276-
"max 1.00000 NaN 11.176281 \n"
214+
"View all tests for Dataset Semi-Structured-Eval-v8 at:\n",
215+
"https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/8cff6883-909b-4014-86e2-7a5445ebdff5\n",
216+
"[------------------------------------------------->] 25/25"
277217
]
218+
},
219+
{
220+
"data": {
221+
"text/html": [
222+
"<h3>Experiment Results:</h3>"
223+
],
224+
"text/plain": [
225+
"<IPython.core.display.HTML object>"
226+
]
227+
},
228+
"metadata": {},
229+
"output_type": "display_data"
230+
},
231+
{
232+
"data": {
233+
"text/html": [
234+
"<div>\n",
235+
"<style scoped>\n",
236+
" .dataframe tbody tr th:only-of-type {\n",
237+
" vertical-align: middle;\n",
238+
" }\n",
239+
"\n",
240+
" .dataframe tbody tr th {\n",
241+
" vertical-align: top;\n",
242+
" }\n",
243+
"\n",
244+
" .dataframe thead th {\n",
245+
" text-align: right;\n",
246+
" }\n",
247+
"</style>\n",
248+
"<table border=\"1\" class=\"dataframe\">\n",
249+
" <thead>\n",
250+
" <tr style=\"text-align: right;\">\n",
251+
" <th></th>\n",
252+
" <th>output</th>\n",
253+
" <th>feedback.COT Contextual Accuracy</th>\n",
254+
" <th>error</th>\n",
255+
" <th>execution_time</th>\n",
256+
" </tr>\n",
257+
" </thead>\n",
258+
" <tbody>\n",
259+
" <tr>\n",
260+
" <th>count</th>\n",
261+
" <td>25</td>\n",
262+
" <td>25.00000</td>\n",
263+
" <td>0</td>\n",
264+
" <td>25.000000</td>\n",
265+
" </tr>\n",
266+
" <tr>\n",
267+
" <th>unique</th>\n",
268+
" <td>25</td>\n",
269+
" <td>NaN</td>\n",
270+
" <td>0</td>\n",
271+
" <td>NaN</td>\n",
272+
" </tr>\n",
273+
" <tr>\n",
274+
" <th>top</th>\n",
275+
" <td>Datadog's Non-GAAP gross margin for the Nine M...</td>\n",
276+
" <td>NaN</td>\n",
277+
" <td>NaN</td>\n",
278+
" <td>NaN</td>\n",
279+
" </tr>\n",
280+
" <tr>\n",
281+
" <th>freq</th>\n",
282+
" <td>1</td>\n",
283+
" <td>NaN</td>\n",
284+
" <td>NaN</td>\n",
285+
" <td>NaN</td>\n",
286+
" </tr>\n",
287+
" <tr>\n",
288+
" <th>mean</th>\n",
289+
" <td>NaN</td>\n",
290+
" <td>0.76000</td>\n",
291+
" <td>NaN</td>\n",
292+
" <td>9.010377</td>\n",
293+
" </tr>\n",
294+
" <tr>\n",
295+
" <th>std</th>\n",
296+
" <td>NaN</td>\n",
297+
" <td>0.43589</td>\n",
298+
" <td>NaN</td>\n",
299+
" <td>3.432551</td>\n",
300+
" </tr>\n",
301+
" <tr>\n",
302+
" <th>min</th>\n",
303+
" <td>NaN</td>\n",
304+
" <td>0.00000</td>\n",
305+
" <td>NaN</td>\n",
306+
" <td>5.954703</td>\n",
307+
" </tr>\n",
308+
" <tr>\n",
309+
" <th>25%</th>\n",
310+
" <td>NaN</td>\n",
311+
" <td>1.00000</td>\n",
312+
" <td>NaN</td>\n",
313+
" <td>6.806990</td>\n",
314+
" </tr>\n",
315+
" <tr>\n",
316+
" <th>50%</th>\n",
317+
" <td>NaN</td>\n",
318+
" <td>1.00000</td>\n",
319+
" <td>NaN</td>\n",
320+
" <td>8.011278</td>\n",
321+
" </tr>\n",
322+
" <tr>\n",
323+
" <th>75%</th>\n",
324+
" <td>NaN</td>\n",
325+
" <td>1.00000</td>\n",
326+
" <td>NaN</td>\n",
327+
" <td>9.132252</td>\n",
328+
" </tr>\n",
329+
" <tr>\n",
330+
" <th>max</th>\n",
331+
" <td>NaN</td>\n",
332+
" <td>1.00000</td>\n",
333+
" <td>NaN</td>\n",
334+
" <td>18.915070</td>\n",
335+
" </tr>\n",
336+
" </tbody>\n",
337+
"</table>\n",
338+
"</div>"
339+
],
340+
"text/plain": [
341+
" output \\\n",
342+
"count 25 \n",
343+
"unique 25 \n",
344+
"top Datadog's Non-GAAP gross margin for the Nine M... \n",
345+
"freq 1 \n",
346+
"mean NaN \n",
347+
"std NaN \n",
348+
"min NaN \n",
349+
"25% NaN \n",
350+
"50% NaN \n",
351+
"75% NaN \n",
352+
"max NaN \n",
353+
"\n",
354+
" feedback.COT Contextual Accuracy error execution_time \n",
355+
"count 25.00000 0 25.000000 \n",
356+
"unique NaN 0 NaN \n",
357+
"top NaN NaN NaN \n",
358+
"freq NaN NaN NaN \n",
359+
"mean 0.76000 NaN 9.010377 \n",
360+
"std 0.43589 NaN 3.432551 \n",
361+
"min 0.00000 NaN 5.954703 \n",
362+
"25% 1.00000 NaN 6.806990 \n",
363+
"50% 1.00000 NaN 8.011278 \n",
364+
"75% 1.00000 NaN 9.132252 \n",
365+
"max 1.00000 NaN 18.915070 "
366+
]
367+
},
368+
"metadata": {},
369+
"output_type": "display_data"
278370
}
279371
],
280372
"source": [
281373
"import uuid\n",
282374
"from langsmith.client import Client\n",
283375
"from langchain.smith import RunEvalConfig\n",
284376
"\n",
377+
"# Config\n",
378+
"client = Client()\n",
285379
"eval_config = RunEvalConfig(\n",
286380
" evaluators=[\"cot_qa\"],\n",
287381
")\n",
288382
"\n",
289-
"def run_eval(chain, eval_run_name):\n",
290-
" \"\"\"\n",
291-
" Run eval\n",
292-
" \"\"\"\n",
293-
" client = Client()\n",
294-
" test_run = client.run_on_dataset(\n",
295-
" ### TODO: Replace with public dataset\n",
296-
" dataset_name=\"Semi-Structured-Eval-v5\",\n",
297-
" llm_or_chain_factory=lambda: (lambda x: x[\"question\"]) | chain,\n",
298-
" evaluation=eval_config,\n",
299-
" verbose=True,\n",
300-
" project_name=eval_run_name,\n",
301-
" )\n",
302-
"\n",
303-
"\n",
304383
"# Experiments\n",
305384
"chain_map = {\n",
306-
" \"baseline-1500-tok\": chain,\n",
385+
" \"baseline-1000-tok\": chain,\n",
386+
" # \"baseline-2000-tok\": chain,\n",
307387
"}\n",
308388
"\n",
309-
"run_id = str(uuid.uuid4())\n",
389+
"\n",
390+
"# Run evaluation\n",
391+
"run_id = uuid.uuid4().hex[:4]\n",
392+
"test_runs = {}\n",
310393
"for project_name, chain in chain_map.items():\n",
311-
" run_eval(chain, project_name + \"_\" + run_id)"
394+
" test_runs[project_name] = client.run_on_dataset(\n",
395+
" # dataset_name=task.name,\n",
396+
" dataset_name=\"Semi-Structured-Eval-v8\",\n",
397+
" llm_or_chain_factory=lambda: (lambda x: x[\"question\"]) | chain,\n",
398+
" evaluation=eval_config,\n",
399+
" verbose=True,\n",
400+
" project_name=f\"{run_id}-{project_name}\",\n",
401+
" project_metadata={\"chain\": project_name},\n",
402+
" )"
403+
]
404+
},
405+
{
406+
"cell_type": "markdown",
407+
"id": "0025211a-f7a1-40e4-9ec5-1b710be3a96a",
408+
"metadata": {},
409+
"source": [
410+
"## TEMP: ITERATE ON THE DATASET"
312411
]
313412
},
314413
{
315414
"cell_type": "code",
316-
"execution_count": null,
317-
"id": "5a49ffc5-e51b-4b87-adfd-1798aa91e1f7",
415+
"execution_count": 6,
416+
"id": "b54acf22-0cdc-4a58-9d9e-23e63865c32a",
318417
"metadata": {},
319418
"outputs": [],
320-
"source": []
419+
"source": [
420+
"### TODO: Replace with public dataset\n",
421+
"\n",
422+
"import uuid\n",
423+
"\n",
424+
"import pandas as pd\n",
425+
"from langsmith import Client\n",
426+
"\n",
427+
"# Read\n",
428+
"df = pd.read_csv(dir + \"semi_structured_reports.csv\")\n",
429+
"\n",
430+
"# Dataset\n",
431+
"client = Client()\n",
432+
"dataset_name = \"Semi-Structured-Eval-v9\"\n",
433+
"dataset = client.create_dataset(dataset_name=dataset_name)\n",
434+
"\n",
435+
"# Populate dataset\n",
436+
"for _, row in df.iterrows():\n",
437+
" # Get Q, A\n",
438+
" q = row[\"Question\"]\n",
439+
" a = row[\"Answer\"]\n",
440+
"\n",
441+
" # Use the values in your function\n",
442+
" client.create_example(\n",
443+
" inputs={\"question\": q}, outputs={\"answer\": a}, dataset_id=dataset.id\n",
444+
" )"
445+
]
321446
}
322447
],
323448
"metadata": {

0 commit comments

Comments
 (0)