|  | 
| 65 | 65 |   }, | 
| 66 | 66 |   { | 
| 67 | 67 |    "cell_type": "code", | 
| 68 |  | -   "execution_count": 11, | 
|  | 68 | +   "execution_count": 2, | 
| 69 | 69 |    "id": "7eb9e333-77e6-48f9-b221-9bded023b978", | 
| 70 | 70 |    "metadata": {}, | 
| 71 | 71 |    "outputs": [ | 
| 72 | 72 |     { | 
| 73 | 73 |      "name": "stdout", | 
| 74 | 74 |      "output_type": "stream", | 
| 75 | 75 |      "text": [ | 
| 76 |  | -      "There are 3 text elements\n", | 
|  | 76 | +      "There are 5 text elements\n", | 
| 77 | 77 |       "There are 14 text elements\n", | 
| 78 |  | -      "There are 4 text elements\n", | 
| 79 |  | -      "There are 18 text elements\n", | 
| 80 |  | -      "There are 11 text elements\n", | 
| 81 |  | -      "There are 11 text elements\n" | 
|  | 78 | +      "There are 5 text elements\n", | 
|  | 79 | +      "There are 23 text elements\n", | 
|  | 80 | +      "There are 13 text elements\n", | 
|  | 81 | +      "There are 13 text elements\n" | 
| 82 | 82 |      ] | 
| 83 | 83 |     } | 
| 84 | 84 |    ], | 
|  | 
| 107 | 107 |     "\n", | 
| 108 | 108 |     "\n", | 
| 109 | 109 |     "texts = []\n", | 
| 110 |  | -    "token_count = 1500\n", | 
|  | 110 | +    "token_count = 1000\n", | 
| 111 | 111 |     "for fi in files:\n", | 
| 112 | 112 |     "    texts.extend(load_and_split(dir + fi,token_count))" | 
| 113 | 113 |    ] | 
|  | 
| 122 | 122 |   }, | 
| 123 | 123 |   { | 
| 124 | 124 |    "cell_type": "code", | 
| 125 |  | -   "execution_count": 12, | 
|  | 125 | +   "execution_count": 3, | 
| 126 | 126 |    "id": "50d01cd3-3c14-4777-90fa-166681d32331", | 
| 127 | 127 |    "metadata": {}, | 
| 128 | 128 |    "outputs": [], | 
|  | 
| 131 | 131 |     "from langchain.vectorstores import Chroma\n", | 
| 132 | 132 |     "\n", | 
| 133 | 133 |     "vectorstore_baseline = Chroma.from_texts(\n", | 
| 134 |  | -    "    texts=texts, collection_name=\"baseline-1500-token\", embedding=OpenAIEmbeddings()\n", | 
|  | 134 | +    "    texts=texts, collection_name=\"baseline-1000-token\", embedding=OpenAIEmbeddings()\n", | 
| 135 | 135 |     ")\n", | 
| 136 | 136 |     "\n", | 
| 137 | 137 |     "retriever_baseline = vectorstore_baseline.as_retriever()" | 
|  | 
| 147 | 147 |   }, | 
| 148 | 148 |   { | 
| 149 | 149 |    "cell_type": "code", | 
| 150 |  | -   "execution_count": 13, | 
|  | 150 | +   "execution_count": 4, | 
| 151 | 151 |    "id": "665986d2-8e7a-4b68-8bc0-7a65a3ed0c14", | 
| 152 | 152 |    "metadata": {}, | 
| 153 | 153 |    "outputs": [], | 
|  | 
| 200 | 200 |   }, | 
| 201 | 201 |   { | 
| 202 | 202 |    "cell_type": "code", | 
| 203 |  | -   "execution_count": 15, | 
| 204 |  | -   "id": "eccf4b1d-2551-4938-9c40-6b6b7c7eb722", | 
| 205 |  | -   "metadata": {}, | 
| 206 |  | -   "outputs": [], | 
| 207 |  | -   "source": [ | 
| 208 |  | -    "### TODO: Replace with public dataset\n", | 
| 209 |  | -    "\n", | 
| 210 |  | -    "import uuid\n", | 
| 211 |  | -    "\n", | 
| 212 |  | -    "import pandas as pd\n", | 
| 213 |  | -    "from langsmith import Client\n", | 
| 214 |  | -    "\n", | 
| 215 |  | -    "# Read\n", | 
| 216 |  | -    "df = pd.read_csv(dir + \"semi_structured_reports.csv\")\n", | 
| 217 |  | -    "\n", | 
| 218 |  | -    "# Dataset\n", | 
| 219 |  | -    "client = Client()\n", | 
| 220 |  | -    "dataset_name = \"Semi-Structured-Eval-v6\"\n", | 
| 221 |  | -    "dataset = client.create_dataset(dataset_name=dataset_name)\n", | 
| 222 |  | -    "\n", | 
| 223 |  | -    "# Populate dataset\n", | 
| 224 |  | -    "for _, row in df.iterrows():\n", | 
| 225 |  | -    "    # Get Q, A\n", | 
| 226 |  | -    "    q = row[\"Question\"]\n", | 
| 227 |  | -    "    a = row[\"Answer\"]\n", | 
| 228 |  | -    "\n", | 
| 229 |  | -    "    # Use the values in your function\n", | 
| 230 |  | -    "    client.create_example(\n", | 
| 231 |  | -    "        inputs={\"question\": q}, outputs={\"answer\": a}, dataset_id=dataset.id\n", | 
| 232 |  | -    "    )" | 
| 233 |  | -   ] | 
| 234 |  | -  }, | 
| 235 |  | -  { | 
| 236 |  | -   "cell_type": "code", | 
| 237 |  | -   "execution_count": 14, | 
|  | 203 | +   "execution_count": 5, | 
| 238 | 204 |    "id": "edd2e7f9-b3f6-4885-bf05-96f1c1758b20", | 
| 239 | 205 |    "metadata": {}, | 
| 240 | 206 |    "outputs": [ | 
| 241 | 207 |     { | 
| 242 | 208 |      "name": "stdout", | 
| 243 | 209 |      "output_type": "stream", | 
| 244 | 210 |      "text": [ | 
| 245 |  | -      "View the evaluation results for project 'baseline-1500-tok_438c7954-9395-47e2-8563-19ffc76c8df7' at:\n", | 
| 246 |  | -      "https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/projects/p/b46c6385-45df-4c7d-bfae-51220c85ce60?eval=true\n", | 
|  | 211 | +      "View the evaluation results for project '6d3c-baseline-1000-tok' at:\n", | 
|  | 212 | +      "https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/8cff6883-909b-4014-86e2-7a5445ebdff5/compare?selectedSessions=6a5183de-6ae9-4cca-b2ee-8c9520416820\n", | 
| 247 | 213 |       "\n", | 
| 248 |  | -      "View all tests for Dataset Semi-Structured-Eval-v5 at:\n", | 
| 249 |  | -      "https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/2759f13d-c0c0-4d60-a8cf-0ce204750642\n", | 
| 250 |  | -      "[------------------------------------------------->] 25/25\n", | 
| 251 |  | -      " Eval quantiles:\n", | 
| 252 |  | -      "                                          inputs.question  \\\n", | 
| 253 |  | -      "count                                                  25   \n", | 
| 254 |  | -      "unique                                                 25   \n", | 
| 255 |  | -      "top     What is Datadog's Non-GAAP gross margin for th...   \n", | 
| 256 |  | -      "freq                                                    1   \n", | 
| 257 |  | -      "mean                                                  NaN   \n", | 
| 258 |  | -      "std                                                   NaN   \n", | 
| 259 |  | -      "min                                                   NaN   \n", | 
| 260 |  | -      "25%                                                   NaN   \n", | 
| 261 |  | -      "50%                                                   NaN   \n", | 
| 262 |  | -      "75%                                                   NaN   \n", | 
| 263 |  | -      "max                                                   NaN   \n", | 
| 264 |  | -      "\n", | 
| 265 |  | -      "        feedback.COT Contextual Accuracy error  execution_time  \n", | 
| 266 |  | -      "count                           25.00000     0       25.000000  \n", | 
| 267 |  | -      "unique                               NaN     0             NaN  \n", | 
| 268 |  | -      "top                                  NaN   NaN             NaN  \n", | 
| 269 |  | -      "freq                                 NaN   NaN             NaN  \n", | 
| 270 |  | -      "mean                             0.76000   NaN        5.927943  \n", | 
| 271 |  | -      "std                              0.43589   NaN        1.992108  \n", | 
| 272 |  | -      "min                              0.00000   NaN        3.842005  \n", | 
| 273 |  | -      "25%                              1.00000   NaN        4.801416  \n", | 
| 274 |  | -      "50%                              1.00000   NaN        5.026568  \n", | 
| 275 |  | -      "75%                              1.00000   NaN        6.440968  \n", | 
| 276 |  | -      "max                              1.00000   NaN       11.176281  \n" | 
|  | 214 | +      "View all tests for Dataset Semi-Structured-Eval-v8 at:\n", | 
|  | 215 | +      "https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/8cff6883-909b-4014-86e2-7a5445ebdff5\n", | 
|  | 216 | +      "[------------------------------------------------->] 25/25" | 
| 277 | 217 |      ] | 
|  | 218 | +    }, | 
|  | 219 | +    { | 
|  | 220 | +     "data": { | 
|  | 221 | +      "text/html": [ | 
|  | 222 | +       "<h3>Experiment Results:</h3>" | 
|  | 223 | +      ], | 
|  | 224 | +      "text/plain": [ | 
|  | 225 | +       "<IPython.core.display.HTML object>" | 
|  | 226 | +      ] | 
|  | 227 | +     }, | 
|  | 228 | +     "metadata": {}, | 
|  | 229 | +     "output_type": "display_data" | 
|  | 230 | +    }, | 
|  | 231 | +    { | 
|  | 232 | +     "data": { | 
|  | 233 | +      "text/html": [ | 
|  | 234 | +       "<div>\n", | 
|  | 235 | +       "<style scoped>\n", | 
|  | 236 | +       "    .dataframe tbody tr th:only-of-type {\n", | 
|  | 237 | +       "        vertical-align: middle;\n", | 
|  | 238 | +       "    }\n", | 
|  | 239 | +       "\n", | 
|  | 240 | +       "    .dataframe tbody tr th {\n", | 
|  | 241 | +       "        vertical-align: top;\n", | 
|  | 242 | +       "    }\n", | 
|  | 243 | +       "\n", | 
|  | 244 | +       "    .dataframe thead th {\n", | 
|  | 245 | +       "        text-align: right;\n", | 
|  | 246 | +       "    }\n", | 
|  | 247 | +       "</style>\n", | 
|  | 248 | +       "<table border=\"1\" class=\"dataframe\">\n", | 
|  | 249 | +       "  <thead>\n", | 
|  | 250 | +       "    <tr style=\"text-align: right;\">\n", | 
|  | 251 | +       "      <th></th>\n", | 
|  | 252 | +       "      <th>output</th>\n", | 
|  | 253 | +       "      <th>feedback.COT Contextual Accuracy</th>\n", | 
|  | 254 | +       "      <th>error</th>\n", | 
|  | 255 | +       "      <th>execution_time</th>\n", | 
|  | 256 | +       "    </tr>\n", | 
|  | 257 | +       "  </thead>\n", | 
|  | 258 | +       "  <tbody>\n", | 
|  | 259 | +       "    <tr>\n", | 
|  | 260 | +       "      <th>count</th>\n", | 
|  | 261 | +       "      <td>25</td>\n", | 
|  | 262 | +       "      <td>25.00000</td>\n", | 
|  | 263 | +       "      <td>0</td>\n", | 
|  | 264 | +       "      <td>25.000000</td>\n", | 
|  | 265 | +       "    </tr>\n", | 
|  | 266 | +       "    <tr>\n", | 
|  | 267 | +       "      <th>unique</th>\n", | 
|  | 268 | +       "      <td>25</td>\n", | 
|  | 269 | +       "      <td>NaN</td>\n", | 
|  | 270 | +       "      <td>0</td>\n", | 
|  | 271 | +       "      <td>NaN</td>\n", | 
|  | 272 | +       "    </tr>\n", | 
|  | 273 | +       "    <tr>\n", | 
|  | 274 | +       "      <th>top</th>\n", | 
|  | 275 | +       "      <td>Datadog's Non-GAAP gross margin for the Nine M...</td>\n", | 
|  | 276 | +       "      <td>NaN</td>\n", | 
|  | 277 | +       "      <td>NaN</td>\n", | 
|  | 278 | +       "      <td>NaN</td>\n", | 
|  | 279 | +       "    </tr>\n", | 
|  | 280 | +       "    <tr>\n", | 
|  | 281 | +       "      <th>freq</th>\n", | 
|  | 282 | +       "      <td>1</td>\n", | 
|  | 283 | +       "      <td>NaN</td>\n", | 
|  | 284 | +       "      <td>NaN</td>\n", | 
|  | 285 | +       "      <td>NaN</td>\n", | 
|  | 286 | +       "    </tr>\n", | 
|  | 287 | +       "    <tr>\n", | 
|  | 288 | +       "      <th>mean</th>\n", | 
|  | 289 | +       "      <td>NaN</td>\n", | 
|  | 290 | +       "      <td>0.76000</td>\n", | 
|  | 291 | +       "      <td>NaN</td>\n", | 
|  | 292 | +       "      <td>9.010377</td>\n", | 
|  | 293 | +       "    </tr>\n", | 
|  | 294 | +       "    <tr>\n", | 
|  | 295 | +       "      <th>std</th>\n", | 
|  | 296 | +       "      <td>NaN</td>\n", | 
|  | 297 | +       "      <td>0.43589</td>\n", | 
|  | 298 | +       "      <td>NaN</td>\n", | 
|  | 299 | +       "      <td>3.432551</td>\n", | 
|  | 300 | +       "    </tr>\n", | 
|  | 301 | +       "    <tr>\n", | 
|  | 302 | +       "      <th>min</th>\n", | 
|  | 303 | +       "      <td>NaN</td>\n", | 
|  | 304 | +       "      <td>0.00000</td>\n", | 
|  | 305 | +       "      <td>NaN</td>\n", | 
|  | 306 | +       "      <td>5.954703</td>\n", | 
|  | 307 | +       "    </tr>\n", | 
|  | 308 | +       "    <tr>\n", | 
|  | 309 | +       "      <th>25%</th>\n", | 
|  | 310 | +       "      <td>NaN</td>\n", | 
|  | 311 | +       "      <td>1.00000</td>\n", | 
|  | 312 | +       "      <td>NaN</td>\n", | 
|  | 313 | +       "      <td>6.806990</td>\n", | 
|  | 314 | +       "    </tr>\n", | 
|  | 315 | +       "    <tr>\n", | 
|  | 316 | +       "      <th>50%</th>\n", | 
|  | 317 | +       "      <td>NaN</td>\n", | 
|  | 318 | +       "      <td>1.00000</td>\n", | 
|  | 319 | +       "      <td>NaN</td>\n", | 
|  | 320 | +       "      <td>8.011278</td>\n", | 
|  | 321 | +       "    </tr>\n", | 
|  | 322 | +       "    <tr>\n", | 
|  | 323 | +       "      <th>75%</th>\n", | 
|  | 324 | +       "      <td>NaN</td>\n", | 
|  | 325 | +       "      <td>1.00000</td>\n", | 
|  | 326 | +       "      <td>NaN</td>\n", | 
|  | 327 | +       "      <td>9.132252</td>\n", | 
|  | 328 | +       "    </tr>\n", | 
|  | 329 | +       "    <tr>\n", | 
|  | 330 | +       "      <th>max</th>\n", | 
|  | 331 | +       "      <td>NaN</td>\n", | 
|  | 332 | +       "      <td>1.00000</td>\n", | 
|  | 333 | +       "      <td>NaN</td>\n", | 
|  | 334 | +       "      <td>18.915070</td>\n", | 
|  | 335 | +       "    </tr>\n", | 
|  | 336 | +       "  </tbody>\n", | 
|  | 337 | +       "</table>\n", | 
|  | 338 | +       "</div>" | 
|  | 339 | +      ], | 
|  | 340 | +      "text/plain": [ | 
|  | 341 | +       "                                                   output  \\\n", | 
|  | 342 | +       "count                                                  25   \n", | 
|  | 343 | +       "unique                                                 25   \n", | 
|  | 344 | +       "top     Datadog's Non-GAAP gross margin for the Nine M...   \n", | 
|  | 345 | +       "freq                                                    1   \n", | 
|  | 346 | +       "mean                                                  NaN   \n", | 
|  | 347 | +       "std                                                   NaN   \n", | 
|  | 348 | +       "min                                                   NaN   \n", | 
|  | 349 | +       "25%                                                   NaN   \n", | 
|  | 350 | +       "50%                                                   NaN   \n", | 
|  | 351 | +       "75%                                                   NaN   \n", | 
|  | 352 | +       "max                                                   NaN   \n", | 
|  | 353 | +       "\n", | 
|  | 354 | +       "        feedback.COT Contextual Accuracy error  execution_time  \n", | 
|  | 355 | +       "count                           25.00000     0       25.000000  \n", | 
|  | 356 | +       "unique                               NaN     0             NaN  \n", | 
|  | 357 | +       "top                                  NaN   NaN             NaN  \n", | 
|  | 358 | +       "freq                                 NaN   NaN             NaN  \n", | 
|  | 359 | +       "mean                             0.76000   NaN        9.010377  \n", | 
|  | 360 | +       "std                              0.43589   NaN        3.432551  \n", | 
|  | 361 | +       "min                              0.00000   NaN        5.954703  \n", | 
|  | 362 | +       "25%                              1.00000   NaN        6.806990  \n", | 
|  | 363 | +       "50%                              1.00000   NaN        8.011278  \n", | 
|  | 364 | +       "75%                              1.00000   NaN        9.132252  \n", | 
|  | 365 | +       "max                              1.00000   NaN       18.915070  " | 
|  | 366 | +      ] | 
|  | 367 | +     }, | 
|  | 368 | +     "metadata": {}, | 
|  | 369 | +     "output_type": "display_data" | 
| 278 | 370 |     } | 
| 279 | 371 |    ], | 
| 280 | 372 |    "source": [ | 
| 281 | 373 |     "import uuid\n", | 
| 282 | 374 |     "from langsmith.client import Client\n", | 
| 283 | 375 |     "from langchain.smith import RunEvalConfig\n", | 
| 284 | 376 |     "\n", | 
|  | 377 | +    "# Config\n", | 
|  | 378 | +    "client = Client()\n", | 
| 285 | 379 |     "eval_config = RunEvalConfig(\n", | 
| 286 | 380 |     "    evaluators=[\"cot_qa\"],\n", | 
| 287 | 381 |     ")\n", | 
| 288 | 382 |     "\n", | 
| 289 |  | -    "def run_eval(chain, eval_run_name):\n", | 
| 290 |  | -    "    \"\"\"\n", | 
| 291 |  | -    "    Run eval\n", | 
| 292 |  | -    "    \"\"\"\n", | 
| 293 |  | -    "    client = Client()\n", | 
| 294 |  | -    "    test_run = client.run_on_dataset(\n", | 
| 295 |  | -    "        ### TODO: Replace with public dataset\n", | 
| 296 |  | -    "        dataset_name=\"Semi-Structured-Eval-v5\",\n", | 
| 297 |  | -    "        llm_or_chain_factory=lambda: (lambda x: x[\"question\"]) | chain,\n", | 
| 298 |  | -    "        evaluation=eval_config,\n", | 
| 299 |  | -    "        verbose=True,\n", | 
| 300 |  | -    "        project_name=eval_run_name,\n", | 
| 301 |  | -    "    )\n", | 
| 302 |  | -    "\n", | 
| 303 |  | -    "\n", | 
| 304 | 383 |     "# Experiments\n", | 
| 305 | 384 |     "chain_map = {\n", | 
| 306 |  | -    "    \"baseline-1500-tok\": chain,\n", | 
|  | 385 | +    "    \"baseline-1000-tok\": chain,\n", | 
|  | 386 | +    "    # \"baseline-2000-tok\": chain,\n", | 
| 307 | 387 |     "}\n", | 
| 308 | 388 |     "\n", | 
| 309 |  | -    "run_id = str(uuid.uuid4())\n", | 
|  | 389 | +    "\n", | 
|  | 390 | +    "# Run evaluation\n", | 
|  | 391 | +    "run_id = uuid.uuid4().hex[:4]\n", | 
|  | 392 | +    "test_runs = {}\n", | 
| 310 | 393 |     "for project_name, chain in chain_map.items():\n", | 
| 311 |  | -    "    run_eval(chain, project_name + \"_\" + run_id)" | 
|  | 394 | +    "    test_runs[project_name] = client.run_on_dataset(\n", | 
|  | 395 | +    "        # dataset_name=task.name,\n", | 
|  | 396 | +    "        dataset_name=\"Semi-Structured-Eval-v8\",\n", | 
|  | 397 | +    "        llm_or_chain_factory=lambda: (lambda x: x[\"question\"]) | chain,\n", | 
|  | 398 | +    "        evaluation=eval_config,\n", | 
|  | 399 | +    "        verbose=True,\n", | 
|  | 400 | +    "        project_name=f\"{run_id}-{project_name}\",\n", | 
|  | 401 | +    "        project_metadata={\"chain\": project_name},\n", | 
|  | 402 | +    "    )" | 
|  | 403 | +   ] | 
|  | 404 | +  }, | 
|  | 405 | +  { | 
|  | 406 | +   "cell_type": "markdown", | 
|  | 407 | +   "id": "0025211a-f7a1-40e4-9ec5-1b710be3a96a", | 
|  | 408 | +   "metadata": {}, | 
|  | 409 | +   "source": [ | 
|  | 410 | +    "## TEMP: ITERATE ON THE DATASET" | 
| 312 | 411 |    ] | 
| 313 | 412 |   }, | 
| 314 | 413 |   { | 
| 315 | 414 |    "cell_type": "code", | 
| 316 |  | -   "execution_count": null, | 
| 317 |  | -   "id": "5a49ffc5-e51b-4b87-adfd-1798aa91e1f7", | 
|  | 415 | +   "execution_count": 6, | 
|  | 416 | +   "id": "b54acf22-0cdc-4a58-9d9e-23e63865c32a", | 
| 318 | 417 |    "metadata": {}, | 
| 319 | 418 |    "outputs": [], | 
| 320 |  | -   "source": [] | 
|  | 419 | +   "source": [ | 
|  | 420 | +    "### TODO: Replace with public dataset\n", | 
|  | 421 | +    "\n", | 
|  | 422 | +    "import uuid\n", | 
|  | 423 | +    "\n", | 
|  | 424 | +    "import pandas as pd\n", | 
|  | 425 | +    "from langsmith import Client\n", | 
|  | 426 | +    "\n", | 
|  | 427 | +    "# Read\n", | 
|  | 428 | +    "df = pd.read_csv(dir + \"semi_structured_reports.csv\")\n", | 
|  | 429 | +    "\n", | 
|  | 430 | +    "# Dataset\n", | 
|  | 431 | +    "client = Client()\n", | 
|  | 432 | +    "dataset_name = \"Semi-Structured-Eval-v9\"\n", | 
|  | 433 | +    "dataset = client.create_dataset(dataset_name=dataset_name)\n", | 
|  | 434 | +    "\n", | 
|  | 435 | +    "# Populate dataset\n", | 
|  | 436 | +    "for _, row in df.iterrows():\n", | 
|  | 437 | +    "    # Get Q, A\n", | 
|  | 438 | +    "    q = row[\"Question\"]\n", | 
|  | 439 | +    "    a = row[\"Answer\"]\n", | 
|  | 440 | +    "\n", | 
|  | 441 | +    "    # Add this question/answer pair to the dataset as an example\n", | 
|  | 442 | +    "    client.create_example(\n", | 
|  | 443 | +    "        inputs={\"question\": q}, outputs={\"answer\": a}, dataset_id=dataset.id\n", | 
|  | 444 | +    "    )" | 
|  | 445 | +   ] | 
| 321 | 446 |   } | 
| 322 | 447 |  ], | 
| 323 | 448 |  "metadata": { | 
|  | 
0 commit comments