Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,301 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "b6856d11-40d5-48e5-9eb3-423f479933a1",
"metadata": {},
"source": [
"# Semi-structured eval: Chunk size tuning\n",
"\n",
"`Semi-structured Reports` is a public dataset that contains question-answer pairs from documents with text and tables.\n",
"\n",
"The question-answer pairs are derived from the tables as well as some of the paragraphs in the docs.\n",
"\n",
"We evaluate the performance of various chunk sizes with RAG.\n",
"\n",
"## Pre-requisites"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c387b660-967d-4d2f-8c38-af125f7b7a8b",
"metadata": {},
"outputs": [],
"source": [
"# %pip install -U langchain langsmith langchain_benchmarks\n",
"# %pip install --quiet chromadb openai"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9e332b1-7da4-47fc-8d9a-4d65fbfc6953",
"metadata": {},
"outputs": [],
"source": [
"import getpass\n",
"import os\n",
"\n",
"# Point LangSmith tracing at the hosted endpoint.\n",
"os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n",
"# Prompt interactively for any required API keys that are not already set,\n",
"# so no credentials are hardcoded in the notebook.\n",
"env_vars = [\"LANGCHAIN_API_KEY\", \"OPENAI_API_KEY\"]\n",
"for var in env_vars:\n",
" if var not in os.environ:\n",
" os.environ[var] = getpass.getpass(prompt=f\"Enter your {var}: \")"
]
},
{
"cell_type": "markdown",
"id": "b1a19f23-468c-4aeb-a0e9-0765a85f3f0b",
"metadata": {},
"source": [
"## Dataset\n",
"\n",
"Fetch the associated PDFs from remote cache for the dataset so that we can perform ingestion."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a94d9aa5-acd8-4032-ad8f-f995dec4d13c",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from langchain_benchmarks import clone_public_dataset, registry\n",
"from langchain_benchmarks.rag.tasks.semi_structured_reports import get_file_names\n",
"\n",
"# Task: look up the benchmark task in the registry by name\n",
"task = registry[\"Semi-structured Reports\"]\n",
"\n",
"# Files used: resolve the task's PDF paths and convert them to strings\n",
"paths = list(get_file_names())\n",
"files = [str(p) for p in paths]"
]
},
{
"cell_type": "markdown",
"id": "12b52285-358c-4752-ad6b-25ffb629e309",
"metadata": {},
"source": [
"Clone the dataset so that it's available in our LangSmith datasets."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1ecca7af-c3e7-42d1-97dd-c7d9777207cb",
"metadata": {},
"outputs": [],
"source": [
"clone_public_dataset(task.dataset_id, dataset_name=task.name)"
]
},
{
"cell_type": "markdown",
"id": "64f37705-0190-4b7a-9d88-63bfd904fbd9",
"metadata": {},
"source": [
"## Load and index\n",
"\n",
"We load each file, split it, embed with `OpenAIEmbeddings`, and create an index with `Chroma` vectorstore."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7eb9e333-77e6-48f9-b221-9bded023b978",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from langchain.document_loaders import PyPDFLoader\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.prompts import ChatPromptTemplate\n",
"from langchain.schema.output_parser import StrOutputParser\n",
"from langchain.schema.runnable import RunnablePassthrough\n",
"\n",
"\n",
"def load_and_split(file, token_count, split_document=True):\n",
" \"\"\"\n",
" Load and optionally split PDF files.\n",
"\n",
" Args:\n",
" file (str): File path.\n",
" token_count (int): Token count for splitting.\n",
" split_document (bool): Flag for splitting or returning pages.\n",
"\n",
" Returns:\n",
" list[str]: Text of each chunk (or of each page when not splitting).\n",
" \"\"\"\n",
"\n",
" loader = PyPDFLoader(file)\n",
" pdf_pages = loader.load()\n",
"\n",
" if split_document:\n",
" text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n",
" chunk_size=token_count, chunk_overlap=50\n",
" )\n",
"\n",
" docs = text_splitter.split_documents(pdf_pages)\n",
" texts = [d.page_content for d in docs]\n",
" else:\n",
" texts = [d.page_content for d in pdf_pages]\n",
"\n",
" print(f\"There are {len(texts)} text elements\")\n",
" return texts\n",
"\n",
"\n",
"def load_files(files, directory, token_count, split_document):\n",
" \"\"\"\n",
" Load files.\n",
"\n",
" Args:\n",
" files (list): List of file names.\n",
" directory (str): Directory prefix prepended to each file name (use \"\" when\n",
" the entries in `files` are already full paths).\n",
" token_count (int): Token count for splitting.\n",
" split_document (bool): Flag for splitting documents.\n",
"\n",
" Returns:\n",
" list[str]: Concatenated texts from all files.\n",
" \"\"\"\n",
"\n",
" texts = []\n",
" for fi in files:\n",
" texts.extend(load_and_split(directory + fi, token_count, split_document))\n",
" return texts\n",
"\n",
"\n",
"def make_retriever(texts, expt):\n",
" \"\"\"\n",
" Make vector store.\n",
"\n",
" Args:\n",
" texts (list): List of texts.\n",
" expt (str): Experiment name (used as the Chroma collection name).\n",
" \"\"\"\n",
" vectorstore = Chroma.from_texts(\n",
" texts=texts, collection_name=expt, embedding=OpenAIEmbeddings()\n",
" )\n",
" retriever = vectorstore.as_retriever()\n",
" return retriever\n",
"\n",
"\n",
"def rag_chain(retriever):\n",
" \"\"\"\n",
" RAG chain.\n",
"\n",
" Args:\n",
" retriever: The retriever to use.\n",
" \"\"\"\n",
"\n",
" # Prompt template\n",
" template = \"\"\"Answer the question based only on the following context, which can include text and tables:\n",
" {context}\n",
" Question: {question}\n",
" \"\"\"\n",
" prompt = ChatPromptTemplate.from_template(template)\n",
"\n",
" # LLM\n",
" model = ChatOpenAI(temperature=0, model=\"gpt-4\")\n",
"\n",
" # RAG pipeline: retrieved docs are joined into a single context string\n",
" chain = (\n",
" {\n",
" \"context\": retriever | (lambda x: \"\\n\\n\".join([i.page_content for i in x])),\n",
" \"question\": RunnablePassthrough(),\n",
" }\n",
" | prompt\n",
" | model\n",
" | StrOutputParser()\n",
" )\n",
" return chain\n",
"\n",
"\n",
"# Experiment configurations: (chunk token count, split?, experiment name)\n",
"experiments = [\n",
" (None, False, \"page_split\"),\n",
" (50, True, \"50_tok_split\"),\n",
" (100, True, \"100_tok_split\"),\n",
" (250, True, \"250_tok_split\"),\n",
"]\n",
"\n",
"# Run\n",
"stor_chain = {}\n",
"for token_count, split_document, expt in experiments:\n",
" # `files` already contains full paths, so pass an empty directory prefix\n",
" texts = load_files(files, \"\", token_count, split_document)\n",
" retriever = make_retriever(texts, expt)\n",
" stor_chain[expt] = rag_chain(retriever)"
]
},
{
"cell_type": "markdown",
"id": "29515a91-3cb1-41bd-a2d4-6cf6ce7806c2",
"metadata": {},
"source": [
"## Eval\n",
"\n",
"Run eval on our dataset, `Semi-structured Reports`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "edd2e7f9-b3f6-4885-bf05-96f1c1758b20",
"metadata": {},
"outputs": [],
"source": [
"import uuid\n",
"from langsmith.client import Client\n",
"from langchain.smith import RunEvalConfig\n",
"\n",
"# Config: LLM-graded chain-of-thought QA evaluator\n",
"client = Client()\n",
"eval_config = RunEvalConfig(\n",
" evaluators=[\"cot_qa\"],\n",
")\n",
"\n",
"# Experiments: map project names to the chains built above\n",
"chain_map = {\n",
" \"page_split\": stor_chain[\"page_split\"],\n",
" \"baseline-50-tok\": stor_chain[\"50_tok_split\"],\n",
" \"baseline-100-tok\": stor_chain[\"100_tok_split\"],\n",
" \"baseline-250-tok\": stor_chain[\"250_tok_split\"],\n",
"}\n",
"\n",
"# Run evaluation; the short run_id keeps project names unique across re-runs\n",
"run_id = uuid.uuid4().hex[:4]\n",
"test_runs = {}\n",
"for project_name, chain in chain_map.items():\n",
" test_runs[project_name] = client.run_on_dataset(\n",
" dataset_name=task.name,\n",
" # Bind `chain` via a default argument so each factory captures its own\n",
" # chain rather than the loop variable (late-binding closure pitfall).\n",
" llm_or_chain_factory=lambda chain=chain: (lambda x: x[\"question\"]) | chain,\n",
" evaluation=eval_config,\n",
" verbose=True,\n",
" project_name=f\"{run_id}-{project_name}\",\n",
" project_metadata={\"chain\": project_name},\n",
" )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading