Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,301 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "b6856d11-40d5-48e5-9eb3-423f479933a1",
"metadata": {},
"source": [
"# Semi-structured eval: Chunk size tuning\n",
"\n",
"`Semi-structured Reports` is a public dataset that contains question-answer pairs from documents with text and tables.\n",
"\n",
"The question-answer pairs are derived from the tables as well as some of the paragraphs in the docs.\n",
"\n",
"We evaluate the performance of various chunk sizes with RAG.\n",
"\n",
"## Pre-requisites"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c387b660-967d-4d2f-8c38-af125f7b7a8b",
"metadata": {},
"outputs": [],
"source": [
"# %pip install -U langchain langsmith langchain_benchmarks\n",
"# %pip install --quiet chromadb openai"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9e332b1-7da4-47fc-8d9a-4d65fbfc6953",
"metadata": {},
"outputs": [],
"source": [
"import getpass\n",
"import os\n",
"\n",
"# Point LangSmith tracing at the hosted endpoint.\n",
"os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n",
"# Prompt interactively for any required API keys that are not already set,\n",
"# so no credentials are hardcoded in the notebook.\n",
"env_vars = [\"LANGCHAIN_API_KEY\", \"OPENAI_API_KEY\"]\n",
"for var in env_vars:\n",
" if var not in os.environ:\n",
" os.environ[var] = getpass.getpass(prompt=f\"Enter your {var}: \")"
]
},
{
"cell_type": "markdown",
"id": "b1a19f23-468c-4aeb-a0e9-0765a85f3f0b",
"metadata": {},
"source": [
"## Dataset\n",
"\n",
"Fetch the associated PDFs from remote cache for the dataset so that we can perform ingestion."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a94d9aa5-acd8-4032-ad8f-f995dec4d13c",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from langchain_benchmarks import clone_public_dataset, registry\n",
"from langchain_benchmarks.rag.tasks.semi_structured_reports import get_file_names\n",
"\n",
"# Task: look up the benchmark task in the registry by name\n",
"task = registry[\"Semi-structured Reports\"]\n",
"\n",
"# Files used: resolve the task's PDF paths and convert them to strings\n",
"paths = list(get_file_names())\n",
"files = [str(p) for p in paths]"
]
},
{
"cell_type": "markdown",
"id": "12b52285-358c-4752-ad6b-25ffb629e309",
"metadata": {},
"source": [
"Clone the dataset so that it's available in our LangSmith datasets."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1ecca7af-c3e7-42d1-97dd-c7d9777207cb",
"metadata": {},
"outputs": [],
"source": [
"clone_public_dataset(task.dataset_id, dataset_name=task.name)"
]
},
{
"cell_type": "markdown",
"id": "64f37705-0190-4b7a-9d88-63bfd904fbd9",
"metadata": {},
"source": [
"## Load and index\n",
"\n",
"We load each file, split it, embed with `OpenAIEmbeddings`, and create an index with `Chroma` vectorstore."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7eb9e333-77e6-48f9-b221-9bded023b978",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from langchain.document_loaders import PyPDFLoader\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.prompts import ChatPromptTemplate\n",
"from langchain.schema.output_parser import StrOutputParser\n",
"from langchain.schema.runnable import RunnablePassthrough\n",
"\n",
"\n",
"def load_and_split(file, token_count, split_document=True):\n",
" \"\"\"\n",
" Load and optionally split PDF files.\n",
"\n",
" Args:\n",
" file (str): File path.\n",
" token_count (int): Token count for splitting.\n",
" split_document (bool): Flag for splitting or returning pages.\n",
"\n",
" Returns:\n",
" list[str]: Text of each chunk (or of each page when not splitting).\n",
" \"\"\"\n",
"\n",
" loader = PyPDFLoader(file)\n",
" pdf_pages = loader.load()\n",
"\n",
" if split_document:\n",
" text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n",
" chunk_size=token_count, chunk_overlap=50\n",
" )\n",
"\n",
" docs = text_splitter.split_documents(pdf_pages)\n",
" texts = [d.page_content for d in docs]\n",
" else:\n",
" texts = [d.page_content for d in pdf_pages]\n",
"\n",
" print(f\"There are {len(texts)} text elements\")\n",
" return texts\n",
"\n",
"\n",
"def load_files(files, directory, token_count, split_document):\n",
" \"\"\"\n",
" Load files.\n",
"\n",
" Args:\n",
" files (list): List of file names.\n",
" directory (str): Directory prefix prepended to each file name (use \"\" when\n",
" the entries in `files` are already full paths).\n",
" token_count (int): Token count for splitting.\n",
" split_document (bool): Flag for splitting documents.\n",
"\n",
" Returns:\n",
" list[str]: Concatenated texts from all files.\n",
" \"\"\"\n",
"\n",
" texts = []\n",
" for fi in files:\n",
" texts.extend(load_and_split(directory + fi, token_count, split_document))\n",
" return texts\n",
"\n",
"\n",
"def make_retriever(texts, expt):\n",
" \"\"\"\n",
" Make vector store.\n",
"\n",
" Args:\n",
" texts (list): List of texts.\n",
" expt (str): Experiment name (used as the Chroma collection name).\n",
" \"\"\"\n",
" vectorstore = Chroma.from_texts(\n",
" texts=texts, collection_name=expt, embedding=OpenAIEmbeddings()\n",
" )\n",
" retriever = vectorstore.as_retriever()\n",
" return retriever\n",
"\n",
"\n",
"def rag_chain(retriever):\n",
" \"\"\"\n",
" RAG chain.\n",
"\n",
" Args:\n",
" retriever: The retriever to use.\n",
" \"\"\"\n",
"\n",
" # Prompt template\n",
" template = \"\"\"Answer the question based only on the following context, which can include text and tables:\n",
" {context}\n",
" Question: {question}\n",
" \"\"\"\n",
" prompt = ChatPromptTemplate.from_template(template)\n",
"\n",
" # LLM\n",
" model = ChatOpenAI(temperature=0, model=\"gpt-4\")\n",
"\n",
" # RAG pipeline: retrieved docs are joined into a single context string\n",
" chain = (\n",
" {\n",
" \"context\": retriever | (lambda x: \"\\n\\n\".join([i.page_content for i in x])),\n",
" \"question\": RunnablePassthrough(),\n",
" }\n",
" | prompt\n",
" | model\n",
" | StrOutputParser()\n",
" )\n",
" return chain\n",
"\n",
"\n",
"# Experiment configurations: (chunk token count, split?, experiment name)\n",
"experiments = [\n",
" (None, False, \"page_split\"),\n",
" (50, True, \"50_tok_split\"),\n",
" (100, True, \"100_tok_split\"),\n",
" (250, True, \"250_tok_split\"),\n",
"]\n",
"\n",
"# Run\n",
"stor_chain = {}\n",
"for token_count, split_document, expt in experiments:\n",
" # `files` already contains full paths, so pass an empty directory prefix\n",
" texts = load_files(files, \"\", token_count, split_document)\n",
" retriever = make_retriever(texts, expt)\n",
" stor_chain[expt] = rag_chain(retriever)"
]
},
{
"cell_type": "markdown",
"id": "29515a91-3cb1-41bd-a2d4-6cf6ce7806c2",
"metadata": {},
"source": [
"## Eval\n",
"\n",
"Run eval on our dataset, `Semi-structured Reports`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "edd2e7f9-b3f6-4885-bf05-96f1c1758b20",
"metadata": {},
"outputs": [],
"source": [
"import uuid\n",
"from langsmith.client import Client\n",
"from langchain.smith import RunEvalConfig\n",
"\n",
"# Config: LLM-graded chain-of-thought QA evaluator\n",
"client = Client()\n",
"eval_config = RunEvalConfig(\n",
" evaluators=[\"cot_qa\"],\n",
")\n",
"\n",
"# Experiments: map project names to the chains built above\n",
"chain_map = {\n",
" \"page_split\": stor_chain[\"page_split\"],\n",
" \"baseline-50-tok\": stor_chain[\"50_tok_split\"],\n",
" \"baseline-100-tok\": stor_chain[\"100_tok_split\"],\n",
" \"baseline-250-tok\": stor_chain[\"250_tok_split\"],\n",
"}\n",
"\n",
"# Run evaluation; the short run_id keeps project names unique across re-runs\n",
"run_id = uuid.uuid4().hex[:4]\n",
"test_runs = {}\n",
"for project_name, chain in chain_map.items():\n",
" test_runs[project_name] = client.run_on_dataset(\n",
" dataset_name=task.name,\n",
" # Bind `chain` via a default argument so each factory captures its own\n",
" # chain rather than the loop variable (late-binding closure pitfall).\n",
" llm_or_chain_factory=lambda chain=chain: (lambda x: x[\"question\"]) | chain,\n",
" evaluation=eval_config,\n",
" verbose=True,\n",
" project_name=f\"{run_id}-{project_name}\",\n",
" project_metadata={\"chain\": project_name},\n",
" )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading