Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion docs/source/notebooks/extraction/email.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,16 @@
"Let's evaluate an LLM on its ability to extract structured information from email texts."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "47de0d20-d20b-44be-9e41-d2275f0866e8",
"metadata": {},
"outputs": [],
"source": [
"# %pip install -U langchain langchain_benchmarks openai"
]
},
{
"cell_type": "code",
"execution_count": 1,
Expand All @@ -20,7 +30,8 @@
"import os\n",
"\n",
"# Get your API key from https://smith.langchain.com/settings\n",
"# os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\""
"# os.environ[\"LANGCHAIN_API_KEY\"] = \"sk-...\"\n",
"# os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
]
},
{
Expand Down
223 changes: 166 additions & 57 deletions docs/source/notebooks/retrieval/semi_structured.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion langchain_benchmarks/rag/evaluators.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from langchain.chat_models import ChatOpenAI
from langchain.evaluation import load_evaluator
from langchain.llms.base import BaseLanguageModel
from langchain.smith import RunEvalConfig
from langchain_core.language_models import BaseLanguageModel
from langsmith.evaluation.evaluator import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run

Expand Down
6 changes: 3 additions & 3 deletions langchain_benchmarks/rag/tasks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from langchain_benchmarks.rag.tasks.langchain_docs.task import LANGCHAIN_DOCS_TASK
from langchain_benchmarks.rag.tasks.semi_structured_earnings.task import (
SEMI_STRUCTURED_EARNINGS_TASK,
from langchain_benchmarks.rag.tasks.semi_structured_reports.task import (
SEMI_STRUCTURED_REPORTS_TASK,
)

# Please keep this sorted
__all__ = ["LANGCHAIN_DOCS_TASK", "SEMI_STRUCTURED_EARNINGS_TASK"]
__all__ = ["LANGCHAIN_DOCS_TASK", "SEMI_STRUCTURED_REPORTS_TASK"]
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def _chroma_retriever_factory(
docs = docs or load_docs_from_parquet()
embedding_name = embedding.__class__.__name__
vectorstore = Chroma(
collection_name=f"langchain-benchmarks-classic-{embedding_name}",
collection_name=f"lcbm-b-{embedding_name}-{transformation_name}",
embedding_function=embedding,
persist_directory="./chromadb",
)
Expand All @@ -79,11 +79,12 @@ def _chroma_parent_document_retriever_factory(
*,
docs: Optional[Iterable[Document]] = None,
search_kwargs: Optional[dict] = None,
transformation_name: Optional[str] = None,
) -> BaseRetriever:
docs = docs or load_docs_from_parquet()
embedding_name = embedding.__class__.__name__
vectorstore = Chroma(
collection_name=f"langchain-benchmarks-parent-doc-{embedding_name}",
collection_name=f"lcbm-b-{embedding_name}-{transformation_name}",
embedding_function=embedding,
persist_directory="./chromadb",
)
Expand All @@ -93,6 +94,7 @@ def _chroma_parent_document_retriever_factory(
vectorstore,
collection_name="langchain-docs",
search_kwargs=search_kwargs or _DEFAULT_SEARCH_KWARGS,
transformation_name=transformation_name,
)


Expand All @@ -101,11 +103,12 @@ def _chroma_hyde_retriever_factory(
*,
docs: Optional[Iterable[Document]] = None,
search_kwargs: Optional[dict] = None,
transformation_name: Optional[str] = None,
) -> BaseRetriever:
docs = docs or load_docs_from_parquet()
embedding_name = embedding.__class__.__name__
vectorstore = Chroma(
collection_name=f"langchain-benchmarks-hyde-{embedding_name}",
collection_name=f"lcbm-hd-{embedding_name}-{transformation_name}",
embedding_function=embedding,
persist_directory="./chromadb",
)
Expand All @@ -115,6 +118,7 @@ def _chroma_hyde_retriever_factory(
vectorstore,
collection_name="langchain-docs",
search_kwargs=search_kwargs or _DEFAULT_SEARCH_KWARGS,
transformation_name=transformation_name,
)


Expand Down
Empty file.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from langchain_benchmarks.rag.tasks.semi_structured_reports.indexing.retriever_registry import (
get_file_names,
)
from langchain_benchmarks.rag.tasks.semi_structured_reports.task import (
SEMI_STRUCTURED_REPORTS_TASK,
)

# Please keep this sorted
__all__ = ["get_file_names", "SEMI_STRUCTURED_REPORTS_TASK"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from langchain_benchmarks.rag.tasks.semi_structured_reports.indexing.retriever_registry import (
RETRIEVER_FACTORIES,
load_docs,
)

__all__ = ["RETRIEVER_FACTORIES", "load_docs"]
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ def fetch_raw_docs(
os.remove(LOCAL_FILE)


def get_file_names():
    """Yield the path of every PDF file found directly inside ``DOCS_DIR``.

    This is a generator: ``fetch_raw_docs()`` is invoked lazily, when the
    first item is requested, before any path is produced. Only files
    matching ``*.pdf`` at the top level of ``DOCS_DIR`` are yielded
    (the glob pattern is not recursive). No partitioning happens here;
    callers receive the raw file paths.
    """
    # Runs before globbing so the directory is listed only after the fetch
    # step has completed.
    fetch_raw_docs()
    yield from DOCS_DIR.glob("*.pdf")


def partition_pdfs(path: Path, *, config: Optional[dict] = None):
try:
from unstructured.partition.pdf import partition_pdf
Expand Down Expand Up @@ -90,9 +97,7 @@ def partition_pdfs(path: Path, *, config: Optional[dict] = None):


def load_docs(*, unstructured_config: Optional[dict] = None) -> Iterable[Document]:
fetch_raw_docs()
# Traverse the directory and partition the pdfs
for path in DOCS_DIR.glob("*.pdf"):
for path in get_file_names():
yield from partition_pdfs(path, config=unstructured_config)


Expand All @@ -107,15 +112,15 @@ def _chroma_retriever_factory(
docs = docs or load_docs()
embedding_name = embedding.__class__.__name__
vectorstore = Chroma(
collection_name=f"lcbm-semistruct-basic-{embedding_name}",
collection_name=f"lcbm-ss-b-{embedding_name}-{transformation_name}",
embedding_function=embedding,
persist_directory="./chromadb",
)
return get_vectorstore_retriever(
docs,
embedding,
vectorstore,
collection_name="semi-structured-earnings",
collection_name="semi-structured-earnings-b",
transform_docs=transform_docs,
transformation_name=transformation_name,
search_kwargs=search_kwargs or _DEFAULT_SEARCH_KWARGS,
Expand All @@ -132,15 +137,15 @@ def _chroma_parent_document_retriever_factory(
docs = docs or load_docs()
embedding_name = embedding.__class__.__name__
vectorstore = Chroma(
collection_name=f"lcbm-semistruct-parent-doc-{embedding_name}",
collection_name=f"lcbm-ss-pd-{embedding_name}-{transformation_name}",
embedding_function=embedding,
persist_directory="./chromadb",
)
return get_parent_document_retriever(
docs,
embedding,
vectorstore,
collection_name="semi-structured-earnings",
collection_name="semi-structured-earnings-pd",
search_kwargs=search_kwargs or _DEFAULT_SEARCH_KWARGS,
transformation_name=transformation_name,
)
Expand All @@ -156,15 +161,15 @@ def _chroma_hyde_retriever_factory(
docs = docs or load_docs()
embedding_name = embedding.__class__.__name__
vectorstore = Chroma(
collection_name=f"lcbm-semistruct-hyde-{embedding_name}",
collection_name=f"lcbm-ss-hd-{embedding_name}-{transformation_name}",
embedding_function=embedding,
persist_directory="./chromadb",
)
return get_hyde_retriever(
docs,
embedding,
vectorstore,
collection_name="semi-structured-earnings",
collection_name="semi-structured-earnings-hd",
search_kwargs=search_kwargs or _DEFAULT_SEARCH_KWARGS,
transformation_name=transformation_name,
)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
from langchain_benchmarks.rag.tasks.semi_structured_earnings import (
from langchain_benchmarks.rag.tasks.semi_structured_reports import (
indexing,
)
from langchain_benchmarks.rag.tasks.semi_structured_earnings.indexing.retriever_registry import (
from langchain_benchmarks.rag.tasks.semi_structured_reports.indexing.retriever_registry import (
load_docs,
)
from langchain_benchmarks.schema import RetrievalTask

# ID of public Semi-structured Earnings dataset
DATASET_ID = "https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d"

SEMI_STRUCTURED_EARNINGS_TASK = RetrievalTask(
name="Semi-structured Earnings",
SEMI_STRUCTURED_REPORTS_TASK = RetrievalTask(
name="Semi-structured Reports",
dataset_id=DATASET_ID,
retriever_factories=indexing.RETRIEVER_FACTORIES,
architecture_factories={},
Expand Down
1 change: 0 additions & 1 deletion langchain_benchmarks/rag/utils/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,6 @@ def create_index(
db_url=RECORD_MANAGER_DB_URL,
)
record_manager.create_schema()

return index(
tqdm(transformed_docs),
record_manager,
Expand Down
4 changes: 2 additions & 2 deletions langchain_benchmarks/registration.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from langchain_benchmarks.extraction.tasks import email_task
from langchain_benchmarks.rag.tasks import (
LANGCHAIN_DOCS_TASK,
SEMI_STRUCTURED_EARNINGS_TASK,
SEMI_STRUCTURED_REPORTS_TASK,
)
from langchain_benchmarks.schema import Registry
from langchain_benchmarks.tool_usage.tasks import (
Expand All @@ -22,6 +22,6 @@
multiverse_math.MULTIVERSE_MATH,
email_task.EMAIL_EXTRACTION_TASK,
LANGCHAIN_DOCS_TASK,
SEMI_STRUCTURED_EARNINGS_TASK,
SEMI_STRUCTURED_REPORTS_TASK,
]
)
44 changes: 20 additions & 24 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.8.1"
langchain = ">=0.0.333"
langchain_core = ">=0.0.3"
langsmith = ">=0.0.66"
tqdm = "^4"
ipywidgets = "^8"
Expand Down