Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
chromadb/
index.md
Untitled.ipynb
1,162 changes: 1,162 additions & 0 deletions docs/source/notebooks/retrieval/multi_modal_benchmarking/multi_modal_eval.ipynb

Large diffs are not rendered by default.

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions docs/source/toc.segment
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,7 @@
./notebooks/retrieval/intro
./notebooks/retrieval/langchain_docs_qa
./notebooks/retrieval/semi_structured
./notebooks/retrieval/multi_modal_benchmarking/multi_modal_eval_baseline
./notebooks/retrieval/multi_modal_benchmarking/multi_modal_eval
./notebooks/retrieval/comparing_techniques
```
1 change: 1 addition & 0 deletions langchain_benchmarks/rag/tasks/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pdfs/
9 changes: 8 additions & 1 deletion langchain_benchmarks/rag/tasks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
from langchain_benchmarks.rag.tasks.langchain_docs.task import LANGCHAIN_DOCS_TASK
from langchain_benchmarks.rag.tasks.multi_modal_slide_decks.task import (
MULTI_MODAL_SLIDE_DECKS_TASK,
)
from langchain_benchmarks.rag.tasks.semi_structured_reports.task import (
SEMI_STRUCTURED_REPORTS_TASK,
)

# Please keep this sorted
__all__ = ["LANGCHAIN_DOCS_TASK", "SEMI_STRUCTURED_REPORTS_TASK"]
# Names re-exported by this package; entries are kept in alphabetical order.
__all__ = [
    "LANGCHAIN_DOCS_TASK",
    "MULTI_MODAL_SLIDE_DECKS_TASK",
    "SEMI_STRUCTURED_REPORTS_TASK",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from langchain_benchmarks.rag.tasks.multi_modal_slide_decks.indexing.retriever_registry import (
get_file_names,
)

# Public API of this package: only get_file_names is re-exported.
__all__ = ["get_file_names"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from langchain_benchmarks.rag.tasks.multi_modal_slide_decks.indexing.retriever_registry import (
get_file_names,
)

# Public API of this package: only get_file_names is re-exported.
__all__ = ["get_file_names"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import logging
import os
import zipfile
from pathlib import Path
from typing import Iterable, Optional

from langchain_benchmarks.rag.utils._downloading import (
fetch_remote_file,
is_folder_populated,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Absolute path of the directory that contains this file.
_DIRECTORY = Path(os.path.abspath(__file__)).parent
# Remote zip archive that stores the zipped pdfs for this dataset.
REMOTE_DOCS_FILE = "https://storage.googleapis.com/benchmarks-artifacts/langchain-docs-benchmarking/multi_modal_slide_decks.zip"
# Default local directory (next to this file) the archive is extracted into.
DOCS_DIR = _DIRECTORY / "pdfs"


def fetch_raw_docs(
    filename: Optional[str] = None, docs_dir: Optional[str] = None
) -> None:
    """Download and unpack the dataset archive unless the docs folder is populated.

    Args:
        filename: Where the downloaded zip archive is written. Defaults to the
            remote archive's file name, placed in this module's directory.
        docs_dir: Directory the archive is extracted into. Defaults to
            ``DOCS_DIR``.
    """
    archive_path = filename or _DIRECTORY / Path(REMOTE_DOCS_FILE).name
    target_dir = docs_dir or DOCS_DIR

    # Skip all work when is_folder_populated reports the folder already has content.
    if is_folder_populated(target_dir):
        return

    fetch_remote_file(REMOTE_DOCS_FILE, archive_path)
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(target_dir)

    # Only the extracted files are kept; the downloaded zip is deleted.
    os.remove(archive_path)


def get_file_names() -> Iterable[Path]:
    """Yield the path of every PDF found recursively under ``DOCS_DIR``.

    ``fetch_raw_docs()`` is called before the search starts. Because this is
    a generator, that call happens when iteration begins, not when the
    function is called. Any path whose string form contains ``__MACOSX`` is
    skipped.
    """
    fetch_raw_docs()
    yield from (
        pdf_path
        for pdf_path in DOCS_DIR.rglob("*.pdf")
        # Ignore anything located under a __MACOSX path.
        if "__MACOSX" not in str(pdf_path)
    )
23 changes: 23 additions & 0 deletions langchain_benchmarks/rag/tasks/multi_modal_slide_decks/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from langchain_benchmarks.schema import RetrievalTask

# Public URL identifying the Multi Modal Slide Decks dataset.
DATASET_ID = "https://smith.langchain.com/public/40afc8e7-9d7e-44ed-8971-2cae1eb59731/d"

# Retrieval task for question answering over slide decks.
MULTI_MODAL_SLIDE_DECKS_TASK = RetrievalTask(
    name="Multi-modal slide decks",
    description=(
        """\
This public dataset is a work-in-progress and will be extended over time.

Questions and answers based on slide decks containing visual tables and charts.

Each example is composed of a question and reference answer.

Success is measured based on the accuracy of the answer relative to the reference answer.
"""  # noqa: E501
    ),
    dataset_id=DATASET_ID,
    # No retriever or architecture factories are registered for this task.
    retriever_factories={},
    architecture_factories={},
    # NOTE(review): an empty dict is passed as a placeholder here, while
    # RetrievalTask annotates get_docs as an optional callable -- confirm
    # whether None is the intended value.
    get_docs={},
)
Original file line number Diff line number Diff line change
Expand Up @@ -24,25 +24,24 @@
# Stores the zipped pdfs for this dataset
REMOTE_DOCS_FILE = "https://storage.googleapis.com/benchmarks-artifacts/langchain-docs-benchmarking/semi_structured_earnings.zip"
DOCS_DIR = _DIRECTORY / "pdfs"
LOCAL_FILE = _DIRECTORY / "chroma_db.zip"

_DEFAULT_SEARCH_KWARGS = {"k": 6}


def fetch_raw_docs(
filename: Optional[str] = None, docs_dir: Optional[str] = None
) -> None:
filename = filename or LOCAL_FILE
filename = filename or _DIRECTORY / Path(REMOTE_DOCS_FILE).name
docs_dir = docs_dir or DOCS_DIR
if not is_folder_populated(docs_dir):
fetch_remote_file(REMOTE_DOCS_FILE, filename)
with zipfile.ZipFile(filename, "r") as zip_ref:
zip_ref.extractall(docs_dir)

os.remove(LOCAL_FILE)
os.remove(filename)


def get_file_names():
def get_file_names() -> Iterable[Path]:
fetch_raw_docs()
# Traverse the directory and partition the pdfs
for path in DOCS_DIR.glob("*.pdf"):
Expand Down
2 changes: 2 additions & 0 deletions langchain_benchmarks/registration.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from langchain_benchmarks.extraction.tasks import chat_extraction, email_task
from langchain_benchmarks.rag.tasks import (
LANGCHAIN_DOCS_TASK,
MULTI_MODAL_SLIDE_DECKS_TASK,
SEMI_STRUCTURED_REPORTS_TASK,
)
from langchain_benchmarks.schema import Registry
Expand All @@ -24,5 +25,6 @@
chat_extraction.CHAT_EXTRACTION_TASK,
LANGCHAIN_DOCS_TASK,
SEMI_STRUCTURED_REPORTS_TASK,
MULTI_MODAL_SLIDE_DECKS_TASK,
]
)
12 changes: 8 additions & 4 deletions langchain_benchmarks/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,12 +109,16 @@ class ExtractionTask(BaseTask):

@dataclasses.dataclass(frozen=True)
class RetrievalTask(BaseTask):
retriever_factories: Dict[str, Callable[[Embeddings], BaseRetriever]] # noqa: F821
get_docs: Optional[Callable[..., Iterable[Document]]] = None
"""A function that returns the documents to be indexed."""
retriever_factories: Dict[
str, Callable[[Embeddings], BaseRetriever]
] = dataclasses.field(default_factory=dict) # noqa: F821
"""Factories that index the docs using the specified strategy."""
architecture_factories: Dict[str, Callable[[Embeddings], BaseRetriever]] # noqa: F821
architecture_factories: Dict[
str, Callable[[Embeddings], BaseRetriever]
] = dataclasses.field(default_factory=dict) # noqa: F821
"""Factories methods that help build some off-the-shelf architectures。"""
get_docs: Callable[..., Iterable[Document]]
"""A function that returns the documents to be indexed."""

@property
def _table(self) -> List[List[str]]:
Expand Down