2 changes: 1 addition & 1 deletion .gitignore
@@ -159,4 +159,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

.DS_Store
1 change: 0 additions & 1 deletion csv-qa/pandas_agent_instruct.py
@@ -5,7 +5,6 @@
)
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.smith import RunEvalConfig, run_on_dataset
from langchain.tools import PythonAstREPLTool
from langchain.vectorstores import FAISS
2 changes: 0 additions & 2 deletions csv-qa/pandas_ai.py
@@ -1,6 +1,4 @@
import pandas as pd
from langchain.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
1 change: 1 addition & 0 deletions docs/source/.gitignore
@@ -0,0 +1 @@
chromadb/
10 changes: 6 additions & 4 deletions docs/source/notebooks/extraction.ipynb
@@ -272,10 +272,11 @@
},
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"from langchain_benchmarks.extraction.implementations import (\n",
" create_openai_function_based_extractor,\n",
")\n",
"from langchain.chat_models import ChatOpenAI"
")"
]
},
{
@@ -354,8 +355,9 @@
},
"outputs": [],
"source": [
"from langchain_benchmarks.extraction import get_eval_config\n",
"from langsmith.client import Client"
"from langsmith.client import Client\n",
"\n",
"from langchain_benchmarks.extraction import get_eval_config"
]
},
{
958 changes: 958 additions & 0 deletions docs/source/notebooks/rag_evaluations.ipynb

Large diffs are not rendered by default.

409 changes: 409 additions & 0 deletions docs/source/notebooks/rag_langchain_docs.ipynb

Large diffs are not rendered by default.

440 changes: 440 additions & 0 deletions docs/source/notebooks/rag_semi_structured.ipynb

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions docs/source/notebooks/tool_usage.ipynb
@@ -301,8 +301,9 @@
},
"outputs": [],
"source": [
"from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n",
"from langsmith.client import Client"
"from langsmith.client import Client\n",
"\n",
"from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR"
]
},
{
@@ -134,7 +134,7 @@ def create_response_chain(
]
)

response_synthesizer = (prompt | llm | StrOutputParser()).with_config(
response_generator = (prompt | llm | StrOutputParser()).with_config(
run_name="GenerateResponse",
)
return (
@@ -147,7 +147,7 @@ def create_response_chain(
),
}
| _context
| response_synthesizer
| response_generator
)


45 changes: 38 additions & 7 deletions langchain-docs-benchmarking/prepare_dataset.py
@@ -1,19 +1,30 @@
"""Copy the public dataset to your own langsmith tenant."""
from typing import Optional

from langsmith import Client
from tqdm import tqdm

DATASET_NAME = "LangChain Docs Q&A"
PUBLIC_DATASET_TOKEN = "452ccafc-18e1-4314-885b-edd735f17b9d"
client = Client()


def create_langchain_docs_dataset(
dataset_name: str = DATASET_NAME, public_dataset_token: str = PUBLIC_DATASET_TOKEN
dataset_name: str = DATASET_NAME,
public_dataset_token: str = PUBLIC_DATASET_TOKEN,
client: Optional[Client] = None,
):
shared_client = Client(
api_url="https://api.smith.langchain.com", api_key="placeholder"
)
examples = list(shared_client.list_shared_examples(public_dataset_token))
client = client or Client()
if client.has_dataset(dataset_name=dataset_name):
return
ds = client.create_dataset(dataset_name=dataset_name)
examples = tqdm(list(client.list_shared_examples(public_dataset_token)))
loaded_examples = list(client.list_examples(dataset_name=dataset_name))
if len(loaded_examples) == len(examples):
return
else:
ds = client.read_dataset(dataset_name=dataset_name)
else:
ds = client.create_dataset(dataset_name=dataset_name)
client.create_examples(
inputs=[e.inputs for e in examples],
outputs=[e.outputs for e in examples],
@@ -23,4 +34,24 @@ def create_langchain_docs_dataset(


if __name__ == "__main__":
create_langchain_docs_dataset()
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--target-api-key", type=str, required=False)
parser.add_argument("--target-endpoint", type=str, required=False)
parser.add_argument("--dataset-name", type=str, default=DATASET_NAME)
parser.add_argument(
"--public-dataset-token", type=str, default=PUBLIC_DATASET_TOKEN
)
args = parser.parse_args()
client = None
if args.target_api_key or args.target_endpoint:
client = Client(
api_key=args.target_api_key,
api_url=args.target_endpoint,
)
create_langchain_docs_dataset(
dataset_name=args.dataset_name,
public_dataset_token=args.public_dataset_token,
client=client,
)
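
For reference, a minimal sketch (not part of the diff) of calling the updated helper programmatically rather than through the new CLI flags; the local import path and the API key value are assumptions:

# Sketch only: copy the public "LangChain Docs Q&A" dataset into a specific
# LangSmith tenant by passing an explicit Client, mirroring the argparse
# entrypoint above.
from langsmith import Client

from prepare_dataset import create_langchain_docs_dataset  # assumed import path

target_client = Client(
    api_url="https://api.smith.langchain.com",
    api_key="YOUR_LANGSMITH_API_KEY",  # placeholder, not a real key
)
create_langchain_docs_dataset(
    dataset_name="LangChain Docs Q&A",
    client=target_client,
)
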
1 change: 0 additions & 1 deletion langchain-docs-benchmarking/run_evals.py
@@ -1,6 +1,5 @@
import argparse
import importlib.util
import os
import sys
import uuid
from functools import partial
1 change: 1 addition & 0 deletions langchain-docs-benchmarking/run_experiments.py
@@ -120,6 +120,7 @@
]

for experiment in selected_experiments:
print("Running experiment:", experiment)
main(
**experiment,
dataset_name=args.dataset_name,
1 change: 1 addition & 0 deletions langchain_benchmarks/.gitignore
@@ -0,0 +1 @@
.sql
3 changes: 1 addition & 2 deletions langchain_benchmarks/extraction/implementations.py
@@ -1,5 +1,5 @@
"""Default implementations of LLMs that can be used for extraction."""
from typing import Type, Optional, List, Any, Dict
from typing import Any, Dict, List, Optional, Type

from langchain.chains.openai_functions import convert_to_openai_function
from langchain.chat_models import ChatOpenAI
@@ -12,7 +12,6 @@
from langchain_benchmarks.extraction.evaluators import get_eval_config
from langchain_benchmarks.schema import ExtractionTask


# PUBLIC API


2 changes: 1 addition & 1 deletion langchain_benchmarks/extraction/tasks/email_task.py
@@ -1,5 +1,5 @@
from enum import Enum
from typing import Optional, List
from typing import List, Optional

from langchain.prompts import ChatPromptTemplate
from langchain.pydantic_v1 import BaseModel, Field
1 change: 1 addition & 0 deletions langchain_benchmarks/rag/.gitignore
@@ -0,0 +1 @@
*.sql
5 changes: 5 additions & 0 deletions langchain_benchmarks/rag/__init__.py
@@ -0,0 +1,5 @@
from langchain_benchmarks.rag.evaluators import get_eval_config
from langchain_benchmarks.rag.tasks import LANGCHAIN_DOCS_TASK

# Please keep this sorted
__all__ = ["get_eval_config", "LANGCHAIN_DOCS_TASK"]
97 changes: 97 additions & 0 deletions langchain_benchmarks/rag/evaluators.py
@@ -0,0 +1,97 @@
from typing import Optional

from langchain.chat_models import ChatOpenAI
from langchain.evaluation import load_evaluator
from langchain.llms.base import BaseLanguageModel
from langchain.smith import RunEvalConfig
from langsmith.evaluation.evaluator import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run


# TODO: Split this into an assertion-by-assertion evaluator
# TODO: Combine with a document relevance evaluator (to report retriever performance)
class FaithfulnessEvaluator(RunEvaluator):
def __init__(self, llm: Optional[BaseLanguageModel] = None):
self.evaluator = load_evaluator(
"labeled_score_string",
criteria={
"faithfulness": """
Score 1: The answer directly contradicts the information provided in the reference docs.
Score 3: The answer contains a mix of correct information from the reference docs and incorrect or unverifiable information not found in the docs.
Score 5: The answer is mostly aligned with the reference docs but includes extra information that, while not contradictory, is not verified by the docs.
Score 7: The answer aligns well with the reference docs but includes minor, commonly accepted facts not found in the docs.
Score 10: The answer perfectly aligns with and is fully entailed by the reference docs, with no extra information."""
},
llm=llm,
normalize_by=10,
)

@staticmethod
def _get_retrieved_docs(run: Run) -> str:
# This assumes there is only one retriever in your chain.
# To select more precisely, name your retrieval chain
# using with_config(name="my_unique_name") and look up
# by run.name
runs = [run]
while runs:
run = runs.pop()
if run.run_type == "retriever":
return str(run.outputs["documents"])
if run.child_runs:
runs.extend(run.child_runs[::-1])
return ""

def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> EvaluationResult:
try:
docs_string = self._get_retrieved_docs(run)
docs_string = f"Reference docs:\n<DOCS>\n{docs_string}\n</DOCS>\n\n"
input_query = run.inputs["question"]
if run.outputs is not None and len(run.outputs) == 1:
prediction = next(iter(run.outputs.values()))
else:
prediction = run.outputs["output"]
result = self.evaluator.evaluate_strings(
input=input_query,
prediction=prediction,
reference=docs_string,
)
return EvaluationResult(
**{"key": "faithfulness", "comment": result.get("reasoning"), **result}
)
except Exception as e:
return EvaluationResult(key="faithfulness", score=None, comment=repr(e))


_ACCURACY_CRITERION = {
"accuracy": """
Score 1: The answer is incorrect and unrelated to the question or reference document.
Score 3: The answer shows slight relevance to the question or reference document but is largely incorrect.
Score 5: The answer is partially correct but has significant errors or omissions.
Score 7: The answer is mostly correct with minor errors or omissions, and aligns with the reference document.
Score 10: The answer is correct, complete, and perfectly aligns with the reference document.

If the reference answer contains multiple alternatives, the predicted answer must only match one of the alternatives to be considered correct.
If the predicted answer contains additional helpful and accurate information that is not present in the reference answer, it should still be considered correct.
""" # noqa
}


def get_eval_config() -> RunEvalConfig:
"""Returns the evaluator for the environment."""
eval_llm = ChatOpenAI(model="gpt-4", temperature=0.0, model_kwargs={"seed": 42})
# Use a longer-context LLM to check documents
faithfulness_eval_llm = ChatOpenAI(
model="gpt-4-1106-preview", temperature=0.0, model_kwargs={"seed": 42}
)

return RunEvalConfig(
evaluators=[
RunEvalConfig.LabeledScoreString(
criteria=_ACCURACY_CRITERION, llm=eval_llm, normalize_by=10.0
),
RunEvalConfig.EmbeddingDistance(),
],
custom_evaluators=[FaithfulnessEvaluator(llm=faithfulness_eval_llm)],
)
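
As a usage sketch (not part of the diff), the config returned by get_eval_config can be handed to LangSmith's run_on_dataset; the stub chain factory and the dataset name below are hypothetical placeholders:

# Sketch under assumptions: benchmark a RAG chain against the docs dataset using
# the evaluators defined above. The stub chain only echoes a fixed answer;
# replace it with a real retrieval chain.
from langchain.schema.runnable import RunnableLambda
from langchain.smith import run_on_dataset
from langsmith import Client

from langchain_benchmarks.rag import get_eval_config


def chain_factory():
    # Hypothetical stand-in for the RAG chain under test.
    return RunnableLambda(lambda inputs: {"output": "stub answer"})


run_on_dataset(
    client=Client(),
    dataset_name="LangChain Docs Q&A",  # assumes the public dataset was cloned
    llm_or_chain_factory=chain_factory,
    evaluation=get_eval_config(),
)
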
Binary file not shown.
7 changes: 7 additions & 0 deletions langchain_benchmarks/rag/tasks/__init__.py
@@ -0,0 +1,7 @@
from langchain_benchmarks.rag.tasks.langchain_docs.task import LANGCHAIN_DOCS_TASK
from langchain_benchmarks.rag.tasks.semi_structured_earnings.task import (
SEMI_STRUCTURED_EARNINGS_TASK,
)

# Please keep this sorted
__all__ = ["LANGCHAIN_DOCS_TASK", "SEMI_STRUCTURED_EARNINGS_TASK"]
10 changes: 10 additions & 0 deletions langchain_benchmarks/rag/tasks/langchain_docs/README.md
@@ -0,0 +1,10 @@
# LangChain Docs Task

This code contains utilities to scrape the LangChain docs (already run) and index them
using common techniques. The docs were scraped with the code in `_ingest_docs.py` and
uploaded to GCS. To better compare retrieval techniques, we hold the scraped docs constant
and pull from that cache whenever generating the different indices.

The content in `indexing` composes some common indexing strategies with default parameters for
benchmarking on the LangChain docs.
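
Illustrative only: a sketch of one such indexing strategy built from stock LangChain components. How the cached docs are fetched from GCS is repo-specific, so `docs` below is assumed to already hold the scraped pages:

# Sketch under assumptions: chunk the scraped docs, index them in Chroma with
# default-ish parameters, and expose the index as a retriever.
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Stand-in for docs loaded from the GCS cache in the real pipeline.
docs = [Document(page_content="LangChain is a framework ...", metadata={"source": "docs"})]

splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
chunks = splitter.split_documents(docs)

retriever = Chroma.from_documents(chunks, OpenAIEmbeddings()).as_retriever()
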
8 changes: 8 additions & 0 deletions langchain_benchmarks/rag/tasks/langchain_docs/__init__.py
@@ -0,0 +1,8 @@
from langchain_benchmarks.rag.tasks.langchain_docs import architectures, indexing
from langchain_benchmarks.rag.tasks.langchain_docs.task import LANGCHAIN_DOCS_TASK

DATASET_ID = (
"452ccafc-18e1-4314-885b-edd735f17b9d" # ID of public LangChain Docs dataset
)

__all__ = ["architectures", "indexing", "DATASET_ID", "LANGCHAIN_DOCS_TASK"]