2 changes: 1 addition & 1 deletion .gitignore
@@ -159,4 +159,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

.DS_Store
1 change: 0 additions & 1 deletion csv-qa/pandas_agent_instruct.py
@@ -5,7 +5,6 @@
)
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.smith import RunEvalConfig, run_on_dataset
from langchain.tools import PythonAstREPLTool
from langchain.vectorstores import FAISS
2 changes: 0 additions & 2 deletions csv-qa/pandas_ai.py
@@ -1,6 +1,4 @@
import pandas as pd
from langchain.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
1 change: 1 addition & 0 deletions docs/source/.gitignore
@@ -0,0 +1 @@
chromadb/
10 changes: 6 additions & 4 deletions docs/source/notebooks/extraction.ipynb
@@ -272,10 +272,11 @@
},
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"from langchain_benchmarks.extraction.implementations import (\n",
" create_openai_function_based_extractor,\n",
")\n",
"from langchain.chat_models import ChatOpenAI"
")"
]
},
{
@@ -354,8 +355,9 @@
},
"outputs": [],
"source": [
"from langchain_benchmarks.extraction import get_eval_config\n",
"from langsmith.client import Client"
"from langsmith.client import Client\n",
"\n",
"from langchain_benchmarks.extraction import get_eval_config"
]
},
{
958 changes: 958 additions & 0 deletions docs/source/notebooks/rag_evaluations.ipynb

Large diffs are not rendered by default.

409 changes: 409 additions & 0 deletions docs/source/notebooks/rag_langchain_docs.ipynb

Large diffs are not rendered by default.

440 changes: 440 additions & 0 deletions docs/source/notebooks/rag_semi_structured.ipynb

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions docs/source/notebooks/tool_usage.ipynb
@@ -301,8 +301,9 @@
},
"outputs": [],
"source": [
"from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR\n",
"from langsmith.client import Client"
"from langsmith.client import Client\n",
"\n",
"from langchain_benchmarks.tool_usage import STANDARD_AGENT_EVALUATOR"
]
},
{
@@ -134,7 +134,7 @@ def create_response_chain(
]
)

response_synthesizer = (prompt | llm | StrOutputParser()).with_config(
response_generator = (prompt | llm | StrOutputParser()).with_config(
run_name="GenerateResponse",
)
return (
@@ -147,7 +147,7 @@ def create_response_chain(
),
}
| _context
| response_synthesizer
| response_generator
)


45 changes: 38 additions & 7 deletions langchain-docs-benchmarking/prepare_dataset.py
@@ -1,19 +1,30 @@
"""Copy the public dataset to your own langsmith tenant."""
from typing import Optional

from langsmith import Client
from tqdm import tqdm

DATASET_NAME = "LangChain Docs Q&A"
PUBLIC_DATASET_TOKEN = "452ccafc-18e1-4314-885b-edd735f17b9d"
client = Client()


def create_langchain_docs_dataset(
dataset_name: str = DATASET_NAME, public_dataset_token: str = PUBLIC_DATASET_TOKEN
dataset_name: str = DATASET_NAME,
public_dataset_token: str = PUBLIC_DATASET_TOKEN,
client: Optional[Client] = None,
):
shared_client = Client(
api_url="https://api.smith.langchain.com", api_key="placeholder"
)
examples = list(shared_client.list_shared_examples(public_dataset_token))
client = client or Client()
if client.has_dataset(dataset_name=dataset_name):
return
ds = client.create_dataset(dataset_name=dataset_name)
examples = tqdm(list(client.list_shared_examples(public_dataset_token)))
loaded_examples = list(client.list_examples(dataset_name=dataset_name))
if len(loaded_examples) == len(examples):
return
else:
ds = client.read_dataset(dataset_name=dataset_name)
else:
ds = client.create_dataset(dataset_name=dataset_name)
client.create_examples(
inputs=[e.inputs for e in examples],
outputs=[e.outputs for e in examples],
@@ -23,4 +34,24 @@ def create_langchain_docs_dataset(


if __name__ == "__main__":
create_langchain_docs_dataset()
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--target-api-key", type=str, required=False)
parser.add_argument("--target-endpoint", type=str, required=False)
parser.add_argument("--dataset-name", type=str, default=DATASET_NAME)
parser.add_argument(
"--public-dataset-token", type=str, default=PUBLIC_DATASET_TOKEN
)
args = parser.parse_args()
client = None
if args.target_api_key or args.target_endpoint:
client = Client(
api_key=args.target_api_key,
api_url=args.target_endpoint,
)
create_langchain_docs_dataset(
dataset_name=args.dataset_name,
public_dataset_token=args.public_dataset_token,
client=client,
)
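
For reference, a minimal sketch (not part of the diff) of calling the updated helper programmatically rather than through the new CLI flags; the local import path and the API key value are assumptions:

# Sketch only: copy the public "LangChain Docs Q&A" dataset into a specific
# LangSmith tenant by passing an explicit Client, mirroring the argparse
# entrypoint above.
from langsmith import Client

from prepare_dataset import create_langchain_docs_dataset  # assumed import path

target_client = Client(
    api_url="https://api.smith.langchain.com",
    api_key="YOUR_LANGSMITH_API_KEY",  # placeholder, not a real key
)
create_langchain_docs_dataset(
    dataset_name="LangChain Docs Q&A",
    client=target_client,
)
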
1 change: 0 additions & 1 deletion langchain-docs-benchmarking/run_evals.py
@@ -1,6 +1,5 @@
import argparse
import importlib.util
import os
import sys
import uuid
from functools import partial
1 change: 1 addition & 0 deletions langchain-docs-benchmarking/run_experiments.py
@@ -120,6 +120,7 @@
]

for experiment in selected_experiments:
print("Running experiment:", experiment)
main(
**experiment,
dataset_name=args.dataset_name,
1 change: 1 addition & 0 deletions langchain_benchmarks/.gitignore
@@ -0,0 +1 @@
.sql
3 changes: 1 addition & 2 deletions langchain_benchmarks/extraction/implementations.py
@@ -1,5 +1,5 @@
"""Default implementations of LLMs that can be used for extraction."""
from typing import Type, Optional, List, Any, Dict
from typing import Any, Dict, List, Optional, Type

from langchain.chains.openai_functions import convert_to_openai_function
from langchain.chat_models import ChatOpenAI
@@ -12,7 +12,6 @@
from langchain_benchmarks.extraction.evaluators import get_eval_config
from langchain_benchmarks.schema import ExtractionTask


# PUBLIC API


2 changes: 1 addition & 1 deletion langchain_benchmarks/extraction/tasks/email_task.py
@@ -1,5 +1,5 @@
from enum import Enum
from typing import Optional, List
from typing import List, Optional

from langchain.prompts import ChatPromptTemplate
from langchain.pydantic_v1 import BaseModel, Field
1 change: 1 addition & 0 deletions langchain_benchmarks/rag/.gitignore
@@ -0,0 +1 @@
*.sql
5 changes: 5 additions & 0 deletions langchain_benchmarks/rag/__init__.py
@@ -0,0 +1,5 @@
from langchain_benchmarks.rag.evaluators import get_eval_config
from langchain_benchmarks.rag.tasks import LANGCHAIN_DOCS_TASK

# Please keep this sorted
__all__ = ["get_eval_config", "LANGCHAIN_DOCS_TASK"]
97 changes: 97 additions & 0 deletions langchain_benchmarks/rag/evaluators.py
@@ -0,0 +1,97 @@
from typing import Optional

from langchain.chat_models import ChatOpenAI
from langchain.evaluation import load_evaluator
from langchain.llms.base import BaseLanguageModel
from langchain.smith import RunEvalConfig
from langsmith.evaluation.evaluator import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run


# TODO: Split this into an assertion-by-assertion evaluator
# TODO: Combine with a document relevance evaluator (to report retriever performance)
class FaithfulnessEvaluator(RunEvaluator):
def __init__(self, llm: Optional[BaseLanguageModel] = None):
self.evaluator = load_evaluator(
"labeled_score_string",
criteria={
"faithfulness": """
Score 1: The answer directly contradicts the information provided in the reference docs.
Score 3: The answer contains a mix of correct information from the reference docs and incorrect or unverifiable information not found in the docs.
Score 5: The answer is mostly aligned with the reference docs but includes extra information that, while not contradictory, is not verified by the docs.
Score 7: The answer aligns well with the reference docs but includes minor, commonly accepted facts not found in the docs.
Score 10: The answer perfectly aligns with and is fully entailed by the reference docs, with no extra information."""
},
llm=llm,
normalize_by=10,
)

@staticmethod
def _get_retrieved_docs(run: Run) -> str:
# This assumes there is only one retriever in your chain.
# To select more precisely, name your retrieval chain
# using with_config(name="my_unique_name") and look up
# by run.name
runs = [run]
while runs:
run = runs.pop()
if run.run_type == "retriever":
return str(run.outputs["documents"])
if run.child_runs:
runs.extend(run.child_runs[::-1])
return ""

def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> EvaluationResult:
try:
docs_string = self._get_retrieved_docs(run)
docs_string = f"Reference docs:\n<DOCS>\n{docs_string}\n</DOCS>\n\n"
input_query = run.inputs["question"]
if run.outputs is not None and len(run.outputs) == 1:
prediction = next(iter(run.outputs.values()))
else:
prediction = run.outputs["output"]
result = self.evaluator.evaluate_strings(
input=input_query,
prediction=prediction,
reference=docs_string,
)
return EvaluationResult(
**{"key": "faithfulness", "comment": result.get("reasoning"), **result}
)
except Exception as e:
return EvaluationResult(key="faithfulness", score=None, comment=repr(e))


_ACCURACY_CRITERION = {
"accuracy": """
Score 1: The answer is incorrect and unrelated to the question or reference document.
Score 3: The answer shows slight relevance to the question or reference document but is largely incorrect.
Score 5: The answer is partially correct but has significant errors or omissions.
Score 7: The answer is mostly correct with minor errors or omissions, and aligns with the reference document.
Score 10: The answer is correct, complete, and perfectly aligns with the reference document.

If the reference answer contains multiple alternatives, the predicted answer must only match one of the alternatives to be considered correct.
If the predicted answer contains additional helpful and accurate information that is not present in the reference answer, it should still be considered correct.
""" # noqa
}


def get_eval_config() -> RunEvalConfig:
"""Returns the evaluator for the environment."""
eval_llm = ChatOpenAI(model="gpt-4", temperature=0.0, model_kwargs={"seed": 42})
# Use a longer-context LLM to check documents
faithfulness_eval_llm = ChatOpenAI(
model="gpt-4-1106-preview", temperature=0.0, model_kwargs={"seed": 42}
)

return RunEvalConfig(
evaluators=[
RunEvalConfig.LabeledScoreString(
criteria=_ACCURACY_CRITERION, llm=eval_llm, normalize_by=10.0
),
RunEvalConfig.EmbeddingDistance(),
],
custom_evaluators=[FaithfulnessEvaluator(llm=faithfulness_eval_llm)],
)
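
As a usage sketch (not part of the diff), the config returned by get_eval_config can be handed to LangSmith's run_on_dataset; the stub chain factory and the dataset name below are hypothetical placeholders:

# Sketch under assumptions: benchmark a RAG chain against the docs dataset using
# the evaluators defined above. The stub chain only echoes a fixed answer;
# replace it with a real retrieval chain.
from langchain.schema.runnable import RunnableLambda
from langchain.smith import run_on_dataset
from langsmith import Client

from langchain_benchmarks.rag import get_eval_config


def chain_factory():
    # Hypothetical stand-in for the RAG chain under test.
    return RunnableLambda(lambda inputs: {"output": "stub answer"})


run_on_dataset(
    client=Client(),
    dataset_name="LangChain Docs Q&A",  # assumes the public dataset was cloned
    llm_or_chain_factory=chain_factory,
    evaluation=get_eval_config(),
)
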
Binary file not shown.
7 changes: 7 additions & 0 deletions langchain_benchmarks/rag/tasks/__init__.py
@@ -0,0 +1,7 @@
from langchain_benchmarks.rag.tasks.langchain_docs.task import LANGCHAIN_DOCS_TASK
from langchain_benchmarks.rag.tasks.semi_structured_earnings.task import (
SEMI_STRUCTURED_EARNINGS_TASK,
)

# Please keep this sorted
__all__ = ["LANGCHAIN_DOCS_TASK", "SEMI_STRUCTURED_EARNINGS_TASK"]
10 changes: 10 additions & 0 deletions langchain_benchmarks/rag/tasks/langchain_docs/README.md
@@ -0,0 +1,10 @@
# LangChain Docs Task

This code contains utilities to scrape the LangChain docs (already run) and index them
using common techniques. The docs were scraped with the code in `_ingest_docs.py` and
uploaded to GCS. To better compare retrieval techniques, we hold the scraped docs constant
and pull from that cache whenever generating the different indices.

The content in `indexing` composes some common indexing strategies with default parameters for
benchmarking on the LangChain docs.
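
Illustrative only: a sketch of one such indexing strategy built from stock LangChain components. How the cached docs are fetched from GCS is repo-specific, so `docs` below is assumed to already hold the scraped pages:

# Sketch under assumptions: chunk the scraped docs, index them in Chroma with
# default-ish parameters, and expose the index as a retriever.
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Stand-in for docs loaded from the GCS cache in the real pipeline.
docs = [Document(page_content="LangChain is a framework ...", metadata={"source": "docs"})]

splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
chunks = splitter.split_documents(docs)

retriever = Chroma.from_documents(chunks, OpenAIEmbeddings()).as_retriever()
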
8 changes: 8 additions & 0 deletions langchain_benchmarks/rag/tasks/langchain_docs/__init__.py
@@ -0,0 +1,8 @@
from langchain_benchmarks.rag.tasks.langchain_docs import architectures, indexing
from langchain_benchmarks.rag.tasks.langchain_docs.task import LANGCHAIN_DOCS_TASK

DATASET_ID = (
"452ccafc-18e1-4314-885b-edd735f17b9d" # ID of public LangChain Docs dataset
)

__all__ = ["architectures", "indexing", "DATASET_ID", "LANGCHAIN_DOCS_TASK"]