langchain-ai · hinthornw · Nov 17, 2023 · Nov 18, 2023 · Nov 20, 2023 · Nov 20, 2023
diff --git a/.gitignore b/.gitignore
@@ -159,4 +159,4 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-
+.DS_Store
diff --git a/docs/source/.gitignore b/docs/source/.gitignore
@@ -0,0 +1 @@
+chromadb/
diff --git a/docs/source/notebooks/rag.ipynb b/docs/source/notebooks/rag.ipynb
diff --git a/...hain-docs-benchmarking/packages/openai-functions-agent/openai_functions_agent/__init__.py b/...hain-docs-benchmarking/packages/openai-functions-agent/openai_functions_agent/__init__.py
@@ -1,3 +1,3 @@
-from openai_functions_agent.agent import agent_executor
+from openai_functions_agent.agent import agent_executor, create_executor
 
-__all__ = ["agent_executor"]
+__all__ = ["agent_executor", "create_executor"]
diff --git a/langchain-docs-benchmarking/packages/openai-functions-agent/openai_functions_agent/agent.py b/langchain-docs-benchmarking/packages/openai-functions-agent/openai_functions_agent/agent.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple
+from typing import List, Tuple, Optional
 
 from langchain.agents import AgentExecutor
 from langchain.agents.format_scratchpad import format_to_openai_functions
@@ -25,7 +25,6 @@ def search(query, callbacks=None):
 
 tools = [search]
 
-llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)
 assistant_system_message = """You are a helpful assistant tasked with answering technical questions about LangChain. \
 Use tools (only if necessary) to best answer the users questions. Do not make up information if you cannot find the answer using your tools."""
 prompt = ChatPromptTemplate.from_messages(
@@ -37,8 +36,6 @@ def search(query, callbacks=None):
     ]
 )
 
-llm_with_tools = llm.bind(functions=[format_tool_to_openai_function(t) for t in tools])
-
 
 def _format_chat_history(chat_history: List[Tuple[str, str]]):
     buffer = []
@@ -48,30 +45,11 @@ def _format_chat_history(chat_history: List[Tuple[str, str]]):
     return buffer
 
 
-agent = (
-    {
-        "input": lambda x: x["input"],
-        "chat_history": lambda x: _format_chat_history(x["chat_history"]),
-        "agent_scratchpad": lambda x: format_to_openai_functions(
-            x["intermediate_steps"]
-        ),
-    }
-    | prompt
-    | llm_with_tools
-    | OpenAIFunctionsAgentOutputParser()
-)
-
-
 class AgentInput(BaseModel):
     input: str
     chat_history: List[Tuple[str, str]] = Field(..., extra={"widget": {"type": "chat"}})
 
 
-agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False).with_types(
-    input_type=AgentInput
-)
-
-
 class ChainInput(BaseModel):
     question: str
 
@@ -80,6 +58,31 @@ def mapper(input: dict):
     return {"input": input["question"], "chat_history": []}
 
 
-agent_executor = (mapper | agent_executor | (lambda x: x["output"])).with_types(
-    input_type=ChainInput
-)
+def create_executor(model_config: Optional[dict] = None):
+    model = (model_config or {}).get("model", "gpt-3.5-turbo-16k")
+    llm = ChatOpenAI(model=model, temperature=0)
+    llm_with_tools = llm.bind(
+        functions=[format_tool_to_openai_function(t) for t in tools]
+    )
+
+    agent = (
+        {
+            "input": lambda x: x["input"],
+            "chat_history": lambda x: _format_chat_history(x["chat_history"]),
+            "agent_scratchpad": lambda x: format_to_openai_functions(
+                x["intermediate_steps"]
+            ),
+        }
+        | prompt
+        | llm_with_tools
+        | OpenAIFunctionsAgentOutputParser()
+    )
+    agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False).with_types(
+        input_type=AgentInput
+    )
+    return (mapper | agent_executor | (lambda x: x["output"])).with_types(
+        input_type=ChainInput
+    )
+
+
+agent_executor = create_executor()
diff --git a/langchain-docs-benchmarking/prepare_dataset.py b/langchain-docs-benchmarking/prepare_dataset.py
@@ -1,19 +1,30 @@
 """Copy the public dataset to your own langsmith tenant."""
+from typing import Optional
 from langsmith import Client
 from tqdm import tqdm
 
 DATASET_NAME = "LangChain Docs Q&A"
 PUBLIC_DATASET_TOKEN = "452ccafc-18e1-4314-885b-edd735f17b9d"
-client = Client()
 
 
 def create_langchain_docs_dataset(
-    dataset_name: str = DATASET_NAME, public_dataset_token: str = PUBLIC_DATASET_TOKEN
+    dataset_name: str = DATASET_NAME,
+    public_dataset_token: str = PUBLIC_DATASET_TOKEN,
+    client: Optional[Client] = None,
 ):
+    shared_client = Client(
+        api_url="https://api.smith.langchain.com", api_key="placeholder"
+    )
+    examples = list(shared_client.list_shared_examples(public_dataset_token))
+    client = client or Client()
     if client.has_dataset(dataset_name=dataset_name):
-        return
-    ds = client.create_dataset(dataset_name=dataset_name)
-    examples = tqdm(list(client.list_shared_examples(public_dataset_token)))
+        loaded_examples = list(client.list_examples(dataset_name=dataset_name))
+        if len(loaded_examples) == len(examples):
+            return
+        else:
+            ds = client.read_dataset(dataset_name=dataset_name)
+    else:
+        ds = client.create_dataset(dataset_name=dataset_name)
     client.create_examples(
         inputs=[e.inputs for e in examples],
         outputs=[e.outputs for e in examples],
@@ -23,4 +34,24 @@ def create_langchain_docs_dataset(
 
 
 if __name__ == "__main__":
-    create_langchain_docs_dataset()
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--target-api-key", type=str, required=False)
+    parser.add_argument("--target-endpoint", type=str, required=False)
+    parser.add_argument("--dataset-name", type=str, default=DATASET_NAME)
+    parser.add_argument(
+        "--public-dataset-token", type=str, default=PUBLIC_DATASET_TOKEN
+    )
+    args = parser.parse_args()
+    client = None
+    if args.target_api_key or args.target_endpoint:
+        client = Client(
+            api_key=args.target_api_key,
+            api_url=args.target_endpoint,
+        )
+    create_langchain_docs_dataset(
+        dataset_name=args.dataset_name,
+        public_dataset_token=args.public_dataset_token,
+        client=client,
+    )
diff --git a/langchain-docs-benchmarking/run_evals.py b/langchain-docs-benchmarking/run_evals.py
@@ -1,6 +1,5 @@
 import argparse
 import importlib.util
-import os
 import sys
 import uuid
 from functools import partial
@@ -13,7 +12,7 @@
 from langchain.smith import RunEvalConfig, run_on_dataset
 from langsmith import Client
 from oai_assistant.chain import agent_executor as openai_assistant_chain
-from openai_functions_agent import agent_executor as openai_functions_agent_chain
+from openai_functions_agent import create_executor
 
 ls_client = Client()
 
@@ -33,7 +32,7 @@ def _get_chain_factory(arch: str) -> Callable:
     _map = {
         "chat": create_chain,
         "anthropic-iterative-search": lambda _: anthropic_agent_chain,
-        "openai-functions-agent": lambda _: openai_functions_agent_chain,
+        "openai-functions-agent": create_executor,
         "openai-assistant": lambda _: openai_assistant_chain,
     }
     if arch in _map:
@@ -93,8 +92,7 @@ def main(
     run_on_dataset(
         client=ls_client,
         dataset_name=dataset_name,
-        llm_or_chain_factory=partial(
-            create_runnable,
+        llm_or_chain_factory=lambda: create_runnable(
             arch=arch,
             model_config=model_config,
             retry_config=retry_config,

diff --git a/langchain-docs-benchmarking/run_experiments.py b/langchain-docs-benchmarking/run_experiments.py
@@ -6,9 +6,14 @@
 
 experiments = [
     {
-        # "server_url": "http://localhost:1983/openai-functions-agent",
         "arch": "openai-functions-agent",
         "project_name": "openai-functions-agent",
+        "model_config": {"model": "gpt-3.5-turbo-16k"},
+    },
+    {
+        "arch": "openai-functions-agent",
+        "project_name": "oaifunc-agent-gpt-4-1106",
+        "model_config": {"model": "gpt-4-1106-preview"},
     },
     {
         # "server_url": "http://localhost:1983/anthropic_chat",
@@ -41,9 +46,17 @@
         "arch": "chat",
         "model_config": {
             "chat_cls": "ChatFireworks",
-            "model": "accounts/fireworks/models/llama-v2-34b-code-instruct-w8a16",
+            "model": "accounts/fireworks/models/llama-v2-34b-code-instruct",
+        },
+        "project_name": "llama-v2-34b-code-instruct",
+    },
+    {
+        "arch": "chat",
+        "model_config": {
+            "chat_cls": "ChatFireworks",
+            "model": "accounts/fireworks/models/llama-v2-70b-chat",
         },
-        "project_name": "llama-v2-34b-code-instruct-w8a16",
+        "project_name": "llama-70b-chat",
     },
     {
         "arch": "chat",
@@ -120,6 +133,7 @@
         ]
 
     for experiment in selected_experiments:
+        print("Running experiment:", experiment)
         main(
             **experiment,
             dataset_name=args.dataset_name,

diff --git a/langchain_benchmarks/.gitignore b/langchain_benchmarks/.gitignore
@@ -0,0 +1 @@
+.sql
diff --git a/langchain_benchmarks/rag/__init__.py b/langchain_benchmarks/rag/__init__.py
@@ -0,0 +1,6 @@
+"""RAG environments."""
+from langchain_benchmarks.rag.evaluators import RAG_EVALUATION
+from langchain_benchmarks.rag.registration import registry
+
+# Please keep this list sorted!
+__all__ = ["registry", "RAG_EVALUATION"]
diff --git a/langchain_benchmarks/rag/environments/__init__.py b/langchain_benchmarks/rag/environments/__init__.py
diff --git a/langchain_benchmarks/rag/environments/langchain_docs/README.md b/langchain_benchmarks/rag/environments/langchain_docs/README.md
@@ -0,0 +1,6 @@
+# LangChain Docs Environment
+
+This code contains utilities to scrape the LangChain docs (a step that has already been
+run) and to index them using common techniques. The docs were scraped using the code in
+`_ingest_docs.py` and uploaded to GCS. To compare retrieval techniques fairly, we hold the
+scraped docs constant and pull from that cache whenever generating different indices.
diff --git a/langchain_benchmarks/rag/environments/langchain_docs/__init__.py b/langchain_benchmarks/rag/environments/langchain_docs/__init__.py
@@ -0,0 +1 @@
+DATASET_ID = "452ccafc-18e1-4314-885b-edd735f17b9d"  # ID of public LangChain Docs dataset
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		DATASET_ID = "452ccafc-18e1-4314-885b-edd735f17b9d" # ID of public LangChain Docs dataset