
Commit eb2d9e2

Update notebooks, model registry and make release (#131)
see release notes
1 parent: 09d2145 · commit: eb2d9e2

File tree: 6 files changed (+152 −25 lines)


docs/source/notebooks/retrieval/semi_structured_benchmarking/ss_eval_chunk_sizes.ipynb

Lines changed: 7 additions & 7 deletions
@@ -118,17 +118,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from langchain.callbacks.manager import CallbackManager\n",
+    "from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler\n",
+    "from langchain.chat_models import ChatFireworks, ChatOpenAI\n",
     "from langchain.document_loaders import PyPDFLoader\n",
-    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
-    "from langchain.vectorstores import Chroma\n",
     "from langchain.embeddings import OpenAIEmbeddings\n",
-    "from langchain.chat_models import ChatOpenAI\n",
     "from langchain.prompts import ChatPromptTemplate\n",
     "from langchain.schema.output_parser import StrOutputParser\n",
     "from langchain.schema.runnable import RunnablePassthrough\n",
-    "from langchain.chat_models import ChatFireworks\n",
-    "from langchain.callbacks.manager import CallbackManager\n",
-    "from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler\n",
+    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+    "from langchain.vectorstores import Chroma\n",
     "\n",
     "\n",
     "def load_and_split(file, token_count, split_document=True):\n",
@@ -262,8 +261,9 @@
    "outputs": [],
    "source": [
     "import uuid\n",
-    "from langsmith.client import Client\n",
+    "\n",
     "from langchain.smith import RunEvalConfig\n",
+    "from langsmith.client import Client\n",
     "\n",
     "# Config\n",
     "client = Client()\n",

docs/source/notebooks/retrieval/semi_structured_benchmarking/ss_eval_long_context.ipynb

Lines changed: 4 additions & 4 deletions
@@ -160,8 +160,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from langchain.chat_models import ChatOpenAI\n",
-    "from langchain.chat_models import ChatAnthropic\n",
+    "from langchain.chat_models import ChatAnthropic, ChatOpenAI\n",
     "from langchain.prompts import ChatPromptTemplate\n",
     "from langchain.schema.output_parser import StrOutputParser\n",
     "from langchain.schema.runnable import RunnablePassthrough\n",
@@ -221,8 +220,9 @@
    "outputs": [],
    "source": [
     "import uuid\n",
-    "from langsmith.client import Client\n",
+    "\n",
     "from langchain.smith import RunEvalConfig\n",
+    "from langsmith.client import Client\n",
     "\n",
     "# Config\n",
     "client = Client()\n",
@@ -281,8 +281,8 @@
    }
   ],
   "source": [
-    "import numpy as np\n",
     "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
     "\n",
     "\n",
     "def find_all_phrase_locations(phrases, text):\n",

docs/source/notebooks/retrieval/semi_structured_benchmarking/ss_eval_multi_vector.ipynb

Lines changed: 2 additions & 1 deletion
@@ -380,8 +380,9 @@
    "outputs": [],
    "source": [
     "import uuid\n",
-    "from langsmith.client import Client\n",
+    "\n",
     "from langchain.smith import RunEvalConfig\n",
+    "from langsmith.client import Client\n",
     "\n",
     "# Config\n",
     "client = Client()\n",

docs/source/notebooks/tool_usage/benchmark_all_tasks.ipynb

Lines changed: 129 additions & 12 deletions
@@ -14,8 +14,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "ddadb9ef-e76a-4b48-85e4-f62c3957f502",
+   "execution_count": 1,
+   "id": "13a7483b-d08f-49fa-83da-619863171e5b",
    "metadata": {
     "tags": []
    },
@@ -24,10 +24,114 @@
     "import datetime\n",
     "import uuid\n",
     "\n",
+    "from langchain.globals import set_verbose\n",
     "from langsmith.client import Client\n",
     "\n",
-    "from langchain_benchmarks import clone_public_dataset, registry\n",
-    "from langchain_benchmarks.tool_usage import agents"
+    "from langchain_benchmarks import (\n",
+    "    __version__,\n",
+    "    clone_public_dataset,\n",
+    "    model_registry,\n",
+    "    registry,\n",
+    ")\n",
+    "from langchain_benchmarks.rate_limiting import RateLimiter\n",
+    "from langchain_benchmarks.tool_usage.agents import (\n",
+    "    CustomAgentFactory,\n",
+    "    OpenAIAgentFactory,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "50bbe23b-a3b1-4607-929d-ea6e88b7085e",
+   "metadata": {},
+   "source": [
+    "Prior to starting the tests, you may want to verify\n",
+    "that the task that you're working with and the models are properly defined."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "adfbcaa9-349c-4223-89be-4abff9cf76ff",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'input': \"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \\nWrite down your answer, but do not explain it. Input: `abc`\",\n",
+       " 'output': ' Thank you for the input and for confirming the output of each letter I printed. I simply followed the instructions to repeat the given string \"abc\" by printing one letter at a time using the provided \"type_letter\" tool without any additional explanations. Please let me know if you need me to repeat this process with a different input string.',\n",
+       " 'intermediate_steps': [(AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'a'}, log=\"\\nInvoking type_letter: {'letter': 'a'}\\n\\t\", message_log=[AIMessage(content='<tool>{\\n \"tool_name\": \"type_letter\",\\n \"arguments\": {\\n \"letter\": \"a\"\\n }\\n}</tool>\\n')]),\n",
+       "  'OK'),\n",
+       " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'b'}, log=\"\\nInvoking type_letter: {'letter': 'b'}\\n\\t\", message_log=[AIMessage(content='<tool>{\\n \"tool_name\": \"type_letter\",\\n \"arguments\": {\\n \"letter\": \"b\"\\n }\\n}</tool>\\n')]),\n",
+       "  'OK'),\n",
+       " (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'c'}, log=\"\\nInvoking type_letter: {'letter': 'c'}\\n\\t\", message_log=[AIMessage(content='<tool>{\\n \"tool_name\": \"type_letter\",\\n \"arguments\": {\\n \"letter\": \"c\"\\n }\\n}</tool>\\n')]),\n",
+       "  'OK')],\n",
+       " 'state': 'abc'}"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "task = registry[\"Tool Usage - Typewriter (1 tool)\"]\n",
+    "agent_factory = CustomAgentFactory(task, \"claude-2.1\")\n",
+    "\n",
+    "agent_factory().invoke({\"question\": \"abc\"})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "65b32e7d-3986-4461-8a3b-8e9b6d4008cb",
+   "metadata": {},
+   "source": [
+    "Define the test cases"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "26d390b6-9ade-424c-aabb-d450f52ed121",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "tests = [\n",
+    "    # 2-tuple of (architecture, model name)\n",
+    "    (\"xml\", \"mixtral-8x7b-instruct-fw\"),\n",
+    "    (\"xml\", \"claude-2.1\"),\n",
+    "    (\"xml\", \"claude-2\"),\n",
+    "    (\"xml\", \"yi-34b-200k-fw\"),\n",
+    "    (\"xml\", \"llama-v2-70b-chat-fw\"),\n",
+    "    (\"xml\", \"llama-v2-13b-chat-fw\"),\n",
+    "    (\"openai_functions\", \"gpt-3.5-turbo-1106\"),\n",
+    "    (\"openai_functions\", \"gpt-3.5-turbo-0613\"),\n",
+    "    (\"openai_functions\", \"gpt-4-1106-preview\"),\n",
+    "    (\"openai_functions\", \"gpt-4-0613\"),\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b55b7c24-8b4d-4bd7-8b00-365fbe61897f",
+   "metadata": {},
+   "source": [
+    "## Run"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "a415dd82-2e70-4173-a3f3-8e1aac60db9e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "experiment_uuid = uuid.uuid4().hex[:4]"
    ]
   },
   {
@@ -39,38 +143,51 @@
    },
    "outputs": [],
    "source": [
-    "experiment_uuid = uuid.uuid4().hex[:4]\n",
-    "models = [\"gpt-3.5-turbo-16k\"]\n",
     "client = Client()  # Launch langsmith client for cloning datasets\n",
     "today = datetime.date.today().isoformat()\n",
+    "rate_limiter = RateLimiter(requests_per_second=1)\n",
     "\n",
     "for task in registry:\n",
     "    dataset_name = task.name + f\"_benchmarking_{today}\"\n",
     "    clone_public_dataset(task.dataset_id, dataset_name=dataset_name)\n",
     "\n",
     "    if task.type != \"ToolUsageTask\":\n",
     "        continue\n",
-    "    for model in models:\n",
+    "\n",
+    "    for arch, model in tests:\n",
     "        print()\n",
-    "        print(f\"Benchmarking {task.name} with model: {model}\")\n",
+    "        print(f\"Benchmarking {task.name} with model: {model} and arch: {arch}\")\n",
     "        eval_config = task.get_eval_config()\n",
-    "        agent_factory = agents.OpenAIAgentFactory(task, model=model)\n",
+    "\n",
+    "        if arch == \"openai_functions\":\n",
+    "            agent_factory = OpenAIAgentFactory(\n",
+    "                task, model=model, rate_limiter=rate_limiter\n",
+    "            )\n",
+    "        elif arch == \"xml\":\n",
+    "            agent_factory = CustomAgentFactory(\n",
+    "                task, model=model, rate_limiter=rate_limiter\n",
+    "            )\n",
+    "        else:\n",
+    "            raise ValueError()\n",
     "\n",
     "        client.run_on_dataset(\n",
     "            dataset_name=dataset_name,\n",
     "            llm_or_chain_factory=agent_factory,\n",
     "            evaluation=eval_config,\n",
     "            verbose=False,\n",
-    "            project_name=f\"{dataset_name}-{model}-{experiment_uuid}\",\n",
+    "            project_name=f\"{model}{experiment_uuid}\",\n",
     "            tags=[model],\n",
-    "            concurrency_level=1,\n",
+    "            concurrency_level=5,\n",
     "            project_metadata={\n",
     "                \"model\": model,\n",
     "                \"id\": experiment_uuid,\n",
     "                \"task\": task.name,\n",
     "                \"date\": today,\n",
+    "                \"langchain_benchmarks_version\": __version__,\n",
+    "                \"arch\": arch,\n",
     "            },\n",
-    "        )"
+    "        )\n",
+    "        break"
    ]
   }
  ],
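Since the new `tests` matrix drives agent construction by model name, a small pre-flight check can catch typos before any runs launch. This is a hypothetical addition, not part of the commit; it assumes `model_registry` can be iterated for `RegisteredModel` entries with a `.name` attribute, analogous to the task `registry` iterated in the loop above:

# Hypothetical pre-flight check: every model named in `tests` should resolve
# in the model registry before benchmark runs are kicked off.
from langchain_benchmarks import model_registry

tests = [
    # A subset of the matrix from the notebook above.
    ("xml", "claude-2.1"),
    ("xml", "yi-34b-200k-fw"),
    ("openai_functions", "gpt-4-0613"),
]

registered = {m.name for m in model_registry}  # assumes the registry is iterable
missing = [model for _, model in tests if model not in registered]
if missing:
    raise ValueError(f"Models missing from the registry: {missing}")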

langchain_benchmarks/model_registration.py

Lines changed: 9 additions & 0 deletions
@@ -192,6 +192,15 @@
             "model": "accounts/fireworks/models/llama-v2-70b-chat",
         },
     ),
+    RegisteredModel(
+        provider="fireworks",
+        name="yi-34b-200k-fw",
+        type="llm",
+        description="34B LLM from 01.ai, with a 200k-token context window.",
+        params={
+            "model": "accounts/fireworks/models/yi-34b-200k",
+        },
+    ),
     RegisteredModel(
         provider="fireworks",
         name="mixtral-8x7b-instruct-fw",

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langchain-benchmarks"
-version = "0.0.8"
+version = "0.0.9"
 description = "🦜💪 Flex those feathers!"
 authors = ["LangChain AI"]
 license = "MIT"
