Skip to content

Commit

Permalink
Remove pythonrepl from LLM-MathChain (#2943)
Browse files Browse the repository at this point in the history
Use `numexpr.evaluate` instead of the Python REPL to avoid malicious code
injection.

Tested against the (limited) math dataset and got the same score as
before.

For more permissive tools (like the REPL tool itself), other approaches
ought to be provided (some combination of a sanitizer + restricted Python
+ an unprivileged Docker container + ...), but for a calculator tool, only
mathematical expressions should be permitted.

See #814
  • Loading branch information
vowelparrot authored Apr 16, 2023
1 parent 2a0f65f commit 5ca7ce7
Show file tree
Hide file tree
Showing 6 changed files with 251 additions and 230 deletions.
187 changes: 67 additions & 120 deletions docs/use_cases/evaluation/agent_benchmarking.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "46bf9205",
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Comment this out if you are NOT using tracing\n",
Expand All @@ -35,32 +37,12 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "5b2d5e98",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Found cached dataset json (/Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--agent-search-calculator-8a025c0ce5fb99d2/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3a275586643f4ccfba1a8d54be28c351",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation.loading import load_dataset\n",
"dataset = load_dataset(\"agent-search-calculator\")"
Expand All @@ -77,9 +59,11 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"id": "c18680b5",
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.llms import OpenAI\n",
Expand All @@ -88,7 +72,7 @@
"from langchain.agents import AgentType\n",
"\n",
"tools = load_tools(['serpapi', 'llm-math'], llm=OpenAI(temperature=0))\n",
"agent = initialize_agent(tools, OpenAI(temperature=0), agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION)\n"
"agent = initialize_agent(tools, OpenAI(temperature=0), agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)"
]
},
{
Expand All @@ -103,22 +87,14 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"id": "cbcafc92",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'38,630,316 people live in Canada as of 2023.'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"print(dataset[0]['question'])\n",
"agent.run(dataset[0]['question'])"
]
},
Expand All @@ -133,18 +109,24 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"id": "bbbbb20e",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"agent.run(dataset[4]['question'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24b4c66e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer')).\n"
]
}
],
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"predictions = []\n",
"predicted_dataset = []\n",
Expand All @@ -154,7 +136,8 @@
" try:\n",
" predictions.append(agent(new_data))\n",
" predicted_dataset.append(new_data)\n",
" except Exception:\n",
" except Exception as e:\n",
" predictions.append({\"output\": str(e), **new_data})\n",
" error_dataset.append(new_data)"
]
},
Expand All @@ -169,25 +152,12 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"id": "1d583f03",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'input': 'How many people live in canada as of 2023?',\n",
" 'answer': 'approximately 38,625,801',\n",
" 'output': '38,630,316 people live in Canada as of 2023.',\n",
" 'intermediate_steps': [(AgentAction(tool='Search', tool_input='Population of Canada 2023', log=' I need to find population data\\nAction: Search\\nAction Input: Population of Canada 2023'),\n",
" '38,630,316')]}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"predictions[0]"
]
Expand All @@ -202,19 +172,23 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"id": "d0a9341d",
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation.qa import QAEvalChain"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"id": "1612dec1",
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"llm = OpenAI(temperature=0)\n",
Expand All @@ -232,9 +206,11 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"id": "2a689df5",
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"for i, prediction in enumerate(predictions):\n",
Expand All @@ -243,21 +219,12 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"id": "27b61215",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Counter({' CORRECT': 4, ' INCORRECT': 6})"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from collections import Counter\n",
"Counter([pred['grade'] for pred in predictions])"
Expand All @@ -273,7 +240,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"id": "47c692a1",
"metadata": {},
"outputs": [],
Expand All @@ -283,38 +250,18 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": null,
"id": "0ef976c1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'input': \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\",\n",
" 'answer': 'her boyfriend is Romain Gravas. his age raised to the .43 power is approximately 4.9373857399466665',\n",
" 'output': \"Isaac Carew, Dua Lipa's boyfriend, is 36 years old and his age raised to the .43 power is 4.6688516567750975.\",\n",
" 'intermediate_steps': [(AgentAction(tool='Search', tool_input=\"Dua Lipa's boyfriend\", log=' I need to find out who Dua Lipa\\'s boyfriend is and then calculate his age raised to the .43 power\\nAction: Search\\nAction Input: \"Dua Lipa\\'s boyfriend\"'),\n",
" 'Dua and Isaac, a model and a chef, dated on and off from 2013 to 2019. The two first split in early 2017, which is when Dua went on to date LANY ...'),\n",
" (AgentAction(tool='Search', tool_input='Isaac Carew age', log=' I need to find out Isaac\\'s age\\nAction: Search\\nAction Input: \"Isaac Carew age\"'),\n",
" '36 years'),\n",
" (AgentAction(tool='Calculator', tool_input='36^.43', log=' I need to calculate 36 raised to the .43 power\\nAction: Calculator\\nAction Input: 36^.43'),\n",
" 'Answer: 4.6688516567750975\\n')],\n",
" 'grade': ' INCORRECT'}"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"incorrect[0]"
"incorrect"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7710401a",
"id": "3eb948cf-f767-4c87-a12d-275b66eef407",
"metadata": {},
"outputs": [],
"source": []
Expand All @@ -336,7 +283,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
"version": "3.11.2"
}
},
"nbformat": 4,
Expand Down
Loading

0 comments on commit 5ca7ce7

Please sign in to comment.