Remove pythonrepl from LLM-MathChain (#2943)

Use `numexpr.evaluate` instead of the Python REPL to prevent malicious code injection. Tested against the (limited) math dataset; the score is the same as before. More permissive tools (such as the REPL tool itself) will need other safeguards (some combination of a sanitizer, RestrictedPython, an unprivileged Docker container, ...), but a calculator tool should only ever accept mathematical expressions. See langchain-ai/langchain#814
wertycn · Apr 16, 2023 · 22af90a · 22af90a
1 parent 4be363e
commit 22af90a
Showing 1 changed file with 67 additions and 120 deletions.
diff --git a/use_cases/evaluation/agent_benchmarking.ipynb b/use_cases/evaluation/agent_benchmarking.ipynb
@@ -14,9 +14,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "46bf9205",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "# Comment this out if you are NOT using tracing\n",
@@ -35,32 +37,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "5b2d5e98",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Found cached dataset json (/Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--agent-search-calculator-8a025c0ce5fb99d2/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "3a275586643f4ccfba1a8d54be28c351",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "from langchain.evaluation.loading import load_dataset\n",
     "dataset = load_dataset(\"agent-search-calculator\")"
@@ -77,9 +59,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "id": "c18680b5",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from langchain.llms import OpenAI\n",
@@ -88,7 +72,7 @@
     "from langchain.agents import AgentType\n",
     "\n",
     "tools = load_tools(['serpapi', 'llm-math'], llm=OpenAI(temperature=0))\n",
-    "agent = initialize_agent(tools, OpenAI(temperature=0), agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION)\n"
+    "agent = initialize_agent(tools, OpenAI(temperature=0), agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)"
    ]
   },
   {
@@ -103,22 +87,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "cbcafc92",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'38,630,316 people live in Canada as of 2023.'"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
+    "print(dataset[0]['question'])\n",
     "agent.run(dataset[0]['question'])"
    ]
   },
@@ -133,18 +109,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
+   "id": "bbbbb20e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "agent.run(dataset[4]['question'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "id": "24b4c66e",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer')).\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "predictions = []\n",
     "predicted_dataset = []\n",
@@ -154,7 +136,8 @@
     "    try:\n",
     "        predictions.append(agent(new_data))\n",
     "        predicted_dataset.append(new_data)\n",
-    "    except Exception:\n",
+    "    except Exception as e:\n",
+    "        predictions.append({\"output\": str(e), **new_data})\n",
     "        error_dataset.append(new_data)"
    ]
   },
@@ -169,25 +152,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "id": "1d583f03",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'input': 'How many people live in canada as of 2023?',\n",
-       " 'answer': 'approximately 38,625,801',\n",
-       " 'output': '38,630,316 people live in Canada as of 2023.',\n",
-       " 'intermediate_steps': [(AgentAction(tool='Search', tool_input='Population of Canada 2023', log=' I need to find population data\\nAction: Search\\nAction Input: Population of Canada 2023'),\n",
-       "   '38,630,316')]}"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "predictions[0]"
    ]
@@ -202,19 +172,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "id": "d0a9341d",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from langchain.evaluation.qa import QAEvalChain"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "id": "1612dec1",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "llm = OpenAI(temperature=0)\n",
@@ -232,9 +206,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "id": "2a689df5",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "for i, prediction in enumerate(predictions):\n",
@@ -243,21 +219,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
    "id": "27b61215",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Counter({' CORRECT': 4, ' INCORRECT': 6})"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "from collections import Counter\n",
     "Counter([pred['grade'] for pred in predictions])"
@@ -273,7 +240,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "id": "47c692a1",
    "metadata": {},
    "outputs": [],
@@ -283,38 +250,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "id": "0ef976c1",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'input': \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\",\n",
-       " 'answer': 'her boyfriend is Romain Gravas. his age raised to the .43 power is approximately 4.9373857399466665',\n",
-       " 'output': \"Isaac Carew, Dua Lipa's boyfriend, is 36 years old and his age raised to the .43 power is 4.6688516567750975.\",\n",
-       " 'intermediate_steps': [(AgentAction(tool='Search', tool_input=\"Dua Lipa's boyfriend\", log=' I need to find out who Dua Lipa\\'s boyfriend is and then calculate his age raised to the .43 power\\nAction: Search\\nAction Input: \"Dua Lipa\\'s boyfriend\"'),\n",
-       "   'Dua and Isaac, a model and a chef, dated on and off from 2013 to 2019. The two first split in early 2017, which is when Dua went on to date LANY ...'),\n",
-       "  (AgentAction(tool='Search', tool_input='Isaac Carew age', log=' I need to find out Isaac\\'s age\\nAction: Search\\nAction Input: \"Isaac Carew age\"'),\n",
-       "   '36 years'),\n",
-       "  (AgentAction(tool='Calculator', tool_input='36^.43', log=' I need to calculate 36 raised to the .43 power\\nAction: Calculator\\nAction Input: 36^.43'),\n",
-       "   'Answer: 4.6688516567750975\\n')],\n",
-       " 'grade': ' INCORRECT'}"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "incorrect[0]"
+    "incorrect"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "7710401a",
+   "id": "3eb948cf-f767-4c87-a12d-275b66eef407",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -336,7 +283,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.11.2"
   }
  },
  "nbformat": 4,