diff --git a/use_cases/evaluation/agent_benchmarking.ipynb b/use_cases/evaluation/agent_benchmarking.ipynb index 342cfa6..4c68b9c 100644 --- a/use_cases/evaluation/agent_benchmarking.ipynb +++ b/use_cases/evaluation/agent_benchmarking.ipynb @@ -14,9 +14,11 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "46bf9205", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Comment this out if you are NOT using tracing\n", @@ -35,32 +37,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "5b2d5e98", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset json (/Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--agent-search-calculator-8a025c0ce5fb99d2/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3a275586643f4ccfba1a8d54be28c351", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00._completion_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer')).\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "predictions = []\n", "predicted_dataset = []\n", @@ -154,7 +136,8 @@ " try:\n", " predictions.append(agent(new_data))\n", " predicted_dataset.append(new_data)\n", - " except Exception:\n", + " except Exception as e:\n", + " predictions.append({\"output\": str(e), **new_data})\n", " error_dataset.append(new_data)" ] }, @@ -169,25 +152,12 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "1d583f03", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'input': 'How many people live in canada as of 2023?',\n", - " 'answer': 'approximately 38,625,801',\n", - " 'output': '38,630,316 people live in Canada as of 2023.',\n", - " 'intermediate_steps': [(AgentAction(tool='Search', tool_input='Population of Canada 2023', log=' I need to find population data\\nAction: Search\\nAction Input: Population of Canada 2023'),\n", - " '38,630,316')]}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "predictions[0]" ] @@ -202,9 +172,11 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "d0a9341d", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from langchain.evaluation.qa import QAEvalChain" @@ -212,9 +184,11 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "1612dec1", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "llm = OpenAI(temperature=0)\n", @@ -232,9 +206,11 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "2a689df5", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "for i, prediction in enumerate(predictions):\n", @@ -243,21 +219,12 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "27b61215", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Counter({' CORRECT': 4, ' INCORRECT': 6})" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "from collections import Counter\n", "Counter([pred['grade'] for pred in predictions])" @@ -273,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "47c692a1", "metadata": {}, "outputs": [], @@ -283,38 +250,18 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "0ef976c1", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'input': \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\",\n", - " 'answer': 'her boyfriend is Romain Gravas. his age raised to the .43 power is approximately 4.9373857399466665',\n", - " 'output': \"Isaac Carew, Dua Lipa's boyfriend, is 36 years old and his age raised to the .43 power is 4.6688516567750975.\",\n", - " 'intermediate_steps': [(AgentAction(tool='Search', tool_input=\"Dua Lipa's boyfriend\", log=' I need to find out who Dua Lipa\\'s boyfriend is and then calculate his age raised to the .43 power\\nAction: Search\\nAction Input: \"Dua Lipa\\'s boyfriend\"'),\n", - " 'Dua and Isaac, a model and a chef, dated on and off from 2013 to 2019. The two first split in early 2017, which is when Dua went on to date LANY ...'),\n", - " (AgentAction(tool='Search', tool_input='Isaac Carew age', log=' I need to find out Isaac\\'s age\\nAction: Search\\nAction Input: \"Isaac Carew age\"'),\n", - " '36 years'),\n", - " (AgentAction(tool='Calculator', tool_input='36^.43', log=' I need to calculate 36 raised to the .43 power\\nAction: Calculator\\nAction Input: 36^.43'),\n", - " 'Answer: 4.6688516567750975\\n')],\n", - " 'grade': ' INCORRECT'}" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "incorrect[0]" + "incorrect" ] }, { "cell_type": "code", "execution_count": null, - "id": "7710401a", + "id": "3eb948cf-f767-4c87-a12d-275b66eef407", "metadata": {}, "outputs": [], "source": [] @@ -336,7 +283,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.11.2" } }, "nbformat": 4,