Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into pr/63
Browse files Browse the repository at this point in the history
  • Loading branch information
deepak-akkil committed Jul 4, 2024
2 parents e8f0855 + 896d23c commit df8da54
Show file tree
Hide file tree
Showing 46 changed files with 4,325 additions and 718 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -155,4 +155,7 @@ cython_debug/
ae/log_files/*
ae/temp/*
test/logs/*
test/results/*
test/results/*
Pipfile.lock
requirements.txt
Pipfile
35 changes: 34 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ This provides a natural language way to interacting with a web browser:
- Manage and automate tasks on project management platforms (like JIRA) by filtering issues, easing the workflow for users.
- Provide personal shopping assistance, suggesting products based on the user's needs, such as storage options for game cards.

While Agent-E is growing, it is already equipped to handle a versatile range of tasks, but the best task is the one that you come up with. So, take it for a spin and tell us what you were able to do with it. For more information see our [blog article](https://blog.emergence.ai/2024/03/28/distilling-the-web-agent.html).
While Agent-E is growing, it is already equipped to handle a versatile range of tasks, but the best task is the one that you come up with. So, take it for a spin and tell us what you were able to do with it. For more information see our [blog article](https://www.emergence.ai/blog/distilling-the-web-for-multi-agent-automation).


## Quick Start
Expand Down Expand Up @@ -156,6 +156,39 @@ html_theme = 'sphinx_rtd_theme'
7. Build the documentation, from `docs` directory, run: `sphinx-build -b html . _build`


## Open-source models

Using open-source models is possible through LiteLLM with Ollama. Ollama allows users to run language models locally on their machines, and LiteLLM translates OpenAI-format inputs to local models' endpoints. To use open-source models as the Agent-E backbone, follow the steps below:

1. Install LiteLLM
```bash
pip install 'litellm[proxy]'
```
2. Install Ollama
* For Mac and Windows, download [Ollama](https://ollama.com/download).
* For Linux:
```bash
curl -fsSL https://ollama.com/install.sh | sh
```
3. Pull Ollama models
Before you can use a model, you need to download it from the library. The list of available models is [here](https://ollama.com/library). Here, we use Mistral v0.3:
```bash
ollama pull mistral:v0.3
```
4. Run LiteLLM
To run the downloaded model with LiteLLM as a proxy, run:
```bash
litellm --model ollama_chat/mistral:v0.3
```
5. Configure model in Autogen
Configure the `.env` file as follows. Note that the model name and API key are not needed, because LiteLLM proxies all requests to the locally running model.
```bash
AUTOGEN_MODEL_NAME=NotRequired
AUTOGEN_MODEL_API_KEY=NotRequired
AUTOGEN_MODEL_BASE_URL=http://0.0.0.0:4000
```


## TODO

- Action verification - Responding from every skill with changes that took place in the DOM (Mutation Observers) so that the LLM can judge whether the skill did execute properly or not
Expand Down
2 changes: 1 addition & 1 deletion ae/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from ae import core
from ae import core # type: ignore # noqa: F401
2 changes: 1 addition & 1 deletion ae/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@

if not os.path.exists(PROJECT_TEMP_PATH):
os.makedirs(PROJECT_TEMP_PATH)
print(f"Created temp folder at: {PROJECT_TEMP_PATH}")
print(f"Created temp folder at: {PROJECT_TEMP_PATH}")
2 changes: 0 additions & 2 deletions ae/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
from ae.core import agents
from ae.core import memory
from ae.core import skills

from ae.core.autogen_wrapper import AutogenWrapper
from ae.core.playwright_manager import PlaywrightManager
from ae.core.post_process_responses import final_reply_callback_browser_agent
from ae.core.post_process_responses import final_reply_callback_user_proxy
from ae.core.prompts import LLM_PROMPTS
from ae.core.system_orchestrator import SystemOrchestrator
Expand Down
3 changes: 1 addition & 2 deletions ae/core/agents/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
from ae.core.agents.browser_nav_agent import BrowserNavAgent
from ae.core.agents.browser_nav_agent_no_skills import BrowserNavAgentNoSkills
from ae.core.agents.browser_nav_agent import BrowserNavAgent
74 changes: 40 additions & 34 deletions ae/core/agents/browser_nav_agent.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
from datetime import datetime
from string import Template

import autogen # type: ignore

from ae.core.memory.static_ltm import get_user_ltm
from ae.core.post_process_responses import final_reply_callback_browser_agent as print_message_from_user_proxy # type: ignore
from ae.core.post_process_responses import final_reply_callback_user_proxy as print_message_from_browser_agent # type: ignore
from ae.core.prompts import LLM_PROMPTS
from ae.core.skills.click_using_selector import click as click_element
from ae.core.skills.enter_text_and_click import enter_text_and_click

# from ae.core.skills.enter_text_and_click import enter_text_and_click
from ae.core.skills.enter_text_using_selector import bulk_enter_text
from ae.core.skills.enter_text_using_selector import entertext
from ae.core.skills.get_dom_with_content_type import get_dom_with_content_type
from ae.core.skills.get_url import geturl
from ae.core.skills.get_user_input import get_user_input
from ae.core.skills.open_url import openurl
from ae.core.skills.pdf_text_extractor import extract_text_from_pdf

#from ae.core.skills.pdf_text_extractor import extract_text_from_pdf
from ae.core.skills.press_key_combination import press_key_combination


class BrowserNavAgent:
def __init__(self, config_list, user_proxy_agent: autogen.UserProxyAgent): # type: ignore
def __init__(self, config_list, browser_nav_executor: autogen.UserProxyAgent): # type: ignore
"""
Initialize the BrowserNavAgent and store the AssistantAgent instance
as an instance attribute for external access.
Expand All @@ -27,21 +29,23 @@ def __init__(self, config_list, user_proxy_agent: autogen.UserProxyAgent): # typ
- config_list: A list of configuration parameters required for AssistantAgent.
- user_proxy_agent: An instance of the UserProxyAgent class.
"""
self.user_proxy_agent = user_proxy_agent
self.browser_nav_executor = browser_nav_executor
user_ltm = self.__get_ltm()
system_message = LLM_PROMPTS["BROWSER_AGENT_PROMPT"]

system_message = system_message + "\n" + f"Today's date is {datetime.now().strftime('%d %B %Y')}"
if user_ltm: #add the user LTM to the system prompt if it exists
user_ltm = "\n" + user_ltm
system_message = Template(system_message).substitute(basic_user_information=user_ltm)

self.agent = autogen.AssistantAgent(
self.agent = autogen.ConversableAgent(
name="browser_navigation_agent",
system_message=system_message,
llm_config={
"config_list": config_list,
"cache_seed": 2,
"temperature": 0.0
"cache_seed": None,
"temperature": 0.0,
"top_p": 0.001,
"seed":12345
},
)
self.__register_skills()
Expand All @@ -59,54 +63,53 @@ def __register_skills(self):
"""
Register all the skills that the agent can perform.
"""
# Register get_user_input skill for execution by user_proxy_agent
self.user_proxy_agent.register_for_execution()(get_user_input) # type: ignore
# Register get_user_input skill for LLM by assistant agent
self.agent.register_for_llm(description=LLM_PROMPTS["GET_USER_INPUT_PROMPT"])(get_user_input) # type: ignore

# Register openurl skill for execution by user_proxy_agent
self.user_proxy_agent.register_for_execution()(openurl) # type: ignore
# Register openurl skill for LLM by assistant agent
self.agent.register_for_llm(description=LLM_PROMPTS["OPEN_URL_PROMPT"])(openurl) # type: ignore
self.agent.register_for_llm(description=LLM_PROMPTS["OPEN_URL_PROMPT"])(openurl)
# Register openurl skill for execution by user_proxy_agent
self.browser_nav_executor.register_for_execution()(openurl)

# Register enter_text_and_click skill for execution by user_proxy_agent
self.user_proxy_agent.register_for_execution()(enter_text_and_click)
# Register enter_text_and_click skill for LLM by assistant agent
self.agent.register_for_llm(description=LLM_PROMPTS["ENTER_TEXT_AND_CLICK_PROMPT"])(enter_text_and_click)
# self.agent.register_for_llm(description=LLM_PROMPTS["ENTER_TEXT_AND_CLICK_PROMPT"])(enter_text_and_click)
# Register enter_text_and_click skill for execution by user_proxy_agent
# self.browser_nav_executor.register_for_execution()(enter_text_and_click)

# Register get_dom_with_content_type skill for execution by user_proxy_agent
self.user_proxy_agent.register_for_execution()(get_dom_with_content_type)
# Register get_dom_with_content_type skill for LLM by assistant agent
self.agent.register_for_llm(description=LLM_PROMPTS["GET_DOM_WITH_CONTENT_TYPE_PROMPT"])(get_dom_with_content_type)
# Register get_dom_with_content_type skill for execution by user_proxy_agent
self.browser_nav_executor.register_for_execution()(get_dom_with_content_type)

# Register click_element skill for execution by user_proxy_agent
self.user_proxy_agent.register_for_execution()(click_element)
# Register click_element skill for LLM by assistant agent
#self.agent.register_for_llm(description=LLM_PROMPTS["CLICK_PROMPT_ACCESSIBILITY"])(click_element)
self.agent.register_for_llm(description=LLM_PROMPTS["CLICK_PROMPT"])(click_element)
# Register click_element skill for execution by user_proxy_agent
self.browser_nav_executor.register_for_execution()(click_element)

# Register geturl skill for execution by user_proxy_agent
self.user_proxy_agent.register_for_execution()(geturl)
# Register geturl skill for LLM by assistant agent
self.agent.register_for_llm(description=LLM_PROMPTS["GET_URL_PROMPT"])(geturl)
# Register geturl skill for execution by user_proxy_agent
self.browser_nav_executor.register_for_execution()(geturl)

# Register bulk_enter_text skill for execution by user_proxy_agent
self.user_proxy_agent.register_for_execution()(bulk_enter_text)
# Register bulk_enter_text skill for LLM by assistant agent
self.agent.register_for_llm(description=LLM_PROMPTS["BULK_ENTER_TEXT_PROMPT"])(bulk_enter_text)
# Register bulk_enter_text skill for execution by user_proxy_agent
self.browser_nav_executor.register_for_execution()(bulk_enter_text)

# Register entertext skill for execution by user_proxy_agent
self.user_proxy_agent.register_for_execution()(entertext)
# Register entertext skill for LLM by assistant agent
self.agent.register_for_llm(description=LLM_PROMPTS["ENTER_TEXT_PROMPT"])(entertext)

# Register entertext skill for execution by user_proxy_agent
self.user_proxy_agent.register_for_execution()(extract_text_from_pdf)
self.browser_nav_executor.register_for_execution()(entertext)

# Register entertext skill for LLM by assistant agent
self.agent.register_for_llm(description=LLM_PROMPTS["PRESS_KEY_COMBINATION_PROMPT"])(press_key_combination)
# Register entertext skill for execution by user_proxy_agent
self.browser_nav_executor.register_for_execution()(press_key_combination)

self.agent.register_for_llm(description=LLM_PROMPTS["EXTRACT_TEXT_FROM_PDF_PROMPT"])(extract_text_from_pdf)
self.browser_nav_executor.register_for_execution()(extract_text_from_pdf)

'''
# Register reply function for printing messages
self.user_proxy_agent.register_reply( # type: ignore
self.browser_nav_executor.register_reply( # type: ignore
[autogen.Agent, None],
reply_func=print_message_from_user_proxy,
config={"callback": None},
Expand All @@ -116,3 +119,6 @@ def __register_skills(self):
reply_func=print_message_from_browser_agent,
config={"callback": None},
)
'''
# print(f">>> Function map: {self.browser_nav_executor.function_map}") # type: ignore
# print(">>> Registered skills for BrowserNavAgent and BrowserNavExecutorAgent")
41 changes: 0 additions & 41 deletions ae/core/agents/browser_nav_agent_no_skills.py

This file was deleted.

61 changes: 61 additions & 0 deletions ae/core/agents/high_level_planner_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from datetime import datetime
from string import Template

import autogen # type: ignore
from autogen import ConversableAgent # type: ignore

from ae.core.memory.static_ltm import get_user_ltm
from ae.core.post_process_responses import final_reply_callback_planner_agent as print_message_as_planner # type: ignore
from ae.core.prompts import LLM_PROMPTS
from ae.core.skills.get_user_input import get_user_input


class PlannerAgent:
    """Wraps the autogen AssistantAgent that produces high-level task plans.

    The configured agent is exposed through the ``agent`` attribute so the
    surrounding orchestration code can wire it into a conversation.
    """

    def __init__(self, config_list, user_proxy_agent:ConversableAgent): # type: ignore
        """Build the planner's AssistantAgent and register its skills.

        Parameters:
        - config_list: LLM configuration entries passed through to autogen.
        - user_proxy_agent: Agent that executes the get_user_input skill on
          behalf of the planner.
        """

        prompt = LLM_PROMPTS["PLANNER_AGENT_PROMPT"]

        # Fold the user's long-term memory into the prompt when one exists.
        ltm = self.__get_ltm()
        if ltm:
            prompt = Template(prompt).substitute(basic_user_information="\n" + ltm)
        prompt = prompt + "\n" + f"Today's date is {datetime.now().strftime('%d %B %Y')}"

        # Deterministic sampling settings; caching is disabled.
        llm_settings = {
            "config_list": config_list,
            "cache_seed": None,
            "temperature": 0.0,
            "top_p": 0.001,
            "seed": 12345,
        }
        self.agent = autogen.AssistantAgent(
            name="planner_agent",
            system_message=prompt,
            llm_config=llm_settings,
        )

        # The planner may ask the user for input: advertise the skill to the
        # LLM here, and let the user proxy actually execute it.
        self.agent.register_for_llm(description=LLM_PROMPTS["GET_USER_INPUT_PROMPT"])(get_user_input)
        user_proxy_agent.register_for_execution()(get_user_input)

        # Echo planner replies through the dedicated print callback.
        self.agent.register_reply( # type: ignore
            [autogen.AssistantAgent, None],
            reply_func=print_message_as_planner,
            config={"callback": None},
            ignore_async_in_sync_chat=True,
        )

    def __get_ltm(self):
        """Return the user's long-term memory, or None if none is stored."""
        return get_user_ltm()

Loading

0 comments on commit df8da54

Please sign in to comment.