
promote Dev #105

Merged 29 commits on Sep 19, 2024
Commits
130d579
Capability to load skills from external location
Aug 26, 2024
b501540
readme instructions for ADDITIONAL_SKILL_DIRS env var
Aug 28, 2024
5c8c527
avoid warning on additional skills
Sep 6, 2024
01ddb7d
bug fix
deepak-akkil Sep 6, 2024
14f55a8
Merge pull request #99 from EmergenceAI/fix_test_processor_config
deepak-akkil Sep 6, 2024
102d8ca
Windows-specific commands for running server locally
danielkornev Sep 7, 2024
d21f667
first version of loop detection
deepak-akkil Sep 10, 2024
b0e99ad
minor change in return statement
deepak-akkil Sep 10, 2024
ec1d0b9
Make LLM config available in API route
deepak-akkil Sep 10, 2024
096a4c8
Merge pull request #100 from danielkornev/patch-1
deepak-akkil Sep 10, 2024
d1e04ef
do not open the url, if browser already in it
deepak-akkil Sep 10, 2024
9035f88
config for allowing planner to have user input
Sep 10, 2024
0c15f6e
Updates to make LLM config dict[str,Any]
deepak-akkil Sep 11, 2024
51ac2d8
clean up print statements
deepak-akkil Sep 11, 2024
54c6256
Update README.md
deepak-akkil Sep 11, 2024
0a57d03
linting
Sep 11, 2024
9c69db6
remove unneeded import
Sep 11, 2024
5c45327
Merge pull request #102 from EmergenceAI/llm-config-in-api-route
teaxio Sep 11, 2024
df8a745
linting
Sep 12, 2024
f0365df
Merge pull request #101 from EmergenceAI/loop_detection
teaxio Sep 12, 2024
c24c6e9
max chat rounds for planner and browser nav agents
Sep 12, 2024
5228dc4
detect if ran out of turns and send a message about it
Sep 15, 2024
6e79ac9
Merge pull request #103 from EmergenceAI/max_chat_rounds
teaxio Sep 16, 2024
14653e1
bug fix regarding saving planner chatter
deepak-akkil Sep 16, 2024
54b6081
Merge branch 'dev' of https://github.com/EmergenceAI/Agent-E into dev
deepak-akkil Sep 16, 2024
80ef975
bug fix regarding saving planner chatter in api route
deepak-akkil Sep 16, 2024
c888423
incorrect instructions in readme
Sep 16, 2024
fed2962
typo
Sep 16, 2024
7f48ba6
Ability to enable more verbose logging for openai and autogen
Sep 16, 2024
3 changes: 2 additions & 1 deletion .gitignore
@@ -161,4 +161,5 @@ requirements.txt
Pipfile

# file containing LLM config for the agents
agents_llm_config.json
agents_llm_config.json
ae/testing.py
27 changes: 23 additions & 4 deletions README.md
@@ -110,6 +110,13 @@ Agent-E relies on several environment variables for its configuration. You need

- **`LOG_MESSAGES_FORMAT`**
Set to `json` or `text` (Default: `text`). Specifies the format for logging messages.

- **`ADDITIONAL_SKILL_DIRS`** *(optional)*
A comma-separated list of directories or `.py` files from which additional skills are dynamically loaded.
Example: add `ADDITIONAL_SKILL_DIRS="./private_skills,./extra_skills/my_custom_skill.py"` to the `.env` file (or equivalent).

- **`PLANNER_USER_INPUT_SKILL_ENABLED`** *(optional)*
Set to `true` or `false` (Default: `false`). Specifies whether the planner agent is allowed to request user input.

## Running the Code

@@ -137,10 +144,14 @@ Agent-E provides a FastAPI wrapper, allowing you to send commands via HTTP and r

#### To launch the FastAPI server:

1. On Linux/macOS, run the following command:
```bash
uvicorn ae.server.api_routes:app --reload --loop asyncio
```
2. On Windows, run the same command without `--reload` (Python's async implementations still differ across OSes; removing `--reload` works around this, see this [answer on StackOverflow](https://stackoverflow.com/a/78795990)):
```cmd
uvicorn ae.server.api_routes:app --loop asyncio
```

2. Send POST requests to execute tasks. For example, to execute a task using cURL:
```bash
curl --location 'http://127.0.0.1:8000/execute_task' \
"command": "go to espn, look for soccer news, report the names of the most recent soccer champs"
}'
```
Optionally, the API request can include an `llm_config` object if you want to apply a different configuration during API request execution. The `llm_config` object should have separate configurations for `planner_agent` and `browser_nav_agent`. See `agents_llm_config-example.json` for an example.

```bash
curl --location 'http://127.0.0.1:8000/execute_task' \
--header 'Content-Type: application/json' \
--data '{
"command": "go to espn, look for soccer news, report the names of the most recent soccer champs",
"llm_config":{"planner_agent":{...}, "browser_nav_agent":{...}}
}'
```
### Customizing LLM Parameters
Agent-E supports advanced LLM configurations using environment variables or JSON-based configuration files. This allows users to customize how the underlying model behaves, such as setting temperature, top-p, and model API base URLs.

@@ -352,8 +372,7 @@ python -m test.run_tests
### macOS Users
If you're running the tests on macOS and encounter `BlockingIOError`, run the tests with unbuffered output:
```bash
python -u -m test.run_tests
```

### Running Specific Tests
@@ -375,7 +394,7 @@ Here are additional parameters that you can pass to customize the test execution
- `--take_screenshots`: Takes screenshots after every operation performed. Example: `--take_screenshots` `true`. Default is `false`

### Example Command
Here’s an example of how to use the parameters (macOS Users add `-u` parameter to the command below):
```bash
python -m test.run_tests --min_task_index 0 --max_task_index 28 --test_results_id first_28_tests
```
67 changes: 49 additions & 18 deletions ae/core/agents/browser_nav_agent.py
@@ -1,3 +1,5 @@
import importlib
import os
from datetime import datetime
from string import Template
from typing import Any
@@ -18,6 +20,7 @@

#from ae.core.skills.pdf_text_extractor import extract_text_from_pdf
from ae.core.skills.press_key_combination import press_key_combination
from ae.core.skills.skill_registry import skill_registry
from ae.utils.logger import logger


@@ -73,44 +76,30 @@ def __register_skills(self):
Register all the skills that the agent can perform.
"""

# Register openurl skill for LLM by assistant agent
# Register each skill for LLM by assistant agent and for execution by user_proxy_agent

self.agent.register_for_llm(description=LLM_PROMPTS["OPEN_URL_PROMPT"])(openurl)
# Register openurl skill for execution by user_proxy_agent
self.browser_nav_executor.register_for_execution()(openurl)

# Register enter_text_and_click skill for LLM by assistant agent
# self.agent.register_for_llm(description=LLM_PROMPTS["ENTER_TEXT_AND_CLICK_PROMPT"])(enter_text_and_click)
# Register enter_text_and_click skill for execution by user_proxy_agent
# self.browser_nav_executor.register_for_execution()(enter_text_and_click)

# Register get_dom_with_content_type skill for LLM by assistant agent
self.agent.register_for_llm(description=LLM_PROMPTS["GET_DOM_WITH_CONTENT_TYPE_PROMPT"])(get_dom_with_content_type)
# Register get_dom_with_content_type skill for execution by user_proxy_agent
self.browser_nav_executor.register_for_execution()(get_dom_with_content_type)

# Register click_element skill for LLM by assistant agent
self.agent.register_for_llm(description=LLM_PROMPTS["CLICK_PROMPT"])(click_element)
# Register click_element skill for execution by user_proxy_agent
self.browser_nav_executor.register_for_execution()(click_element)

# Register geturl skill for LLM by assistant agent
self.agent.register_for_llm(description=LLM_PROMPTS["GET_URL_PROMPT"])(geturl)
# Register geturl skill for execution by user_proxy_agent
self.browser_nav_executor.register_for_execution()(geturl)

# Register bulk_enter_text skill for LLM by assistant agent
self.agent.register_for_llm(description=LLM_PROMPTS["BULK_ENTER_TEXT_PROMPT"])(bulk_enter_text)
# Register bulk_enter_text skill for execution by user_proxy_agent
self.browser_nav_executor.register_for_execution()(bulk_enter_text)

# Register entertext skill for LLM by assistant agent
self.agent.register_for_llm(description=LLM_PROMPTS["ENTER_TEXT_PROMPT"])(entertext)
# Register entertext skill for execution by user_proxy_agent
self.browser_nav_executor.register_for_execution()(entertext)

# Register entertext skill for LLM by assistant agent
self.agent.register_for_llm(description=LLM_PROMPTS["PRESS_KEY_COMBINATION_PROMPT"])(press_key_combination)
# Register entertext skill for execution by user_proxy_agent
self.browser_nav_executor.register_for_execution()(press_key_combination)

self.agent.register_for_llm(description=LLM_PROMPTS["EXTRACT_TEXT_FROM_PDF_PROMPT"])(extract_text_from_pdf)
@@ -129,5 +118,47 @@ def __register_skills(self):
config={"callback": None},
)
'''
# print(f">>> Function map: {self.browser_nav_executor.function_map}") # type: ignore
# print(">>> Registered skills for BrowserNavAgent and BrowserNavExecutorAgent")
self.__load_additional_skills()

#print(f">>> Function map: {self.browser_nav_executor.function_map}") # type: ignore


def __load_additional_skills(self):
"""
Dynamically load additional skills from directories or specific Python files
specified by an environment variable.
"""
# Get additional skill directories or files from environment variable
additional_skill_dirs: str = os.getenv('ADDITIONAL_SKILL_DIRS', "")
if len(additional_skill_dirs) == 0:
logger.debug("No additional skill directories or files specified.")
return

additional_skill_paths: list[str] = additional_skill_dirs.split(',')

for skill_path in additional_skill_paths:
skill_path = skill_path.strip() # Strip whitespace

if os.path.isdir(skill_path):
# If the path is a directory, process all .py files in it
for filename in os.listdir(skill_path):
if filename.endswith(".py"):
module_name = filename[:-3] # Remove .py extension
module_path = f"{skill_path.replace('/', '.')}.{module_name}"
importlib.import_module(module_path)

elif skill_path.endswith(".py") and os.path.isfile(skill_path):
# If the path is a specific .py file, load it directly
module_name = os.path.basename(skill_path)[:-3] # Strip .py extension
directory_path = os.path.dirname(skill_path).replace('/', '.')
module_path = f"{directory_path}.{module_name}"
importlib.import_module(module_path)
else:
logger.warning(f"Invalid skill path specified: {skill_path}")

# Register the skills that were dynamically discovered
for skill in skill_registry:
self.agent.register_for_llm(description=skill['description'])(skill['func'])
self.browser_nav_executor.register_for_execution()(skill['func'])
logger.debug(f"Registered additional skill: {skill['name']}")

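The loader above depends on external skill files registering themselves with `skill_registry` at import time. A minimal self-contained sketch of that mechanism follows; the decorator name and the registry's dict keys are assumptions inferred from the registration loop in the diff, not the repo's exact `ae.core.skills.skill_registry` API:

```python
from typing import Any, Callable

# Stand-in for ae.core.skills.skill_registry; the real module's API may differ.
skill_registry: list[dict[str, Any]] = []

def skill(description: str) -> Callable:
    """Decorator that records a function in the registry at import time,
    so __load_additional_skills can later register it with the agents."""
    def decorator(func: Callable) -> Callable:
        skill_registry.append({
            "name": func.__name__,
            "description": description,
            "func": func,
        })
        return func
    return decorator

# An external skill file placed under ADDITIONAL_SKILL_DIRS would then only
# need to decorate its functions; importing the module registers them.
@skill(description="Illustrative skill: return a fixed page title.")
def get_page_title() -> str:
    return "Example Domain"
```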
14 changes: 10 additions & 4 deletions ae/core/agents/high_level_planner_agent.py
@@ -1,3 +1,4 @@
import os
from datetime import datetime
from string import Template
from typing import Any
@@ -24,6 +25,8 @@ def __init__(self, model_config_list, llm_config_params: dict[str, Any], system_
- system_prompt: The system prompt to be used for this agent or the default will be used if not provided.
- user_proxy_agent: An instance of the UserProxyAgent class.
"""
enable_user_input = os.getenv("PLANNER_USER_INPUT_SKILL_ENABLED", "false").lower() == "true"

user_ltm = self.__get_ltm()
system_message = LLM_PROMPTS["PLANNER_AGENT_PROMPT"]

@@ -50,10 +53,13 @@ def __init__(self, model_config_list, llm_config_params: dict[str, Any], system_
},
)

# Register get_user_input skill for LLM by assistant agent
self.agent.register_for_llm(description=LLM_PROMPTS["GET_USER_INPUT_PROMPT"])(get_user_input)
# Register get_user_input skill for execution by user_proxy_agent
user_proxy_agent.register_for_execution()(get_user_input)
if enable_user_input:
# Register get_user_input skill for LLM by assistant agent
self.agent.register_for_llm(description=LLM_PROMPTS["GET_USER_INPUT_PROMPT"])(get_user_input)
# Register get_user_input skill for execution by user_proxy_agent
user_proxy_agent.register_for_execution()(get_user_input)
else:
logger.debug("User input skill is disabled for PlannerAgent")

self.agent.register_reply( # type: ignore
[autogen.AssistantAgent, None],
51 changes: 43 additions & 8 deletions ae/core/agents_llm_config.py
@@ -31,9 +31,13 @@ class AgentsLLMConfig:
"model_base_url": "base_url",
}

def __init__(self, env_file_path: str = ".env") -> None:
def __init__(self, env_file_path: str = ".env", llm_config: dict[str,Any] | None = None) -> None:
load_dotenv(env_file_path, verbose=True, override=True)
self.config: dict[str, Any] = self._load_config()
if llm_config:
self.config: dict[str, Any] = self.load_config_from_api(llm_config)
else:
self.config: dict[str, Any] = self._load_config()


def _load_config(self) -> dict[str, Any]:
config_file = os.getenv("AGENTS_LLM_CONFIG_FILE")
@@ -50,8 +54,8 @@ def _load_config(self) -> dict[str, Any]:
raw_config = file_config[config_file_ref_key]

# Process configurations for both planner_agent and browser_nav_agent
planner_config = self._normalize_config_from_file(raw_config.get("planner_agent", {}))
browser_nav_config = self._normalize_config_from_file(raw_config.get("browser_nav_agent", {}))
planner_config = self._normalize_config(raw_config.get("planner_agent", {}))
browser_nav_config = self._normalize_config(raw_config.get("browser_nav_agent", {}))

config = {
"planner_agent": planner_config,
@@ -81,7 +85,41 @@ def _load_config(self) -> dict[str, Any]:

return config

def _normalize_config_from_file(self, agent_config: dict[str, Any]) -> dict[str, Any]:
def load_config_from_api(self, llm_config: dict[str, Any]) -> dict[str, Any]:
"""
Load configuration from a JSON provided during execution.

Parameters
----------
llm_config : dict[str, Any]
A dict representing the LLM configuration.

Returns
-------
dict[str, Any]
The loaded and normalized configuration.
"""
try:

logger.info("Loading LLM configuration provided via API.")

# Process configurations for both planner_agent and browser_nav_agent
planner_config = self._normalize_config(llm_config.get("planner_agent", {}))
browser_nav_config = self._normalize_config(llm_config.get("browser_nav_agent", {}))

config = {
"planner_agent": planner_config,
"browser_nav_agent": browser_nav_config,
"other_settings": {k: v for k, v in llm_config.items() if k not in ["planner_agent", "browser_nav_agent"]},
}

return config

except json.JSONDecodeError as e:
logger.error(f"Error decoding JSON string: {e}")
raise e

def _normalize_config(self, agent_config: dict[str, Any]) -> dict[str, Any]:
"""Normalize agent-specific config from a file, grouping keys into model_config_params, llm_config_params, and other_settings."""
model_config = {}
llm_config_params = {}
@@ -156,6 +194,3 @@ def get_full_config(self) -> dict[str, Any]:

planner_config = config.get_planner_agent_config()
browser_nav_config = config.get_browser_nav_agent_config()

print("Planner Agent Config:", planner_config)
print("Browser Nav Agent Config:", browser_nav_config)
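The grouping performed by the renamed `_normalize_config` can be sketched as follows. The exact key sets and the full `KEY_MAPPING` are assumptions based on the fragment shown in the diff (`"model_base_url": "base_url"`), not the repo's complete mapping:

```python
from typing import Any

KEY_MAPPING = {"model_base_url": "base_url"}  # from the diff; likely incomplete
MODEL_CONFIG_KEYS = {"model_name", "model_api_key", "base_url"}  # assumed
LLM_PARAM_KEYS = {"temperature", "top_p", "seed", "cache_seed"}  # assumed

def normalize_config(agent_config: dict[str, Any]) -> dict[str, Any]:
    """Group raw agent config keys into model params, LLM params, and other settings."""
    model_config_params: dict[str, Any] = {}
    llm_config_params: dict[str, Any] = {}
    other_settings: dict[str, Any] = {}
    for key, value in agent_config.items():
        key = KEY_MAPPING.get(key, key)  # e.g. model_base_url -> base_url
        if key in MODEL_CONFIG_KEYS:
            model_config_params[key] = value
        elif key in LLM_PARAM_KEYS:
            llm_config_params[key] = value
        else:
            other_settings[key] = value
    return {
        "model_config_params": model_config_params,
        "llm_config_params": llm_config_params,
        "other_settings": other_settings,
    }
```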