Fix file name extraction after matplotlib block added

h2oai · Sep 13, 2024 · 6de12f2 · 6de12f2
1 parent c792746
commit 6de12f2
Show file tree

Hide file tree

Showing 4 changed files with 112 additions and 2 deletions.
diff --git a/openai_server/agent_prompting.py b/openai_server/agent_prompting.py
@@ -104,6 +104,7 @@ def agent_system_prompt(agent_code_writer_system_message, agent_system_site_pack
   * Check the execution result returned by the user.
   * Ensure python code blocks contain valid python code, and shell code blocks contain valid shell code.
 * Every python or shell code block MUST be marked whether it is for execution with a comment that shows if execution is true or false, e.g. # execution: true
+* If a python code is marked for execution, do not generate a shell script to execute that python code file, because that would execute the python code twice.
 * You can assume that any files (python scripts, shell scripts, images, csv files, etc.) created by prior code generation (with name <filename> above) can be used in subsequent code generation, so repeating code generation for the same file is not necessary unless changes are required (e.g. a python code of some name can be run with a short sh code).
 * When you need to collect info, generate code to output the info you need.
 * Ensure you provide well-commented code, so the user can understand what the code does.

diff --git a/openai_server/autogen_utils.py b/openai_server/autogen_utils.py
@@ -59,6 +59,13 @@ def __init__(
         self.autogen_code_restrictions_level = autogen_code_restrictions_level
         self.stream_output = stream_output
 
+        # Comment markers that may declare the target filename of a code block:
+        # HTML comment, C-style block comment, `//` line comment, `#` line comment.
+        # The hyphen is placed last in the character class so it is a literal '-';
+        # written as `.-/` it is parsed as the range '.'..'/', which leaves '-' out
+        # and makes names such as `my-script.py` fail to match.
+        self.filename_patterns: List[re.Pattern] = [
+            re.compile(r"^<!--\s*filename:\s*([\w./-]+)\s*-->$"),
+            re.compile(r"^/\*\s*filename:\s*([\w./-]+)\s*\*/$"),
+            re.compile(r"^//\s*filename:\s*([\w./-]+)\s*$"),
+            re.compile(r"^#\s*filename:\s*([\w./-]+)\s*$"),
+        ]
+
     @staticmethod
     def remove_comments_strings(code: str, lang: str) -> str:
         if verbose:
@@ -181,6 +188,32 @@ def sanitize_command(lang: str, code: str) -> None:
                 if match.group(f"pat{i}"):
                     raise ValueError(f"{danger_mark}: {patterns[pattern]}\n\n{cleaned_code}")
 
+    def _get_file_name_from_content(self, code: str, workspace_path: Path) -> Optional[str]:
+        """Return the workspace-relative filename declared in a comment of *code*.
+
+        Each line of ``code`` is stripped and matched against
+        ``self.filename_patterns``; the first match whose captured name is
+        valid and resolves to a file inside ``workspace_path`` wins.
+
+        Args:
+            code: Source text that may contain a ``filename: <name>`` comment.
+            workspace_path: Directory the named file has to stay inside.
+
+        Returns:
+            The path relative to the workspace as a string, or ``None`` when
+            no line yields an acceptable filename.
+        """
+        # Resolve the workspace as well: the candidate path is resolved below,
+        # so comparing it against an unresolved root (symlinked or relative
+        # directory) would reject files that really are inside the workspace.
+        workspace_root = workspace_path.resolve()
+
+        for raw_line in code.split("\n"):
+            line = raw_line.strip()
+            for pattern in self.filename_patterns:
+                matches = pattern.match(line)
+                if matches is None:
+                    continue
+
+                filename = matches.group(1).strip()
+
+                # Validate filename. '-' sits last in the class so it is a
+                # literal hyphen; written as `.-/` it is a range from '.' to
+                # '/' that leaves '-' out and rejects names like `my-file.py`.
+                if not re.fullmatch(r"[\w./-]+", filename):
+                    continue  # Invalid filename, try next match
+
+                # Ensure the path doesn't try to go outside the workspace
+                try:
+                    relative_path = (workspace_root / filename).resolve().relative_to(workspace_root)
+                except ValueError:
+                    # Path would be outside the workspace, skip it
+                    continue
+
+                # An empty name means the comment pointed at the workspace
+                # directory itself (e.g. "."), which cannot be used as a file.
+                if not relative_path.name:
+                    continue
+
+                return str(relative_path)
+
+        return None
+
     def __execute_code_dont_check_setup(self, code_blocks: List[CodeBlock]) -> CommandLineCodeResult:
         # nearly identical to parent, but with control over guardrails via self.sanitize_command
         logs_all = ""
@@ -211,7 +244,7 @@ def __execute_code_dont_check_setup(self, code_blocks: List[CodeBlock]) -> Comma
             execute_code = self.execution_policies.get(lang, False)
             try:
                 # Check if there is a filename comment
-                filename = _get_file_name_from_content(code, self._work_dir)
+                filename = self._get_file_name_from_content(code, self._work_dir)
             except ValueError:
                 return CommandLineCodeResult(exit_code=1, output="Filename is not in the workspace")
 

diff --git a/openai_server/test_autogen_utils.py b/openai_server/test_autogen_utils.py
@@ -1,4 +1,6 @@
 import re
+from pathlib import Path
+
 import pytest
 
 from openai_server.autogen_utils import H2OLocalCommandLineCodeExecutor, bad_output_mark, danger_mark
@@ -382,3 +384,77 @@ def fetch_content(url):
         pass
     else:
         raise ValueError("Should not reach here")
+
+
+@pytest.fixture
+def workspace_path():
+    """Provide a (workspace directory, executor) pair for the filename tests."""
+    workspace_dir = Path("/tmp/workspace")
+    executor = H2OLocalCommandLineCodeExecutor()
+    return workspace_dir, executor
+
+
+def test_basic_filename_extraction(workspace_path):
+    """A `# filename:` comment on the first line yields that name."""
+    workspace_dir, executor = workspace_path
+    snippet = "# filename: test.py\nprint('Hello, World!')"
+    extracted = executor._get_file_name_from_content(snippet, workspace_dir)
+    assert extracted == "test.py"
+
+
+def test_filename_with_path(workspace_path):
+    """A filename containing a sub-directory is returned with that directory."""
+    workspace_dir, executor = workspace_path
+    snippet = "# filename: subfolder/test.py\nprint('Hello, World!')"
+    extracted = executor._get_file_name_from_content(snippet, workspace_dir)
+    assert extracted == "subfolder/test.py"
+
+
+def test_filename_with_different_comment_styles(workspace_path):
+    """HTML, block (`/* */`) and `//` comment markers are all recognised."""
+    workspace_dir, executor = workspace_path
+    cases = [
+        ("<!-- filename: test.html -->\n<html></html>", "test.html"),
+        ("/* filename: test.css */\nbody {}", "test.css"),
+        ("// filename: test.js\nconsole.log('Hello');", "test.js"),
+    ]
+    for snippet, expected in cases:
+        assert executor._get_file_name_from_content(snippet, workspace_dir) == expected
+
+
+def test_filename_not_on_first_line(workspace_path):
+    """The filename comment is found even when other code precedes it."""
+    workspace_dir, executor = workspace_path
+    snippet = "import os\n# filename: test.py\nprint('Hello, World!')"
+    extracted = executor._get_file_name_from_content(snippet, workspace_dir)
+    assert extracted == "test.py"
+
+
+def test_no_filename_specified(workspace_path):
+    """Code without any filename comment produces None."""
+    workspace_dir, executor = workspace_path
+    snippet = "print('Hello, World!')"
+    extracted = executor._get_file_name_from_content(snippet, workspace_dir)
+    assert extracted is None
+
+
+def test_invalid_filename(workspace_path):
+    """A filename containing spaces is not accepted."""
+    workspace_dir, executor = workspace_path
+    snippet = "# filename: invalid file name.py\nprint('Hello, World!')"
+    extracted = executor._get_file_name_from_content(snippet, workspace_dir)
+    assert extracted is None
+
+
+def test_filename_outside_workspace(workspace_path):
+    """An absolute path outside the workspace is rejected."""
+    workspace_dir, executor = workspace_path
+    snippet = "# filename: /etc/passwd\nprint('Hello, World!')"
+    extracted = executor._get_file_name_from_content(snippet, workspace_dir)
+    assert extracted is None
+
+
+def test_filename_with_colon(workspace_path):
+    """The `filename:` form (with colon) is recognised."""
+    workspace_dir, executor = workspace_path
+    snippet = "# filename: test.py\nprint('Hello, World!')"
+    extracted = executor._get_file_name_from_content(snippet, workspace_dir)
+    assert extracted == "test.py"
+
+
+def test_filename_without_colon(workspace_path):
+    """Omitting the colon after `filename` means no filename is extracted."""
+    workspace_dir, executor = workspace_path
+    snippet = "# filename test.py\nprint('Hello, World!')"
+    extracted = executor._get_file_name_from_content(snippet, workspace_dir)
+    assert extracted is None
+
+
+def test_multiple_filenames(workspace_path):
+    """When several filename comments exist, the first one is returned."""
+    workspace_dir, executor = workspace_path
+    snippet = "# filename: first.py\n# filename: second.py\nprint('Hello, World!')"
+    extracted = executor._get_file_name_from_content(snippet, workspace_dir)
+    assert extracted == "first.py"
+
+
+def test_commented_out_filename(workspace_path):
+    """A doubly-commented (`# # filename:`) marker is ignored."""
+    workspace_dir, executor = workspace_path
+    snippet = "# # filename: test.py\nprint('Hello, World!')"
+    extracted = executor._get_file_name_from_content(snippet, workspace_dir)
+    assert extracted is None
+
+
+def test_filename_with_spaces_around(workspace_path):
+    """Extra whitespace around the marker and the name is tolerated."""
+    workspace_dir, executor = workspace_path
+    snippet = "#    filename:    test.py    \nprint('Hello, World!')"
+    extracted = executor._get_file_name_from_content(snippet, workspace_dir)
+    assert extracted == "test.py"
+
+
+def test_filename_with_extension_containing_dot(workspace_path):
+    """A multi-part extension such as `.tar.gz` is kept intact."""
+    workspace_dir, executor = workspace_path
+    snippet = "# filename: test.tar.gz\nprint('Hello, World!')"
+    extracted = executor._get_file_name_from_content(snippet, workspace_dir)
+    assert extracted == "test.tar.gz"
diff --git a/src/version.py b/src/version.py
@@ -1 +1 @@
-__version__ = "b8d4aeb0bee08bb9f1893642c2dd692fcc1cd3d4"
+__version__ = "c792746151d399c265614dcc4360506312ebf432"