From 60bf1910a45e04af0fe921e08d04e4400d175592 Mon Sep 17 00:00:00 2001
From: Raphael MANSUY <raphael.mansuy.partner@decathlon.com>
Date: Fri, 28 Jun 2024 09:46:52 +0800
Subject: [PATCH] restructure the code

---
 code2prompt/comment_stripper/__init__.py      |   8 ++
 code2prompt/comment_stripper/c_style.py       |  12 ++
 code2prompt/comment_stripper/html_style.py    |   5 +
 code2prompt/comment_stripper/matlab_style.py  |  12 ++
 code2prompt/comment_stripper/python_style.py  |  12 ++
 code2prompt/comment_stripper/r_style.py       |  12 ++
 code2prompt/comment_stripper/shell_style.py   |  19 +++
 code2prompt/comment_stripper/sql_style.py     |  12 ++
 .../comment_stripper/strip_comments.py        |  27 ++++
 code2prompt/file_handling.py                  |  54 +++++++
 code2prompt/language_inference.py             |   5 -
 code2prompt/main.py                           | 136 +++++++-----------
 tests/test_code2prompt.py                     |   3 +-
 13 files changed, 226 insertions(+), 91 deletions(-)
 create mode 100644 code2prompt/comment_stripper/__init__.py
 create mode 100644 code2prompt/comment_stripper/c_style.py
 create mode 100644 code2prompt/comment_stripper/html_style.py
 create mode 100644 code2prompt/comment_stripper/matlab_style.py
 create mode 100644 code2prompt/comment_stripper/python_style.py
 create mode 100644 code2prompt/comment_stripper/r_style.py
 create mode 100644 code2prompt/comment_stripper/shell_style.py
 create mode 100644 code2prompt/comment_stripper/sql_style.py
 create mode 100644 code2prompt/comment_stripper/strip_comments.py
 create mode 100644 code2prompt/file_handling.py

diff --git a/code2prompt/comment_stripper/__init__.py b/code2prompt/comment_stripper/__init__.py
new file mode 100644
index 0000000..39204f9
--- /dev/null
+++ b/code2prompt/comment_stripper/__init__.py
@@ -0,0 +1,8 @@
+from .c_style import strip_c_style_comments
+from .html_style import strip_html_style_comments
+from .python_style import strip_python_style_comments
+from .shell_style import strip_shell_style_comments
+from .sql_style import strip_sql_style_comments
+from .matlab_style import strip_matlab_style_comments
+from .r_style import strip_r_style_comments
+from .strip_comments import strip_comments
diff --git a/code2prompt/comment_stripper/c_style.py b/code2prompt/comment_stripper/c_style.py
new file mode 100644
index 0000000..da064ae
--- /dev/null
+++ b/code2prompt/comment_stripper/c_style.py
@@ -0,0 +1,12 @@
+import re
+
+def strip_c_style_comments(code: str) -> str:
+    pattern = re.compile(
+        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+        re.DOTALL | re.MULTILINE,
+    )
+    return re.sub(
+        pattern,
+        lambda match: match.group(0) if match.group(0).startswith(("'", '"')) else "",
+        code,
+    )
diff --git a/code2prompt/comment_stripper/html_style.py b/code2prompt/comment_stripper/html_style.py
new file mode 100644
index 0000000..e6e938d
--- /dev/null
+++ b/code2prompt/comment_stripper/html_style.py
@@ -0,0 +1,5 @@
+import re
+
+def strip_html_style_comments(code: str) -> str:
+    pattern = re.compile(r"<!--.*?-->", re.DOTALL)
+    return re.sub(pattern, "", code)
diff --git a/code2prompt/comment_stripper/matlab_style.py b/code2prompt/comment_stripper/matlab_style.py
new file mode 100644
index 0000000..6026f89
--- /dev/null
+++ b/code2prompt/comment_stripper/matlab_style.py
@@ -0,0 +1,12 @@
+import re
+
+def strip_matlab_style_comments(code: str) -> str:
+    pattern = re.compile(
+        r'%.*?$|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+        re.DOTALL | re.MULTILINE,
+    )
+    return re.sub(
+        pattern,
+        lambda match: match.group(0) if match.group(0).startswith(("'", '"')) else "",
+        code,
+    )
diff --git a/code2prompt/comment_stripper/python_style.py b/code2prompt/comment_stripper/python_style.py
new file mode 100644
index 0000000..091778c
--- /dev/null
+++ b/code2prompt/comment_stripper/python_style.py
@@ -0,0 +1,12 @@
+import re
+
+def strip_python_style_comments(code: str) -> str:
+    pattern = re.compile(
+        r'(?s)#.*?$|\'\'\'.*?\'\'\'|""".*?"""|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+        re.MULTILINE,
+    )
+    return re.sub(
+        pattern,
+        lambda match: ("" if match.group(0).startswith(("#", "'''", '"""')) else match.group(0)),
+        code,
+    )
diff --git a/code2prompt/comment_stripper/r_style.py b/code2prompt/comment_stripper/r_style.py
new file mode 100644
index 0000000..db2c7a8
--- /dev/null
+++ b/code2prompt/comment_stripper/r_style.py
@@ -0,0 +1,12 @@
+import re
+
+def strip_r_style_comments(code: str) -> str:
+    pattern = re.compile(
+        r'#.*?$|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+        re.DOTALL | re.MULTILINE,
+    )
+    return re.sub(
+        pattern,
+        lambda match: match.group(0) if match.group(0).startswith(("'", '"')) else "",
+        code,
+    )
diff --git a/code2prompt/comment_stripper/shell_style.py b/code2prompt/comment_stripper/shell_style.py
new file mode 100644
index 0000000..7395823
--- /dev/null
+++ b/code2prompt/comment_stripper/shell_style.py
@@ -0,0 +1,19 @@
+def strip_shell_style_comments(code: str) -> str:
+    lines = code.split("\n")
+    new_lines = []
+    in_multiline_comment = False
+    for line in lines:
+        if line.strip().startswith("#!"):  # Preserve shebang lines
+            new_lines.append(line)
+        elif in_multiline_comment:
+            if line.strip().endswith("'"):
+                in_multiline_comment = False
+        elif line.strip().startswith(": '"):
+            in_multiline_comment = True
+        elif "#" in line:  # Remove single-line comments
+            line = line.split("#", 1)[0]
+            if line.strip():
+                new_lines.append(line)
+        else:
+            new_lines.append(line)
+    return "\n".join(new_lines).strip()
diff --git a/code2prompt/comment_stripper/sql_style.py b/code2prompt/comment_stripper/sql_style.py
new file mode 100644
index 0000000..10e7604
--- /dev/null
+++ b/code2prompt/comment_stripper/sql_style.py
@@ -0,0 +1,12 @@
+import re
+
+def strip_sql_style_comments(code: str) -> str:
+    pattern = re.compile(
+        r'--.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+        re.DOTALL | re.MULTILINE,
+    )
+    return re.sub(
+        pattern,
+        lambda match: match.group(0) if match.group(0).startswith(("'", '"')) else "",
+        code,
+    )
diff --git a/code2prompt/comment_stripper/strip_comments.py b/code2prompt/comment_stripper/strip_comments.py
new file mode 100644
index 0000000..5291503
--- /dev/null
+++ b/code2prompt/comment_stripper/strip_comments.py
@@ -0,0 +1,27 @@
+from .c_style import strip_c_style_comments
+from .html_style import strip_html_style_comments
+from .python_style import strip_python_style_comments
+from .shell_style import strip_shell_style_comments
+from .sql_style import strip_sql_style_comments
+from .matlab_style import strip_matlab_style_comments
+from .r_style import strip_r_style_comments
+
+def strip_comments(code: str, language: str) -> str:
+    if language in [
+        "c", "cpp", "java", "javascript", "csharp", "php", "go", "rust", "kotlin", "swift", "scala", "dart",
+    ]:
+        return strip_c_style_comments(code)
+    elif language in ["python", "ruby", "perl"]:
+        return strip_python_style_comments(code)
+    elif language in ["bash", "powershell", "shell"]:
+        return strip_shell_style_comments(code)
+    elif language in ["html", "xml"]:
+        return strip_html_style_comments(code)
+    elif language in ["sql", "plsql", "tsql"]:
+        return strip_sql_style_comments(code)
+    elif language in ["matlab", "octave"]:
+        return strip_matlab_style_comments(code)
+    elif language in ["r"]:
+        return strip_r_style_comments(code)
+    else:
+        return code
diff --git a/code2prompt/file_handling.py b/code2prompt/file_handling.py
new file mode 100644
index 0000000..0acf1ae
--- /dev/null
+++ b/code2prompt/file_handling.py
@@ -0,0 +1,54 @@
+from pathlib import Path
+from fnmatch import fnmatch
+
+def parse_gitignore(gitignore_path):
+    if not gitignore_path.exists():
+        return set()
+    with gitignore_path.open("r", encoding="utf-8") as file:
+        patterns = set(line.strip() for line in file if line.strip() and not line.startswith("#"))
+    return patterns
+
+def is_ignored(file_path: Path, gitignore_patterns: list, base_path: Path) -> bool:
+    relative_path = file_path.relative_to(base_path)
+    for pattern in gitignore_patterns:
+        pattern = pattern.rstrip("/")
+        if pattern.startswith("/"):
+            if fnmatch(str(relative_path), pattern[1:]):
+                return True
+            if fnmatch(str(relative_path.parent), pattern[1:]):
+                return True
+        else:
+            for path in relative_path.parents:
+                if fnmatch(str(path / relative_path.name), pattern):
+                    return True
+                if fnmatch(str(path), pattern):
+                    return True
+            if fnmatch(str(relative_path), pattern):
+                return True
+    return False
+
+def is_filtered(file_path, include_pattern="", exclude_pattern="", case_sensitive=False):
+    def match_patterns(file_name, patterns):
+        return any(fnmatch(file_name, pattern) for pattern in patterns)
+
+    file_name = file_path.name
+    if not case_sensitive:
+        file_name = file_name.lower()
+    include_patterns = [p.strip().lower() for p in (include_pattern or "").split(',') if p.strip()]
+    exclude_patterns = [p.strip().lower() for p in (exclude_pattern or "").split(',') if p.strip()]
+
+    if not include_patterns:
+        include_match = True
+    else:
+        include_match = match_patterns(file_name, include_patterns)
+    exclude_match = match_patterns(file_name, exclude_patterns)
+    return include_match and not exclude_match
+
+def is_binary(file_path):
+    try:
+        with open(file_path, "rb") as file:
+            chunk = file.read(1024)
+            return b"\x00" in chunk
+    except IOError:
+        print(f"Error: The file at {file_path} could not be opened.")
+        return False
diff --git a/code2prompt/language_inference.py b/code2prompt/language_inference.py
index 92691dc..ba67be5 100644
--- a/code2prompt/language_inference.py
+++ b/code2prompt/language_inference.py
@@ -1,11 +1,6 @@
 import os
 
 def infer_language(filename: str) -> str:
-    """Infers the programming language based on the file extension.
-    
-    :param filename: The name of the file.
-    :return: The inferred programming language.
-    """
     _, extension = os.path.splitext(filename)
     extension = extension.lower()
     if extension in [".c", ".h"]:
diff --git a/code2prompt/main.py b/code2prompt/main.py
index b60344d..87f964e 100644
--- a/code2prompt/main.py
+++ b/code2prompt/main.py
@@ -1,112 +1,65 @@
 from datetime import datetime
 from pathlib import Path
-from fnmatch import fnmatch
 import click
 from code2prompt.language_inference import infer_language
 from code2prompt.comment_stripper import strip_comments
+from code2prompt.file_handling import (
+    parse_gitignore,
+    is_ignored,
+    is_filtered,
+    is_binary,
+)
 
-def parse_gitignore(gitignore_path):
-    """Parse the .gitignore file and return a set of patterns."""
-    if not gitignore_path.exists():
-        return set()
-    with gitignore_path.open("r", encoding="utf-8") as file:
-        patterns = set(line.strip() for line in file if line.strip() and not line.startswith("#"))
-    return patterns
-
-def is_ignored(file_path: Path, gitignore_patterns: list, base_path: Path) -> bool:
-    """Check if a file path matches any pattern in the .gitignore file."""
-    relative_path = file_path.relative_to(base_path)
-    for pattern in gitignore_patterns:
-        pattern = pattern.rstrip("/")  # Remove trailing slash from the pattern
-        if pattern.startswith("/"):
-            if fnmatch(str(relative_path), pattern[1:]):
-                return True
-            if fnmatch(str(relative_path.parent), pattern[1:]):
-                return True
-        else:
-            for path in relative_path.parents:
-                if fnmatch(str(path / relative_path.name), pattern):
-                    return True
-                if fnmatch(str(path), pattern):
-                    return True
-            if fnmatch(str(relative_path), pattern):
-                return True
-    return False
-
-def is_filtered(file_path, include_pattern="", exclude_pattern="", case_sensitive=False):
-    """
-    Check if a file path matches the include patterns and doesn't match the exclude patterns.
-    
-    Args:
-        file_path (Path): The path of the file to check.
-        include_pattern (str): Comma-separated list of inclusion patterns.
-        exclude_pattern (str): Comma-separated list of exclusion patterns.
-        case_sensitive (bool): Whether to perform case-sensitive matching.
-    
-    Returns:
-        bool: True if the file should be included, False otherwise.
-    """
-    def match_patterns(file_name, patterns):
-        return any(fnmatch(file_name, pattern) for pattern in patterns)
-
-    file_name = file_path.name
-    if not case_sensitive:
-        file_name = file_name.lower()
-    
-    include_patterns = [p.strip().lower() for p in (include_pattern or "").split(',') if p.strip()]
-    exclude_patterns = [p.strip().lower() for p in (exclude_pattern or "").split(',') if p.strip()]
-    
-    if not include_patterns:
-        include_match = True
-    else:
-        include_match = match_patterns(file_name, include_patterns)
-    
-    exclude_match = match_patterns(file_name, exclude_patterns)
-    
-    return include_match and not exclude_match
-
-def is_binary(file_path):
-    """Determines if the specified file is a binary file."""
-    try:
-        with open(file_path, "rb") as file:
-            chunk = file.read(1024)
-            return b"\x00" in chunk
-    except IOError:
-        print(f"Error: The file at {file_path} could not be opened.")
-        return False
 
 @click.command()
 @click.option(
-    "--path", "-p", type=click.Path(exists=True), required=True, help="Path to the directory to navigate."
+    "--path",
+    "-p",
+    type=click.Path(exists=True),
+    required=True,
+    help="Path to the directory to navigate.",
 )
 @click.option(
     "--output", "-o", type=click.Path(), help="Name of the output Markdown file."
 )
 @click.option(
-    "--gitignore", "-g", type=click.Path(exists=True), help="Path to the .gitignore file."
+    "--gitignore",
+    "-g",
+    type=click.Path(exists=True),
+    help="Path to the .gitignore file.",
 )
 @click.option(
-    "--filter", "-f", type=str, help='Comma-separated filter patterns to include files (e.g., "*.py,*.js").'
+    "--filter",
+    "-f",
+    type=str,
+    help='Comma-separated filter patterns to include files (e.g., "*.py,*.js").',
 )
 @click.option(
-    "--exclude", "-e", type=str, help='Comma-separated patterns to exclude files (e.g., "*.txt,*.md").'
+    "--exclude",
+    "-e",
+    type=str,
+    help='Comma-separated patterns to exclude files (e.g., "*.txt,*.md").',
 )
 @click.option(
     "--case-sensitive", is_flag=True, help="Perform case-sensitive pattern matching."
 )
 @click.option(
-    "--suppress-comments", "-s", is_flag=True, help="Strip comments from the code files.", default=False
+    "--suppress-comments",
+    "-s",
+    is_flag=True,
+    help="Strip comments from the code files.",
+    default=False,
 )
-def create_markdown_file(path, output, gitignore, filter, exclude, suppress_comments, case_sensitive):
-    """Create a Markdown file with the content of files in a directory."""
+def create_markdown_file(
+    path, output, gitignore, filter, exclude, suppress_comments, case_sensitive
+):
     content = []
     table_of_contents = []
     path = Path(path)
-    
     gitignore_path = Path(gitignore) if gitignore else path / ".gitignore"
     gitignore_patterns = parse_gitignore(gitignore_path)
     gitignore_patterns.add(".git")
-    
+
     for file_path in path.rglob("*"):
         if (
             file_path.is_file()
@@ -116,8 +69,13 @@ def create_markdown_file(path, output, gitignore, filter, exclude, suppress_comm
         ):
             file_extension = file_path.suffix
             file_size = file_path.stat().st_size
-            file_creation_time = datetime.fromtimestamp(file_path.stat().st_ctime).strftime("%Y-%m-%d %H:%M:%S")
-            file_modification_time = datetime.fromtimestamp(file_path.stat().st_mtime).strftime("%Y-%m-%d %H:%M:%S")
+            file_creation_time = datetime.fromtimestamp(
+                file_path.stat().st_ctime
+            ).strftime("%Y-%m-%d %H:%M:%S")
+            file_modification_time = datetime.fromtimestamp(
+                file_path.stat().st_mtime
+            ).strftime("%Y-%m-%d %H:%M:%S")
+
             try:
                 with file_path.open("r", encoding="utf-8") as f:
                     file_content = f.read()
@@ -127,16 +85,21 @@ def create_markdown_file(path, output, gitignore, filter, exclude, suppress_comm
                             file_content = strip_comments(file_content, language)
             except UnicodeDecodeError:
                 continue
+
             file_info = f"## File: {file_path}\n\n"
             file_info += f"- Extension: {file_extension}\n"
             file_info += f"- Size: {file_size} bytes\n"
             file_info += f"- Created: {file_creation_time}\n"
             file_info += f"- Modified: {file_modification_time}\n\n"
-            file_code = f"### Code\n```{file_extension}\n{file_content}\n```\n\n"
+            file_code = f"### Code\n```{file_extension}\n{file_content}\n```\n\n"
             content.append(file_info + file_code)
-            table_of_contents.append(f"- [{file_path}](#{file_path.as_posix().replace('/', '')})\n")
-    
-    markdown_content = "# Table of Contents\n" + "".join(table_of_contents) + "\n" + "".join(content)
+            table_of_contents.append(
+                f"- [{file_path}](#{file_path.as_posix().replace('/', '')})\n"
+            )
+
+    markdown_content = (
+        "# Table of Contents\n" + "".join(table_of_contents) + "\n" + "".join(content)
+    )
     if output:
         output_path = Path(output)
         with output_path.open("w", encoding="utf-8") as md_file:
@@ -145,5 +108,8 @@ def create_markdown_file(path, output, gitignore, filter, exclude, suppress_comm
     else:
         click.echo(markdown_content)
 
+
 if __name__ == "__main__":
+    # pylint: disable=no-value-for-parameter
+    # pylint: disable=E1120
     create_markdown_file()
diff --git a/tests/test_code2prompt.py b/tests/test_code2prompt.py
index 9a4d7ad..a1e6cfc 100644
--- a/tests/test_code2prompt.py
+++ b/tests/test_code2prompt.py
@@ -2,7 +2,8 @@
 import tempfile
 from pathlib import Path
 from click.testing import CliRunner
-from code2prompt.main import create_markdown_file, parse_gitignore, is_ignored, is_filtered, is_binary
+from code2prompt.file_handling import parse_gitignore, is_ignored, is_filtered, is_binary
+from code2prompt.main import create_markdown_file
 
 def test_parse_gitignore():
     with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file: