From 60bf1910a45e04af0fe921e08d04e4400d175592 Mon Sep 17 00:00:00 2001
From: Raphael MANSUY
Date: Fri, 28 Jun 2024 09:46:52 +0800
Subject: [PATCH] restructure the code

Add a code2prompt.comment_stripper package with one module per comment
style, and move parse_gitignore, is_ignored, is_filtered and is_binary
from main.py into code2prompt/file_handling.py.

---
 code2prompt/comment_stripper/__init__.py      |   8 ++
 code2prompt/comment_stripper/c_style.py       |  12 ++
 code2prompt/comment_stripper/html_style.py    |   5 +
 code2prompt/comment_stripper/matlab_style.py  |  12 ++
 code2prompt/comment_stripper/python_style.py  |  12 ++
 code2prompt/comment_stripper/r_style.py       |  12 ++
 code2prompt/comment_stripper/shell_style.py   |  19 +++
 code2prompt/comment_stripper/sql_style.py     |  12 ++
 .../comment_stripper/strip_comments.py        |  27 ++++
 code2prompt/file_handling.py                  |  54 +++++++
 code2prompt/language_inference.py             |   5 -
 code2prompt/main.py                           | 136 +++++++-----------
 tests/test_code2prompt.py                     |   3 +-
 13 files changed, 226 insertions(+), 91 deletions(-)
 create mode 100644 code2prompt/comment_stripper/__init__.py
 create mode 100644 code2prompt/comment_stripper/c_style.py
 create mode 100644 code2prompt/comment_stripper/html_style.py
 create mode 100644 code2prompt/comment_stripper/matlab_style.py
 create mode 100644 code2prompt/comment_stripper/python_style.py
 create mode 100644 code2prompt/comment_stripper/r_style.py
 create mode 100644 code2prompt/comment_stripper/shell_style.py
 create mode 100644 code2prompt/comment_stripper/sql_style.py
 create mode 100644 code2prompt/comment_stripper/strip_comments.py
 create mode 100644 code2prompt/file_handling.py

diff --git a/code2prompt/comment_stripper/__init__.py b/code2prompt/comment_stripper/__init__.py
new file mode 100644
index 0000000..39204f9
--- /dev/null
+++ b/code2prompt/comment_stripper/__init__.py
@@ -0,0 +1,8 @@
+from .c_style import strip_c_style_comments
+from .html_style import strip_html_style_comments
+from .python_style import strip_python_style_comments
+from .shell_style import strip_shell_style_comments
+from .sql_style import strip_sql_style_comments
+from .matlab_style import strip_matlab_style_comments
+from .r_style import strip_r_style_comments
+from .strip_comments import strip_comments
diff --git a/code2prompt/comment_stripper/c_style.py b/code2prompt/comment_stripper/c_style.py
new file mode 100644
index 0000000..da064ae
--- /dev/null
+++ b/code2prompt/comment_stripper/c_style.py
@@ -0,0 +1,12 @@
+import re
+
+def strip_c_style_comments(code: str) -> str:  # Drop // and /* */ comments, keep quoted strings
+    pattern = re.compile(
+        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',  # comment forms, then '...' and "..." literals
+        re.DOTALL | re.MULTILINE,  # /* */ may span lines; $ ends a // comment at each newline
+    )
+    return re.sub(
+        pattern,
+        lambda match: match.group(0) if match.group(0).startswith(("'", '"')) else "",
+        code,
+    )
diff --git a/code2prompt/comment_stripper/html_style.py b/code2prompt/comment_stripper/html_style.py
new file mode 100644
index 0000000..e6e938d
--- /dev/null
+++ b/code2prompt/comment_stripper/html_style.py
@@ -0,0 +1,5 @@
+import re
+
+def strip_html_style_comments(code: str) -> str:  # Drop <!-- ... --> comments, including multi-line ones
+    pattern = re.compile(r"<!--.*?-->", re.DOTALL)  # an empty pattern here would strip nothing
+    return re.sub(pattern, "", code)
diff --git a/code2prompt/comment_stripper/matlab_style.py b/code2prompt/comment_stripper/matlab_style.py
new file mode 100644
index 0000000..6026f89
--- /dev/null
+++ b/code2prompt/comment_stripper/matlab_style.py
@@ -0,0 +1,12 @@
+import re
+
+def strip_matlab_style_comments(code: str) -> str:  # Drop % comments, keep quoted strings
+    pattern = re.compile(
+        r'%.*?$|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+        re.DOTALL | re.MULTILINE,
+    )
+    return re.sub(
+        pattern,
+        lambda match: match.group(0) if match.group(0).startswith(("'", '"')) else "",
+        code,
+    )
diff --git a/code2prompt/comment_stripper/python_style.py b/code2prompt/comment_stripper/python_style.py
new file mode 100644
index 0000000..091778c
--- /dev/null
+++ b/code2prompt/comment_stripper/python_style.py
@@ -0,0 +1,12 @@
+import re
+
+def strip_python_style_comments(code: str) -> str:  # Drop # comments and ALL triple-quoted strings (docstrings or data)
+    pattern = re.compile(
+        r'(?s)#.*?$|\'\'\'.*?\'\'\'|""".*?"""|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',  # triple quotes are tried before single quotes
+        re.MULTILINE,
+    )
+    return re.sub(
+        pattern,
+        lambda match: ("" if match.group(0).startswith(("#", "'''", '"""')) else match.group(0)),
+        code,
+    )
diff --git a/code2prompt/comment_stripper/r_style.py b/code2prompt/comment_stripper/r_style.py
new file mode 100644
index 0000000..db2c7a8
--- /dev/null
+++ b/code2prompt/comment_stripper/r_style.py
@@ -0,0 +1,12 @@
+import re
+
+def strip_r_style_comments(code: str) -> str:  # Drop # comments, keep quoted strings
+    pattern = re.compile(
+        r'#.*?$|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+        re.DOTALL | re.MULTILINE,
+    )
+    return re.sub(
+        pattern,
+        lambda match: match.group(0) if match.group(0).startswith(("'", '"')) else "",
+        code,
+    )
diff --git a/code2prompt/comment_stripper/shell_style.py b/code2prompt/comment_stripper/shell_style.py
new file mode 100644
index 0000000..7395823
--- /dev/null
+++ b/code2prompt/comment_stripper/shell_style.py
@@ -0,0 +1,19 @@
+def strip_shell_style_comments(code: str) -> str:  # Drop # comments and ": '...'" blocks; keep shebang lines
+    lines = code.split("\n")
+    new_lines = []
+    in_multiline_comment = False
+    for line in lines:
+        if line.strip().startswith("#!"):  # Preserve shebang lines
+            new_lines.append(line)
+        elif in_multiline_comment:
+            if line.strip().endswith("'"):
+                in_multiline_comment = False
+        elif line.strip().startswith(": '"):
+            in_multiline_comment = not line.strip()[3:].endswith("'")  # ": 'note'" closes on its own line
+        elif "#" in line:  # Remove single-line comments (cuts at the first "#", even inside quotes)
+            line = line.split("#", 1)[0]
+            if line.strip():  # lines that were only a comment are dropped entirely
+                new_lines.append(line)
+        else:
+            new_lines.append(line)
+    return "\n".join(new_lines).strip()
diff --git a/code2prompt/comment_stripper/sql_style.py b/code2prompt/comment_stripper/sql_style.py
new file mode 100644
index 0000000..10e7604
--- /dev/null
+++ b/code2prompt/comment_stripper/sql_style.py
@@ -0,0 +1,12 @@
+import re
+
+def strip_sql_style_comments(code: str) -> str:  # Drop -- and /* */ comments, keep quoted strings
+    pattern = re.compile(
+        r'--.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+        re.DOTALL | re.MULTILINE,
+    )
+    return re.sub(
+        pattern,
+        lambda match: match.group(0) if match.group(0).startswith(("'", '"')) else "",
+        code,
+    )
diff --git a/code2prompt/comment_stripper/strip_comments.py b/code2prompt/comment_stripper/strip_comments.py new file mode
100644 index 0000000..5291503 --- /dev/null +++ b/code2prompt/comment_stripper/strip_comments.py @@ -0,0 +1,27 @@
+from .c_style import strip_c_style_comments
+from .html_style import strip_html_style_comments
+from .python_style import strip_python_style_comments
+from .shell_style import strip_shell_style_comments
+from .sql_style import strip_sql_style_comments
+from .matlab_style import strip_matlab_style_comments
+from .r_style import strip_r_style_comments
+
+def strip_comments(code: str, language: str) -> str:  # Dispatch on `language`; unknown languages pass through unchanged
+    if language in [
+        "c", "cpp", "java", "javascript", "csharp", "php", "go", "rust", "kotlin", "swift", "scala", "dart",
+    ]:
+        return strip_c_style_comments(code)
+    elif language in ["python", "ruby", "perl"]:
+        return strip_python_style_comments(code)
+    elif language in ["bash", "powershell", "shell"]:
+        return strip_shell_style_comments(code)
+    elif language in ["html", "xml"]:
+        return strip_html_style_comments(code)
+    elif language in ["sql", "plsql", "tsql"]:
+        return strip_sql_style_comments(code)
+    elif language in ["matlab", "octave"]:
+        return strip_matlab_style_comments(code)
+    elif language in ["r"]:
+        return strip_r_style_comments(code)
+    else:
+        return code
diff --git a/code2prompt/file_handling.py b/code2prompt/file_handling.py
new file mode 100644
index 0000000..0acf1ae
--- /dev/null
+++ b/code2prompt/file_handling.py
@@ -0,0 +1,54 @@
+from pathlib import Path
+from fnmatch import fnmatch, fnmatchcase
+
+def parse_gitignore(gitignore_path):  # Return the set of non-blank, non-comment lines; empty set if the file is missing
+    if not gitignore_path.exists():
+        return set()
+    with gitignore_path.open("r", encoding="utf-8") as file:
+        patterns = set(line.strip() for line in file if line.strip() and not line.startswith("#"))
+    return patterns
+
+def is_ignored(file_path: Path, gitignore_patterns: list, base_path: Path) -> bool:  # True if the file or a parent directory matches a pattern
+    relative_path = file_path.relative_to(base_path)
+    for pattern in gitignore_patterns:
+        pattern = pattern.rstrip("/")
+        if pattern.startswith("/"):  # leading "/" is dropped and matched against the path relative to base_path
+            if fnmatch(str(relative_path), pattern[1:]):
+                return True
+            if fnmatch(str(relative_path.parent), pattern[1:]):
+                return True
+        else:
+            for path in relative_path.parents:
+                if fnmatch(str(path / relative_path.name), pattern):
+                    return True
+                if fnmatch(str(path), pattern):
+                    return True
+            if fnmatch(str(relative_path), pattern):
+                return True
+    return False
+
+def is_filtered(file_path, include_pattern="", exclude_pattern="", case_sensitive=False):  # True if the name matches an include pattern (or none given) and no exclude pattern
+    def match_patterns(file_name, patterns):
+        return any(fnmatchcase(file_name, pattern) for pattern in patterns)  # plain fnmatch() folds case on Windows
+
+    file_name = file_path.name
+    if not case_sensitive:
+        file_name = file_name.lower()
+    include_patterns = [p.strip() if case_sensitive else p.strip().lower() for p in (include_pattern or "").split(',') if p.strip()]
+    exclude_patterns = [p.strip() if case_sensitive else p.strip().lower() for p in (exclude_pattern or "").split(',') if p.strip()]
+
+    if not include_patterns:
+        include_match = True
+    else:
+        include_match = match_patterns(file_name, include_patterns)
+    exclude_match = match_patterns(file_name, exclude_patterns)
+    return include_match and not exclude_match
+
+def is_binary(file_path):  # A NUL byte in the first 1024 bytes means binary; unreadable files are reported and treated as text
+    try:
+        with open(file_path, "rb") as file:
+            chunk = file.read(1024)
+            return b"\x00" in chunk
+    except IOError:
+        print(f"Error: The file at {file_path} could not be opened.")
+        return False
diff --git a/code2prompt/language_inference.py b/code2prompt/language_inference.py index 92691dc..ba67be5 100644 --- a/code2prompt/language_inference.py +++ b/code2prompt/language_inference.py @@ -1,11 +1,6 @@ import os def infer_language(filename: str) -> str: - """Infers the programming language based on the file extension. - - :param filename: The name of the file. - :return: The inferred programming language.
- """ _, extension = os.path.splitext(filename) extension = extension.lower() if extension in [".c", ".h"]: diff --git a/code2prompt/main.py b/code2prompt/main.py index b60344d..87f964e 100644 --- a/code2prompt/main.py +++ b/code2prompt/main.py @@ -1,112 +1,65 @@ from datetime import datetime from pathlib import Path -from fnmatch import fnmatch import click from code2prompt.language_inference import infer_language from code2prompt.comment_stripper import strip_comments +from code2prompt.file_handling import ( + parse_gitignore, + is_ignored, + is_filtered, + is_binary, +) -def parse_gitignore(gitignore_path): - """Parse the .gitignore file and return a set of patterns.""" - if not gitignore_path.exists(): - return set() - with gitignore_path.open("r", encoding="utf-8") as file: - patterns = set(line.strip() for line in file if line.strip() and not line.startswith("#")) - return patterns - -def is_ignored(file_path: Path, gitignore_patterns: list, base_path: Path) -> bool: - """Check if a file path matches any pattern in the .gitignore file.""" - relative_path = file_path.relative_to(base_path) - for pattern in gitignore_patterns: - pattern = pattern.rstrip("/") # Remove trailing slash from the pattern - if pattern.startswith("/"): - if fnmatch(str(relative_path), pattern[1:]): - return True - if fnmatch(str(relative_path.parent), pattern[1:]): - return True - else: - for path in relative_path.parents: - if fnmatch(str(path / relative_path.name), pattern): - return True - if fnmatch(str(path), pattern): - return True - if fnmatch(str(relative_path), pattern): - return True - return False - -def is_filtered(file_path, include_pattern="", exclude_pattern="", case_sensitive=False): - """ - Check if a file path matches the include patterns and doesn't match the exclude patterns. - - Args: - file_path (Path): The path of the file to check. - include_pattern (str): Comma-separated list of inclusion patterns. 
- exclude_pattern (str): Comma-separated list of exclusion patterns. - case_sensitive (bool): Whether to perform case-sensitive matching. - - Returns: - bool: True if the file should be included, False otherwise. - """ - def match_patterns(file_name, patterns): - return any(fnmatch(file_name, pattern) for pattern in patterns) - - file_name = file_path.name - if not case_sensitive: - file_name = file_name.lower() - - include_patterns = [p.strip().lower() for p in (include_pattern or "").split(',') if p.strip()] - exclude_patterns = [p.strip().lower() for p in (exclude_pattern or "").split(',') if p.strip()] - - if not include_patterns: - include_match = True - else: - include_match = match_patterns(file_name, include_patterns) - - exclude_match = match_patterns(file_name, exclude_patterns) - - return include_match and not exclude_match - -def is_binary(file_path): - """Determines if the specified file is a binary file.""" - try: - with open(file_path, "rb") as file: - chunk = file.read(1024) - return b"\x00" in chunk - except IOError: - print(f"Error: The file at {file_path} could not be opened.") - return False @click.command() @click.option( - "--path", "-p", type=click.Path(exists=True), required=True, help="Path to the directory to navigate." + "--path", + "-p", + type=click.Path(exists=True), + required=True, + help="Path to the directory to navigate.", ) @click.option( "--output", "-o", type=click.Path(), help="Name of the output Markdown file." ) @click.option( - "--gitignore", "-g", type=click.Path(exists=True), help="Path to the .gitignore file." + "--gitignore", + "-g", + type=click.Path(exists=True), + help="Path to the .gitignore file.", ) @click.option( - "--filter", "-f", type=str, help='Comma-separated filter patterns to include files (e.g., "*.py,*.js").' 
+ "--filter", + "-f", + type=str, + help='Comma-separated filter patterns to include files (e.g., "*.py,*.js").', ) @click.option( - "--exclude", "-e", type=str, help='Comma-separated patterns to exclude files (e.g., "*.txt,*.md").' + "--exclude", + "-e", + type=str, + help='Comma-separated patterns to exclude files (e.g., "*.txt,*.md").', ) @click.option( "--case-sensitive", is_flag=True, help="Perform case-sensitive pattern matching." ) @click.option( - "--suppress-comments", "-s", is_flag=True, help="Strip comments from the code files.", default=False + "--suppress-comments", + "-s", + is_flag=True, + help="Strip comments from the code files.", + default=False, ) -def create_markdown_file(path, output, gitignore, filter, exclude, suppress_comments, case_sensitive): - """Create a Markdown file with the content of files in a directory.""" +def create_markdown_file( + path, output, gitignore, filter, exclude, suppress_comments, case_sensitive +): content = [] table_of_contents = [] path = Path(path) - gitignore_path = Path(gitignore) if gitignore else path / ".gitignore" gitignore_patterns = parse_gitignore(gitignore_path) gitignore_patterns.add(".git") - + for file_path in path.rglob("*"): if ( file_path.is_file() @@ -116,8 +69,13 @@ def create_markdown_file(path, output, gitignore, filter, exclude, suppress_comm ): file_extension = file_path.suffix file_size = file_path.stat().st_size - file_creation_time = datetime.fromtimestamp(file_path.stat().st_ctime).strftime("%Y-%m-%d %H:%M:%S") - file_modification_time = datetime.fromtimestamp(file_path.stat().st_mtime).strftime("%Y-%m-%d %H:%M:%S") + file_creation_time = datetime.fromtimestamp( + file_path.stat().st_ctime + ).strftime("%Y-%m-%d %H:%M:%S") + file_modification_time = datetime.fromtimestamp( + file_path.stat().st_mtime + ).strftime("%Y-%m-%d %H:%M:%S") + try: with file_path.open("r", encoding="utf-8") as f: file_content = f.read() @@ -127,16 +85,21 @@ def create_markdown_file(path, output, gitignore, 
filter, exclude, suppress_comm file_content = strip_comments(file_content, language) except UnicodeDecodeError: continue + file_info = f"## File: {file_path}\n\n" file_info += f"- Extension: {file_extension}\n" file_info += f"- Size: {file_size} bytes\n" file_info += f"- Created: {file_creation_time}\n" file_info += f"- Modified: {file_modification_time}\n\n" - file_code = f"### Code\n```{file_extension}\n{file_content}\n```\n\n" + file_code = f"### Code\n```{file_extension}\n{file_content}\n```\n\n" content.append(file_info + file_code) - table_of_contents.append(f"- [{file_path}](#{file_path.as_posix().replace('/', '')})\n") - - markdown_content = "# Table of Contents\n" + "".join(table_of_contents) + "\n" + "".join(content) + table_of_contents.append( + f"- [{file_path}](#{file_path.as_posix().replace('/', '')})\n" + ) + + markdown_content = ( + "# Table of Contents\n" + "".join(table_of_contents) + "\n" + "".join(content) + ) if output: output_path = Path(output) with output_path.open("w", encoding="utf-8") as md_file: @@ -145,5 +108,8 @@ def create_markdown_file(path, output, gitignore, filter, exclude, suppress_comm else: click.echo(markdown_content) + if __name__ == "__main__": + # pylint: disable=no-value-for-parameter + # pylint: disable=E1120 create_markdown_file() diff --git a/tests/test_code2prompt.py b/tests/test_code2prompt.py index 9a4d7ad..a1e6cfc 100644 --- a/tests/test_code2prompt.py +++ b/tests/test_code2prompt.py @@ -2,7 +2,8 @@ import tempfile from pathlib import Path from click.testing import CliRunner -from code2prompt.main import create_markdown_file, parse_gitignore, is_ignored, is_filtered, is_binary +from code2prompt.file_handling import parse_gitignore, is_ignored, is_filtered, is_binary +from code2prompt.main import create_markdown_file def test_parse_gitignore(): with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file: