Skip to content

Commit

Permalink
restructure the code
Browse files Browse the repository at this point in the history
  • Loading branch information
raphaelmansuy committed Jun 28, 2024
1 parent 79370e5 commit 60bf191
Show file tree
Hide file tree
Showing 13 changed files with 226 additions and 91 deletions.
8 changes: 8 additions & 0 deletions code2prompt/comment_stripper/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from .c_style import strip_c_style_comments
from .html_style import strip_html_style_comments
from .python_style import strip_python_style_comments
from .shell_style import strip_shell_style_comments
from .sql_style import strip_sql_style_comments
from .matlab_style import strip_matlab_style_comments
from .r_style import strip_r_style_comments
from .strip_comments import strip_comments
12 changes: 12 additions & 0 deletions code2prompt/comment_stripper/c_style.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import re

def strip_c_style_comments(code: str) -> str:
    """Remove ``//`` line comments and ``/* ... */`` block comments from *code*.

    Quoted literals (single- or double-quoted) are matched by the same
    pattern so that comment markers appearing inside them are left alone.

    :param code: Source text in a C-like language.
    :return: The text with comments replaced by the empty string.
    """
    comment_or_string = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE,
    )

    def keep_strings_only(found):
        token = found.group(0)
        # Anything that opens with a quote is a literal, not a comment.
        if token.startswith(("'", '"')):
            return token
        return ""

    return comment_or_string.sub(keep_strings_only, code)
5 changes: 5 additions & 0 deletions code2prompt/comment_stripper/html_style.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import re

def strip_html_style_comments(code: str) -> str:
    """Remove every ``<!-- ... -->`` comment from *code*.

    The match is non-greedy and spans newlines, so multi-line comments are
    removed as a whole.

    :param code: HTML/XML text.
    :return: The text without comment sections.
    """
    return re.sub(r"<!--.*?-->", "", code, flags=re.DOTALL)
12 changes: 12 additions & 0 deletions code2prompt/comment_stripper/matlab_style.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import re

def strip_matlab_style_comments(code: str) -> str:
    """Remove ``%`` line comments from *code*.

    Quoted literals are matched alongside comments so that a ``%`` inside
    quotes (for example a format specifier) is preserved.

    :param code: MATLAB/Octave source text.
    :return: The text with ``%`` comments replaced by the empty string.
    """
    comment_or_string = re.compile(
        r'%.*?$|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE,
    )

    def replacement(found):
        text = found.group(0)
        is_literal = text.startswith(("'", '"'))
        return text if is_literal else ""

    return comment_or_string.sub(replacement, code)
12 changes: 12 additions & 0 deletions code2prompt/comment_stripper/python_style.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import re

def strip_python_style_comments(code: str) -> str:
    """Remove ``#`` comments and docstring-like triple-quoted strings.

    Ordinary quoted literals are matched by the same pattern so that a ``#``
    inside them is never mistaken for a comment.

    A triple-quoted string is removed only when it begins its line (nothing
    but whitespace precedes it). A triple-quoted string that follows other
    code on the same line, e.g. ``x = \"\"\"text\"\"\"``, is a value and is kept;
    removing it would leave broken code behind.

    :param code: Source text using ``#`` comments.
    :return: The text with comments and stand-alone triple-quoted strings
        replaced by the empty string.
    """
    pattern = re.compile(
        r'(?s)#.*?$|\'\'\'.*?\'\'\'|""".*?"""|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.MULTILINE,
    )

    def replace(match):
        token = match.group(0)
        if token.startswith("#"):
            return ""
        if token.startswith(("'''", '"""')):
            # Inspect what precedes the string on its own line in the
            # original text: only whitespace means it stands alone.
            line_start = code.rfind("\n", 0, match.start()) + 1
            leading = code[line_start:match.start()]
            return token if leading.strip() else ""
        return token

    return pattern.sub(replace, code)
12 changes: 12 additions & 0 deletions code2prompt/comment_stripper/r_style.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import re

def strip_r_style_comments(code: str) -> str:
    """Remove ``#`` line comments from *code*.

    Quoted literals are matched together with comments so that a ``#``
    inside quotes is kept.

    :param code: R source text.
    :return: The text with ``#`` comments replaced by the empty string.
    """
    comment_or_string = re.compile(
        r'#.*?$|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE,
    )

    def drop_comment(found):
        piece = found.group(0)
        if piece.startswith(("'", '"')):
            return piece
        return ""

    return comment_or_string.sub(drop_comment, code)
19 changes: 19 additions & 0 deletions code2prompt/comment_stripper/shell_style.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
def _find_shell_comment_start(line: str) -> int:
    """Return the index of the ``#`` that starts a comment in *line*, or -1.

    A ``#`` counts as a comment start only when it is outside quotes, is not
    backslash-escaped, and begins a word (start of line or preceded by
    whitespace). Quote state is tracked within the single line only.
    """
    quote = ""
    escaped = False
    for index, char in enumerate(line):
        if escaped:
            escaped = False
        elif char == "\\" and quote != "'":
            # Backslash is literal inside single quotes, an escape elsewhere.
            escaped = True
        elif quote:
            if char == quote:
                quote = ""
        elif char in ("'", '"'):
            quote = char
        elif char == "#" and (index == 0 or line[index - 1].isspace()):
            return index
    return -1


def strip_shell_style_comments(code: str) -> str:
    """Remove ``#`` comments and ``: '...'`` comment blocks from shell code.

    Lines whose stripped text starts with ``#!`` are kept verbatim. Lines that
    contain nothing but a comment are dropped entirely; blank lines are kept.
    A ``#`` inside quotes or inside a word (``$#``, ``${#var}``,
    ``${var#prefix}``) is not treated as a comment.

    :param code: Shell source text.
    :return: The text without comments, with leading and trailing whitespace
        of the whole result stripped.
    """
    kept_lines = []
    in_multiline_comment = False
    for line in code.split("\n"):
        stripped = line.strip()
        if stripped.startswith("#!"):  # Preserve shebang lines
            kept_lines.append(line)
        elif in_multiline_comment:
            if stripped.endswith("'"):
                in_multiline_comment = False
        elif stripped.startswith(": '"):
            # Stay in comment mode only if the quote is still open at the end
            # of this line; a one-line ``: 'note'`` must not swallow the code
            # that follows it.
            in_multiline_comment = "'" not in stripped[3:]
        else:
            comment_start = _find_shell_comment_start(line)
            if comment_start == -1:
                kept_lines.append(line)
            else:
                code_part = line[:comment_start]
                if code_part.strip():
                    kept_lines.append(code_part)
    return "\n".join(kept_lines).strip()
12 changes: 12 additions & 0 deletions code2prompt/comment_stripper/sql_style.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import re

def strip_sql_style_comments(code: str) -> str:
    """Remove ``--`` line comments and ``/* ... */`` block comments from *code*.

    Quoted literals are matched by the same pattern so that comment markers
    inside them are preserved.

    :param code: SQL text.
    :return: The text with comments replaced by the empty string.
    """
    comment_or_string = re.compile(
        r'--.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE,
    )

    def without_comment(found):
        fragment = found.group(0)
        quoted = fragment.startswith(("'", '"'))
        return fragment if quoted else ""

    return comment_or_string.sub(without_comment, code)
27 changes: 27 additions & 0 deletions code2prompt/comment_stripper/strip_comments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from .c_style import strip_c_style_comments
from .html_style import strip_html_style_comments
from .python_style import strip_python_style_comments
from .shell_style import strip_shell_style_comments
from .sql_style import strip_sql_style_comments
from .matlab_style import strip_matlab_style_comments
from .r_style import strip_r_style_comments

def strip_comments(code: str, language: str) -> str:
    """Strip comments from *code* using the stripper for *language*.

    :param code: Source text to clean.
    :param language: Language name; compared exactly against the names
        listed below.
    :return: The text with comments removed, or *code* unchanged when the
        language is not one of the listed names.
    """
    if language in (
        "c", "cpp", "java", "javascript", "csharp", "php",
        "go", "rust", "kotlin", "swift", "scala", "dart",
    ):
        return strip_c_style_comments(code)
    if language in ("python", "ruby", "perl"):
        return strip_python_style_comments(code)
    if language in ("bash", "powershell", "shell"):
        return strip_shell_style_comments(code)
    if language in ("html", "xml"):
        return strip_html_style_comments(code)
    if language in ("sql", "plsql", "tsql"):
        return strip_sql_style_comments(code)
    if language in ("matlab", "octave"):
        return strip_matlab_style_comments(code)
    if language == "r":
        return strip_r_style_comments(code)
    # No stripper for this language: hand the text back untouched.
    return code
54 changes: 54 additions & 0 deletions code2prompt/file_handling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from pathlib import Path
from fnmatch import fnmatch

def parse_gitignore(gitignore_path):
    """Read a .gitignore file and return its patterns as a set.

    Blank lines and lines that begin with ``#`` are skipped; every other line
    is stored with surrounding whitespace removed.

    :param gitignore_path: ``Path`` of the ignore file.
    :return: Set of pattern strings; empty when the file does not exist.
    """
    if not gitignore_path.exists():
        return set()
    rules = set()
    with gitignore_path.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            if raw_line.startswith("#"):
                continue
            entry = raw_line.strip()
            if entry:
                rules.add(entry)
    return rules

def is_ignored(file_path: Path, gitignore_patterns: list, base_path: Path) -> bool:
    """Tell whether *file_path* matches any of the ignore patterns.

    The path is made relative to *base_path* and each pattern has trailing
    slashes removed before matching with ``fnmatch``. A pattern with a leading
    ``/`` is compared against the relative path and its direct parent only;
    any other pattern is also compared against every ancestor directory and
    against each ancestor joined with the file name.

    :param file_path: Path of the file to test; must be located under
        *base_path*.
    :param gitignore_patterns: Iterable of pattern strings.
    :param base_path: Directory the patterns are relative to.
    :return: ``True`` if any pattern matches, otherwise ``False``.
    """
    rel = file_path.relative_to(base_path)
    rel_text = str(rel)
    for raw_pattern in gitignore_patterns:
        glob = raw_pattern.rstrip("/")
        if glob.startswith("/"):
            anchored = glob[1:]
            if fnmatch(rel_text, anchored) or fnmatch(str(rel.parent), anchored):
                return True
            continue
        for ancestor in rel.parents:
            if fnmatch(str(ancestor / rel.name), glob) or fnmatch(str(ancestor), glob):
                return True
        if fnmatch(rel_text, glob):
            return True
    return False

def is_filtered(file_path, include_pattern="", exclude_pattern="", case_sensitive=False):
    """Decide whether a file passes the include/exclude filters.

    Only the file name (not the directory part) is matched. An empty or
    ``None`` include list means every file is included.

    :param file_path: ``Path`` of the file to check.
    :param include_pattern: Comma-separated glob patterns to include.
    :param exclude_pattern: Comma-separated glob patterns to exclude.
    :param case_sensitive: When false, the name and the patterns are
        lower-cased before matching; when true, both are compared as given.
    :return: ``True`` if the file should be kept, ``False`` otherwise.
    """
    # fnmatchcase never normalises case, so the flag behaves the same on
    # every platform (plain fnmatch ignores case on Windows).
    from fnmatch import fnmatchcase

    def normalize(text):
        return text if case_sensitive else text.lower()

    def split_patterns(raw):
        return [normalize(part.strip()) for part in (raw or "").split(",") if part.strip()]

    def matches_any(patterns):
        return any(fnmatchcase(file_name, pattern) for pattern in patterns)

    file_name = normalize(file_path.name)
    include_patterns = split_patterns(include_pattern)
    exclude_patterns = split_patterns(exclude_pattern)

    if include_patterns and not matches_any(include_patterns):
        return False
    return not matches_any(exclude_patterns)

def is_binary(file_path):
    """Report whether the file looks binary.

    The first 1024 bytes are read and the file is considered binary when they
    contain a NUL byte.

    :param file_path: Path of the file to inspect.
    :return: ``True`` if a NUL byte is found; ``False`` otherwise, including
        when the file cannot be opened (a message is printed in that case).
    """
    try:
        with open(file_path, "rb") as handle:
            head = handle.read(1024)
    except IOError:
        print(f"Error: The file at {file_path} could not be opened.")
        return False
    return b"\x00" in head
5 changes: 0 additions & 5 deletions code2prompt/language_inference.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
import os

def infer_language(filename: str) -> str:
"""Infers the programming language based on the file extension.
:param filename: The name of the file.
:return: The inferred programming language.
"""
_, extension = os.path.splitext(filename)
extension = extension.lower()
if extension in [".c", ".h"]:
Expand Down
136 changes: 51 additions & 85 deletions code2prompt/main.py
Original file line number Diff line number Diff line change
@@ -1,112 +1,65 @@
from datetime import datetime
from pathlib import Path
from fnmatch import fnmatch
import click
from code2prompt.language_inference import infer_language
from code2prompt.comment_stripper import strip_comments
from code2prompt.file_handling import (
parse_gitignore,
is_ignored,
is_filtered,
is_binary,
)

def parse_gitignore(gitignore_path):
"""Parse the .gitignore file and return a set of patterns."""
if not gitignore_path.exists():
return set()
with gitignore_path.open("r", encoding="utf-8") as file:
patterns = set(line.strip() for line in file if line.strip() and not line.startswith("#"))
return patterns

def is_ignored(file_path: Path, gitignore_patterns: list, base_path: Path) -> bool:
"""Check if a file path matches any pattern in the .gitignore file."""
relative_path = file_path.relative_to(base_path)
for pattern in gitignore_patterns:
pattern = pattern.rstrip("/") # Remove trailing slash from the pattern
if pattern.startswith("/"):
if fnmatch(str(relative_path), pattern[1:]):
return True
if fnmatch(str(relative_path.parent), pattern[1:]):
return True
else:
for path in relative_path.parents:
if fnmatch(str(path / relative_path.name), pattern):
return True
if fnmatch(str(path), pattern):
return True
if fnmatch(str(relative_path), pattern):
return True
return False

def is_filtered(file_path, include_pattern="", exclude_pattern="", case_sensitive=False):
"""
Check if a file path matches the include patterns and doesn't match the exclude patterns.
Args:
file_path (Path): The path of the file to check.
include_pattern (str): Comma-separated list of inclusion patterns.
exclude_pattern (str): Comma-separated list of exclusion patterns.
case_sensitive (bool): Whether to perform case-sensitive matching.
Returns:
bool: True if the file should be included, False otherwise.
"""
def match_patterns(file_name, patterns):
return any(fnmatch(file_name, pattern) for pattern in patterns)

file_name = file_path.name
if not case_sensitive:
file_name = file_name.lower()

include_patterns = [p.strip().lower() for p in (include_pattern or "").split(',') if p.strip()]
exclude_patterns = [p.strip().lower() for p in (exclude_pattern or "").split(',') if p.strip()]

if not include_patterns:
include_match = True
else:
include_match = match_patterns(file_name, include_patterns)

exclude_match = match_patterns(file_name, exclude_patterns)

return include_match and not exclude_match

def is_binary(file_path):
"""Determines if the specified file is a binary file."""
try:
with open(file_path, "rb") as file:
chunk = file.read(1024)
return b"\x00" in chunk
except IOError:
print(f"Error: The file at {file_path} could not be opened.")
return False

@click.command()
@click.option(
"--path", "-p", type=click.Path(exists=True), required=True, help="Path to the directory to navigate."
"--path",
"-p",
type=click.Path(exists=True),
required=True,
help="Path to the directory to navigate.",
)
@click.option(
"--output", "-o", type=click.Path(), help="Name of the output Markdown file."
)
@click.option(
"--gitignore", "-g", type=click.Path(exists=True), help="Path to the .gitignore file."
"--gitignore",
"-g",
type=click.Path(exists=True),
help="Path to the .gitignore file.",
)
@click.option(
"--filter", "-f", type=str, help='Comma-separated filter patterns to include files (e.g., "*.py,*.js").'
"--filter",
"-f",
type=str,
help='Comma-separated filter patterns to include files (e.g., "*.py,*.js").',
)
@click.option(
"--exclude", "-e", type=str, help='Comma-separated patterns to exclude files (e.g., "*.txt,*.md").'
"--exclude",
"-e",
type=str,
help='Comma-separated patterns to exclude files (e.g., "*.txt,*.md").',
)
@click.option(
"--case-sensitive", is_flag=True, help="Perform case-sensitive pattern matching."
)
@click.option(
"--suppress-comments", "-s", is_flag=True, help="Strip comments from the code files.", default=False
"--suppress-comments",
"-s",
is_flag=True,
help="Strip comments from the code files.",
default=False,
)
def create_markdown_file(path, output, gitignore, filter, exclude, suppress_comments, case_sensitive):
"""Create a Markdown file with the content of files in a directory."""
def create_markdown_file(
path, output, gitignore, filter, exclude, suppress_comments, case_sensitive
):
content = []
table_of_contents = []
path = Path(path)

gitignore_path = Path(gitignore) if gitignore else path / ".gitignore"
gitignore_patterns = parse_gitignore(gitignore_path)
gitignore_patterns.add(".git")

for file_path in path.rglob("*"):
if (
file_path.is_file()
Expand All @@ -116,8 +69,13 @@ def create_markdown_file(path, output, gitignore, filter, exclude, suppress_comm
):
file_extension = file_path.suffix
file_size = file_path.stat().st_size
file_creation_time = datetime.fromtimestamp(file_path.stat().st_ctime).strftime("%Y-%m-%d %H:%M:%S")
file_modification_time = datetime.fromtimestamp(file_path.stat().st_mtime).strftime("%Y-%m-%d %H:%M:%S")
file_creation_time = datetime.fromtimestamp(
file_path.stat().st_ctime
).strftime("%Y-%m-%d %H:%M:%S")
file_modification_time = datetime.fromtimestamp(
file_path.stat().st_mtime
).strftime("%Y-%m-%d %H:%M:%S")

try:
with file_path.open("r", encoding="utf-8") as f:
file_content = f.read()
Expand All @@ -127,16 +85,21 @@ def create_markdown_file(path, output, gitignore, filter, exclude, suppress_comm
file_content = strip_comments(file_content, language)
except UnicodeDecodeError:
continue

file_info = f"## File: {file_path}\n\n"
file_info += f"- Extension: {file_extension}\n"
file_info += f"- Size: {file_size} bytes\n"
file_info += f"- Created: {file_creation_time}\n"
file_info += f"- Modified: {file_modification_time}\n\n"
file_code = f"### Code\n```{file_extension}\n{file_content}\n```\n\n"
file_code = f"### Code\n\n\n"
content.append(file_info + file_code)
table_of_contents.append(f"- [{file_path}](#{file_path.as_posix().replace('/', '')})\n")

markdown_content = "# Table of Contents\n" + "".join(table_of_contents) + "\n" + "".join(content)
table_of_contents.append(
f"- [{file_path}](#{file_path.as_posix().replace('/', '')})\n"
)

markdown_content = (
"# Table of Contents\n" + "".join(table_of_contents) + "\n" + "".join(content)
)
if output:
output_path = Path(output)
with output_path.open("w", encoding="utf-8") as md_file:
Expand All @@ -145,5 +108,8 @@ def create_markdown_file(path, output, gitignore, filter, exclude, suppress_comm
else:
click.echo(markdown_content)


if __name__ == "__main__":
# pylint: disable=no-value-for-parameter
# pylint: disable=E1120
create_markdown_file()
Loading

0 comments on commit 60bf191

Please sign in to comment.