forked from cyclotruc/gitingest
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathingest.py
183 lines (149 loc) · 6.65 KB
/
ingest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import fnmatch
import os
from typing import Dict, List, Tuple, Union
# Glob patterns (fnmatch syntax) for entries that are skipped by default.
DEFAULT_IGNORE_PATTERNS = [
    # Python bytecode and caches
    '*.pyc',
    '*.pyo',
    '*.pyd',
    '__pycache__',
    # JavaScript dependency folders
    'node_modules',
    'bower_components',
    # Version control metadata
    '.git',
    '.svn',
    '.hg',
    '.gitignore',
    # Virtual environments
    'venv',
    '.venv',
    'env',
    # IDE settings
    '.idea',
    '.vscode',
    # Temporary files
    '*.log',
    '*.bak',
    '*.swp',
    '*.tmp',
    # macOS
    '.DS_Store',
    # Windows
    'Thumbs.db',
    # Build directories
    'build',
    'dist',
    # Python egg info
    '*.egg-info',
    # Compiled libraries
    '*.so',
    '*.dylib',
    '*.dll',
]
def should_ignore(path: str, base_path: str, ignore_patterns: List[str]) -> bool:
    """Return True when *path* matches any of the given ignore patterns.

    Every pattern is tested with ``fnmatch`` against both the final path
    component and the path expressed relative to *base_path*.
    """
    leaf = os.path.basename(path)
    relative = os.path.relpath(path, base_path)
    candidates = (leaf, relative)
    return any(
        fnmatch.fnmatch(candidate, glob)
        for glob in ignore_patterns
        for candidate in candidates
    )
def is_text_file(file_path: str) -> bool:
    """Guess whether a file is text by inspecting its first 1024 bytes.

    Returns True when the sample consists solely of accepted bytes (a few
    control characters plus everything from 0x20 to 0xFF); an empty file
    therefore counts as text. Returns False when the file cannot be read.
    """
    # BEL, BS, TAB, LF, FF, CR and ESC, followed by every byte >= 0x20.
    accepted = bytes([7, 8, 9, 10, 12, 13, 27] + list(range(0x20, 0x100)))
    try:
        with open(file_path, 'rb') as stream:
            sample = stream.read(1024)
    except IOError:
        return False
    # Deleting every accepted byte leaves only the suspicious ones behind.
    leftovers = sample.translate(None, accepted)
    return len(leftovers) == 0
def read_file_content(file_path: str) -> str:
    """Return the file's text decoded as UTF-8, dropping undecodable bytes.

    Any failure is reported through the return value (a string starting with
    "Error reading file: ") rather than by raising.
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as stream:
            text = stream.read()
    except Exception as exc:
        return f"Error reading file: {exc!s}"
    else:
        return text
def analyze_directory(path: str, ignore_patterns: List[str], base_path: str) -> Dict:
    """Recursively analyze a directory and describe its contents.

    Args:
        path: Directory to scan.
        ignore_patterns: fnmatch-style patterns; matching entries are skipped.
        base_path: Root of the scan, used to build the relative path that
            patterns are matched against.

    Returns:
        A dict with the keys "name", "type" ("directory"), "size" (total
        bytes of the included files), "children", "file_count", "dir_count"
        and "path". File children carry "name", "type" ("file"), "size",
        "content" and "path"; directory children have this same shape.
        If listing the directory is denied, a message is printed and
        whatever was collected so far is returned.
    """
    result = {
        # abspath keeps the name meaningful for inputs such as "." or "repo/",
        # whose plain basename would be "." or an empty string.
        "name": os.path.basename(os.path.abspath(path)),
        "type": "directory",
        "size": 0,
        "children": [],
        "file_count": 0,
        "dir_count": 0,
        "path": path
    }
    try:
        # os.listdir returns entries in arbitrary order; sort them so the
        # resulting tree and file dump are reproducible across runs.
        for item in sorted(os.listdir(path)):
            item_path = os.path.join(path, item)
            if should_ignore(item_path, base_path, ignore_patterns):
                continue
            if os.path.isfile(item_path):
                file_size = os.path.getsize(item_path)
                is_text = is_text_file(item_path)
                content = read_file_content(item_path) if is_text else "[Non-text file]"
                result["children"].append({
                    "name": item,
                    "type": "file",
                    "size": file_size,
                    "content": content,
                    "path": item_path
                })
                result["size"] += file_size
                result["file_count"] += 1
            elif os.path.isdir(item_path):
                # The recursive call always returns a populated dict, so no
                # emptiness check is needed before merging its totals.
                subdir = analyze_directory(item_path, ignore_patterns, base_path)
                result["children"].append(subdir)
                result["size"] += subdir["size"]
                result["file_count"] += subdir["file_count"]
                result["dir_count"] += 1 + subdir["dir_count"]
    except PermissionError:
        print(f"Permission denied: {path}")
    return result
def get_all_files(node: Dict, max_file_size: int, files: List = None) -> List[Dict]:
    """Collect every text file below *node* in depth-first, listing order.

    Nodes whose content is the "[Non-text file]" marker are skipped. Files
    whose size exceeds *max_file_size* are kept, but their content is
    replaced by a placeholder. Results are appended to *files* when given,
    otherwise to a new list, and that list is returned.
    """
    collected = [] if files is None else files
    # Explicit stack instead of recursion; children are pushed reversed so
    # they are popped in their original order.
    pending = [node]
    while pending:
        current = pending.pop()
        kind = current["type"]
        if kind == "file" and current["content"] != "[Non-text file]":
            text = current["content"]
            if current["size"] > max_file_size:
                text = "[Content ignored: file too large]"
            collected.append({
                "path": current["path"],
                "content": text,
                "size": current["size"]
            })
        elif kind == "directory":
            pending.extend(reversed(list(current["children"])))
    return collected
def create_file_content_string(files: List[Dict]) -> str:
    """Render each file as a ruled header followed by its content.

    A section is: a 50-character "=" rule, a "File: <path>" line, the rule
    again, then the content followed by a blank line. Sections are
    concatenated in input order; an empty input yields an empty string.
    """
    rule = "=" * 50 + "\n"
    sections = [
        rule + f"File: {entry['path']}\n" + rule + f"{entry['content']}\n\n"
        for entry in files
    ]
    return "".join(sections)
def create_summary_string(result: Dict, files: List[Dict]) -> str:
    """Build the three-line summary: file count, directory count, text size.

    The counts are read from *result*; the size is the UTF-8 encoded length
    of every entry's content in *files*, reported in KB with two decimals.
    """
    byte_total = 0
    for entry in files:
        byte_total += len(entry["content"].encode('utf-8'))
    size_kb = byte_total / 1024
    lines = [
        f"Files analyzed: {result['file_count']}\n",
        f"Directories analyzed: {result['dir_count']}\n",
        f"Actual text content size: {size_kb:.2f} KB\n",
    ]
    return "".join(lines)
def create_tree_structure(node: Dict, prefix: str = "", is_last: bool = True) -> str:
    """Create a tree-like string representation of the file structure.

    Args:
        node: Dictionary with at least "name" and "type"; directory nodes
            also provide "children".
        prefix: Text placed before this node's connector; it carries the
            vertical guides of the enclosing levels.
        is_last: Whether this node is the last item at its level, which
            selects the corner connector instead of the tee.

    Returns:
        One line per node, each terminated by a newline.
    """
    connector = "└── " if is_last else "├── "
    lines = [prefix + connector + node["name"] + "\n"]
    if node["type"] == "directory":
        # The continuation must be exactly as wide as the 4-character
        # connector, otherwise nested levels do not line up under their parent.
        child_prefix = prefix + ("    " if is_last else "│   ")
        children = node["children"]
        last_index = len(children) - 1
        for index, child in enumerate(children):
            lines.append(create_tree_structure(child, child_prefix, index == last_index))
    return "".join(lines)
def analyze_codebase(path: str, ignore_patterns: List[str] = None, max_file_size: int = 10000000) -> Tuple[str, str, str]:
    """Analyze a codebase directory and write a text dump of it.

    The dump (directory tree followed by the file contents) is written to
    "../tmp/<repo_name>.txt", relative to the current working directory.

    Args:
        path: Directory to analyze.
        ignore_patterns: fnmatch-style patterns to skip;
            DEFAULT_IGNORE_PATTERNS is used when None.
        max_file_size: Files larger than this many bytes are listed with a
            placeholder instead of their content.

    Returns:
        A tuple ``(summary, tree, formatted_content)`` of strings.

    Raises:
        ValueError: If *path* does not exist.
    """
    if not os.path.exists(path):
        raise ValueError(f"Path {path} does not exist")
    patterns = DEFAULT_IGNORE_PATTERNS if ignore_patterns is None else ignore_patterns
    result = analyze_directory(path, patterns, path)
    # abspath so that inputs like "." or "repo/" still yield a usable name.
    repo_name = os.path.basename(os.path.abspath(path))
    files = get_all_files(result, max_file_size)
    summary = create_summary_string(result, files)
    tree = "Directory Structure:\n" + create_tree_structure(result)
    formatted_content = create_file_content_string(files)
    txt_dump = tree + "\n" + formatted_content

    # Create the output directory on first use instead of failing the write.
    output_dir = "../tmp"
    os.makedirs(output_dir, exist_ok=True)
    # The tree uses box-drawing characters and file contents may hold any
    # Unicode text, so write UTF-8 explicitly rather than the locale default.
    with open(f"{output_dir}/{repo_name}.txt", "w", encoding="utf-8") as f:
        f.write(txt_dump)
    return (summary, tree, formatted_content)