Skip to content

Commit 75ee8f7

Browse files
Refactor/gitingest structure (cyclotruc#66)
Refactor and enhance gitingest module for improved clarity, maintainability, and functionality.
1 parent 16def8a commit 75ee8f7

9 files changed

+295
-207
lines changed

src/gitingest/cli.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def main(
3737

3838
if not output:
3939
output = "digest.txt"
40-
summary, tree, content = ingest(source, max_size, include_patterns, exclude_patterns, output=output)
40+
summary, _, _ = ingest(source, max_size, include_patterns, exclude_patterns, output=output)
4141

4242
click.echo(f"Analysis complete! Output written to: {output}")
4343
click.echo("\nSummary:")

src/gitingest/clone.py

+118-53
Original file line numberDiff line numberDiff line change
@@ -1,79 +1,144 @@
11
import asyncio
2-
from typing import Any, Dict, Tuple
2+
from dataclasses import dataclass
3+
from typing import Optional, Tuple
34

4-
from gitingest.utils import async_timeout
5+
from gitingest.utils import AsyncTimeoutError, async_timeout
56

67
CLONE_TIMEOUT = 20
78

89

10+
@dataclass
11+
class CloneConfig:
12+
url: str
13+
local_path: str
14+
commit: Optional[str] = None
15+
branch: Optional[str] = None
16+
17+
918
async def check_repo_exists(url: str) -> bool:
19+
"""
20+
Check if a repository exists at the given URL using an HTTP HEAD request.
21+
22+
Parameters
23+
----------
24+
url : str
25+
The URL of the repository.
26+
27+
Returns
28+
-------
29+
bool
30+
True if the repository exists, False otherwise.
31+
"""
1032
proc = await asyncio.create_subprocess_exec(
1133
"curl",
1234
"-I",
1335
url,
1436
stdout=asyncio.subprocess.PIPE,
1537
stderr=asyncio.subprocess.PIPE,
1638
)
17-
stdout, stderr = await proc.communicate()
39+
stdout, _ = await proc.communicate()
1840
if proc.returncode != 0:
1941
return False
2042
# Check if stdout contains "404" status code
2143
stdout_str = stdout.decode()
2244
return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str
2345

2446

25-
@async_timeout(CLONE_TIMEOUT)
26-
async def clone_repo(query: Dict[str, Any]) -> Tuple[bytes, bytes]:
27-
if not await check_repo_exists(query['url']):
28-
raise ValueError("Repository not found, make sure it is public")
47+
async def run_git_command(*args: str) -> Tuple[bytes, bytes]:
48+
"""
49+
Executes a git command asynchronously and captures its output.
2950
30-
if query['commit']:
31-
proc = await asyncio.create_subprocess_exec(
32-
"git",
33-
"clone",
34-
"--single-branch",
35-
query['url'],
36-
query['local_path'],
37-
stdout=asyncio.subprocess.PIPE,
38-
stderr=asyncio.subprocess.PIPE,
39-
)
40-
stdout, stderr = await proc.communicate()
41-
42-
proc = await asyncio.create_subprocess_exec(
43-
"git",
44-
"-C",
45-
query['local_path'],
46-
"checkout",
47-
query['branch'],
48-
stdout=asyncio.subprocess.PIPE,
49-
stderr=asyncio.subprocess.PIPE,
50-
)
51-
stdout, stderr = await proc.communicate()
52-
elif query['branch'] != 'main' and query['branch'] != 'master' and query['branch']:
53-
proc = await asyncio.create_subprocess_exec(
54-
"git",
55-
"clone",
56-
"--depth=1",
57-
"--single-branch",
58-
"--branch",
59-
query['branch'],
60-
query['url'],
61-
query['local_path'],
62-
stdout=asyncio.subprocess.PIPE,
63-
stderr=asyncio.subprocess.PIPE,
64-
)
65-
else:
66-
proc = await asyncio.create_subprocess_exec(
67-
"git",
68-
"clone",
69-
"--depth=1",
70-
"--single-branch",
71-
query['url'],
72-
query['local_path'],
73-
stdout=asyncio.subprocess.PIPE,
74-
stderr=asyncio.subprocess.PIPE,
75-
)
51+
Parameters
52+
----------
53+
*args : str
54+
The git command and its arguments to execute.
7655
56+
Returns
57+
-------
58+
Tuple[bytes, bytes]
59+
A tuple containing the stdout and stderr of the git command.
60+
61+
Raises
62+
------
63+
RuntimeError
64+
If the git command exits with a non-zero status.
65+
"""
66+
proc = await asyncio.create_subprocess_exec(
67+
*args,
68+
stdout=asyncio.subprocess.PIPE,
69+
stderr=asyncio.subprocess.PIPE,
70+
)
7771
stdout, stderr = await proc.communicate()
72+
if proc.returncode != 0:
73+
error_message = stderr.decode().strip()
74+
raise RuntimeError(f"Git command failed: {' '.join(args)}\nError: {error_message}")
7875

7976
return stdout, stderr
77+
78+
79+
@async_timeout(CLONE_TIMEOUT)
80+
async def clone_repo(config: CloneConfig) -> Tuple[bytes, bytes]:
81+
"""
82+
Clones a repository to a local path based on the provided clone configuration.
83+
84+
Parameters
85+
----------
86+
config : CloneConfig
87+
A dataclass instance with the following fields:
88+
- url (str): The URL of the repository.
89+
- local_path (str): The local path to clone the repository to.
90+
- commit (Optional[str]): The specific commit hash to checkout.
91+
- branch (Optional[str]): The branch to clone. If not provided, or if it is 'main' or 'master', no branch is passed to git and the remote's default branch is cloned.
92+
93+
Returns
94+
-------
95+
Tuple[bytes, bytes]
96+
A tuple containing the stdout and stderr of the git commands executed.
97+
98+
Raises
99+
------
100+
ValueError
101+
If the repository does not exist or if 'url' or 'local_path' is empty.
102+
RuntimeError
103+
If any git command fails during execution.
104+
AsyncTimeoutError
105+
If the cloning process exceeds the specified timeout.
106+
"""
107+
# Extract and validate the configuration fields
108+
url: str = config.url
109+
local_path: str = config.local_path
110+
commit: Optional[str] = config.commit
111+
branch: Optional[str] = config.branch
112+
113+
if not url:
114+
raise ValueError("The 'url' parameter is required.")
115+
116+
if not local_path:
117+
raise ValueError("The 'local_path' parameter is required.")
118+
119+
# Check if the repository exists
120+
if not await check_repo_exists(url):
121+
raise ValueError("Repository not found, make sure it is public")
122+
123+
try:
124+
if commit:
125+
# Scenario 1: Clone and checkout a specific commit
126+
# Clone the repository without depth to ensure full history for checkout
127+
clone_cmd = ["git", "clone", "--single-branch", url, local_path]
128+
await run_git_command(*clone_cmd)
129+
130+
# Checkout the specific commit
131+
checkout_cmd = ["git", "-C", local_path, "checkout", commit]
132+
return await run_git_command(*checkout_cmd)
133+
134+
if branch and branch.lower() not in ('main', 'master'):
135+
# Scenario 2: Clone a specific branch with shallow depth
136+
clone_cmd = ["git", "clone", "--depth=1", "--single-branch", "--branch", branch, url, local_path]
137+
return await run_git_command(*clone_cmd)
138+
139+
# Scenario 3: Clone the default branch with shallow depth
140+
clone_cmd = ["git", "clone", "--depth=1", "--single-branch", url, local_path]
141+
return await run_git_command(*clone_cmd)
142+
143+
except (RuntimeError, asyncio.TimeoutError, AsyncTimeoutError):
144+
raise # Re-raise the exception

src/gitingest/ingest.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@
44
from pathlib import Path
55
from typing import List, Optional, Tuple, Union
66

7-
from gitingest.clone import clone_repo
7+
from gitingest.clone import CloneConfig, clone_repo
88
from gitingest.ingest_from_query import ingest_from_query
99
from gitingest.parse_query import parse_query
1010

1111

1212
def ingest(
1313
source: str,
14-
max_file_size: int = 10 * 1024 * 1024,
14+
max_file_size: int = 10 * 1024 * 1024, # 10 MB
1515
include_patterns: Union[List[str], str, None] = None,
1616
exclude_patterns: Union[List[str], str, None] = None,
1717
output: Optional[str] = None,
@@ -25,7 +25,16 @@ def ingest(
2525
ignore_patterns=exclude_patterns,
2626
)
2727
if query['url']:
28-
clone_result = clone_repo(query)
28+
29+
# Extract relevant fields for CloneConfig
30+
clone_config = CloneConfig(
31+
url=query["url"],
32+
local_path=query['local_path'],
33+
commit=query.get('commit'),
34+
branch=query.get('branch'),
35+
)
36+
clone_result = clone_repo(clone_config)
37+
2938
if inspect.iscoroutine(clone_result):
3039
asyncio.run(clone_result)
3140
else:

src/gitingest/ingest_from_query.py

+3-8
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ def create_file_content_string(files: List[Dict[str, Any]]) -> str:
278278
return output
279279

280280

281-
def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any], files: List[Dict[str, Any]]) -> str:
281+
def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any]) -> str:
282282
"""Creates a summary string with file counts and content size."""
283283
if "user_name" in query:
284284
summary = f"Repository: {query['user_name']}/{query['repo_name']}\n"
@@ -297,12 +297,7 @@ def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any], files: L
297297
return summary
298298

299299

300-
def create_tree_structure(
301-
query: Dict[str, Any],
302-
node: Dict[str, Any],
303-
prefix: str = "",
304-
is_last: bool = True,
305-
) -> str:
300+
def create_tree_structure(query: Dict[str, Any], node: Dict[str, Any], prefix: str = "", is_last: bool = True) -> str:
306301
"""Creates a tree-like string representation of the file structure."""
307302
tree = ""
308303

@@ -386,7 +381,7 @@ def ingest_directory(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]:
386381
if not nodes:
387382
raise ValueError(f"No files found in {path}")
388383
files = extract_files_content(query=query, node=nodes, max_file_size=query['max_file_size'])
389-
summary = create_summary_string(query, nodes, files)
384+
summary = create_summary_string(query, nodes)
390385
tree = "Directory structure:\n" + create_tree_structure(query, nodes)
391386
files_content = create_file_content_string(files)
392387

0 commit comments

Comments
 (0)