Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modernize Codebase and Enhance CI Workflow #67

Merged
merged 15 commits into from
Dec 29, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Refactor and enhance gitingest module for improved clarity, maintainability, and functionality.

- **Introduced the `CloneConfig` dataclass** to encapsulate cloning parameters, including `url`, `local_path`, `commit`, and `branch`.
- **Enhanced documentation** by adding detailed docstrings to the functions `check_repo_exists`, `run_git_command`, and `clone_repo`.
- **Improved error handling** by refining exception management processes.
- **Streamlined repository existence checks** for increased reliability.
- **Added the `run_git_command` function** to centralize and simplify the execution of Git commands.
- **Refactored code structure** to enhance readability and maintainability.

---

- **Replaced manual hexadecimal comparison (`"0123456789abcdefABCDEF"`)** with the `string` module by defining `HEX_DIGITS = set(string.hexdigits)`.
- **Revised the construction of the `parsed` dictionary** in the `parse_url` function for clarity.
- **Refactored the `parse_patterns` function** to store patterns in a list (`patterns`) instead of repeatedly joining and splitting them.
- **Enhanced documentation** by adding docstrings to the `override_ignore_patterns` and `parse_query` functions.
- **Removed redundant `pattern.strip()` call** in `normalize_pattern`, as this is now handled within `parse_patterns`.
- **Optimized the `override_ignore_patterns` function** by implementing set difference for unordered comparisons.
- **Improved the `parse_query` function's structure** for better readability and maintainability.

---

- **Refined `print_query`, `print_error`, and `print_success` functions** to accept only the `url` parameter, removing the dependency on the entire `query` object.
- **Eliminated the unused `request` argument** from the above functions.
- **Integrated the `CloneConfig` dataclass** for improved parameter handling.

---

- **Adopted the `CloneConfig` dataclass** for consistent parameter management.

---

- **Removed the unused `files` argument** from the `create_summary_string` function to reduce unnecessary complexity.

---

- **Simplified the `AsyncTimeoutError` class** by removing a redundant `pass` statement.

---

- **Updated tests** to utilize the `CloneConfig` dataclass and align with the newly introduced `run_git_command` function for encapsulated Git command execution.

---

- **Aligned the comparison with `DEFAULT_IGNORE_PATTERNS`** to use a set difference, so that the check does not depend on element order.
  • Loading branch information
filipchristiansen committed Dec 28, 2024
commit 086aba050fce2e04ed20583864d13850600ec562
3 changes: 1 addition & 2 deletions src/gitingest/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import click

from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS
from gitingest.ingest import ingest
from gitingest.ingest_from_query import MAX_FILE_SIZE

Expand Down Expand Up @@ -37,7 +36,7 @@ def main(

if not output:
output = "digest.txt"
summary, tree, content = ingest(source, max_size, include_patterns, exclude_patterns, output=output)
summary, _, _ = ingest(source, max_size, include_patterns, exclude_patterns, output=output)

click.echo(f"Analysis complete! Output written to: {output}")
click.echo("\nSummary:")
Expand Down
174 changes: 121 additions & 53 deletions src/gitingest/clone.py
Original file line number Diff line number Diff line change
@@ -1,79 +1,147 @@
import asyncio
from typing import Any, Dict, Tuple
from dataclasses import dataclass
from typing import Optional, Tuple

from gitingest.utils import async_timeout
from gitingest.utils import AsyncTimeoutError, async_timeout

CLONE_TIMEOUT = 20


@dataclass
class CloneConfig:
url: str
local_path: str
commit: Optional[str] = None
branch: Optional[str] = None


async def check_repo_exists(url: str) -> bool:
"""
Check if a repository exists at the given URL using an HTTP HEAD request.

Parameters
----------
url : str
The URL of the repository.

Returns
-------
bool
True if the repository exists, False otherwise.
"""
proc = await asyncio.create_subprocess_exec(
"curl",
"-I",
url,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout, stderr = await proc.communicate()
stdout, _ = await proc.communicate()
if proc.returncode != 0:
return False
# Check if stdout contains "404" status code
stdout_str = stdout.decode()
return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str


@async_timeout(CLONE_TIMEOUT)
async def clone_repo(query: Dict[str, Any]) -> Tuple[bytes, bytes]:
if not await check_repo_exists(query['url']):
raise ValueError("Repository not found, make sure it is public")
async def run_git_command(*args: str) -> Tuple[bytes, bytes]:
"""
Executes a git command asynchronously and captures its output.

Parameters
----------
*args : str
The git command and its arguments to execute.

if query['commit']:
proc = await asyncio.create_subprocess_exec(
"git",
"clone",
"--single-branch",
query['url'],
query['local_path'],
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout, stderr = await proc.communicate()

proc = await asyncio.create_subprocess_exec(
"git",
"-C",
query['local_path'],
"checkout",
query['branch'],
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout, stderr = await proc.communicate()
elif query['branch'] != 'main' and query['branch'] != 'master' and query['branch']:
proc = await asyncio.create_subprocess_exec(
"git",
"clone",
"--depth=1",
"--single-branch",
"--branch",
query['branch'],
query['url'],
query['local_path'],
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
else:
proc = await asyncio.create_subprocess_exec(
"git",
"clone",
"--depth=1",
"--single-branch",
query['url'],
query['local_path'],
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
Returns
-------
Tuple[bytes, bytes]
A tuple containing the stdout and stderr of the git command.

Raises
------
RuntimeError
If the git command exits with a non-zero status.
"""
proc = await asyncio.create_subprocess_exec(
*args,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout, stderr = await proc.communicate()
if proc.returncode != 0:
error_message = stderr.decode().strip()
raise RuntimeError(f"Git command failed: {' '.join(args)}\nError: {error_message}")

return stdout, stderr


@async_timeout(CLONE_TIMEOUT)
async def clone_repo(config: CloneConfig) -> Tuple[bytes, bytes]:
"""
Clones a repository to a local path based on the provided query parameters.

Parameters
----------
config : CloneConfig
A `CloneConfig` dataclass instance with the following attributes:
- url (str): The URL of the repository.
- local_path (str): The local path to clone the repository to.
- commit (Optional[str]): The specific commit hash to checkout.
- branch (Optional[str]): The branch to clone. Defaults to 'main' or 'master' if not provided.

Returns
-------
Tuple[bytes, bytes]
A tuple containing the stdout and stderr of the git commands executed.

Raises
------
ValueError
If the repository does not exist or if required query parameters are missing.
RuntimeError
If any git command fails during execution.
AsyncTimeoutError
If the cloning process exceeds the specified timeout.
"""
# Extract and validate query parameters
url: str = config.url
local_path: str = config.local_path
commit: Optional[str] = config.commit
branch: Optional[str] = config.branch

if not url:
raise ValueError("The 'url' parameter is required.")

if not local_path:
raise ValueError("The 'local_path' parameter is required.")

# if commit and branch:
# raise ValueError("Provide either 'commit' or 'branch', not both.")

# Check if the repository exists
if not await check_repo_exists(url):
raise ValueError("Repository not found, make sure it is public")

try:
if commit:
# Scenario 1: Clone and checkout a specific commit
# Clone the repository without depth to ensure full history for checkout
clone_cmd = ["git", "clone", "--single-branch", url, local_path]
await run_git_command(*clone_cmd)

# Checkout the specific commit
checkout_cmd = ["git", "-C", local_path, "checkout", commit]
return await run_git_command(*checkout_cmd)

if branch and branch.lower() not in ('main', 'master'):
# Scenario 2: Clone a specific branch with shallow depth
clone_cmd = ["git", "clone", "--depth=1", "--single-branch", "--branch", branch, url, local_path]
return await run_git_command(*clone_cmd)

# Scenario 3: Clone the default branch with shallow depth
clone_cmd = ["git", "clone", "--depth=1", "--single-branch", url, local_path]
return await run_git_command(*clone_cmd)

except (RuntimeError, asyncio.TimeoutError, AsyncTimeoutError):
raise # Re-raise the exception
13 changes: 10 additions & 3 deletions src/gitingest/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
from pathlib import Path
from typing import List, Optional, Tuple, Union

from gitingest.clone import clone_repo
from gitingest.clone import CloneConfig, clone_repo
from gitingest.ingest_from_query import ingest_from_query
from gitingest.parse_query import parse_query


def ingest(
source: str,
max_file_size: int = 10 * 1024 * 1024,
max_file_size: int = 10 * 1024 * 1024, # 10 MB
include_patterns: Union[List[str], str, None] = None,
exclude_patterns: Union[List[str], str, None] = None,
output: Optional[str] = None,
Expand All @@ -25,7 +25,14 @@ def ingest(
ignore_patterns=exclude_patterns,
)
if query['url']:
clone_result = clone_repo(query)
# Extract relevant fields for CloneConfig
clone_config = CloneConfig(
url=f"https://github.com/{query['slug']}.git",
local_path=query['local_path'],
commit=query.get('commit'),
branch=query.get('branch'),
)
clone_result = clone_repo(clone_config)
if inspect.iscoroutine(clone_result):
asyncio.run(clone_result)
else:
Expand Down
11 changes: 3 additions & 8 deletions src/gitingest/ingest_from_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def create_file_content_string(files: List[Dict[str, Any]]) -> str:
return output


def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any], files: List[Dict[str, Any]]) -> str:
def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any]) -> str:
"""Creates a summary string with file counts and content size."""
if "user_name" in query:
summary = f"Repository: {query['user_name']}/{query['repo_name']}\n"
Expand All @@ -297,12 +297,7 @@ def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any], files: L
return summary


def create_tree_structure(
query: Dict[str, Any],
node: Dict[str, Any],
prefix: str = "",
is_last: bool = True,
) -> str:
def create_tree_structure(query: Dict[str, Any], node: Dict[str, Any], prefix: str = "", is_last: bool = True) -> str:
"""Creates a tree-like string representation of the file structure."""
tree = ""

Expand Down Expand Up @@ -386,7 +381,7 @@ def ingest_directory(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]:
if not nodes:
raise ValueError(f"No files found in {path}")
files = extract_files_content(query=query, node=nodes, max_file_size=query['max_file_size'])
summary = create_summary_string(query, nodes, files)
summary = create_summary_string(query, nodes)
tree = "Directory structure:\n" + create_tree_structure(query, nodes)
files_content = create_file_content_string(files)

Expand Down
Loading