Skip to content

refactor: consistent cloning & pattern-handling #388

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/publish_to_pypi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ name: Publish to PyPI

on:
release:
types: [created] # Run when you click Publish release
types: [created] # Run when you click "Publish release"
workflow_dispatch: # ... or run it manually from the Actions tab

permissions:
Expand Down Expand Up @@ -33,7 +33,7 @@ jobs:
name: dist
path: dist/

# Publish to PyPI (only if dist/ succeeded)
# Publish to PyPI (only if "dist/" succeeded)
pypi-publish:
needs: release-build
runs-on: ubuntu-latest
Expand Down
5 changes: 1 addition & 4 deletions src/gitingest/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
"""Gitingest: A package for ingesting data from Git repositories."""

from gitingest.clone import clone_repo
from gitingest.entrypoint import ingest, ingest_async
from gitingest.ingestion import ingest_query
from gitingest.query_parser import parse_query

__all__ = ["clone_repo", "ingest", "ingest_async", "ingest_query", "parse_query"]
__all__ = ["ingest", "ingest_async"]
61 changes: 17 additions & 44 deletions src/gitingest/clone.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@
from gitingest.config import DEFAULT_TIMEOUT
from gitingest.utils.git_utils import (
check_repo_exists,
checkout_partial_clone,
create_git_auth_header,
create_git_command,
ensure_git_installed,
is_github_host,
resolve_commit,
run_command,
)
from gitingest.utils.os_utils import ensure_directory
from gitingest.utils.os_utils import ensure_directory_exists_or_create
from gitingest.utils.timeout_wrapper import async_timeout

if TYPE_CHECKING:
Expand Down Expand Up @@ -45,71 +47,42 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
# Extract and validate query parameters
url: str = config.url
local_path: str = config.local_path
commit: str | None = config.commit
branch: str | None = config.branch
tag: str | None = config.tag
partial_clone: bool = config.subpath != "/"

# Create parent directory if it doesn't exist
await ensure_directory(Path(local_path).parent)
await ensure_git_installed()
await ensure_directory_exists_or_create(Path(local_path).parent)

# Check if the repository exists
if not await check_repo_exists(url, token=token):
msg = "Repository not found. Make sure it is public or that you have provided a valid token."
raise ValueError(msg)

commit = await resolve_commit(config, token=token)

clone_cmd = ["git"]
if token and is_github_host(url):
clone_cmd += ["-c", create_git_auth_header(token, url=url)]

clone_cmd += ["clone", "--single-branch"]

if config.include_submodules:
clone_cmd += ["--recurse-submodules"]

clone_cmd += ["clone", "--single-branch", "--no-checkout", "--depth=1"]
if partial_clone:
clone_cmd += ["--filter=blob:none", "--sparse"]

# Shallow clone unless a specific commit is requested
if not commit:
clone_cmd += ["--depth=1"]

# Prefer tag over branch when both are provided
if tag:
clone_cmd += ["--branch", tag]
elif branch and branch.lower() not in ("main", "master"):
clone_cmd += ["--branch", branch]

clone_cmd += [url, local_path]

# Clone the repository
await ensure_git_installed()
await run_command(*clone_cmd)

# Checkout the subpath if it is a partial clone
if partial_clone:
await _checkout_partial_clone(config, token)
await checkout_partial_clone(config, token=token)

# Checkout the commit if it is provided
if commit:
checkout_cmd = create_git_command(["git"], local_path, url, token)
await run_command(*checkout_cmd, "checkout", commit)
git = create_git_command(["git"], local_path, url, token)

# Ensure the commit is locally available
await run_command(*git, "fetch", "--depth=1", "origin", commit)

async def _checkout_partial_clone(config: CloneConfig, token: str | None) -> None:
"""Configure sparse-checkout for a partially cloned repository.
# Write the work-tree at that commit
await run_command(*git, "checkout", commit)

Parameters
----------
config : CloneConfig
The configuration for cloning the repository, including subpath and blob flag.
token : str | None
GitHub personal access token (PAT) for accessing private repositories.
"""
subpath = config.subpath.lstrip("/")
if config.blob:
# Remove the file name from the subpath when ingesting from a file url (e.g. blob/branch/path/file.txt)
subpath = str(Path(subpath).parent.as_posix())
checkout_cmd = create_git_command(["git"], config.local_path, config.url, token)
await run_command(*checkout_cmd, "sparse-checkout", "set", subpath)
# Update submodules
if config.include_submodules:
await run_command(*git, "submodule", "update", "--init", "--recursive", "--depth=1")
76 changes: 62 additions & 14 deletions src/gitingest/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,30 @@
from __future__ import annotations

import asyncio
import errno
import shutil
import stat
import sys
import warnings
from contextlib import asynccontextmanager
from pathlib import Path
from typing import AsyncGenerator
from typing import TYPE_CHECKING, AsyncGenerator, Callable
from urllib.parse import urlparse

from gitingest.clone import clone_repo
from gitingest.config import MAX_FILE_SIZE
from gitingest.ingestion import ingest_query
from gitingest.query_parser import IngestionQuery, parse_query
from gitingest.query_parser import parse_local_dir_path, parse_remote_repo
from gitingest.utils.auth import resolve_token
from gitingest.utils.compat_func import removesuffix
from gitingest.utils.ignore_patterns import load_ignore_patterns
from gitingest.utils.pattern_utils import process_patterns
from gitingest.utils.query_parser_utils import KNOWN_GIT_HOSTS

if TYPE_CHECKING:
from types import TracebackType

from gitingest.schemas import IngestionQuery


async def ingest_async(
Expand Down Expand Up @@ -74,23 +85,28 @@ async def ingest_async(
"""
token = resolve_token(token)

query: IngestionQuery = await parse_query(
source=source,
max_file_size=max_file_size,
from_web=False,
source = removesuffix(source.strip(), ".git")

# Determine the parsing method based on the source type
if urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS):
# We either have a full URL or a domain-less slug
query = await parse_remote_repo(source, token=token)
query.include_submodules = include_submodules
_override_branch_and_tag(query, branch=branch, tag=tag)

else:
# Local path scenario
query = parse_local_dir_path(source)

query.max_file_size = max_file_size
query.ignore_patterns, query.include_patterns = process_patterns(
exclude_patterns=exclude_patterns,
include_patterns=include_patterns,
ignore_patterns=exclude_patterns,
token=token,
)

if not include_gitignored:
_apply_gitignores(query)

if query.url:
_override_branch_and_tag(query, branch=branch, tag=tag)

query.include_submodules = include_submodules

async with _clone_repo_if_remote(query, token=token):
summary, tree, content = ingest_query(query)
await _write_output(tree, content=content, target=output)
Expand Down Expand Up @@ -236,17 +252,49 @@ async def _clone_repo_if_remote(query: IngestionQuery, *, token: str | None) ->
GitHub personal access token (PAT) for accessing private repositories.

"""
kwargs = {}
if sys.version_info >= (3, 12):
kwargs["onexc"] = _handle_remove_readonly
else:
kwargs["onerror"] = _handle_remove_readonly

if query.url:
clone_config = query.extract_clone_config()
await clone_repo(clone_config, token=token)
try:
yield
finally:
shutil.rmtree(query.local_path.parent)
shutil.rmtree(query.local_path.parent, **kwargs)
else:
yield


def _handle_remove_readonly(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When is the cloned repo read-only? Does this only happen on Windows, due to certain operating system special cases?

Copy link
Contributor Author

@filipchristiansen filipchristiansen Jul 12, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It happens on Windows. I discovered it when adding the tests/test_summary.py test cases.

func: Callable,
path: str,
exc_info: BaseException | tuple[type[BaseException], BaseException, TracebackType],
) -> None:
"""Handle permission errors raised by ``shutil.rmtree()``.

* Makes the target writable (removes the read-only attribute).
* Retries the original operation (``func``) once.

"""
# 'onerror' passes a (type, value, tb) tuple; 'onexc' passes the exception
if isinstance(exc_info, tuple): # 'onerror' (Python <3.12)
exc: BaseException = exc_info[1]
else: # 'onexc' (Python 3.12+)
exc = exc_info

# Handle only 'Permission denied' and 'Operation not permitted'
if not isinstance(exc, OSError) or exc.errno not in {errno.EACCES, errno.EPERM}:
raise exc

# Make the target writable
Path(path).chmod(stat.S_IWRITE)
func(path)


async def _write_output(tree: str, content: str, target: str | None) -> None:
"""Write combined output to ``target`` (``"-"`` ⇒ stdout).

Expand Down
2 changes: 1 addition & 1 deletion src/gitingest/ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from gitingest.utils.ingestion_utils import _should_exclude, _should_include

if TYPE_CHECKING:
from gitingest.query_parser import IngestionQuery
from gitingest.schemas import IngestionQuery


def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:
Expand Down
4 changes: 3 additions & 1 deletion src/gitingest/output_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from gitingest.utils.compat_func import readlink

if TYPE_CHECKING:
from gitingest.query_parser import IngestionQuery
from gitingest.schemas import IngestionQuery

_TOKEN_THRESHOLDS: list[tuple[int, str]] = [
(1_000_000, "M"),
Expand Down Expand Up @@ -84,6 +84,8 @@ def _create_summary_prefix(query: IngestionQuery, *, single_file: bool = False)

if query.commit:
parts.append(f"Commit: {query.commit}")
elif query.tag:
parts.append(f"Tag: {query.tag}")
elif query.branch and query.branch not in ("main", "master"):
parts.append(f"Branch: {query.branch}")

Expand Down
Loading
Loading