Skip to content

Commit cf1aa6f

Browse files
resolve commit
1 parent 340edb6 commit cf1aa6f

19 files changed

+619
-537
lines changed

src/gitingest/__init__.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,5 @@
11
"""Gitingest: A package for ingesting data from Git repositories."""
22

3-
from gitingest.clone import clone_repo
43
from gitingest.entrypoint import ingest, ingest_async
5-
from gitingest.ingestion import ingest_query
6-
from gitingest.query_parser import parse_query
74

8-
__all__ = ["clone_repo", "ingest", "ingest_async", "ingest_query", "parse_query"]
5+
__all__ = ["ingest", "ingest_async"]

src/gitingest/clone.py

Lines changed: 17 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -8,13 +8,15 @@
88
from gitingest.config import DEFAULT_TIMEOUT
99
from gitingest.utils.git_utils import (
1010
check_repo_exists,
11+
checkout_partial_clone,
1112
create_git_auth_header,
1213
create_git_command,
1314
ensure_git_installed,
1415
is_github_host,
16+
resolve_commit,
1517
run_command,
1618
)
17-
from gitingest.utils.os_utils import ensure_directory
19+
from gitingest.utils.os_utils import ensure_directory_exists_or_create
1820
from gitingest.utils.timeout_wrapper import async_timeout
1921

2022
if TYPE_CHECKING:
@@ -45,71 +47,42 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
4547
# Extract and validate query parameters
4648
url: str = config.url
4749
local_path: str = config.local_path
48-
commit: str | None = config.commit
49-
branch: str | None = config.branch
50-
tag: str | None = config.tag
5150
partial_clone: bool = config.subpath != "/"
5251

53-
# Create parent directory if it doesn't exist
54-
await ensure_directory(Path(local_path).parent)
52+
await ensure_git_installed()
53+
await ensure_directory_exists_or_create(Path(local_path).parent)
5554

56-
# Check if the repository exists
5755
if not await check_repo_exists(url, token=token):
5856
msg = "Repository not found. Make sure it is public or that you have provided a valid token."
5957
raise ValueError(msg)
6058

59+
commit = await resolve_commit(config, token=token)
60+
6161
clone_cmd = ["git"]
6262
if token and is_github_host(url):
6363
clone_cmd += ["-c", create_git_auth_header(token, url=url)]
6464

65-
clone_cmd += ["clone", "--single-branch"]
66-
67-
if config.include_submodules:
68-
clone_cmd += ["--recurse-submodules"]
69-
65+
clone_cmd += ["clone", "--single-branch", "--no-checkout", "--depth=1"]
7066
if partial_clone:
7167
clone_cmd += ["--filter=blob:none", "--sparse"]
7268

73-
# Shallow clone unless a specific commit is requested
74-
if not commit:
75-
clone_cmd += ["--depth=1"]
76-
77-
# Prefer tag over branch when both are provided
78-
if tag:
79-
clone_cmd += ["--branch", tag]
80-
elif branch and branch.lower() not in ("main", "master"):
81-
clone_cmd += ["--branch", branch]
82-
8369
clone_cmd += [url, local_path]
8470

8571
# Clone the repository
86-
await ensure_git_installed()
8772
await run_command(*clone_cmd)
8873

8974
# Checkout the subpath if it is a partial clone
9075
if partial_clone:
91-
await _checkout_partial_clone(config, token)
76+
await checkout_partial_clone(config, token=token)
9277

93-
# Checkout the commit if it is provided
94-
if commit:
95-
checkout_cmd = create_git_command(["git"], local_path, url, token)
96-
await run_command(*checkout_cmd, "checkout", commit)
78+
git = create_git_command(["git"], local_path, url, token)
9779

80+
# Ensure the commit is locally available
81+
await run_command(*git, "fetch", "--depth=1", "origin", commit)
9882

99-
async def _checkout_partial_clone(config: CloneConfig, token: str | None) -> None:
100-
"""Configure sparse-checkout for a partially cloned repository.
83+
# Write the work-tree at that commit
84+
await run_command(*git, "checkout", commit)
10185

102-
Parameters
103-
----------
104-
config : CloneConfig
105-
The configuration for cloning the repository, including subpath and blob flag.
106-
token : str | None
107-
GitHub personal access token (PAT) for accessing private repositories.
108-
109-
"""
110-
subpath = config.subpath.lstrip("/")
111-
if config.blob:
112-
# Remove the file name from the subpath when ingesting from a file url (e.g. blob/branch/path/file.txt)
113-
subpath = str(Path(subpath).parent.as_posix())
114-
checkout_cmd = create_git_command(["git"], config.local_path, config.url, token)
115-
await run_command(*checkout_cmd, "sparse-checkout", "set", subpath)
86+
# Update submodules
87+
if config.include_submodules:
88+
await run_command(*git, "submodule", "update", "--init", "--recursive", "--depth=1")

src/gitingest/entrypoint.py

Lines changed: 25 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -8,14 +8,21 @@
88
import warnings
99
from contextlib import asynccontextmanager
1010
from pathlib import Path
11-
from typing import AsyncGenerator
11+
from typing import TYPE_CHECKING, AsyncGenerator
12+
from urllib.parse import urlparse
1213

1314
from gitingest.clone import clone_repo
1415
from gitingest.config import MAX_FILE_SIZE
1516
from gitingest.ingestion import ingest_query
16-
from gitingest.query_parser import IngestionQuery, parse_query
17+
from gitingest.query_parser import parse_local_dir_path, parse_remote_repo
1718
from gitingest.utils.auth import resolve_token
19+
from gitingest.utils.compat_func import removesuffix
1820
from gitingest.utils.ignore_patterns import load_ignore_patterns
21+
from gitingest.utils.pattern_utils import process_patterns
22+
from gitingest.utils.query_parser_utils import KNOWN_GIT_HOSTS
23+
24+
if TYPE_CHECKING:
25+
from gitingest.schemas import IngestionQuery
1926

2027

2128
async def ingest_async(
@@ -74,23 +81,28 @@ async def ingest_async(
7481
"""
7582
token = resolve_token(token)
7683

77-
query: IngestionQuery = await parse_query(
78-
source=source,
79-
max_file_size=max_file_size,
80-
from_web=False,
84+
source = removesuffix(source.strip(), ".git")
85+
86+
# Determine the parsing method based on the source type
87+
if urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS):
88+
# We either have a full URL or a domain-less slug
89+
query = await parse_remote_repo(source, token=token)
90+
query.include_submodules = include_submodules
91+
_override_branch_and_tag(query, branch=branch, tag=tag)
92+
93+
else:
94+
# Local path scenario
95+
query = parse_local_dir_path(source)
96+
97+
query.max_file_size = max_file_size
98+
query.ignore_patterns, query.include_patterns = process_patterns(
99+
exclude_patterns=exclude_patterns,
81100
include_patterns=include_patterns,
82-
ignore_patterns=exclude_patterns,
83-
token=token,
84101
)
85102

86103
if not include_gitignored:
87104
_apply_gitignores(query)
88105

89-
if query.url:
90-
_override_branch_and_tag(query, branch=branch, tag=tag)
91-
92-
query.include_submodules = include_submodules
93-
94106
async with _clone_repo_if_remote(query, token=token):
95107
summary, tree, content = ingest_query(query)
96108
await _write_output(tree, content=content, target=output)

src/gitingest/ingestion.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
from gitingest.utils.ingestion_utils import _should_exclude, _should_include
1212

1313
if TYPE_CHECKING:
14-
from gitingest.query_parser import IngestionQuery
14+
from gitingest.schemas import IngestionQuery
1515

1616

1717
def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:

src/gitingest/output_formatter.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
from gitingest.utils.compat_func import readlink
1111

1212
if TYPE_CHECKING:
13-
from gitingest.query_parser import IngestionQuery
13+
from gitingest.schemas import IngestionQuery
1414

1515
_TOKEN_THRESHOLDS: list[tuple[int, str]] = [
1616
(1_000_000, "M"),
@@ -84,6 +84,8 @@ def _create_summary_prefix(query: IngestionQuery, *, single_file: bool = False)
8484

8585
if query.commit:
8686
parts.append(f"Commit: {query.commit}")
87+
elif query.tag:
88+
parts.append(f"Tag: {query.tag}")
8789
elif query.branch and query.branch not in ("main", "master"):
8890
parts.append(f"Branch: {query.branch}")
8991

0 commit comments

Comments
 (0)