Skip to content

Commit 77d51c7

Browse files
committed
feat(git): Implement ChangeSet class for representing arbitrary file changes
* Implement `ChangeSet` class
* Implement parsing function for building from `git diff` output
* Include `ChangeSet` in commit details
* Add `get_commit_changes` and `get_working_tree_changes` to git tool class
1 parent b8f9bc0 commit 77d51c7

File tree

3 files changed

+269
-1
lines changed

3 files changed

+269
-1
lines changed

src/dda/tools/git.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from typing import Any
1313

1414
from dda.utils.fs import Path
15+
from dda.utils.git.changeset import ChangeSet
1516
from dda.utils.git.commit import Commit, CommitDetails
1617
from dda.utils.git.sha1hash import SHA1Hash
1718

@@ -197,4 +198,43 @@ def get_commit_details(self, sha1: SHA1Hash, repo_path: Path | None = None) -> C
197198
datetime=datetime.fromisoformat(date_str),
198199
message="\n".join(message_lines).strip().strip('"'),
199200
parent_shas=[SHA1Hash(parent_sha) for parent_sha in parents_str.split()],
201+
changes=self.get_commit_changes(sha1, repo_path),
200202
)
203+
204+
def get_commit_changes(self, sha1: SHA1Hash, repo_path: Path | None = None) -> ChangeSet:
    """
    Return the file changes introduced by the commit `sha1`.

    The commit is diffed against `<sha1>^` inside the repository located at `repo_path`.
    When `repo_path` is not given, the current working directory is used.
    """
    from dda.utils.fs import Path
    from dda.utils.git.changeset import FILECHANGES_GIT_DIFF_ARGS, ChangeSet

    repo_root = Path(repo_path or ".").resolve()

    # NOTE(review): `<sha1>^` cannot be resolved for a root commit (it has no parent) -
    # confirm whether callers may pass one.
    diff_command = [*FILECHANGES_GIT_DIFF_ARGS, f"{sha1}^", str(sha1)]
    diff_output = self.capture(diff_command, cwd=str(repo_root))
    return ChangeSet.generate_from_diff_output(diff_output)
216+
217+
def get_working_tree_changes(self, repo_path: Path | None = None) -> ChangeSet:
    """
    Get the changes in the working tree of the Git repository at the given path.
    If no path is given, use the current working directory.

    The result combines the changes to tracked files (relative to `HEAD`) with the
    untracked, non-ignored files, each of which is diffed against `/dev/null` so that
    it is reported as an added file.
    """
    from dda.utils.fs import Path
    from dda.utils.git.changeset import FILECHANGES_GIT_DIFF_ARGS, ChangeSet

    repo_path = Path(repo_path or ".").resolve()
    with repo_path.as_cwd():
        # Capture changes to already-tracked files - `diff HEAD` does not include any untracked files!
        # A clean tree produces no output at all, which is not a parse error: skip the parser then.
        tracked_output = self.capture([*FILECHANGES_GIT_DIFF_ARGS, "HEAD"]).strip()
        tracked_changes = ChangeSet.generate_from_diff_output(tracked_output) if tracked_output else ChangeSet()

        # List untracked files. The git subcommand is passed without a leading "git", like every
        # other `capture` call of this class. `-z` makes git emit NUL-separated, unquoted paths,
        # so names with spaces or non-ASCII characters can be passed back to git verbatim.
        listing = self.capture(["ls-files", "--others", "--exclude-standard", "-z"])
        untracked_files = [name for name in listing.split("\0") if name]

        # A plain `git diff <file>` prints nothing for an untracked file, so each one is diffed
        # against /dev/null with `--no-index` instead.
        untracked_lines: list[str] = []
        for file in untracked_files:
            # `git diff --no-index` exits with status 1 whenever the two sides differ, which is
            # always the case here.
            # NOTE(review): assumes `capture` forwards `check` to the subprocess runner the same
            # way it forwards `cwd` - confirm against the tool base class.
            output = self.capture(
                [*FILECHANGES_GIT_DIFF_ARGS, "--no-index", "--", "/dev/null", file],
                check=False,
            )
            untracked_lines.extend(output.strip().splitlines())
        untracked_changes = ChangeSet.generate_from_diff_output(untracked_lines) if untracked_lines else ChangeSet()

    # Combine the changes
    return ChangeSet(tracked_changes | untracked_changes)

src/dda/utils/git/changeset.py

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
# SPDX-FileCopyrightText: 2025-present Datadog, Inc. <dev@datadoghq.com>
2+
#
3+
# SPDX-License-Identifier: MIT
4+
from collections.abc import Generator, Iterable
5+
from dataclasses import dataclass
6+
from enum import StrEnum
7+
from functools import cached_property
8+
from typing import Self
9+
10+
from dda.utils.fs import Path
11+
from dda.utils.git.sha1hash import SHA1Hash
12+
13+
# Arguments for `git diff` producing the output format that `FileChanges.generate_from_diff_output` parses.
FILECHANGES_GIT_DIFF_ARGS = [
    "diff",
    "-U0",  # no context lines around the hunks
    "--no-color",  # plain text, no ANSI escape sequences
    "--no-prefix",  # bare paths, without the `a/` and `b/` prefixes
    "--no-renames",  # a rename is reported as a deletion plus an addition
]
14+
15+
16+
class ChangeType(StrEnum):
17+
ADDED = "A"
18+
MODIFIED = "M"
19+
DELETED = "D"
20+
21+
@classmethod
22+
def from_github_status(cls, status: str) -> "ChangeType":
23+
mapping = {
24+
"added": cls.ADDED,
25+
"modified": cls.MODIFIED,
26+
"deleted": cls.DELETED,
27+
}
28+
try:
29+
return mapping[status]
30+
except KeyError as e:
31+
msg = f"Invalid GitHub change type message: {status}"
32+
raise ValueError(msg) from e
33+
34+
35+
@dataclass(frozen=True, order=True)
class FileChanges:
    """Represents changes to a single file in a git repository."""

    file: Path
    type: ChangeType

    patch: str
    """
    The patch representing the changes to the file, in unified diff format.
    We only keep the hunk lines (starting with @@), the lines starting with + or - and the
    `\\ No newline at end of file` markers (no extra context lines). All the hunks of the file are included.
    This is similar to the format used by the `patch` fields in GitHub's API.

    Example:
    ```diff
    @@ -15,2 +15 @@ if TYPE_CHECKING:
    -    from dda.utils.git.commit import Commit, CommitDetails
    -    from dda.utils.git.sha1hash import SHA1Hash
    +    from dda.utils.git.commit import Commit
    ```
    """

    # TODO: This might be a bit brittle - or we might want to move this to a separate file ?
    @classmethod
    def generate_from_diff_output(cls, diff_output: Iterable[str] | str) -> Generator[Self, None, None]:
        """
        Generate FileChanges objects from the output of _some_ git diff commands.
        Not all outputs from `git diff` are supported (ex: renames), you should run:
        ```bash
        git diff -U0 --no-color --no-prefix --no-renames <oldrev> <newrev>
        ```
        Accepts an Iterable of lines or a single string as argument.

        Empty output (no changes at all) yields nothing.

        Raises:
            ValueError: if the output does not follow the expected structure, ends in the middle
                of a file entry, or describes a rename.
        """
        if isinstance(diff_output, str):
            diff_output = diff_output.strip().splitlines()

        lines = iter(diff_output)

        # `None` marks the end of the input. Bare `next()` calls are avoided on purpose:
        # a StopIteration escaping a generator body is turned into a RuntimeError.
        line = next(lines, None)
        while line is not None:
            # Start processing a new file - the line looks like `diff --git <path> <path>`
            if not line.startswith("diff --git "):
                msg = f"Unexpected line in git diff output: {line}"
                raise ValueError(msg)

            # Go forward until we find the 'old file' line (---).
            # NOTE: every line is skipped here, so an entry that has no ---/+++ lines is passed over
            # and parsing resumes at the next entry that has them.
            while not line.startswith("--- "):
                line = next(lines, None)
                if line is None:
                    msg = "Unexpected end of git diff output while looking for --- line"
                    raise ValueError(msg)
            old_file_line = line

            # The --- line should always be followed by a +++ line
            new_file_line = next(lines, None)
            if new_file_line is None:
                msg = "Unexpected end of git diff output while looking for +++ line"
                raise ValueError(msg)
            if not new_file_line.startswith("+++ "):
                msg = f"Unexpected line in git diff output, expected +++ line: {new_file_line}"
                raise ValueError(msg)

            file, change_type = cls._resolve_change(old_file_line[4:].strip(), new_file_line[4:].strip())

            # Now, we should be at the start of the patch hunks (lines starting with @@)
            line = next(lines, None)
            if line is None:
                msg = "Unexpected end of git diff output while parsing"
                raise ValueError(msg)
            if not line.startswith("@@ "):
                msg = f"Unexpected line in git diff output, expected hunk start: {line}"
                raise ValueError(msg)

            # Collect every hunk of this file: a header line, then its +/- lines
            patch_lines: list[str] = []
            while line is not None and line.startswith("@@ "):
                patch_lines.append(line)
                line = next(lines, None)
                if line is None:
                    # A hunk header must be followed by at least one line
                    msg = "Unexpected end of git diff output while parsing"
                    raise ValueError(msg)
                # A leading backslash is the `\ No newline at end of file` marker
                while line is not None and line.startswith(("+", "-", "\\")):
                    patch_lines.append(line)
                    line = next(lines, None)

            # We reached the end of this file's patch: `line` is now either the start of the
            # next file entry or None at the end of the input
            yield cls(file=file, type=change_type, patch="\n".join(patch_lines))

    @staticmethod
    def _resolve_change(old_file_path: str, new_file_path: str) -> tuple[Path, ChangeType]:
        """Deduce the changed file and the type of change from the paths of the --- and +++ lines."""
        if old_file_path == "/dev/null":
            return Path(new_file_path), ChangeType.ADDED
        if new_file_path == "/dev/null":
            return Path(old_file_path), ChangeType.DELETED
        if old_file_path == new_file_path:
            return Path(new_file_path), ChangeType.MODIFIED

        msg = (
            f"Unexpected file paths in git diff output: {old_file_path} -> {new_file_path}"
            " - this indicates a rename which we do not support"
        )
        raise ValueError(msg)
159+
160+
161+
# NOTE: this subclasses the builtin `dict` directly (not `UserDict`), so instances stay mutable
# through the whole dict API, not only through `add`.
class ChangeSet(dict[Path, FileChanges]):
    """
    Represents a set of changes to files in a git repository.
    This can both be a change between two commits, or the changes in the working directory.

    When considering the changes to the working directory, the untracked files are considered as added files.

    The `added`, `modified`, `deleted` and `changed` views are recomputed on every access: the
    mapping is mutable, so a cached view would go stale as soon as an entry is added or removed
    after the first access.
    """

    @property
    def added(self) -> set[Path]:
        """Set of files that were added."""
        return {change.file for change in self.values() if change.type == ChangeType.ADDED}

    @property
    def modified(self) -> set[Path]:
        """Set of files that were modified."""
        return {change.file for change in self.values() if change.type == ChangeType.MODIFIED}

    @property
    def deleted(self) -> set[Path]:
        """Set of files that were deleted."""
        return {change.file for change in self.values() if change.type == ChangeType.DELETED}

    @property
    def changed(self) -> set[Path]:
        """Set of files that were changed (added, modified, or deleted)."""
        return set(self.keys())

    def add(self, change: FileChanges) -> None:
        """Add a file change to the changeset, replacing any change already stored for the same file."""
        self[change.file] = change

    def digest(self) -> SHA1Hash:
        """
        Compute a hash of the changeset.

        The changes are visited in sorted order, so the result does not depend on insertion order.
        """
        from hashlib import sha1

        digester = sha1()  # noqa: S324
        for change in sorted(self.values()):
            digester.update(change.file.as_posix().encode())
            digester.update(change.type.value.encode())
            digester.update(change.patch.encode())

        return SHA1Hash(digester.hexdigest())

    @classmethod
    def generate_from_diff_output(cls, diff_output: Iterable[str] | str) -> Self:
        """
        Generate a changeset from the output of a git diff command.

        The output is parsed by `FileChanges.generate_from_diff_output`; accepts an Iterable of
        lines or a single string.
        """
        changeset = cls()
        for change in FileChanges.generate_from_diff_output(diff_output):
            changeset.add(change)
        return changeset

src/dda/utils/git/commit.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
from dda.cli.application import Application
1414
from dda.utils.fs import Path
15+
from dda.utils.git.changeset import ChangeSet
1516
from dda.utils.git.sha1hash import SHA1Hash
1617

1718

@@ -60,17 +61,32 @@ def get_details_from_github(self) -> CommitDetails:
6061
"""
6162
from datetime import datetime
6263

64+
from dda.utils.fs import Path
65+
from dda.utils.git.changeset import ChangeSet, ChangeType, FileChanges
6366
from dda.utils.git.sha1hash import SHA1Hash
6467
from dda.utils.network.http.client import get_http_client
6568

6669
client = get_http_client()
6770
data = client.get(self.github_api_url).json()
71+
72+
# Compute ChangeSet
73+
changes = ChangeSet()
74+
for file_obj in data["files"]:
75+
changes.add(
76+
FileChanges(
77+
file=Path(file_obj["filename"]),
78+
type=ChangeType.from_github_status(file_obj["status"]),
79+
patch=file_obj["patch"],
80+
)
81+
)
82+
6883
self._details = CommitDetails(
6984
author_name=data["commit"]["author"]["name"],
7085
author_email=data["commit"]["author"]["email"],
7186
datetime=datetime.fromisoformat(data["commit"]["author"]["date"]),
7287
message=data["commit"]["message"],
7388
parent_shas=[SHA1Hash(parent["sha"]) for parent in data.get("parents", [])],
89+
changes=changes,
7490
)
7591
return self.details
7692

@@ -122,4 +138,4 @@ class CommitDetails:
122138
message: str
123139
parent_shas: list[SHA1Hash]
124140

125-
# TODO: Add some way to represent the diff
141+
changes: ChangeSet

0 commit comments

Comments
 (0)