|  | 
|  | 1 | +# SPDX-FileCopyrightText: 2025-present Datadog, Inc. <dev@datadoghq.com> | 
|  | 2 | +# | 
|  | 3 | +# SPDX-License-Identifier: MIT | 
|  | 4 | +from collections.abc import Generator, Iterable | 
|  | 5 | +from dataclasses import dataclass | 
|  | 6 | +from enum import StrEnum | 
|  | 7 | +from functools import cached_property | 
|  | 8 | +from typing import Self | 
|  | 9 | + | 
|  | 10 | +from dda.utils.fs import Path | 
|  | 11 | +from dda.utils.git.sha1hash import SHA1Hash | 
|  | 12 | + | 
# Arguments to pass to `git` to get a diff in the form expected by
# `FileChanges.generate_from_diff_output`.
FILECHANGES_GIT_DIFF_ARGS = [
    "diff",
    "-U0",  # zero lines of context around each change
    "--no-color",  # plain text, no ANSI escape sequences
    "--no-prefix",  # no a/ and b/ prefixes on file paths
    "--no-renames",  # rename detection off
]
|  | 14 | + | 
|  | 15 | + | 
|  | 16 | +class ChangeType(StrEnum): | 
|  | 17 | +    ADDED = "A" | 
|  | 18 | +    MODIFIED = "M" | 
|  | 19 | +    DELETED = "D" | 
|  | 20 | + | 
|  | 21 | +    @classmethod | 
|  | 22 | +    def from_github_status(cls, status: str) -> "ChangeType": | 
|  | 23 | +        mapping = { | 
|  | 24 | +            "added": cls.ADDED, | 
|  | 25 | +            "modified": cls.MODIFIED, | 
|  | 26 | +            "deleted": cls.DELETED, | 
|  | 27 | +        } | 
|  | 28 | +        try: | 
|  | 29 | +            return mapping[status] | 
|  | 30 | +        except KeyError as e: | 
|  | 31 | +            msg = f"Invalid GitHub change type message: {status}" | 
|  | 32 | +            raise ValueError(msg) from e | 
|  | 33 | + | 
|  | 34 | + | 
@dataclass(frozen=True, order=True)
class FileChanges:
    """Represents changes to a single file in a git repository."""

    file: Path
    type: ChangeType

    patch: str
    """
    The patch representing the changes to the file, in unified diff format.
    We only keep the hunk lines (starting with @@), the lines starting with + or - (no extra context lines)
    and the "No newline at end of file" markers (lines starting with a backslash).
    This is similar to the format used by the `patch` fields in GitHub's API.

    Example:
    ```diff
    @@ -15,2 +15 @@ if TYPE_CHECKING:
    -    from dda.utils.git.commit import Commit, CommitDetails
    -    from dda.utils.git.sha1hash import SHA1Hash
    +    from dda.utils.git.commit import Commit
    ```
    """

    @staticmethod
    def _classify_change(old_file_path: str, new_file_path: str) -> tuple[Path, ChangeType]:
        """Derive the file path and change type from the paths of the `---` and `+++` header lines."""
        if old_file_path == "/dev/null":
            return Path(new_file_path), ChangeType.ADDED
        if new_file_path == "/dev/null":
            return Path(old_file_path), ChangeType.DELETED
        if old_file_path == new_file_path:
            return Path(new_file_path), ChangeType.MODIFIED

        msg = f"Unexpected file paths in git diff output: {old_file_path} -> {new_file_path} - this indicates a rename which we do not support"
        raise ValueError(msg)

    # TODO: This might be a bit brittle - or we might want to move this to a separate file ?
    @classmethod
    def generate_from_diff_output(cls, diff_output: Iterable[str] | str) -> Generator[Self, None, None]:
        """
        Generate FileChanges objects from the output of _some_ git diff commands.
        Not all outputs from `git diff` are supported (ex: renames), you should run:
        ```bash
        git diff -U0 --no-color --no-prefix --no-renames <oldrev> <newrev>
        ```
        Accepts an Iterable of lines or a single string as argument.
        An empty diff yields nothing. A file may contain any number of hunks; they all end up in its patch.

        Entries that have no `---` / `+++` header pair are not handled: the search for the `---` line
        runs into the next entry when there is one, and raises when the input ends first.

        Raises:
            ValueError: if the output does not have the expected structure.
        """
        if isinstance(diff_output, str):
            diff_output = diff_output.strip().splitlines()

        line_iterator = iter(diff_output)

        # `None` marks the end of the input everywhere below.
        # Never let StopIteration escape: inside a generator it would surface as a RuntimeError.
        line = next(line_iterator, None)

        while line is not None:
            # Start processing a new file - the line looks like `diff --git <path> <path>`
            if not line.startswith("diff --git "):
                msg = f"Unexpected line in git diff output: {line}"
                raise ValueError(msg)

            # Go forward until we find the 'old file' line (---)
            while not line.startswith("--- "):
                line = next(line_iterator, None)
                if line is None:
                    msg = "Unexpected end of git diff output while looking for --- line"
                    raise ValueError(msg)
            old_file_line = line

            # The --- line should always be followed by a +++ line
            new_file_line = next(line_iterator, None)
            if new_file_line is None:
                msg = "Unexpected end of git diff output while looking for +++ line"
                raise ValueError(msg)
            if not new_file_line.startswith("+++ "):
                msg = f"Unexpected line in git diff output, expected +++ line: {new_file_line}"
                raise ValueError(msg)

            # Drop the 4-character `--- ` / `+++ ` marker and surrounding whitespace to get the paths
            current_file, current_type = cls._classify_change(
                old_file_line[4:].strip(),
                new_file_line[4:].strip(),
            )

            # Now, we should be at the start of the patch hunks (lines starting with @@)
            line = next(line_iterator, None)
            if line is None:
                msg = "Unexpected end of git diff output while parsing"
                raise ValueError(msg)
            if not line.startswith("@@ "):
                msg = f"Unexpected line in git diff output, expected hunk start: {line}"
                raise ValueError(msg)

            # Collect every hunk of this file: hunk headers, +/- lines and "\ No newline at end of file"
            # markers. Anything else (normally the next `diff --git` line) ends the current file and is
            # validated at the top of the outer loop.
            current_patch_lines: list[str] = []
            while line is not None and line.startswith(("@@ ", "+", "-", "\\")):
                current_patch_lines.append(line)
                line = next(line_iterator, None)

            yield cls(
                file=current_file,
                type=current_type,
                patch="\n".join(current_patch_lines),
            )
|  | 159 | + | 
|  | 160 | + | 
# NOTE: this subclasses dict directly (not UserDict), so built-in mutators such as `update` or
# `setdefault` do not go through an overridden `__setitem__`. There is therefore no reliable hook to
# invalidate cached values, which is why the derived views below are recomputed on every access.
class ChangeSet(dict[Path, FileChanges]):
    """
    Represents a set of changes to files in a git repository, keyed by file path.
    This can both be a change between two commits, or the changes in the working directory.

    When considering the changes to the working directory, the untracked files are considered as added files.
    """

    @property
    def added(self) -> set[Path]:
        """Set of files that were added."""
        return {change.file for change in self.values() if change.type == ChangeType.ADDED}

    @property
    def modified(self) -> set[Path]:
        """Set of files that were modified."""
        return {change.file for change in self.values() if change.type == ChangeType.MODIFIED}

    @property
    def deleted(self) -> set[Path]:
        """Set of files that were deleted."""
        return {change.file for change in self.values() if change.type == ChangeType.DELETED}

    @property
    def changed(self) -> set[Path]:
        """Set of files that were changed (added, modified, or deleted)."""
        return set(self)

    def add(self, change: FileChanges) -> None:
        """Add a file change to the changeset, replacing any existing change for the same file."""
        self[change.file] = change

    def digest(self) -> SHA1Hash:
        """
        Compute a hash of the changeset.

        The changes are hashed in sorted order, so the result does not depend on insertion order.
        """
        from hashlib import sha1

        digester = sha1()  # noqa: S324
        for change in sorted(self.values()):
            digester.update(change.file.as_posix().encode())
            digester.update(change.type.value.encode())
            digester.update(change.patch.encode())

        return SHA1Hash(digester.hexdigest())

    @classmethod
    def generate_from_diff_output(cls, diff_output: Iterable[str] | str) -> Self:
        """
        Generate a changeset from the output of a git diff command.

        The output is parsed by `FileChanges.generate_from_diff_output`, so the same
        input restrictions apply.
        """
        changeset = cls()
        for change in FileChanges.generate_from_diff_output(diff_output):
            changeset.add(change)
        return changeset
0 commit comments