Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Group rows by all columns of composite PKs #838

Merged
merged 1 commit into from
Jan 4, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions data_diff/hashdiff_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

# Just for local readability: TODO: later switch to real type declarations of these.
_Op = Literal["+", "-"]
_PK = Any
_PK = Sequence[Any]
_Row = Tuple[Any]


Expand All @@ -34,24 +34,27 @@ def diff_sets(
json_cols: dict = None,
columns1: Sequence[str],
columns2: Sequence[str],
key_columns1: Sequence[str],
key_columns2: Sequence[str],
ignored_columns1: Collection[str],
ignored_columns2: Collection[str],
) -> Iterator:
# Differ only by columns of interest (PKs+relevant-ignored). But yield with ignored ones!
sa: Set[_Row] = {tuple(val for col, val in safezip(columns1, row) if col not in ignored_columns1) for row in a}
sb: Set[_Row] = {tuple(val for col, val in safezip(columns2, row) if col not in ignored_columns2) for row in b}

# The first item is always the key (see TableDiffer.relevant_columns)
# TODO update when we add compound keys to hashdiff
# The first items are always the PK (see TableSegment.relevant_columns)
diffs_by_pks: Dict[_PK, List[Tuple[_Op, _Row]]] = defaultdict(list)
for row in a:
pk: _PK = tuple(val for col, val in zip(key_columns1, row))
cutrow: _Row = tuple(val for col, val in zip(columns1, row) if col not in ignored_columns1)
if cutrow not in sb:
diffs_by_pks[row[0]].append(("-", row))
diffs_by_pks[pk].append(("-", row))
for row in b:
pk: _PK = tuple(val for col, val in zip(key_columns2, row))
cutrow: _Row = tuple(val for col, val in zip(columns2, row) if col not in ignored_columns2)
if cutrow not in sa:
diffs_by_pks[row[0]].append(("+", row))
diffs_by_pks[pk].append(("+", row))

warned_diff_cols = set()
for diffs in (diffs_by_pks[pk] for pk in sorted(diffs_by_pks)):
Expand Down Expand Up @@ -232,6 +235,8 @@ def _bisect_and_diff_segments(
json_cols=json_cols,
columns1=table1.relevant_columns,
columns2=table2.relevant_columns,
key_columns1=table1.key_columns,
key_columns2=table2.key_columns,
ignored_columns1=self.ignored_columns1,
ignored_columns2=self.ignored_columns2,
)
Expand Down