Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Materialize: rename and reorder columns #287

Merged
merged 2 commits into from
Nov 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions data_diff/joindiff_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from functools import partial
import logging
from typing import List
from itertools import chain

from runtype import dataclass

Expand Down Expand Up @@ -183,13 +184,17 @@ def _diff_segments(
else None,
):

assert len(a_cols) == len(b_cols)
logger.debug("Querying for different rows")
for is_xa, is_xb, *x in db.query(diff_rows, list):
if is_xa and is_xb:
# A row can't be exclusive to both tables at once; if both flags are set, a pk is NULL.
# This can happen if the explicit null test hasn't finished running yet.
raise ValueError("NULL values in one or more primary keys")
_is_diff, a_row, b_row = _slice_tuple(x, len(is_diff_cols), len(a_cols), len(b_cols))
# _is_diff, a_row, b_row = _slice_tuple(x, len(is_diff_cols), len(a_cols), len(b_cols))
_is_diff, ab_row = _slice_tuple(x, len(is_diff_cols), len(a_cols) + len(b_cols))
a_row, b_row = ab_row[::2], ab_row[1::2]
assert len(a_row) == len(b_row)
if not is_xb:
yield "-", tuple(a_row)
if not is_xa:
Expand Down Expand Up @@ -273,10 +278,12 @@ def _create_outer_join(self, table1, table2):

is_diff_cols = {f"is_diff_{c1}": bool_to_int(a[c1].is_distinct_from(b[c2])) for c1, c2 in safezip(cols1, cols2)}

a_cols = {f"table1_{c}": NormalizeAsString(a[c]) for c in cols1}
b_cols = {f"table2_{c}": NormalizeAsString(b[c]) for c in cols2}
a_cols = {f"{c}_a": NormalizeAsString(a[c]) for c in cols1}
b_cols = {f"{c}_b": NormalizeAsString(b[c]) for c in cols2}
# Order columns as col1_a, col1_b, col2_a, col2_b, etc.
cols = {k: v for k, v in chain(*zip(a_cols.items(), b_cols.items()))}

all_rows = _outerjoin(db, a, b, keys1, keys2, {**is_diff_cols, **a_cols, **b_cols})
all_rows = _outerjoin(db, a, b, keys1, keys2, {**is_diff_cols, **cols})
diff_rows = all_rows.where(or_(this[c] == 1 for c in is_diff_cols))
return diff_rows, a_cols, b_cols, is_diff_cols, all_rows

Expand Down
3 changes: 2 additions & 1 deletion tests/test_joindiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,8 @@ def test_diff_small_tables(self):
t = TablePath(materialize_path)
rows = self.connection.query(t.select(), List[tuple])
# is_xa, is_xb, is_diff1, is_diff2, then interleaved columns: col1_a, col1_b, col2_a, col2_b
assert rows == [(1, 0, 1, 1) + expected_row + (None, None)], rows
# assert rows == [(1, 0, 1, 1) + expected_row + (None, None)], rows
assert rows == [(1, 0, 1, 1) + (expected_row[0], None, expected_row[1], None)], rows
self.connection.query(t.drop())

# Test materialize all rows
Expand Down