Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 6308ad9

Browse files
authored
Merge pull request #292 from datafold/nov14_new_output
Revised CLI output to be more understandable and detailed
2 parents 69e1154 + 3918343 commit 6308ad9

File tree

3 files changed

+56
-23
lines changed

3 files changed

+56
-23
lines changed

data_diff/__main__.py

Lines changed: 36 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -380,27 +380,48 @@ def _main(
380380

381381
if stats:
382382
diff = list(diff_iter)
383-
unique_diff_count = len({i[0] for _, i in diff})
384-
max_table_count = max(differ.stats["table1_count"], differ.stats["table2_count"])
385-
percent = 100 * unique_diff_count / (max_table_count or 1)
386-
plus = len([1 for op, _ in diff if op == "+"])
387-
minus = len([1 for op, _ in diff if op == "-"])
383+
key_columns_len = len(key_columns)
384+
385+
diff_by_key = {}
386+
for sign, values in diff:
387+
k = values[:key_columns_len]
388+
if k in diff_by_key:
389+
assert sign != diff_by_key[k]
390+
diff_by_key[k] = "!"
391+
else:
392+
diff_by_key[k] = sign
393+
394+
diff_by_sign = {k: 0 for k in "+-!"}
395+
for sign in diff_by_key.values():
396+
diff_by_sign[sign] += 1
397+
398+
table1_count = differ.stats.pop("table1_count")
399+
table2_count = differ.stats.pop("table2_count")
400+
del differ.stats['diff_count']
401+
unchanged = table1_count - diff_by_sign["-"] - diff_by_sign["!"]
402+
diff_percent = 1 - unchanged / max(table1_count, table2_count)
388403

389404
if json_output:
390405
json_output = {
391-
"different_rows": len(diff),
392-
"different_percent": percent,
393-
"different_+": plus,
394-
"different_-": minus,
395-
"different_unique": unique_diff_count,
396-
"total": max_table_count,
406+
"rows_A": table1_count,
407+
"rows_B": table2_count,
408+
"exclusive_A": diff_by_sign["-"],
409+
"exclusive_B": diff_by_sign["+"],
410+
"updated": diff_by_sign["!"],
411+
"unchanged": unchanged,
412+
"total": sum(diff_by_sign.values()),
397413
"stats": differ.stats,
398414
}
399-
rich.print(json.dumps(json_output))
415+
rich.print_json(json.dumps(json_output))
400416
else:
401-
print(f"Diff-Total: {unique_diff_count} changed rows out of {max_table_count}")
402-
print(f"Diff-Percent: {percent:.14f}%")
403-
print(f"Diff-Split: +{plus} -{minus}")
417+
rich.print(f"{table1_count} rows in table A")
418+
rich.print(f"{table2_count} rows in table B")
419+
rich.print(f"{diff_by_sign['-']} rows exclusive to table A (not present in B)")
420+
rich.print(f"{diff_by_sign['+']} rows exclusive to table B (not present in A)")
421+
rich.print(f"{diff_by_sign['!']} rows updated")
422+
rich.print(f"{unchanged} rows unchanged")
423+
rich.print(f"{100*diff_percent:.2f}% difference score")
424+
404425
if differ.stats:
405426
print("Extra-Info:")
406427
for k, v in differ.stats.items():

data_diff/joindiff_tables.py

Lines changed: 18 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -240,21 +240,33 @@ def _collect_stats(self, i, table_seg: TableSegment):
240240

241241
# Metrics
242242
col_exprs = merge_dicts(
243+
{
244+
# f"min_{c}": min_(this[c]),
245+
# f"max_{c}": max_(this[c]),
246+
}
247+
if c in table_seg.key_columns else
243248
{
244249
f"sum_{c}": sum_(this[c]),
245-
f"avg_{c}": avg(this[c]),
246-
f"min_{c}": min_(this[c]),
247-
f"max_{c}": max_(this[c]),
250+
# f"avg_{c}": avg(this[c]),
251+
# f"min_{c}": min_(this[c]),
252+
# f"max_{c}": max_(this[c]),
248253
}
249254
for c in table_seg.relevant_columns
250255
if isinstance(table_seg._schema[c], NumericType)
251256
)
252257
col_exprs["count"] = Count()
253258

254259
res = db.query(table_seg.make_select().select(**col_exprs), tuple)
255-
res = dict(zip([f"table{i}_{n}" for n in col_exprs], map(json_friendly_value, res)))
256-
for k, v in res.items():
257-
self.stats[k] = self.stats.get(k, 0) + (v or 0)
260+
261+
for col_name, value in safezip(col_exprs, res):
262+
if value is not None:
263+
value = json_friendly_value(value)
264+
stat_name = f"table{i}_{col_name}"
265+
266+
if stat_name in self.stats:
267+
self.stats[stat_name] += value
268+
else:
269+
self.stats[stat_name] = value
258270

259271
logger.debug("Done collecting stats for table #%s", i)
260272

tests/test_joindiff.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -122,8 +122,8 @@ def test_diff_small_tables(self):
122122
self.assertEqual(expected, diff)
123123
self.assertEqual(2, self.differ.stats["table1_count"])
124124
self.assertEqual(1, self.differ.stats["table2_count"])
125-
self.assertEqual(3, self.differ.stats["table1_sum_id"])
126-
self.assertEqual(1, self.differ.stats["table2_sum_id"])
125+
# self.assertEqual(2, self.differ.stats["table1_max_id"])
126+
# self.assertEqual(1, self.differ.stats["table2_min_id"])
127127

128128
# Test materialize
129129
materialize_path = self.connection.parse_table_name(f"test_mat_{random_table_suffix()}")

0 commit comments

Comments
 (0)