Skip to content

Commit

Permalink
[SPARK-44453][PYTHON] Use difflib to display errors in assertDataFrameEqual
Browse files Browse the repository at this point in the history

### What changes were proposed in this pull request?
This PR uses Python's built-in `difflib` library to display the differing rows in the error message of the testing util `assertDataFrameEqual`.

### Why are the changes needed?
The change makes the error message output more user-friendly and consistent with the output of `assertSchemaEqual`.

### Does this PR introduce _any_ user-facing change?
Yes, the PR changes the test util output for the user-facing util function `assertDataFrameEqual`.

### How was this patch tested?
Existing tests in `python/pyspark/sql/tests/test_utils.py` and `python/pyspark/sql/tests/connect/test_utils.py`.

Example output:
<img width="891" alt="Screenshot 2023-07-16 at 8 20 31 PM" src="https://github.com/apache/spark/assets/68875504/2d7a9d02-bb9e-4c21-b330-5ec01b2e9ec8">

<img width="868" alt="Screenshot 2023-07-16 at 8 20 41 PM" src="https://github.com/apache/spark/assets/68875504/eba9f3e8-e147-491c-934b-34e8351df012">

Closes apache#42031 from asl3/difflib-assertdfequal.

Authored-by: Amanda Liu <amanda.liu@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
  • Loading branch information
asl3 authored and HyukjinKwon committed Jul 17, 2023
1 parent e578d46 commit aa68810
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 128 deletions.
177 changes: 63 additions & 114 deletions python/pyspark/sql/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,19 +151,14 @@ def test_assert_approx_equal_arraytype_float_default_rtol_fail(self):
expected_error_message = "Results do not match: "
percent_diff = (1 / 2) * 100
expected_error_message += "( %.5f %% )" % percent_diff
diff_msg = (
"[actual]"
+ "\n"
+ str(df1.collect()[1])
+ "\n\n"
+ "[expected]"
+ "\n"
+ str(df2.collect()[1])
+ "\n\n"
+ "********************"
+ "\n\n"
)
expected_error_message += "\n" + diff_msg

generated_diff = difflib.ndiff(
str(df1.collect()[1]).splitlines(), str(df2.collect()[1]).splitlines()
)
diff_msg = "\n" + "\n".join(generated_diff) + "\n"
diff_msg += "********************" + "\n"

expected_error_message += "\n" + "--- actual\n+++ expected\n" + diff_msg

with self.assertRaises(PySparkAssertionError) as pe:
assertDataFrameEqual(df1, df2)
Expand Down Expand Up @@ -294,19 +289,14 @@ def test_assert_notequal_arraytype(self):
expected_error_message = "Results do not match: "
percent_diff = (1 / 2) * 100
expected_error_message += "( %.5f %% )" % percent_diff
diff_msg = (
"[actual]"
+ "\n"
+ str(df1.collect()[1])
+ "\n\n"
+ "[expected]"
+ "\n"
+ str(df2.collect()[1])
+ "\n\n"
+ "********************"
+ "\n\n"
)
expected_error_message += "\n" + diff_msg

generated_diff = difflib.ndiff(
str(df1.collect()[1]).splitlines(), str(df2.collect()[1]).splitlines()
)
diff_msg = "\n" + "\n".join(generated_diff) + "\n"
diff_msg += "********************" + "\n"

expected_error_message += "\n" + "--- actual\n+++ expected\n" + diff_msg

with self.assertRaises(PySparkAssertionError) as pe:
assertDataFrameEqual(df1, df2)
Expand Down Expand Up @@ -598,19 +588,14 @@ def test_assert_notequal_nullval(self):
expected_error_message = "Results do not match: "
percent_diff = (1 / 2) * 100
expected_error_message += "( %.5f %% )" % percent_diff
diff_msg = (
"[actual]"
+ "\n"
+ str(df1.collect()[1])
+ "\n\n"
+ "[expected]"
+ "\n"
+ str(df2.collect()[1])
+ "\n\n"
+ "********************"
+ "\n\n"
)
expected_error_message += "\n" + diff_msg

generated_diff = difflib.ndiff(
str(df1.collect()[1]).splitlines(), str(df2.collect()[1]).splitlines()
)
diff_msg = "\n" + "\n".join(generated_diff) + "\n"
diff_msg += "********************" + "\n"

expected_error_message += "\n" + "--- actual\n+++ expected\n" + diff_msg

with self.assertRaises(PySparkAssertionError) as pe:
assertDataFrameEqual(df1, df2)
Expand Down Expand Up @@ -722,31 +707,19 @@ def test_check_row_order_error(self):
expected_error_message = "Results do not match: "
percent_diff = (2 / 2) * 100
expected_error_message += "( %.5f %% )" % percent_diff
diff_msg = (
"[actual]"
+ "\n"
+ str(df1.collect()[0])
+ "\n\n"
+ "[expected]"
+ "\n"
+ str(df2.collect()[0])
+ "\n\n"
+ "********************"
+ "\n\n"
)
diff_msg += (
"[actual]"
+ "\n"
+ str(df1.collect()[1])
+ "\n\n"
+ "[expected]"
+ "\n"
+ str(df2.collect()[1])
+ "\n\n"
+ "********************"
+ "\n\n"
)
expected_error_message += "\n" + diff_msg

generated_diff = difflib.ndiff(
str(df1.collect()[0]).splitlines(), str(df2.collect()[0]).splitlines()
)
diff_msg = "\n" + "\n".join(generated_diff) + "\n"
diff_msg += "********************" + "\n"
generated_diff = difflib.ndiff(
str(df1.collect()[1]).splitlines(), str(df2.collect()[1]).splitlines()
)
diff_msg += "\n" + "\n".join(generated_diff) + "\n"
diff_msg += "********************" + "\n"

expected_error_message += "\n" + "--- actual\n+++ expected\n" + diff_msg

with self.assertRaises(PySparkAssertionError) as pe:
assertDataFrameEqual(df1, df2, checkRowOrder=True)
Expand Down Expand Up @@ -829,31 +802,19 @@ def test_assert_pyspark_df_not_equal(self):
expected_error_message = "Results do not match: "
percent_diff = (2 / 3) * 100
expected_error_message += "( %.5f %% )" % percent_diff
diff_msg = (
"[actual]"
+ "\n"
+ str(df1.collect()[0])
+ "\n\n"
+ "[expected]"
+ "\n"
+ str(df2.collect()[0])
+ "\n\n"
+ "********************"
+ "\n\n"
)
diff_msg += (
"[actual]"
+ "\n"
+ str(df1.collect()[2])
+ "\n\n"
+ "[expected]"
+ "\n"
+ str(df2.collect()[2])
+ "\n\n"
+ "********************"
+ "\n\n"
)
expected_error_message += "\n" + diff_msg

generated_diff = difflib.ndiff(
str(df1.collect()[0]).splitlines(), str(df2.collect()[0]).splitlines()
)
diff_msg = "\n" + "\n".join(generated_diff) + "\n"
diff_msg += "********************" + "\n"
generated_diff = difflib.ndiff(
str(df1.collect()[2]).splitlines(), str(df2.collect()[2]).splitlines()
)
diff_msg += "\n" + "\n".join(generated_diff) + "\n"
diff_msg += "********************" + "\n"

expected_error_message += "\n" + "--- actual\n+++ expected\n" + diff_msg

with self.assertRaises(PySparkAssertionError) as pe:
assertDataFrameEqual(df1, df2)
Expand Down Expand Up @@ -1197,31 +1158,19 @@ def test_list_row_unequal_schema(self):
expected_error_message = "Results do not match: "
percent_diff = (2 / 2) * 100
expected_error_message += "( %.5f %% )" % percent_diff
diff_msg = (
"[actual]"
+ "\n"
+ str(df1.collect()[0])
+ "\n\n"
+ "[expected]"
+ "\n"
+ str(list_of_rows[0])
+ "\n\n"
+ "********************"
+ "\n\n"
)
diff_msg += (
"[actual]"
+ "\n"
+ str(df1.collect()[1])
+ "\n\n"
+ "[expected]"
+ "\n"
+ str(list_of_rows[1])
+ "\n\n"
+ "********************"
+ "\n\n"
)
expected_error_message += "\n" + diff_msg

generated_diff = difflib.ndiff(
str(df1.collect()[0]).splitlines(), str(list_of_rows[0]).splitlines()
)
diff_msg = "\n" + "\n".join(generated_diff) + "\n"
diff_msg += "********************" + "\n"
generated_diff = difflib.ndiff(
str(df1.collect()[1]).splitlines(), str(list_of_rows[1]).splitlines()
)
diff_msg += "\n" + "\n".join(generated_diff) + "\n"
diff_msg += "********************" + "\n"

expected_error_message += "\n" + "--- actual\n+++ expected\n" + diff_msg

with self.assertRaises(PySparkAssertionError) as pe:
assertDataFrameEqual(df1, list_of_rows)
Expand Down
33 changes: 19 additions & 14 deletions python/pyspark/testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,9 @@ def assertDataFrameEqual(
Notes
-----
When assertDataFrameEqual fails, the error message uses the Python `difflib` library to display
a diff log of each row that differs in `actual` and `expected`.
For checkRowOrder, note that PySpark DataFrame ordering is non-deterministic, unless
explicitly sorted.
Expand Down Expand Up @@ -374,15 +377,18 @@ def assertDataFrameEqual(
>>> assertDataFrameEqual(df1, df2) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
...
PySparkAssertionError: [DIFFERENT_ROWS] Results do not match: ( 66.667 % )
[actual]
Row(id='1', amount=1000.0)
[expected]
Row(id='1', amount=1001.0)
[actual]
Row(id='3', amount=2000.0)
[expected]
Row(id='3', amount=2003.0)
PySparkAssertionError: [DIFFERENT_ROWS] Results do not match: ( 66.66667 % )
--- actual
+++ expected
- Row(id='1', amount=1000.0)
? ^
+ Row(id='1', amount=1001.0)
? ^
- Row(id='3', amount=2000.0)
? ^
+ Row(id='3', amount=2003.0)
? ^
"""
if actual is None and expected is None:
return True
Expand Down Expand Up @@ -471,15 +477,14 @@ def assert_rows_equal(rows1: List[Row], rows2: List[Row]):
if not compare_rows(r1, r2):
rows_equal = False
diff_rows_cnt += 1
diff_msg += (
"[actual]" + "\n" + str(r1) + "\n\n" + "[expected]" + "\n" + str(r2) + "\n\n"
)
diff_msg += "********************" + "\n\n"
generated_diff = difflib.ndiff(str(r1).splitlines(), str(r2).splitlines())
diff_msg += "\n" + "\n".join(generated_diff) + "\n"
diff_msg += "********************" + "\n"

if not rows_equal:
percent_diff = (diff_rows_cnt / len(zipped)) * 100
error_msg += "( %.5f %% )" % percent_diff
error_msg += "\n" + diff_msg
error_msg += "\n" + "--- actual\n+++ expected\n" + diff_msg
raise PySparkAssertionError(
error_class="DIFFERENT_ROWS",
message_parameters={"error_msg": error_msg},
Expand Down

0 comments on commit aa68810

Please sign in to comment.