FIX-#1959 #1987: Fix incorrect behavior of `duplicated` and

prutskov · prutskov · commit f52578c9ee37 · 2020-09-07T16:40:25.000+03:00
`drop_duplicates` functions

Signed-off-by: Alexey Prutskov <alexey.prutskov@intel.com>
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
@@ -282,11 +282,15 @@ def duplicated(self, subset=None, keep="first"):
         Returns:
             Series
         """
+        import hashlib
+
         df = self[subset] if subset is not None else self
         # if the number of columns we are checking for duplicates is larger than 1, we must
         # hash them to generate a single value that can be compared across rows.
         if len(df.columns) > 1:
-            hashed = df.apply(lambda s: hash(tuple(s)), axis=1).to_frame()
+            hashed = df.apply(
+                lambda s: hashlib.new("md5", str(tuple(s)).encode()).hexdigest(), axis=1
+            ).to_frame()
         else:
             hashed = df
         duplicates = hashed.apply(lambda s: s.duplicated(keep=keep)).squeeze(axis=1)
diff --git a/modin/pandas/test/dataframe/test_window.py b/modin/pandas/test/dataframe/test_window.py
@@ -24,6 +24,8 @@
     name_contains,
     test_data_values,
     test_data_keys,
+    test_data_with_duplicates_values,
+    test_data_with_duplicates_keys,
     no_numeric_dfs,
     quantiles_keys,
     quantiles_values,
@@ -211,7 +213,9 @@ def test_diff(request, data, axis, periods):
         df_equals(modin_result, pandas_result)
 
 
-@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+@pytest.mark.parametrize(
+    "data", test_data_with_duplicates_values, ids=test_data_with_duplicates_keys
+)
 @pytest.mark.parametrize(
     "keep", ["last", "first", False], ids=["last", "first", "False"]
 )
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
@@ -31,6 +31,8 @@
     test_data,
     test_data_values,
     test_data_keys,
+    test_data_with_duplicates_values,
+    test_data_with_duplicates_keys,
     test_string_data_values,
     test_string_data_keys,
     test_string_list_data_values,
@@ -1391,7 +1393,9 @@ def test_drop():
         modin_series.drop(None, None, None, None)
 
 
-@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+@pytest.mark.parametrize(
+    "data", test_data_with_duplicates_values, ids=test_data_with_duplicates_keys
+)
 @pytest.mark.parametrize(
     "keep", ["last", "first", False], ids=["last", "first", "False"]
 )
@@ -1527,7 +1531,9 @@ def test_dt():
     df_equals(modin_series.dt.to_timestamp(), pandas_series.dt.to_timestamp())
 
 
-@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+@pytest.mark.parametrize(
+    "data", test_data_with_duplicates_values, ids=test_data_with_duplicates_keys
+)
 @pytest.mark.parametrize(
     "keep", ["last", "first", False], ids=["last", "first", "False"]
 )
diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py
@@ -101,6 +101,11 @@
     "col{}".format(int(NCOLS / 2))
 )
 
+for col in test_data["float_nan_data"]:
+    for row in range(NROWS // 2):
+        if row % 16 == 0:
+            test_data["float_nan_data"][col][row] = np.NaN
+
 test_data_values = list(test_data.values())
 test_data_keys = list(test_data.keys())
 
@@ -128,21 +133,22 @@
         ]
         for i in range(NCOLS)
     },
-    "subset_duplicates": {
-        "col{}".format(i): [
-            i if j % 7 == 0 and i in [1, 3, 7] else x
-            for j, x in enumerate(range(NROWS))
-        ]
-        for i in range(NCOLS)
-    },
     "has_name_column": {
         "name": ["one", "two", "two", "three"],
         "col1": [1, 2, 2, 3],
         "col3": [10, 20, 20, 3],
         "col7": [100, 201, 200, 300],
     },
+    "str_columns": {
+        "col_str{}".format(int((i - NCOLS / 2) % NCOLS + 1)): [
+            "s" + str(x % 5) for x in range(NROWS)
+        ]
+        for i in range(NCOLS)
+    },
 }
 
+test_data_with_duplicates["float_nan"] = test_data["float_nan_data"]
+
 test_data_small = {
     "small": {
         "col0": [1, 2, 3, 4],