Skip to content

Commit f52578c

Browse files
committed
FIX-#1959 #1987: Fix incorrect behavior of `duplicated` and
`drop_duplicates` functions

Signed-off-by: Alexey Prutskov <alexey.prutskov@intel.com>
1 parent 03fe0d6 commit f52578c

File tree

4 files changed

+31
-11
lines changed

4 files changed

+31
-11
lines changed

modin/pandas/dataframe.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,11 +282,15 @@ def duplicated(self, subset=None, keep="first"):
282282
Returns:
283283
Series
284284
"""
285+
import hashlib
286+
285287
df = self[subset] if subset is not None else self
286288
# if the number of columns we are checking for duplicates is larger than 1, we must
287289
# hash them to generate a single value that can be compared across rows.
288290
if len(df.columns) > 1:
289-
hashed = df.apply(lambda s: hash(tuple(s)), axis=1).to_frame()
291+
hashed = df.apply(
292+
lambda s: hashlib.new("md5", str(tuple(s)).encode()).hexdigest(), axis=1
293+
).to_frame()
290294
else:
291295
hashed = df
292296
duplicates = hashed.apply(lambda s: s.duplicated(keep=keep)).squeeze(axis=1)

modin/pandas/test/dataframe/test_window.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
name_contains,
2525
test_data_values,
2626
test_data_keys,
27+
test_data_with_duplicates_values,
28+
test_data_with_duplicates_keys,
2729
no_numeric_dfs,
2830
quantiles_keys,
2931
quantiles_values,
@@ -211,7 +213,9 @@ def test_diff(request, data, axis, periods):
211213
df_equals(modin_result, pandas_result)
212214

213215

214-
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
216+
@pytest.mark.parametrize(
217+
"data", test_data_with_duplicates_values, ids=test_data_with_duplicates_keys
218+
)
215219
@pytest.mark.parametrize(
216220
"keep", ["last", "first", False], ids=["last", "first", "False"]
217221
)

modin/pandas/test/test_series.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
test_data,
3232
test_data_values,
3333
test_data_keys,
34+
test_data_with_duplicates_values,
35+
test_data_with_duplicates_keys,
3436
test_string_data_values,
3537
test_string_data_keys,
3638
test_string_list_data_values,
@@ -1391,7 +1393,9 @@ def test_drop():
13911393
modin_series.drop(None, None, None, None)
13921394

13931395

1394-
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
1396+
@pytest.mark.parametrize(
1397+
"data", test_data_with_duplicates_values, ids=test_data_with_duplicates_keys
1398+
)
13951399
@pytest.mark.parametrize(
13961400
"keep", ["last", "first", False], ids=["last", "first", "False"]
13971401
)
@@ -1527,7 +1531,9 @@ def test_dt():
15271531
df_equals(modin_series.dt.to_timestamp(), pandas_series.dt.to_timestamp())
15281532

15291533

1530-
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
1534+
@pytest.mark.parametrize(
1535+
"data", test_data_with_duplicates_values, ids=test_data_with_duplicates_keys
1536+
)
15311537
@pytest.mark.parametrize(
15321538
"keep", ["last", "first", False], ids=["last", "first", "False"]
15331539
)

modin/pandas/test/utils.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,11 @@
101101
"col{}".format(int(NCOLS / 2))
102102
)
103103

104+
for col in test_data["float_nan_data"]:
105+
for row in range(NROWS // 2):
106+
if row % 16 == 0:
107+
test_data["float_nan_data"][col][row] = np.NaN
108+
104109
test_data_values = list(test_data.values())
105110
test_data_keys = list(test_data.keys())
106111

@@ -128,21 +133,22 @@
128133
]
129134
for i in range(NCOLS)
130135
},
131-
"subset_duplicates": {
132-
"col{}".format(i): [
133-
i if j % 7 == 0 and i in [1, 3, 7] else x
134-
for j, x in enumerate(range(NROWS))
135-
]
136-
for i in range(NCOLS)
137-
},
138136
"has_name_column": {
139137
"name": ["one", "two", "two", "three"],
140138
"col1": [1, 2, 2, 3],
141139
"col3": [10, 20, 20, 3],
142140
"col7": [100, 201, 200, 300],
143141
},
142+
"str_columns": {
143+
"col_str{}".format(int((i - NCOLS / 2) % NCOLS + 1)): [
144+
"s" + str(x % 5) for x in range(NROWS)
145+
]
146+
for i in range(NCOLS)
147+
},
144148
}
145149

150+
test_data_with_duplicates["float_nan"] = test_data["float_nan_data"]
151+
146152
test_data_small = {
147153
"small": {
148154
"col0": [1, 2, 3, 4],

0 commit comments

Comments
 (0)