Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Add support for array-like inputs in cudf.get_dummies #7181

Merged
merged 4 commits into from
Jan 21, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 94 additions & 54 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,8 +575,8 @@ def get_dummies(

Parameters
----------
df : cudf.DataFrame
dataframe to encode
df : array-like, Series, or DataFrame
Data of which to get dummy indicators.
prefix : str, dict, or sequence, optional
prefix to append. Either a str (to apply a constant prefix), dict
mapping column names to prefixes, or sequence of prefixes to apply with
Expand Down Expand Up @@ -633,6 +633,22 @@ def get_dummies(
1 0 1 0 0
2 0 0 1 0
3 0 0 0 1

>>> series = cudf.Series([1, 2, None, 2, 4])
>>> series
0 1
1 2
2 <NA>
3 2
4 4
dtype: int64
>>> cudf.get_dummies(series, dummy_na=True)
null 1 2 4
0 0 1 0 0
1 0 0 1 0
2 1 0 0 0
3 0 0 1 0
4 0 0 0 1
"""
if cats is None:
cats = {}
Expand All @@ -642,67 +658,79 @@ def get_dummies(
if drop_first:
raise NotImplementedError("drop_first is not supported yet")

encode_fallback_dtypes = ["object", "category"]
if isinstance(df, cudf.DataFrame):
encode_fallback_dtypes = ["object", "category"]

if columns is None or len(columns) == 0:
columns = df.select_dtypes(include=encode_fallback_dtypes).columns
if columns is None or len(columns) == 0:
columns = df.select_dtypes(include=encode_fallback_dtypes).columns

def length_check(obj, name):
if cudf.utils.dtypes.is_list_like(obj):
if len(obj) != len(columns):
raise ValueError(
f"Length of '{name}' ({len(obj)}) did not match the "
f"length of the columns being encoded ({len(columns)})."
)
def length_check(obj, name):
if cudf.utils.dtypes.is_list_like(obj):
if len(obj) != len(columns):
raise ValueError(
f"Length of '{name}' ({len(obj)}) did not match the "
f"length of the columns being "
f"encoded ({len(columns)})."
)
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved

length_check(prefix, "prefix")
length_check(prefix_sep, "prefix_sep")
length_check(prefix, "prefix")
length_check(prefix_sep, "prefix_sep")

if prefix is None:
prefix = columns
if prefix is None:
prefix = columns

if isinstance(prefix, str):
prefix_map = {}
elif isinstance(prefix, dict):
prefix_map = prefix
else:
prefix_map = dict(zip(columns, prefix))
if isinstance(prefix, str):
prefix_map = {}
elif isinstance(prefix, dict):
prefix_map = prefix
else:
prefix_map = dict(zip(columns, prefix))

if isinstance(prefix_sep, str):
prefix_sep_map = {}
elif isinstance(prefix_sep, dict):
prefix_sep_map = prefix_sep
else:
prefix_sep_map = dict(zip(columns, prefix_sep))
if isinstance(prefix_sep, str):
prefix_sep_map = {}
elif isinstance(prefix_sep, dict):
prefix_sep_map = prefix_sep
else:
prefix_sep_map = dict(zip(columns, prefix_sep))

# If we have no columns to encode, we need to drop
# fallback columns(if any)
if len(columns) == 0:
return df.select_dtypes(exclude=encode_fallback_dtypes)
else:
result_df = df.drop(columns=columns)
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
for name in columns:
unique = _get_unique(column=df._data[name], dummy_na=dummy_na)

col_enc_df = df.one_hot_encoding(
name,
prefix=prefix_map.get(name, prefix),
cats=cats.get(name, unique),
prefix_sep=prefix_sep_map.get(name, prefix_sep),
dtype=dtype,
)
for col in col_enc_df.columns.difference(df._data.names):
result_df[col] = col_enc_df._data[col]

# If we have no columns to encode, we need to drop fallback columns(if any)
if len(columns) == 0:
return df.select_dtypes(exclude=encode_fallback_dtypes)
return result_df
else:
result_df = df.drop(columns=columns)
for name in columns:
if isinstance(
df[name]._column, cudf.core.column.CategoricalColumn
):
unique = df[name]._column.categories
else:
unique = df[name].unique()

if not dummy_na:
if np.issubdtype(unique.dtype, np.floating):
unique = unique.nans_to_nulls()
unique = unique.dropna()

col_enc_df = df.one_hot_encoding(
name,
prefix=prefix_map.get(name, prefix),
cats=cats.get(name, unique),
prefix_sep=prefix_sep_map.get(name, prefix_sep),
dtype=dtype,
)
for col in col_enc_df.columns.difference(df._data.names):
result_df[col] = col_enc_df._data[col]
ser = cudf.Series(df)
unique = _get_unique(column=ser._column, dummy_na=dummy_na)

if hasattr(unique, "to_arrow"):
cats = unique.to_arrow().to_pylist()
else:
cats = pd.Series(unique, dtype="object")
kkraus14 marked this conversation as resolved.
Show resolved Hide resolved

col_names = ["null" if cat is None else cat for cat in cats]

if prefix is not None:
col_names = [f"{prefix}{prefix_sep}{cat}" for cat in col_names]

newcols = ser.one_hot_encoding(cats=cats, dtype=dtype)
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
result_df = cudf.DataFrame(
dict(zip(col_names, newcols)), index=ser.index
)
return result_df


Expand Down Expand Up @@ -1013,3 +1041,15 @@ def unstack(df, level, fill_value=None):
if result.index.nlevels == 1:
result.index = result.index.get_level_values(result.index.names[0])
return result


def _get_unique(column, dummy_na):
    """Return the distinct values of *column* to use as one-hot categories.

    A categorical column contributes its declared categories directly;
    any other column contributes its computed distinct values.  When
    ``dummy_na`` is False, nulls are excluded — floating-point NaNs are
    first normalised to nulls so a single ``dropna`` removes both.
    """
    if isinstance(column, cudf.core.column.CategoricalColumn):
        cats = column.categories
    else:
        cats = column.unique()

    # With dummy_na the null/NaN entries get their own indicator column,
    # so keep them in the category list.
    if dummy_na:
        return cats

    if np.issubdtype(cats.dtype, np.floating):
        cats = cats.nans_to_nulls()
    return cats.dropna()
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
12 changes: 12 additions & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -5266,3 +5266,15 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
result_col[equal_nulls] = True

return Series(result_col, index=index)


def _get_unique(df, dummy_na, name):
    """Distinct values of column *name* of *df*, for one-hot encoding.

    Categorical columns contribute their declared categories; other
    columns contribute their computed distinct values.  Unless
    ``dummy_na`` is set, nulls are dropped — floating-point NaNs are
    first converted to nulls so ``dropna`` removes both.
    """
    col = df[name]._column
    if isinstance(col, cudf.core.column.CategoricalColumn):
        uniques = col.categories
    else:
        uniques = df[name].unique()

    if not dummy_na:
        if np.issubdtype(uniques.dtype, np.floating):
            uniques = uniques.nans_to_nulls()
        uniques = uniques.dropna()
    return uniques
44 changes: 44 additions & 0 deletions python/cudf/cudf/tests/test_onehot.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,3 +205,47 @@ def test_get_dummies_with_nan():
actual = cudf.get_dummies(df, dummy_na=True, columns=["a"])

utils.assert_eq(expected, actual)


@pytest.mark.parametrize(
    "data",
    [
        cudf.Series(["abc", "l", "a", "abc", "z", "xyz"]),
        cudf.Index([None, 1, 2, 3.3, None, 0.2]),
        cudf.Series([0.1, 2, 3, None, np.nan]),
        cudf.Series([23678, 324, 1, 324], name="abc"),
    ],
)
@pytest.mark.parametrize("prefix_sep", ["-", "#"])
@pytest.mark.parametrize("prefix", [None, "hi"])
@pytest.mark.parametrize("dtype", ["uint8", "int16"])
def test_get_dummies_array_like(data, prefix_sep, prefix, dtype):
    """cudf.get_dummies on Series/Index input matches pandas.get_dummies."""
    expected = cudf.get_dummies(
        data, prefix=prefix, prefix_sep=prefix_sep, dtype=dtype
    )

    # Run pandas on an equivalent host-side object for comparison.
    pd_data = (
        data.to_pandas()
        if isinstance(data, (cudf.Series, cudf.Index))
        else data
    )
    actual = pd.get_dummies(
        pd_data, prefix=prefix, prefix_sep=prefix_sep, dtype=dtype
    )

    utils.assert_eq(expected, actual)


def test_get_dummies_array_like_with_nan():
    """With dummy_na=True, null and NaN get separate indicator columns."""
    # nan_as_null=False keeps the NaN distinct from the None-backed null.
    ser = cudf.Series([0.1, 2, 3, None, np.nan], nan_as_null=False)

    expected_columns = {
        "a_null": [0, 0, 0, 1, 0],
        "a_0.1": [1, 0, 0, 0, 0],
        "a_2.0": [0, 1, 0, 0, 0],
        "a_3.0": [0, 0, 1, 0, 0],
        "a_nan": [0, 0, 0, 0, 1],
    }
    expected = cudf.DataFrame(expected_columns, dtype="uint8")

    actual = cudf.get_dummies(ser, dummy_na=True, prefix="a", prefix_sep="_")
    utils.assert_eq(expected, actual)