Skip to content

Commit

Permalink
[FIX] Fix #1683 - losing index names in pd.concat (#1684)
Browse files Browse the repository at this point in the history
Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
  • Loading branch information
dchigarev authored Jul 22, 2020
1 parent c1aba38 commit 51c3803
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 48 deletions.
47 changes: 46 additions & 1 deletion modin/pandas/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,13 @@
# governing permissions and limitations under the License.

import pandas
import numpy as np

from typing import Hashable, Iterable, Mapping, Optional, Union
from pandas._typing import FrameOrSeriesUnion
from pandas.core.dtypes.common import is_list_like

from modin.backends.base.query_compiler import BaseQueryCompiler
from .dataframe import DataFrame
from .series import Series

Expand Down Expand Up @@ -108,8 +111,18 @@ def concat(
new_idx_labels = {
k: v.index if axis == 0 else v.columns for k, v in zip(keys, objs)
}
tuples = [(k, o) for k, obj in new_idx_labels.items() for o in obj]
tuples = [
(k, *o) if isinstance(o, tuple) else (k, o)
for k, obj in new_idx_labels.items()
for o in obj
]
new_idx = pandas.MultiIndex.from_tuples(tuples)
if names is not None:
new_idx.names = names
else:
old_name = _determine_name(objs, axis)
if old_name is not None:
new_idx.names = [None] + old_name
else:
new_idx = None
new_query_compiler = objs[0].concat(
Expand All @@ -132,3 +145,35 @@ def concat(
else:
result_df.columns = new_idx
return result_df


def _determine_name(objs: Iterable[BaseQueryCompiler], axis: Union[int, str]):
    """
    Determine names of index after concatenation along passed axis.

    Parameters
    ----------
    objs : iterable of QueryCompilers
        Objects to concatenate.
    axis : int or str
        The axis to concatenate along.

    Returns
    -------
    list or None
        The level names of the labels along `axis` when every object carries
        exactly the same names; `None` when the names differ between objects
        or when `objs` is empty.
    """
    axis = pandas.DataFrame()._get_axis_number(axis)

    def get_names(obj):
        # axis == 1 -> names of the column labels, axis == 0 -> names of the index
        return obj.columns.names if axis else obj.index.names

    # Keep the names in plain Python lists: packing them into a NumPy array
    # coerces mixed-type names to a common dtype (e.g. ``1`` becomes ``'1'``)
    # and breaks when objects have a different number of levels.
    names = [list(get_names(obj)) for obj in objs]
    if not names:
        return None

    first = names[0]
    # saving old names only if the names of all objs are the same
    if all(current == first for current in names[1:]):
        return first
    return None
66 changes: 19 additions & 47 deletions modin/pandas/test/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,57 +17,11 @@

import modin.pandas as pd
from modin.pandas.utils import from_pandas
from .utils import df_equals
from .utils import df_equals, generate_dfs, generate_multiindex_dfs, generate_none_dfs

pd.DEFAULT_NPARTITIONS = 4


def generate_dfs():
    """
    Build two small pandas frames for concat tests.

    Both frames have four rows and share ``col1``-``col3``; the first one adds
    ``col4``/``col5`` and the second one adds ``col6``/``col7`` with the same
    values.

    Returns
    -------
    tuple of pandas.DataFrame
        The two frames, in that order.
    """
    shared = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 10, 11],
    }
    first = pandas.DataFrame(
        {**shared, "col4": [12, 13, 14, 15], "col5": [0, 0, 0, 0]}
    )
    second = pandas.DataFrame(
        {**shared, "col6": [12, 13, 14, 15], "col7": [0, 0, 0, 0]}
    )
    return first, second


def generate_none_dfs():
    """
    Build two small pandas frames, the first of which contains missing values.

    In the first frame ``col2`` has a single ``None`` and ``col5`` consists of
    ``None`` only; the second frame has no missing values.

    Returns
    -------
    tuple of pandas.DataFrame
        The frame with missing values followed by the complete frame.
    """
    with_gaps = pandas.DataFrame(
        dict(
            col1=[0, 1, 2, 3],
            col2=[4, 5, None, 7],
            col3=[8, 9, 10, 11],
            col4=[12, 13, 14, 15],
            col5=[None, None, None, None],
        )
    )
    complete = pandas.DataFrame(
        dict(
            col1=[0, 1, 2, 3],
            col2=[4, 5, 6, 7],
            col3=[8, 9, 10, 11],
            col6=[12, 13, 14, 15],
            col7=[0, 0, 0, 0],
        )
    )
    return with_gaps, complete


def test_df_concat():
df, df2 = generate_dfs()

Expand Down Expand Up @@ -207,3 +161,21 @@ def test_concat_with_empty_frame():
pd.concat([modin_empty_df, modin_row]),
pandas.concat([pandas_empty_df, pandas_row]),
)


@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("names", [False, True])
def test_concat_multiindex(axis, names):
    """
    Compare ``pd.concat`` with ``pandas.concat`` on the multiindex test frames.

    The frames come from ``generate_multiindex_dfs(axis=axis)`` and are
    concatenated with ``keys``; when ``names`` is truthy, explicit level names
    (one more than the number of levels on ``axis``) are passed as well.
    """
    pandas_first, pandas_second = generate_multiindex_dfs(axis=axis)
    modin_first, modin_second = map(from_pandas, [pandas_first, pandas_second])

    # One extra level is produced by `keys`, hence `nlevels + 1` names.
    level_names = None
    if names:
        level_count = pandas_first.axes[axis].nlevels + 1
        level_names = [str(i) for i in np.arange(level_count)]

    concat_kwargs = {"keys": ["first", "second"], "axis": axis, "names": level_names}

    modin_result = pd.concat([modin_first, modin_second], **concat_kwargs)
    pandas_result = pandas.concat([pandas_first, pandas_second], **concat_kwargs)
    df_equals(modin_result, pandas_result)
59 changes: 59 additions & 0 deletions modin/pandas/test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,3 +592,62 @@ def execute_callable(fn, md_kwargs={}, pd_kwargs={}):

def create_test_dfs(*args, **kwargs):
    """
    Construct a ``pd.DataFrame`` and a ``pandas.DataFrame`` from the same arguments.

    All positional and keyword arguments are forwarded unchanged to both
    constructors. NOTE(review): ``pd`` is presumably ``modin.pandas`` here -
    confirm against this module's imports.

    Returns
    -------
    tuple
        The ``pd.DataFrame`` followed by the ``pandas.DataFrame``.
    """
    first_frame = pd.DataFrame(*args, **kwargs)
    reference_frame = pandas.DataFrame(*args, **kwargs)
    return first_frame, reference_frame


def generate_dfs():
    """
    Create a pair of four-row pandas frames with partially overlapping columns.

    ``col1``, ``col2`` and ``col3`` are present in both frames; the remaining
    two columns are named ``col4``/``col5`` in the first frame and
    ``col6``/``col7`` in the second one.

    Returns
    -------
    tuple of pandas.DataFrame
        The first and the second frame.
    """
    common_columns = [
        ("col1", [0, 1, 2, 3]),
        ("col2", [4, 5, 6, 7]),
        ("col3", [8, 9, 10, 11]),
    ]

    def build(fourth_name, fifth_name):
        # Frame-specific columns are appended after the shared ones.
        extra_columns = [
            (fourth_name, [12, 13, 14, 15]),
            (fifth_name, [0, 0, 0, 0]),
        ]
        return pandas.DataFrame(dict(common_columns + extra_columns))

    return build("col4", "col5"), build("col6", "col7")


def generate_multiindex_dfs(axis=1):
    """
    Build the frames from `generate_dfs` with a two-level MultiIndex on `axis`.

    Every original label ``x`` along `axis` is replaced by the tuple
    ``("a", x)``; the levels are named ``"name1"`` and ``"name2"``.

    Parameters
    ----------
    axis : int, default 1
        Axis whose labels are replaced: 0 for the index, 1 for the columns.

    Returns
    -------
    tuple of pandas.DataFrame
        The two frames carrying the MultiIndex on `axis`.
    """

    def generate_multiindex(index):
        return pandas.MultiIndex.from_tuples(
            [("a", x) for x in index.values], names=["name1", "name2"]
        )

    # `DataFrame.axes` builds a new list on every access, so assigning into
    # `df.axes[axis]` only changes that temporary list and leaves the frame
    # untouched. `set_axis` actually attaches the new labels.
    df1, df2 = (
        df.set_axis(generate_multiindex(df.axes[axis]), axis=axis)
        for df in generate_dfs()
    )
    return df1, df2


def generate_none_dfs():
    """
    Create a pair of four-row pandas frames where only the first has gaps.

    The first frame holds one ``None`` in ``col2`` and nothing but ``None`` in
    ``col5``; the second frame is fully populated.

    Returns
    -------
    tuple of pandas.DataFrame
        The frame with ``None`` values and the fully populated frame.
    """
    sparse_labels = ["col1", "col2", "col3", "col4", "col5"]
    sparse_values = [
        [0, 1, 2, 3],
        [4, 5, None, 7],
        [8, 9, 10, 11],
        [12, 13, 14, 15],
        [None, None, None, None],
    ]

    dense_labels = ["col1", "col2", "col3", "col6", "col7"]
    dense_values = [
        [0, 1, 2, 3],
        [4, 5, 6, 7],
        [8, 9, 10, 11],
        [12, 13, 14, 15],
        [0, 0, 0, 0],
    ]

    sparse_frame = pandas.DataFrame(dict(zip(sparse_labels, sparse_values)))
    dense_frame = pandas.DataFrame(dict(zip(dense_labels, dense_values)))
    return sparse_frame, dense_frame

0 comments on commit 51c3803

Please sign in to comment.