[FIX] Fix modin-project#1683 - losing index names in pd.concat (modin-project#1684)

dchigarev · aregm · commit 55d05c82a721 · 2020-09-15T23:58:09.000-05:00
Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
diff --git a/modin/pandas/concat.py b/modin/pandas/concat.py
@@ -12,10 +12,13 @@
 # governing permissions and limitations under the License.
 
 import pandas
+import numpy as np
 
 from typing import Hashable, Iterable, Mapping, Optional, Union
 from pandas._typing import FrameOrSeriesUnion
+from pandas.core.dtypes.common import is_list_like
 
+from modin.backends.base.query_compiler import BaseQueryCompiler
 from .dataframe import DataFrame
 from .series import Series
 
@@ -108,8 +111,18 @@ def concat(
             new_idx_labels = {
                 k: v.index if axis == 0 else v.columns for k, v in zip(keys, objs)
             }
-            tuples = [(k, o) for k, obj in new_idx_labels.items() for o in obj]
+            tuples = [
+                (k, *o) if isinstance(o, tuple) else (k, o)
+                for k, obj in new_idx_labels.items()
+                for o in obj
+            ]
             new_idx = pandas.MultiIndex.from_tuples(tuples)
+            if names is not None:
+                new_idx.names = names
+            else:
+                old_name = _determine_name(objs, axis)
+                if old_name is not None:
+                    new_idx.names = [None] + old_name
     else:
         new_idx = None
     new_query_compiler = objs[0].concat(
@@ -132,3 +145,35 @@ def concat(
         else:
             result_df.columns = new_idx
     return result_df
+
+
+def _determine_name(objs: Iterable["BaseQueryCompiler"], axis: Union[int, str]):
+    """
+    Determine names of index after concatenation along passed axis
+
+    Parameters
+    ----------
+    objs : iterable of QueryCompilers
+        objects to concatenate
+
+    axis : int or str
+        the axis to concatenate along
+
+    Returns
+    -------
+        `list` of the level names shared by every object along `axis`,
+        `None` if the names differ between objects or `objs` is empty
+    """
+    axis = pandas.DataFrame()._get_axis_number(axis)
+
+    def get_names(obj):
+        # `.names` is a list-like of level names; keep the scalar guard so a
+        # string-like name is never exploded into characters by `list()`
+        names = obj.columns.names if axis else obj.index.names
+        return list(names) if is_list_like(names) else [names]
+
+    # compare plain lists: `np.array` would coerce mixed-type names to
+    # strings and cannot hold name lists of different lengths
+    all_names = [get_names(obj) for obj in objs]
+    same = bool(all_names) and all(n == all_names[0] for n in all_names[1:])
+    return all_names[0] if same else None
diff --git a/modin/pandas/test/test_concat.py b/modin/pandas/test/test_concat.py
@@ -17,57 +17,11 @@
 
 import modin.pandas as pd
 from modin.pandas.utils import from_pandas
-from .utils import df_equals
+from .utils import df_equals, generate_dfs, generate_multiindex_dfs, generate_none_dfs
 
 pd.DEFAULT_NPARTITIONS = 4
 
 
-def generate_dfs():
-    df = pandas.DataFrame(
-        {
-            "col1": [0, 1, 2, 3],
-            "col2": [4, 5, 6, 7],
-            "col3": [8, 9, 10, 11],
-            "col4": [12, 13, 14, 15],
-            "col5": [0, 0, 0, 0],
-        }
-    )
-
-    df2 = pandas.DataFrame(
-        {
-            "col1": [0, 1, 2, 3],
-            "col2": [4, 5, 6, 7],
-            "col3": [8, 9, 10, 11],
-            "col6": [12, 13, 14, 15],
-            "col7": [0, 0, 0, 0],
-        }
-    )
-    return df, df2
-
-
-def generate_none_dfs():
-    df = pandas.DataFrame(
-        {
-            "col1": [0, 1, 2, 3],
-            "col2": [4, 5, None, 7],
-            "col3": [8, 9, 10, 11],
-            "col4": [12, 13, 14, 15],
-            "col5": [None, None, None, None],
-        }
-    )
-
-    df2 = pandas.DataFrame(
-        {
-            "col1": [0, 1, 2, 3],
-            "col2": [4, 5, 6, 7],
-            "col3": [8, 9, 10, 11],
-            "col6": [12, 13, 14, 15],
-            "col7": [0, 0, 0, 0],
-        }
-    )
-    return df, df2
-
-
 def test_df_concat():
     df, df2 = generate_dfs()
 
@@ -207,3 +161,21 @@ def test_concat_with_empty_frame():
         pd.concat([modin_empty_df, modin_row]),
         pandas.concat([pandas_empty_df, pandas_row]),
     )
+
+
+@pytest.mark.parametrize("axis", [0, 1])
+@pytest.mark.parametrize("names", [False, True])
+def test_concat_multiindex(axis, names):
+    """Keyed concat of the `generate_multiindex_dfs` frames must match pandas."""
+    pandas_frames = generate_multiindex_dfs(axis=axis)
+    modin_frames = [from_pandas(frame) for frame in pandas_frames]
+
+    # `names` arrives as a flag; when set, build one label per resulting level
+    # (the extra level comes from `keys`), otherwise pass `names=None`
+    level_names = None
+    if names:
+        level_count = pandas_frames[0].axes[axis].nlevels + 1
+        level_names = [str(level) for level in np.arange(level_count)]
+
+    concat_kwargs = {"keys": ["first", "second"], "axis": axis, "names": level_names}
+    df_equals(
+        pd.concat(modin_frames, **concat_kwargs),
+        pandas.concat(list(pandas_frames), **concat_kwargs),
+    )
diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py
@@ -592,3 +592,62 @@ def execute_callable(fn, md_kwargs={}, pd_kwargs={}):
 
 def create_test_dfs(*args, **kwargs):
     return pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)
+
+
+def generate_dfs():
+    """
+    Build two 4-row pandas frames for concat tests.
+
+    Both frames carry identical "col1".."col3"; the first adds "col4" and
+    "col5", the second adds "col6" and "col7" holding the same values.
+
+    Returns
+    -------
+        tuple of two pandas DataFrames
+    """
+    shared = {
+        "col1": [0, 1, 2, 3],
+        "col2": [4, 5, 6, 7],
+        "col3": [8, 9, 10, 11],
+    }
+    increasing = [12, 13, 14, 15]
+    zeros = [0, 0, 0, 0]
+
+    # dict insertion order fixes the column order of each frame
+    first = pandas.DataFrame({**shared, "col4": increasing, "col5": zeros})
+    second = pandas.DataFrame({**shared, "col6": increasing, "col7": zeros})
+    return first, second
+
+
+def generate_multiindex_dfs(axis=1):
+    """
+    Build the `generate_dfs` frames with a two-level MultiIndex on `axis`.
+
+    Parameters
+    ----------
+    axis : int, default 1
+        0 to relabel the rows, 1 to relabel the columns
+
+    Returns
+    -------
+        tuple of two pandas DataFrames whose labels along `axis` are
+        `("a", <old label>)` pairs with level names "name1" and "name2"
+    """
+
+    def generate_multiindex(index):
+        return pandas.MultiIndex.from_tuples(
+            [("a", x) for x in index.values], names=["name1", "name2"]
+        )
+
+    df1, df2 = generate_dfs()
+    for df in (df1, df2):
+        # `DataFrame.axes` hands back a freshly built list, so assigning into
+        # it never reaches the frame; set the label attribute itself instead
+        new_labels = generate_multiindex(df.axes[axis])
+        setattr(df, ("index", "columns")[axis], new_labels)
+    return df1, df2
+
+
+def generate_none_dfs():
+    """
+    Build two 4-row pandas frames where the first one contains missing values.
+
+    The first frame has a single `None` in "col2" and an all-`None` "col5";
+    the second frame has no missing values and uses "col6"/"col7" instead of
+    "col4"/"col5".
+
+    Returns
+    -------
+        tuple of two pandas DataFrames
+    """
+    with_missing = dict(
+        col1=[0, 1, 2, 3],
+        col2=[4, 5, None, 7],
+        col3=[8, 9, 10, 11],
+        col4=[12, 13, 14, 15],
+        col5=[None] * 4,
+    )
+    complete = dict(
+        col1=[0, 1, 2, 3],
+        col2=[4, 5, 6, 7],
+        col3=[8, 9, 10, 11],
+        col6=[12, 13, 14, 15],
+        col7=[0] * 4,
+    )
+    return pandas.DataFrame(with_missing), pandas.DataFrame(complete)