Skip to content

Commit b15ba30

Browse files
committed
Merge pull request pandas-dev#5417 from TomAugspurger/to-frame-multi
BUG: Panel.to_frame() with MultiIndex major axis
2 parents 5c043b9 + 77a88a8 commit b15ba30

File tree

5 files changed

+190
-11
lines changed

5 files changed

+190
-11
lines changed

doc/source/release.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,8 @@ Bug Fixes
105105
- Fixed string-representation of ``NaT`` to be "NaT" (:issue:`5708`)
106106
- Fixed string-representation for Timestamp to show nanoseconds if present (:issue:`5912`)
107107
- ``pd.match`` not returning passed sentinel
108+
- ``Panel.to_frame()`` no longer fails when ``major_axis`` is a
109+
``MultiIndex`` (:issue:`5402`).
108110

109111
pandas 0.13.0
110112
-------------

pandas/core/index.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2396,6 +2396,44 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False,
23962396
else:
23972397
return result_levels
23982398

2399+
def to_hierarchical(self, n_repeat, n_shuffle=1):
    """
    Return a MultiIndex reshaped to conform to the
    shapes given by n_repeat and n_shuffle.

    Useful to replicate and rearrange a MultiIndex for combination
    with another Index with n_repeat items.

    Parameters
    ----------
    n_repeat : int
        Number of times to repeat the labels on self
    n_shuffle : int
        Controls the reordering of the labels. If the result is going
        to be an inner level in a MultiIndex, n_shuffle will need to be
        greater than one. The size of each (repeated) label array must
        be divisible by n_shuffle.

    Returns
    -------
    MultiIndex
        Same levels and names as self, with the repeated and reordered
        labels.

    Raises
    ------
    ValueError
        Propagated from ``reshape`` when the repeated labels cannot be
        split into ``n_shuffle`` equally sized rows.

    Examples
    --------
    >>> idx = MultiIndex.from_tuples([(1, u'one'), (1, u'two'),
                                      (2, u'one'), (2, u'two')])
    >>> idx.to_hierarchical(3)
    MultiIndex(levels=[[1, 2], [u'one', u'two']],
               labels=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
                       [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]])
    """
    levels = self.levels
    # Each label is repeated in place: [a, b] -> [a, a, ..., b, b, ...]
    labels = [np.repeat(x, n_repeat) for x in self.labels]
    # Split into n_shuffle rows and read them back column by column, which
    # interleaves the rows. The order is spelled out as 'F' (column-major)
    # rather than the bare integer 1, which NumPy no longer accepts.
    labels = [x.reshape(n_shuffle, -1).ravel(order='F') for x in labels]
    names = self.names
    return MultiIndex(levels=levels, labels=labels, names=names)
2436+
23992437
@property
24002438
def is_all_dates(self):
24012439
return False

pandas/core/panel.py

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -796,7 +796,9 @@ def groupby(self, function, axis='major'):
796796

797797
def to_frame(self, filter_observations=True):
798798
"""
799-
Transform wide format into long (stacked) format as DataFrame
799+
Transform wide format into long (stacked) format as DataFrame whose
800+
columns are the Panel's items and whose index is a MultiIndex formed
801+
of the Panel's major and minor axes.
800802
801803
Parameters
802804
----------
@@ -811,6 +813,7 @@ def to_frame(self, filter_observations=True):
811813
_, N, K = self.shape
812814

813815
if filter_observations:
816+
# shaped like the return DataFrame
814817
mask = com.notnull(self.values).all(axis=0)
815818
# size = mask.sum()
816819
selector = mask.ravel()
@@ -822,19 +825,45 @@ def to_frame(self, filter_observations=True):
822825
for item in self.items:
823826
data[item] = self[item].values.ravel()[selector]
824827

825-
major_labels = np.arange(N).repeat(K)[selector]
828+
def construct_multi_parts(idx, n_repeat, n_shuffle=1):
829+
axis_idx = idx.to_hierarchical(n_repeat, n_shuffle)
830+
labels = [x[selector] for x in axis_idx.labels]
831+
levels = axis_idx.levels
832+
names = axis_idx.names
833+
return labels, levels, names
834+
835+
def construct_index_parts(idx, major=True):
836+
levels = [idx]
837+
if major:
838+
labels = [np.arange(N).repeat(K)[selector]]
839+
names = idx.name or 'major'
840+
else:
841+
labels = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)]
842+
labels = [labels.ravel()[selector]]
843+
names = idx.name or 'minor'
844+
names = [names]
845+
return labels, levels, names
846+
847+
if isinstance(self.major_axis, MultiIndex):
848+
major_labels, major_levels, major_names = construct_multi_parts(
849+
self.major_axis, n_repeat=K)
850+
else:
851+
major_labels, major_levels, major_names = construct_index_parts(
852+
self.major_axis)
826853

827-
# Anyone think of a better way to do this? np.repeat does not
828-
# do what I want
829-
minor_labels = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)]
830-
minor_labels = minor_labels.ravel()[selector]
854+
if isinstance(self.minor_axis, MultiIndex):
855+
minor_labels, minor_levels, minor_names = construct_multi_parts(
856+
self.minor_axis, n_repeat=N, n_shuffle=K)
857+
else:
858+
minor_labels, minor_levels, minor_names = construct_index_parts(
859+
self.minor_axis, major=False)
831860

832-
maj_name = self.major_axis.name or 'major'
833-
min_name = self.minor_axis.name or 'minor'
861+
levels = major_levels + minor_levels
862+
labels = major_labels + minor_labels
863+
names = major_names + minor_names
834864

835-
index = MultiIndex(levels=[self.major_axis, self.minor_axis],
836-
labels=[major_labels, minor_labels],
837-
names=[maj_name, min_name], verify_integrity=False)
865+
index = MultiIndex(levels=levels, labels=labels,
866+
names=names, verify_integrity=False)
838867

839868
return DataFrame(data, index=index, columns=self.items)
840869

pandas/tests/test_index.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1990,6 +1990,36 @@ def test_format_sparse_config(self):
19901990

19911991
warnings.filters = warn_filters
19921992

1993+
def test_to_hierarchical(self):
    # Plain repetition: every entry is repeated three times in place.
    base_tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]
    index = MultiIndex.from_tuples(base_tuples)

    repeated = index.to_hierarchical(3)
    want = MultiIndex(
        levels=[[1, 2], ['one', 'two']],
        labels=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
                [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]])
    tm.assert_index_equal(repeated, want)
    self.assertEqual(repeated.names, index.names)

    # K > 1
    shuffled = index.to_hierarchical(3, 2)
    want = MultiIndex(
        levels=[[1, 2], ['one', 'two']],
        labels=[[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
                [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]])
    tm.assert_index_equal(shuffled, want)
    self.assertEqual(shuffled.names, index.names)

    # non-sorted
    unsorted_tuples = [(2, 'c'), (1, 'b'), (2, 'a'), (2, 'b')]
    level_names = ['N1', 'N2']
    index = MultiIndex.from_tuples(unsorted_tuples, names=level_names)

    doubled = index.to_hierarchical(2)
    # Each original tuple appears twice, back to back, in the original order.
    want = MultiIndex.from_tuples(
        [tup for tup in unsorted_tuples for _ in range(2)],
        names=['N1', 'N2'])
    tm.assert_index_equal(doubled, want)
    self.assertEqual(doubled.names, index.names)
2022+
19932023
def test_bounds(self):
19942024
self.index._bounds
19952025

pandas/tests/test_panel.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1450,6 +1450,86 @@ def test_to_frame_mixed(self):
14501450
# Previously, this was mutating the underlying index and changing its name
14511451
assert_frame_equal(wp['bool'], panel['bool'], check_names=False)
14521452

1453+
def test_to_frame_multi_major(self):
    rows = [[1, 'a', 1], [2, 'b', 1], [3, 'c', 1], [4, 'd', 1]]
    col_names = ['A', 'B', 'C']
    # Long-format values: the wide rows laid end to end.
    stacked = [value for row in rows for value in row]

    # Sorted two-level major axis.
    major_keys = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]
    df = DataFrame(rows, columns=col_names,
                   index=MultiIndex.from_tuples(major_keys))
    wp = Panel({'i1': df, 'i2': df})

    # Every major key is paired with every column name, major varying slowest.
    expected_idx = MultiIndex.from_tuples(
        [key + (col,) for key in major_keys for col in col_names],
        names=[None, None, 'minor'])
    expected = DataFrame({'i1': list(stacked), 'i2': list(stacked)},
                         index=expected_idx)
    result = wp.to_frame()
    assert_frame_equal(result, expected)

    wp.iloc[0, 0].iloc[0] = np.nan  # BUG on setting. GH #5773
    result = wp.to_frame()
    assert_frame_equal(result, expected[1:])

    # Unsorted major axis that also contains a NaN key.
    major_keys = [(1, 'two'), (1, 'one'), (2, 'one'), (np.nan, 'two')]
    df = DataFrame(rows, columns=col_names,
                   index=MultiIndex.from_tuples(major_keys))
    wp = Panel({'i1': df, 'i2': df})
    ex_idx = MultiIndex.from_tuples(
        [key + (col,) for key in major_keys for col in col_names],
        names=[None, None, 'minor'])
    expected.index = ex_idx
    result = wp.to_frame()
    assert_frame_equal(result, expected)
1490+
1491+
def test_to_frame_multi_major_minor(self):
    # Both the major axis (rows) and the minor axis (columns) are MultiIndexes.
    cols = MultiIndex(levels=[['C_A', 'C_B'], ['C_1', 'C_2']],
                      labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
    major_keys = [(1, 'one'), (1, 'two'), (2, 'one'),
                  (2, 'two'), (3, 'three'), (4, 'four')]
    rows = [[1, 2, 11, 12],
            [3, 4, 13, 14],
            ['a', 'b', 'w', 'x'],
            ['c', 'd', 'y', 'z'],
            [-1, -2, -3, -4],
            [-5, -6, -7, -8]]
    df = DataFrame(rows, columns=cols,
                   index=MultiIndex.from_tuples(major_keys))
    wp = Panel({'i1': df, 'i2': df})

    # Column keys in the order that `cols` encodes them.
    minor_keys = [('C_A', 'C_1'), ('C_A', 'C_2'),
                  ('C_B', 'C_1'), ('C_B', 'C_2')]
    # Four-level index: each major key crossed with each minor key.
    exp_idx = MultiIndex.from_tuples(
        [major + minor for major in major_keys for minor in minor_keys],
        names=[None, None, None, None])
    # Both items hold the same frame, so each stacked value appears twice.
    exp_val = [[cell, cell] for row in rows for cell in row]

    result = wp.to_frame()
    expected = DataFrame(exp_val, columns=['i1', 'i2'], index=exp_idx)
    assert_frame_equal(result, expected)
1522+
1523+
def test_to_frame_multi_drop_level(self):
    # The first observation is NaN, so to_frame() filters that row out.
    major_keys = [(1, 'one'), (2, 'one'), (2, 'two')]
    df = DataFrame({'A': [np.nan, 1, 2]},
                   index=MultiIndex.from_tuples(major_keys))
    wp = Panel({'i1': df, 'i2': df})

    result = wp.to_frame()

    kept_keys = [(2, 'one', 'A'), (2, 'two', 'A')]
    exp_idx = MultiIndex.from_tuples(kept_keys,
                                     names=[None, None, 'minor'])
    expected = DataFrame({'i1': [1., 2], 'i2': [1., 2]}, index=exp_idx)
    assert_frame_equal(result, expected)
1532+
14531533
def test_to_panel_na_handling(self):
14541534
df = DataFrame(np.random.randint(0, 10, size=20).reshape((10, 2)),
14551535
index=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1],

0 commit comments

Comments
 (0)