Skip to content

ENH: Support EAs in Series.unstack #23284

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 34 commits into from
Nov 7, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
ced299f
ENH: Support EAs in Series.unstack
TomAugspurger Oct 12, 2018
3b63fcb
release note
TomAugspurger Oct 22, 2018
756dde9
xfail
TomAugspurger Oct 22, 2018
90f84ef
spelling
TomAugspurger Oct 22, 2018
942db1b
lint
TomAugspurger Oct 22, 2018
36a4450
no copy
TomAugspurger Oct 23, 2018
ee330d6
Fixup decimal tests
TomAugspurger Oct 23, 2018
2fcaf4d
Merge remote-tracking branch 'upstream/master' into ea-unstack
TomAugspurger Oct 23, 2018
4f46364
Merge remote-tracking branch 'upstream/master' into ea-unstack
TomAugspurger Oct 23, 2018
e9498a1
update
TomAugspurger Oct 23, 2018
72b5a0d
handle names
TomAugspurger Oct 24, 2018
f6b2050
Merge remote-tracking branch 'upstream/master' into ea-unstack
TomAugspurger Oct 24, 2018
4d679cb
lint
TomAugspurger Oct 24, 2018
ff7aba7
handle DataFrame.unstack
TomAugspurger Oct 24, 2018
91587cb
Merge remote-tracking branch 'upstream/master' into ea-unstack
TomAugspurger Oct 24, 2018
49bdb50
handle DataFrame.unstack
TomAugspurger Oct 24, 2018
cf8ed73
handle DataFrame.unstack
TomAugspurger Oct 24, 2018
5902b5b
Slightly de-hackify
TomAugspurger Oct 24, 2018
17d3002
Merge remote-tracking branch 'upstream/master' into ea-unstack
TomAugspurger Oct 24, 2018
a75806a
docs, comments
TomAugspurger Oct 26, 2018
2397e89
Merge remote-tracking branch 'upstream/master' into ea-unstack
TomAugspurger Oct 26, 2018
8ed7c73
unxfail test
TomAugspurger Oct 26, 2018
b23234c
added benchmark
TomAugspurger Oct 26, 2018
29a6bb1
Merge remote-tracking branch 'upstream/master' into ea-unstack
TomAugspurger Oct 29, 2018
19b7cfa
fix asv
TomAugspurger Oct 29, 2018
254fe52
Merge remote-tracking branch 'upstream/master' into ea-unstack
TomAugspurger Nov 5, 2018
2d78d42
CLN: remove dead code
TomAugspurger Nov 5, 2018
a9e6263
faster asv
TomAugspurger Nov 5, 2018
ca286f7
Merge remote-tracking branch 'upstream/master' into ea-unstack
TomAugspurger Nov 6, 2018
2f28638
Merge remote-tracking branch 'upstream/master' into ea-unstack
TomAugspurger Nov 6, 2018
967c674
API: decimal nan is na
TomAugspurger Nov 6, 2018
f6aa4b9
Merge remote-tracking branch 'upstream/master' into ea-unstack
TomAugspurger Nov 6, 2018
32bc3de
Revert "API: decimal nan is na"
TomAugspurger Nov 6, 2018
56e5f2f
Fixed sparse test
TomAugspurger Nov 6, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Slightly de-hackify
  • Loading branch information
TomAugspurger committed Oct 24, 2018
commit 5902b5ba1be9e10f482d21b0e2b037b7228264f6
67 changes: 37 additions & 30 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
import copy
import functools
import warnings
import inspect
import re
Expand Down Expand Up @@ -1434,7 +1434,7 @@ def equals(self, other):
return False
return array_equivalent(self.values, other.values)

def _unstack(self, unstacker_func, new_columns):
def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
"""Return a list of unstacked blocks of self

Parameters
Expand All @@ -1443,6 +1443,10 @@ def _unstack(self, unstacker_func, new_columns):
Partially applied unstacker.
new_columns : Index
All columns of the unstacked BlockManager.
n_rows : int
Only used in ExtensionBlock.unstack
fill_value : int
Only used in ExtensionBlock.unstack

Returns
-------
Expand Down Expand Up @@ -1736,7 +1740,7 @@ def _slice(self, slicer):
def _try_cast_result(self, result, dtype=None):
return result

def _unstack(self, unstacker_func, new_columns):
def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
"""Return a list of unstacked blocks of self

Parameters
Expand All @@ -1745,6 +1749,10 @@ def _unstack(self, unstacker_func, new_columns):
Partially applied unstacker.
new_columns : Index
All columns of the unstacked BlockManager.
n_rows : int
Only used in ExtensionBlock.unstack
fill_value : int
Only used in ExtensionBlock.unstack

Returns
-------
Expand All @@ -1756,18 +1764,28 @@ def _unstack(self, unstacker_func, new_columns):
# NonConsolidatable blocks can have a single item only, so we return
# one block per item
unstacker = unstacker_func(self.values.T)
new_items = unstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = unstacker.get_new_values()

mask = mask.any(0)
new_placement, new_values, mask = self._get_unstack_items(
unstacker, new_columns
)

new_values = new_values.T[mask]
new_placement = new_placement[mask]

blocks = [self.make_block_same_class(vals, [place])
for vals, place in zip(new_values, new_placement)]
return blocks, mask

@staticmethod
def _get_unstack_items(unstacker, new_columns):
# shared with ExtensionBlock
new_items = unstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = unstacker.get_new_values()

mask = mask.any(0)
return new_placement, new_values, mask


class ExtensionBlock(NonConsolidatableMixIn, Block):
"""Block for holding extension types.
Expand Down Expand Up @@ -1955,32 +1973,21 @@ def shift(self, periods, axis=0):
def _ftype(self):
return getattr(self.values, '_pandas_ftype', Block._ftype)

def _unstack(self, unstacker_func, new_columns):
# I wonder if this is supported
fill_value = unstacker_func.keywords['fill_value']
unstacker_func = copy.deepcopy(unstacker_func)
unstacker_func.keywords['fill_value'] = -1

# just get the index. Can maybe avoid this?
dummy_unstacker = unstacker_func(np.empty((0, 0)))

dummy_arr = np.arange(len(dummy_unstacker.index))
def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
dummy_arr = np.arange(n_rows)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a docstring (or does it inherit the parent's)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The inherited one is OK. I'm going to leave a comment explaining why we override.

dummy_unstacker = functools.partial(unstacker_func, fill_value=-1)
unstacker = dummy_unstacker(dummy_arr)

unstacker = unstacker_func(dummy_arr)
new_items = unstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = unstacker.get_new_values()
mask = mask.any(0)

new_values = [
self.values.take(indices, allow_fill=True,
fill_value=fill_value)
for indices in new_values.T
]
new_placement, new_values, mask = self._get_unstack_items(
unstacker, new_columns
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't this work generically for all unstacking (e.g. what if you made this the superclass method)?

Copy link
Contributor Author

@TomAugspurger TomAugspurger Oct 26, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is slower in general. It's necessarily slower for NumPy types, since you have to do the reshaping / unstack on the ndarray of positions anyway.

The hope is that the cost of the additional take per column is offset by not converting to an ndarray[object] in the first place.

Working on benchmarks now.

)

blocks = [
self.make_block_same_class(vals, [place])
for vals, place in zip(new_values, new_placement)
self.make_block_same_class(
self.values.take(indices, allow_fill=True,
fill_value=fill_value),
[place])
for indices, place in zip(new_values.T, new_placement)
]
return blocks, mask

Expand Down
10 changes: 8 additions & 2 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1405,18 +1405,21 @@ def canonicalize(block):
return all(block.equals(oblock)
for block, oblock in zip(self_blocks, other_blocks))

def unstack(self, unstacker_func):
def unstack(self, unstacker_func, fill_value):
"""Return a blockmanager with all blocks unstacked.

Parameters
----------
unstacker_func : callable
A (partially-applied) ``pd.core.reshape._Unstacker`` class.
fill_value : Any
fill_value for newly introduced missing values.

Returns
-------
unstacked : BlockManager
"""
n_rows = self.shape[-1]
dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items)
new_columns = dummy.get_new_columns()
new_index = dummy.get_new_index()
Expand All @@ -1427,7 +1430,10 @@ def unstack(self, unstacker_func):
blocks, mask = blk._unstack(
partial(unstacker_func,
value_columns=self.items[blk.mgr_locs.indexer]),
new_columns)
new_columns,
n_rows,
fill_value
)

new_blocks.extend(blocks)
columns_mask.extend(mask)
Expand Down
10 changes: 2 additions & 8 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,17 +409,11 @@ def unstack(obj, level, fill_value=None):


def _unstack_frame(obj, level, fill_value=None):
from pandas.core.reshape.concat import concat

if (obj._is_homogeneous_type and
is_extension_array_dtype(obj.dtypes.iloc[0])):
frames = [ser.unstack(level=level, fill_value=fill_value)
for name, ser in obj.iteritems()]
return concat(frames, axis=1, keys=obj.columns)
if obj._is_mixed_type:
unstacker = partial(_Unstacker, index=obj.index,
level=level, fill_value=fill_value)
blocks = obj._data.unstack(unstacker)
blocks = obj._data.unstack(unstacker,
fill_value=fill_value)
return obj._constructor(blocks)
else:
unstacker = _Unstacker(obj.values, obj.index, level=level,
Expand Down