Skip to content

Commit

Permalink
REF/INT: concat blocks of same type with preserving block type (panda…
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche authored and jreback committed Oct 12, 2017
1 parent 7e159ae commit eac4d3f
Show file tree
Hide file tree
Showing 4 changed files with 152 additions and 23 deletions.
10 changes: 5 additions & 5 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,12 @@ def get_dtype_kinds(l):
return typs


def _get_series_result_type(result):
def _get_series_result_type(result, objs=None):
"""
return appropriate class of Series concat
input is either dict or array-like
"""
# concat Series with axis 1
if isinstance(result, dict):
# concat Series with axis 1
if all(is_sparse(c) for c in compat.itervalues(result)):
Expand All @@ -77,13 +78,12 @@ def _get_series_result_type(result):
from pandas.core.frame import DataFrame
return DataFrame

elif is_sparse(result):
# concat Series with axis 1
# otherwise it is a SingleBlockManager (axis = 0)
if result._block.is_sparse:
from pandas.core.sparse.api import SparseSeries
return SparseSeries
else:
from pandas.core.series import Series
return Series
return objs[0]._constructor


def _get_frame_result_type(result, objs):
Expand Down
108 changes: 105 additions & 3 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ class Block(PandasObject):
_validate_ndim = True
_ftype = 'dense'
_holder = None
_concatenator = staticmethod(np.concatenate)

def __init__(self, values, placement, ndim=None, fastpath=False):
if ndim is None:
Expand Down Expand Up @@ -314,6 +315,15 @@ def ftype(self):
def merge(self, other):
return _merge_blocks([self, other])

def concat_same_type(self, to_concat, placement=None):
    """
    Concatenate list of single blocks of the same type.

    Parameters
    ----------
    to_concat : list of blocks
        Blocks whose ``values`` are joined, in the given order, along
        axis ``self.ndim - 1`` using ``self._concatenator``.
    placement : optional
        Placement passed on to ``make_block_same_class``. When None,
        ``slice(0, len(values), 1)`` is used.

    Returns
    -------
    The block returned by ``self.make_block_same_class``.
    """
    values = self._concatenator([blk.values for blk in to_concat],
                                axis=self.ndim - 1)
    # Compare against None explicitly: ``placement or ...`` raises for a
    # multi-element ndarray (ambiguous truth value) and would silently
    # replace any falsy-but-valid placement with the default.
    if placement is None:
        placement = slice(0, len(values), 1)
    return self.make_block_same_class(values, placement=placement)

def reindex_axis(self, indexer, method=None, axis=1, fill_value=None,
limit=None, mask_info=None):
"""
Expand Down Expand Up @@ -2309,6 +2319,7 @@ class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock):
_verify_integrity = True
_can_hold_na = True
_holder = Categorical
_concatenator = staticmethod(_concat._concat_categorical)

def __init__(self, values, placement, fastpath=False, **kwargs):

Expand Down Expand Up @@ -2432,6 +2443,17 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
# we are expected to return a 2-d ndarray
return values.reshape(1, len(values))

def concat_same_type(self, to_concat, placement=None):
    """
    Concatenate list of single blocks of the same type.

    Parameters
    ----------
    to_concat : list of blocks
        Blocks whose ``values`` are joined, in the given order, along
        axis ``self.ndim - 1`` using ``self._concatenator``.
    placement : optional
        Placement passed on to ``make_block``. When None,
        ``slice(0, len(values), 1)`` is used.

    Returns
    -------
    The block returned by ``make_block`` for the concatenated values.
    """
    values = self._concatenator([blk.values for blk in to_concat],
                                axis=self.ndim - 1)
    # Compare against None explicitly: ``placement or ...`` raises for a
    # multi-element ndarray (ambiguous truth value) and would silently
    # replace any falsy-but-valid placement with the default.
    if placement is None:
        placement = slice(0, len(values), 1)
    # not using self.make_block_same_class as values can be object dtype
    return make_block(values, placement=placement, ndim=self.ndim)


class DatetimeBlock(DatetimeLikeBlockMixin, Block):
__slots__ = ()
Expand Down Expand Up @@ -2571,6 +2593,7 @@ class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock):
""" implement a datetime64 block with a tz attribute """
__slots__ = ()
_holder = DatetimeIndex
_concatenator = staticmethod(_concat._concat_datetime)
is_datetimetz = True

def __init__(self, values, placement, ndim=2, **kwargs):
Expand Down Expand Up @@ -2711,6 +2734,16 @@ def shift(self, periods, axis=0, mgr=None):
return [self.make_block_same_class(new_values,
placement=self.mgr_locs)]

def concat_same_type(self, to_concat, placement=None):
    """
    Concatenate list of single blocks of the same type.

    Parameters
    ----------
    to_concat : list of blocks
        Blocks whose ``values`` are joined, in the given order, along
        axis ``self.ndim - 1`` using ``self._concatenator``.
    placement : optional
        Placement passed on to ``make_block``. When None,
        ``slice(0, len(values), 1)`` is used.

    Returns
    -------
    The block returned by ``make_block`` for the concatenated values.
    """
    values = self._concatenator([blk.values for blk in to_concat],
                                axis=self.ndim - 1)
    # Compare against None explicitly: ``placement or ...`` raises for a
    # multi-element ndarray (ambiguous truth value) and would silently
    # replace any falsy-but-valid placement with the default.
    if placement is None:
        placement = slice(0, len(values), 1)
    # not using self.make_block_same_class as values can be non-tz dtype
    return make_block(values, placement=placement)


class SparseBlock(NonConsolidatableMixIn, Block):
""" implement as a list of sparse arrays of the same dtype """
Expand All @@ -2721,6 +2754,7 @@ class SparseBlock(NonConsolidatableMixIn, Block):
_can_hold_na = True
_ftype = 'sparse'
_holder = SparseArray
_concatenator = staticmethod(_concat._concat_sparse)

@property
def shape(self):
Expand Down Expand Up @@ -4517,6 +4551,45 @@ def fast_xs(self, loc):
"""
return self._block.values[loc]

def concat(self, to_concat, new_axis):
    """
    Combine several SingleBlockManagers into one SingleBlockManager.

    Used for pd.concat of Series objects with axis=0.

    Parameters
    ----------
    to_concat : list of SingleBlockManagers
    new_axis : Index of the result

    Returns
    -------
    SingleBlockManager
    """
    # empty managers are ignored as long as at least one is non-empty
    filled_blocks = [mgr.blocks[0] for mgr in to_concat if len(mgr) > 0]

    if filled_blocks:
        head = filled_blocks[0]
        head_type = type(head)
        if all(type(blk) is head_type for blk in filled_blocks[1:]):
            # every block has exactly the same type: let the block
            # concatenate itself so that the block type is preserved
            return SingleBlockManager(
                head.concat_same_type(filled_blocks), new_axis)
        arrays = [blk.values for blk in filled_blocks]
    else:
        # everything is empty: fall back to the values of all inputs
        arrays = [mgr._block.values for mgr in to_concat]

    # generic path for mixed block types or all-empty input
    merged = _concat._concat_compat(arrays)
    generic_block = make_block(merged, placement=slice(0, len(merged), 1))
    return SingleBlockManager(generic_block, new_axis)


def construction_error(tot_items, block_shape, axes, e=None):
""" raise a helpful message about our construction """
Expand Down Expand Up @@ -5105,13 +5178,42 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
[get_mgr_concatenation_plan(mgr, indexers)
for mgr, indexers in mgrs_indexers], concat_axis)

blocks = [make_block(
concatenate_join_units(join_units, concat_axis, copy=copy),
placement=placement) for placement, join_units in concat_plan]
blocks = []

for placement, join_units in concat_plan:

if is_uniform_join_units(join_units):
b = join_units[0].block.concat_same_type(
[ju.block for ju in join_units], placement=placement)
else:
b = make_block(
concatenate_join_units(join_units, concat_axis, copy=copy),
placement=placement)
blocks.append(b)

return BlockManager(blocks, axes)


def is_uniform_join_units(join_units):
    """
    Tell whether the join units hold blocks of one uniform type, so that
    Block.concat_same_type can be used instead of the generic
    concatenate_join_units (which uses `_concat._concat_compat`).
    """
    # only use this path when there is something to concatenate
    if len(join_units) <= 1:
        return False

    expected_type = type(join_units[0].block)
    for unit in join_units:
        # all blocks need to have the same type
        if type(unit.block) is not expected_type:
            return False
        # no blocks that would get missing values (can lead to type upcasts)
        if unit.is_na:
            return False
        # no blocks with indexers (as then the dimensions do not fit)
        if unit.indexers:
            return False
        # disregard Panels
        if not unit.block.ndim <= 2:
            return False
    return True


def get_empty_dtype_and_na(join_units):
"""
Return dtype and N/A values to use when concatenating specified units.
Expand Down
16 changes: 4 additions & 12 deletions pandas/core/reshape/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,20 +362,12 @@ def get_result(self):

# stack blocks
if self.axis == 0:
# concat Series with length to keep dtype as much
non_empties = [x for x in self.objs if len(x) > 0]
if len(non_empties) > 0:
values = [x._values for x in non_empties]
else:
values = [x._values for x in self.objs]
new_data = _concat._concat_compat(values)

name = com._consensus_name_attr(self.objs)
cons = _concat._get_series_result_type(new_data)

return (cons(new_data, index=self.new_axes[0],
name=name, dtype=new_data.dtype)
.__finalize__(self, method='concat'))
mgr = self.objs[0]._data.concat([x._data for x in self.objs],
self.new_axes)
cons = _concat._get_series_result_type(mgr, self.objs)
return cons(mgr, name=name).__finalize__(self, method='concat')

# combine as columns in a frame
else:
Expand Down
41 changes: 38 additions & 3 deletions pandas/tests/internals/test_external_block.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,26 @@
import numpy as np

import pandas as pd
from pandas.core.internals import Block, BlockManager, SingleBlockManager
from pandas.core.internals import (
Block, BlockManager, SingleBlockManager, NonConsolidatableMixIn)


class CustomBlock(Block):
class CustomBlock(NonConsolidatableMixIn, Block):

_holder = np.ndarray

def formatting_values(self):
    """Return the block values rendered as ``Val: <value>`` strings."""
    template = "Val: {}"
    rendered = [template.format(item) for item in self.values]
    return np.array(rendered)

def concat_same_type(self, to_concat, placement=None):
    """
    Always concatenate disregarding self.ndim as the values are
    always 1D in this custom Block

    Parameters
    ----------
    to_concat : list of blocks
        Blocks whose ``values`` are joined, in the given order, with
        ``np.concatenate`` along its default axis.
    placement : optional
        Placement passed on to ``make_block_same_class``. When None,
        ``slice(0, len(values), 1)`` is used.
    """
    values = np.concatenate([blk.values for blk in to_concat])
    # Compare against None explicitly: ``placement or ...`` raises for a
    # multi-element ndarray (ambiguous truth value) and would silently
    # replace any falsy-but-valid placement with the default.
    if placement is None:
        placement = slice(0, len(values), 1)
    return self.make_block_same_class(values, placement=placement)


def test_custom_repr():
values = np.arange(3, dtype='int64')
Expand All @@ -23,7 +35,30 @@ def test_custom_repr():
assert repr(s) == '0 Val: 0\n1 Val: 1\n2 Val: 2\ndtype: int64'

# dataframe
block = CustomBlock(values.reshape(1, -1), placement=slice(0, 1))
block = CustomBlock(values, placement=slice(0, 1))
blk_mgr = BlockManager([block], [['col'], range(3)])
df = pd.DataFrame(blk_mgr)
assert repr(df) == ' col\n0 Val: 0\n1 Val: 1\n2 Val: 2'


def test_concat_series():
    # GH17728
    # concatenating Series backed by a CustomBlock must keep that block type
    data = np.arange(3, dtype='int64')
    custom = CustomBlock(data, placement=slice(0, 3))
    ser = pd.Series(custom, pd.RangeIndex(3), fastpath=True)

    result = pd.concat([ser, ser])
    first_block = result._data.blocks[0]
    assert isinstance(first_block, CustomBlock)


def test_concat_dataframe():
    # GH17728
    # a frame holding a regular block plus a CustomBlock must still hold
    # a CustomBlock in second position after concatenation
    base = pd.DataFrame({'a': [1, 2, 3]})
    regular_blocks = base._data.blocks
    custom_block = CustomBlock(np.arange(3, dtype='int64'),
                               placement=slice(1, 2))
    axes = [pd.Index(['a', 'b']), base.index]
    manager = BlockManager(regular_blocks + (custom_block, ), axes)
    frame = pd.DataFrame(manager)

    result = pd.concat([frame, frame])
    assert isinstance(result._data.blocks[1], CustomBlock)

0 comments on commit eac4d3f

Please sign in to comment.