Skip to content

Commit

Permalink
[CLN] Dispatch (some) Frame ops to Series, avoiding _data.eval (#22019)
Browse files Browse the repository at this point in the history
* avoid casting to object dtype in mixed-type frames

* Dispatch to Series ops in _combine_match_columns

* comment

* docstring

* flake8 fixup

* dont bother with try_cast_result

* revert non-central change

* simplify

* revert try_cast_results

* revert non-central changes

* Fixup typo syntaxerror

* simplify assertion

* use dispatch_to_series in combine_match_columns

* Pass unwrapped op where appropriate

* catch correct error

* whatsnew note

* comment

* whatsnew section

* remove unnecessary tester

* doc fixup
  • Loading branch information
jbrockmendel authored Oct 3, 2018
1 parent 3e3256b commit 15d32bb
Show file tree
Hide file tree
Showing 7 changed files with 70 additions and 37 deletions.
29 changes: 29 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,35 @@ Current Behavior:
...
OverflowError: Trying to coerce negative values to unsigned integers

.. _whatsnew_0240.api.crosstab_dtypes

Crosstab Preserves Dtypes
^^^^^^^^^^^^^^^^^^^^^^^^^

:func:`crosstab` will preserve now dtypes in some cases that previously would
cast from integer dtype to floating dtype (:issue:`22019`)

Previous Behavior:

.. code-block:: ipython

In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
...: 'c': [1, 1, np.nan, 1, 1]})
In [4]: pd.crosstab(df.a, df.b, normalize='columns')
Out[4]:
b 3 4
a
1 0.5 0.0
2 0.5 1.0

Current Behavior:

.. code-block:: ipython

In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
...: 'c': [1, 1, np.nan, 1, 1]})
In [4]: pd.crosstab(df.a, df.b, normalize='columns')

Datetimelike API Changes
^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
7 changes: 1 addition & 6 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4899,7 +4899,6 @@ def _arith_op(left, right):
copy=False)

def _combine_match_index(self, other, func, level=None):
assert isinstance(other, Series)
left, right = self.align(other, join='outer', axis=0, level=level,
copy=False)
assert left.index.equals(right.index)
Expand All @@ -4919,11 +4918,7 @@ def _combine_match_columns(self, other, func, level=None, try_cast=True):
left, right = self.align(other, join='outer', axis=1, level=level,
copy=False)
assert left.columns.equals(right.index)

new_data = left._data.eval(func=func, other=right,
axes=[left.columns, self.index],
try_cast=try_cast)
return self._constructor(new_data)
return ops.dispatch_to_series(left, right, func, axis="columns")

def _combine_const(self, other, func, errors='raise', try_cast=True):
if lib.is_scalar(other) or np.ndim(other) == 0:
Expand Down
17 changes: 15 additions & 2 deletions pandas/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1666,7 +1666,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0):
# -----------------------------------------------------------------------------
# DataFrame

def dispatch_to_series(left, right, func, str_rep=None):
def dispatch_to_series(left, right, func, str_rep=None, axis=None):
"""
Evaluate the frame operation func(left, right) by evaluating
column-by-column, dispatching to the Series implementation.
Expand All @@ -1677,6 +1677,7 @@ def dispatch_to_series(left, right, func, str_rep=None):
right : scalar or DataFrame
func : arithmetic or comparison operator
str_rep : str or None, default None
axis : {None, 0, 1, "index", "columns"}
Returns
-------
Expand All @@ -1700,6 +1701,15 @@ def column_op(a, b):
return {i: func(a.iloc[:, i], b.iloc[:, i])
for i in range(len(a.columns))}

elif isinstance(right, ABCSeries) and axis == "columns":
# We only get here if called via left._combine_match_columns,
# in which case we specifically want to operate row-by-row
assert right.index.equals(left.columns)

def column_op(a, b):
return {i: func(a.iloc[:, i], b.iloc[i])
for i in range(len(a.columns))}

elif isinstance(right, ABCSeries):
assert right.index.equals(left.index) # Handle other cases later

Expand Down Expand Up @@ -1844,7 +1854,10 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
pass_op = op if should_series_dispatch(self, other, op) else na_op
return self._combine_frame(other, pass_op, fill_value, level)
elif isinstance(other, ABCSeries):
return _combine_series_frame(self, other, na_op,
# For these values of `axis`, we end up dispatching to Series op,
# so do not want the masked op.
pass_op = op if axis in [0, "columns", None] else na_op
return _combine_series_frame(self, other, pass_op,
fill_value=fill_value, axis=axis,
level=level, try_cast=True)
else:
Expand Down
34 changes: 14 additions & 20 deletions pandas/tests/arithmetic/test_timedelta64.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,33 +505,25 @@ def test_tdi_add_dt64_array(self, box_df_broadcast_failure):
# ------------------------------------------------------------------
# Operations with int-like others

def test_td64arr_add_int_series_invalid(self, box_df_broadcast_failure,
tdser):
box = box_df_broadcast_failure
def test_td64arr_add_int_series_invalid(self, box, tdser):
tdser = tm.box_expected(tdser, box)
err = TypeError if box is not pd.Index else NullFrequencyError
with pytest.raises(err):
tdser + Series([2, 3, 4])

def test_td64arr_radd_int_series_invalid(self, box_df_broadcast_failure,
tdser):
box = box_df_broadcast_failure
def test_td64arr_radd_int_series_invalid(self, box, tdser):
tdser = tm.box_expected(tdser, box)
err = TypeError if box is not pd.Index else NullFrequencyError
with pytest.raises(err):
Series([2, 3, 4]) + tdser

def test_td64arr_sub_int_series_invalid(self, box_df_broadcast_failure,
tdser):
box = box_df_broadcast_failure
def test_td64arr_sub_int_series_invalid(self, box, tdser):
tdser = tm.box_expected(tdser, box)
err = TypeError if box is not pd.Index else NullFrequencyError
with pytest.raises(err):
tdser - Series([2, 3, 4])

def test_td64arr_rsub_int_series_invalid(self, box_df_broadcast_failure,
tdser):
box = box_df_broadcast_failure
def test_td64arr_rsub_int_series_invalid(self, box, tdser):
tdser = tm.box_expected(tdser, box)
err = TypeError if box is not pd.Index else NullFrequencyError
with pytest.raises(err):
Expand Down Expand Up @@ -605,9 +597,10 @@ def test_td64arr_add_sub_numeric_scalar_invalid(self, box, scalar, tdser):
Series([1, 2, 3])
# TODO: Add DataFrame in here?
], ids=lambda x: type(x).__name__)
def test_td64arr_add_sub_numeric_arr_invalid(
self, box_df_broadcast_failure, vec, dtype, tdser):
box = box_df_broadcast_failure
def test_td64arr_add_sub_numeric_arr_invalid(self, box, vec, dtype, tdser):
if box is pd.DataFrame and not isinstance(vec, Series):
raise pytest.xfail(reason="Tries to broadcast incorrectly")

tdser = tm.box_expected(tdser, box)
err = TypeError
if box is pd.Index and not dtype.startswith('float'):
Expand Down Expand Up @@ -930,9 +923,9 @@ def test_td64arr_sub_offset_array(self, box_df_broadcast_failure):
@pytest.mark.parametrize('names', [(None, None, None),
('foo', 'bar', None),
('foo', 'foo', 'foo')])
def test_td64arr_with_offset_series(self, names, box_df_broadcast_failure):
def test_td64arr_with_offset_series(self, names, box_df_fail):
# GH#18849
box = box_df_broadcast_failure
box = box_df_fail
box2 = Series if box is pd.Index else box

tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'],
Expand Down Expand Up @@ -963,10 +956,11 @@ def test_td64arr_with_offset_series(self, names, box_df_broadcast_failure):
tm.assert_equal(res3, expected_sub)

@pytest.mark.parametrize('obox', [np.array, pd.Index, pd.Series])
def test_td64arr_addsub_anchored_offset_arraylike(
self, obox, box_df_broadcast_failure):
def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box):
# GH#18824
box = box_df_broadcast_failure
if box is pd.DataFrame and obox is not pd.Series:
raise pytest.xfail(reason="Attempts to broadcast incorrectly")

tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'])
tdi = tm.box_expected(tdi, box)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_axis_select_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,7 @@ def test_align_int_fill_bug(self):

result = df1 - df1.mean()
expected = df2 - df2.mean()
assert_frame_equal(result, expected)
assert_frame_equal(result.astype('f8'), expected)

def test_align_multiindex(self):
# GH 10665
Expand Down
8 changes: 5 additions & 3 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -1566,8 +1566,9 @@ def test_crosstab_normalize(self):
full_normal)
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index'),
row_normal)
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns'),
col_normal)
tm.assert_frame_equal(
pd.crosstab(df.a, df.b, normalize='columns').astype('f8'),
col_normal)
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=1),
pd.crosstab(df.a, df.b, normalize='columns'))
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=0),
Expand Down Expand Up @@ -1600,7 +1601,8 @@ def test_crosstab_normalize(self):
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index',
margins=True), row_normal_margins)
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns',
margins=True), col_normal_margins)
margins=True).astype('f8'),
col_normal_margins)
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True,
margins=True), all_normal_margins)

Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/series/test_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -758,9 +758,6 @@ def test_operators_bitwise(self):
def test_scalar_na_cmp_corners(self):
s = Series([2, 3, 4, 5, 6, 7, 8, 9, 10])

def tester(a, b):
return a & b

with pytest.raises(TypeError):
s & datetime(2005, 1, 1)

Expand All @@ -780,8 +777,11 @@ def tester(a, b):
# this is an alignment issue; these are equivalent
# https://github.com/pandas-dev/pandas/issues/5284

pytest.raises(ValueError, lambda: d.__and__(s, axis='columns'))
pytest.raises(ValueError, tester, s, d)
with pytest.raises(TypeError):
d.__and__(s, axis='columns')

with pytest.raises(TypeError):
s & d

# this is wrong as its not a boolean result
# result = d.__and__(s,axis='index')
Expand Down

0 comments on commit 15d32bb

Please sign in to comment.