PERF: apply perf enhancements #6024

Merged · 3 commits · Jan 21, 2014
Changes from 1 commit
PERF: perf enhancements for DataFrame.apply (GH6013)
jreback committed Jan 21, 2014
commit 9e37a7dfd977a8d54b89f8b3560a75519988a5f3
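
For context, GH6013 tracks the per-call overhead of DataFrame.apply when the passed function reduces each column or row to a scalar. A minimal sketch of the kind of workload this commit targets (the frame shape and the reducer are illustrative, not taken from the issue):

import numpy as np
import pandas as pd

# a plain float frame keeps us on the fast reduction path
df = pd.DataFrame(np.random.randn(10000, 100))

# one scalar per column; the cost is dominated by the per-chunk Series
# wrapper the reducer has to build, which is what this PR trims
col_means = df.apply(np.mean)

# same story per row
row_means = df.apply(np.mean, axis=1)
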
1 change: 1 addition & 0 deletions doc/source/release.rst
@@ -88,6 +88,7 @@ Improvements to existing features
 - perf improvments in indexing with object dtypes (:issue:`5968`)
 - improved dtype inference for ``timedelta`` like passed to constructors (:issue:`5458`,:issue:`5689`)
 - escape special characters when writing to latex (:issue: `5374`)
+- perf improvements in ``DataFrame.apply`` (:issue:`6013`)

 .. _release.bug_fixes-0.13.1:

10 changes: 5 additions & 5 deletions pandas/core/frame.py
@@ -3324,16 +3324,16 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
         if reduce:
             try:
 
-                if self._is_mixed_type: # maybe a hack for now
-                    raise AssertionError('Must be mixed type DataFrame')
-                values = self.values
-                dummy = Series(NA, index=self._get_axis(axis),
+                # can only work with numeric data in the fast path
+                numeric = self._get_numeric_data()
+                values = numeric.values
+                dummy = Series(NA, index=numeric._get_axis(axis),
                                dtype=values.dtype)
 
                 labels = self._get_agg_axis(axis)
                 result = lib.reduce(values, func, axis=axis, dummy=dummy,
                                     labels=labels)
-                return Series(result, index=self._get_agg_axis(axis))
+                return Series(result, index=labels)
             except Exception:
                 pass
 
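In words: the removed lines bailed out of the fast path for any mixed-dtype frame (the AssertionError is only there to fall through to the slow per-row loop), while the new code carves out the numeric block with the internal _get_numeric_data() and hands its ndarray to lib.reduce. A rough sketch of what that helper strips away (the column names are invented for illustration):

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'a': np.random.randn(5),
    'b': np.random.randn(5),
    'c': list('xyzxy'),              # object column, useless to the fast path
})

numeric = df._get_numeric_data()     # internal helper: just 'a' and 'b'
values = numeric.values              # a single 2-D float64 ndarray

# the public-API equivalent of what the fast path computes on that block
print(numeric.apply(np.mean))
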
11 changes: 10 additions & 1 deletion pandas/core/generic.py
@@ -78,7 +78,8 @@ class NDFrame(PandasObject):
     copy : boolean, default False
     """
     _internal_names = ['_data', '_cacher', '_item_cache', '_cache',
-                       'is_copy', '_subtyp', '_index', '_default_kind', '_default_fill_value']
+                       'is_copy', '_subtyp', '_index', '_default_kind',
+                       '_default_fill_value','__array_struct__','__array_interface__']
     _internal_names_set = set(_internal_names)
     _metadata = []
     is_copy = None
@@ -698,6 +699,14 @@ def __array_wrap__(self, result):
         d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
         return self._constructor(result, **d).__finalize__(self)
 
+    # ideally we would define this to avoid the getattr checks, but
+    # is slower
+    #@property
+    #def __array_interface__(self):
+    #    """ provide numpy array interface method """
+    #    values = self.values
+    #    return dict(typestr=values.dtype.str,shape=values.shape,data=values)
+
     def to_dense(self):
         "Return dense representation of NDFrame (as opposed to sparse)"
         # compat
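The two dunder names matter because numpy probes __array_struct__ and __array_interface__ on any object it is asked to convert, and a Series defines neither, so every probe falls through to NDFrame.__getattr__. Listing them in _internal_names lets that fallback raise AttributeError right away instead of also searching the axis labels; the commented-out property is the alternative the author tried and found slower. A rough illustration of the probing that happens each time a reducer hands a Series to numpy (the loop is mine, purely for demonstration):

import numpy as np
import pandas as pd

s = pd.Series(np.arange(5.0))

# numpy checks these interface attributes before falling back to __array__;
# each miss is routed through NDFrame.__getattr__
for attr in ('__array_struct__', '__array_interface__'):
    try:
        getattr(s, attr)
    except AttributeError:
        pass                          # with this change the miss is cheap

arr = np.asarray(s)                   # eventually uses Series.__array__
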
56 changes: 33 additions & 23 deletions pandas/src/reduce.pyx
@@ -35,25 +35,26 @@ cdef class Reducer:
             self.chunksize = k
             self.increment = k * arr.dtype.itemsize
 
+
         self.f = f
         self.arr = arr
         self.typ = None
         self.labels = labels
-        self.dummy, index = self._check_dummy(dummy)
+        self.dummy, index = self._check_dummy(dummy=dummy)
 
-        if axis == 0:
-            self.labels = index
-            self.index = labels
-        else:
-            self.labels = labels
-            self.index = index
+        self.labels = labels
+        self.index = index
 
     def _check_dummy(self, dummy=None):
         cdef object index
 
         if dummy is None:
             dummy = np.empty(self.chunksize, dtype=self.arr.dtype)
             index = None
+
+            # our ref is stolen later since we are creating this array
+            # in cython, so increment first
+            Py_INCREF(dummy)
         else:
             if dummy.dtype != self.arr.dtype:
                 raise ValueError('Dummy array must be same dtype')
@@ -76,39 +76,48 @@ cdef class Reducer:
             ndarray arr, result, chunk
             Py_ssize_t i, incr
             flatiter it
-            object res, tchunk, name, labels, index, typ
+            object res, name, labels, index
+            object cached_typ = None
 
         arr = self.arr
         chunk = self.dummy
         dummy_buf = chunk.data
         chunk.data = arr.data
         labels = self.labels
         index = self.index
-        typ = self.typ
         incr = self.increment
 
         try:
             for i in range(self.nresults):
-                # need to make sure that we pass an actual object to the function
-                # and not just an ndarray
-                if typ is not None:
-                    try:
-                        if labels is not None:
-                            name = labels[i]
 
+                if labels is not None:
+                    name = util.get_value_at(labels, i)
+                else:
+                    name = None
+
+                # create the cached type
+                # each time just reassign the data
+                if i == 0:
+
+                    if self.typ is not None:
+
                         # recreate with the index if supplied
                         if index is not None:
-                            tchunk = typ(chunk, index=index, name=name, fastpath=True)
+
+                            cached_typ = self.typ(chunk, index=index, name=name)
+
                         else:
-                            tchunk = typ(chunk, name=name)
 
-                    except:
-                        tchunk = chunk
-                        typ = None
-                else:
-                    tchunk = chunk
+                            # use the passsed typ, sans index
+                            cached_typ = self.typ(chunk, name=name)
 
-                res = self.f(tchunk)
+                # use the cached_typ if possible
+                if cached_typ is not None:
+                    cached_typ._data._block.values = chunk
+                    cached_typ.name = name
+                    res = self.f(cached_typ)
+                else:
+                    res = self.f(chunk)
 
                 if hasattr(res,'values'):
                     res = res.values
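The heart of the change: instead of constructing a brand-new Series (typ) around every chunk, get_result now builds one wrapper on the first iteration and afterwards only repoints its single block at the current chunk and updates the name (cached_typ._data._block.values = chunk). A rough pure-Python rendering of that reuse pattern, kept on public API; the helper name and the per-row copy are mine, and the Cython code avoids even the copy by moving the buffer pointer:

import numpy as np
import pandas as pd

def reduce_rows(arr, func, labels=None, index=None):
    # build the Series wrapper once, over a reusable buffer ...
    buf = np.empty(arr.shape[1], dtype=arr.dtype)
    cached = pd.Series(buf, index=index)      # shares buf's memory, no copy
    out = []
    for i in range(arr.shape[0]):
        # ... then refill the buffer instead of constructing a new Series
        buf[:] = arr[i]
        cached.name = labels[i] if labels is not None else None
        out.append(func(cached))
    return out

arr = np.random.randn(1000, 4)
print(reduce_rows(arr, np.sum, labels=np.arange(1000), index=list('abcd'))[:5])
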
1 change: 0 additions & 1 deletion pandas/tests/test_tseries.py
@@ -661,7 +661,6 @@ def test_int_index(self):
         from pandas.core.series import Series
 
         arr = np.random.randn(100, 4)
-
         result = lib.reduce(arr, np.sum, labels=Index(np.arange(4)))
         expected = arr.sum(0)
         assert_almost_equal(result, expected)