PERF: apply perf enhancements #6024

Merged · 3 commits · Jan 21, 2014
Changes from 1 commit
PERF: perf enhancements for DataFrame.apply (GH6013)
jreback committed Jan 21, 2014
commit 9e37a7dfd977a8d54b89f8b3560a75519988a5f3
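
For context, GH6013 tracks the per-call overhead of DataFrame.apply when the passed function reduces each column or row to a scalar. A minimal sketch of the kind of workload this commit targets (the frame shape and the reducer are illustrative, not taken from the issue):

import numpy as np
import pandas as pd

# a plain float frame keeps us on the fast reduction path
df = pd.DataFrame(np.random.randn(10000, 100))

# one scalar per column; the cost is dominated by the per-chunk Series
# wrapper the reducer has to build, which is what this PR trims
col_means = df.apply(np.mean)

# same story per row
row_means = df.apply(np.mean, axis=1)
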
1 change: 1 addition & 0 deletions doc/source/release.rst
@@ -88,6 +88,7 @@ Improvements to existing features
 - perf improvments in indexing with object dtypes (:issue:`5968`)
 - improved dtype inference for ``timedelta`` like passed to constructors (:issue:`5458`,:issue:`5689`)
 - escape special characters when writing to latex (:issue: `5374`)
+- perf improvements in ``DataFrame.apply`` (:issue:`6013`)

 .. _release.bug_fixes-0.13.1:

10 changes: 5 additions & 5 deletions pandas/core/frame.py
@@ -3324,16 +3324,16 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
         if reduce:
             try:
 
-                if self._is_mixed_type: # maybe a hack for now
-                    raise AssertionError('Must be mixed type DataFrame')
-                values = self.values
-                dummy = Series(NA, index=self._get_axis(axis),
+                # can only work with numeric data in the fast path
+                numeric = self._get_numeric_data()
+                values = numeric.values
+                dummy = Series(NA, index=numeric._get_axis(axis),
                                dtype=values.dtype)
 
                 labels = self._get_agg_axis(axis)
                 result = lib.reduce(values, func, axis=axis, dummy=dummy,
                                     labels=labels)
-                return Series(result, index=self._get_agg_axis(axis))
+                return Series(result, index=labels)
             except Exception:
                 pass
 
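In words: the removed lines bailed out of the fast path for any mixed-dtype frame (the AssertionError is only there to fall through to the slow per-row loop), while the new code carves out the numeric block with the internal _get_numeric_data() and hands its ndarray to lib.reduce. A rough sketch of what that helper strips away (the column names are invented for illustration):

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'a': np.random.randn(5),
    'b': np.random.randn(5),
    'c': list('xyzxy'),              # object column, useless to the fast path
})

numeric = df._get_numeric_data()     # internal helper: just 'a' and 'b'
values = numeric.values              # a single 2-D float64 ndarray

# the public-API equivalent of what the fast path computes on that block
print(numeric.apply(np.mean))
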
11 changes: 10 additions & 1 deletion pandas/core/generic.py
@@ -78,7 +78,8 @@ class NDFrame(PandasObject):
     copy : boolean, default False
     """
     _internal_names = ['_data', '_cacher', '_item_cache', '_cache',
-                       'is_copy', '_subtyp', '_index', '_default_kind', '_default_fill_value']
+                       'is_copy', '_subtyp', '_index', '_default_kind',
+                       '_default_fill_value','__array_struct__','__array_interface__']
     _internal_names_set = set(_internal_names)
     _metadata = []
     is_copy = None
@@ -698,6 +699,14 @@ def __array_wrap__(self, result):
         d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
         return self._constructor(result, **d).__finalize__(self)
 
+    # ideally we would define this to avoid the getattr checks, but
+    # is slower
+    #@property
+    #def __array_interface__(self):
+    #    """ provide numpy array interface method """
+    #    values = self.values
+    #    return dict(typestr=values.dtype.str,shape=values.shape,data=values)
+
     def to_dense(self):
         "Return dense representation of NDFrame (as opposed to sparse)"
         # compat
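The two dunder names matter because numpy probes __array_struct__ and __array_interface__ on any object it is asked to convert, and a Series defines neither, so every probe falls through to NDFrame.__getattr__. Listing them in _internal_names lets that fallback raise AttributeError right away instead of also searching the axis labels; the commented-out property is the alternative the author tried and found slower. A rough illustration of the probing that happens each time a reducer hands a Series to numpy (the loop is mine, purely for demonstration):

import numpy as np
import pandas as pd

s = pd.Series(np.arange(5.0))

# numpy checks these interface attributes before falling back to __array__;
# each miss is routed through NDFrame.__getattr__
for attr in ('__array_struct__', '__array_interface__'):
    try:
        getattr(s, attr)
    except AttributeError:
        pass                          # with this change the miss is cheap

arr = np.asarray(s)                   # eventually uses Series.__array__
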
56 changes: 33 additions & 23 deletions pandas/src/reduce.pyx
@@ -35,25 +35,26 @@ cdef class Reducer:
             self.chunksize = k
             self.increment = k * arr.dtype.itemsize
 
+
         self.f = f
         self.arr = arr
         self.typ = None
         self.labels = labels
-        self.dummy, index = self._check_dummy(dummy)
+        self.dummy, index = self._check_dummy(dummy=dummy)
 
-        if axis == 0:
-            self.labels = index
-            self.index = labels
-        else:
-            self.labels = labels
-            self.index = index
+        self.labels = labels
+        self.index = index
 
     def _check_dummy(self, dummy=None):
         cdef object index
 
         if dummy is None:
             dummy = np.empty(self.chunksize, dtype=self.arr.dtype)
             index = None
+
+            # our ref is stolen later since we are creating this array
+            # in cython, so increment first
+            Py_INCREF(dummy)
         else:
             if dummy.dtype != self.arr.dtype:
                 raise ValueError('Dummy array must be same dtype')
@@ -76,39 +76,48 @@ cdef class Reducer:
             ndarray arr, result, chunk
             Py_ssize_t i, incr
             flatiter it
-            object res, tchunk, name, labels, index, typ
+            object res, name, labels, index
+            object cached_typ = None
 
         arr = self.arr
         chunk = self.dummy
         dummy_buf = chunk.data
         chunk.data = arr.data
         labels = self.labels
         index = self.index
-        typ = self.typ
         incr = self.increment
 
         try:
             for i in range(self.nresults):
-                # need to make sure that we pass an actual object to the function
-                # and not just an ndarray
-                if typ is not None:
-                    try:
-                        if labels is not None:
-                            name = labels[i]
 
+                if labels is not None:
+                    name = util.get_value_at(labels, i)
+                else:
+                    name = None
+
+                # create the cached type
+                # each time just reassign the data
+                if i == 0:
+
+                    if self.typ is not None:
+
                         # recreate with the index if supplied
                         if index is not None:
-                            tchunk = typ(chunk, index=index, name=name, fastpath=True)
+
+                            cached_typ = self.typ(chunk, index=index, name=name)
+
                         else:
-                            tchunk = typ(chunk, name=name)
 
-                    except:
-                        tchunk = chunk
-                        typ = None
-                else:
-                    tchunk = chunk
+                            # use the passsed typ, sans index
+                            cached_typ = self.typ(chunk, name=name)
 
-                res = self.f(tchunk)
+                # use the cached_typ if possible
+                if cached_typ is not None:
+                    cached_typ._data._block.values = chunk
+                    cached_typ.name = name
+                    res = self.f(cached_typ)
+                else:
+                    res = self.f(chunk)
 
                 if hasattr(res,'values'):
                     res = res.values
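The heart of the change: instead of constructing a brand-new Series (typ) around every chunk, get_result now builds one wrapper on the first iteration and afterwards only repoints its single block at the current chunk and updates the name (cached_typ._data._block.values = chunk). A rough pure-Python rendering of that reuse pattern, kept on public API; the helper name and the per-row copy are mine, and the Cython code avoids even the copy by moving the buffer pointer:

import numpy as np
import pandas as pd

def reduce_rows(arr, func, labels=None, index=None):
    # build the Series wrapper once, over a reusable buffer ...
    buf = np.empty(arr.shape[1], dtype=arr.dtype)
    cached = pd.Series(buf, index=index)      # shares buf's memory, no copy
    out = []
    for i in range(arr.shape[0]):
        # ... then refill the buffer instead of constructing a new Series
        buf[:] = arr[i]
        cached.name = labels[i] if labels is not None else None
        out.append(func(cached))
    return out

arr = np.random.randn(1000, 4)
print(reduce_rows(arr, np.sum, labels=np.arange(1000), index=list('abcd'))[:5])
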
1 change: 0 additions & 1 deletion pandas/tests/test_tseries.py
@@ -661,7 +661,6 @@ def test_int_index(self):
         from pandas.core.series import Series
 
         arr = np.random.randn(100, 4)
-
         result = lib.reduce(arr, np.sum, labels=Index(np.arange(4)))
         expected = arr.sum(0)
         assert_almost_equal(result, expected)