PERF: correctly report memory used by Index's
Author: Jeff Reback <jeff@reback.net>

Closes pandas-dev#15237 from jreback/memory and squashes the following commits:

d77c002 [Jeff Reback] PERF: correctly report memory used by Index's
jreback committed Jan 27, 2017
1 parent c67486f commit 3853fe6
Showing 8 changed files with 106 additions and 4 deletions.
37 changes: 37 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
@@ -315,6 +315,43 @@ New Behavior:
   In [5]: df['a']['2011-12-31 23:59:59']
   Out[5]: 1

.. _whatsnew_0200.api_breaking.memory_usage:

Memory Usage for Index is more Accurate
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In previous versions, ``.memory_usage()`` on a pandas structure that has an index would only include the actual index values, not the structures that facilitate fast indexing. Reported memory usage will now generally differ for ``Index`` and ``MultiIndex``, and less so for other index types. (:issue:`15237`)

Previous Behavior:

.. code-block:: ipython

   In [8]: index = Index(['foo', 'bar', 'baz'])

   In [9]: index.memory_usage(deep=True)
   Out[9]: 180

   In [10]: index.get_loc('foo')
   Out[10]: 0

   In [11]: index.memory_usage(deep=True)
   Out[11]: 180

New Behavior:

.. code-block:: ipython

   In [8]: index = Index(['foo', 'bar', 'baz'])

   In [9]: index.memory_usage(deep=True)
   Out[9]: 180

   In [10]: index.get_loc('foo')
   Out[10]: 0

   In [11]: index.memory_usage(deep=True)
   Out[11]: 260
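
The 80-byte increase after ``get_loc`` is the lazily built hashtable engine that the lookup populates. As a rough sketch (not part of the commit) of where that number comes from, assuming a 64-bit build and that khash holds the three keys in four buckets (both are implementation details that can vary):

.. code-block:: python

   # not part of the commit: a back-of-the-envelope check of the numbers above
   n_buckets = 4             # khash rounds 3 keys up to 4 buckets (assumed)
   per_bucket = 8 + 8 + 4    # PyObject* key + size_t value + uint32 flag word
   n_buckets * per_bucket    # 80 == 260 - 180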

.. _whatsnew_0200.api:

Other API Changes
1 change: 1 addition & 0 deletions pandas/core/base.py
@@ -1067,6 +1067,7 @@ def memory_usage(self, deep=False):
        v = self.values.nbytes
        if deep and is_object_dtype(self):
            v += lib.memory_usage_of_objects(self.values)

        return v

    def factorize(self, sort=False, na_sentinel=-1):
9 changes: 9 additions & 0 deletions pandas/index.pyx
@@ -203,6 +203,15 @@ cdef class IndexEngine:

        return result

    def sizeof(self, deep=False):
        """ return the sizeof our mapping """
        if not self.is_mapping_populated:
            return 0
        return self.mapping.sizeof(deep=deep)

    def __sizeof__(self):
        return self.sizeof()

    property is_unique:

        def __get__(self):
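The engine's ``sizeof`` reports 0 until the mapping has been built, which happens lazily on lookup; this is why ``memory_usage`` only grows after ``get_loc``. A minimal sketch of that behaviour (``_engine`` is a private attribute, used here purely for illustration):

    import pandas as pd

    idx = pd.Index(['foo', 'bar', 'baz'])
    print(idx._engine.sizeof())   # 0 -- the mapping is not populated yet
    idx.get_loc('foo')            # a lookup builds the hashtable lazily
    print(idx._engine.sizeof())   # now reports the table's bucket storage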
8 changes: 8 additions & 0 deletions pandas/indexes/base.py
@@ -536,6 +536,14 @@ def get_values(self):
""" return the underlying data as an ndarray """
return self.values

@Appender(IndexOpsMixin.memory_usage.__doc__)
def memory_usage(self, deep=False):
result = super(Index, self).memory_usage(deep=deep)

# include our engine hashtable
result += self._engine.sizeof(deep=deep)
return result

# ops compat
def tolist(self):
"""
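With this override, an ``Index``'s reported usage is the raw values plus whatever the engine currently holds. A small sketch of that composition for the shallow case (again leaning on the private ``_engine``):

    import pandas as pd

    idx = pd.Index(['foo', 'bar', 'baz'])
    idx.get_loc('foo')   # populate the engine so its contribution is non-zero
    assert idx.memory_usage() == idx.values.nbytes + idx._engine.sizeof()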
8 changes: 7 additions & 1 deletion pandas/indexes/multi.py
@@ -446,13 +446,19 @@ def _nbytes(self, deep=False):
        return the number of bytes in the underlying data
        deeply introspect the level data if deep=True
        include the engine hashtable
        *this is an internal routine*
        """
        level_nbytes = sum((i.memory_usage(deep=deep) for i in self.levels))
        label_nbytes = sum((i.nbytes for i in self.labels))
        names_nbytes = sum((getsizeof(i) for i in self.names))
        return level_nbytes + label_nbytes + names_nbytes
        result = level_nbytes + label_nbytes + names_nbytes

        # include our engine hashtable
        result += self._engine.sizeof(deep=deep)
        return result

    def _format_attrs(self):
        """
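For a ``MultiIndex`` the total is assembled from the levels, labels and names, with the engine's hashtable added on top. A hedged sketch of the same decomposition from user code (``labels`` and ``_engine`` follow the pandas API of this era, and ``_engine`` is private):

    from sys import getsizeof

    import pandas as pd

    mi = pd.MultiIndex.from_arrays([[1, 1, 2], ['a', 'b', 'a']])
    levels = sum(lvl.memory_usage(deep=True) for lvl in mi.levels)
    labels = sum(lab.nbytes for lab in mi.labels)
    names = sum(getsizeof(name) for name in mi.names)
    engine = mi._engine.sizeof(deep=True)     # 0 until a lookup populates it
    total = levels + labels + names + engine  # what _nbytes(deep=True) reports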
19 changes: 19 additions & 0 deletions pandas/src/hashtable_class_helper.pxi.in
@@ -203,6 +203,7 @@ cdef class ObjectVector:


cdef class HashTable:

    pass

{{py:
@@ -237,6 +238,12 @@ cdef class {{name}}HashTable(HashTable):
        k = kh_get_{{dtype}}(self.table, key)
        return k != self.table.n_buckets

    def sizeof(self, deep=False):
        """ return the size of my table in bytes """
        return self.table.n_buckets * (sizeof({{dtype}}_t) + # keys
                                       sizeof(size_t) + # vals
                                       sizeof(uint32_t)) # flags

    cpdef get_item(self, {{dtype}}_t val):
        cdef khiter_t k
        k = kh_get_{{dtype}}(self.table, val)
@@ -464,6 +471,12 @@ cdef class StringHashTable(HashTable):
        kh_destroy_str(self.table)
        self.table = NULL

    def sizeof(self, deep=False):
        """ return the size of my table in bytes """
        return self.table.n_buckets * (sizeof(char *) + # keys
                                       sizeof(size_t) + # vals
                                       sizeof(uint32_t)) # flags

    cpdef get_item(self, object val):
        cdef:
            khiter_t k
@@ -714,6 +727,12 @@ cdef class PyObjectHashTable(HashTable):
        k = kh_get_pymap(self.table, <PyObject*>key)
        return k != self.table.n_buckets

    def sizeof(self, deep=False):
        """ return the size of my table in bytes """
        return self.table.n_buckets * (sizeof(PyObject *) + # keys
                                       sizeof(size_t) + # vals
                                       sizeof(uint32_t)) # flags

    cpdef get_item(self, object val):
        cdef khiter_t k
        if val != val or val is None:
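All three specializations report the same shallow estimate: the bucket arrays only, at a fixed per-bucket cost of one key slot, one ``size_t`` value and one ``uint32_t`` of flag storage (``deep`` is accepted but unused here). A minimal sketch of the per-bucket arithmetic, assuming a 64-bit build where ``int64``, ``char *`` and ``PyObject *`` keys are all 8 bytes wide:

    import ctypes

    key = 8                                 # int64 / char * / PyObject * on 64-bit
    val = ctypes.sizeof(ctypes.c_size_t)    # 8 on a 64-bit build
    flags = ctypes.sizeof(ctypes.c_uint32)  # always 4
    print(key + val + flags)                # 20 bytes charged per khash bucket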
20 changes: 20 additions & 0 deletions pandas/tests/indexes/common.py
@@ -366,6 +366,26 @@ def test_compat(self):
        for ind in self.indices.values():
            self.assertEqual(ind.tolist(), list(ind))

    def test_memory_usage(self):
        for name, index in compat.iteritems(self.indices):
            result = index.memory_usage()
            if len(index):
                index.get_loc(index[0])
                result2 = index.memory_usage()
                result3 = index.memory_usage(deep=True)

                # RangeIndex doesn't use a hashtable engine
                if not isinstance(index, RangeIndex):
                    self.assertTrue(result2 > result)

                if index.inferred_type == 'object':
                    self.assertTrue(result3 > result2)

            else:

                # we report 0 for no-length
                self.assertEqual(result, 0)

    def test_argsort(self):
        for k, ind in self.indices.items():
8 changes: 5 additions & 3 deletions pandas/tests/test_categorical.py
@@ -1555,11 +1555,13 @@ def test_nbytes(self):

    def test_memory_usage(self):
        cat = pd.Categorical([1, 2, 3])
        self.assertEqual(cat.nbytes, cat.memory_usage())
        self.assertEqual(cat.nbytes, cat.memory_usage(deep=True))

        # .categories is an index, so we include the hashtable
        self.assertTrue(cat.nbytes > 0 and cat.nbytes <= cat.memory_usage())
        self.assertTrue(cat.nbytes > 0 and
                        cat.nbytes <= cat.memory_usage(deep=True))

        cat = pd.Categorical(['foo', 'foo', 'bar'])
        self.assertEqual(cat.nbytes, cat.memory_usage())
        self.assertTrue(cat.memory_usage(deep=True) > cat.nbytes)

        # sys.getsizeof will call the .memory_usage with
