Skip to content

Commit cad683b

Browse files
committed
PERF: msgpack encoding changes to use to/from string for speed boosts
API: disable sparse structure encodings and unicode indexes
1 parent da07446 commit cad683b

File tree

4 files changed

+105
-138
lines changed

4 files changed

+105
-138
lines changed

pandas/core/common.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1871,8 +1871,12 @@ def _asarray_tuplesafe(values, dtype=None):
18711871
else:
18721872
# Making a 1D array that safely contains tuples is a bit tricky
18731873
# in numpy, leading to the following
1874-
result = np.empty(len(values), dtype=object)
1875-
result[:] = values
1874+
try:
1875+
result = np.empty(len(values), dtype=object)
1876+
result[:] = values
1877+
except (ValueError):
1878+
# we have a list-of-list
1879+
result[:] = [ tuple(x) for x in values ]
18761880

18771881
return result
18781882

pandas/io/packers.py

Lines changed: 62 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545

4646
import numpy as np
4747
from pandas import compat
48-
from pandas.compat import u
48+
from pandas.compat import u, PY3
4949
from pandas import (
5050
Timestamp, Period, Series, DataFrame, Panel, Panel4D,
5151
Index, MultiIndex, Int64Index, PeriodIndex, DatetimeIndex, Float64Index, NaT
@@ -168,6 +168,10 @@ def convert(values):
168168
values = values.view('i8')
169169
v = values.ravel()
170170

171+
# convert object
172+
if dtype == np.object_:
173+
return v.tolist()
174+
171175
if compressor == 'zlib':
172176

173177
# return string arrays like they are
@@ -189,12 +193,7 @@ def convert(values):
189193
return blosc.compress(v, typesize=dtype.itemsize)
190194

191195
# ndarray (on original dtype)
192-
if dtype == 'float64' or dtype == 'int64':
193-
return v
194-
195-
# as a list
196-
return v.tolist()
197-
196+
return v.tostring()
198197

199198
def unconvert(values, dtype, compress=None):
200199

@@ -216,9 +215,8 @@ def unconvert(values, dtype, compress=None):
216215

217216
return np.frombuffer(values, dtype=dtype)
218217

219-
# as a list
220-
return np.array(values, dtype=dtype)
221-
218+
# from a string
219+
return np.fromstring(values.encode('latin1'),dtype=dtype)
222220

223221
def encode(obj):
224222
"""
@@ -253,19 +251,20 @@ def encode(obj):
253251
'klass': obj.__class__.__name__,
254252
'name': getattr(obj, 'name', None),
255253
'dtype': obj.dtype.num,
256-
'data': obj.tolist()}
254+
'data': convert(obj.values)}
257255
elif isinstance(obj, Series):
258256
if isinstance(obj, SparseSeries):
259-
d = {'typ': 'sparse_series',
260-
'klass': obj.__class__.__name__,
261-
'dtype': obj.dtype.num,
262-
'index': obj.index,
263-
'sp_index': obj.sp_index,
264-
'sp_values': convert(obj.sp_values),
265-
'compress': compressor}
266-
for f in ['name', 'fill_value', 'kind']:
267-
d[f] = getattr(obj, f, None)
268-
return d
257+
raise NotImplementedError("msgpack sparse series is not implemented")
258+
#d = {'typ': 'sparse_series',
259+
# 'klass': obj.__class__.__name__,
260+
# 'dtype': obj.dtype.num,
261+
# 'index': obj.index,
262+
# 'sp_index': obj.sp_index,
263+
# 'sp_values': convert(obj.sp_values),
264+
# 'compress': compressor}
265+
#for f in ['name', 'fill_value', 'kind']:
266+
# d[f] = getattr(obj, f, None)
267+
#return d
269268
else:
270269
return {'typ': 'series',
271270
'klass': obj.__class__.__name__,
@@ -276,23 +275,25 @@ def encode(obj):
276275
'compress': compressor}
277276
elif issubclass(tobj, NDFrame):
278277
if isinstance(obj, SparseDataFrame):
279-
d = {'typ': 'sparse_dataframe',
280-
'klass': obj.__class__.__name__,
281-
'columns': obj.columns}
282-
for f in ['default_fill_value', 'default_kind']:
283-
d[f] = getattr(obj, f, None)
284-
d['data'] = dict([(name, ss)
285-
for name, ss in compat.iteritems(obj)])
286-
return d
278+
raise NotImplementedError("msgpack sparse frame is not implemented")
279+
#d = {'typ': 'sparse_dataframe',
280+
# 'klass': obj.__class__.__name__,
281+
# 'columns': obj.columns}
282+
#for f in ['default_fill_value', 'default_kind']:
283+
# d[f] = getattr(obj, f, None)
284+
#d['data'] = dict([(name, ss)
285+
# for name, ss in compat.iteritems(obj)])
286+
#return d
287287
elif isinstance(obj, SparsePanel):
288-
d = {'typ': 'sparse_panel',
289-
'klass': obj.__class__.__name__,
290-
'items': obj.items}
291-
for f in ['default_fill_value', 'default_kind']:
292-
d[f] = getattr(obj, f, None)
293-
d['data'] = dict([(name, df)
294-
for name, df in compat.iteritems(obj)])
295-
return d
288+
raise NotImplementedError("msgpack sparse frame is not implemented")
289+
#d = {'typ': 'sparse_panel',
290+
# 'klass': obj.__class__.__name__,
291+
# 'items': obj.items}
292+
#for f in ['default_fill_value', 'default_kind']:
293+
# d[f] = getattr(obj, f, None)
294+
#d['data'] = dict([(name, df)
295+
# for name, df in compat.iteritems(obj)])
296+
#return d
296297
else:
297298

298299
data = obj._data
@@ -354,7 +355,7 @@ def encode(obj):
354355
'klass': obj.__class__.__name__,
355356
'indices': obj.indices,
356357
'length': obj.length}
357-
elif isinstance(obj, np.ndarray) and obj.dtype not in ['float64', 'int64']:
358+
elif isinstance(obj, np.ndarray):
358359
return {'typ': 'ndarray',
359360
'shape': obj.shape,
360361
'ndim': obj.ndim,
@@ -394,14 +395,18 @@ def decode(obj):
394395
return Period(ordinal=obj['ordinal'], freq=obj['freq'])
395396
elif typ == 'index':
396397
dtype = dtype_for(obj['dtype'])
397-
data = obj['data']
398+
data = unconvert(obj['data'], np.typeDict[obj['dtype']], obj.get('compress'))
398399
return globals()[obj['klass']](data, dtype=dtype, name=obj['name'])
399400
elif typ == 'multi_index':
400-
return globals()[obj['klass']].from_tuples(obj['data'], names=obj['names'])
401+
data = unconvert(obj['data'], np.typeDict[obj['dtype']], obj.get('compress'))
402+
data = [ tuple(x) for x in data ]
403+
return globals()[obj['klass']].from_tuples(data, names=obj['names'])
401404
elif typ == 'period_index':
402-
return globals()[obj['klass']](obj['data'], name=obj['name'], freq=obj['freq'])
405+
data = unconvert(obj['data'], np.int64, obj.get('compress'))
406+
return globals()[obj['klass']](data, name=obj['name'], freq=obj['freq'])
403407
elif typ == 'datetime_index':
404-
return globals()[obj['klass']](obj['data'], freq=obj['freq'], tz=obj['tz'], name=obj['name'])
408+
data = unconvert(obj['data'], np.int64, obj.get('compress'))
409+
return globals()[obj['klass']](data, freq=obj['freq'], tz=obj['tz'], name=obj['name'])
405410
elif typ == 'series':
406411
dtype = dtype_for(obj['dtype'])
407412
index = obj['index']
@@ -425,17 +430,17 @@ def create_block(b):
425430
return timedelta(*obj['data'])
426431
elif typ == 'timedelta64':
427432
return np.timedelta64(int(obj['data']))
428-
elif typ == 'sparse_series':
429-
dtype = dtype_for(obj['dtype'])
430-
return globals(
431-
)[obj['klass']](unconvert(obj['sp_values'], dtype, obj['compress']), sparse_index=obj['sp_index'],
432-
index=obj['index'], fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name'])
433-
elif typ == 'sparse_dataframe':
434-
return globals()[obj['klass']](obj['data'],
435-
columns=obj['columns'], default_fill_value=obj['default_fill_value'], default_kind=obj['default_kind'])
436-
elif typ == 'sparse_panel':
437-
return globals()[obj['klass']](obj['data'],
438-
items=obj['items'], default_fill_value=obj['default_fill_value'], default_kind=obj['default_kind'])
433+
#elif typ == 'sparse_series':
434+
# dtype = dtype_for(obj['dtype'])
435+
# return globals(
436+
# )[obj['klass']](unconvert(obj['sp_values'], dtype, obj['compress']), sparse_index=obj['sp_index'],
437+
# index=obj['index'], fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name'])
438+
#elif typ == 'sparse_dataframe':
439+
# return globals()[obj['klass']](obj['data'],
440+
# columns=obj['columns'], default_fill_value=obj['default_fill_value'], default_kind=obj['default_kind'])
441+
#elif typ == 'sparse_panel':
442+
# return globals()[obj['klass']](obj['data'],
443+
# items=obj['items'], default_fill_value=obj['default_fill_value'], default_kind=obj['default_kind'])
439444
elif typ == 'block_index':
440445
return globals()[obj['klass']](obj['length'], obj['blocs'], obj['blengths'])
441446
elif typ == 'int_index':
@@ -460,7 +465,7 @@ def create_block(b):
460465

461466

462467
def pack(o, default=encode,
463-
encoding='utf-8', unicode_errors='strict', use_single_float=False):
468+
encoding='latin1', unicode_errors='strict', use_single_float=False):
464469
"""
465470
Pack an object and return the packed bytes.
466471
"""
@@ -471,7 +476,7 @@ def pack(o, default=encode,
471476

472477

473478
def unpack(packed, object_hook=decode,
474-
list_hook=None, use_list=False, encoding='utf-8',
479+
list_hook=None, use_list=False, encoding='latin1',
475480
unicode_errors='strict', object_pairs_hook=None):
476481
"""
477482
Unpack a packed object, return an iterator
@@ -488,7 +493,7 @@ def unpack(packed, object_hook=decode,
488493
class Packer(_Packer):
489494

490495
def __init__(self, default=encode,
491-
encoding='utf-8',
496+
encoding='latin1',
492497
unicode_errors='strict',
493498
use_single_float=False):
494499
super(Packer, self).__init__(default=default,
@@ -501,7 +506,7 @@ class Unpacker(_Unpacker):
501506

502507
def __init__(self, file_like=None, read_size=0, use_list=False,
503508
object_hook=decode,
504-
object_pairs_hook=None, list_hook=None, encoding='utf-8',
509+
object_pairs_hook=None, list_hook=None, encoding='latin1',
505510
unicode_errors='strict', max_buffer_size=0):
506511
super(Unpacker, self).__init__(file_like=file_like,
507512
read_size=read_size,

pandas/io/tests/test_packers.py

Lines changed: 37 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -61,30 +61,28 @@ class TestNumpy(Test):
6161
def test_numpy_scalar_float(self):
6262
x = np.float32(np.random.rand())
6363
x_rec = self.encode_decode(x)
64-
self.assert_(np.allclose(x, x_rec) and type(x) == type(x_rec))
64+
tm.assert_almost_equal(x,x_rec)
6565

6666
def test_numpy_scalar_complex(self):
6767
x = np.complex64(np.random.rand() + 1j * np.random.rand())
6868
x_rec = self.encode_decode(x)
69-
self.assert_(np.allclose(x, x_rec) and type(x) == type(x_rec))
69+
tm.assert_almost_equal(x,x_rec)
7070

7171
def test_scalar_float(self):
7272
x = np.random.rand()
7373
x_rec = self.encode_decode(x)
74-
self.assert_(np.allclose(x, x_rec) and type(x) == type(x_rec))
74+
tm.assert_almost_equal(x,x_rec)
7575

7676
def test_scalar_complex(self):
7777
x = np.random.rand() + 1j * np.random.rand()
7878
x_rec = self.encode_decode(x)
79-
self.assert_(np.allclose(x, x_rec) and type(x) == type(x_rec))
79+
tm.assert_almost_equal(x,x_rec)
8080

8181
def test_list_numpy_float(self):
8282
raise nose.SkipTest('buggy test')
8383
x = [np.float32(np.random.rand()) for i in range(5)]
8484
x_rec = self.encode_decode(x)
85-
self.assert_(all(map(lambda x, y:
86-
x == y, x, x_rec)) and
87-
all(map(lambda x, y: type(x) == type(y), x, x_rec)))
85+
tm.assert_almost_equal(x,x_rec)
8886

8987
def test_list_numpy_float_complex(self):
9088
if not hasattr(np, 'complex128'):
@@ -96,65 +94,59 @@ def test_list_numpy_float_complex(self):
9694
[np.complex128(np.random.rand() + 1j * np.random.rand())
9795
for i in range(5)]
9896
x_rec = self.encode_decode(x)
99-
self.assert_(all(map(lambda x, y: x == y, x, x_rec)) and
100-
all(map(lambda x, y: type(x) == type(y), x, x_rec)))
97+
tm.assert_almost_equal(x,x_rec)
10198

10299
def test_list_float(self):
103100
x = [np.random.rand() for i in range(5)]
104101
x_rec = self.encode_decode(x)
105-
self.assert_(all(map(lambda x, y: x == y, x, x_rec)) and
106-
all(map(lambda x, y: type(x) == type(y), x, x_rec)))
102+
tm.assert_almost_equal(x,x_rec)
107103

108104
def test_list_float_complex(self):
109105
x = [np.random.rand() for i in range(5)] + \
110106
[(np.random.rand() + 1j * np.random.rand()) for i in range(5)]
111107
x_rec = self.encode_decode(x)
112-
self.assert_(all(map(lambda x, y: x == y, x, x_rec)) and
113-
all(map(lambda x, y: type(x) == type(y), x, x_rec)))
108+
tm.assert_almost_equal(x,x_rec)
114109

115110
def test_dict_float(self):
116111
x = {'foo': 1.0, 'bar': 2.0}
117112
x_rec = self.encode_decode(x)
118-
self.assert_(all(map(lambda x, y: x == y, x.values(), x_rec.values())) and
119-
all(map(lambda x, y: type(x) == type(y), x.values(), x_rec.values())))
113+
tm.assert_almost_equal(x,x_rec)
120114

121115
def test_dict_complex(self):
122116
x = {'foo': 1.0 + 1.0j, 'bar': 2.0 + 2.0j}
123117
x_rec = self.encode_decode(x)
124-
self.assert_(all(map(lambda x, y: x == y, x.values(), x_rec.values())) and
125-
all(map(lambda x, y: type(x) == type(y), x.values(), x_rec.values())))
118+
tm.assert_almost_equal(x,x_rec)
126119

127120
def test_dict_numpy_float(self):
128121
x = {'foo': np.float32(1.0), 'bar': np.float32(2.0)}
129122
x_rec = self.encode_decode(x)
130-
self.assert_(all(map(lambda x, y: x == y, x.values(), x_rec.values())) and
131-
all(map(lambda x, y: type(x) == type(y), x.values(), x_rec.values())))
123+
tm.assert_almost_equal(x,x_rec)
132124

133125
def test_dict_numpy_complex(self):
134126
x = {'foo': np.complex128(
135127
1.0 + 1.0j), 'bar': np.complex128(2.0 + 2.0j)}
136128
x_rec = self.encode_decode(x)
137-
self.assert_(all(map(lambda x, y: x == y, x.values(), x_rec.values())) and
138-
all(map(lambda x, y: type(x) == type(y), x.values(), x_rec.values())))
129+
tm.assert_almost_equal(x,x_rec)
139130

140131
def test_numpy_array_float(self):
141-
x = np.random.rand(5).astype(np.float32)
142-
x_rec = self.encode_decode(x)
143-
self.assert_(all(map(lambda x, y: x == y, x, x_rec)) and
144-
x.dtype == x_rec.dtype)
132+
133+
# run multiple times
134+
for n in range(10):
135+
x = np.random.rand(10)
136+
for dtype in ['float32','float64']:
137+
x = x.astype(dtype)
138+
x_rec = self.encode_decode(x)
139+
tm.assert_almost_equal(x,x_rec)
145140

146141
def test_numpy_array_complex(self):
147142
x = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex128)
148143
x_rec = self.encode_decode(x)
149-
self.assert_(all(map(lambda x, y: x == y, x, x_rec)) and
150-
x.dtype == x_rec.dtype)
144+
tm.assert_almost_equal(x,x_rec)
151145

152146
def test_list_mixed(self):
153147
x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo')]
154148
x_rec = self.encode_decode(x)
155-
self.assert_(all(map(lambda x, y: x == y, x, x_rec)) and
156-
all(map(lambda x, y: type(x) == type(y), x, x_rec)))
157-
149+
tm.assert_almost_equal(x,x_rec)
158150

159151
class TestBasic(Test):
160152

@@ -219,8 +211,12 @@ def test_multi_index(self):
219211

220212
def test_unicode(self):
221213
i = tm.makeUnicodeIndex(100)
222-
i_rec = self.encode_decode(i)
223-
self.assert_(i.equals(i_rec))
214+
215+
# this currently fails
216+
self.assertRaises(UnicodeEncodeError, self.encode_decode, i)
217+
218+
#i_rec = self.encode_decode(i)
219+
#self.assert_(i.equals(i_rec))
224220

225221

226222
class TestSeries(Test):
@@ -255,9 +251,11 @@ def setUp(self):
255251

256252
def test_basic(self):
257253

258-
for s, i in self.d.items():
259-
i_rec = self.encode_decode(i)
260-
assert_series_equal(i, i_rec)
254+
# run multiple times here
255+
for n in range(10):
256+
for s, i in self.d.items():
257+
i_rec = self.encode_decode(i)
258+
assert_series_equal(i, i_rec)
261259

262260

263261
class TestNDFrame(Test):
@@ -326,8 +324,10 @@ class TestSparse(Test):
326324

327325
def _check_roundtrip(self, obj, comparator, **kwargs):
328326

329-
i_rec = self.encode_decode(obj)
330-
comparator(obj, i_rec, **kwargs)
327+
# currently these are not implemented
328+
#i_rec = self.encode_decode(obj)
329+
#comparator(obj, i_rec, **kwargs)
330+
self.assertRaises(NotImplementedError, self.encode_decode, obj)
331331

332332
def test_sparse_series(self):
333333

0 commit comments

Comments
 (0)