Add support for handling ndarrays with dtype='O' using pickle (as well as cautionary note) (#46)

lebedov · lebedov · commit fd7032a3045f · 2022-04-28T12:48:50.000-04:00
diff --git a/AUTHORS.md b/AUTHORS.md
@@ -11,6 +11,7 @@ Special thanks are due to the following parties for their contributions:
 
 - [Alex Ford](https://github.com/asford) - bug fix.
 - [Colin Jermain](https://github.com/cjermain) - Python 3 support.
+- [Etienne Wodey](https://github.com/airwoodix) - support for ndarrays with dtype='O'.
 - [John Tyree](https://github.com/johntyree) - support for numpy scalar booleans.
 - [Mehdi Sadeghi](https://github.com/mehdisadeghi) - bug reports.
 - [Sujoy Roy](https://github.com/tvkpz) - bug reports.
diff --git a/CHANGES.md b/CHANGES.md
@@ -5,6 +5,10 @@ vi:ft=markdown
 Change Log
 ==========
 
+Release 0.4.8 (April 28, 2022)
+------------------------------
+* Add support for ndarrays with dtype=object (#46).
+
 Release 0.4.7.1 (September 30, 2020)
 ------------------------------------
 * Fix Python 2.7 regression (#45).
diff --git a/LICENSE.md b/LICENSE.md
@@ -5,7 +5,7 @@ vi:ft=markdown
 License
 =======
 
-Copyright (c) 2013-2020, Lev E. Givon.
+Copyright (c) 2013-2022, Lev E. Givon.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/README.md b/README.md
@@ -65,7 +65,14 @@ data types during msgpack serialization and deserialization. Inclusion of type
 information in the serialized data necessarily incurs some storage overhead; if
 preservation of type information is not needed, one may be able to avoid some
 of this overhead by writing a custom encoder/decoder pair that produces more
-efficient serializations for those specific use cases.
+efficient serializations for those specific use cases. 
+
+Numpy arrays with a dtype of 'O' are serialized/deserialized using pickle as
+a fallback so that msgpack-numpy can handle such arrays at all. Because the
+overhead of pickle serialization negates one of the main reasons to use
+msgpack, it is advisable to either write a custom encoder/decoder that handles
+the specific use case efficiently or to avoid msgpack-numpy for such data.
+Never deserialize such data from untrusted sources: unpickling can execute code.
 
 Note that numpy arrays deserialized by msgpack-numpy are read-only and must be copied 
 if they are to be modified.
diff --git a/msgpack_numpy.py b/msgpack_numpy.py
@@ -4,13 +4,14 @@
 Support for serialization of numpy data types with msgpack.
 """
 
-# Copyright (c) 2013-2020, Lev E. Givon
+# Copyright (c) 2013-2022, Lev E. Givon
 # All rights reserved.
 # Distributed under the terms of the BSD license:
 # http://www.opensource.org/licenses/bsd-license
 
 import sys
 import functools
+import pickle
 import warnings
 
 import msgpack
@@ -19,10 +20,14 @@
 import numpy as np
 
 if sys.version_info >= (3, 0):
-    if sys.platform == 'darwin':
-        ndarray_to_bytes = lambda obj: obj.tobytes()
-    else:
-        ndarray_to_bytes = lambda obj: obj.data if obj.flags['C_CONTIGUOUS'] else obj.tobytes()
+    def ndarray_to_bytes(obj):
+        if obj.dtype == 'O':
+            return obj.dumps()
+        else:
+            if sys.platform == 'darwin':
+                return obj.tobytes()
+            else:
+                return obj.data if obj.flags['C_CONTIGUOUS'] else obj.tobytes()
 
     num_to_bytes = lambda obj: obj.data
 
@@ -32,10 +37,14 @@ def tostr(x):
         else:
             return str(x)
 else:
-    if sys.platform == 'darwin':
-        ndarray_to_bytes = lambda obj: obj.tobytes()
-    else:
-        ndarray_to_bytes = lambda obj: memoryview(obj.data) if obj.flags['C_CONTIGUOUS'] else obj.tobytes()
+    def ndarray_to_bytes(obj):
+        if obj.dtype == 'O':
+            return obj.dumps()
+        else:
+            if sys.platform == 'darwin':
+                return obj.tobytes()
+            else:
+                return memoryview(obj.data) if obj.flags['C_CONTIGUOUS'] else obj.tobytes()
 
     num_to_bytes = lambda obj: memoryview(obj.data)
 
@@ -50,12 +59,13 @@ def encode(obj, chain=None):
     if isinstance(obj, np.ndarray):
         # If the dtype is structured, store the interface description;
         # otherwise, store the corresponding array protocol type string:
-        if obj.dtype.kind == 'V':
-            kind = b'V'
+        if obj.dtype.kind in ('V', 'O'):
+            kind = bytes(obj.dtype.kind, 'ascii')
             descr = obj.dtype.descr
         else:
             kind = b''
             descr = obj.dtype.str
+
         return {b'nd': True,
                 b'type': descr,
                 b'kind': kind,
@@ -81,14 +91,18 @@ def decode(obj, chain=None):
             if obj[b'nd'] is True:
 
                 # Check if b'kind' is in obj to enable decoding of data
-                # serialized with older versions (#20):
+                # serialized with older versions (#20); kind b'O' marks
+                # object arrays, which are unpickled instead (#46):
                 if b'kind' in obj and obj[b'kind'] == b'V':
                     descr = [tuple(tostr(t) if type(t) is bytes else t for t in d) \
                              for d in obj[b'type']]
+                elif b'kind' in obj and obj[b'kind'] == b'O':
+                    return pickle.loads(obj[b'data'])
                 else:
                     descr = obj[b'type']
-                return np.frombuffer(obj[b'data'],
-                            dtype=_unpack_dtype(descr)).reshape(obj[b'shape'])
+                return np.ndarray(buffer=obj[b'data'],
+                                  dtype=_unpack_dtype(descr),
+                                  shape=obj[b'shape'])
             else:
                 descr = obj[b'type']
                 return np.frombuffer(obj[b'data'],
diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@
 from setuptools import setup
 
 NAME =               'msgpack-numpy'
-VERSION =            '0.4.7.1'
+VERSION =            '0.4.8'
 AUTHOR =             'Lev E. Givon'
 AUTHOR_EMAIL =       'lev@columbia.edu'
 URL =                'https://github.com/lebedov/msgpack-numpy'
diff --git a/tests.py b/tests.py
@@ -188,6 +188,12 @@ def test_numpy_array_float(self):
         assert_array_equal(x, x_rec)
         assert_equal(x.dtype, x_rec.dtype)
 
+    def test_numpy_array_object(self):
+        x = np.random.rand(5).astype(object)
+        x_rec = self.encode_decode(x)
+        assert_array_equal(x, x_rec)
+        assert_equal(x.dtype, x_rec.dtype)
+
     def test_numpy_array_complex(self):
         x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128)
         x_rec = self.encode_decode(x)