Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions python/pyarrow/serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,37 @@ def _deserialize_scipy_sparse(data):
pass


# ----------------------------------------------------------------------
# Set up serialization for pydata/sparse tensors.

def _register_pydata_sparse_handlers(serialization_context):
try:
import sparse

def _serialize_pydata_sparse(obj):
if isinstance(obj, sparse.COO):
return 'coo', pa.SparseCOOTensor.from_pydata_sparse(obj)
else:
raise NotImplementedError(
"Serialization of {} is not supported.".format(sparse.COO))

def _deserialize_pydata_sparse(data):
if data[0] == 'coo':
data_array, coords = data[1].to_numpy()
return sparse.COO(
data=data_array[:, 0],
coords=coords.T, shape=data[1].shape)

serialization_context.register_type(
sparse.COO, 'sparse.COO',
custom_serializer=_serialize_pydata_sparse,
custom_deserializer=_deserialize_pydata_sparse)

except ImportError:
# no pydata/sparse
pass


def register_default_serialization_handlers(serialization_context):

# ----------------------------------------------------------------------
Expand Down Expand Up @@ -403,6 +434,7 @@ def register_default_serialization_handlers(serialization_context):
_register_collections_serialization_handlers(serialization_context)
_register_custom_pandas_handlers(serialization_context)
_register_scipy_handlers(serialization_context)
_register_pydata_sparse_handlers(serialization_context)


def default_serialization_context():
Expand Down
44 changes: 43 additions & 1 deletion python/pyarrow/tensor.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,34 @@ shape: {0.shape}""".format(self)
coords = np.require(coords, dtype='i8', requirements='F')

check_status(NdarraysToSparseCOOTensor(c_default_memory_pool(),
obj.data.view(), coords, c_shape, c_dim_names,
obj.data, coords, c_shape, c_dim_names,
&csparse_tensor))
return pyarrow_wrap_sparse_coo_tensor(csparse_tensor)

@staticmethod
def from_pydata_sparse(obj, dim_names=None):
    """
    Convert pydata/sparse.COO to arrow::SparseCOOTensor

    Parameters
    ----------
    obj : sparse.COO
        The source sparse tensor; anything else raises TypeError.
    dim_names : sequence of str, optional
        Optional names for each dimension, encoded to bytes for the
        C++ layer.

    Returns
    -------
    SparseCOOTensor
    """
    import sparse
    if not isinstance(obj, sparse.COO):
        raise TypeError(
            "Expected sparse.COO, got {}".format(type(obj)))

    cdef shared_ptr[CSparseCOOTensor] csparse_tensor
    cdef vector[int64_t] c_shape
    cdef vector[c_string] c_dim_names

    # Copy shape and (optional) dimension names into C++ vectors.
    for x in obj.shape:
        c_shape.push_back(x)
    if dim_names is not None:
        for x in dim_names:
            c_dim_names.push_back(tobytes(x))

    # sparse.COO stores coords as (ndim, nnz); transpose to (nnz, ndim)
    # and force Fortran-ordered int64, which the converter requires.
    coords = np.require(obj.coords.T, dtype='i8', requirements='F')

    check_status(NdarraysToSparseCOOTensor(c_default_memory_pool(),
        obj.data, coords, c_shape, c_dim_names,
        &csparse_tensor))
    return pyarrow_wrap_sparse_coo_tensor(csparse_tensor)

Expand Down Expand Up @@ -247,6 +274,21 @@ shape: {0.shape}""".format(self)
shape=self.shape)
return result

def to_pydata_sparse(self):
    """
    Convert arrow::SparseCOOTensor to pydata/sparse.COO

    Returns
    -------
    sparse.COO
        A COO tensor with this tensor's shape, values and coordinates.
    """
    from sparse import COO
    cdef PyObject* out_data
    cdef PyObject* out_coords

    # NOTE(review): `self` is passed alongside the C++ tensor —
    # presumably as the owning base object for the returned ndarrays;
    # confirm against the C++ SparseCOOTensorToNdarray signature.
    check_status(SparseCOOTensorToNdarray(self.sp_sparse_tensor, self,
        &out_data, &out_coords))
    data = PyObject_to_object(out_data)
    coords = PyObject_to_object(out_coords)
    # data comes back as (nnz, 1); COO wants a flat value vector and
    # coords in (ndim, nnz) order, hence the slice and transpose.
    result = COO(data=data[:, 0], coords=coords.T, shape=self.shape)
    return result

def to_tensor(self):
"""
Convert arrow::SparseCOOTensor to arrow::Tensor
Expand Down
21 changes: 21 additions & 0 deletions python/pyarrow/tests/test_serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@
coo_matrix = None
csr_matrix = None

try:
import sparse
except ImportError:
sparse = None


def assert_equal(obj1, obj2):
if torch is not None and torch.is_tensor(obj1) and torch.is_tensor(obj2):
Expand Down Expand Up @@ -597,6 +602,22 @@ def test_scipy_sparse_coo_tensor_serialization():
assert np.array_equal(sparse_array.toarray(), result.toarray())


@pytest.mark.skipif(not sparse, reason="requires pydata/sparse")
def test_pydata_sparse_sparse_coo_tensor_serialization():
    # Round-trip a small 4x6 COO tensor through pyarrow serialization
    # and compare the dense representations.
    values = np.array([1, 2, 3, 4, 5, 6])
    indices = np.array([[0, 0, 2, 3, 1, 3],
                        [0, 2, 0, 4, 5, 5]])

    original = sparse.COO(data=values, coords=indices, shape=(4, 6))
    restored = pa.serialize(original).deserialize()

    assert np.array_equal(original.todense(), restored.todense())


@pytest.mark.parametrize('tensor_type', tensor_types)
@pytest.mark.parametrize('index_type', index_types)
def test_sparse_csr_matrix_serialization(index_type, tensor_type):
Expand Down
31 changes: 31 additions & 0 deletions python/pyarrow/tests/test_sparse_tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@
coo_matrix = None
csr_matrix = None

try:
import sparse
except ImportError:
sparse = None


tensor_type_pairs = [
('i1', pa.int8()),
Expand Down Expand Up @@ -325,3 +330,29 @@ def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type):
else:
dense_array = sparse_array.toarray()
assert np.array_equal(dense_array, sparse_tensor.to_tensor().to_numpy())


@pytest.mark.skipif(not sparse, reason="requires pydata/sparse")
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_pydata_sparse_sparse_coo_tensor_roundtrip(dtype_str, arrow_type):
    # Build a 4x6 COO tensor, convert to an Arrow SparseCOOTensor and
    # back, and verify type, dim names, values and coordinates survive.
    dtype = np.dtype(dtype_str)
    values = np.array([1, 2, 3, 4, 5, 6]).astype(dtype)
    indices = np.array([[0, 0, 2, 3, 1, 3],
                        [0, 2, 0, 4, 5, 5]])
    dim_names = ("x", "y")

    original = sparse.COO(data=values, coords=indices, shape=(4, 6))
    tensor = pa.SparseCOOTensor.from_pydata_sparse(original,
                                                   dim_names=dim_names)
    restored = tensor.to_pydata_sparse()

    assert tensor.type == arrow_type
    assert tensor.dim_names == dim_names
    assert original.dtype == restored.dtype
    assert np.array_equal(original.data, restored.data)
    assert np.array_equal(original.coords, restored.coords)
    assert np.array_equal(original.todense(),
                          tensor.to_tensor().to_numpy())