
Commit 66873e2

rokkszucs authored and committed
ARROW-4453: [Python] Cython wrappers for SparseTensor
Creating Cython wrappers for SparseTensor. This resolves [ARROW-4453](https://issues.apache.org/jira/browse/ARROW-4453).

Author: Rok <rok@mihevc.org>
Author: Antoine Pitrou <antoine@python.org>

Closes #4446 from rok/ARROW-4453 and squashes the following commits:

db5d620 <Rok> Typo.
9e0363a <Antoine Pitrou> Polish code
c31b8eb <Rok> Enabling SparseTensor.Equals checks.
654002a <Rok> Partial review feedback implementation.
e89edc6 <Rok> Refactoring to_numpy methods.
3fcc192 <Rok> Add equality methods.
4a30487 <Rok> Set base object in to_numpy methods.
4eeae02 <Rok> Cython wrapper for SparseTensor.
1 parent a3242fb commit 66873e2

File tree: 20 files changed (+1101, -146 lines)


cpp/src/arrow/compare.cc

Lines changed: 1 addition & 2 deletions
@@ -1026,9 +1026,8 @@ struct SparseTensorEqualsImpl<SparseIndexType, SparseIndexType> {

     const uint8_t* left_data = left.data()->data();
     const uint8_t* right_data = right.data()->data();
-
     return memcmp(left_data, right_data,
-                  static_cast<size_t>(byte_width * left.non_zero_length()));
+                  static_cast<size_t>(byte_width * left.non_zero_length())) == 0;
   }
 };

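This one-line fix matters because memcmp() returns an ordering (negative, zero, or positive), not a boolean: identical buffers yield 0, so returning the raw result from Equals reported equal tensors as unequal and vice versa. A minimal standalone illustration (not Arrow code):

// memcmp() semantics: 0 means equal, nonzero means different.
#include <cassert>
#include <cstring>

int main() {
  const unsigned char a[] = {1, 2, 3, 4};
  const unsigned char b[] = {1, 2, 3, 4};
  const unsigned char c[] = {1, 2, 3, 5};

  assert(std::memcmp(a, b, sizeof(a)) == 0);  // equal -> 0, i.e. "false" if used raw
  assert(std::memcmp(a, c, sizeof(a)) != 0);  // unequal -> nonzero, i.e. "true" if used raw
  return 0;
}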

cpp/src/arrow/python/numpy_convert.cc

Lines changed: 153 additions & 20 deletions
@@ -25,8 +25,10 @@
 #include <vector>

 #include "arrow/buffer.h"
+#include "arrow/sparse_tensor.h"
 #include "arrow/tensor.h"
 #include "arrow/type.h"
+#include "arrow/util/logging.h"

 #include "arrow/python/common.h"
 #include "arrow/python/pyarrow.h"
@@ -186,7 +188,9 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out) {

 #undef TO_ARROW_TYPE_CASE

-Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, std::shared_ptr<Tensor>* out) {
+Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
+                       const std::vector<std::string>& dim_names,
+                       std::shared_ptr<Tensor>* out) {
   if (!PyArray_Check(ao)) {
     return Status::TypeError("Did not pass ndarray object");
   }
@@ -197,35 +201,29 @@ Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, std::shared_ptr<Tensor>*

   int ndim = PyArray_NDIM(ndarray);

-  // This is also holding the GIL, so don't already draw it.
   std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(ao);
   std::vector<int64_t> shape(ndim);
   std::vector<int64_t> strides(ndim);

-  {
-    PyAcquireGIL lock;
-    npy_intp* array_strides = PyArray_STRIDES(ndarray);
-    npy_intp* array_shape = PyArray_SHAPE(ndarray);
-    for (int i = 0; i < ndim; ++i) {
-      if (array_strides[i] < 0) {
-        return Status::Invalid("Negative ndarray strides not supported");
-      }
-      shape[i] = array_shape[i];
-      strides[i] = array_strides[i];
+  npy_intp* array_strides = PyArray_STRIDES(ndarray);
+  npy_intp* array_shape = PyArray_SHAPE(ndarray);
+  for (int i = 0; i < ndim; ++i) {
+    if (array_strides[i] < 0) {
+      return Status::Invalid("Negative ndarray strides not supported");
     }
-
-    std::shared_ptr<DataType> type;
-    RETURN_NOT_OK(
-        GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type));
-    *out = std::make_shared<Tensor>(type, data, shape, strides);
-    return Status::OK();
+    shape[i] = array_shape[i];
+    strides[i] = array_strides[i];
   }
+
+  std::shared_ptr<DataType> type;
+  RETURN_NOT_OK(
+      GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type));
+  *out = std::make_shared<Tensor>(type, data, shape, strides, dim_names);
+  return Status::OK();
 }

 Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
                        PyObject** out) {
-  PyAcquireGIL lock;
-
   int type_num;
   RETURN_NOT_OK(GetNumPyType(*tensor->type(), &type_num));
   PyArray_Descr* dtype = PyArray_DescrNewFromType(type_num);
@@ -274,5 +272,140 @@ Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
   return Status::OK();
 }

+// Wrap the dense data of a sparse tensor in a ndarray
+static Status SparseTensorDataToNdarray(const SparseTensor& sparse_tensor,
+                                        std::vector<npy_intp> data_shape, PyObject* base,
+                                        PyObject** out_data) {
+  int type_num_data;
+  RETURN_NOT_OK(GetNumPyType(*sparse_tensor.type(), &type_num_data));
+  PyArray_Descr* dtype_data = PyArray_DescrNewFromType(type_num_data);
+  RETURN_IF_PYERROR();
+
+  const void* immutable_data = sparse_tensor.data()->data();
+  // Remove const =(
+  void* mutable_data = const_cast<void*>(immutable_data);
+  int array_flags = NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS;
+  if (sparse_tensor.is_mutable()) {
+    array_flags |= NPY_ARRAY_WRITEABLE;
+  }
+
+  *out_data = PyArray_NewFromDescr(&PyArray_Type, dtype_data,
+                                   static_cast<int>(data_shape.size()), data_shape.data(),
+                                   nullptr, mutable_data, array_flags, nullptr);
+  RETURN_IF_PYERROR()
+  Py_XINCREF(base);
+  PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(*out_data), base);
+  return Status::OK();
+}
+
+Status SparseTensorCOOToNdarray(const std::shared_ptr<SparseTensorCOO>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_coords) {
+  const auto& sparse_index = arrow::internal::checked_cast<const SparseCOOIndex&>(
+      *sparse_tensor->sparse_index());
+
+  // Wrap tensor data
+  OwnedRef result_data;
+  RETURN_NOT_OK(SparseTensorDataToNdarray(
+      *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref()));
+
+  // Wrap indices
+  PyObject* result_coords;
+  RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, &result_coords));
+
+  *out_data = result_data.detach();
+  *out_coords = result_coords;
+  return Status::OK();
+}
+
+Status SparseTensorCSRToNdarray(const std::shared_ptr<SparseTensorCSR>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_indptr, PyObject** out_indices) {
+  const auto& sparse_index = arrow::internal::checked_cast<const SparseCSRIndex&>(
+      *sparse_tensor->sparse_index());
+
+  // Wrap tensor data
+  OwnedRef result_data;
+  RETURN_NOT_OK(SparseTensorDataToNdarray(
+      *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref()));
+
+  // Wrap indices
+  OwnedRef result_indptr;
+  OwnedRef result_indices;
+  RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr(), base, result_indptr.ref()));
+  RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, result_indices.ref()));
+
+  *out_data = result_data.detach();
+  *out_indptr = result_indptr.detach();
+  *out_indices = result_indices.detach();
+  return Status::OK();
+}
+
+Status NdarraysToSparseTensorCOO(MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,
+                                 const std::vector<int64_t>& shape,
+                                 const std::vector<std::string>& dim_names,
+                                 std::shared_ptr<SparseTensorCOO>* out) {
+  if (!PyArray_Check(data_ao) || !PyArray_Check(coords_ao)) {
+    return Status::TypeError("Did not pass ndarray object");
+  }
+
+  PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
+  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
+  std::shared_ptr<DataType> type_data;
+  RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)),
+                              &type_data));
+
+  std::shared_ptr<Tensor> coords;
+  RETURN_NOT_OK(NdarrayToTensor(pool, coords_ao, {}, &coords));
+  ARROW_CHECK_EQ(coords->type_id(), Type::INT64);  // Should be ensured by caller
+
+  std::shared_ptr<SparseCOOIndex> sparse_index = std::make_shared<SparseCOOIndex>(
+      std::static_pointer_cast<NumericTensor<Int64Type>>(coords));
+  *out = std::make_shared<SparseTensorImpl<SparseCOOIndex>>(sparse_index, type_data, data,
+                                                            shape, dim_names);
+  return Status::OK();
+}
+
+Status NdarraysToSparseTensorCSR(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao,
+                                 PyObject* indices_ao, const std::vector<int64_t>& shape,
+                                 const std::vector<std::string>& dim_names,
+                                 std::shared_ptr<SparseTensorCSR>* out) {
+  if (!PyArray_Check(data_ao) || !PyArray_Check(indptr_ao) ||
+      !PyArray_Check(indices_ao)) {
+    return Status::TypeError("Did not pass ndarray object");
+  }
+
+  PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
+  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
+  std::shared_ptr<DataType> type_data;
+  RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)),
+                              &type_data));
+
+  std::shared_ptr<Tensor> indptr, indices;
+  RETURN_NOT_OK(NdarrayToTensor(pool, indptr_ao, {}, &indptr));
+  RETURN_NOT_OK(NdarrayToTensor(pool, indices_ao, {}, &indices));
+  ARROW_CHECK_EQ(indptr->type_id(), Type::INT64);   // Should be ensured by caller
+  ARROW_CHECK_EQ(indices->type_id(), Type::INT64);  // Should be ensured by caller
+
+  auto sparse_index = std::make_shared<SparseCSRIndex>(
+      std::static_pointer_cast<NumericTensor<Int64Type>>(indptr),
+      std::static_pointer_cast<NumericTensor<Int64Type>>(indices));
+  *out = std::make_shared<SparseTensorImpl<SparseCSRIndex>>(sparse_index, type_data, data,
+                                                            shape, dim_names);
+  return Status::OK();
+}
+
+Status TensorToSparseTensorCOO(const std::shared_ptr<Tensor>& tensor,
+                               std::shared_ptr<SparseTensorCOO>* out) {
+  *out = std::make_shared<SparseTensorCOO>(*tensor);
+  return Status::OK();
+}
+
+Status TensorToSparseTensorCSR(const std::shared_ptr<Tensor>& tensor,
+                               std::shared_ptr<SparseTensorCSR>* out) {
+  *out = std::make_shared<SparseTensorCSR>(*tensor);
+  return Status::OK();
+}
+
 }  // namespace py
 }  // namespace arrow
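A note on the reference handling in SparseTensorDataToNdarray above: a NumPy array that wraps memory it does not own must keep the owner alive, which is what the Py_XINCREF(base) / PyArray_SetBaseObject pair arranges (PyArray_SetBaseObject steals a reference, hence the explicit increment first). A hedged standalone sketch of the same pattern; WrapBufferAsNdarray is a hypothetical name, not part of this commit, and it assumes NumPy's C API has been initialized with import_array():

#include <Python.h>
#include <numpy/arrayobject.h>

// Hypothetical helper (not Arrow code): wrap externally owned memory in a
// 1-D ndarray and tie its lifetime to `owner` via the base-object mechanism.
static PyObject* WrapBufferAsNdarray(void* data, npy_intp length, int typenum,
                                     PyObject* owner) {
  npy_intp shape[1] = {length};
  PyObject* arr = PyArray_SimpleNewFromData(1, shape, typenum, data);
  if (arr == nullptr) return nullptr;
  Py_XINCREF(owner);  // PyArray_SetBaseObject steals this reference
  if (PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(arr), owner) != 0) {
    Py_DECREF(arr);
    return nullptr;
  }
  return arr;  // `owner` now stays alive at least as long as the ndarray
}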

cpp/src/arrow/python/numpy_convert.h

Lines changed: 29 additions & 0 deletions
@@ -25,9 +25,11 @@

 #include <memory>
 #include <string>
+#include <vector>

 #include "arrow/buffer.h"
 #include "arrow/python/visibility.h"
+#include "arrow/sparse_tensor.h"

 namespace arrow {

@@ -63,11 +65,38 @@ Status GetTensorType(PyObject* dtype, std::shared_ptr<DataType>* out);
 Status GetNumPyType(const DataType& type, int* type_num);

 ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
+                                           const std::vector<std::string>& dim_names,
                                            std::shared_ptr<Tensor>* out);

 ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor,
                                            PyObject* base, PyObject** out);

+ARROW_PYTHON_EXPORT Status
+SparseTensorCOOToNdarray(const std::shared_ptr<SparseTensorCOO>& sparse_tensor,
+                         PyObject* base, PyObject** out_data, PyObject** out_coords);
+
+ARROW_PYTHON_EXPORT Status SparseTensorCSRToNdarray(
+    const std::shared_ptr<SparseTensorCSR>& sparse_tensor, PyObject* base,
+    PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCOO(
+    MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,
+    const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+    std::shared_ptr<SparseTensorCOO>* out);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCSR(
+    MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
+    const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+    std::shared_ptr<SparseTensorCSR>* out);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseTensorCOO(const std::shared_ptr<Tensor>& tensor,
+                        std::shared_ptr<SparseTensorCOO>* csparse_tensor);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseTensorCSR(const std::shared_ptr<Tensor>& tensor,
+                        std::shared_ptr<SparseTensorCSR>* csparse_tensor);
+
 }  // namespace py
 }  // namespace arrow
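For orientation, here is a hedged sketch of how a C++ caller might chain the declarations above to turn a dense Tensor into COO-formatted NumPy arrays. DenseToCooNdarrays is an invented name, the GIL is assumed to be held, and using the wrapped sparse tensor as the ndarrays' base object is an assumption about typical usage (the squashed commit "Set base object in to_numpy methods" suggests the Cython layer does the same):

#include "arrow/python/numpy_convert.h"
#include "arrow/python/pyarrow.h"
#include "arrow/status.h"

// Invented helper: dense Tensor -> SparseTensorCOO -> (data, coords) ndarrays.
arrow::Status DenseToCooNdarrays(const std::shared_ptr<arrow::Tensor>& tensor,
                                 PyObject** out_data, PyObject** out_coords) {
  std::shared_ptr<arrow::SparseTensorCOO> sparse;
  RETURN_NOT_OK(arrow::py::TensorToSparseTensorCOO(tensor, &sparse));
  // Use the wrapped sparse tensor as the base object that owns the memory
  // the returned ndarrays point into.
  PyObject* base = arrow::py::wrap_sparse_tensor_coo(sparse);
  if (base == nullptr) return arrow::Status::UnknownError("wrapping failed");
  arrow::Status st =
      arrow::py::SparseTensorCOOToNdarray(sparse, base, out_data, out_coords);
  Py_DECREF(base);  // the ndarrays took their own reference via Py_XINCREF
  return st;
}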

cpp/src/arrow/python/pyarrow.cc

Lines changed: 38 additions & 0 deletions
@@ -123,6 +123,44 @@ PyObject* wrap_tensor(const std::shared_ptr<Tensor>& tensor) {
   return ::pyarrow_wrap_tensor(tensor);
 }

+bool is_sparse_tensor_csr(PyObject* sparse_tensor) {
+  return ::pyarrow_is_sparse_tensor_csr(sparse_tensor) != 0;
+}
+
+Status unwrap_sparse_tensor_csr(PyObject* sparse_tensor,
+                                std::shared_ptr<SparseTensorCSR>* out) {
+  *out = ::pyarrow_unwrap_sparse_tensor_csr(sparse_tensor);
+  if (*out) {
+    return Status::OK();
+  } else {
+    return Status::Invalid(
+        "Could not unwrap SparseTensorCSR from the passed Python object.");
+  }
+}
+
+PyObject* wrap_sparse_tensor_csr(const std::shared_ptr<SparseTensorCSR>& sparse_tensor) {
+  return ::pyarrow_wrap_sparse_tensor_csr(sparse_tensor);
+}
+
+bool is_sparse_tensor_coo(PyObject* sparse_tensor) {
+  return ::pyarrow_is_sparse_tensor_coo(sparse_tensor) != 0;
+}
+
+Status unwrap_sparse_tensor_coo(PyObject* sparse_tensor,
+                                std::shared_ptr<SparseTensorCOO>* out) {
+  *out = ::pyarrow_unwrap_sparse_tensor_coo(sparse_tensor);
+  if (*out) {
+    return Status::OK();
+  } else {
+    return Status::Invalid(
+        "Could not unwrap SparseTensorCOO from the passed Python object.");
+  }
+}
+
+PyObject* wrap_sparse_tensor_coo(const std::shared_ptr<SparseTensorCOO>& sparse_tensor) {
+  return ::pyarrow_wrap_sparse_tensor_coo(sparse_tensor);
+}
+
 bool is_column(PyObject* column) { return ::pyarrow_is_column(column) != 0; }

 Status unwrap_column(PyObject* column, std::shared_ptr<Column>* out) {
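These functions complete the is_*/unwrap_*/wrap_* triple that pyarrow.cc already provides for Tensor, Column, and friends, letting C++ callers type-check a PyObject before unwrapping it. A hedged sketch of a typical call site; ProcessPyObject is an invented entry point, and the GIL is assumed to be held:

#include "arrow/python/pyarrow.h"
#include "arrow/status.h"

// Invented entry point: unwrap a pyarrow SparseTensorCOO if given one.
arrow::Status ProcessPyObject(PyObject* obj) {
  if (arrow::py::is_sparse_tensor_coo(obj)) {
    std::shared_ptr<arrow::SparseTensorCOO> sparse;
    RETURN_NOT_OK(arrow::py::unwrap_sparse_tensor_coo(obj, &sparse));
    // ... operate on `sparse`, e.g. inspect sparse->non_zero_length() ...
    return arrow::Status::OK();
  }
  return arrow::Status::TypeError("Expected a pyarrow SparseTensorCOO");
}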

cpp/src/arrow/python/pyarrow.h

Lines changed: 14 additions & 0 deletions
@@ -24,6 +24,8 @@

 #include "arrow/python/visibility.h"

+#include "arrow/sparse_tensor.h"
+
 namespace arrow {

 class Array;
@@ -67,6 +69,18 @@ ARROW_PYTHON_EXPORT bool is_tensor(PyObject* tensor);
 ARROW_PYTHON_EXPORT Status unwrap_tensor(PyObject* tensor, std::shared_ptr<Tensor>* out);
 ARROW_PYTHON_EXPORT PyObject* wrap_tensor(const std::shared_ptr<Tensor>& tensor);

+ARROW_PYTHON_EXPORT bool is_sparse_tensor_coo(PyObject* sparse_tensor);
+ARROW_PYTHON_EXPORT Status
+unwrap_sparse_tensor_coo(PyObject* sparse_tensor, std::shared_ptr<SparseTensorCOO>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_sparse_tensor_coo(
+    const std::shared_ptr<SparseTensorCOO>& sparse_tensor);
+
+ARROW_PYTHON_EXPORT bool is_sparse_tensor_csr(PyObject* sparse_tensor);
+ARROW_PYTHON_EXPORT Status
+unwrap_sparse_tensor_csr(PyObject* sparse_tensor, std::shared_ptr<SparseTensorCSR>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_sparse_tensor_csr(
+    const std::shared_ptr<SparseTensorCSR>& sparse_tensor);
+
 ARROW_PYTHON_EXPORT bool is_column(PyObject* column);
 ARROW_PYTHON_EXPORT Status unwrap_column(PyObject* column, std::shared_ptr<Column>* out);
 ARROW_PYTHON_EXPORT PyObject* wrap_column(const std::shared_ptr<Column>& column);
