Skip to content

Commit 654002a

Browse files
rokpitrou
authored andcommitted
Partial review feedback implementation.
1 parent e89edc6 commit 654002a

File tree

7 files changed

+220
-174
lines changed

7 files changed

+220
-174
lines changed

cpp/src/arrow/python/numpy_convert.cc

Lines changed: 72 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -281,36 +281,28 @@ Status SparseTensorCOOToNdarray(const std::shared_ptr<SparseTensorCOO>& sparse_t
281281
PyAcquireGIL lock;
282282

283283
int type_num_data;
284-
int type_num_coords;
284+
RETURN_NOT_OK(GetNumPyType(*sparse_tensor->type(), &type_num_data));
285+
PyArray_Descr* dtype_data = PyArray_DescrNewFromType(type_num_data);
286+
RETURN_IF_PYERROR();
287+
285288
const auto& sparse_index = arrow::internal::checked_cast<const SparseCOOIndex&>(
286289
*sparse_tensor->sparse_index());
287290
const std::shared_ptr<arrow::NumericTensor<arrow::Int64Type>> sparse_index_coords =
288291
sparse_index.indices();
289292

290-
RETURN_NOT_OK(GetNumPyType(*sparse_tensor->type(), &type_num_data));
291-
PyArray_Descr* dtype_data = PyArray_DescrNewFromType(type_num_data);
292-
RETURN_NOT_OK(GetNumPyType(*sparse_index_coords->type(), &type_num_coords));
293-
PyArray_Descr* dtype_coords = PyArray_DescrNewFromType(type_num_coords);
294293
RETURN_IF_PYERROR();
295294

295+
std::vector<npy_intp> npy_shape_data({sparse_index.non_zero_length(), 1});
296+
296297
const int ndim_coords = sparse_tensor->ndim();
297298
std::vector<npy_intp> npy_shape_coords(ndim_coords);
298299

299300
for (int i = 0; i < ndim_coords; ++i) {
300301
npy_shape_coords[i] = sparse_index_coords->shape()[i];
301302
}
302303

303-
std::vector<npy_intp> npy_shape_data({sparse_index.non_zero_length(), 1});
304-
305-
const void* immutable_data = nullptr;
306-
if (sparse_tensor->data()) {
307-
immutable_data = sparse_tensor->data()->data();
308-
}
309-
310-
const void* immutable_coords = nullptr;
311-
if (sparse_index_coords->data()) {
312-
immutable_coords = sparse_index_coords->data()->data();
313-
}
304+
const void* immutable_data = sparse_tensor->data()->data();
305+
const void* immutable_coords = sparse_index_coords->data()->data();
314306

315307
// Remove const =(
316308
void* mutable_data = const_cast<void*>(immutable_data);
@@ -321,20 +313,21 @@ Status SparseTensorCOOToNdarray(const std::shared_ptr<SparseTensorCOO>& sparse_t
321313
array_flags |= NPY_ARRAY_WRITEABLE;
322314
}
323315

316+
int type_num_coords;
317+
RETURN_NOT_OK(GetNumPyType(*sparse_index_coords->type(), &type_num_coords));
318+
PyArray_Descr* dtype_coords = PyArray_DescrNewFromType(type_num_coords);
319+
324320
PyObject* result_data =
325-
PyArray_NewFromDescr(&PyArray_Type, dtype_data, 1, npy_shape_data.data(), nullptr,
321+
PyArray_NewFromDescr(&PyArray_Type, dtype_data, 2, npy_shape_data.data(), nullptr,
326322
mutable_data, array_flags, nullptr);
327-
PyObject* result_coords =
328-
PyArray_NewFromDescr(&PyArray_Type, dtype_coords, 2, npy_shape_coords.data(),
329-
nullptr, mutable_coords, array_flags, nullptr);
330-
323+
PyObject* result_coords = PyArray_NewFromDescr(&PyArray_Type, dtype_coords, ndim_coords,
324+
npy_shape_coords.data(), nullptr,
325+
mutable_coords, array_flags, nullptr);
331326
RETURN_IF_PYERROR()
332327

333-
if (base == Py_None || base == nullptr) {
334-
base = py::wrap_sparse_tensor_coo(sparse_tensor);
335-
} else {
336-
Py_XINCREF(base);
337-
}
328+
Py_XINCREF(base);
329+
Py_XINCREF(base);
330+
338331
PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(result_data), base);
339332
PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(result_coords), base);
340333

@@ -376,20 +369,9 @@ Status SparseTensorCSRToNdarray(const std::shared_ptr<SparseTensorCSR>& sparse_t
376369
npy_shape_indices[i] = sparse_index_indices->shape()[i];
377370
}
378371

379-
const void* immutable_data = nullptr;
380-
if (sparse_tensor->data()) {
381-
immutable_data = sparse_tensor->data()->data();
382-
}
383-
384-
const void* immutable_indptr = nullptr;
385-
if (sparse_index_indptr->data()) {
386-
immutable_indptr = sparse_index_indptr->data()->data();
387-
}
388-
389-
const void* immutable_indices = nullptr;
390-
if (sparse_index_indices->data()) {
391-
immutable_indices = sparse_index_indices->data()->data();
392-
}
372+
const void* immutable_data = sparse_tensor->data()->data();
373+
const void* immutable_indptr = sparse_index_indptr->data()->data();
374+
const void* immutable_indices = sparse_index_indices->data()->data();
393375

394376
// Remove const =(
395377
void* mutable_data = const_cast<void*>(immutable_data);
@@ -410,21 +392,20 @@ Status SparseTensorCSRToNdarray(const std::shared_ptr<SparseTensorCSR>& sparse_t
410392
PyArray_Descr* dtype_indices = PyArray_DescrNewFromType(type_num_indices);
411393

412394
PyObject* result_data =
413-
PyArray_NewFromDescr(&PyArray_Type, dtype_data, 1, npy_shape_data.data(), nullptr,
395+
PyArray_NewFromDescr(&PyArray_Type, dtype_data, 2, npy_shape_data.data(), nullptr,
414396
mutable_data, array_flags, nullptr);
415-
PyObject* result_indptr =
416-
PyArray_NewFromDescr(&PyArray_Type, dtype_indptr, 1, npy_shape_indptr.data(),
417-
nullptr, mutable_indptr, array_flags, nullptr);
418-
PyObject* result_indices =
419-
PyArray_NewFromDescr(&PyArray_Type, dtype_indices, 1, npy_shape_indices.data(),
420-
nullptr, mutable_indices, array_flags, nullptr);
397+
PyObject* result_indptr = PyArray_NewFromDescr(&PyArray_Type, dtype_indptr, ndim_indptr,
398+
npy_shape_indptr.data(), nullptr,
399+
mutable_indptr, array_flags, nullptr);
400+
PyObject* result_indices = PyArray_NewFromDescr(
401+
&PyArray_Type, dtype_indices, ndim_indices, npy_shape_indices.data(), nullptr,
402+
mutable_indices, array_flags, nullptr);
421403
RETURN_IF_PYERROR()
422404

423-
if (base == Py_None || base == nullptr) {
424-
base = py::wrap_sparse_tensor_csr(sparse_tensor);
425-
} else {
426-
Py_XINCREF(base);
427-
}
405+
Py_XINCREF(base);
406+
Py_XINCREF(base);
407+
Py_XINCREF(base);
408+
428409
PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(result_data), base);
429410
PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(result_indptr), base);
430411
PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(result_indices), base);
@@ -435,19 +416,17 @@ Status SparseTensorCSRToNdarray(const std::shared_ptr<SparseTensorCSR>& sparse_t
435416
return Status::OK();
436417
}
437418

438-
Status NdarrayToSparseTensorCOO(MemoryPool* pool, PyObject* ao,
439-
std::shared_ptr<SparseTensorCOO>* out) {
419+
Status DenseNdarrayToSparseTensorCOO(MemoryPool* pool, PyObject* ao,
420+
const std::vector<std::string>& dim_names,
421+
std::shared_ptr<SparseTensorCOO>* out) {
440422
if (!PyArray_Check(ao)) {
441423
return Status::TypeError("Did not pass ndarray object");
442424
}
443425

444426
PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(ao);
445427

446-
// TODO(wesm): What do we want to do with non-contiguous memory and negative strides?
447-
448428
int ndim = PyArray_NDIM(ndarray);
449429

450-
// This is also holding the GIL, so don't already draw it.
451430
std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(ao);
452431
std::vector<int64_t> shape(ndim);
453432
std::vector<int64_t> strides(ndim);
@@ -467,25 +446,23 @@ Status NdarrayToSparseTensorCOO(MemoryPool* pool, PyObject* ao,
467446
std::shared_ptr<DataType> type;
468447
RETURN_NOT_OK(
469448
GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type));
470-
Tensor tensor(type, data, shape, strides);
449+
Tensor tensor(type, data, shape, strides, dim_names);
471450
*out = std::make_shared<SparseTensorCOO>(tensor);
472451
return Status::OK();
473452
}
474453
}
475454

476-
Status NdarrayToSparseTensorCSR(MemoryPool* pool, PyObject* ao,
477-
std::shared_ptr<SparseTensorCSR>* out) {
455+
Status DenseNdarrayToSparseTensorCSR(MemoryPool* pool, PyObject* ao,
456+
const std::vector<std::string>& dim_names,
457+
std::shared_ptr<SparseTensorCSR>* out) {
478458
if (!PyArray_Check(ao)) {
479459
return Status::TypeError("Did not pass ndarray object");
480460
}
481461

482462
PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(ao);
483463

484-
// TODO(wesm): What do we want to do with non-contiguous memory and negative strides?
485-
486464
int ndim = PyArray_NDIM(ndarray);
487465

488-
// This is also holding the GIL, so don't already draw it.
489466
std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(ao);
490467
std::vector<int64_t> shape(ndim);
491468
std::vector<int64_t> strides(ndim);
@@ -505,7 +482,7 @@ Status NdarrayToSparseTensorCSR(MemoryPool* pool, PyObject* ao,
505482
std::shared_ptr<DataType> type;
506483
RETURN_NOT_OK(
507484
GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type));
508-
Tensor tensor(type, data, shape, strides);
485+
Tensor tensor(type, data, shape, strides, dim_names);
509486
*out = std::make_shared<SparseTensorCSR>(tensor);
510487
return Status::OK();
511488
}
@@ -529,22 +506,34 @@ Status NdarraysToSparseTensorCOO(MemoryPool* pool, PyObject* data_ao, PyObject*
529506

530507
int coords_ndim = PyArray_NDIM(ndarray_coords);
531508

532-
std::shared_ptr<DataType> type;
509+
std::shared_ptr<DataType> type_data;
510+
std::shared_ptr<DataType> type_coords;
511+
RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)),
512+
&type_data));
513+
RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_coords)),
514+
&type_coords));
515+
ARROW_CHECK_EQ(type_coords->id(), Type::INT64);
516+
517+
const int64_t i64_size = sizeof(int64_t);
533518
npy_intp* coords_array_shape = PyArray_SHAPE(ndarray_coords);
534519
std::vector<int64_t> coords_shape(coords_ndim);
520+
std::vector<int64_t> coords_strides(coords_ndim);
535521

536522
for (int i = 0; i < coords_ndim; ++i) {
537523
coords_shape[i] = coords_array_shape[i];
524+
if (i == 0) {
525+
coords_strides[i] = i64_size;
526+
} else {
527+
coords_strides[i] = coords_strides[i - 1] * coords_array_shape[i - 1];
528+
}
538529
}
539530

540531
std::shared_ptr<NumericTensor<Int64Type>> coords =
541-
std::make_shared<NumericTensor<Int64Type>>(coords_buffer, coords_shape);
542-
543-
RETURN_NOT_OK(
544-
GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)), &type));
532+
std::make_shared<NumericTensor<Int64Type>>(coords_buffer, coords_shape,
533+
coords_strides);
545534

546535
std::shared_ptr<SparseCOOIndex> sparse_index = std::make_shared<SparseCOOIndex>(coords);
547-
*out = std::make_shared<SparseTensorImpl<SparseCOOIndex>>(sparse_index, type, data,
536+
*out = std::make_shared<SparseTensorImpl<SparseCOOIndex>>(sparse_index, type_data, data,
548537
shape, dim_names);
549538
return Status::OK();
550539
}
@@ -564,15 +553,16 @@ Status NdarraysToSparseTensorCSR(MemoryPool* pool, PyObject* data_ao, PyObject*
564553
PyArrayObject* ndarray_indptr = reinterpret_cast<PyArrayObject*>(indptr_ao);
565554
PyArrayObject* ndarray_indices = reinterpret_cast<PyArrayObject*>(indices_ao);
566555

567-
// This is also holding the GIL, so don't already draw it.
568556
std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
569557
std::shared_ptr<Buffer> indptr_buffer = std::make_shared<NumPyBuffer>(indptr_ao);
570558
std::shared_ptr<Buffer> indices_buffer = std::make_shared<NumPyBuffer>(indices_ao);
571559

572560
int indptr_ndim = PyArray_NDIM(ndarray_indptr);
573561
int indices_ndim = PyArray_NDIM(ndarray_indices);
574562

575-
std::shared_ptr<DataType> type;
563+
std::shared_ptr<DataType> type_data;
564+
std::shared_ptr<DataType> type_indptr;
565+
std::shared_ptr<DataType> type_indices;
576566
npy_intp* indptr_array_shape = PyArray_SHAPE(ndarray_indptr);
577567
npy_intp* indices_array_shape = PyArray_SHAPE(ndarray_indices);
578568
std::vector<int64_t> indptr_shape(indptr_ndim);
@@ -585,17 +575,23 @@ Status NdarraysToSparseTensorCSR(MemoryPool* pool, PyObject* data_ao, PyObject*
585575
indices_shape[i] = indices_array_shape[i];
586576
}
587577

578+
RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)),
579+
&type_data));
580+
RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_indptr)),
581+
&type_indptr));
582+
RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_indices)),
583+
&type_indices));
584+
ARROW_CHECK_EQ(type_indptr->id(), Type::INT64);
585+
ARROW_CHECK_EQ(type_indices->id(), Type::INT64);
586+
588587
std::shared_ptr<SparseCSRIndex::IndexTensor> indptr =
589588
std::make_shared<SparseCSRIndex::IndexTensor>(indptr_buffer, indptr_shape);
590589
std::shared_ptr<SparseCSRIndex::IndexTensor> indices =
591590
std::make_shared<SparseCSRIndex::IndexTensor>(indices_buffer, indices_shape);
592591

593-
RETURN_NOT_OK(
594-
GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)), &type));
595-
596592
std::shared_ptr<SparseCSRIndex> sparse_index =
597593
std::make_shared<SparseCSRIndex>(indptr, indices);
598-
*out = std::make_shared<SparseTensorImpl<SparseCSRIndex>>(sparse_index, type, data,
594+
*out = std::make_shared<SparseTensorImpl<SparseCSRIndex>>(sparse_index, type_data, data,
599595
shape, dim_names);
600596
return Status::OK();
601597
}

cpp/src/arrow/python/numpy_convert.h

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -70,38 +70,40 @@ ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
7070
ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor,
7171
PyObject* base, PyObject** out);
7272

73-
ARROW_PYTHON_EXPORT Status SparseTensorCSRToNdarray(
74-
const std::shared_ptr<SparseTensorCSR>& sparse_tensor, PyObject* base,
75-
PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
76-
7773
ARROW_PYTHON_EXPORT Status
7874
SparseTensorCOOToNdarray(const std::shared_ptr<SparseTensorCOO>& sparse_tensor,
7975
PyObject* base, PyObject** out_data, PyObject** out_coords);
8076

81-
ARROW_PYTHON_EXPORT Status NdarrayToSparseTensorCSR(
82-
MemoryPool* pool, PyObject* ao, std::shared_ptr<SparseTensorCSR>* out);
77+
ARROW_PYTHON_EXPORT Status SparseTensorCSRToNdarray(
78+
const std::shared_ptr<SparseTensorCSR>& sparse_tensor, PyObject* base,
79+
PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
8380

84-
ARROW_PYTHON_EXPORT Status NdarrayToSparseTensorCOO(
85-
MemoryPool* pool, PyObject* ao, std::shared_ptr<SparseTensorCOO>* out);
81+
ARROW_PYTHON_EXPORT Status DenseNdarrayToSparseTensorCOO(
82+
MemoryPool* pool, PyObject* ao, const std::vector<std::string>& dim_names,
83+
std::shared_ptr<SparseTensorCOO>* out);
8684

87-
ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCSR(
88-
MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
89-
const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
85+
ARROW_PYTHON_EXPORT Status DenseNdarrayToSparseTensorCSR(
86+
MemoryPool* pool, PyObject* ao, const std::vector<std::string>& dim_names,
9087
std::shared_ptr<SparseTensorCSR>* out);
9188

9289
ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCOO(
9390
MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,
9491
const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
9592
std::shared_ptr<SparseTensorCOO>* out);
9693

97-
ARROW_PYTHON_EXPORT Status
98-
TensorToSparseTensorCSR(const std::shared_ptr<Tensor>& tensor,
99-
std::shared_ptr<SparseTensorCSR>* csparse_tensor);
94+
ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCSR(
95+
MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
96+
const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
97+
std::shared_ptr<SparseTensorCSR>* out);
10098

10199
ARROW_PYTHON_EXPORT Status
102100
TensorToSparseTensorCOO(const std::shared_ptr<Tensor>& tensor,
103101
std::shared_ptr<SparseTensorCOO>* csparse_tensor);
104102

103+
ARROW_PYTHON_EXPORT Status
104+
TensorToSparseTensorCSR(const std::shared_ptr<Tensor>& tensor,
105+
std::shared_ptr<SparseTensorCSR>* csparse_tensor);
106+
105107
} // namespace py
106108
} // namespace arrow
107109

cpp/src/arrow/python/pyarrow_lib.h

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -61,18 +61,6 @@ __PYX_EXTERN_C std::shared_ptr< arrow::Table> __pyx_f_7pyarrow_3lib_pyarrow_unw
6161
__PYX_EXTERN_C std::shared_ptr< arrow::Tensor> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_tensor(PyObject *);
6262
__PYX_EXTERN_C std::shared_ptr< arrow::SparseTensorCOO> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_coo(PyObject *);
6363
__PYX_EXTERN_C std::shared_ptr< arrow::SparseTensorCSR> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_csr(PyObject *);
64-
__PYX_EXTERN_C int pyarrow_is_buffer(PyObject *);
65-
__PYX_EXTERN_C int pyarrow_is_data_type(PyObject *);
66-
__PYX_EXTERN_C int pyarrow_is_field(PyObject *);
67-
__PYX_EXTERN_C int pyarrow_is_schema(PyObject *);
68-
__PYX_EXTERN_C int pyarrow_is_array(PyObject *);
69-
__PYX_EXTERN_C PyObject *pyarrow_wrap_chunked_array(std::shared_ptr< arrow::ChunkedArray> const &);
70-
__PYX_EXTERN_C int pyarrow_is_tensor(PyObject *);
71-
__PYX_EXTERN_C int pyarrow_is_sparse_tensor_coo(PyObject *);
72-
__PYX_EXTERN_C int pyarrow_is_sparse_tensor_csr(PyObject *);
73-
__PYX_EXTERN_C int pyarrow_is_column(PyObject *);
74-
__PYX_EXTERN_C int pyarrow_is_table(PyObject *);
75-
__PYX_EXTERN_C int pyarrow_is_batch(PyObject *);
7664

7765
#endif /* !__PYX_HAVE_API__pyarrow__lib */
7866

python/pyarrow/__init__.pxd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CDataType,
2222
CField, CRecordBatch, CSchema,
2323
CTable, CTensor,
2424
CSparseTensorCSR, CSparseTensorCOO)
25-
25+
from pyarrow.compat import frombytes
2626

2727
cdef extern from "arrow/python/pyarrow.h" namespace "arrow::py":
2828
cdef int import_pyarrow() except -1

0 commit comments

Comments
 (0)