3 changes: 1 addition & 2 deletions cpp/src/arrow/compare.cc
@@ -1026,9 +1026,8 @@ struct SparseTensorEqualsImpl<SparseIndexType, SparseIndexType> {
 
   const uint8_t* left_data = left.data()->data();
   const uint8_t* right_data = right.data()->data();
-
   return memcmp(left_data, right_data,
-                static_cast<size_t>(byte_width * left.non_zero_length()));
+                static_cast<size_t>(byte_width * left.non_zero_length())) == 0;
 }
 };
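The change above fixes a real bug: memcmp returns 0 when two buffers are equal, so returning its raw value made SparseTensorEqualsImpl report false exactly when the tensor bodies matched. A minimal standalone illustration (not part of the PR):

#include <cassert>
#include <cstring>

int main() {
  const unsigned char a[4] = {1, 2, 3, 4};
  const unsigned char b[4] = {1, 2, 3, 4};
  // memcmp yields 0 for equal buffers, so the raw value is falsy exactly
  // when the contents match; hence the explicit "== 0" in the fix.
  assert(std::memcmp(a, b, sizeof(a)) == 0);
  return 0;
}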

173 changes: 153 additions & 20 deletions cpp/src/arrow/python/numpy_convert.cc
@@ -25,8 +25,10 @@
 #include <vector>
 
 #include "arrow/buffer.h"
+#include "arrow/sparse_tensor.h"
 #include "arrow/tensor.h"
 #include "arrow/type.h"
+#include "arrow/util/logging.h"
 
 #include "arrow/python/common.h"
 #include "arrow/python/pyarrow.h"
@@ -186,7 +188,9 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out) {
 
 #undef TO_ARROW_TYPE_CASE
 
-Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, std::shared_ptr<Tensor>* out) {
+Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
+                       const std::vector<std::string>& dim_names,
+                       std::shared_ptr<Tensor>* out) {
   if (!PyArray_Check(ao)) {
     return Status::TypeError("Did not pass ndarray object");
   }
@@ -197,35 +201,29 @@ Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, std::shared_ptr<Tensor>*
 
   int ndim = PyArray_NDIM(ndarray);
 
   // This also acquires the GIL, so it must not already be held here.
   std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(ao);
   std::vector<int64_t> shape(ndim);
   std::vector<int64_t> strides(ndim);
 
-  {
-    PyAcquireGIL lock;
-    npy_intp* array_strides = PyArray_STRIDES(ndarray);
-    npy_intp* array_shape = PyArray_SHAPE(ndarray);
-    for (int i = 0; i < ndim; ++i) {
-      if (array_strides[i] < 0) {
-        return Status::Invalid("Negative ndarray strides not supported");
-      }
-      shape[i] = array_shape[i];
-      strides[i] = array_strides[i];
-    }
-
-    std::shared_ptr<DataType> type;
-    RETURN_NOT_OK(
-        GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type));
-    *out = std::make_shared<Tensor>(type, data, shape, strides);
-    return Status::OK();
+  npy_intp* array_strides = PyArray_STRIDES(ndarray);
+  npy_intp* array_shape = PyArray_SHAPE(ndarray);
+  for (int i = 0; i < ndim; ++i) {
+    if (array_strides[i] < 0) {
+      return Status::Invalid("Negative ndarray strides not supported");
+    }
+    shape[i] = array_shape[i];
+    strides[i] = array_strides[i];
   }
+
+  std::shared_ptr<DataType> type;
+  RETURN_NOT_OK(
+      GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type));
+  *out = std::make_shared<Tensor>(type, data, shape, strides, dim_names);
+  return Status::OK();
 }
 
 Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
                        PyObject** out) {
-  PyAcquireGIL lock;
-
   int type_num;
   RETURN_NOT_OK(GetNumPyType(*tensor->type(), &type_num));
   PyArray_Descr* dtype = PyArray_DescrNewFromType(type_num);
@@ -274,5 +272,140 @@ Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
   return Status::OK();
 }
 
+// Wrap the dense data of a sparse tensor in an ndarray
+static Status SparseTensorDataToNdarray(const SparseTensor& sparse_tensor,
+                                        std::vector<npy_intp> data_shape, PyObject* base,
+                                        PyObject** out_data) {
+  int type_num_data;
+  RETURN_NOT_OK(GetNumPyType(*sparse_tensor.type(), &type_num_data));
+  PyArray_Descr* dtype_data = PyArray_DescrNewFromType(type_num_data);
+  RETURN_IF_PYERROR();
+
+  const void* immutable_data = sparse_tensor.data()->data();
+  // The NumPy C API takes a non-const pointer; writes are gated by the
+  // NPY_ARRAY_WRITEABLE flag below.
+  void* mutable_data = const_cast<void*>(immutable_data);
+  int array_flags = NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS;
+  if (sparse_tensor.is_mutable()) {
+    array_flags |= NPY_ARRAY_WRITEABLE;
+  }
+
+  *out_data = PyArray_NewFromDescr(&PyArray_Type, dtype_data,
+                                   static_cast<int>(data_shape.size()), data_shape.data(),
+                                   nullptr, mutable_data, array_flags, nullptr);
+  RETURN_IF_PYERROR();
+  Py_XINCREF(base);
+  PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(*out_data), base);
+  return Status::OK();
+}
+
+Status SparseTensorCOOToNdarray(const std::shared_ptr<SparseTensorCOO>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_coords) {
+  const auto& sparse_index = arrow::internal::checked_cast<const SparseCOOIndex&>(
+      *sparse_tensor->sparse_index());
+
+  // Wrap tensor data
+  OwnedRef result_data;
+  RETURN_NOT_OK(SparseTensorDataToNdarray(
+      *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref()));
+
+  // Wrap indices
+  PyObject* result_coords;
+  RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, &result_coords));
+
+  *out_data = result_data.detach();
+  *out_coords = result_coords;
+  return Status::OK();
+}
+
+Status SparseTensorCSRToNdarray(const std::shared_ptr<SparseTensorCSR>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_indptr, PyObject** out_indices) {
+  const auto& sparse_index = arrow::internal::checked_cast<const SparseCSRIndex&>(
+      *sparse_tensor->sparse_index());
+
+  // Wrap tensor data
+  OwnedRef result_data;
+  RETURN_NOT_OK(SparseTensorDataToNdarray(
+      *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref()));
+
+  // Wrap indices
+  OwnedRef result_indptr;
+  OwnedRef result_indices;
+  RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr(), base, result_indptr.ref()));
+  RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, result_indices.ref()));
+
+  *out_data = result_data.detach();
+  *out_indptr = result_indptr.detach();
+  *out_indices = result_indices.detach();
+  return Status::OK();
+}
+
+Status NdarraysToSparseTensorCOO(MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,
+                                 const std::vector<int64_t>& shape,
+                                 const std::vector<std::string>& dim_names,
+                                 std::shared_ptr<SparseTensorCOO>* out) {
+  if (!PyArray_Check(data_ao) || !PyArray_Check(coords_ao)) {
+    return Status::TypeError("Did not pass ndarray object");
+  }
+
+  PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
+  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
+  std::shared_ptr<DataType> type_data;
+  RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)),
+                              &type_data));
+
+  std::shared_ptr<Tensor> coords;
+  RETURN_NOT_OK(NdarrayToTensor(pool, coords_ao, {}, &coords));
+  ARROW_CHECK_EQ(coords->type_id(), Type::INT64);  // Should be ensured by caller
+
+  std::shared_ptr<SparseCOOIndex> sparse_index = std::make_shared<SparseCOOIndex>(
+      std::static_pointer_cast<NumericTensor<Int64Type>>(coords));
+  *out = std::make_shared<SparseTensorImpl<SparseCOOIndex>>(sparse_index, type_data, data,
+                                                            shape, dim_names);
+  return Status::OK();
+}
+
+Status NdarraysToSparseTensorCSR(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao,
+                                 PyObject* indices_ao, const std::vector<int64_t>& shape,
+                                 const std::vector<std::string>& dim_names,
+                                 std::shared_ptr<SparseTensorCSR>* out) {
+  if (!PyArray_Check(data_ao) || !PyArray_Check(indptr_ao) ||
+      !PyArray_Check(indices_ao)) {
+    return Status::TypeError("Did not pass ndarray object");
+  }
+
+  PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
+  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
+  std::shared_ptr<DataType> type_data;
+  RETURN_NOT_OK(GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)),
+                              &type_data));
+
+  std::shared_ptr<Tensor> indptr, indices;
+  RETURN_NOT_OK(NdarrayToTensor(pool, indptr_ao, {}, &indptr));
+  RETURN_NOT_OK(NdarrayToTensor(pool, indices_ao, {}, &indices));
+  ARROW_CHECK_EQ(indptr->type_id(), Type::INT64);   // Should be ensured by caller
+  ARROW_CHECK_EQ(indices->type_id(), Type::INT64);  // Should be ensured by caller
+
+  auto sparse_index = std::make_shared<SparseCSRIndex>(
+      std::static_pointer_cast<NumericTensor<Int64Type>>(indptr),
+      std::static_pointer_cast<NumericTensor<Int64Type>>(indices));
+  *out = std::make_shared<SparseTensorImpl<SparseCSRIndex>>(sparse_index, type_data, data,
+                                                            shape, dim_names);
+  return Status::OK();
+}
+
+Status TensorToSparseTensorCOO(const std::shared_ptr<Tensor>& tensor,
+                               std::shared_ptr<SparseTensorCOO>* out) {
+  *out = std::make_shared<SparseTensorCOO>(*tensor);
+  return Status::OK();
+}
+
+Status TensorToSparseTensorCSR(const std::shared_ptr<Tensor>& tensor,
+                               std::shared_ptr<SparseTensorCSR>* out) {
+  *out = std::make_shared<SparseTensorCSR>(*tensor);
+  return Status::OK();
+}
+
 }  // namespace py
 }  // namespace arrow
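Taken together, the new routines round-trip between NumPy ndarrays and Arrow sparse tensors. A hedged sketch of how a caller might exercise them (the helper name RoundTripCOO, the 3x4 shape, and the dimension names are illustrative, not from the PR; the inputs are assumed to satisfy the preconditions noted in the code above):

#include <Python.h>

#include <memory>

#include "arrow/memory_pool.h"
#include "arrow/python/numpy_convert.h"
#include "arrow/python/pyarrow.h"
#include "arrow/status.h"

// Hypothetical helper: data_ao is a 1-D values ndarray, coords_ao an int64
// ndarray of shape (non_zero_length, ndim), as callers are expected to ensure.
arrow::Status RoundTripCOO(PyObject* data_ao, PyObject* coords_ao) {
  std::shared_ptr<arrow::SparseTensorCOO> st;
  ARROW_RETURN_NOT_OK(arrow::py::NdarraysToSparseTensorCOO(
      arrow::default_memory_pool(), data_ao, coords_ao,
      /*shape=*/{3, 4}, /*dim_names=*/{"row", "col"}, &st));

  // Wrap the C++ sparse tensor as a Python object; it serves as the base
  // that keeps the Arrow buffers alive while the ndarrays reference them.
  PyObject* base = arrow::py::wrap_sparse_tensor_coo(st);
  PyObject* out_data = nullptr;
  PyObject* out_coords = nullptr;
  ARROW_RETURN_NOT_OK(
      arrow::py::SparseTensorCOOToNdarray(st, base, &out_data, &out_coords));

  Py_XDECREF(out_data);
  Py_XDECREF(out_coords);
  Py_DECREF(base);
  return arrow::Status::OK();
}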
29 changes: 29 additions & 0 deletions cpp/src/arrow/python/numpy_convert.h
@@ -25,9 +25,11 @@
 
 #include <memory>
 #include <string>
+#include <vector>
 
 #include "arrow/buffer.h"
 #include "arrow/python/visibility.h"
+#include "arrow/sparse_tensor.h"
 
 namespace arrow {
 
@@ -63,11 +65,38 @@ Status GetTensorType(PyObject* dtype, std::shared_ptr<DataType>* out);
 Status GetNumPyType(const DataType& type, int* type_num);
 
 ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
+                                           const std::vector<std::string>& dim_names,
                                            std::shared_ptr<Tensor>* out);
 
 ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor,
                                            PyObject* base, PyObject** out);
 
+ARROW_PYTHON_EXPORT Status
+SparseTensorCOOToNdarray(const std::shared_ptr<SparseTensorCOO>& sparse_tensor,
+                         PyObject* base, PyObject** out_data, PyObject** out_coords);
+
+ARROW_PYTHON_EXPORT Status SparseTensorCSRToNdarray(
+    const std::shared_ptr<SparseTensorCSR>& sparse_tensor, PyObject* base,
+    PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCOO(
+    MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,
+    const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+    std::shared_ptr<SparseTensorCOO>* out);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCSR(
+    MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
+    const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+    std::shared_ptr<SparseTensorCSR>* out);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseTensorCOO(const std::shared_ptr<Tensor>& tensor,
+                        std::shared_ptr<SparseTensorCOO>* csparse_tensor);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseTensorCSR(const std::shared_ptr<Tensor>& tensor,
+                        std::shared_ptr<SparseTensorCSR>* csparse_tensor);
+
 }  // namespace py
 }  // namespace arrow
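Note the updated NdarrayToTensor declaration now takes dimension names; an empty vector leaves the dimensions unnamed, matching the internal calls in numpy_convert.cc above. A hedged sketch of the call (the helper name MakeNamedTensor and the names are illustrative):

#include <Python.h>

#include <memory>

#include "arrow/python/numpy_convert.h"

// Hypothetical helper: pool and ao (a 2-D ndarray) supplied by the caller.
arrow::Status MakeNamedTensor(arrow::MemoryPool* pool, PyObject* ao,
                              std::shared_ptr<arrow::Tensor>* out) {
  // Pass {} instead to leave the dimensions unnamed.
  return arrow::py::NdarrayToTensor(pool, ao, {"row", "col"}, out);
}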

38 changes: 38 additions & 0 deletions cpp/src/arrow/python/pyarrow.cc
@@ -123,6 +123,44 @@ PyObject* wrap_tensor(const std::shared_ptr<Tensor>& tensor) {
   return ::pyarrow_wrap_tensor(tensor);
 }
 
+bool is_sparse_tensor_csr(PyObject* sparse_tensor) {
+  return ::pyarrow_is_sparse_tensor_csr(sparse_tensor) != 0;
+}
+
+Status unwrap_sparse_tensor_csr(PyObject* sparse_tensor,
+                                std::shared_ptr<SparseTensorCSR>* out) {
+  *out = ::pyarrow_unwrap_sparse_tensor_csr(sparse_tensor);
+  if (*out) {
+    return Status::OK();
+  } else {
+    return Status::Invalid(
+        "Could not unwrap SparseTensorCSR from the passed Python object.");
+  }
+}
+
+PyObject* wrap_sparse_tensor_csr(const std::shared_ptr<SparseTensorCSR>& sparse_tensor) {
+  return ::pyarrow_wrap_sparse_tensor_csr(sparse_tensor);
+}
+
+bool is_sparse_tensor_coo(PyObject* sparse_tensor) {
+  return ::pyarrow_is_sparse_tensor_coo(sparse_tensor) != 0;
+}
+
+Status unwrap_sparse_tensor_coo(PyObject* sparse_tensor,
+                                std::shared_ptr<SparseTensorCOO>* out) {
+  *out = ::pyarrow_unwrap_sparse_tensor_coo(sparse_tensor);
+  if (*out) {
+    return Status::OK();
+  } else {
+    return Status::Invalid(
+        "Could not unwrap SparseTensorCOO from the passed Python object.");
+  }
+}
+
+PyObject* wrap_sparse_tensor_coo(const std::shared_ptr<SparseTensorCOO>& sparse_tensor) {
+  return ::pyarrow_wrap_sparse_tensor_coo(sparse_tensor);
+}
+
 bool is_column(PyObject* column) { return ::pyarrow_is_column(column) != 0; }
 
 Status unwrap_column(PyObject* column, std::shared_ptr<Column>* out) {
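These helpers mirror the existing tensor wrap/unwrap API. A hedged sketch of typical use in C++ extension code (the function name ProcessSparseCOO is illustrative; assumes arrow::py::import_pyarrow() has already succeeded):

#include <Python.h>

#include <memory>

#include "arrow/python/pyarrow.h"
#include "arrow/status.h"

// Hypothetical extension entry point: unwrap, inspect, and re-wrap.
arrow::Status ProcessSparseCOO(PyObject* obj, PyObject** out) {
  if (!arrow::py::is_sparse_tensor_coo(obj)) {
    return arrow::Status::TypeError("Expected a pyarrow SparseTensorCOO");
  }
  std::shared_ptr<arrow::SparseTensorCOO> st;
  ARROW_RETURN_NOT_OK(arrow::py::unwrap_sparse_tensor_coo(obj, &st));
  // ... operate on *st here ...
  *out = arrow::py::wrap_sparse_tensor_coo(st);  // new reference; nullptr on error
  return *out != nullptr ? arrow::Status::OK()
                         : arrow::Status::UnknownError("wrap failed");
}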
14 changes: 14 additions & 0 deletions cpp/src/arrow/python/pyarrow.h
@@ -24,6 +24,8 @@
 
 #include "arrow/python/visibility.h"
 
+#include "arrow/sparse_tensor.h"
+
 namespace arrow {
 
 class Array;
@@ -67,6 +69,18 @@ ARROW_PYTHON_EXPORT bool is_tensor(PyObject* tensor);
 ARROW_PYTHON_EXPORT Status unwrap_tensor(PyObject* tensor, std::shared_ptr<Tensor>* out);
 ARROW_PYTHON_EXPORT PyObject* wrap_tensor(const std::shared_ptr<Tensor>& tensor);
 
+ARROW_PYTHON_EXPORT bool is_sparse_tensor_coo(PyObject* sparse_tensor);
+ARROW_PYTHON_EXPORT Status
+unwrap_sparse_tensor_coo(PyObject* sparse_tensor, std::shared_ptr<SparseTensorCOO>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_sparse_tensor_coo(
+    const std::shared_ptr<SparseTensorCOO>& sparse_tensor);
+
+ARROW_PYTHON_EXPORT bool is_sparse_tensor_csr(PyObject* sparse_tensor);
+ARROW_PYTHON_EXPORT Status
+unwrap_sparse_tensor_csr(PyObject* sparse_tensor, std::shared_ptr<SparseTensorCSR>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_sparse_tensor_csr(
+    const std::shared_ptr<SparseTensorCSR>& sparse_tensor);
+
 ARROW_PYTHON_EXPORT bool is_column(PyObject* column);
 ARROW_PYTHON_EXPORT Status unwrap_column(PyObject* column, std::shared_ptr<Column>* out);
 ARROW_PYTHON_EXPORT PyObject* wrap_column(const std::shared_ptr<Column>& column);