Skip to content
This repository was archived by the owner on Oct 21, 2024. It is now read-only.

Commit d9ff47e

Browse files
committed
Further work and implementing review feedback.
1 parent 24a831f commit d9ff47e

File tree

2 files changed

+290
-159
lines changed

2 files changed

+290
-159
lines changed

cpp/src/arrow/sparse_tensor.cc

Lines changed: 69 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -444,56 +444,70 @@ class SparseTensorConverter<TYPE, SparseCSFIndex>
444444

445445
const int64_t ndim = tensor_.ndim();
446446
std::vector<int64_t> axis_order = internal::ArgSort(tensor_.shape());
447+
int64_t nonzero_count = -1;
448+
RETURN_NOT_OK(tensor_.CountNonZero(&nonzero_count));
447449

448-
if (ndim < 2) {
449-
// LCOV_EXCL_START: The following invalid causes program failure.
450-
return Status::Invalid("Invalid tensor dimension");
451-
// LCOV_EXCL_STOP
452-
}
450+
std::shared_ptr<Buffer> values_buffer;
451+
RETURN_NOT_OK(
452+
AllocateBuffer(pool_, sizeof(value_type) * nonzero_count, &values_buffer));
453+
value_type* values = reinterpret_cast<value_type*>(values_buffer->mutable_data());
453454

454-
std::shared_ptr<SparseCOOTensor> sparse_coo_tensor;
455-
ARROW_ASSIGN_OR_RAISE(sparse_coo_tensor, SparseCOOTensor::Make(tensor_));
456-
std::shared_ptr<Tensor> coords =
457-
arrow::internal::checked_pointer_cast<SparseCOOIndex>(
458-
sparse_coo_tensor->sparse_index())
459-
->indices();
455+
std::vector<int64_t> counts(ndim, 0);
456+
std::vector<int64_t> coord(ndim, 0);
457+
std::vector<int64_t> previous_coord(ndim, -1);
458+
std::vector<TypedBufferBuilder<c_index_value_type>> indptr_buffer_builders(ndim - 1);
459+
std::vector<TypedBufferBuilder<c_index_value_type>> indices_buffer_builders(ndim);
460460

461-
// TODO(rok): Coords should be sorted with axis_order priority to improve compression.
462-
// ARROW-4221 would help here as well.
461+
if (ndim <= 1) {
462+
return Status::NotImplemented("TODO for ndim <= 1");
463+
} else {
464+
const std::vector<int64_t>& shape = tensor_.shape();
465+
for (int64_t n = tensor_.size(); n > 0; n--) {
466+
const value_type x = tensor_.Value(coord);
463467

464-
// Convert SparseCOOTensor to long CSF buffers
465-
const int64_t nonzero_count = sparse_coo_tensor->non_zero_length();
468+
if (tensor_.Value(coord) != 0) {
469+
bool tree_split = false;
470+
*values++ = x;
466471

467-
std::vector<int64_t> counts(ndim);
468-
std::fill_n(counts.begin(), ndim, static_cast<int64_t>(0));
469-
std::vector<TypedBufferBuilder<c_index_value_type>> indptr_buffer_builders(ndim - 1);
470-
std::vector<TypedBufferBuilder<c_index_value_type>> indices_buffer_builders(ndim);
472+
for (int64_t i = 0; i < ndim; ++i) {
473+
int64_t dimension = axis_order[i];
474+
bool change = coord[dimension] != previous_coord[dimension];
475+
476+
if (tree_split || change) {
477+
if (change) tree_split = true;
478+
479+
if (i < ndim - 1)
480+
RETURN_NOT_OK(indptr_buffer_builders[i].Append(
481+
static_cast<c_index_value_type>(counts[dimension + 1])));
482+
RETURN_NOT_OK(indices_buffer_builders[i].Append(
483+
static_cast<c_index_value_type>(coord[dimension])));
484+
++counts[dimension];
485+
}
486+
}
487+
previous_coord = coord;
488+
}
471489

472-
for (int64_t row = 0; row < nonzero_count; ++row) {
473-
bool tree_split = false;
474-
for (int64_t column = 0; column < ndim; ++column) {
475-
int64_t dimension = axis_order[column];
476-
bool change = coords->Value<IndexValueType>({row, dimension}) !=
477-
coords->Value<IndexValueType>({row - 1, dimension});
478-
479-
if (tree_split || change || row == 0) {
480-
if (row > 1 || change) tree_split = true;
481-
482-
if (column < ndim - 1)
483-
RETURN_NOT_OK(indptr_buffer_builders[column].Append(
484-
static_cast<c_index_value_type>(counts[column + 1])));
485-
RETURN_NOT_OK(
486-
indices_buffer_builders[column].Append(static_cast<c_index_value_type>(
487-
coords->Value<IndexValueType>({row, dimension}))));
488-
++counts[column];
490+
// increment index
491+
++coord[ndim - 1];
492+
if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) {
493+
int64_t d = ndim - 1;
494+
while (d > 0 && coord[d] == shape[d]) {
495+
coord[d] = 0;
496+
++coord[d - 1];
497+
--d;
498+
}
489499
}
490500
}
491501
}
502+
492503
for (int64_t column = 0; column < ndim - 1; ++column) {
493504
RETURN_NOT_OK(indptr_buffer_builders[column].Append(
494505
static_cast<c_index_value_type>(counts[column + 1])));
495506
}
496507

508+
// make results
509+
data = values_buffer;
510+
497511
std::vector<std::shared_ptr<Buffer>> indptr_buffers(ndim - 1);
498512
std::vector<std::shared_ptr<Buffer>> indices_buffers(ndim);
499513
std::vector<int64_t> indptr_shapes(counts.begin(), counts.end() - 1);
@@ -509,7 +523,6 @@ class SparseTensorConverter<TYPE, SparseCSFIndex>
509523
ARROW_ASSIGN_OR_RAISE(
510524
sparse_index, SparseCSFIndex::Make(index_value_type_, indices_shapes, axis_order,
511525
indptr_buffers, indices_buffers));
512-
data = sparse_coo_tensor->data();
513526
return Status::OK();
514527
}
515528

@@ -647,11 +660,14 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor,
647660
}
648661
}
649662

663+
namespace {
664+
650665
template <typename TYPE, typename IndexValueType>
651-
void assign_values(int64_t dimension, int64_t offset, int64_t first_ptr, int64_t last_ptr,
652-
const SparseCSFIndex* sparse_index, const int64_t* raw_data,
653-
const std::vector<int64_t> strides,
654-
const std::vector<int64_t> axis_order, TYPE* out) {
666+
void ExpandSparseCSFTensorValues(int64_t dimension, int64_t offset, int64_t first_ptr,
667+
int64_t last_ptr, const SparseCSFIndex* sparse_index,
668+
const int64_t* raw_data,
669+
const std::vector<int64_t> strides,
670+
const std::vector<int64_t> axis_order, TYPE* out) {
655671
int64_t ndim = axis_order.size();
656672

657673
for (int64_t i = first_ptr; i < last_ptr; ++i) {
@@ -660,7 +676,7 @@ void assign_values(int64_t dimension, int64_t offset, int64_t first_ptr, int64_t
660676
strides[axis_order[dimension]];
661677

662678
if (dimension < ndim - 1)
663-
assign_values<TYPE, IndexValueType>(
679+
ExpandSparseCSFTensorValues<TYPE, IndexValueType>(
664680
dimension + 1, tmp_offset,
665681
sparse_index->indptr()[dimension]->Value<IndexValueType>({i}),
666682
sparse_index->indptr()[dimension]->Value<IndexValueType>({i + 1}), sparse_index,
@@ -670,6 +686,8 @@ void assign_values(int64_t dimension, int64_t offset, int64_t first_ptr, int64_t
670686
}
671687
}
672688

689+
} // namespace
690+
673691
template <typename TYPE, typename IndexValueType>
674692
Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
675693
std::shared_ptr<Tensor>* out) {
@@ -753,13 +771,9 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t
753771
case SparseTensorFormat::CSF: {
754772
const auto& sparse_index =
755773
internal::checked_cast<const SparseCSFIndex&>(*sparse_tensor->sparse_index());
756-
int64_t last_ptr_index = sparse_index.indptr()[0]->size() - 1;
757-
int64_t first_ptr = sparse_index.indptr()[0]->Value<IndexValueType>({0});
758-
int64_t last_ptr =
759-
sparse_index.indptr()[0]->Value<IndexValueType>({last_ptr_index});
760774

761-
assign_values<value_type, IndexValueType>(
762-
0, 0, first_ptr, last_ptr, &sparse_index,
775+
ExpandSparseCSFTensorValues<value_type, IndexValueType>(
776+
0, 0, 0, sparse_index.indptr()[0]->size() - 1, &sparse_index,
763777
reinterpret_cast<const int64_t*>(sparse_tensor->raw_data()), strides,
764778
sparse_index.axis_order(), values);
765779
*out = std::make_shared<Tensor>(sparse_tensor->type(), values_buffer,
@@ -985,10 +999,9 @@ Result<std::shared_ptr<SparseCSFIndex>> SparseCSFIndex::Make(
985999
indices[i] = std::make_shared<Tensor>(indices_type, indices_data[i],
9861000
std::vector<int64_t>({indices_shapes[i]}));
9871001

988-
ARROW_CHECK(CheckSparseCSFIndexValidity(indptr_type, indices_type, indptr.size(),
989-
indices.size(), indptr.back()->shape(),
990-
indices.back()->shape(), axis_order.size())
991-
.ok());
1002+
RETURN_NOT_OK(CheckSparseCSFIndexValidity(indptr_type, indices_type, indptr.size(),
1003+
indices.size(), indptr.back()->shape(),
1004+
indices.back()->shape(), axis_order.size()));
9921005

9931006
return std::make_shared<SparseCSFIndex>(indptr, indices, axis_order);
9941007
}
@@ -997,15 +1010,13 @@ Result<std::shared_ptr<SparseCSFIndex>> SparseCSFIndex::Make(
9971010
SparseCSFIndex::SparseCSFIndex(std::vector<std::shared_ptr<Tensor>>& indptr,
9981011
std::vector<std::shared_ptr<Tensor>>& indices,
9991012
const std::vector<int64_t>& axis_order)
1000-
: SparseIndexBase(indices.back()->shape()[0]),
1013+
: SparseIndexBase(indices.back()->size()),
10011014
indptr_(indptr),
10021015
indices_(indices),
10031016
axis_order_(axis_order) {
1004-
ARROW_CHECK(CheckSparseCSFIndexValidity(indptr_.front()->type(),
1005-
indices_.front()->type(), indptr_.size(),
1006-
indices_.size(), indptr_.back()->shape(),
1007-
indices_.back()->shape(), axis_order_.size())
1008-
.ok());
1017+
ARROW_CHECK_OK(CheckSparseCSFIndexValidity(
1018+
indptr_.front()->type(), indices_.front()->type(), indptr_.size(), indices_.size(),
1019+
indptr_.back()->shape(), indices_.back()->shape(), axis_order_.size()));
10091020
}
10101021

10111022
std::string SparseCSFIndex::ToString() const { return std::string("SparseCSFIndex"); }

0 commit comments

Comments (0)