Skip to content
This repository was archived by the owner on Oct 21, 2024. It is now read-only.

Commit eb51947

Browse files
committed
Switching SparseCSFIndex to '2D' data structure.
1 parent a322ff5 commit eb51947

File tree

5 files changed

+213
-256
lines changed

5 files changed

+213
-256
lines changed

cpp/src/arrow/compare.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1194,6 +1194,12 @@ inline bool SparseTensorEqualsImplDispatch(const SparseTensorImpl<SparseIndexTyp
11941194
checked_cast<const SparseTensorImpl<SparseCSCIndex>&>(right);
11951195
return SparseTensorEqualsImpl<SparseIndexType, SparseCSCIndex>::Compare(left,
11961196
right_csc);
1197+
1198+
case SparseTensorFormat::CSF: {
1199+
const auto& right_csf =
1200+
checked_cast<const SparseTensorImpl<SparseCSFIndex>&>(right);
1201+
return SparseTensorEqualsImpl<SparseIndexType, SparseCSFIndex>::Compare(left,
1202+
right_csf);
11971203
}
11981204

11991205
default:
@@ -1230,6 +1236,10 @@ bool SparseTensorEquals(const SparseTensor& left, const SparseTensor& right) {
12301236
case SparseTensorFormat::CSC: {
12311237
const auto& left_csc = checked_cast<const SparseTensorImpl<SparseCSCIndex>&>(left);
12321238
return SparseTensorEqualsImplDispatch(left_csc, right);
1239+
1240+
case SparseTensorFormat::CSF: {
1241+
const auto& left_csf = checked_cast<const SparseTensorImpl<SparseCSFIndex>&>(left);
1242+
return SparseTensorEqualsImplDispatch(left_csf, right);
12331243
}
12341244

12351245
default:

cpp/src/arrow/sparse_tensor.cc

Lines changed: 89 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <memory>
2424
#include <numeric>
2525

26+
#include "arrow/buffer_builder.h"
2627
#include "arrow/compare.h"
2728
#include "arrow/util/checked_cast.h"
2829
#include "arrow/util/logging.h"
@@ -439,10 +440,9 @@ class SparseTensorConverter<TYPE, SparseCSFIndex>
439440
Status Convert() {
440441
using c_index_value_type = typename IndexValueType::c_type;
441442
RETURN_NOT_OK(CheckMaximumValue(std::numeric_limits<c_index_value_type>::max()));
442-
const int64_t indices_elsize = sizeof(c_index_value_type);
443443

444444
std::shared_ptr<SparseCOOTensor> sparse_coo_tensor;
445-
RETURN_NOT_OK(SparseCOOTensor::Make(tensor_, &sparse_coo_tensor));
445+
ARROW_ASSIGN_OR_RAISE(sparse_coo_tensor, SparseCOOTensor::Make(tensor_));
446446
std::shared_ptr<Tensor> coords =
447447
arrow::internal::checked_pointer_cast<SparseCOOIndex>(
448448
sparse_coo_tensor->sparse_index())
@@ -458,14 +458,8 @@ class SparseTensorConverter<TYPE, SparseCSFIndex>
458458
std::vector<int64_t> axis_order(ndim);
459459
for (int64_t i = 0; i < ndim; ++i) axis_order[i] = i;
460460

461-
std::shared_ptr<Buffer> indices_buffer;
462-
std::shared_ptr<Buffer> indptr_buffer;
463-
RETURN_NOT_OK(
464-
AllocateBuffer(pool_, indices_elsize * ndim * nonzero_count, &indices_buffer));
465-
RETURN_NOT_OK(AllocateBuffer(pool_, indices_elsize * (ndim - 1) * (nonzero_count + 1),
466-
&indptr_buffer));
467-
auto* indices = reinterpret_cast<c_index_value_type*>(indices_buffer->mutable_data());
468-
auto* indptr = reinterpret_cast<c_index_value_type*>(indptr_buffer->mutable_data());
461+
std::vector<TypedBufferBuilder<c_index_value_type>> indptr_buffer_builders(ndim - 1);
462+
std::vector<TypedBufferBuilder<c_index_value_type>> indices_buffer_builders(ndim);
469463

470464
for (int64_t row = 0; row < nonzero_count; ++row) {
471465
bool tree_split = false;
@@ -476,73 +470,37 @@ class SparseTensorConverter<TYPE, SparseCSFIndex>
476470
if (tree_split || change || row == 0) {
477471
if (row > 1) tree_split = true;
478472

479-
indices[column * nonzero_count + counts[column]] =
480-
static_cast<c_index_value_type>(
481-
coords->Value<IndexValueType>({row, column}));
482-
indptr[column * (nonzero_count + 1) + counts[column]] =
483-
static_cast<c_index_value_type>(counts[column + 1]);
473+
if (column < ndim - 1)
474+
RETURN_NOT_OK(indptr_buffer_builders[column].Append(
475+
static_cast<c_index_value_type>(counts[column + 1])));
476+
RETURN_NOT_OK(
477+
indices_buffer_builders[column].Append(static_cast<c_index_value_type>(
478+
coords->Value<IndexValueType>({row, column}))));
484479
++counts[column];
485480
}
486481
}
487482
}
488-
489-
for (int64_t column = 0; column < ndim; ++column) {
490-
indptr[column * (nonzero_count + 1) + counts[column]] =
491-
static_cast<c_index_value_type>(counts[column + 1]);
483+
for (int64_t column = 0; column < ndim - 1; ++column) {
484+
RETURN_NOT_OK(indptr_buffer_builders[column].Append(
485+
static_cast<c_index_value_type>(counts[column + 1])));
492486
}
493487

494-
// Remove gaps from buffers
495-
int64_t total_size = counts[0];
496-
for (int64_t column = 1; column < ndim; ++column) {
497-
for (int64_t i = 0; i < counts[column] + 1; ++i) {
498-
if (column < ndim - 1)
499-
indptr[total_size + column + i] = indptr[column * (nonzero_count + 1) + i];
500-
if (i < counts[column])
501-
indices[total_size + i] = indices[column * nonzero_count + i];
502-
}
503-
total_size += counts[column];
504-
}
488+
std::vector<std::shared_ptr<Buffer>> indptr_buffers(ndim - 1);
489+
std::vector<std::shared_ptr<Buffer>> indices_buffers(ndim);
490+
std::vector<int64_t> indptr_shapes(counts.begin(), counts.end() - 1);
491+
std::vector<int64_t> indices_shapes = counts;
505492

506-
// Copy CSF index data into smaller buffers
507-
std::shared_ptr<Buffer> out_indices_buffer;
508-
std::shared_ptr<Buffer> out_indptr_buffer;
509-
RETURN_NOT_OK(
510-
AllocateBuffer(pool_, indices_elsize * total_size, &out_indices_buffer));
511-
RETURN_NOT_OK(AllocateBuffer(pool_,
512-
indices_elsize * total_size - nonzero_count + ndim - 1,
513-
&out_indptr_buffer));
514-
auto* out_indices =
515-
reinterpret_cast<c_index_value_type*>(out_indices_buffer->mutable_data());
516-
auto* out_indptr =
517-
reinterpret_cast<c_index_value_type*>(out_indptr_buffer->mutable_data());
518-
519-
for (int64_t i = 0; i < total_size; ++i) out_indices[i] = indices[i];
520-
521-
for (int64_t i = 0; i < total_size - nonzero_count + ndim - 1; ++i)
522-
out_indptr[i] = indptr[i];
523-
524-
// Construct SparseCSFTensor
525-
std::vector<int64_t> out_indptr_shape({total_size - nonzero_count + ndim - 1});
526-
std::vector<int64_t> out_indices_shape({total_size});
527-
528-
std::vector<int64_t> indptr_offsets(ndim - 1);
529-
std::vector<int64_t> indices_offsets(ndim);
530-
std::fill_n(indptr_offsets.begin(), ndim - 1, static_cast<int64_t>(0));
531-
std::fill_n(indices_offsets.begin(), ndim, static_cast<int64_t>(0));
532-
533-
for (int64_t i = 0; i < ndim - 2; ++i)
534-
indptr_offsets[i + 1] = indptr_offsets[i] + counts[i] + 1;
535-
536-
for (int64_t i = 0; i < ndim; ++i)
537-
indices_offsets[i + 1] = indices_offsets[i] + counts[i];
538-
539-
sparse_index = std::make_shared<SparseCSFIndex>(
540-
std::make_shared<Tensor>(index_value_type_, out_indptr_buffer, out_indptr_shape),
541-
std::make_shared<Tensor>(index_value_type_, out_indices_buffer,
542-
out_indices_shape),
543-
indptr_offsets, indices_offsets, axis_order);
544-
data = sparse_coo_tensor->data();
493+
for (int64_t column = 0; column < ndim; ++column)
494+
RETURN_NOT_OK(
495+
indices_buffer_builders[column].Finish(&indices_buffers[column], true));
496+
497+
for (int64_t column = 0; column < ndim - 1; ++column)
498+
RETURN_NOT_OK(indptr_buffer_builders[column].Finish(&indptr_buffers[column], true));
545499

500+
ARROW_ASSIGN_OR_RAISE(
501+
sparse_index, SparseCSFIndex::Make(index_value_type_, indices_shapes, axis_order,
502+
indptr_buffers, indices_buffers));
503+
data = sparse_coo_tensor->data();
546504
return Status::OK();
547505
}
548506

@@ -686,23 +644,19 @@ void assign_values(int64_t dimension_index, int64_t offset, int64_t first_ptr,
686644
const int64_t* raw_data, const std::vector<int64_t> strides,
687645
const std::vector<int64_t> axis_order, TYPE* out) {
688646
auto dimension = axis_order[dimension_index];
689-
auto indices_offset = sparse_index->indices_offsets()[dimension];
690-
auto indptr_offset = sparse_index->indptr_offsets()[dimension];
691-
int64_t ndim = sparse_index->indices_offsets().size();
692-
693-
if (dimension == 0 && ndim > 1)
694-
last_ptr = sparse_index->indptr_offsets()[dimension + 1] - 1;
647+
int64_t ndim = axis_order.size();
648+
if (dimension == 0 && ndim > 1) last_ptr = sparse_index->indptr()[0]->size() - 1;
695649

696650
for (int64_t i = first_ptr; i < last_ptr; ++i) {
697651
int64_t tmp_offset =
698-
offset + sparse_index->indices()->Value<IndexValueType>({indices_offset + i}) *
652+
offset + sparse_index->indices()[dimension]->Value<IndexValueType>({i}) *
699653
strides[dimension];
700654
if (dimension_index < ndim - 1)
701655
assign_values<TYPE, IndexValueType>(
702656
dimension + 1, tmp_offset,
703-
sparse_index->indptr()->Value<IndexValueType>({indptr_offset + i}),
704-
sparse_index->indptr()->Value<IndexValueType>({indptr_offset + i + 1}),
705-
sparse_index, raw_data, strides, axis_order, out);
657+
sparse_index->indptr()[dimension]->Value<IndexValueType>({i}),
658+
sparse_index->indptr()[dimension]->Value<IndexValueType>({i + 1}), sparse_index,
659+
raw_data, strides, axis_order, out);
706660
else
707661
out[tmp_offset] = static_cast<TYPE>(raw_data[i]);
708662
}
@@ -840,8 +794,8 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t
840794
case SparseTensorFormat::CSF: {
841795
const auto& sparse_index =
842796
internal::checked_cast<const SparseCSFIndex&>(*sparse_tensor->sparse_index());
843-
const std::shared_ptr<const Tensor> indices = sparse_index.indices();
844-
type = indices->type();
797+
const std::vector<std::shared_ptr<Tensor>> indices = sparse_index.indices();
798+
type = indices[0]->type();
845799
break;
846800
}
847801
// LCOV_EXCL_START: ignore program failure
@@ -975,40 +929,68 @@ void CheckSparseCSXIndexValidity(const std::shared_ptr<DataType>& indptr_type,
975929
// ----------------------------------------------------------------------
976930
// SparseCSFIndex
977931

978-
Status SparseCSFIndex::Make(const std::shared_ptr<DataType> indices_type,
979-
const std::vector<int64_t>& indptr_shape,
980-
const std::vector<int64_t>& indices_shape,
981-
const std::vector<int64_t>& indptr_offsets,
982-
const std::vector<int64_t>& indices_offsets,
983-
const std::vector<int64_t>& axis_order,
984-
std::shared_ptr<Buffer> indptr_data,
985-
std::shared_ptr<Buffer> indices_data,
986-
std::shared_ptr<SparseCSFIndex>* out) {
987-
*out = std::make_shared<SparseCSFIndex>(
988-
std::make_shared<Tensor>(indices_type, indptr_data, indptr_shape),
989-
std::make_shared<Tensor>(indices_type, indices_data, indices_shape), indptr_offsets,
990-
indices_offsets, axis_order);
932+
namespace {
933+
934+
inline Status CheckSparseCSFIndexValidity(const std::shared_ptr<DataType>& indptr_type,
935+
const std::shared_ptr<DataType>& indices_type,
936+
const int64_t num_indptrs,
937+
const int64_t num_indices,
938+
const std::vector<int64_t>& indptr_shape,
939+
const std::vector<int64_t>& indices_shape,
940+
const int64_t axis_order_size) {
941+
if (!is_integer(indptr_type->id())) {
942+
return Status::Invalid("Type of SparseCSFIndex indptr must be integer");
943+
}
944+
if (!is_integer(indices_type->id())) {
945+
return Status::Invalid("Type of SparseCSFIndex indices must be integer");
946+
}
947+
if (num_indptrs + 1 != num_indices) {
948+
return Status::Invalid(
949+
"SparseCSFIndex length of indices must be equal to length of indptrs plus one.");
950+
}
951+
if (axis_order_size != num_indices) {
952+
return Status::Invalid(
953+
"SparseCSFIndex length of indices must be equal to the number of dimensions.");
954+
}
991955
return Status::OK();
992956
}
993957

958+
} // namespace
959+
960+
Result<std::shared_ptr<SparseCSFIndex>> SparseCSFIndex::Make(
961+
const std::shared_ptr<DataType>& indptr_type,
962+
const std::shared_ptr<DataType>& indices_type,
963+
const std::vector<int64_t>& indices_shapes, const std::vector<int64_t>& axis_order,
964+
std::vector<std::shared_ptr<Buffer>> indptr_data,
965+
std::vector<std::shared_ptr<Buffer>> indices_data) {
966+
int64_t ndim = axis_order.size();
967+
std::vector<std::shared_ptr<Tensor>> indptr(ndim - 1);
968+
std::vector<std::shared_ptr<Tensor>> indices(ndim);
969+
970+
for (int64_t i = 0; i < ndim - 1; ++i)
971+
indptr[i] = std::make_shared<Tensor>(indptr_type, indptr_data[i],
972+
std::vector<int64_t>({indices_shapes[i] + 1}));
973+
974+
for (int64_t i = 0; i < ndim; ++i)
975+
indices[i] = std::make_shared<Tensor>(indices_type, indices_data[i],
976+
std::vector<int64_t>({indices_shapes[i]}));
977+
978+
return std::make_shared<SparseCSFIndex>(indptr, indices, axis_order);
979+
}
980+
994981
// Constructor with two index vectors
995-
SparseCSFIndex::SparseCSFIndex(const std::shared_ptr<Tensor>& indptr,
996-
const std::shared_ptr<Tensor>& indices,
997-
const std::vector<int64_t>& indptr_offsets,
998-
const std::vector<int64_t>& indices_offsets,
982+
SparseCSFIndex::SparseCSFIndex(std::vector<std::shared_ptr<Tensor>>& indptr,
983+
std::vector<std::shared_ptr<Tensor>>& indices,
999984
const std::vector<int64_t>& axis_order)
1000-
: SparseIndexBase(indices->size() - indices_offsets.back()),
985+
: SparseIndexBase(indices.back()->shape()[0]),
1001986
indptr_(indptr),
1002987
indices_(indices),
1003-
indptr_offsets_(indptr_offsets),
1004-
indices_offsets_(indices_offsets),
1005988
axis_order_(axis_order) {
1006-
ARROW_CHECK(is_integer(indptr_->type_id()));
1007-
ARROW_CHECK_EQ(1, indptr_->ndim());
1008-
ARROW_CHECK(is_integer(indices_->type_id()));
1009-
ARROW_CHECK_EQ(1, indices_->ndim());
1010-
ARROW_CHECK_EQ(indptr_offsets_.size() + 1, indices_offsets_.size());
1011-
ARROW_CHECK_EQ(axis_order_.size(), indices_offsets_.size());
989+
ARROW_CHECK(CheckSparseCSFIndexValidity(indptr_.front()->type(),
990+
indices_.front()->type(), indptr_.size(),
991+
indices_.size(), indptr_.back()->shape(),
992+
indices_.back()->shape(), axis_order_.size())
993+
.ok());
1012994
}
1013995

1014996
std::string SparseCSFIndex::ToString() const { return std::string("SparseCSFIndex"); }

cpp/src/arrow/sparse_tensor.h

Lines changed: 32 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -338,46 +338,44 @@ class ARROW_EXPORT SparseCSCIndex
338338
///
339339
/// A CSF sparse index manages the location of its non-zero values by set of
340340
/// prefix trees. Each path from a root to leaf forms one tensor non-zero index.
341-
/// CSF is implemented with five vectors.
341+
/// CSF is implemented with three vectors.
342342
///
343-
/// Vectors indptr and indices are split into N-1 segments (by indptr_offsets) and
344-
/// N segments (by indices_offsetsy, where N is the number of dimensions.
345-
/// Indptr and indices segments describe the set of prefix trees.
346-
///
347-
/// Trees traverse dimensions in order given by axis_order.
343+
/// Vectors indptr and indices contain N-1 and N buffers respectively, where N is the
344+
/// number of dimensions. Axis_order is a vector of integers of length N. Indptr and
345+
/// indices describe the set of prefix trees. Trees traverse dimensions in order given by
346+
/// axis_order.
348347
class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase<SparseCSFIndex> {
349348
public:
350349
static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSF;
351350

352351
/// \brief Make SparseCSFIndex from raw properties
353-
static Status Make(const std::shared_ptr<DataType> indices_type,
354-
const std::vector<int64_t>& indptr_shape,
355-
const std::vector<int64_t>& indices_shape,
356-
const std::vector<int64_t>& indptr_offsets,
357-
const std::vector<int64_t>& indices_offsets,
358-
const std::vector<int64_t>& axis_order,
359-
std::shared_ptr<Buffer> indptr_data,
360-
std::shared_ptr<Buffer> indices_data,
361-
std::shared_ptr<SparseCSFIndex>* out);
352+
static Result<std::shared_ptr<SparseCSFIndex>> Make(
353+
const std::shared_ptr<DataType>& indptr_type,
354+
const std::shared_ptr<DataType>& indices_type,
355+
const std::vector<int64_t>& indices_shapes, const std::vector<int64_t>& axis_order,
356+
std::vector<std::shared_ptr<Buffer>> indptr_data,
357+
std::vector<std::shared_ptr<Buffer>> indices_data);
358+
359+
/// \brief Make SparseCSFIndex from raw properties
360+
static Result<std::shared_ptr<SparseCSFIndex>> Make(
361+
const std::shared_ptr<DataType>& indices_type,
362+
const std::vector<int64_t>& indices_shapes, const std::vector<int64_t>& axis_order,
363+
std::vector<std::shared_ptr<Buffer>> indptr_data,
364+
std::vector<std::shared_ptr<Buffer>> indices_data) {
365+
return Make(indices_type, indices_type, indices_shapes, axis_order, indptr_data,
366+
indices_data);
367+
}
362368

363369
/// \brief Construct SparseCSFIndex from two index vectors
364-
explicit SparseCSFIndex(const std::shared_ptr<Tensor>& indptr,
365-
const std::shared_ptr<Tensor>& indices,
366-
const std::vector<int64_t>& indptr_offsets,
367-
const std::vector<int64_t>& indices_offsets,
370+
explicit SparseCSFIndex(std::vector<std::shared_ptr<Tensor>>& indptr,
371+
std::vector<std::shared_ptr<Tensor>>& indices,
368372
const std::vector<int64_t>& axis_order);
369373

370374
/// \brief Return a 1D tensor of indptr vector
371-
const std::shared_ptr<Tensor>& indptr() const { return indptr_; }
375+
const std::vector<std::shared_ptr<Tensor>>& indptr() const { return indptr_; }
372376

373377
/// \brief Return a 1D tensor of indices vector
374-
const std::shared_ptr<Tensor>& indices() const { return indices_; }
375-
376-
/// \brief Return a 1D vector of indptr offsets
377-
const std::vector<int64_t>& indptr_offsets() const { return indptr_offsets_; }
378-
379-
/// \brief Return a vector of indices offsets
380-
const std::vector<int64_t>& indices_offsets() const { return indices_offsets_; }
378+
const std::vector<std::shared_ptr<Tensor>>& indices() const { return indices_; }
381379

382380
/// \brief Return a 1D vector specifying the order of axes
383381
const std::vector<int64_t>& axis_order() const { return axis_order_; }
@@ -387,17 +385,16 @@ class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase<SparseCSFIn
387385

388386
/// \brief Return whether the CSF indices are equal
389387
bool Equals(const SparseCSFIndex& other) const {
390-
return indptr()->Equals(*other.indptr()) && indices()->Equals(*other.indices()) &&
391-
indptr_offsets() == other.indptr_offsets() &&
392-
indices_offsets() == other.indices_offsets() &&
393-
axis_order() == other.axis_order();
388+
for (int64_t i = 0; i < static_cast<int64_t>(indices().size()); ++i)
389+
if (!indices()[i]->Equals(*other.indices()[i])) return false;
390+
for (int64_t i = 0; i < static_cast<int64_t>(indptr().size()); ++i)
391+
if (!indptr()[i]->Equals(*other.indptr()[i])) return false;
392+
return axis_order() == other.axis_order();
394393
}
395394

396395
protected:
397-
std::shared_ptr<Tensor> indptr_;
398-
std::shared_ptr<Tensor> indices_;
399-
std::vector<int64_t> indptr_offsets_;
400-
std::vector<int64_t> indices_offsets_;
396+
std::vector<std::shared_ptr<Tensor>> indptr_;
397+
std::vector<std::shared_ptr<Tensor>> indices_;
401398
std::vector<int64_t> axis_order_;
402399
};
403400

0 commit comments

Comments
 (0)