Skip to content
This repository was archived by the owner on Oct 21, 2024. It is now read-only.

Commit 2d10104

Browse files
committed
WIP
1 parent 25fd97b commit 2d10104

File tree

4 files changed

+225
-8
lines changed

4 files changed

+225
-8
lines changed

cpp/src/arrow/sparse_tensor.cc

Lines changed: 75 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,33 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor,
507507
}
508508
}
509509

510+
template <typename TYPE, typename IndexValueType>
511+
void assign_values(int64_t dimension_index, int64_t offset, int64_t first_ptr,
512+
int64_t last_ptr, const SparseCSFIndex* sparse_index,
513+
const int64_t* raw_data, const std::vector<int64_t> strides,
514+
TYPE* out) {
515+
auto indices_offset = sparse_index->indices_offsets()[dimension_index];
516+
auto indptr_offset = sparse_index->indptr_offsets()[dimension_index];
517+
int64_t ndim = sparse_index->indices_offsets().size();
518+
519+
if (dimension_index == 0 && ndim > 1)
520+
last_ptr = sparse_index->indptr_offsets()[dimension_index + 1] - 1;
521+
522+
for (int64_t i = first_ptr; i < last_ptr; ++i) {
523+
int64_t tmp_offset =
524+
offset + sparse_index->indices()->Value<IndexValueType>({indices_offset + i}) *
525+
strides[dimension_index];
526+
if (dimension_index < ndim - 1)
527+
assign_values<TYPE, IndexValueType>(
528+
dimension_index + 1, tmp_offset,
529+
sparse_index->indptr()->Value<IndexValueType>({indptr_offset + i}),
530+
sparse_index->indptr()->Value<IndexValueType>({indptr_offset + i + 1}),
531+
sparse_index, raw_data, strides, out);
532+
else
533+
out[tmp_offset] = raw_data[i];
534+
}
535+
}
536+
510537
template <typename TYPE, typename IndexValueType>
511538
Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_tensor,
512539
std::shared_ptr<Tensor>* out) {
@@ -521,18 +548,18 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t
521548

522549
std::fill_n(values, sparse_tensor->size(), static_cast<value_type>(0));
523550

551+
std::vector<int64_t> strides(sparse_tensor->ndim(), 1);
552+
for (int i = sparse_tensor->ndim() - 1; i > 0; --i)
553+
strides[i - 1] *= strides[i] * sparse_tensor->shape()[i];
554+
555+
const auto raw_data = reinterpret_cast<const value_type*>(sparse_tensor->raw_data());
556+
524557
switch (sparse_tensor->format_id()) {
525558
case SparseTensorFormat::COO: {
526559
const auto& sparse_index =
527560
internal::checked_cast<const SparseCOOIndex&>(*sparse_tensor->sparse_index());
528561
const std::shared_ptr<const Tensor> coords = sparse_index.indices();
529-
const auto raw_data =
530-
reinterpret_cast<const value_type*>(sparse_tensor->raw_data());
531-
std::vector<int64_t> strides(sparse_tensor->ndim(), 1);
532562

533-
for (int i = sparse_tensor->ndim() - 1; i > 0; --i) {
534-
strides[i - 1] *= strides[i] * sparse_tensor->shape()[i];
535-
}
536563
for (int64_t i = 0; i < sparse_tensor->non_zero_length(); ++i) {
537564
std::vector<c_index_value_type> coord(sparse_tensor->ndim());
538565
int64_t offset = 0;
@@ -552,8 +579,6 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t
552579
internal::checked_cast<const SparseCSRIndex&>(*sparse_tensor->sparse_index());
553580
const std::shared_ptr<const Tensor> indptr = sparse_index.indptr();
554581
const std::shared_ptr<const Tensor> indices = sparse_index.indices();
555-
const auto raw_data =
556-
reinterpret_cast<const value_type*>(sparse_tensor->raw_data());
557582

558583
int64_t offset;
559584
for (int64_t i = 0; i < indptr->size() - 1; ++i) {
@@ -590,6 +615,17 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t
590615
sparse_tensor->shape());
591616
return Status::OK();
592617
}
618+
619+
case SparseTensorFormat::CSF: {
620+
const auto& sparse_index =
621+
internal::checked_cast<const SparseCSFIndex&>(*sparse_tensor->sparse_index());
622+
assign_values<value_type, IndexValueType>(
623+
0, 0, 0, 0, &sparse_index,
624+
reinterpret_cast<const int64_t*>(sparse_tensor->raw_data()), strides, values);
625+
*out = std::make_shared<Tensor>(sparse_tensor->type(), values_buffer,
626+
sparse_tensor->shape());
627+
return Status::OK();
628+
}
593629
}
594630
return Status::NotImplemented("Unsupported SparseIndex format type");
595631
}
@@ -625,6 +661,13 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t
625661
const std::shared_ptr<const Tensor> indices = sparse_index.indices();
626662
type = indices->type();
627663
break;
664+
}
665+
case SparseTensorFormat::CSF: {
666+
const auto& sparse_index =
667+
internal::checked_cast<const SparseCSFIndex&>(*sparse_tensor->sparse_index());
668+
const std::shared_ptr<const Tensor> indices = sparse_index.indices();
669+
type = indices->type();
670+
break;
628671
}
629672
// LCOV_EXCL_START: ignore program failure
630673
default:
@@ -754,6 +797,30 @@ void CheckSparseCSXIndexValidity(const std::shared_ptr<DataType>& indptr_type,
754797

755798
} // namespace internal
756799

800+
// ----------------------------------------------------------------------
801+
// SparseCSFIndex
802+
803+
// Constructor with two index vectors
804+
SparseCSFIndex::SparseCSFIndex(const std::shared_ptr<Tensor>& indptr,
805+
const std::shared_ptr<Tensor>& indices,
806+
const std::vector<int64_t>& indptr_offsets,
807+
const std::vector<int64_t>& indices_offsets,
808+
const std::vector<int64_t>& axis_order)
809+
: SparseIndexBase(indices->shape()[0] - indices_offsets.back()),
810+
indptr_(indptr),
811+
indices_(indices),
812+
indptr_offsets_(indptr_offsets),
813+
indices_offsets_(indices_offsets),
814+
axis_order_(axis_order) {
815+
ARROW_CHECK(is_integer(indptr_->type_id()));
816+
ARROW_CHECK_EQ(1, indptr_->ndim());
817+
ARROW_CHECK(is_integer(indices_->type_id()));
818+
ARROW_CHECK_EQ(1, indices_->ndim());
819+
ARROW_CHECK_EQ(indptr_offsets_.size() + 1, indices_offsets_.size());
820+
}
821+
822+
// Returns the fixed name of this index type.
std::string SparseCSFIndex::ToString() const {
  return "SparseCSFIndex";
}
823+
757824
// ----------------------------------------------------------------------
758825
// SparseTensor
759826

cpp/src/arrow/sparse_tensor.h

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ struct SparseTensorFormat {
4040
CSR,
4141
/// Compressed sparse column (CSC) format.
4242
CSC,
43+
/// Compressed sparse fiber (CSF) format.
44+
CSF
4345
};
4446
};
4547

@@ -329,6 +331,66 @@ class ARROW_EXPORT SparseCSCIndex
329331
using SparseCSXIndex::SparseCSXIndex;
330332
};
331333

334+
// ----------------------------------------------------------------------
335+
// SparseCSFIndex class
336+
337+
/// \brief EXPERIMENTAL: The index data for a CSF sparse tensor
338+
///
339+
/// A CSF sparse index manages the location of its non-zero values by two
340+
/// vectors.
341+
/// TODO:rok, documentation
342+
/// The first vector, called indptr, represents the range of the rows; the i-th
343+
/// row spans from indptr[i] to indptr[i+1] in the corresponding value vector.
344+
/// So the length of an indptr vector is the number of rows + 1.
345+
///
346+
/// The other vector, called indices, represents the column indices of the
347+
/// corresponding non-zero values. So the length of an indices vector is same
348+
/// as the number of non-zero-values.
349+
class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase<SparseCSFIndex> {
350+
public:
351+
static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSF;
352+
353+
/// \brief Construct SparseCSFIndex from two index vectors
354+
explicit SparseCSFIndex(const std::shared_ptr<Tensor>& indptr,
355+
const std::shared_ptr<Tensor>& indices,
356+
const std::vector<int64_t>& indptr_offsets,
357+
const std::vector<int64_t>& indices_offsets,
358+
const std::vector<int64_t>& axis_order);
359+
360+
/// \brief Return a 1D tensor of indptr vector
361+
const std::shared_ptr<Tensor>& indptr() const { return indptr_; }
362+
363+
/// \brief Return a 1D tensor of indices vector
364+
const std::shared_ptr<Tensor>& indices() const { return indices_; }
365+
366+
/// \brief Return a 1D vector of indptr offsets
367+
const std::vector<int64_t>& indptr_offsets() const { return indptr_offsets_; }
368+
369+
/// \brief Return a vector of indices offsets
370+
const std::vector<int64_t>& indices_offsets() const { return indices_offsets_; }
371+
372+
/// \brief Return a 1D vector specifying the order of axes
373+
const std::vector<int64_t>& axis_order() const { return axis_order_; }
374+
375+
/// \brief Return a string representation of the sparse index
376+
std::string ToString() const override;
377+
378+
/// \brief Return whether the CSF indices are equal
379+
bool Equals(const SparseCSFIndex& other) const {
380+
return indptr()->Equals(*other.indptr()) && indices()->Equals(*other.indices()) &&
381+
indptr_offsets() == other.indptr_offsets() &&
382+
indices_offsets() == other.indices_offsets() &&
383+
axis_order() == other.axis_order();
384+
}
385+
386+
protected:
387+
std::shared_ptr<Tensor> indptr_;
388+
std::shared_ptr<Tensor> indices_;
389+
std::vector<int64_t> indptr_offsets_;
390+
std::vector<int64_t> indices_offsets_;
391+
std::vector<int64_t> axis_order_;
392+
};
393+
332394
// ----------------------------------------------------------------------
333395
// SparseTensor class
334396

@@ -527,6 +589,9 @@ using SparseCSRMatrix = SparseTensorImpl<SparseCSRIndex>;
527589
/// \brief EXPERIMENTAL: Type alias for CSC sparse matrix
528590
using SparseCSCMatrix = SparseTensorImpl<SparseCSCIndex>;
529591

592+
/// \brief EXPERIMENTAL: Type alias for CSF sparse tensor
593+
using SparseCSFTensor = SparseTensorImpl<SparseCSFIndex>;
594+
530595
} // namespace arrow
531596

532597
#endif // ARROW_SPARSE_TENSOR_H

cpp/src/arrow/sparse_tensor_test.cc

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -910,4 +910,76 @@ TEST_F(TestSparseCSCMatrix, TestToTensor) {
910910
ASSERT_TRUE(tensor.Equals(*dense_tensor));
911911
}
912912

913+
template <typename IndexValueType>
914+
class TestSparseCSFTensorBase : public ::testing::Test {
915+
public:
916+
void SetUp() {
917+
shape_ = {6, 4};
918+
dim_names_ = {"foo", "bar"};
919+
920+
// Dense representation:
921+
// [
922+
// 1 0 2 0
923+
// 0 3 0 4
924+
// 5 0 6 0
925+
// 0 11 0 12
926+
// 13 0 14 0
927+
// 0 15 0 16
928+
// ]
929+
std::vector<int64_t> dense_values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0,
930+
0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
931+
auto dense_data = Buffer::Wrap(dense_values);
932+
NumericTensor<Int64Type> dense_tensor(dense_data, shape_, {}, dim_names_);
933+
}
934+
935+
protected:
936+
std::vector<int64_t> shape_;
937+
std::vector<std::string> dim_names_;
938+
std::shared_ptr<SparseCSFTensor> sparse_tensor_from_dense_;
939+
};
940+
941+
class TestSparseCSFTensor : public TestSparseCSFTensorBase<Int64Type> {};
942+
943+
// Builds a 4-dimensional CSF sparse tensor by hand from its indptr/indices
// tensors and offsets, converts it to a dense tensor and compares the result
// with the expected dense values.
TEST_F(TestSparseCSFTensor, TestToTensor) {
  // Non-zero values and the CSF index pieces.
  std::vector<int64_t> data_values{1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<int64_t> indptr_values{0, 2, 3, 0, 1, 3, 4, 0, 2, 4, 5, 8};
  std::vector<int64_t> indices_values{1, 2, 1, 2, 2, 1, 1, 2, 2,
                                      2, 3, 1, 3, 1, 1, 2, 3};
  std::vector<int64_t> indices_offsets{0, 2, 5, 9};
  std::vector<int64_t> indptr_offsets{0, 3, 7};
  std::vector<int64_t> axis_order{0, 1, 2, 3};

  // Shapes of the sparse tensor and of the two 1D index tensors.
  std::vector<int64_t> sparse_tensor_shape{3, 3, 3, 4};
  std::vector<int64_t> indptr_shape{12};
  std::vector<int64_t> indices_shape{17};
  std::vector<std::string> dim_names{"a", "b", "c", "d"};

  auto data_buffer = Buffer::Wrap(data_values);
  auto indptr_buffer = Buffer::Wrap(indptr_values);
  auto indices_buffer = Buffer::Wrap(indices_values);

  auto indptr = std::make_shared<Tensor>(int64(), indptr_buffer, indptr_shape);
  auto indices = std::make_shared<Tensor>(int64(), indices_buffer, indices_shape);

  auto sparse_index = std::make_shared<SparseCSFIndex>(
      indptr, indices, indptr_offsets, indices_offsets, axis_order);
  auto sparse_tensor = std::make_shared<SparseCSFTensor>(
      sparse_index, int64(), data_buffer, sparse_tensor_shape, dim_names);

  ASSERT_EQ(8, sparse_tensor->non_zero_length());

  std::shared_ptr<Tensor> dense_tensor;
  ASSERT_OK(sparse_tensor->ToTensor(&dense_tensor));

  // Expected dense contents, 3 * 3 * 3 * 4 = 108 values in row-major order.
  std::vector<int64_t> dense_values{
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8};
  auto dense_data = Buffer::Wrap(dense_values);
  Tensor tensor(int64(), dense_data, sparse_tensor_shape, {});

  ASSERT_TRUE(tensor.Equals(*dense_tensor));
}
913985
} // namespace arrow

format/SparseTensor.fbs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,9 +114,22 @@ table SparseMatrixIndexCSX {
114114
indicesBuffer: Buffer (required);
115115
}
116116

117+
/// Compressed Sparse Fiber (CSF) sparse tensor format
118+
///
119+
/// CSF is a generalization of compressed sparse row (CSR) index.
120+
/// CSF compresses a tensor's index into one-dimensional buffers (indptr and
/// indices) together with an axis order.
121+
table SparseTensorIndexCSF {
122+
indptrType: Int;
123+
indptrBuffer: Buffer;
124+
indicesType: Int;
125+
indicesBuffer: Buffer;
126+
axisOrder: [long];
127+
}
128+
117129
union SparseTensorIndex {
118130
SparseTensorIndexCOO,
119131
SparseMatrixIndexCSX,
132+
SparseTensorIndexCSF
120133
}
121134

122135
table SparseTensor {

0 commit comments

Comments
 (0)