2323#include < memory>
2424#include < numeric>
2525
26+ #include " arrow/buffer_builder.h"
2627#include " arrow/compare.h"
2728#include " arrow/util/checked_cast.h"
2829#include " arrow/util/logging.h"
@@ -439,10 +440,9 @@ class SparseTensorConverter<TYPE, SparseCSFIndex>
439440 Status Convert () {
440441 using c_index_value_type = typename IndexValueType::c_type;
441442 RETURN_NOT_OK (CheckMaximumValue (std::numeric_limits<c_index_value_type>::max ()));
442- const int64_t indices_elsize = sizeof (c_index_value_type);
443443
444444 std::shared_ptr<SparseCOOTensor> sparse_coo_tensor;
445- RETURN_NOT_OK ( SparseCOOTensor::Make (tensor_, &sparse_coo_tensor ));
445+ ARROW_ASSIGN_OR_RAISE (sparse_coo_tensor, SparseCOOTensor::Make (tensor_));
446446 std::shared_ptr<Tensor> coords =
447447 arrow::internal::checked_pointer_cast<SparseCOOIndex>(
448448 sparse_coo_tensor->sparse_index ())
@@ -458,14 +458,8 @@ class SparseTensorConverter<TYPE, SparseCSFIndex>
458458 std::vector<int64_t > axis_order (ndim);
459459 for (int64_t i = 0 ; i < ndim; ++i) axis_order[i] = i;
460460
461- std::shared_ptr<Buffer> indices_buffer;
462- std::shared_ptr<Buffer> indptr_buffer;
463- RETURN_NOT_OK (
464- AllocateBuffer (pool_, indices_elsize * ndim * nonzero_count, &indices_buffer));
465- RETURN_NOT_OK (AllocateBuffer (pool_, indices_elsize * (ndim - 1 ) * (nonzero_count + 1 ),
466- &indptr_buffer));
467- auto * indices = reinterpret_cast <c_index_value_type*>(indices_buffer->mutable_data ());
468- auto * indptr = reinterpret_cast <c_index_value_type*>(indptr_buffer->mutable_data ());
461+ std::vector<TypedBufferBuilder<c_index_value_type>> indptr_buffer_builders (ndim - 1 );
462+ std::vector<TypedBufferBuilder<c_index_value_type>> indices_buffer_builders (ndim);
469463
470464 for (int64_t row = 0 ; row < nonzero_count; ++row) {
471465 bool tree_split = false ;
@@ -476,73 +470,37 @@ class SparseTensorConverter<TYPE, SparseCSFIndex>
476470 if (tree_split || change || row == 0 ) {
477471 if (row > 1 ) tree_split = true ;
478472
479- indices[column * nonzero_count + counts[column]] =
480- static_cast <c_index_value_type>(
481- coords->Value <IndexValueType>({row, column}));
482- indptr[column * (nonzero_count + 1 ) + counts[column]] =
483- static_cast <c_index_value_type>(counts[column + 1 ]);
473+ if (column < ndim - 1 )
474+ RETURN_NOT_OK (indptr_buffer_builders[column].Append (
475+ static_cast <c_index_value_type>(counts[column + 1 ])));
476+ RETURN_NOT_OK (
477+ indices_buffer_builders[column].Append (static_cast <c_index_value_type>(
478+ coords->Value <IndexValueType>({row, column}))));
484479 ++counts[column];
485480 }
486481 }
487482 }
488-
489- for (int64_t column = 0 ; column < ndim; ++column) {
490- indptr[column * (nonzero_count + 1 ) + counts[column]] =
491- static_cast <c_index_value_type>(counts[column + 1 ]);
483+ for (int64_t column = 0 ; column < ndim - 1 ; ++column) {
484+ RETURN_NOT_OK (indptr_buffer_builders[column].Append (
485+ static_cast <c_index_value_type>(counts[column + 1 ])));
492486 }
493487
494- // Remove gaps from buffers
495- int64_t total_size = counts[0 ];
496- for (int64_t column = 1 ; column < ndim; ++column) {
497- for (int64_t i = 0 ; i < counts[column] + 1 ; ++i) {
498- if (column < ndim - 1 )
499- indptr[total_size + column + i] = indptr[column * (nonzero_count + 1 ) + i];
500- if (i < counts[column])
501- indices[total_size + i] = indices[column * nonzero_count + i];
502- }
503- total_size += counts[column];
504- }
488+ std::vector<std::shared_ptr<Buffer>> indptr_buffers (ndim - 1 );
489+ std::vector<std::shared_ptr<Buffer>> indices_buffers (ndim);
490+ std::vector<int64_t > indptr_shapes (counts.begin (), counts.end () - 1 );
491+ std::vector<int64_t > indices_shapes = counts;
505492
506- // Copy CSF index data into smaller buffers
507- std::shared_ptr<Buffer> out_indices_buffer;
508- std::shared_ptr<Buffer> out_indptr_buffer;
509- RETURN_NOT_OK (
510- AllocateBuffer (pool_, indices_elsize * total_size, &out_indices_buffer));
511- RETURN_NOT_OK (AllocateBuffer (pool_,
512- indices_elsize * total_size - nonzero_count + ndim - 1 ,
513- &out_indptr_buffer));
514- auto * out_indices =
515- reinterpret_cast <c_index_value_type*>(out_indices_buffer->mutable_data ());
516- auto * out_indptr =
517- reinterpret_cast <c_index_value_type*>(out_indptr_buffer->mutable_data ());
518-
519- for (int64_t i = 0 ; i < total_size; ++i) out_indices[i] = indices[i];
520-
521- for (int64_t i = 0 ; i < total_size - nonzero_count + ndim - 1 ; ++i)
522- out_indptr[i] = indptr[i];
523-
524- // Construct SparseCSFTensor
525- std::vector<int64_t > out_indptr_shape ({total_size - nonzero_count + ndim - 1 });
526- std::vector<int64_t > out_indices_shape ({total_size});
527-
528- std::vector<int64_t > indptr_offsets (ndim - 1 );
529- std::vector<int64_t > indices_offsets (ndim);
530- std::fill_n (indptr_offsets.begin (), ndim - 1 , static_cast <int64_t >(0 ));
531- std::fill_n (indices_offsets.begin (), ndim, static_cast <int64_t >(0 ));
532-
533- for (int64_t i = 0 ; i < ndim - 2 ; ++i)
534- indptr_offsets[i + 1 ] = indptr_offsets[i] + counts[i] + 1 ;
535-
536- for (int64_t i = 0 ; i < ndim; ++i)
537- indices_offsets[i + 1 ] = indices_offsets[i] + counts[i];
538-
539- sparse_index = std::make_shared<SparseCSFIndex>(
540- std::make_shared<Tensor>(index_value_type_, out_indptr_buffer, out_indptr_shape),
541- std::make_shared<Tensor>(index_value_type_, out_indices_buffer,
542- out_indices_shape),
543- indptr_offsets, indices_offsets, axis_order);
544- data = sparse_coo_tensor->data ();
493+ for (int64_t column = 0 ; column < ndim; ++column)
494+ RETURN_NOT_OK (
495+ indices_buffer_builders[column].Finish (&indices_buffers[column], true ));
496+
497+ for (int64_t column = 0 ; column < ndim - 1 ; ++column)
498+ RETURN_NOT_OK (indptr_buffer_builders[column].Finish (&indptr_buffers[column], true ));
545499
500+ ARROW_ASSIGN_OR_RAISE (
501+ sparse_index, SparseCSFIndex::Make (index_value_type_, indices_shapes, axis_order,
502+ indptr_buffers, indices_buffers));
503+ data = sparse_coo_tensor->data ();
546504 return Status::OK ();
547505 }
548506
@@ -686,23 +644,19 @@ void assign_values(int64_t dimension_index, int64_t offset, int64_t first_ptr,
686644 const int64_t * raw_data, const std::vector<int64_t > strides,
687645 const std::vector<int64_t > axis_order, TYPE* out) {
688646 auto dimension = axis_order[dimension_index];
689- auto indices_offset = sparse_index->indices_offsets ()[dimension];
690- auto indptr_offset = sparse_index->indptr_offsets ()[dimension];
691- int64_t ndim = sparse_index->indices_offsets ().size ();
692-
693- if (dimension == 0 && ndim > 1 )
694- last_ptr = sparse_index->indptr_offsets ()[dimension + 1 ] - 1 ;
647+ int64_t ndim = axis_order.size ();
648+ if (dimension == 0 && ndim > 1 ) last_ptr = sparse_index->indptr ()[0 ]->size () - 1 ;
695649
696650 for (int64_t i = first_ptr; i < last_ptr; ++i) {
697651 int64_t tmp_offset =
698- offset + sparse_index->indices ()->Value <IndexValueType>({indices_offset + i}) *
652+ offset + sparse_index->indices ()[dimension] ->Value <IndexValueType>({i}) *
699653 strides[dimension];
700654 if (dimension_index < ndim - 1 )
701655 assign_values<TYPE, IndexValueType>(
702656 dimension + 1 , tmp_offset,
703- sparse_index->indptr ()->Value <IndexValueType>({indptr_offset + i}),
704- sparse_index->indptr ()->Value <IndexValueType>({indptr_offset + i + 1 }),
705- sparse_index, raw_data, strides, axis_order, out);
657+ sparse_index->indptr ()[dimension] ->Value <IndexValueType>({i}),
658+ sparse_index->indptr ()[dimension] ->Value <IndexValueType>({i + 1 }), sparse_index ,
659+ raw_data, strides, axis_order, out);
706660 else
707661 out[tmp_offset] = static_cast <TYPE>(raw_data[i]);
708662 }
@@ -840,8 +794,8 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t
840794 case SparseTensorFormat::CSF: {
841795 const auto & sparse_index =
842796 internal::checked_cast<const SparseCSFIndex&>(*sparse_tensor->sparse_index ());
843- const std::shared_ptr<const Tensor> indices = sparse_index.indices ();
844- type = indices->type ();
797+ const std::vector<std:: shared_ptr<Tensor> > indices = sparse_index.indices ();
798+ type = indices[ 0 ] ->type ();
845799 break ;
846800 }
847801 // LCOV_EXCL_START: ignore program failure
@@ -975,40 +929,68 @@ void CheckSparseCSXIndexValidity(const std::shared_ptr<DataType>& indptr_type,
975929// ----------------------------------------------------------------------
976930// SparseCSFIndex
977931
978- Status SparseCSFIndex::Make (const std::shared_ptr<DataType> indices_type,
979- const std::vector<int64_t >& indptr_shape,
980- const std::vector<int64_t >& indices_shape,
981- const std::vector<int64_t >& indptr_offsets,
982- const std::vector<int64_t >& indices_offsets,
983- const std::vector<int64_t >& axis_order,
984- std::shared_ptr<Buffer> indptr_data,
985- std::shared_ptr<Buffer> indices_data,
986- std::shared_ptr<SparseCSFIndex>* out) {
987- *out = std::make_shared<SparseCSFIndex>(
988- std::make_shared<Tensor>(indices_type, indptr_data, indptr_shape),
989- std::make_shared<Tensor>(indices_type, indices_data, indices_shape), indptr_offsets,
990- indices_offsets, axis_order);
932+ namespace {
933+
934+ inline Status CheckSparseCSFIndexValidity (const std::shared_ptr<DataType>& indptr_type,
935+ const std::shared_ptr<DataType>& indices_type,
936+ const int64_t num_indptrs,
937+ const int64_t num_indices,
938+ const std::vector<int64_t >& indptr_shape,
939+ const std::vector<int64_t >& indices_shape,
940+ const int64_t axis_order_size) {
941+ if (!is_integer (indptr_type->id ())) {
942+ return Status::Invalid (" Type of SparseCSFIndex indptr must be integer" );
943+ }
944+ if (!is_integer (indices_type->id ())) {
945+ return Status::Invalid (" Type of SparseCSFIndex indices must be integer" );
946+ }
947+ if (num_indptrs + 1 != num_indices) {
948+ return Status::Invalid (
949+ " SparseCSFIndex length indices must be equal to length inptrs plus one." );
950+ }
951+ if (axis_order_size != num_indices) {
952+ return Status::Invalid (
953+ " SparseCSFIndex length of indices must be equal number of dimensions." );
954+ }
991955 return Status::OK ();
992956}
993957
958+ } // namespace
959+
960+ Result<std::shared_ptr<SparseCSFIndex>> SparseCSFIndex::Make (
961+ const std::shared_ptr<DataType>& indptr_type,
962+ const std::shared_ptr<DataType>& indices_type,
963+ const std::vector<int64_t >& indices_shapes, const std::vector<int64_t >& axis_order,
964+ std::vector<std::shared_ptr<Buffer>> indptr_data,
965+ std::vector<std::shared_ptr<Buffer>> indices_data) {
966+ int64_t ndim = axis_order.size ();
967+ std::vector<std::shared_ptr<Tensor>> indptr (ndim - 1 );
968+ std::vector<std::shared_ptr<Tensor>> indices (ndim);
969+
970+ for (int64_t i = 0 ; i < ndim - 1 ; ++i)
971+ indptr[i] = std::make_shared<Tensor>(indptr_type, indptr_data[i],
972+ std::vector<int64_t >({indices_shapes[i] + 1 }));
973+
974+ for (int64_t i = 0 ; i < ndim; ++i)
975+ indices[i] = std::make_shared<Tensor>(indices_type, indices_data[i],
976+ std::vector<int64_t >({indices_shapes[i]}));
977+
978+ return std::make_shared<SparseCSFIndex>(indptr, indices, axis_order);
979+ }
980+
994981// Constructor with two index vectors
995- SparseCSFIndex::SparseCSFIndex (const std::shared_ptr<Tensor>& indptr,
996- const std::shared_ptr<Tensor>& indices,
997- const std::vector<int64_t >& indptr_offsets,
998- const std::vector<int64_t >& indices_offsets,
982+ SparseCSFIndex::SparseCSFIndex (std::vector<std::shared_ptr<Tensor>>& indptr,
983+ std::vector<std::shared_ptr<Tensor>>& indices,
999984 const std::vector<int64_t >& axis_order)
1000- : SparseIndexBase(indices-> size () - indices_offsets.back() ),
985+ : SparseIndexBase(indices.back()->shape()[0] ),
1001986 indptr_(indptr),
1002987 indices_(indices),
1003- indptr_offsets_(indptr_offsets),
1004- indices_offsets_(indices_offsets),
1005988 axis_order_(axis_order) {
1006- ARROW_CHECK (is_integer (indptr_->type_id ()));
1007- ARROW_CHECK_EQ (1 , indptr_->ndim ());
1008- ARROW_CHECK (is_integer (indices_->type_id ()));
1009- ARROW_CHECK_EQ (1 , indices_->ndim ());
1010- ARROW_CHECK_EQ (indptr_offsets_.size () + 1 , indices_offsets_.size ());
1011- ARROW_CHECK_EQ (axis_order_.size (), indices_offsets_.size ());
989+ ARROW_CHECK (CheckSparseCSFIndexValidity (indptr_.front ()->type (),
990+ indices_.front ()->type (), indptr_.size (),
991+ indices_.size (), indptr_.back ()->shape (),
992+ indices_.back ()->shape (), axis_order_.size ())
993+ .ok ());
1012994}
1013995
// Returns a fixed name for this index type; the result does not depend on the index contents.
1014996std::string SparseCSFIndex::ToString () const { return std::string (" SparseCSFIndex" ); }
0 commit comments