@@ -444,56 +444,70 @@ class SparseTensorConverter<TYPE, SparseCSFIndex>
444444
445445 const int64_t ndim = tensor_.ndim ();
446446 std::vector<int64_t > axis_order = internal::ArgSort (tensor_.shape ());
447+ int64_t nonzero_count = -1 ;
448+ RETURN_NOT_OK (tensor_.CountNonZero (&nonzero_count));
447449
448- if (ndim < 2 ) {
449- // LCOV_EXCL_START: The following invalid causes program failure.
450- return Status::Invalid (" Invalid tensor dimension" );
451- // LCOV_EXCL_STOP
452- }
450+ std::shared_ptr<Buffer> values_buffer;
451+ RETURN_NOT_OK (
452+ AllocateBuffer (pool_, sizeof (value_type) * nonzero_count, &values_buffer));
453+ value_type* values = reinterpret_cast <value_type*>(values_buffer->mutable_data ());
453454
454- std::shared_ptr<SparseCOOTensor> sparse_coo_tensor;
455- ARROW_ASSIGN_OR_RAISE (sparse_coo_tensor, SparseCOOTensor::Make (tensor_));
456- std::shared_ptr<Tensor> coords =
457- arrow::internal::checked_pointer_cast<SparseCOOIndex>(
458- sparse_coo_tensor->sparse_index ())
459- ->indices ();
455+ std::vector<int64_t > counts (ndim, 0 );
456+ std::vector<int64_t > coord (ndim, 0 );
457+ std::vector<int64_t > previous_coord (ndim, -1 );
458+ std::vector<TypedBufferBuilder<c_index_value_type>> indptr_buffer_builders (ndim - 1 );
459+ std::vector<TypedBufferBuilder<c_index_value_type>> indices_buffer_builders (ndim);
460460
461- // TODO(rok): Coords should be sorted with axis_order priority to improve compression.
462- // ARROW-4221 would help here as well.
461+ if (ndim <= 1 ) {
462+ return Status::NotImplemented (" TODO for ndim <= 1" );
463+ } else {
464+ const std::vector<int64_t >& shape = tensor_.shape ();
465+ for (int64_t n = tensor_.size (); n > 0 ; n--) {
466+ const value_type x = tensor_.Value (coord);
463467
464- // Convert SparseCOOTensor to long CSF buffers
465- const int64_t nonzero_count = sparse_coo_tensor->non_zero_length ();
468+ if (tensor_.Value (coord) != 0 ) {
469+ bool tree_split = false ;
470+ *values++ = x;
466471
467- std::vector<int64_t > counts (ndim);
468- std::fill_n (counts.begin (), ndim, static_cast <int64_t >(0 ));
469- std::vector<TypedBufferBuilder<c_index_value_type>> indptr_buffer_builders (ndim - 1 );
470- std::vector<TypedBufferBuilder<c_index_value_type>> indices_buffer_builders (ndim);
472+ for (int64_t i = 0 ; i < ndim; ++i) {
473+ int64_t dimension = axis_order[i];
474+ bool change = coord[dimension] != previous_coord[dimension];
475+
476+ if (tree_split || change) {
477+ if (change) tree_split = true ;
478+
479+ if (i < ndim - 1 )
480+ RETURN_NOT_OK (indptr_buffer_builders[i].Append (
481+ static_cast <c_index_value_type>(counts[dimension + 1 ])));
482+ RETURN_NOT_OK (indices_buffer_builders[i].Append (
483+ static_cast <c_index_value_type>(coord[dimension])));
484+ ++counts[dimension];
485+ }
486+ }
487+ previous_coord = coord;
488+ }
471489
472- for (int64_t row = 0 ; row < nonzero_count; ++row) {
473- bool tree_split = false ;
474- for (int64_t column = 0 ; column < ndim; ++column) {
475- int64_t dimension = axis_order[column];
476- bool change = coords->Value <IndexValueType>({row, dimension}) !=
477- coords->Value <IndexValueType>({row - 1 , dimension});
478-
479- if (tree_split || change || row == 0 ) {
480- if (row > 1 || change) tree_split = true ;
481-
482- if (column < ndim - 1 )
483- RETURN_NOT_OK (indptr_buffer_builders[column].Append (
484- static_cast <c_index_value_type>(counts[column + 1 ])));
485- RETURN_NOT_OK (
486- indices_buffer_builders[column].Append (static_cast <c_index_value_type>(
487- coords->Value <IndexValueType>({row, dimension}))));
488- ++counts[column];
490+ // increment index
491+ ++coord[ndim - 1 ];
492+ if (n > 1 && coord[ndim - 1 ] == shape[ndim - 1 ]) {
493+ int64_t d = ndim - 1 ;
494+ while (d > 0 && coord[d] == shape[d]) {
495+ coord[d] = 0 ;
496+ ++coord[d - 1 ];
497+ --d;
498+ }
489499 }
490500 }
491501 }
502+
492503 for (int64_t column = 0 ; column < ndim - 1 ; ++column) {
493504 RETURN_NOT_OK (indptr_buffer_builders[column].Append (
494505 static_cast <c_index_value_type>(counts[column + 1 ])));
495506 }
496507
508+ // make results
509+ data = values_buffer;
510+
497511 std::vector<std::shared_ptr<Buffer>> indptr_buffers (ndim - 1 );
498512 std::vector<std::shared_ptr<Buffer>> indices_buffers (ndim);
499513 std::vector<int64_t > indptr_shapes (counts.begin (), counts.end () - 1 );
@@ -509,7 +523,6 @@ class SparseTensorConverter<TYPE, SparseCSFIndex>
509523 ARROW_ASSIGN_OR_RAISE (
510524 sparse_index, SparseCSFIndex::Make (index_value_type_, indices_shapes, axis_order,
511525 indptr_buffers, indices_buffers));
512- data = sparse_coo_tensor->data ();
513526 return Status::OK ();
514527 }
515528
@@ -647,11 +660,14 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor,
647660 }
648661}
649662
663+ namespace {
664+
650665template <typename TYPE, typename IndexValueType>
651- void assign_values (int64_t dimension, int64_t offset, int64_t first_ptr, int64_t last_ptr,
652- const SparseCSFIndex* sparse_index, const int64_t * raw_data,
653- const std::vector<int64_t > strides,
654- const std::vector<int64_t > axis_order, TYPE* out) {
666+ void ExpandSparseCSFTensorValues (int64_t dimension, int64_t offset, int64_t first_ptr,
667+ int64_t last_ptr, const SparseCSFIndex* sparse_index,
668+ const int64_t * raw_data,
669+ const std::vector<int64_t > strides,
670+ const std::vector<int64_t > axis_order, TYPE* out) {
655671 int64_t ndim = axis_order.size ();
656672
657673 for (int64_t i = first_ptr; i < last_ptr; ++i) {
@@ -660,7 +676,7 @@ void assign_values(int64_t dimension, int64_t offset, int64_t first_ptr, int64_t
660676 strides[axis_order[dimension]];
661677
662678 if (dimension < ndim - 1 )
663- assign_values <TYPE, IndexValueType>(
679+ ExpandSparseCSFTensorValues <TYPE, IndexValueType>(
664680 dimension + 1 , tmp_offset,
665681 sparse_index->indptr ()[dimension]->Value <IndexValueType>({i}),
666682 sparse_index->indptr ()[dimension]->Value <IndexValueType>({i + 1 }), sparse_index,
@@ -670,6 +686,8 @@ void assign_values(int64_t dimension, int64_t offset, int64_t first_ptr, int64_t
670686 }
671687}
672688
689+ } // namespace
690+
673691template <typename TYPE, typename IndexValueType>
674692Status MakeTensorFromSparseTensor (MemoryPool* pool, const SparseTensor* sparse_tensor,
675693 std::shared_ptr<Tensor>* out) {
@@ -753,13 +771,9 @@ Status MakeTensorFromSparseTensor(MemoryPool* pool, const SparseTensor* sparse_t
753771 case SparseTensorFormat::CSF: {
754772 const auto & sparse_index =
755773 internal::checked_cast<const SparseCSFIndex&>(*sparse_tensor->sparse_index ());
756- int64_t last_ptr_index = sparse_index.indptr ()[0 ]->size () - 1 ;
757- int64_t first_ptr = sparse_index.indptr ()[0 ]->Value <IndexValueType>({0 });
758- int64_t last_ptr =
759- sparse_index.indptr ()[0 ]->Value <IndexValueType>({last_ptr_index});
760774
761- assign_values <value_type, IndexValueType>(
762- 0 , 0 , first_ptr, last_ptr , &sparse_index,
775+ ExpandSparseCSFTensorValues <value_type, IndexValueType>(
776+ 0 , 0 , 0 , sparse_index. indptr ()[ 0 ]-> size () - 1 , &sparse_index,
763777 reinterpret_cast <const int64_t *>(sparse_tensor->raw_data ()), strides,
764778 sparse_index.axis_order (), values);
765779 *out = std::make_shared<Tensor>(sparse_tensor->type (), values_buffer,
@@ -985,10 +999,9 @@ Result<std::shared_ptr<SparseCSFIndex>> SparseCSFIndex::Make(
985999 indices[i] = std::make_shared<Tensor>(indices_type, indices_data[i],
9861000 std::vector<int64_t >({indices_shapes[i]}));
9871001
988- ARROW_CHECK (CheckSparseCSFIndexValidity (indptr_type, indices_type, indptr.size (),
989- indices.size (), indptr.back ()->shape (),
990- indices.back ()->shape (), axis_order.size ())
991- .ok ());
1002+ RETURN_NOT_OK (CheckSparseCSFIndexValidity (indptr_type, indices_type, indptr.size (),
1003+ indices.size (), indptr.back ()->shape (),
1004+ indices.back ()->shape (), axis_order.size ()));
9921005
9931006 return std::make_shared<SparseCSFIndex>(indptr, indices, axis_order);
9941007}
@@ -997,15 +1010,13 @@ Result<std::shared_ptr<SparseCSFIndex>> SparseCSFIndex::Make(
9971010SparseCSFIndex::SparseCSFIndex (std::vector<std::shared_ptr<Tensor>>& indptr,
9981011 std::vector<std::shared_ptr<Tensor>>& indices,
9991012 const std::vector<int64_t >& axis_order)
1000- : SparseIndexBase(indices.back()->shape()[0] ),
1013+ : SparseIndexBase(indices.back()->size() ),
10011014 indptr_(indptr),
10021015 indices_(indices),
10031016 axis_order_(axis_order) {
1004- ARROW_CHECK (CheckSparseCSFIndexValidity (indptr_.front ()->type (),
1005- indices_.front ()->type (), indptr_.size (),
1006- indices_.size (), indptr_.back ()->shape (),
1007- indices_.back ()->shape (), axis_order_.size ())
1008- .ok ());
1017+ ARROW_CHECK_OK (CheckSparseCSFIndexValidity (
1018+ indptr_.front ()->type (), indices_.front ()->type (), indptr_.size (), indices_.size (),
1019+ indptr_.back ()->shape (), indices_.back ()->shape (), axis_order_.size ()));
10091020}
10101021
10111022std::string SparseCSFIndex::ToString () const { return std::string (" SparseCSFIndex" ); }
0 commit comments