diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d682dc76f8ced..6f954830b6334 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -681,7 +681,6 @@ set(ARROW_SRCS src/arrow/types/construct.cc src/arrow/types/decimal.cc - src/arrow/types/json.cc src/arrow/types/list.cc src/arrow/types/primitive.cc src/arrow/types/string.cc diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 646a6f24e9df8..cef17e5aabab9 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -93,7 +93,7 @@ class ARROW_EXPORT ArrayBuilder { // Creates new array object to hold the contents of the builder and transfers // ownership of the data. This resets all variables on the builder. - virtual std::shared_ptr Finish() = 0; + virtual Status Finish(std::shared_ptr* out) = 0; const std::shared_ptr& type() const { return type_; } diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 7d02bc302f40e..13bbbebde8aa1 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -42,7 +42,7 @@ const auto kListInt32 = std::make_shared(kInt32); const auto kListListInt32 = std::make_shared(kListInt32); Status MakeRandomInt32Array( - int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr* array) { + int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr* out) { std::shared_ptr data; test::MakeRandomInt32PoolBuffer(length, pool, &data); const auto kInt32 = std::make_shared(); @@ -52,16 +52,14 @@ Status MakeRandomInt32Array( test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes); RETURN_NOT_OK(builder.Append( reinterpret_cast(data->data()), length, valid_bytes->data())); - *array = builder.Finish(); - return Status::OK(); + return builder.Finish(out); } RETURN_NOT_OK(builder.Append(reinterpret_cast(data->data()), length)); - *array = builder.Finish(); - return Status::OK(); + return builder.Finish(out); } Status MakeRandomListArray(const std::shared_ptr& child_array, int num_lists, - bool include_nulls, MemoryPool* pool, std::shared_ptr* array) { + bool include_nulls, MemoryPool* pool, std::shared_ptr* out) { // Create the null list values std::vector valid_lists(num_lists); const double null_percent = include_nulls ? 0.1 : 0; @@ -90,8 +88,8 @@ Status MakeRandomListArray(const std::shared_ptr& child_array, int num_li } ListBuilder builder(pool, child_array); RETURN_NOT_OK(builder.Append(offsets.data(), num_lists, valid_lists.data())); - *array = builder.Finish(); - return (*array)->Validate(); + RETURN_NOT_OK(builder.Finish(out)); + return (*out)->Validate(); } typedef Status MakeRecordBatch(std::shared_ptr* out); @@ -115,7 +113,7 @@ Status MakeIntRecordBatch(std::shared_ptr* out) { template Status MakeRandomBinaryArray( - const TypePtr& type, int32_t length, MemoryPool* pool, ArrayPtr* array) { + const TypePtr& type, int32_t length, MemoryPool* pool, ArrayPtr* out) { const std::vector values = { "", "", "abc", "123", "efg", "456!@#!@#", "12312"}; Builder builder(pool, type); @@ -130,8 +128,7 @@ Status MakeRandomBinaryArray( builder.Append(reinterpret_cast(value.data()), value.size())); } } - *array = builder.Finish(); - return Status::OK(); + return builder.Finish(out); } Status MakeStringTypesRecordBatch(std::shared_ptr* out) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index b4c3721a72895..4b6f7e4ebf9bd 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -242,7 +242,7 @@ struct ARROW_EXPORT DoubleType : public PrimitiveType { struct ARROW_EXPORT ListType : public DataType { // List can contain any other logical value type explicit ListType(const std::shared_ptr& value_type) - : ListType(value_type, Type::LIST) {} + : ListType(std::make_shared("item", value_type)) {} explicit ListType(const std::shared_ptr& value_field) : DataType(Type::LIST) { children_ = {value_field}; @@ -255,26 +255,17 @@ struct ARROW_EXPORT ListType : public DataType { static char const* name() { return "list"; } std::string ToString() const override; - - protected: - // Constructor for classes that are implemented as List Arrays. - ListType(const std::shared_ptr& value_type, Type::type logical_type) - : DataType(logical_type) { - // TODO ARROW-187 this can technically fail, make a constructor method ? - children_ = {std::make_shared("item", value_type)}; - } }; // BinaryType type is reprsents lists of 1-byte values. -struct ARROW_EXPORT BinaryType : public ListType { +struct ARROW_EXPORT BinaryType : public DataType { BinaryType() : BinaryType(Type::BINARY) {} static char const* name() { return "binary"; } std::string ToString() const override; protected: // Allow subclasses to change the logical type. - explicit BinaryType(Type::type logical_type) - : ListType(std::shared_ptr(new UInt8Type()), logical_type) {} + explicit BinaryType(Type::type logical_type) : DataType(logical_type) {} }; // UTF encoded strings @@ -284,9 +275,6 @@ struct ARROW_EXPORT StringType : public BinaryType { static char const* name() { return "string"; } std::string ToString() const override; - - protected: - explicit StringType(Type::type logical_type) : BinaryType(logical_type) {} }; struct ARROW_EXPORT StructType : public DataType { diff --git a/cpp/src/arrow/types/CMakeLists.txt b/cpp/src/arrow/types/CMakeLists.txt index 72a8e77664610..9f7816989827d 100644 --- a/cpp/src/arrow/types/CMakeLists.txt +++ b/cpp/src/arrow/types/CMakeLists.txt @@ -25,7 +25,6 @@ install(FILES construct.h datetime.h decimal.h - json.h list.h primitive.h string.h diff --git a/cpp/src/arrow/types/binary.h b/cpp/src/arrow/types/binary.h deleted file mode 100644 index 201fbb6e79536..0000000000000 --- a/cpp/src/arrow/types/binary.h +++ /dev/null @@ -1,28 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_TYPES_BINARY_H -#define ARROW_TYPES_BINARY_H - -#include -#include - -#include "arrow/type.h" - -namespace arrow {} // namespace arrow - -#endif // ARROW_TYPES_BINARY_H diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 0b71ea965516c..eda75b6a0c8d9 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -59,6 +59,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, BUILDER_CASE(DOUBLE, DoubleBuilder); BUILDER_CASE(STRING, StringBuilder); + BUILDER_CASE(BINARY, BinaryBuilder); case Type::LIST: { std::shared_ptr value_builder; @@ -105,10 +106,10 @@ Status MakePrimitiveArray(const TypePtr& type, int32_t length, MAKE_PRIMITIVE_ARRAY_CASE(INT32, Int32Array); MAKE_PRIMITIVE_ARRAY_CASE(UINT64, UInt64Array); MAKE_PRIMITIVE_ARRAY_CASE(INT64, Int64Array); - MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array); - MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, TimestampArray); MAKE_PRIMITIVE_ARRAY_CASE(FLOAT, FloatArray); MAKE_PRIMITIVE_ARRAY_CASE(DOUBLE, DoubleArray); + MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array); + MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, TimestampArray); MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP_DOUBLE, DoubleArray); default: return Status::NotImplemented(type->ToString()); @@ -123,22 +124,8 @@ Status MakePrimitiveArray(const TypePtr& type, int32_t length, Status MakeListArray(const TypePtr& type, int32_t length, const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count, const std::shared_ptr& null_bitmap, ArrayPtr* out) { - switch (type->type) { - case Type::BINARY: - out->reset(new BinaryArray(type, length, offsets, values, null_count, null_bitmap)); - break; - - case Type::LIST: - out->reset(new ListArray(type, length, offsets, values, null_count, null_bitmap)); - break; - - case Type::DECIMAL_TEXT: - case Type::STRING: - out->reset(new StringArray(type, length, offsets, values, null_count, null_bitmap)); - break; - default: - return Status::NotImplemented(type->ToString()); - } + *out = + std::make_shared(type, length, offsets, values, null_count, null_bitmap); #ifdef NDEBUG return Status::OK(); #else diff --git a/cpp/src/arrow/types/json.cc b/cpp/src/arrow/types/json.cc deleted file mode 100644 index 89240fc22bb2c..0000000000000 --- a/cpp/src/arrow/types/json.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/types/json.h" - -#include - -#include "arrow/type.h" -#include "arrow/types/union.h" - -namespace arrow { - -static const TypePtr Null(new NullType()); -static const TypePtr Int32(new Int32Type()); -static const TypePtr String(new StringType()); -static const TypePtr Double(new DoubleType()); -static const TypePtr Bool(new BooleanType()); - -static const std::vector kJsonTypes = {Null, Int32, String, Double, Bool}; -TypePtr JSONScalar::dense_type = TypePtr(new DenseUnionType(kJsonTypes)); -TypePtr JSONScalar::sparse_type = TypePtr(new SparseUnionType(kJsonTypes)); - -} // namespace arrow diff --git a/cpp/src/arrow/types/json.h b/cpp/src/arrow/types/json.h deleted file mode 100644 index 9de961f79a60a..0000000000000 --- a/cpp/src/arrow/types/json.h +++ /dev/null @@ -1,36 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_TYPES_JSON_H -#define ARROW_TYPES_JSON_H - -#include "arrow/type.h" - -namespace arrow { - -struct JSONScalar : public DataType { - bool dense; - - static TypePtr dense_type; - static TypePtr sparse_type; - - explicit JSONScalar(bool dense = true) : DataType(Type::JSON_SCALAR), dense(dense) {} -}; - -} // namespace arrow - -#endif // ARROW_TYPES_JSON_H diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index 2e41b4a61caf2..12c539495a28b 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -76,7 +76,11 @@ class TestListBuilder : public TestBuilder { builder_ = std::dynamic_pointer_cast(tmp); } - void Done() { result_ = std::dynamic_pointer_cast(builder_->Finish()); } + void Done() { + std::shared_ptr out; + EXPECT_OK(builder_->Finish(&out)); + result_ = std::dynamic_pointer_cast(out); + } protected: TypePtr value_type_; @@ -98,14 +102,17 @@ TEST_F(TestListBuilder, Equality) { // setup two equal arrays ASSERT_OK(builder_->Append(equal_offsets.data(), equal_offsets.size())); ASSERT_OK(vb->Append(equal_values.data(), equal_values.size())); - array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&array)); ASSERT_OK(builder_->Append(equal_offsets.data(), equal_offsets.size())); ASSERT_OK(vb->Append(equal_values.data(), equal_values.size())); - equal_array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&equal_array)); // now an unequal one ASSERT_OK(builder_->Append(unequal_offsets.data(), unequal_offsets.size())); ASSERT_OK(vb->Append(unequal_values.data(), unequal_values.size())); - unequal_array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&unequal_array)); // Test array equality EXPECT_TRUE(array->Equals(array)); diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 6334054caf84a..ef2ec22cb5336 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -25,7 +25,7 @@ bool ListArray::EqualsExact(const ListArray& other) const { if (null_count_ != other.null_count_) { return false; } bool equal_offsets = - offset_buf_->Equals(*other.offset_buf_, (length_ + 1) * sizeof(int32_t)); + offset_buffer_->Equals(*other.offset_buffer_, (length_ + 1) * sizeof(int32_t)); if (!equal_offsets) { return false; } bool equal_null_bitmap = true; if (null_count_ > 0) { @@ -72,10 +72,10 @@ bool ListArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_st Status ListArray::Validate() const { if (length_ < 0) { return Status::Invalid("Length was negative"); } - if (!offset_buf_) { return Status::Invalid("offset_buf_ was null"); } - if (offset_buf_->size() / static_cast(sizeof(int32_t)) < length_) { + if (!offset_buffer_) { return Status::Invalid("offset_buffer_ was null"); } + if (offset_buffer_->size() / static_cast(sizeof(int32_t)) < length_) { std::stringstream ss; - ss << "offset buffer size (bytes): " << offset_buf_->size() + ss << "offset buffer size (bytes): " << offset_buffer_->size() << " isn't large enough for length: " << length_; return Status::Invalid(ss.str()); } @@ -121,4 +121,38 @@ Status ListArray::Validate() const { return Status::OK(); } +Status ListBuilder::Init(int32_t elements) { + DCHECK_LT(elements, std::numeric_limits::max()); + RETURN_NOT_OK(ArrayBuilder::Init(elements)); + // one more then requested for offsets + return offset_builder_.Resize((elements + 1) * sizeof(int32_t)); +} + +Status ListBuilder::Resize(int32_t capacity) { + DCHECK_LT(capacity, std::numeric_limits::max()); + // one more then requested for offsets + RETURN_NOT_OK(offset_builder_.Resize((capacity + 1) * sizeof(int32_t))); + return ArrayBuilder::Resize(capacity); +} + +Status ListBuilder::Finish(std::shared_ptr* out) { + std::shared_ptr items = values_; + if (!items) { RETURN_NOT_OK(value_builder_->Finish(&items)); } + + RETURN_NOT_OK(offset_builder_.Append(items->length())); + std::shared_ptr offsets = offset_builder_.Finish(); + + *out = std::make_shared( + type_, length_, offsets, items, null_count_, null_bitmap_); + + Reset(); + + return Status::OK(); +} + +void ListBuilder::Reset() { + capacity_ = length_ = null_count_ = 0; + null_bitmap_ = nullptr; +} + } // namespace arrow diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index f3894510d091a..9440ffed4bf8a 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -43,9 +43,9 @@ class ARROW_EXPORT ListArray : public Array { const ArrayPtr& values, int32_t null_count = 0, std::shared_ptr null_bitmap = nullptr) : Array(type, length, null_count, null_bitmap) { - offset_buf_ = offsets; - offsets_ = offsets == nullptr ? nullptr - : reinterpret_cast(offset_buf_->data()); + offset_buffer_ = offsets; + offsets_ = offsets == nullptr ? nullptr : reinterpret_cast( + offset_buffer_->data()); values_ = values; } @@ -57,7 +57,7 @@ class ARROW_EXPORT ListArray : public Array { // with this array. const std::shared_ptr& values() const { return values_; } const std::shared_ptr offset_buffer() const { - return std::static_pointer_cast(offset_buf_); + return std::static_pointer_cast(offset_buffer_); } const std::shared_ptr& value_type() const { return values_->type(); } @@ -77,7 +77,7 @@ class ARROW_EXPORT ListArray : public Array { const ArrayPtr& arr) const override; protected: - std::shared_ptr offset_buf_; + std::shared_ptr offset_buffer_; const int32_t* offsets_; ArrayPtr values_; }; @@ -119,19 +119,9 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { virtual ~ListBuilder() {} - Status Init(int32_t elements) override { - DCHECK_LT(elements, std::numeric_limits::max()); - RETURN_NOT_OK(ArrayBuilder::Init(elements)); - // one more then requested for offsets - return offset_builder_.Resize((elements + 1) * sizeof(int32_t)); - } - - Status Resize(int32_t capacity) override { - DCHECK_LT(capacity, std::numeric_limits::max()); - // one more then requested for offsets - RETURN_NOT_OK(offset_builder_.Resize((capacity + 1) * sizeof(int32_t))); - return ArrayBuilder::Resize(capacity); - } + Status Init(int32_t elements) override; + Status Resize(int32_t capacity) override; + Status Finish(std::shared_ptr* out) override; // Vector append // @@ -145,27 +135,6 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { return Status::OK(); } - // The same as Finalize but allows for overridding the c++ type - template - std::shared_ptr Transfer() { - std::shared_ptr items = values_; - if (!items) { items = value_builder_->Finish(); } - - offset_builder_.Append(items->length()); - - const auto offsets_buffer = offset_builder_.Finish(); - auto result = std::make_shared( - type_, length_, offsets_buffer, items, null_count_, null_bitmap_); - - // TODO(emkornfield) make a reset method - capacity_ = length_ = null_count_ = 0; - null_bitmap_ = nullptr; - - return result; - } - - std::shared_ptr Finish() override { return Transfer(); } - // Start a new variable-length list slot // // This function should be called before beginning to append elements to the @@ -188,6 +157,8 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { BufferBuilder offset_builder_; std::shared_ptr value_builder_; std::shared_ptr values_; + + void Reset(); }; } // namespace arrow diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 5ac2867932df7..74c663cea7443 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -123,8 +123,11 @@ class TestPrimitiveBuilder : public TestBuilder { auto expected = std::make_shared(size, ex_data, ex_null_count, ex_null_bitmap); - std::shared_ptr result = - std::dynamic_pointer_cast(builder->Finish()); + + std::shared_ptr out; + ASSERT_OK(builder->Finish(&out)); + + std::shared_ptr result = std::dynamic_pointer_cast(out); // Builder is now reset ASSERT_EQ(0, builder->length()); @@ -216,8 +219,10 @@ void TestPrimitiveBuilder::Check( auto expected = std::make_shared(size, ex_data, ex_null_count, ex_null_bitmap); - std::shared_ptr result = - std::dynamic_pointer_cast(builder->Finish()); + + std::shared_ptr out; + ASSERT_OK(builder->Finish(&out)); + std::shared_ptr result = std::dynamic_pointer_cast(out); // Builder is now reset ASSERT_EQ(0, builder->length()); @@ -267,7 +272,8 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) { ASSERT_OK(this->builder_->AppendNull()); } - auto result = this->builder_->Finish(); + std::shared_ptr result; + ASSERT_OK(this->builder_->Finish(&result)); for (int i = 0; i < size; ++i) { ASSERT_TRUE(result->IsNull(i)) << i; @@ -298,7 +304,8 @@ TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { } do { - std::shared_ptr result = this->builder_->Finish(); + std::shared_ptr result; + ASSERT_OK(this->builder_->Finish(&result)); } while (false); ASSERT_EQ(memory_before, this->pool_->bytes_allocated()); @@ -315,8 +322,7 @@ Status MakeArray(const vector& valid_bytes, const vector& draws, int RETURN_NOT_OK(builder->AppendNull()); } } - *out = builder->Finish(); - return Status::OK(); + return builder->Finish(out); } TYPED_TEST(TestPrimitiveBuilder, Equality) { diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 9ba2ebdcc2d5b..1a230d1f1ac56 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -69,6 +69,19 @@ bool PrimitiveArray::Equals(const std::shared_ptr& arr) const { return EqualsExact(*static_cast(arr.get())); } +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; + template Status PrimitiveBuilder::Init(int32_t capacity) { RETURN_NOT_OK(ArrayBuilder::Init(capacity)); @@ -118,13 +131,13 @@ Status PrimitiveBuilder::Append( } template -std::shared_ptr PrimitiveBuilder::Finish() { - std::shared_ptr result = std::make_shared::ArrayType>( +Status PrimitiveBuilder::Finish(std::shared_ptr* out) { + *out = std::make_shared::ArrayType>( type_, length_, data_, null_count_, null_bitmap_); data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; - return result; + return Status::OK(); } template <> diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index c643783f681bd..b4758ed11150c 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -91,7 +91,9 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { value_type Value(int i) const { return raw_data()[i]; } }; -#define NUMERIC_ARRAY_DECL(NAME, TypeClass) using NAME = NumericArray; +#define NUMERIC_ARRAY_DECL(NAME, TypeClass) \ + using NAME = NumericArray; \ + extern template class ARROW_EXPORT NumericArray; NUMERIC_ARRAY_DECL(UInt8Array, UInt8Type); NUMERIC_ARRAY_DECL(Int8Array, Int8Type); @@ -139,8 +141,7 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { Status Append( const value_type* values, int32_t length, const uint8_t* valid_bytes = nullptr); - std::shared_ptr Finish() override; - + Status Finish(std::shared_ptr* out) override; Status Init(int32_t capacity) override; // Increase the capacity of the builder to accommodate at least the indicated diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index 6807b00e8ca99..015bb83a68886 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -147,7 +147,10 @@ class TestStringBuilder : public TestBuilder { } void Done() { - result_ = std::dynamic_pointer_cast(builder_->Finish()); + std::shared_ptr out; + EXPECT_OK(builder_->Finish(&out)); + + result_ = std::dynamic_pointer_cast(out); result_->Validate(); } @@ -178,7 +181,7 @@ TEST_F(TestStringBuilder, TestScalarAppend) { ASSERT_EQ(reps * N, result_->length()); ASSERT_EQ(reps, result_->null_count()); - ASSERT_EQ(reps * 6, result_->values()->length()); + ASSERT_EQ(reps * 6, result_->data()->size()); int32_t length; int32_t pos = 0; @@ -298,7 +301,10 @@ class TestBinaryBuilder : public TestBuilder { } void Done() { - result_ = std::dynamic_pointer_cast(builder_->Finish()); + std::shared_ptr out; + EXPECT_OK(builder_->Finish(&out)); + + result_ = std::dynamic_pointer_cast(out); result_->Validate(); } @@ -330,7 +336,7 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) { ASSERT_OK(result_->Validate()); ASSERT_EQ(reps * N, result_->length()); ASSERT_EQ(reps, result_->null_count()); - ASSERT_EQ(reps * 6, result_->values()->length()); + ASSERT_EQ(reps * 6, result_->data()->size()); int32_t length; for (int i = 0; i < N * reps; ++i) { diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index 745ed8f7edb99..7c3fc05ebecd8 100644 --- a/cpp/src/arrow/types/string.cc +++ b/cpp/src/arrow/types/string.cc @@ -17,44 +17,82 @@ #include "arrow/types/string.h" +#include #include #include #include "arrow/type.h" namespace arrow { - -const std::shared_ptr BINARY(new BinaryType()); -const std::shared_ptr STRING(new StringType()); +const std::shared_ptr kBinary = std::make_shared(); +const std::shared_ptr kString = std::make_shared(); BinaryArray::BinaryArray(int32_t length, const std::shared_ptr& offsets, - const ArrayPtr& values, int32_t null_count, + const std::shared_ptr& data, int32_t null_count, const std::shared_ptr& null_bitmap) - : BinaryArray(BINARY, length, offsets, values, null_count, null_bitmap) {} + : BinaryArray(kBinary, length, offsets, data, null_count, null_bitmap) {} BinaryArray::BinaryArray(const TypePtr& type, int32_t length, - const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count, - const std::shared_ptr& null_bitmap) - : ListArray(type, length, offsets, values, null_count, null_bitmap), - bytes_(std::dynamic_pointer_cast(values).get()), - raw_bytes_(bytes_->raw_data()) { - // Check in case the dynamic cast fails. - DCHECK(bytes_); -} + const std::shared_ptr& offsets, const std::shared_ptr& data, + int32_t null_count, const std::shared_ptr& null_bitmap) + : Array(type, length, null_count, null_bitmap), + offset_buffer_(offsets), + offsets_(reinterpret_cast(offset_buffer_->data())), + data_buffer_(data), + data_(data_buffer_->data()) {} Status BinaryArray::Validate() const { - if (values()->null_count() > 0) { - std::stringstream ss; - ss << type()->ToString() << " can have null values in the value array"; - Status::Invalid(ss.str()); + // TODO(wesm): what to do here? + return Status::OK(); +} + +bool BinaryArray::EqualsExact(const BinaryArray& other) const { + if (!Array::EqualsExact(other)) { return false; } + + bool equal_offsets = + offset_buffer_->Equals(*other.offset_buffer_, (length_ + 1) * sizeof(int32_t)); + if (!equal_offsets) { return false; } + + return data_buffer_->Equals(*other.data_buffer_, data_buffer_->size()); +} + +bool BinaryArray::Equals(const std::shared_ptr& arr) const { + if (this == arr.get()) { return true; } + if (this->type_enum() != arr->type_enum()) { return false; } + return EqualsExact(*static_cast(arr.get())); +} + +bool BinaryArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const std::shared_ptr& arr) const { + if (this == arr.get()) { return true; } + if (!arr) { return false; } + if (this->type_enum() != arr->type_enum()) { return false; } + const auto other = static_cast(arr.get()); + for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { + const bool is_null = IsNull(i); + if (is_null != arr->IsNull(o_i)) { return false; } + if (is_null) continue; + const int32_t begin_offset = offset(i); + const int32_t end_offset = offset(i + 1); + const int32_t other_begin_offset = other->offset(o_i); + const int32_t other_end_offset = other->offset(o_i + 1); + // Underlying can't be equal if the size isn't equal + if (end_offset - begin_offset != other_end_offset - other_begin_offset) { + return false; + } + + if (std::memcmp(data_ + begin_offset, other->data_ + other_begin_offset, + end_offset - begin_offset)) { + return false; + } } - return ListArray::Validate(); + return true; } StringArray::StringArray(int32_t length, const std::shared_ptr& offsets, - const ArrayPtr& values, int32_t null_count, + const std::shared_ptr& data, int32_t null_count, const std::shared_ptr& null_bitmap) - : StringArray(STRING, length, offsets, values, null_count, null_bitmap) {} + : BinaryArray(kString, length, offsets, data, null_count, null_bitmap) {} Status StringArray::Validate() const { // TODO(emkornfield) Validate proper UTF8 code points? @@ -72,4 +110,28 @@ BinaryBuilder::BinaryBuilder(MemoryPool* pool, const TypePtr& type) byte_builder_ = static_cast(value_builder_.get()); } +Status BinaryBuilder::Finish(std::shared_ptr* out) { + std::shared_ptr result; + RETURN_NOT_OK(ListBuilder::Finish(&result)); + + const auto list = std::dynamic_pointer_cast(result); + auto values = std::dynamic_pointer_cast(list->values()); + + *out = std::make_shared(list->length(), list->offset_buffer(), + values->data(), list->null_count(), list->null_bitmap()); + return Status::OK(); +} + +Status StringBuilder::Finish(std::shared_ptr* out) { + std::shared_ptr result; + RETURN_NOT_OK(ListBuilder::Finish(&result)); + + const auto list = std::dynamic_pointer_cast(result); + auto values = std::dynamic_pointer_cast(list->values()); + + *out = std::make_shared(list->length(), list->offset_buffer(), + values->data(), list->null_count(), list->null_bitmap()); + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index bab0c58f617b2..dd8967fbacacf 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -35,15 +35,16 @@ namespace arrow { class Buffer; class MemoryPool; -class ARROW_EXPORT BinaryArray : public ListArray { +class ARROW_EXPORT BinaryArray : public Array { public: BinaryArray(int32_t length, const std::shared_ptr& offsets, - const ArrayPtr& values, int32_t null_count = 0, + const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); + // Constructor that allows sub-classes/builders to propagate there logical type up the // class hierarchy. BinaryArray(const TypePtr& type, int32_t length, const std::shared_ptr& offsets, - const ArrayPtr& values, int32_t null_count = 0, + const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); // Return the pointer to the given elements bytes @@ -53,28 +54,38 @@ class ARROW_EXPORT BinaryArray : public ListArray { DCHECK(out_length); const int32_t pos = offsets_[i]; *out_length = offsets_[i + 1] - pos; - return raw_bytes_ + pos; + return data_ + pos; } + std::shared_ptr data() const { return data_buffer_; } + + const int32_t* offsets() const { return offsets_; } + int32_t offset(int i) const { return offsets_[i]; } + + // Neither of these functions will perform boundschecking + int32_t value_offset(int i) const { return offsets_[i]; } + int32_t value_length(int i) const { return offsets_[i + 1] - offsets_[i]; } + + bool EqualsExact(const BinaryArray& other) const; + bool Equals(const std::shared_ptr& arr) const override; + bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const ArrayPtr& arr) const override; + Status Validate() const override; private: - UInt8Array* bytes_; - const uint8_t* raw_bytes_; + std::shared_ptr offset_buffer_; + const int32_t* offsets_; + + std::shared_ptr data_buffer_; + const uint8_t* data_; }; class ARROW_EXPORT StringArray : public BinaryArray { public: StringArray(int32_t length, const std::shared_ptr& offsets, - const ArrayPtr& values, int32_t null_count = 0, + const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); - // Constructor that allows overriding the logical type, so subclasses can propagate - // there - // up the class hierarchy. - StringArray(const TypePtr& type, int32_t length, const std::shared_ptr& offsets, - const ArrayPtr& values, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr) - : BinaryArray(type, length, offsets, values, null_count, null_bitmap) {} // Construct a std::string // TODO: std::bad_alloc possibility @@ -98,9 +109,7 @@ class ARROW_EXPORT BinaryBuilder : public ListBuilder { return byte_builder_->Append(value, length); } - std::shared_ptr Finish() override { - return ListBuilder::Transfer(); - } + Status Finish(std::shared_ptr* out) override; protected: UInt8Builder* byte_builder_; @@ -112,6 +121,8 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder { explicit StringBuilder(MemoryPool* pool, const TypePtr& type) : BinaryBuilder(pool, type) {} + Status Finish(std::shared_ptr* out) override; + Status Append(const std::string& value) { return Append(value.c_str(), value.size()); } Status Append(const char* value, int32_t length) { @@ -119,10 +130,6 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder { } Status Append(const std::vector& values, uint8_t* null_bytes); - - std::shared_ptr Finish() override { - return ListBuilder::Transfer(); - } }; } // namespace arrow diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc index e8176f08268b4..4dccd81b87593 100644 --- a/cpp/src/arrow/types/struct.cc +++ b/cpp/src/arrow/types/struct.cc @@ -87,4 +87,19 @@ Status StructArray::Validate() const { return Status::OK(); } +Status StructBuilder::Finish(std::shared_ptr* out) { + std::vector> fields(field_builders_.size()); + for (size_t i = 0; i < field_builders_.size(); ++i) { + RETURN_NOT_OK(field_builders_[i]->Finish(&fields[i])); + } + + *out = + std::make_shared(type_, length_, fields, null_count_, null_bitmap_); + + null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h index 63955eb31bb7d..65b8daf214a69 100644 --- a/cpp/src/arrow/types/struct.h +++ b/cpp/src/arrow/types/struct.h @@ -73,6 +73,8 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { field_builders_ = field_builders; } + Status Finish(std::shared_ptr* out) override; + // Null bitmap is of equal length to every child field, and any zero byte // will be considered as a null for that field, but users must using app- // end methods or advance methods of the child builders' independently to @@ -83,21 +85,6 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { return Status::OK(); } - std::shared_ptr Finish() override { - std::vector fields; - for (auto it : field_builders_) { - fields.push_back(it->Finish()); - } - - auto result = - std::make_shared(type_, length_, fields, null_count_, null_bitmap_); - - null_bitmap_ = nullptr; - capacity_ = length_ = null_count_ = 0; - - return result; - } - // Append an element to the Struct. All child-builders' Append method must // be called independently to maintain data-structure consistency. Status Append(bool is_valid = true) {