Skip to content

Commit 0a49022

Browse files
xuepanchenwesm
authored andcommitted
ARROW-1712: [C++] Add method to BinaryBuilder to reserve space for value data
Modified BinaryBuilder::Resize(int64_t) so that when building BinaryArrays with a known size, space is also reserved for value_data_builder_ to prevent internal reallocation. Author: Panchen Xue <pan.panchen.xue@gmail.com> Closes #1481 from xuepanchen/master and squashes the following commits: 707b67b [Panchen Xue] ARROW-1712: [C++] Fix lint errors 360e601 [Panchen Xue] Merge branch 'master' of https://github.com/xuepanchen/arrow d4bbd15 [Panchen Xue] ARROW-1712: [C++] Modify test case for BinaryBuilder::ReserveData() and change arguments for offsets_builder_.Resize() 77f8f3c [Panchen Xue] Merge pull request #5 from apache/master bc5db7d [Panchen Xue] ARROW-1712: [C++] Remove unneeded data member in BinaryBuilder and modify test case 5a5b70e [Panchen Xue] Merge pull request #4 from apache/master 8e4c892 [Panchen Xue] Merge pull request #3 from xuepanchen/xuepanchen-arrow-1712 d3c8202 [Panchen Xue] ARROW-1945: [C++] Fix a small typo 0b07895 [Panchen Xue] ARROW-1945: [C++] Add data_capacity_ to track capacity of value data 18f90fb [Panchen Xue] ARROW-1945: [C++] Add data_capacity_ to track capacity of value data bbc6527 [Panchen Xue] ARROW-1945: [C++] Update test case for BinaryBuild data value space reservation 15e045c [Panchen Xue] Add test case for array-test.cc 5a5593e [Panchen Xue] Update again ReserveData(int64_t) method for BinaryBuilder 9b5e805 [Panchen Xue] Update ReserveData(int64_t) method signature for BinaryBuilder 8dd5eaa [Panchen Xue] Update builder.cc b002e0b [Panchen Xue] Remove override keyword from ReserveData(int64_t) method for BinaryBuilder de318f4 [Panchen Xue] Implement ReserveData(int64_t) method for BinaryBuilder e0434e6 [Panchen Xue] Add ReserveData(int64_t) and value_data_capacity() for methods for BinaryBuilder 5ebfb32 [Panchen Xue] Add capacity() method for TypedBufferBuilder 5b73c1c [Panchen Xue] Update again BinaryBuilder::Resize(int64_t capacity) in builder.cc d021c54 [Panchen Xue] Merge pull request #2 from xuepanchen/xuepanchen-arrow-1712 232024e [Panchen Xue] Update BinaryBuilder::Resize(int64_t capacity) in builder.cc c2f8dc4 [Panchen Xue] Merge pull request #1 from apache/master
1 parent 0930b1d commit 0a49022

File tree

4 files changed

+59
-4
lines changed

4 files changed

+59
-4
lines changed

cpp/src/arrow/array-test.cc

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1155,6 +1155,45 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) {
11551155
}
11561156
}
11571157

1158+
TEST_F(TestBinaryBuilder, TestCapacityReserve) {
1159+
vector<string> strings = {"aaaaa", "bbbbbbbbbb", "ccccccccccccccc", "dddddddddd"};
1160+
int N = static_cast<int>(strings.size());
1161+
int reps = 15;
1162+
int64_t length = 0;
1163+
int64_t capacity = 1000;
1164+
int64_t expected_capacity = BitUtil::RoundUpToMultipleOf64(capacity);
1165+
1166+
ASSERT_OK(builder_->ReserveData(capacity));
1167+
1168+
ASSERT_EQ(length, builder_->value_data_length());
1169+
ASSERT_EQ(expected_capacity, builder_->value_data_capacity());
1170+
1171+
for (int j = 0; j < reps; ++j) {
1172+
for (int i = 0; i < N; ++i) {
1173+
ASSERT_OK(builder_->Append(strings[i]));
1174+
length += static_cast<int>(strings[i].size());
1175+
1176+
ASSERT_EQ(length, builder_->value_data_length());
1177+
ASSERT_EQ(expected_capacity, builder_->value_data_capacity());
1178+
}
1179+
}
1180+
1181+
int extra_capacity = 500;
1182+
expected_capacity = BitUtil::RoundUpToMultipleOf64(length + extra_capacity);
1183+
1184+
ASSERT_OK(builder_->ReserveData(extra_capacity));
1185+
1186+
ASSERT_EQ(length, builder_->value_data_length());
1187+
ASSERT_EQ(expected_capacity, builder_->value_data_capacity());
1188+
1189+
Done();
1190+
1191+
ASSERT_EQ(reps * N, result_->length());
1192+
ASSERT_EQ(0, result_->null_count());
1193+
ASSERT_EQ(reps * 40, result_->value_data()->size());
1194+
ASSERT_EQ(expected_capacity, result_->value_data()->capacity());
1195+
}
1196+
11581197
TEST_F(TestBinaryBuilder, TestZeroLength) {
11591198
// All buffers are null
11601199
Done();

cpp/src/arrow/buffer.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,7 @@ class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder {
333333

334334
const T* data() const { return reinterpret_cast<const T*>(data_); }
335335
int64_t length() const { return size_ / sizeof(T); }
336+
int64_t capacity() const { return capacity_ / sizeof(T); }
336337
};
337338

338339
/// \brief Allocate a fixed size mutable buffer from a memory pool

cpp/src/arrow/builder.cc

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1165,13 +1165,13 @@ Status ListBuilder::Init(int64_t elements) {
11651165
DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
11661166
RETURN_NOT_OK(ArrayBuilder::Init(elements));
11671167
// one more then requested for offsets
1168-
return offsets_builder_.Resize((elements + 1) * sizeof(int64_t));
1168+
return offsets_builder_.Resize((elements + 1) * sizeof(int32_t));
11691169
}
11701170

11711171
Status ListBuilder::Resize(int64_t capacity) {
11721172
DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
11731173
// one more then requested for offsets
1174-
RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t)));
1174+
RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
11751175
return ArrayBuilder::Resize(capacity);
11761176
}
11771177

@@ -1216,16 +1216,26 @@ Status BinaryBuilder::Init(int64_t elements) {
12161216
DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
12171217
RETURN_NOT_OK(ArrayBuilder::Init(elements));
12181218
// one more then requested for offsets
1219-
return offsets_builder_.Resize((elements + 1) * sizeof(int64_t));
1219+
return offsets_builder_.Resize((elements + 1) * sizeof(int32_t));
12201220
}
12211221

12221222
Status BinaryBuilder::Resize(int64_t capacity) {
12231223
DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
12241224
// one more then requested for offsets
1225-
RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t)));
1225+
RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
12261226
return ArrayBuilder::Resize(capacity);
12271227
}
12281228

1229+
Status BinaryBuilder::ReserveData(int64_t elements) {
1230+
if (value_data_length() + elements > value_data_capacity()) {
1231+
if (value_data_length() + elements > std::numeric_limits<int32_t>::max()) {
1232+
return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 for binary");
1233+
}
1234+
RETURN_NOT_OK(value_data_builder_.Reserve(elements));
1235+
}
1236+
return Status::OK();
1237+
}
1238+
12291239
Status BinaryBuilder::AppendNextOffset() {
12301240
const int64_t num_bytes = value_data_builder_.length();
12311241
if (ARROW_PREDICT_FALSE(num_bytes > kMaximumCapacity)) {

cpp/src/arrow/builder.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -682,10 +682,15 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
682682

683683
Status Init(int64_t elements) override;
684684
Status Resize(int64_t capacity) override;
685+
/// \brief Ensures there is enough allocated capacity to append the indicated
686+
/// number of bytes to the value data buffer without additional allocations
687+
Status ReserveData(int64_t elements);
685688
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
686689

687690
/// \return size of values buffer so far
688691
int64_t value_data_length() const { return value_data_builder_.length(); }
692+
/// \return capacity of values buffer
693+
int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
689694

690695
/// Temporary access to a value.
691696
///

0 commit comments

Comments
 (0)