Skip to content

Commit 73eb5c2

Browse files
author
fengguangyuan
committed
ARROW-60: Struct type builder API
Implement the basic classes, StructArray and StructBuilder, and add the respective test cases for them. Other necessary methods will be added subsequently.
1 parent 0b472d8 commit 73eb5c2

File tree

5 files changed

+291
-4
lines changed

5 files changed

+291
-4
lines changed

cpp/src/arrow/type.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ struct Field {
161161

162162
std::string ToString() const;
163163
};
164+
typedef std::shared_ptr<Field> FieldPtr;
164165

165166
template <typename Derived>
166167
struct PrimitiveType : public DataType {

cpp/src/arrow/types/construct.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "arrow/types/list.h"
2424
#include "arrow/types/primitive.h"
2525
#include "arrow/types/string.h"
26+
#include "arrow/types/struct.h"
2627
#include "arrow/util/buffer.h"
2728
#include "arrow/util/status.h"
2829

@@ -71,6 +72,20 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
7172
}
7273
}
7374

75+
// Creates a StructBuilder for `type` and stores it in `out`.
//
// One child builder is created per entry of `fields`, in order, by calling
// MakeBuilder with that field's type. If creating a child builder fails,
// RETURN_NOT_OK propagates that status and `out` is left untouched.
Status MakeStructBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
    const std::vector<std::shared_ptr<Field>>& fields,
    std::shared_ptr<ArrayBuilder>* out) {
  std::vector<std::shared_ptr<ArrayBuilder>> child_builders;

  for (const std::shared_ptr<Field>& field : fields) {
    std::shared_ptr<ArrayBuilder> child;
    RETURN_NOT_OK(MakeBuilder(pool, field->type, &child));
    child_builders.push_back(child);
  }

  out->reset(new StructBuilder(pool, type, fields, child_builders));
  return Status::OK();
}
88+
7489
#define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \
7590
case Type::ENUM: \
7691
out->reset(new ArrayType(type, length, data, null_count, null_bitmap)); \

cpp/src/arrow/types/construct.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,24 @@
2020

2121
#include <cstdint>
2222
#include <memory>
23-
23+
#include <vector>
2424
namespace arrow {
2525

2626
class Array;
2727
class ArrayBuilder;
2828
class Buffer;
2929
struct DataType;
30+
struct Field;
3031
class MemoryPool;
3132
class Status;
3233

3334
Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
3435
std::shared_ptr<ArrayBuilder>* out);
3536

37+
// Create a StructBuilder for `type` and store it in `out`. One child builder
// is created per entry of `fields` (via MakeBuilder on the field's type); a
// non-OK status is returned if any child builder cannot be created.
Status MakeStructBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
38+
const std::vector<std::shared_ptr<Field>>& fields,
39+
std::shared_ptr<ArrayBuilder>* out);
40+
3641
// Create new arrays for logical types that are backed by primitive arrays.
3742
Status MakePrimitiveArray(const std::shared_ptr<DataType>& type, int32_t length,
3843
const std::shared_ptr<Buffer>& data, int32_t null_count,

cpp/src/arrow/types/struct-test.cc

Lines changed: 139 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,16 @@
2222
#include "gtest/gtest.h"
2323

2424
#include "arrow/type.h"
25+
#include "arrow/array.h"
26+
#include "arrow/builder.h"
27+
#include "arrow/test-util.h"
28+
#include "arrow/types/struct.h"
29+
#include "arrow/types/construct.h"
30+
#include "arrow/types/list.h"
31+
#include "arrow/types/primitive.h"
32+
#include "arrow/types/test-common.h"
33+
#include "arrow/util/status.h"
34+
2535

2636
using std::shared_ptr;
2737
using std::string;
@@ -52,4 +62,132 @@ TEST(TestStructType, Basics) {
5262
// TODO(wesm): out of bounds for field(...)
5363
}
5464

55-
} // namespace arrow
65+
// .............................................................................
66+
// Struct test
67+
class TestStructBuilder : public TestBuilder {
68+
public:
69+
void SetUp() {
70+
TestBuilder::SetUp();
71+
72+
auto value_type = TypePtr(new Int32Type());
73+
auto char_type = TypePtr(new Int8Type());
74+
auto list_type = TypePtr(new ListType(char_type));
75+
76+
std::vector<TypePtr> types = {list_type, value_type};
77+
std::vector<FieldPtr> fields;
78+
fields.push_back(FieldPtr(new Field("list", list_type)));
79+
fields.push_back(FieldPtr(new Field("int", value_type)));
80+
81+
type_ = TypePtr(new StructType(fields));
82+
value_fields_ = fields;
83+
84+
std::shared_ptr<ArrayBuilder> tmp;
85+
ASSERT_OK(MakeStructBuilder(pool_, type_, fields, &tmp));
86+
87+
builder_ = std::dynamic_pointer_cast<StructBuilder>(tmp);
88+
}
89+
90+
void Done() {
91+
result_ = std::dynamic_pointer_cast<StructArray>(builder_->Finish());
92+
}
93+
94+
protected:
95+
std::vector<FieldPtr> value_fields_;
96+
TypePtr type_;
97+
98+
std::shared_ptr<StructBuilder> builder_;
99+
std::shared_ptr<StructArray> result_;
100+
};
101+
102+
// Appends two null struct slots without touching the child builders, then
// checks that both slots are null and that every child array is empty but
// still carries the expected type.
TEST_F(TestStructBuilder, TestAppendNull) {
  ASSERT_OK(builder_->AppendNull());
  ASSERT_OK(builder_->AppendNull());
  ASSERT_EQ(2, builder_->value_builder().size());

  Done();

  // One child array per struct field.
  ASSERT_EQ(2, result_->values().size());
  ASSERT_TRUE(result_->IsNull(0));
  ASSERT_TRUE(result_->IsNull(1));

  ListArray* list_child = static_cast<ListArray*>(result_->values(0).get());
  Int8Array* char_child = static_cast<Int8Array*>(list_child->values().get());
  Int32Array* int_child = static_cast<Int32Array*>(result_->values(1).get());

  // Nothing was appended to the children, so they must all be empty.
  ASSERT_EQ(0, list_child->length());
  ASSERT_EQ(0, char_child->length());
  ASSERT_EQ(0, int_child->length());

  ASSERT_EQ(Type::LIST, list_child->type_enum());
  ASSERT_EQ(Type::INT8, list_child->values()->type_enum());
  ASSERT_EQ(Type::INT32, int_child->type_enum());
}
125+
126+
// Fills both child builders (list of int8, int32) with four rows, appends four
// valid struct slots, and verifies the finished struct array and its children.
TEST_F(TestStructBuilder, TestBasics) {
  std::vector<int32_t> int_values = {1, 2, 3, 4};
  std::vector<char> list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'};
  std::vector<int> list_lengths = {3, 0, 3, 4};
  std::vector<int> list_offsets = {0, 3, 3, 6, 10};
  std::vector<uint8_t> list_is_not_null = {1, 0, 1, 1};
  std::vector<uint8_t> struct_is_not_null = {1, 1, 1, 1};

  // Child builders, in field order: "list" first, then "int".
  ListBuilder* list_builder =
      static_cast<ListBuilder*>(builder_->value_builder().at(0).get());
  Int8Builder* char_builder =
      static_cast<Int8Builder*>(list_builder->value_builder().get());
  Int32Builder* int_builder =
      static_cast<Int32Builder*>(builder_->value_builder().at(1).get());
  ASSERT_EQ(2, builder_->value_builder().size());

  // NOTE: reserving on the struct builder itself is currently disabled.
  //EXPECT_OK(builder_->Reserve(list_lengths.size()));
  EXPECT_OK(char_builder->Reserve(list_values.size()));
  EXPECT_OK(int_builder->Reserve(int_values.size()));

  // Populate the children row by row: open a list slot, append the int value,
  // then append that row's characters.
  int next_char = 0;
  for (size_t row = 0; row < list_lengths.size(); ++row) {
    ASSERT_OK(list_builder->Append(list_is_not_null[row] > 0));
    int_builder->Append(int_values[row]);
    const int row_end = next_char + list_lengths[row];
    while (next_char < row_end) {
      char_builder->Append(list_values[next_char]);
      ++next_char;
    }
  }

  // Record the validity of each struct slot.
  for (uint8_t is_valid : struct_is_not_null) {
    ASSERT_OK(builder_->Append(is_valid > 0));
  }

  Done();

  // StructArray::length() reports the number of child arrays (two fields).
  ASSERT_EQ(2, result_->length());

  ListArray* list_child = static_cast<ListArray*>(result_->values(0).get());
  Int8Array* char_child = static_cast<Int8Array*>(list_child->values().get());
  Int32Array* int_child = static_cast<Int32Array*>(result_->values(1).get());

  ASSERT_EQ(0, result_->null_count());
  ASSERT_EQ(1, list_child->null_count());
  ASSERT_EQ(0, int_child->null_count());

  for (int i = 0; i < result_->length(); ++i) {
    ASSERT_EQ(struct_is_not_null[i] == 0, result_->IsNull(i));
    ASSERT_EQ(list_is_not_null[i] == 0, list_child->IsNull(i));
  }

  // List<char>
  ASSERT_EQ(4, list_child->length());
  ASSERT_EQ(10, list_child->values()->length());
  for (size_t i = 0; i < list_offsets.size(); ++i) {
    ASSERT_EQ(list_offsets[i], list_child->offsets()[i]);
  }
  for (size_t i = 0; i < list_values.size(); ++i) {
    ASSERT_EQ(list_values[i], char_child->Value(i));
  }

  // Int32
  ASSERT_EQ(4, int_child->length());
  for (size_t i = 0; i < int_values.size(); ++i) {
    ASSERT_EQ(int_values[i], int_child->Value(i));
  }
}
193+
} // namespace arrow

cpp/src/arrow/types/struct.h

Lines changed: 130 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,135 @@
2323
#include <vector>
2424

2525
#include "arrow/type.h"
26+
#include "arrow/types/primitive.h"
27+
#include "arrow/types/list.h"
2628

27-
namespace arrow {} // namespace arrow
29+
namespace arrow {
2830

29-
#endif // ARROW_TYPES_STRUCT_H
31+
class StructArray : public Array {
32+
public:
33+
StructArray(const TypePtr& type, int32_t length,
34+
std::vector<ArrayPtr>& values,
35+
int32_t null_count = 0,
36+
std::shared_ptr<Buffer> null_bitmap = nullptr) :
37+
Array(type, length, null_count, null_bitmap) {
38+
type_ = type;
39+
values_ = values;
40+
}
41+
42+
virtual ~StructArray() {}
43+
44+
// Return a shared pointer in case the requestor desires to share ownership
45+
// with this array.
46+
const std::shared_ptr<Array>& values(int32_t pos) const {return values_.at(pos);}
47+
const std::vector<ArrayPtr>& values() const {return values_;}
48+
49+
const std::shared_ptr<DataType>& value_type(int32_t pos) const {
50+
return values_.at(pos)->type();
51+
}
52+
53+
// return the count of Array in values_
54+
int32_t length() const { return values_.size(); }
55+
56+
bool EqualsExact(const ListArray& other) const {
57+
return true;
58+
}
59+
bool Equals(const std::shared_ptr<Array>& arr) const override {
60+
return true;
61+
}
62+
63+
protected:
64+
// Contains kinds of Arrays.
65+
std::vector<ArrayPtr> values_;
66+
};
67+
68+
// ............................................................................
69+
// StructArray builder
70+
class StructBuilder : public ArrayBuilder {
71+
public:
72+
StructBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
73+
const std::vector<FieldPtr>& fields,
74+
std::vector<std::shared_ptr<ArrayBuilder>>& value_builder)
75+
: ArrayBuilder(pool, type) {
76+
fields_ = fields;
77+
value_builder_ = value_builder;
78+
}
79+
80+
Status Init(int32_t elements) {
81+
return ArrayBuilder::Init(elements);
82+
}
83+
84+
Status Resize(int32_t capacity) {
85+
// Need space for the end offset
86+
if (capacity < MIN_BUILDER_CAPACITY) {
87+
capacity = MIN_BUILDER_CAPACITY;
88+
}
89+
90+
if (capacity_ == 0) {
91+
RETURN_NOT_OK(ArrayBuilder::Init(capacity));
92+
} else {
93+
RETURN_NOT_OK(ArrayBuilder::Resize(capacity));
94+
}
95+
capacity_ = capacity;
96+
97+
return Status::OK();
98+
}
99+
100+
// Vector append
101+
//
102+
// If passed, valid_bytes is of equal length to values, and any zero byte
103+
// will be considered as a null for that slot
104+
Status Append(const uint8_t* null_bitmap, int32_t length) {
105+
RETURN_NOT_OK(Reserve(length));
106+
UnsafeAppendToBitmap(null_bitmap, length);
107+
return Status::OK();
108+
}
109+
110+
template <typename Container>
111+
std::shared_ptr<Array> Transfer() {
112+
DCHECK(value_builder_.size());
113+
114+
std::vector<std::shared_ptr<Array>> items;
115+
for (auto it = value_builder_.cbegin(); it != value_builder_.cend(); it++) {
116+
items.push_back(it->get()->Finish());
117+
}
118+
// Here, for ListArray, offsets_ is needed, but StructArray dont need it.
119+
auto result = std::make_shared<StructArray>(type_, length_, items,
120+
null_count_, null_bitmap_);
121+
122+
null_bitmap_ = nullptr;
123+
capacity_ = length_ = null_count_ = 0;
124+
125+
return result;
126+
}
127+
128+
std::shared_ptr<Array> Finish() override {
129+
return Transfer<StructArray>();
130+
}
131+
132+
// Start a new variable-length list slot
133+
//
134+
// This function should be called before beginning to append elements to the
135+
// value builder
136+
Status Append(bool is_valid = true) {
137+
RETURN_NOT_OK(Reserve(1));
138+
UnsafeAppendToBitmap(is_valid);
139+
return Status::OK();
140+
}
141+
142+
Status AppendNull() {
143+
return Append(false);
144+
}
145+
146+
const std::vector<std::shared_ptr<ArrayBuilder>>& value_builder() const {
147+
return value_builder_;
148+
}
149+
150+
protected:
151+
std::vector<std::shared_ptr<ArrayBuilder>> value_builder_;
152+
std::vector<FieldPtr> fields_;
153+
};
154+
155+
} // namespace arrow
156+
157+
#endif // ARROW_TYPES_STRUCT_H

0 commit comments

Comments
 (0)