Skip to content

Commit 74d6bae

Browse files
committed
Convert BYTE_ARRAY to StringType or List<UInt8> depending on the logical type
1 parent b7b9ca9 commit 74d6bae

File tree

4 files changed

+124
-43
lines changed

4 files changed

+124
-43
lines changed

cpp/src/arrow/parquet/CMakeLists.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,15 @@
1919
# arrow_parquet : Arrow <-> Parquet adapter
2020

2121
set(PARQUET_SRCS
22-
schema.cc
22+
schema.cc
2323
)
2424

2525
set(PARQUET_LIBS
26-
arrow
27-
${PARQUET_SHARED_LIB}
26+
arrow
27+
${PARQUET_SHARED_LIB}
2828
)
2929

30-
add_library(arrow_parquet STATIC
30+
add_library(arrow_parquet SHARED
3131
${PARQUET_SRCS}
3232
)
3333
target_link_libraries(arrow_parquet ${PARQUET_LIBS})

cpp/src/arrow/parquet/parquet-schema-test.cc

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919

2020
#include "arrow/parquet/schema.h"
2121

22+
#include "arrow/test-util.h"
23+
#include "arrow/util/status.h"
24+
2225
namespace arrow {
2326

2427
namespace parquet {
@@ -28,21 +31,24 @@ using parquet_cpp::schema::NodePtr;
2831
using parquet_cpp::schema::PrimitiveNode;
2932

3033
TEST(TestNodeConversion, Primitive) {
34+
std::shared_ptr<Field> field;
35+
3136
NodePtr node = PrimitiveNode::Make("boolean", Repetition::REQUIRED,
3237
parquet_cpp::Type::BOOLEAN);
33-
std::shared_ptr<Field> field = NodeToField(node);
38+
39+
ASSERT_OK(NodeToField(node, &field));
3440
ASSERT_EQ(field->name, "boolean");
3541
ASSERT_TRUE(field->type->Equals(std::make_shared<BooleanType>()));
3642
ASSERT_FALSE(field->nullable);
3743

3844
node = PrimitiveNode::Make("int32", Repetition::REQUIRED, parquet_cpp::Type::INT32);
39-
field = NodeToField(node);
45+
ASSERT_OK(NodeToField(node, &field));
4046
ASSERT_EQ(field->name, "int32");
4147
ASSERT_TRUE(field->type->Equals(std::make_shared<Int32Type>()));
4248
ASSERT_FALSE(field->nullable);
4349

4450
node = PrimitiveNode::Make("int64", Repetition::REQUIRED, parquet_cpp::Type::INT64);
45-
field = NodeToField(node);
51+
ASSERT_OK(NodeToField(node, &field));
4652
ASSERT_EQ(field->name, "int64");
4753
ASSERT_TRUE(field->type->Equals(std::make_shared<Int64Type>()));
4854
ASSERT_FALSE(field->nullable);
@@ -55,14 +61,14 @@ TEST(TestNodeConversion, Primitive) {
5561

5662
// case parquet_cpp::Type::FLOAT:
5763
node = PrimitiveNode::Make("float", Repetition::REQUIRED, parquet_cpp::Type::FLOAT);
58-
field = NodeToField(node);
64+
ASSERT_OK(NodeToField(node, &field));
5965
ASSERT_EQ(field->name, "float");
6066
ASSERT_TRUE(field->type->Equals(std::make_shared<FloatType>()));
6167
ASSERT_FALSE(field->nullable);
6268

6369
// case parquet_cpp::Type::DOUBLE:
6470
node = PrimitiveNode::Make("double", Repetition::REQUIRED, parquet_cpp::Type::DOUBLE);
65-
field = NodeToField(node);
71+
ASSERT_OK(NodeToField(node, &field));
6672
ASSERT_EQ(field->name, "double");
6773
ASSERT_TRUE(field->type->Equals(std::make_shared<DoubleType>()));
6874
ASSERT_FALSE(field->nullable);
@@ -80,6 +86,39 @@ TEST(TestNodeConversion, Primitive) {
8086
// TODO: Assertions
8187
}
8288

89+
const auto UINT8 = std::make_shared<UInt8Type>();
90+
91+
TEST(TestNodeConversion, Int96Timestamp) {
92+
}
93+
94+
TEST(TestNodeConversion, ByteArray) {
95+
std::shared_ptr<Field> field;
96+
97+
NodePtr node = PrimitiveNode::Make("field0", Repetition::OPTIONAL,
98+
parquet_cpp::Type::BYTE_ARRAY);
99+
ASSERT_OK(NodeToField(node, &field));
100+
101+
std::shared_ptr<DataType> ex_type = std::make_shared<ListType>(
102+
std::make_shared<Field>("", UINT8));
103+
104+
ASSERT_EQ(field->name, "field0");
105+
ASSERT_TRUE(field->type->Equals(ex_type));
106+
ASSERT_TRUE(field->nullable);
107+
108+
node = PrimitiveNode::Make("field1", Repetition::OPTIONAL,
109+
parquet_cpp::Type::BYTE_ARRAY,
110+
parquet_cpp::LogicalType::UTF8);
111+
ASSERT_OK(NodeToField(node, &field));
112+
ex_type = std::make_shared<StringType>();
113+
114+
ASSERT_EQ(field->name, "field1");
115+
ASSERT_TRUE(field->type->Equals(ex_type));
116+
ASSERT_TRUE(field->nullable);
117+
}
118+
119+
TEST(TestNodeConversion, FixedLenByteArray) {
120+
}
121+
83122
TEST(TestNodeConversion, Logical) {
84123
}
85124

cpp/src/arrow/parquet/schema.cc

Lines changed: 64 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -15,75 +15,109 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
#include "arrow/parquet/schema.h"
19+
1820
#include <vector>
1921

20-
#include "arrow/parquet/schema.h"
22+
#include "parquet/api/schema.h"
23+
24+
#include "arrow/util/status.h"
2125
#include "arrow/types/decimal.h"
2226

2327
using parquet_cpp::schema::Node;
2428
using parquet_cpp::schema::NodePtr;
2529
using parquet_cpp::schema::GroupNode;
2630
using parquet_cpp::schema::PrimitiveNode;
2731

32+
using parquet_cpp::LogicalType;
33+
2834
namespace arrow {
2935

3036
namespace parquet {
3137

38+
const auto BOOL = std::make_shared<BooleanType>();
39+
const auto UINT8 = std::make_shared<UInt8Type>();
40+
const auto INT32 = std::make_shared<Int32Type>();
41+
const auto INT64 = std::make_shared<Int64Type>();
42+
const auto FLOAT = std::make_shared<FloatType>();
43+
const auto DOUBLE = std::make_shared<DoubleType>();
44+
const auto UTF8 = std::make_shared<StringType>();
45+
const auto BINARY = std::make_shared<ListType>(
46+
std::make_shared<Field>("", UINT8));
3247

3348
TypePtr MakeDecimalType(const PrimitiveNode* node) {
3449
int precision = node->decimal_metadata().precision;
3550
int scale = node->decimal_metadata().scale;
36-
return TypePtr(new DecimalType(precision, scale));
51+
return std::make_shared<DecimalType>(precision, scale);
52+
}
53+
54+
static Status FromByteArray(const PrimitiveNode* node, TypePtr* out) {
55+
switch (node->logical_type()) {
56+
case LogicalType::UTF8:
57+
*out = UTF8;
58+
break;
59+
default:
60+
// BINARY
61+
*out = BINARY;
62+
break;
63+
}
64+
return Status::OK();
65+
}
66+
67+
static Status FromFLBA(const PrimitiveNode* node, TypePtr* out) {
68+
switch (node->logical_type()) {
69+
case LogicalType::DECIMAL:
70+
*out = MakeDecimalType(node);
71+
break;
72+
default:
73+
return Status::NotImplemented("unhandled type");
74+
break;
75+
}
76+
77+
return Status::OK();
3778
}
3879

3980
// TODO: Logical Type Handling
40-
std::shared_ptr<Field> NodeToField(const NodePtr& node) {
81+
Status NodeToField(const NodePtr& node, std::shared_ptr<Field>* out) {
4182
TypePtr type;
4283

4384
if (node->is_group()) {
4485
const GroupNode* group = static_cast<const GroupNode*>(node.get());
45-
std::vector<std::shared_ptr<Field>> fields;
86+
std::vector<std::shared_ptr<Field>> fields(group->field_count());
4687
for (int i = 0; i < group->field_count(); i++) {
47-
fields.push_back(NodeToField(group->field(i)));
88+
RETURN_NOT_OK(NodeToField(group->field(i), &fields[i]));
4889
}
49-
type = TypePtr(new StructType(fields));
90+
type = std::make_shared<StructType>(fields);
5091
} else {
5192
// Primitive (leaf) node
5293
const PrimitiveNode* primitive = static_cast<const PrimitiveNode*>(node.get());
5394

5495
switch (primitive->physical_type()) {
5596
case parquet_cpp::Type::BOOLEAN:
56-
type = TypePtr(new BooleanType());
97+
type = BOOL;
5798
break;
5899
case parquet_cpp::Type::INT32:
59-
type = TypePtr(new Int32Type());
100+
type = INT32;
60101
break;
61102
case parquet_cpp::Type::INT64:
62-
type = TypePtr(new Int64Type());
103+
type = INT64;
63104
break;
64105
case parquet_cpp::Type::INT96:
65106
// TODO: Do we have that type in Arrow?
66107
// type = TypePtr(new Int96Type());
67-
break;
108+
return Status::NotImplemented("int96");
68109
case parquet_cpp::Type::FLOAT:
69-
type = TypePtr(new FloatType());
110+
type = FLOAT;
70111
break;
71112
case parquet_cpp::Type::DOUBLE:
72-
type = TypePtr(new DoubleType());
113+
type = DOUBLE;
73114
break;
74115
case parquet_cpp::Type::BYTE_ARRAY:
75116
// TODO: Do we have that type in Arrow?
76-
// type = TypePtr(new Int96Type());
117+
RETURN_NOT_OK(FromByteArray(primitive, &type));
77118
break;
78119
case parquet_cpp::Type::FIXED_LEN_BYTE_ARRAY:
79-
switch (primitive->logical_type()) {
80-
case parquet_cpp::LogicalType::DECIMAL:
81-
type = MakeDecimalType(primitive);
82-
break;
83-
default:
84-
// TODO: Do we have that type in Arrow?
85-
break;
86-
}
120+
RETURN_NOT_OK(FromFLBA(primitive, &type));
87121
break;
88122
}
89123
}
@@ -92,21 +126,25 @@ std::shared_ptr<Field> NodeToField(const NodePtr& node) {
92126
type = TypePtr(new ListType(type));
93127
}
94128

95-
return std::shared_ptr<Field>(new Field(node->name(), type, !node->is_required()));
129+
*out = std::make_shared<Field>(node->name(), type, !node->is_required());
130+
131+
return Status::OK();
96132
}
97133

98-
std::shared_ptr<Schema> FromParquetSchema(
99-
const parquet_cpp::SchemaDescriptor* parquet_schema) {
134+
Status FromParquetSchema(const parquet_cpp::SchemaDescriptor* parquet_schema,
135+
std::shared_ptr<Schema>* out) {
100136
std::vector<std::shared_ptr<Field>> fields;
101137
const GroupNode* schema_node = static_cast<const GroupNode*>(
102138
parquet_schema->schema().get());
103139

104140
// TODO: What to do with the head node?
141+
fields.resize(schema_node->field_count());
105142
for (int i = 0; i < schema_node->field_count(); i++) {
106-
fields.push_back(NodeToField(schema_node->field(i)));
143+
RETURN_NOT_OK(NodeToField(schema_node->field(i), &fields[i]));
107144
}
108145

109-
return std::shared_ptr<Schema>(new Schema(fields));
146+
*out = std::make_shared<Schema>(fields);
147+
return Status::OK();
110148
}
111149

112150
} // namespace parquet

cpp/src/arrow/parquet/schema.h

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,20 +18,24 @@
1818
#ifndef ARROW_PARQUET_SCHEMA_H
1919
#define ARROW_PARQUET_SCHEMA_H
2020

21-
#include <arrow/schema.h>
22-
#include <arrow/type.h>
23-
#include <parquet/schema/descriptor.h>
24-
#include <parquet/schema/types.h>
25-
2621
#include <memory>
2722

23+
#include "parquet/api/schema.h"
24+
25+
#include "arrow/schema.h"
26+
#include "arrow/type.h"
27+
2828
namespace arrow {
2929

30+
class Status;
31+
3032
namespace parquet {
3133

32-
std::shared_ptr<Field> NodeToField(const parquet_cpp::schema::NodePtr& node);
33-
std::shared_ptr<Schema> FromParquetSchema(
34-
const parquet_cpp::SchemaDescriptor* parquet_schema);
34+
Status NodeToField(const parquet_cpp::schema::NodePtr& node,
35+
std::shared_ptr<Field>* out);
36+
37+
Status FromParquetSchema(const parquet_cpp::SchemaDescriptor* parquet_schema,
38+
std::shared_ptr<Schema>* out);
3539

3640
} // namespace parquet
3741

0 commit comments

Comments
 (0)