Skip to content

Commit

Permalink
PARQUET-603: Implement missing information in schema descriptor
Browse files Browse the repository at this point in the history
Author: Deepak Majeti <deepak.majeti@hpe.com>

Closes apache#97 from majetideepak/LeafToBase and squashes the following commits:

9ded368 [Deepak Majeti] review comments
d80352f [Deepak Majeti] added tests
2a95b67 [Deepak Majeti] Implemented leaf_to_base

Change-Id: Ic1ccc14aaae1dcefd5986d8d5f5fd09c1abd7e02
  • Loading branch information
Deepak Majeti authored and wesm committed May 9, 2016
1 parent e132b9d commit ccf470d
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 9 deletions.
1 change: 1 addition & 0 deletions cpp/src/parquet/column/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
install(FILES
page.h
levels.h
properties.h
reader.h
scanner.h
writer.h
Expand Down
1 change: 0 additions & 1 deletion cpp/src/parquet/column/properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@

#include "parquet/util/input.h"
#include "parquet/util/mem-allocator.h"
#include "parquet/types.h"

namespace parquet {

Expand Down
16 changes: 12 additions & 4 deletions cpp/src/parquet/schema/descriptor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "parquet/schema/descriptor.h"

#include "parquet/exception.h"
#include "parquet/util/logging.h"

namespace parquet {

Expand All @@ -42,12 +43,12 @@ void SchemaDescriptor::Init(const NodePtr& schema) {
leaves_.clear();

for (int i = 0; i < group_->field_count(); ++i) {
BuildTree(group_->field(i), 0, 0);
BuildTree(group_->field(i), 0, 0, group_->field(i));
}
}

void SchemaDescriptor::BuildTree(
const NodePtr& node, int16_t max_def_level, int16_t max_rep_level) {
void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level,
int16_t max_rep_level, const NodePtr& base) {
if (node->is_optional()) {
++max_def_level;
} else if (node->is_repeated()) {
Expand All @@ -61,11 +62,12 @@ void SchemaDescriptor::BuildTree(
if (node->is_group()) {
const GroupNode* group = static_cast<const GroupNode*>(node.get());
for (int i = 0; i < group->field_count(); ++i) {
BuildTree(group->field(i), max_def_level, max_rep_level);
BuildTree(group->field(i), max_def_level, max_rep_level, base);
}
} else {
// Primitive node, append to leaves
leaves_.push_back(ColumnDescriptor(node, max_def_level, max_rep_level, this));
leaf_to_base_.emplace(leaves_.size() - 1, base);
}
}

Expand All @@ -81,9 +83,15 @@ ColumnDescriptor::ColumnDescriptor(const schema::NodePtr& node,
}

const ColumnDescriptor* SchemaDescriptor::Column(int i) const {
DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
return &leaves_[i];
}

const schema::NodePtr& SchemaDescriptor::GetColumnRoot(int i) const {
DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
return leaf_to_base_.find(i)->second;
}

int ColumnDescriptor::type_scale() const {
return primitive_node_->decimal_metadata().scale;
}
Expand Down
11 changes: 8 additions & 3 deletions cpp/src/parquet/schema/descriptor.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,19 @@ class SchemaDescriptor {

const schema::NodePtr& schema() const { return schema_; }

const schema::GroupNode* group() const { return group_; }

// Returns the root (child of the schema root) node of the leaf(column) node
const schema::NodePtr& GetColumnRoot(int i) const;

private:
friend class ColumnDescriptor;

schema::NodePtr schema_;
const schema::GroupNode* group_;

void BuildTree(
const schema::NodePtr& node, int16_t max_def_level, int16_t max_rep_level);
void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
int16_t max_rep_level, const schema::NodePtr& base);

// Result of leaf node / tree analysis
std::vector<ColumnDescriptor> leaves_;
Expand All @@ -122,7 +127,7 @@ class SchemaDescriptor {
// -- -- b |
// -- -- -- c |
// -- -- -- -- d
std::unordered_map<int, schema::NodePtr> leaf_to_base_;
std::unordered_map<int, const schema::NodePtr> leaf_to_base_;
};

} // namespace parquet
Expand Down
10 changes: 9 additions & 1 deletion cpp/src/parquet/schema/schema-descriptor-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ TEST_F(TestSchemaDescriptor, BuildTree) {
NodeVector fields;
NodePtr schema;

fields.push_back(Int32("a", Repetition::REQUIRED));
NodePtr inta = Int32("a", Repetition::REQUIRED);
fields.push_back(inta);
fields.push_back(Int64("b", Repetition::OPTIONAL));
fields.push_back(ByteArray("c", Repetition::REPEATED));

Expand Down Expand Up @@ -122,6 +123,13 @@ TEST_F(TestSchemaDescriptor, BuildTree) {
ASSERT_EQ(descr_.Column(4)->path()->ToDotString(), "bag.records.item2");
ASSERT_EQ(descr_.Column(5)->path()->ToDotString(), "bag.records.item3");

ASSERT_EQ(inta.get(), descr_.GetColumnRoot(0).get());
ASSERT_EQ(bag.get(), descr_.GetColumnRoot(3).get());
ASSERT_EQ(bag.get(), descr_.GetColumnRoot(4).get());
ASSERT_EQ(bag.get(), descr_.GetColumnRoot(5).get());

ASSERT_EQ(schema.get(), descr_.group());

// Init clears the leaves
descr_.Init(schema);
ASSERT_EQ(nleaves, descr_.num_columns());
Expand Down

0 comments on commit ccf470d

Please sign in to comment.