From ccf470dde1952278c125a52355ad6f45c9e6c685 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Sun, 8 May 2016 22:25:44 -0700 Subject: [PATCH] PARQUET-603: Implement missing information in schema descriptor Author: Deepak Majeti Closes #97 from majetideepak/LeafToBase and squashes the following commits: 9ded368 [Deepak Majeti] review comments d80352f [Deepak Majeti] added tests 2a95b67 [Deepak Majeti] Implemented leaf_to_base Change-Id: Ic1ccc14aaae1dcefd5986d8d5f5fd09c1abd7e02 --- cpp/src/parquet/column/CMakeLists.txt | 1 + cpp/src/parquet/column/properties.h | 1 - cpp/src/parquet/schema/descriptor.cc | 16 ++++++++++++---- cpp/src/parquet/schema/descriptor.h | 11 ++++++++--- cpp/src/parquet/schema/schema-descriptor-test.cc | 10 +++++++++- 5 files changed, 30 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/column/CMakeLists.txt b/cpp/src/parquet/column/CMakeLists.txt index 4c50c0a7b333d..d64be6c9ee2e8 100644 --- a/cpp/src/parquet/column/CMakeLists.txt +++ b/cpp/src/parquet/column/CMakeLists.txt @@ -19,6 +19,7 @@ install(FILES page.h levels.h + properties.h reader.h scanner.h writer.h diff --git a/cpp/src/parquet/column/properties.h b/cpp/src/parquet/column/properties.h index 40d04c3f6ec0a..132b1a6ecc151 100644 --- a/cpp/src/parquet/column/properties.h +++ b/cpp/src/parquet/column/properties.h @@ -23,7 +23,6 @@ #include "parquet/util/input.h" #include "parquet/util/mem-allocator.h" -#include "parquet/types.h" namespace parquet { diff --git a/cpp/src/parquet/schema/descriptor.cc b/cpp/src/parquet/schema/descriptor.cc index 01f0421245364..de63e5e745561 100644 --- a/cpp/src/parquet/schema/descriptor.cc +++ b/cpp/src/parquet/schema/descriptor.cc @@ -18,6 +18,7 @@ #include "parquet/schema/descriptor.h" #include "parquet/exception.h" +#include "parquet/util/logging.h" namespace parquet { @@ -42,12 +43,12 @@ void SchemaDescriptor::Init(const NodePtr& schema) { leaves_.clear(); for (int i = 0; i < group_->field_count(); ++i) { - BuildTree(group_->field(i), 0, 0); + BuildTree(group_->field(i), 0, 0, group_->field(i)); } } -void SchemaDescriptor::BuildTree( - const NodePtr& node, int16_t max_def_level, int16_t max_rep_level) { +void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level, + int16_t max_rep_level, const NodePtr& base) { if (node->is_optional()) { ++max_def_level; } else if (node->is_repeated()) { @@ -61,11 +62,12 @@ void SchemaDescriptor::BuildTree( if (node->is_group()) { const GroupNode* group = static_cast(node.get()); for (int i = 0; i < group->field_count(); ++i) { - BuildTree(group->field(i), max_def_level, max_rep_level); + BuildTree(group->field(i), max_def_level, max_rep_level, base); } } else { // Primitive node, append to leaves leaves_.push_back(ColumnDescriptor(node, max_def_level, max_rep_level, this)); + leaf_to_base_.emplace(leaves_.size() - 1, base); } } @@ -81,9 +83,15 @@ ColumnDescriptor::ColumnDescriptor(const schema::NodePtr& node, } const ColumnDescriptor* SchemaDescriptor::Column(int i) const { + DCHECK(i >= 0 && i < static_cast(leaves_.size())); return &leaves_[i]; } +const schema::NodePtr& SchemaDescriptor::GetColumnRoot(int i) const { + DCHECK(i >= 0 && i < static_cast(leaves_.size())); + return leaf_to_base_.find(i)->second; +} + int ColumnDescriptor::type_scale() const { return primitive_node_->decimal_metadata().scale; } diff --git a/cpp/src/parquet/schema/descriptor.h b/cpp/src/parquet/schema/descriptor.h index 7c04e59ae28e4..eb6eac66c23a5 100644 --- a/cpp/src/parquet/schema/descriptor.h +++ b/cpp/src/parquet/schema/descriptor.h @@ -101,14 +101,19 @@ class SchemaDescriptor { const schema::NodePtr& schema() const { return schema_; } + const schema::GroupNode* group() const { return group_; } + + // Returns the root (child of the schema root) node of the leaf(column) node + const schema::NodePtr& GetColumnRoot(int i) const; + private: friend class ColumnDescriptor; schema::NodePtr schema_; const schema::GroupNode* group_; - void BuildTree( - const schema::NodePtr& node, int16_t max_def_level, int16_t max_rep_level); + void BuildTree(const schema::NodePtr& node, int16_t max_def_level, + int16_t max_rep_level, const schema::NodePtr& base); // Result of leaf node / tree analysis std::vector leaves_; @@ -122,7 +127,7 @@ class SchemaDescriptor { // -- -- b | // -- -- -- c | // -- -- -- -- d - std::unordered_map leaf_to_base_; + std::unordered_map leaf_to_base_; }; } // namespace parquet diff --git a/cpp/src/parquet/schema/schema-descriptor-test.cc b/cpp/src/parquet/schema/schema-descriptor-test.cc index dd552be8ba3ab..d88cd0d95f007 100644 --- a/cpp/src/parquet/schema/schema-descriptor-test.cc +++ b/cpp/src/parquet/schema/schema-descriptor-test.cc @@ -75,7 +75,8 @@ TEST_F(TestSchemaDescriptor, BuildTree) { NodeVector fields; NodePtr schema; - fields.push_back(Int32("a", Repetition::REQUIRED)); + NodePtr inta = Int32("a", Repetition::REQUIRED); + fields.push_back(inta); fields.push_back(Int64("b", Repetition::OPTIONAL)); fields.push_back(ByteArray("c", Repetition::REPEATED)); @@ -122,6 +123,13 @@ TEST_F(TestSchemaDescriptor, BuildTree) { ASSERT_EQ(descr_.Column(4)->path()->ToDotString(), "bag.records.item2"); ASSERT_EQ(descr_.Column(5)->path()->ToDotString(), "bag.records.item3"); + ASSERT_EQ(inta.get(), descr_.GetColumnRoot(0).get()); + ASSERT_EQ(bag.get(), descr_.GetColumnRoot(3).get()); + ASSERT_EQ(bag.get(), descr_.GetColumnRoot(4).get()); + ASSERT_EQ(bag.get(), descr_.GetColumnRoot(5).get()); + + ASSERT_EQ(schema.get(), descr_.group()); + // Init clears the leaves descr_.Init(schema); ASSERT_EQ(nleaves, descr_.num_columns());