From 06776690cf57cbdfde920cfda5ce2a1aefaa06a3 Mon Sep 17 00:00:00 2001 From: Jimmy Lu Date: Thu, 26 Sep 2024 11:05:33 -0700 Subject: [PATCH] Handle missing fields when read flatmap as struct Summary: When we read flatmap as struct, the presence of a field is not decided the same way as a normal struct field. We need to handle this separately and set `ScanSpec::subscript` according to the flatmap specific logic, and also leverage this subscript in the subsequent reading to treat it as constant. Differential Revision: D63469632 --- .../dwio/common/SelectiveStructColumnReader.cpp | 8 ++++---- velox/dwio/common/SelectiveStructColumnReader.h | 10 +++++++--- .../reader/SelectiveFlatMapColumnReader.cpp | 3 +++ .../dwrf/reader/SelectiveStructColumnReader.cpp | 2 +- .../dwio/parquet/reader/StructColumnReader.cpp | 3 ++- velox/exec/tests/TableScanTest.cpp | 17 +++++++++++++++-- 6 files changed, 32 insertions(+), 11 deletions(-) diff --git a/velox/dwio/common/SelectiveStructColumnReader.cpp b/velox/dwio/common/SelectiveStructColumnReader.cpp index cbe5388ce27f..22c2393b0e42 100644 --- a/velox/dwio/common/SelectiveStructColumnReader.cpp +++ b/velox/dwio/common/SelectiveStructColumnReader.cpp @@ -271,11 +271,11 @@ void SelectiveStructColumnReaderBase::recordParentNullsInChildren( } } -bool SelectiveStructColumnReaderBase::isChildConstant( +bool SelectiveStructColumnReaderBase::isChildMissing( const velox::common::ScanSpec& childSpec) const { - // Returns true if the child has a constant set in the ScanSpec, or if the - // file doesn't have this child (in which case it will be treated as null). - return childSpec.isConstant() || + // Returns true if the file doesn't have this child (in which case it will be + // treated as null). + return // The below check is trying to determine if this is a missing field in a // struct that should be constant null. (!isRoot_ && // If we're in the root struct channel is meaningless in this diff --git a/velox/dwio/common/SelectiveStructColumnReader.h b/velox/dwio/common/SelectiveStructColumnReader.h index 6f15b321391c..f21e3f5199d1 100644 --- a/velox/dwio/common/SelectiveStructColumnReader.h +++ b/velox/dwio/common/SelectiveStructColumnReader.h @@ -133,9 +133,13 @@ class SelectiveStructColumnReaderBase : public SelectiveColumnReader { return hasDeletion_; } - // Returns true if we'll return a constant for that childSpec (i.e. we don't - // need to read it). - bool isChildConstant(const velox::common::ScanSpec& childSpec) const; + bool isChildMissing(const velox::common::ScanSpec& childSpec) const; + + bool isChildConstant(const velox::common::ScanSpec& childSpec) const { + return childSpec.isConstant() || + childSpec.subscript() == kConstantChildSpecSubscript || + isChildMissing(childSpec); + } void fillOutputRowsFromMutation(vector_size_t size); diff --git a/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp index 35fea227cf1b..d1172829792a 100644 --- a/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp @@ -183,6 +183,9 @@ class SelectiveFlatMapAsStructReader : public SelectiveStructColumnReaderBase { !keyNodes_.empty(), "For struct encoding, keys to project must be configured"); children_.resize(keyNodes_.size()); + for (auto& childSpec : scanSpec.children()) { + childSpec->setSubscript(kConstantChildSpecSubscript); + } for (int i = 0; i < keyNodes_.size(); ++i) { keyNodes_[i].reader->scanSpec()->setSubscript(i); children_[i] = keyNodes_[i].reader.get(); diff --git a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp index 26f5020dd16a..6baaf94b43cd 100644 --- a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp @@ -53,7 +53,7 @@ SelectiveStructColumnReader::SelectiveStructColumnReader( if (childSpec->isExplicitRowNumber()) { continue; } - if (isChildConstant(*childSpec)) { + if (childSpec->isConstant() || isChildMissing(*childSpec)) { childSpec->setSubscript(kConstantChildSpecSubscript); continue; } diff --git a/velox/dwio/parquet/reader/StructColumnReader.cpp b/velox/dwio/parquet/reader/StructColumnReader.cpp index 90c407c96135..2ab9ae4562e3 100644 --- a/velox/dwio/parquet/reader/StructColumnReader.cpp +++ b/velox/dwio/parquet/reader/StructColumnReader.cpp @@ -35,7 +35,8 @@ StructColumnReader::StructColumnReader( auto& childSpecs = scanSpec_->stableChildren(); for (auto i = 0; i < childSpecs.size(); ++i) { auto childSpec = childSpecs[i]; - if (childSpecs[i]->isConstant()) { + if (childSpec->isConstant() || isChildMissing(*childSpec)) { + childSpec->setSubscript(kConstantChildSpecSubscript); continue; } if (childSpecs[i]->isExplicitRowNumber()) { diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp index fd982240c2c6..724e2af98302 100644 --- a/velox/exec/tests/TableScanTest.cpp +++ b/velox/exec/tests/TableScanTest.cpp @@ -4733,13 +4733,14 @@ TEST_F(TableScanTest, partitionKeyNotMatchPartitionKeysHandle) { TEST_F(TableScanTest, readFlatMapAsStruct) { constexpr int kSize = 10; std::vector keys = {"1", "2", "3"}; - auto vector = makeRowVector({makeRowVector( + auto c0 = makeRowVector( keys, { makeFlatVector(kSize, folly::identity), makeFlatVector(kSize, folly::identity, nullEvery(5)), makeFlatVector(kSize, folly::identity, nullEvery(7)), - })}); + }); + auto vector = makeRowVector({c0}); auto config = std::make_shared(); config->set(dwrf::Config::FLATTEN_MAP, true); config->set>(dwrf::Config::MAP_FLAT_COLS, {0}); @@ -4753,6 +4754,18 @@ TEST_F(TableScanTest, readFlatMapAsStruct) { PlanBuilder().tableScan(readSchema, {}, "", writeSchema).planNode(); auto split = makeHiveConnectorSplit(file->getPath()); AssertQueryBuilder(plan).split(split).assertResults(vector); + readSchema = + ROW({"c0"}, {ROW({"1", "4", "2"}, {BIGINT(), BIGINT(), BIGINT()})}); + plan = PlanBuilder().tableScan(readSchema, {}, "", writeSchema).planNode(); + split = makeHiveConnectorSplit(file->getPath()); + auto expected = makeRowVector({makeRowVector( + {"1", "4", "2"}, + { + c0->childAt(0), + makeNullConstant(TypeKind::BIGINT, kSize), + c0->childAt(1), + })}); + AssertQueryBuilder(plan).split(split).assertResults(expected); } TEST_F(TableScanTest, dynamicFilters) {