diff --git a/velox/dwio/common/ScanSpec.cpp b/velox/dwio/common/ScanSpec.cpp index 005c6eaa47ae..9a15d48ae239 100644 --- a/velox/dwio/common/ScanSpec.cpp +++ b/velox/dwio/common/ScanSpec.cpp @@ -129,6 +129,18 @@ bool ScanSpec::hasFilter() const { return false; } +bool ScanSpec::testNull() const { + if (filter_ && !filter_->testNull()) { + return false; + } + for (auto& child : children_) { + if (!child->isArrayElementOrMapEntry_ && !child->testNull()) { + return false; + } + } + return true; +} + void ScanSpec::moveAdaptationFrom(ScanSpec& other) { // moves the filters and filter order from 'other'. for (auto& child : children_) { diff --git a/velox/dwio/common/ScanSpec.h b/velox/dwio/common/ScanSpec.h index 0e298c532975..2c79dd78bb84 100644 --- a/velox/dwio/common/ScanSpec.h +++ b/velox/dwio/common/ScanSpec.h @@ -255,6 +255,12 @@ class ScanSpec { // This may change as a result of runtime adaptation. bool hasFilter() const; + /// Assume this field is read as null constant vector (usually due to missing + /// field), check if any filter in the struct subtree would make the whole + /// vector to be filtered out. Return false when the whole vector should be + /// filtered out. + bool testNull() const; + // Resets cached values after this or children were updated, e.g. a new filter // was added or existing filter was modified. void resetCachedValues(bool doReorder) { diff --git a/velox/dwio/common/SelectiveStructColumnReader.cpp b/velox/dwio/common/SelectiveStructColumnReader.cpp index 75a2de344d45..e0509810fc87 100644 --- a/velox/dwio/common/SelectiveStructColumnReader.cpp +++ b/velox/dwio/common/SelectiveStructColumnReader.cpp @@ -78,6 +78,20 @@ void SelectiveStructColumnReaderBase::fillOutputRowsFromMutation( } } +namespace { + +bool testFilterOnConstant(const velox::common::ScanSpec& spec) { + if (spec.isConstant() && !spec.constantValue()->isNullAt(0)) { + // Non-null constant is known value during split scheduling and filters on + // them should not be handled at execution level. + return true; + } + // Check filter on missing field. + return !spec.hasFilter() || spec.testNull(); +} + +} // namespace + void SelectiveStructColumnReaderBase::next( uint64_t numValues, VectorPtr& result, @@ -99,6 +113,12 @@ void SelectiveStructColumnReaderBase::next( } } } + for (auto& childSpec : scanSpec_->children()) { + if (isChildConstant(*childSpec) && !testFilterOnConstant(*childSpec)) { + numValues = 0; + break; + } + } // no readers // This can be either count(*) query or a query that select only @@ -107,8 +127,7 @@ void SelectiveStructColumnReaderBase::next( auto resultRowVector = std::dynamic_pointer_cast(result); resultRowVector->unsafeResize(numValues); - auto& childSpecs = scanSpec_->children(); - for (auto& childSpec : childSpecs) { + for (auto& childSpec : scanSpec_->children()) { VELOX_CHECK(childSpec->isConstant()); if (childSpec->projectOut()) { auto channel = childSpec->channel(); @@ -172,6 +191,10 @@ void SelectiveStructColumnReaderBase::read( auto& childSpec = childSpecs[i]; VELOX_TRACE_HISTORY_PUSH("read %s", childSpec->fieldName().c_str()); if (isChildConstant(*childSpec)) { + if (!testFilterOnConstant(*childSpec)) { + activeRows = {}; + break; + } continue; } auto fieldIndex = childSpec->subscript(); diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp index b10da3382af0..690aa75df988 100644 --- a/velox/exec/tests/TableScanTest.cpp +++ b/velox/exec/tests/TableScanTest.cpp @@ -4226,6 +4226,41 @@ TEST_F(TableScanTest, readMissingFieldsInMap) { AssertQueryBuilder(op).split(split).copyResults(pool()), VeloxUserError); } +TEST_F(TableScanTest, filterMissingFields) { + constexpr int kSize = 10; + auto iota = makeFlatVector(kSize, folly::identity); + auto data = makeRowVector({makeRowVector({iota})}); + auto file = TempFilePath::create(); + writeToFile(file->getPath(), {data}); + auto schema = makeRowType({ + makeRowType({BIGINT(), BIGINT()}), + makeRowType({BIGINT()}), + BIGINT(), + }); + auto test = [&](const std::vector& subfieldFilters, + int expectedSize) { + SCOPED_TRACE(fmt::format("{}", fmt::join(subfieldFilters, " AND "))); + auto plan = PlanBuilder() + .tableScan(ROW({}, {}), subfieldFilters, "", schema) + .planNode(); + auto split = makeHiveConnectorSplit(file->getPath()); + auto result = AssertQueryBuilder(plan).split(split).copyResults(pool()); + ASSERT_EQ(result->size(), expectedSize); + }; + test({"c0.c1 = 0"}, 0); + test({"c0.c1 IS NULL"}, kSize); + test({"c1 IS NOT NULL"}, 0); + test({"c1 IS NULL"}, kSize); + test({"c1.c0 = 0"}, 0); + test({"c1.c0 IS NULL"}, kSize); + test({"c2 = 0"}, 0); + test({"c2 IS NULL"}, kSize); + test({"c2 = 0", "c0.c1 IS NULL"}, 0); + test({"c2 IS NULL", "c0.c1 = 0"}, 0); + test({"c0.c0 = 0", "c1.c0 = 0"}, 0); + test({"c0.c0 = 0", "c1.c0 IS NULL"}, 1); +} + // Tests various projections of top level columns using the output type passed // into TableScan. TEST_F(TableScanTest, tableScanProjections) {