Skip to content

Commit

Permalink
Fix filter on missing field (facebookincubator#10777)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: facebookincubator#10777

When there is a subfield filter on a missing field, we used to ignore
the filter, because we thought all constant fields are supplied by planner
(via `infoColumns`) and should not be filtered in execution level.  This is not true for
the null constant caused by missing field in file.  So we add the checks for
this case.

Differential Revision: D61473660
  • Loading branch information
Yuhta authored and facebook-github-bot committed Aug 19, 2024
1 parent be52988 commit 96b14f0
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 2 deletions.
12 changes: 12 additions & 0 deletions velox/dwio/common/ScanSpec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,18 @@ bool ScanSpec::hasFilter() const {
return false;
}

bool ScanSpec::testNull() const {
if (filter_ && !filter_->testNull()) {
return false;
}
for (auto& child : children_) {
if (!child->isArrayElementOrMapEntry_ && !child->testNull()) {
return false;
}
}
return true;
}

void ScanSpec::moveAdaptationFrom(ScanSpec& other) {
// moves the filters and filter order from 'other'.
for (auto& child : children_) {
Expand Down
6 changes: 6 additions & 0 deletions velox/dwio/common/ScanSpec.h
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,12 @@ class ScanSpec {
// This may change as a result of runtime adaptation.
bool hasFilter() const;

/// Assume this field is read as null constant vector (usually due to missing
/// field), check if any filter in the struct subtree would make the whole
/// vector to be filtered out. Return false when the whole vector should be
/// filtered out.
bool testNull() const;

// Resets cached values after this or children were updated, e.g. a new filter
// was added or existing filter was modified.
void resetCachedValues(bool doReorder) {
Expand Down
27 changes: 25 additions & 2 deletions velox/dwio/common/SelectiveStructColumnReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,20 @@ void SelectiveStructColumnReaderBase::fillOutputRowsFromMutation(
}
}

namespace {

bool testFilterOnConstant(const velox::common::ScanSpec& spec) {
if (spec.isConstant() && !spec.constantValue()->isNullAt(0)) {
// Non-null constant is known value during split scheduling and filters on
// them should not be handled at execution level.
return true;
}
// Check filter on missing field.
return !spec.hasFilter() || spec.testNull();
}

} // namespace

void SelectiveStructColumnReaderBase::next(
uint64_t numValues,
VectorPtr& result,
Expand All @@ -99,6 +113,12 @@ void SelectiveStructColumnReaderBase::next(
}
}
}
for (auto& childSpec : scanSpec_->children()) {
if (isChildConstant(*childSpec) && !testFilterOnConstant(*childSpec)) {
numValues = 0;
break;
}
}

// no readers
// This can be either count(*) query or a query that select only
Expand All @@ -107,8 +127,7 @@ void SelectiveStructColumnReaderBase::next(
auto resultRowVector = std::dynamic_pointer_cast<RowVector>(result);
resultRowVector->unsafeResize(numValues);

auto& childSpecs = scanSpec_->children();
for (auto& childSpec : childSpecs) {
for (auto& childSpec : scanSpec_->children()) {
VELOX_CHECK(childSpec->isConstant());
if (childSpec->projectOut()) {
auto channel = childSpec->channel();
Expand Down Expand Up @@ -172,6 +191,10 @@ void SelectiveStructColumnReaderBase::read(
auto& childSpec = childSpecs[i];
VELOX_TRACE_HISTORY_PUSH("read %s", childSpec->fieldName().c_str());
if (isChildConstant(*childSpec)) {
if (!testFilterOnConstant(*childSpec)) {
activeRows = {};
break;
}
continue;
}
auto fieldIndex = childSpec->subscript();
Expand Down
35 changes: 35 additions & 0 deletions velox/exec/tests/TableScanTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4226,6 +4226,41 @@ TEST_F(TableScanTest, readMissingFieldsInMap) {
AssertQueryBuilder(op).split(split).copyResults(pool()), VeloxUserError);
}

TEST_F(TableScanTest, filterMissingFields) {
constexpr int kSize = 10;
auto iota = makeFlatVector<int64_t>(kSize, folly::identity);
auto data = makeRowVector({makeRowVector({iota})});
auto file = TempFilePath::create();
writeToFile(file->getPath(), {data});
auto schema = makeRowType({
makeRowType({BIGINT(), BIGINT()}),
makeRowType({BIGINT()}),
BIGINT(),
});
auto test = [&](const std::vector<std::string>& subfieldFilters,
int expectedSize) {
SCOPED_TRACE(fmt::format("{}", fmt::join(subfieldFilters, " AND ")));
auto plan = PlanBuilder()
.tableScan(ROW({}, {}), subfieldFilters, "", schema)
.planNode();
auto split = makeHiveConnectorSplit(file->getPath());
auto result = AssertQueryBuilder(plan).split(split).copyResults(pool());
ASSERT_EQ(result->size(), expectedSize);
};
test({"c0.c1 = 0"}, 0);
test({"c0.c1 IS NULL"}, kSize);
test({"c1 IS NOT NULL"}, 0);
test({"c1 IS NULL"}, kSize);
test({"c1.c0 = 0"}, 0);
test({"c1.c0 IS NULL"}, kSize);
test({"c2 = 0"}, 0);
test({"c2 IS NULL"}, kSize);
test({"c2 = 0", "c0.c1 IS NULL"}, 0);
test({"c2 IS NULL", "c0.c1 = 0"}, 0);
test({"c0.c0 = 0", "c1.c0 = 0"}, 0);
test({"c0.c0 = 0", "c1.c0 IS NULL"}, 1);
}

// Tests various projections of top level columns using the output type passed
// into TableScan.
TEST_F(TableScanTest, tableScanProjections) {
Expand Down

0 comments on commit 96b14f0

Please sign in to comment.