Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

column to column comparisons for filtering file scans and row data #11152

Draft
wants to merge 17 commits into
base: main
Choose a base branch
from
Draft
Prev Previous commit
Next Next commit
OTF-1500 not equals and datatype validation (wip)
  • Loading branch information
Jennifer Baldwin committed Aug 16, 2024
commit aaa85d26fca3d4b9ca86b2bdf30651971bb10aa6
21 changes: 21 additions & 0 deletions api/src/main/java/org/apache/iceberg/expressions/Evaluator.java
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ public <T> Boolean lt(Bound<T> valueExpr, Literal<T> lit) {

@Override
public <T> Boolean lt(Bound<T> valueExpr, Bound<T> valueExpr2) {
validateDataTypes(valueExpr, valueExpr2);
Comparator<T> cmp = Comparators.forType(valueExpr2.ref().type().asPrimitiveType());
return cmp.compare(valueExpr.eval(struct), valueExpr2.eval(struct)) < 0;
}
Expand All @@ -122,6 +123,7 @@ public <T> Boolean ltEq(Bound<T> valueExpr, Literal<T> lit) {

@Override
public <T> Boolean ltEq(Bound<T> valueExpr, Bound<T> valueExpr2) {
validateDataTypes(valueExpr, valueExpr2);
Comparator<T> cmp = Comparators.forType(valueExpr2.ref().type().asPrimitiveType());
return cmp.compare(valueExpr.eval(struct), valueExpr2.eval(struct)) <= 0;
}
Expand All @@ -134,6 +136,7 @@ public <T> Boolean gt(Bound<T> valueExpr, Literal<T> lit) {

@Override
public <T> Boolean gt(Bound<T> valueExpr, Bound<T> valueExpr2) {
validateDataTypes(valueExpr, valueExpr2);
Comparator<T> cmp = Comparators.forType(valueExpr2.ref().type().asPrimitiveType());
return cmp.compare(valueExpr.eval(struct), valueExpr2.eval(struct)) > 0;
}
Expand All @@ -146,6 +149,7 @@ public <T> Boolean gtEq(Bound<T> valueExpr, Literal<T> lit) {

@Override
public <T> Boolean gtEq(Bound<T> valueExpr, Bound<T> valueExpr2) {
validateDataTypes(valueExpr, valueExpr2);
Comparator<T> cmp = Comparators.forType(valueExpr2.ref().type().asPrimitiveType());
return cmp.compare(valueExpr.eval(struct), valueExpr2.eval(struct)) >= 0;
}
Expand All @@ -158,15 +162,32 @@ public <T> Boolean eq(Bound<T> valueExpr, Literal<T> lit) {

@Override
public <T> Boolean eq(Bound<T> valueExpr, Bound<T> valueExpr2) {
validateDataTypes(valueExpr, valueExpr2);
Comparator<T> cmp = Comparators.forType(valueExpr2.ref().type().asPrimitiveType());
return cmp.compare(valueExpr.eval(struct), valueExpr2.eval(struct)) == 0;
}

private <T> void validateDataTypes(Bound<T> valueExpr, Bound<T> valueExpr2) {
if (valueExpr.ref().type().typeId() != valueExpr2.ref().type().typeId()) {
throw new IllegalArgumentException(
"Cannot compare different types: "
+ valueExpr.ref().type()
+ " and "
+ valueExpr2.ref().type());
}
}

@Override
public <T> Boolean notEq(Bound<T> valueExpr, Literal<T> lit) {
return !eq(valueExpr, lit);
}

@Override
public <T> Boolean notEq(Bound<T> valueExpr, Bound<T> valueExpr2) {
validateDataTypes(valueExpr, valueExpr2);
return !eq(valueExpr, valueExpr2);
}

@Override
public <T> Boolean in(Bound<T> valueExpr, Set<T> literalSet) {
return literalSet.contains(valueExpr.eval(struct));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -450,8 +450,8 @@ private static <T> String termPredicate(Predicate pred, String term) {
return term + " >= " + rightTerm;
case EQ:
return term + " = " + rightTerm;
// case NOT_EQ:
// return term + " != " + rightTerm;
case NOT_EQ:
return term + " != " + rightTerm;
default:
throw new UnsupportedOperationException(
"Cannot sanitize unsupported predicate type: " + pred.op());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ public <T> R notEq(BoundReference<T> ref, Literal<T> lit) {
return null;
}

public <T> R notEq(BoundReference<T> ref, BoundReference<T> ref2) {
return null;
}

public <T> R in(BoundReference<T> ref, Set<T> literalSet) {
throw new UnsupportedOperationException("In expression is not supported by the visitor");
}
Expand Down Expand Up @@ -229,8 +233,8 @@ public <T> R predicate(BoundPredicate<T> pred) {
return gtEq((BoundReference<T>) pred.term(), (BoundReference<T>) termPred.rightTerm());
case EQ:
return eq((BoundReference<T>) pred.term(), (BoundReference<T>) termPred.rightTerm());
// case NOT_EQ:
// return notEq((BoundReference<T>) pred.term(), literalPred.literal());
case NOT_EQ:
return notEq((BoundReference<T>) pred.term(), (BoundReference<T>) termPred.rightTerm());
default:
throw new IllegalStateException(
"Invalid operation for BoundTermPredicate: " + pred.op());
Expand Down Expand Up @@ -393,8 +397,8 @@ public <T> R predicate(BoundPredicate<T> pred) {
return gtEq((BoundReference<T>) pred.term(), (BoundReference<T>) termPred.rightTerm());
case EQ:
return eq((BoundReference<T>) pred.term(), (BoundReference<T>) termPred.rightTerm());
// case NOT_EQ:
// return notEq((BoundReference<T>) pred.term(), literalPred.literal());
case NOT_EQ:
return notEq((BoundReference<T>) pred.term(), (BoundReference<T>) termPred.rightTerm());
default:
throw new IllegalStateException(
"Invalid operation for BoundTermPredicate: " + pred.op());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,10 @@ public <T> Boolean lt(BoundReference<T> ref, BoundReference<T> ref2) {
return ROWS_CANNOT_MATCH;
}

if (ref.type().typeId() != ref2.type().typeId()) {
return ROWS_MIGHT_MATCH;
}

if (checkLowerBounds(ref, ref2, id, id2, cmp -> cmp >= 0)) {
return ROWS_CANNOT_MATCH;
}
Expand Down Expand Up @@ -280,6 +284,10 @@ public <T> Boolean ltEq(BoundReference<T> ref, BoundReference<T> ref2) {
return ROWS_CANNOT_MATCH;
}

if (ref.type().typeId() != ref2.type().typeId()) {
return ROWS_MIGHT_MATCH;
}

if (checkLowerBounds(ref, ref2, id, id2, cmp -> cmp > 0)) {
return ROWS_CANNOT_MATCH;
}
Expand Down Expand Up @@ -319,6 +327,10 @@ public <T> Boolean gt(BoundReference<T> ref, BoundReference<T> ref2) {
return ROWS_CANNOT_MATCH;
}

if (ref.type().typeId() != ref2.type().typeId()) {
return ROWS_MIGHT_MATCH;
}

if (checkUpperBounds(ref, ref2, id, id2, cmp -> cmp <= 0)) {
return ROWS_CANNOT_MATCH;
}
Expand Down Expand Up @@ -358,6 +370,10 @@ public <T> Boolean gtEq(BoundReference<T> ref, BoundReference<T> ref2) {
return ROWS_CANNOT_MATCH;
}

if (ref.type().typeId() != ref2.type().typeId()) {
return ROWS_MIGHT_MATCH;
}

if (checkUpperBounds(ref, ref2, id, id2, cmp -> cmp < 0)) {
return ROWS_CANNOT_MATCH;
}
Expand Down Expand Up @@ -402,14 +418,17 @@ public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
@Override
public <T> Boolean eq(BoundReference<T> ref, BoundReference<T> ref2) {
Integer id = ref.fieldId();
Integer id2 = ref2.fieldId();

if (containsNullsOnly(id) || containsNaNsOnly(id)) {
if (containsNullsOnly(id)
|| containsNaNsOnly(id)
|| containsNullsOnly(id2)
|| containsNaNsOnly(id2)) {
return ROWS_CANNOT_MATCH;
}

Integer id2 = ref2.fieldId();
if (containsNullsOnly(id2) || containsNaNsOnly(id2)) {
return ROWS_CANNOT_MATCH;
if (ref.type().typeId() != ref2.type().typeId()) {
return ROWS_MIGHT_MATCH;
}

if (checkLowerBounds(ref, ref2, id, id2, cmp -> cmp > 0)) {
Expand All @@ -423,6 +442,20 @@ public <T> Boolean eq(BoundReference<T> ref, BoundReference<T> ref2) {
return ROWS_MIGHT_MATCH;
}

@Override
public <T> Boolean notEq(BoundReference<T> ref, Literal<T> lit) {
// because the bounds are not necessarily a min or max value, this cannot be answered using
// them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col.
return ROWS_MIGHT_MATCH;
}

@Override
public <T> Boolean notEq(BoundReference<T> ref, BoundReference<T> ref2) {
// because the bounds are not necessarily a min or max value, this cannot be answered using
// them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col.
return ROWS_MIGHT_MATCH;
}

private <T> boolean checkUpperBounds(
BoundReference<T> ref,
BoundReference<T> ref2,
Expand Down Expand Up @@ -466,13 +499,6 @@ private <T> boolean checkLowerBounds(
return false;
}

@Override
public <T> Boolean notEq(BoundReference<T> ref, Literal<T> lit) {
// because the bounds are not necessarily a min or max value, this cannot be answered using
// them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col.
return ROWS_MIGHT_MATCH;
}

@Override
public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) {
Integer id = ref.fieldId();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,10 @@ public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {

@Override
public <T> Boolean lt(BoundReference<T> ref, BoundReference<T> ref2) {
if (ref.type().typeId() != ref2.type().typeId()) {
return ROWS_MIGHT_MATCH;
}

int pos = Accessors.toPosition(ref.accessor());
int pos2 = Accessors.toPosition(ref2.accessor());
ByteBuffer lowerBound = stats.get(pos).lowerBound();
Expand Down Expand Up @@ -230,6 +234,10 @@ public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {

@Override
public <T> Boolean ltEq(BoundReference<T> ref, BoundReference<T> ref2) {
if (ref.type().typeId() != ref2.type().typeId()) {
return ROWS_MIGHT_MATCH;
}

int pos = Accessors.toPosition(ref.accessor());
int pos2 = Accessors.toPosition(ref2.accessor());
ByteBuffer lowerBound = stats.get(pos).lowerBound();
Expand Down Expand Up @@ -270,6 +278,10 @@ public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) {

@Override
public <T> Boolean gt(BoundReference<T> ref, BoundReference<T> ref2) {
if (ref.type().typeId() != ref2.type().typeId()) {
return ROWS_MIGHT_MATCH;
}

int pos = Accessors.toPosition(ref.accessor());
int pos2 = Accessors.toPosition(ref2.accessor());
ByteBuffer upperBound = stats.get(pos).upperBound();
Expand Down Expand Up @@ -310,6 +322,10 @@ public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {

@Override
public <T> Boolean gtEq(BoundReference<T> ref, BoundReference<T> ref2) {
if (ref.type().typeId() != ref2.type().typeId()) {
return ROWS_MIGHT_MATCH;
}

int pos = Accessors.toPosition(ref.accessor());
int pos2 = Accessors.toPosition(ref2.accessor());
ByteBuffer upperBound = stats.get(pos).upperBound();
Expand Down Expand Up @@ -355,6 +371,10 @@ public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {

@Override
public <T> Boolean eq(BoundReference<T> ref, BoundReference<T> ref2) {
if (ref.type().typeId() != ref2.type().typeId()) {
return ROWS_MIGHT_MATCH;
}

int pos = Accessors.toPosition(ref.accessor());
int pos2 = Accessors.toPosition(ref2.accessor());
PartitionFieldSummary fieldStats = stats.get(pos);
Expand Down Expand Up @@ -390,6 +410,13 @@ public <T> Boolean notEq(BoundReference<T> ref, Literal<T> lit) {
return ROWS_MIGHT_MATCH;
}

@Override
public <T> Boolean notEq(BoundReference<T> ref, BoundReference<T> ref2) {
// because the bounds are not necessarily a min or max value, this cannot be answered using
// them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col.
return ROWS_MIGHT_MATCH;
}

@Override
public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) {
int pos = Accessors.toPosition(ref.accessor());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,8 @@ public <T> Expression notNaN(BoundReference<T> ref) {
return NaNUtil.isNaN(ref.eval(struct)) ? alwaysFalse() : alwaysTrue();
}

// TODO implement ref to refs comparison

@Override
public <T> Expression lt(BoundReference<T> ref, Literal<T> lit) {
Comparator<T> cmp = lit.comparator();
Expand Down
28 changes: 27 additions & 1 deletion core/src/main/java/org/apache/iceberg/AllManifestsTable.java
Original file line number Diff line number Diff line change
Expand Up @@ -340,13 +340,18 @@ public <T> Boolean notNaN(BoundReference<T> ref) {
return ROWS_MIGHT_MATCH;
}

// TODO implement ref to refs comparison

@Override
public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
return compareSnapshotRef(ref, lit, compareResult -> compareResult < 0);
}

@Override
public <T> Boolean lt(BoundReference<T> ref, BoundReference<T> ref2) {
if (ref.type().typeId() != ref2.type().typeId()) {
return ROWS_MIGHT_MATCH;
}
return compareSnapshotRef(ref, ref2, compareResult -> compareResult < 0);
}

Expand All @@ -357,6 +362,9 @@ public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {

@Override
public <T> Boolean ltEq(BoundReference<T> ref, BoundReference<T> ref2) {
if (ref.type().typeId() != ref2.type().typeId()) {
return ROWS_MIGHT_MATCH;
}
return compareSnapshotRef(ref, ref2, compareResult -> compareResult <= 0);
}

Expand All @@ -367,6 +375,9 @@ public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) {

@Override
public <T> Boolean gt(BoundReference<T> ref, BoundReference<T> ref2) {
if (ref.type().typeId() != ref2.type().typeId()) {
return ROWS_MIGHT_MATCH;
}
return compareSnapshotRef(ref, ref2, compareResult -> compareResult > 0);
}

Expand All @@ -377,6 +388,9 @@ public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {

@Override
public <T> Boolean gtEq(BoundReference<T> ref, BoundReference<T> ref2) {
if (ref.type().typeId() != ref2.type().typeId()) {
return ROWS_MIGHT_MATCH;
}
return compareSnapshotRef(ref, ref2, compareResult -> compareResult >= 0);
}

Expand All @@ -387,6 +401,9 @@ public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {

@Override
public <T> Boolean eq(BoundReference<T> ref, BoundReference<T> ref2) {
if (ref.type().typeId() != ref2.type().typeId()) {
return ROWS_MIGHT_MATCH;
}
return compareSnapshotRef(ref, ref2, compareResult -> compareResult == 0);
}

Expand All @@ -395,6 +412,14 @@ public <T> Boolean notEq(BoundReference<T> ref, Literal<T> lit) {
return compareSnapshotRef(ref, lit, compareResult -> compareResult != 0);
}

@Override
public <T> Boolean notEq(BoundReference<T> ref, BoundReference<T> ref2) {
if (ref.type().typeId() != ref2.type().typeId()) {
return ROWS_MIGHT_MATCH;
}
return compareSnapshotRef(ref, ref2, compareResult -> compareResult != 0);
}

@Override
public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) {
if (isSnapshotRef(ref)) {
Expand Down Expand Up @@ -452,7 +477,8 @@ private <T> Boolean compareSnapshotRef(
*
* @param ref bound reference, comparison attempted only if reference is for
* reference_snapshot_id
* @param lit literal value to compare with snapshot id.
* @param ref2 l bound reference, comparison attempted only if reference is for
* reference_snapshot_id
* @param desiredResult function to apply to long comparator result, returns true if result is
* as expected.
* @return false if comparator does not achieve desired result, true otherwise
Expand Down
Loading