Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: add isNaN and notNaN predicates #1747

Merged
merged 6 commits into from
Dec 6, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
API: add isNaN and notNaN predicates
  • Loading branch information
yyanyy committed Nov 26, 2020
commit 951699acce5dd52b5dd008a20619534f9362570d
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,35 @@ public boolean test(T value) {
return value == null;
case NOT_NULL:
return value != null;
case IS_NAN:
return isNaN(value);
case NOT_NAN:
return !isNaN(value);
default:
throw new IllegalStateException("Invalid operation for BoundUnaryPredicate: " + op());
}
}

private boolean isNaN(T value) {
if (value instanceof Double) {
return Double.isNaN((Double) value);
} else if (value instanceof Float) {
return Float.isNaN((Float) value);
}
return false;
}

@Override
public String toString() {
switch (op()) {
case IS_NULL:
return "is_null(" + term() + ")";
case NOT_NULL:
return "not_null(" + term() + ")";
case IS_NAN:
return "is_nan(" + term() + ")";
case NOT_NAN:
return "not_nan(" + term() + ")";
default:
return "Invalid unary predicate: operation = " + op();
}
Expand Down
21 changes: 21 additions & 0 deletions api/src/main/java/org/apache/iceberg/expressions/Evaluator.java
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,27 @@ public <T> Boolean notNull(Bound<T> valueExpr) {
return valueExpr.eval(struct) != null;
}

@Override
public <T> Boolean isNaN(Bound<T> valueExpr) {
T value = valueExpr.eval(struct);
if (value == null) {
return false;
}

if (value instanceof Double) {
return Double.isNaN((Double) value);
} else if (value instanceof Float) {
return Float.isNaN((Float) value);
} else {
return false;
}
}

@Override
public <T> Boolean notNaN(Bound<T> ref) {
return !isNaN(ref);
}

@Override
public <T> Boolean lt(Bound<T> valueExpr, Literal<T> lit) {
Comparator<T> cmp = lit.comparator();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ enum Operation {
FALSE,
IS_NULL,
NOT_NULL,
IS_NAN,
NOT_NAN,
LT,
LT_EQ,
GT,
Expand All @@ -52,6 +54,10 @@ public Operation negate() {
return Operation.NOT_NULL;
case NOT_NULL:
return Operation.IS_NULL;
case IS_NAN:
return Operation.NOT_NAN;
case NOT_NAN:
return Operation.IS_NAN;
case LT:
return Operation.GT_EQ;
case LT_EQ:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ public <T> R notNull(BoundReference<T> ref) {
return null;
}

public <T> R isNaN(BoundReference<T> ref) {
return null;
}

public <T> R notNaN(BoundReference<T> ref) {
return null;
}

public <T> R lt(BoundReference<T> ref, Literal<T> lit) {
return null;
}
Expand Down Expand Up @@ -143,6 +151,10 @@ public <T> R predicate(BoundPredicate<T> pred) {
return isNull((BoundReference<T>) pred.term());
case NOT_NULL:
return notNull((BoundReference<T>) pred.term());
case IS_NAN:
return isNaN((BoundReference<T>) pred.term());
case NOT_NAN:
return notNaN((BoundReference<T>) pred.term());
default:
throw new IllegalStateException("Invalid operation for BoundUnaryPredicate: " + pred.op());
}
Expand Down Expand Up @@ -176,6 +188,14 @@ public <T> R notNull(Bound<T> expr) {
return null;
}

public <T> R isNaN(Bound<T> expr) {
return null;
}

public <T> R notNaN(Bound<T> expr) {
return null;
}

public <T> R lt(Bound<T> expr, Literal<T> lit) {
return null;
}
Expand Down Expand Up @@ -241,6 +261,10 @@ public <T> R predicate(BoundPredicate<T> pred) {
return isNull(pred.term());
case NOT_NULL:
return notNull(pred.term());
case IS_NAN:
return isNaN(pred.term());
case NOT_NAN:
return notNaN(pred.term());
default:
throw new IllegalStateException("Invalid operation for BoundUnaryPredicate: " + pred.op());
}
Expand Down
22 changes: 20 additions & 2 deletions api/src/main/java/org/apache/iceberg/expressions/Expressions.java
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,22 @@ public static <T> UnboundPredicate<T> notNull(UnboundTerm<T> expr) {
return new UnboundPredicate<>(Expression.Operation.NOT_NULL, expr);
}

public static <T> UnboundPredicate<T> isNaN(String name) {
return new UnboundPredicate<>(Expression.Operation.IS_NAN, ref(name));
}

public static <T> UnboundPredicate<T> isNaN(UnboundTerm<T> expr) {
return new UnboundPredicate<>(Expression.Operation.IS_NAN, expr);
}

public static <T> UnboundPredicate<T> notNaN(String name) {
return new UnboundPredicate<>(Expression.Operation.NOT_NAN, ref(name));
}

public static <T> UnboundPredicate<T> notNaN(UnboundTerm<T> expr) {
return new UnboundPredicate<>(Expression.Operation.NOT_NAN, expr);
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we also need to update the equality predicate to catch NaN and rewrite to isNaN?

Copy link
Contributor Author

@yyanyy yyanyy Nov 12, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I originally thought to update SparkFilters to do the rewrite, but this is a much better place. Thanks for the suggestion!

Edit: what do you think about doing rewriting eq within UnboundPredicate? And for rewriting in, I was thinking to let Expressions.in to do the rewrite logic of or(isNaN, in)/and(notNaN, notIn), but that means it will return Expression instead of Predicate; does that align with your thinking?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not fully understand what you mean by "rewrite logic of or(isNaN, in)/and(notNaN, notIn)" when you talk about rewriting in. Can you give some examples of what predicate are you trying to support?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So now since we want to handle NaN in in predicate, for query in(1,2, NaN) to avoid checking for NaN in in evaluation all the time we can transform that to in(1,2) or isNaN, and notIn(1,2,NaN) to notIn(1, 2) and notNaN. The problem is where to do that, since in and notIn are both predicate, and if we are extending them we are transforming a predicate (simpler form) to an expression (complex form), and I think there's no such case in the current code base, and it would touch a lot of existing test cases for this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay so it's what I thought, just a bit confused by the notation.

So for eq, what is the benefit of doing it in UnboundedPredicate versus just rewriting it in the Expressions?

For in, I think it is a more complex question.We need to figure out:

  1. should syntax like in(1,2,NaN) be supported, given it can be written as is_nan or in(1,2) on client side
  2. if so, Expressions.in should return Expression as you said, which looks fine to me because the only caller SparkFilters.convert also returns an Expression in the end.
  3. maybe we should tackle this in another PR to keep changes concise.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the quick response! Yeah I think the amount of change to method return type/tests is not a concern now. I just wasn't entirely sure if rewriting eq to isNan in Expressions will help with catching problems early (comparing to rewriting in UnboundPredicate), since it seems to me that the related code will not have a chance to throw any exception until bind() is called?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it isn't much earlier in that case. Maybe that actually exposes a problem with rewriting, too.

Expressions.equal("c", Double.NaN) if c is not a floating point column would result in isNaN, which should be rejected while binding expressions. You could argue that it should rewrite to alwaysFalse instead following the same logic as Expressions.equal("intCol", Long.MAX_VALUE) -- it can't be true.

I think that it would be better to be strict and reject binding in that case because something is clearly wrong. I think a lot of the time, that kind of error would happen when columns are misaligned or predicates are incorrectly converted.

If the result of those errors is just to fail in expression binding, then why rewrite at all? Maybe we should just reject NaN in any predicate and force people to explicitly use isNaN and notNaN. That way we do throw an exception much earlier in all cases. Plus, we wouldn't have to worry about confusion over whether NaN is equal to itself: in Java, a Double that holds NaN is equal to itself, but a primitive is not. 😕

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, those are some good points! To make sure I understand correctly/know how to move forward, I have some questions:

  • If I understand correctly, to reject NaN in any predicate sounds like we might go back to the idea of rewriting equals in SparkFilters (or in general, the integration point with engines during the query-to-expression translation); or maybe even earlier than that, to let engines to support syntax of is NaN?
  • Since to know if a query is eligible to be translated to isNaN there has to be some place that ensures the type has to be either double or float, and in iceberg code base we will only know this during binding; are we able to rely on engine to do this check before translating query to Expression?
  • And seems like this may only impact eq as we decided to do input validation on other lg/lteq/gt/gteq and in anyway?
  • And if we start to throw exceptions when the code passes in NaN to eq, that may sound backward incompatible until the engine starts to rewrite NaN?

I guess the conversation is starting to get too detailed, if you wouldn't mind I'll try to follow up on Slack tomorrow and then post the conclusion here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I understand correctly, to reject NaN in any predicate sounds like we might go back to the idea of rewriting equals in SparkFilters

Yes. If the engine generally uses d = NaN then we can convert that to isNaN. But that would be engine-dependent and the Iceberg expression API would not support equals with NaN.

are we able to rely on engine to do this check before translating query to Expression?

I think so. Most engines will optimize the SQL expressions and handle this already. If not, then it would result in an exception from Iceberg to the user. I think that's okay, too, because as I said above, we want to fail if a NaN is used in an expression with a non-floating-point column, not rewrite to false.

And seems like this may only impact eq as we decided to do input validation on other lg/lteq/gt/gteq and in anyway?

Yes. This makes all of the handling in Expressions consistent: always reject NaN values.

that may sound backward incompatible until the engine starts to rewrite NaN?

I'm not convinced either way. You could argue that d = NaN is ambiguous and that rejecting it is now fixing a bug. That's certainly the case with d > NaN, which is not defined. On the other hand, there was some bevhavior before that will now no longer work. So I'd be up for fixing this in Flink and Spark conversions as soon as we can.

Feel free to ping me on Slack!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for the explanation! I think now I understand the full picture. I think I've addressed everything except for rewriting in SparkFilters and other engines, which I think this PR is already too big so I'll submit a separate PR for it (likely next week).


public static <T> UnboundPredicate<T> lessThan(String name, T value) {
return new UnboundPredicate<>(Expression.Operation.LT, ref(name), value);
}
Expand Down Expand Up @@ -220,7 +236,8 @@ public static <T> UnboundPredicate<T> predicate(Operation op, String name, T val
}

public static <T> UnboundPredicate<T> predicate(Operation op, String name, Literal<T> lit) {
Preconditions.checkArgument(op != Operation.IS_NULL && op != Operation.NOT_NULL,
Preconditions.checkArgument(
op != Operation.IS_NULL && op != Operation.NOT_NULL && op != Operation.IS_NAN && op != Operation.NOT_NAN,
"Cannot create %s predicate inclusive a value", op);
return new UnboundPredicate<T>(op, ref(name), lit);
}
Expand All @@ -230,7 +247,8 @@ public static <T> UnboundPredicate<T> predicate(Operation op, String name, Itera
}

public static <T> UnboundPredicate<T> predicate(Operation op, String name) {
Preconditions.checkArgument(op == Operation.IS_NULL || op == Operation.NOT_NULL,
Preconditions.checkArgument(
op == Operation.IS_NULL || op == Operation.NOT_NULL || op == Operation.IS_NAN || op == Operation.NOT_NAN,
"Cannot create %s predicate without a value", op);
return new UnboundPredicate<>(op, ref(name));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ public boolean eval(ContentFile<?> file) {
private class MetricsEvalVisitor extends BoundExpressionVisitor<Boolean> {
private Map<Integer, Long> valueCounts = null;
private Map<Integer, Long> nullCounts = null;
private Map<Integer, Long> nanCounts = null;
private Map<Integer, ByteBuffer> lowerBounds = null;
private Map<Integer, ByteBuffer> upperBounds = null;

Expand All @@ -93,6 +94,7 @@ private boolean eval(ContentFile<?> file) {

this.valueCounts = file.valueCounts();
this.nullCounts = file.nullValueCounts();
this.nanCounts = file.nanValueCounts();
this.lowerBounds = file.lowerBounds();
this.upperBounds = file.upperBounds();

Expand Down Expand Up @@ -150,6 +152,53 @@ public <T> Boolean notNull(BoundReference<T> ref) {
return ROWS_MIGHT_MATCH;
}

@Override
public <T> Boolean isNaN(BoundReference<T> ref) {
Integer id = ref.fieldId();

if (nanCounts != null && nanCounts.containsKey(id) && nanCounts.get(id) == 0) {
return ROWS_CANNOT_MATCH;
}

// when there's no nanCounts information, but we already know the column only contains null,
// it's guaranteed that there's no NaN value
if (containsNullsOnly(id)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we define a similar containsNaNsOnly method to use in notNaN and for a similar use in isNull?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't define notNaN originally as I could directly return ROWS_CANNOT_MATCH when both nanCounts and valueCounts contain this column but numbers don't match, without going into the next block of logic (of checking upper == lower == NaN and null count == 0); but this advantage no longer exists since that block needs to be removed.

But I wasn't sure if we need it for isNull: currently in isNull() we are checking if nullCounts == 0 to return ROWS_CANNOT_MATCH, and I guess the only chance where we rely on containsNaNsOnly to return ROWS_CANNOT_MATCH is nullCounts for this column doesn't exist but nanCounts does. I personally feel the chance of this happening would be small, do you think we will run into this case often?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that the containsNaNsOnly logic will not be very useful as Yan said, but I think it is also valuable to have that private method just for readability.

Then the question reduces to: do we need to consider the case that null value metrics do not exist but NaN metrics do. For now I think the answer is no, because in all metrics modes NaN and null counters either both exist or both not exist.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good, I'll create a containsNaNsOnly for readability. Ryan, do you have comment on the other point?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with the reasoning. If we have NaN counts, then we should have null counts. No need to over-complicated the null logic with a check for when we don't have null counts but do have NaN counts. Good catch!

return ROWS_CANNOT_MATCH;
}

return ROWS_MIGHT_MATCH;
}

@Override
@SuppressWarnings("checkstyle:CyclomaticComplexity")
public <T> Boolean notNaN(BoundReference<T> ref) {
Integer id = ref.fieldId();

if (nanCounts != null && nanCounts.containsKey(id) &&
valueCounts != null && valueCounts.containsKey(id)) {
if (nanCounts.get(id).equals(valueCounts.get(id))) {
return ROWS_CANNOT_MATCH;
}

return ROWS_MIGHT_MATCH;
}

// for v1 table, when NaN could still be upper/lower bound,
// if upper == lower == NaN and null count == 0, the column will only contain NaN
if (nullCounts != null && nullCounts.getOrDefault(id, -1L) == 0 &&
upperBounds != null && upperBounds.containsKey(id) &&
lowerBounds != null && upperBounds.get(id).equals(lowerBounds.get(id))) {
T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));

if ((lower instanceof Double && Double.isNaN((Double) lower)) ||
(lower instanceof Float && Float.isNaN((Float) lower))) {
return ROWS_CANNOT_MATCH;
}
}

return ROWS_MIGHT_MATCH;
}

@Override
public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
Integer id = ref.fieldId();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,25 @@ public <T> Boolean notNull(BoundReference<T> ref) {
return ROWS_MIGHT_MATCH;
}

@Override
public <T> Boolean isNaN(BoundReference<T> ref) {
int pos = Accessors.toPosition(ref.accessor());
// containsNull encodes whether at least one partition value is null, lowerBound is null if
// all partition values are null.
ByteBuffer lowerBound = stats.get(pos).lowerBound();
if (lowerBound == null) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be safe, I think this should validate that containsNull is true.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean check for both containsNull and stats.get(pos).lowerBound() == null are true? When would lowerBound be null while the column doesn't contain null? I guess I'll also need to update notNull for this too (since I copied the logic from there)?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like #1803 is missing PartitionFieldSummary.containsNaN(), or is it in some other PR?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That should be doable, although I originally consider the scope of the NaN support to be only on manifest entry level, I wasn't sure if we want to extend it beyond that?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure of a case where it would happen, but containsNull is the source of truth for whether there are null values, not a missing bound value.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good, I'll add containsNull to both here and notNull. And looks like we do want to update PartitionFieldSummary, that I'll do in a separate pr.

return ROWS_CANNOT_MATCH; // all values are null
}

return ROWS_MIGHT_MATCH;
}

@Override
public <T> Boolean notNaN(BoundReference<T> ref) {
// we don't have enough information to tell if there is no NaN value
return ROWS_MIGHT_MATCH;
}

@Override
public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
int pos = Accessors.toPosition(ref.accessor());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,30 @@ public <T> Expression notNull(BoundReference<T> ref) {
return (ref.eval(struct) != null) ? alwaysTrue() : alwaysFalse();
}

@Override
public <T> Expression isNaN(BoundReference<T> ref) {
return isNaN(ref.eval(struct)) ? alwaysTrue() : alwaysFalse();
}

@Override
public <T> Expression notNaN(BoundReference<T> ref) {
return isNaN(ref.eval(struct)) ? alwaysFalse() : alwaysTrue();
}

private <T> boolean isNaN(T value) {
if (value == null) {
return false;
}

if (value instanceof Double) {
return Double.isNaN((Double) value);
} else if (value instanceof Float) {
return Float.isNaN((Float) value);
} else {
return false;
}
}

@Override
public <T> Expression lt(BoundReference<T> ref, Literal<T> lit) {
Comparator<T> cmp = lit.comparator();
Expand Down
Loading