Do not accept NaN or mismatched types in Expressions

apache · rdblue · Dec 6, 2020 · Nov 4, 2020 · Nov 18, 2020 · Nov 23, 2020
commit d5e666399663685d5fc583a692c57c090de74de3
diff --git a/api/src/main/java/org/apache/iceberg/expressions/Expressions.java b/api/src/main/java/org/apache/iceberg/expressions/Expressions.java
@@ -26,6 +26,7 @@
 import org.apache.iceberg.transforms.Transform;
 import org.apache.iceberg.transforms.Transforms;
 import org.apache.iceberg.types.Types;
+import org.apache.iceberg.util.NaNUtil;
 
 /**
  * Factory methods for creating {@link Expression expressions}.
@@ -140,50 +141,62 @@ public static <T> UnboundPredicate<T> notNaN(UnboundTerm<T> expr) {
   }
 
   public static <T> UnboundPredicate<T> lessThan(String name, T value) {
+    validateInput("lessThan", value);
     return new UnboundPredicate<>(Expression.Operation.LT, ref(name), value);
   }
 
   public static <T> UnboundPredicate<T> lessThan(UnboundTerm<T> expr, T value) {
+    validateInput("lessThan", value);
     return new UnboundPredicate<>(Expression.Operation.LT, expr, value);
   }
 
   public static <T> UnboundPredicate<T> lessThanOrEqual(String name, T value) {
+    validateInput("lessThanOrEqual", value);
     return new UnboundPredicate<>(Expression.Operation.LT_EQ, ref(name), value);
   }
 
   public static <T> UnboundPredicate<T> lessThanOrEqual(UnboundTerm<T> expr, T value) {
+    validateInput("lessThanOrEqual", value);
     return new UnboundPredicate<>(Expression.Operation.LT_EQ, expr, value);
   }
 
   public static <T> UnboundPredicate<T> greaterThan(String name, T value) {
+    validateInput("greaterThan", value);
     return new UnboundPredicate<>(Expression.Operation.GT, ref(name), value);
   }
 
   public static <T> UnboundPredicate<T> greaterThan(UnboundTerm<T> expr, T value) {
+    validateInput("greaterThan", value);
     return new UnboundPredicate<>(Expression.Operation.GT, expr, value);
   }
 
   public static <T> UnboundPredicate<T> greaterThanOrEqual(String name, T value) {
+    validateInput("greaterThanOrEqual", value);
     return new UnboundPredicate<>(Expression.Operation.GT_EQ, ref(name), value);
   }
 
   public static <T> UnboundPredicate<T> greaterThanOrEqual(UnboundTerm<T> expr, T value) {
+    validateInput("greaterThanOrEqual", value);
     return new UnboundPredicate<>(Expression.Operation.GT_EQ, expr, value);
   }
 
   public static <T> UnboundPredicate<T> equal(String name, T value) {
+    validateInput("equal", value);
     return new UnboundPredicate<>(Expression.Operation.EQ, ref(name), value);
   }
 
   public static <T> UnboundPredicate<T> equal(UnboundTerm<T> expr, T value) {
+    validateInput("equal", value);
     return new UnboundPredicate<>(Expression.Operation.EQ, expr, value);
   }
 
   public static <T> UnboundPredicate<T> notEqual(String name, T value) {
+    validateInput("notEqual", value);
     return new UnboundPredicate<>(Expression.Operation.NOT_EQ, ref(name), value);
   }
 
   public static <T> UnboundPredicate<T> notEqual(UnboundTerm<T> expr, T value) {
+    validateInput("notEqual", value);
     return new UnboundPredicate<>(Expression.Operation.NOT_EQ, expr, value);
   }
 
@@ -232,6 +245,7 @@ public static <T> UnboundPredicate<T> notIn(UnboundTerm<T> expr, Iterable<T> val
   }
 
   public static <T> UnboundPredicate<T> predicate(Operation op, String name, T value) {
+    validateInput(op.toString(), value);
     return predicate(op, name, Literals.from(value));
   }
 
@@ -243,6 +257,7 @@ public static <T> UnboundPredicate<T> predicate(Operation op, String name, Liter
   }
 
   public static <T> UnboundPredicate<T> predicate(Operation op, String name, Iterable<T> values) {
+    validateInput(op.toString(), values);
     return predicate(op, ref(name), values);
   }
 
@@ -254,9 +269,36 @@ public static <T> UnboundPredicate<T> predicate(Operation op, String name) {
   }
 
   private static <T> UnboundPredicate<T> predicate(Operation op, UnboundTerm<T> expr, Iterable<T> values) {
+    validateInput(op.toString(), values);
     return new UnboundPredicate<>(op, expr, values);
   }
 
+  /**
+   * Rejects a NaN literal before a predicate is created from it.
+   *
+   * @param op name of the predicate being created, reported in the error message
+   * @param value literal value to check
+   * @throws IllegalArgumentException if {@code NaNUtil.isNaN(value)} is true
+   */
+  private static <T> void validateInput(String op, T value) {
+    // template + argument: the message is only formatted when the check fails
+    Preconditions.checkArgument(!NaNUtil.isNaN(value), "Cannot create %s predicate with NaN", op);
+  }
+
+  /**
+   * Rejects a collection of literals if any element is NaN.
+   *
+   * @param op name of the predicate being created, reported in the error message
+   * @param values literal values to check
+   * @throws IllegalArgumentException if {@code NaNUtil.isNaN} is true for any element
+   */
+  private static <T> void validateInput(String op, Iterable<T> values) {
+    // check element by element instead of copying the values into a temporary list
+    for (T value : values) {
+      validateInput(op, value);
+    }
+  }
+
   public static True alwaysTrue() {
     return True.INSTANCE;
   }

diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java
@@ -134,8 +134,7 @@ public <T> Boolean notNull(BoundReference<T> ref) {
       int pos = Accessors.toPosition(ref.accessor());
       // containsNull encodes whether at least one partition value is null, lowerBound is null if
       // all partition values are null.
-      ByteBuffer lowerBound = stats.get(pos).lowerBound();
-      if (lowerBound == null) {
+      if (stats.get(pos).containsNull() && stats.get(pos).lowerBound() == null) {
         return ROWS_CANNOT_MATCH; // all values are null
       }
 
@@ -147,8 +146,7 @@ public <T> Boolean isNaN(BoundReference<T> ref) {
       int pos = Accessors.toPosition(ref.accessor());
       // containsNull encodes whether at least one partition value is null, lowerBound is null if
       // all partition values are null.
-      ByteBuffer lowerBound = stats.get(pos).lowerBound();
-      if (lowerBound == null) {
+      if (stats.get(pos).containsNull() && stats.get(pos).lowerBound() == null) {
         return ROWS_CANNOT_MATCH; // all values are null
       }
 

diff --git a/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java b/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java
@@ -30,7 +30,6 @@
 import org.apache.iceberg.types.Type;
 import org.apache.iceberg.types.Types.StructType;
 import org.apache.iceberg.util.CharSequenceSet;
-import org.apache.iceberg.util.NaNUtil;
 
 public class UnboundPredicate<T> extends Predicate<T, UnboundTerm<T>> implements Unbound<T, Expression> {
   private static final Joiner COMMA = Joiner.on(", ");
@@ -130,25 +129,35 @@ private Expression bindUnaryOperation(BoundTerm<T> boundTerm) {
         }
         return new BoundUnaryPredicate<>(Operation.NOT_NULL, boundTerm);
       case IS_NAN:
-        return toIsNaNExpression(boundTerm);
+        if (floatingType(boundTerm.type().typeId())) {
+          return new BoundUnaryPredicate<>(Operation.IS_NAN, boundTerm);
+        } else {
+          throw new ValidationException("IsNaN cannot be used with a non-floating-point column");
+        }
       case NOT_NAN:
-        return toNotNaNExpression(boundTerm);
+        if (floatingType(boundTerm.type().typeId())) {
+          return new BoundUnaryPredicate<>(Operation.NOT_NAN, boundTerm);
+        } else {
+          throw new ValidationException("NotNaN cannot be used with a non-floating-point column");
+        }
       default:
         throw new ValidationException("Operation must be IS_NULL, NOT_NULL, IS_NAN, or NOT_NAN");
     }
   }
 
-  private Expression bindLiteralOperation(BoundTerm<T> boundTerm) {
-    return bindLiteralOperation(boundTerm, op(), literal().to(boundTerm.type()));
+  private boolean floatingType(Type.TypeID typeID) {
+    return typeID == Type.TypeID.FLOAT || typeID == Type.TypeID.DOUBLE;
   }
 
-  private Expression bindLiteralOperation(BoundTerm<T> boundTerm, Operation op, Literal<T> lit) {
+  private Expression bindLiteralOperation(BoundTerm<T> boundTerm) {
+    Literal<T> lit = literal().to(boundTerm.type());
+
     if (lit == null) {
       throw new ValidationException("Invalid value for conversion to type %s: %s (%s)",
           boundTerm.type(), literal().value(), literal().value().getClass().getName());
 
     } else if (lit == Literals.aboveMax()) {
-      switch (op) {
+      switch (op()) {
         case LT:
         case LT_EQ:
         case NOT_EQ:
@@ -159,7 +168,7 @@ private Expression bindLiteralOperation(BoundTerm<T> boundTerm, Operation op, Li
           return Expressions.alwaysFalse();
       }
     } else if (lit == Literals.belowMin()) {
-      switch (op) {
+      switch (op()) {
         case GT:
         case GT_EQ:
         case NOT_EQ:
@@ -169,42 +178,10 @@ private Expression bindLiteralOperation(BoundTerm<T> boundTerm, Operation op, Li
         case EQ:
           return Expressions.alwaysFalse();
       }
-    } else if (NaNUtil.isNaN(lit.value())) {
-      switch (op) {
-        case GT:
-        case GT_EQ:
-        case LT:
-        case LT_EQ:
-          throw new IllegalArgumentException(String.format("Cannot perform operation %s with value NaN", op));
-        case EQ:
-          return toIsNaNExpression(boundTerm);
-        case NOT_EQ:
-          return toNotNaNExpression(boundTerm);
-      }
     }
 
     // TODO: translate truncate(col) == value to startsWith(value)
-    return new BoundLiteralPredicate<>(op, boundTerm, lit);
-  }
-
-  private Expression toIsNaNExpression(BoundTerm<T> boundTerm) {
-    if (typeIncludesNaN(boundTerm.type().typeId())) {
-      return new BoundUnaryPredicate<>(Operation.IS_NAN, boundTerm);
-    } else {
-      return Expressions.alwaysFalse();
-    }
-  }
-
-  private Expression toNotNaNExpression(BoundTerm<T> boundTerm) {
-    if (typeIncludesNaN(boundTerm.type().typeId())) {
-      return new BoundUnaryPredicate<>(Operation.NOT_NAN, boundTerm);
-    } else {
-      return Expressions.alwaysTrue();
-    }
-  }
-
-  private boolean typeIncludesNaN(Type.TypeID typeID) {
-    return Type.TypeID.DOUBLE.equals(typeID) || Type.TypeID.FLOAT.equals(typeID);
+    return new BoundLiteralPredicate<>(op(), boundTerm, lit);
   }
 
   private Expression bindInOperation(BoundTerm<T> boundTerm) {
@@ -232,9 +209,9 @@ private Expression bindInOperation(BoundTerm<T> boundTerm) {
     if (literalSet.size() == 1) {
       switch (op()) {
         case IN:
-          return bindLiteralOperation(boundTerm, Operation.EQ, Iterables.get(convertedLiterals, 0));
+          return new BoundLiteralPredicate<>(Operation.EQ, boundTerm, Iterables.get(convertedLiterals, 0));
         case NOT_IN:
-          return bindLiteralOperation(boundTerm, Operation.NOT_EQ, Iterables.get(convertedLiterals, 0));
+          return new BoundLiteralPredicate<>(Operation.NOT_EQ, boundTerm, Iterables.get(convertedLiterals, 0));
         default:
           throw new ValidationException("Operation must be IN or NOT_IN");
       }

diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionHelpers.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionHelpers.java
@@ -19,7 +19,9 @@
 
 package org.apache.iceberg.expressions;
 
+import java.util.concurrent.Callable;
 import org.apache.iceberg.AssertHelpers;
+import org.apache.iceberg.transforms.Transforms;
 import org.apache.iceberg.types.Types;
 import org.apache.iceberg.types.Types.NestedField;
 import org.apache.iceberg.types.Types.StructType;
@@ -45,6 +47,8 @@
 import static org.apache.iceberg.expressions.Expressions.notIn;
 import static org.apache.iceberg.expressions.Expressions.notNull;
 import static org.apache.iceberg.expressions.Expressions.or;
+import static org.apache.iceberg.expressions.Expressions.predicate;
+import static org.apache.iceberg.expressions.Expressions.ref;
 import static org.apache.iceberg.expressions.Expressions.rewriteNot;
 import static org.apache.iceberg.expressions.Expressions.truncate;
 import static org.apache.iceberg.expressions.Expressions.year;
@@ -187,4 +191,44 @@ public void testMultiAnd() {
 
     Assert.assertEquals(expected.toString(), actual.toString());
   }
+
+  @Test
+  public void testInvalidateNaNInput() {
+    assertInvalidateNaNThrows("lessThan", () -> lessThan("a", Double.NaN));
+    assertInvalidateNaNThrows("lessThan", () -> lessThan(self("a"), Double.NaN));
+
+    assertInvalidateNaNThrows("lessThanOrEqual", () -> lessThanOrEqual("a", Double.NaN));
+    assertInvalidateNaNThrows("lessThanOrEqual", () -> lessThanOrEqual(self("a"), Double.NaN));
+
+    assertInvalidateNaNThrows("greaterThan", () -> greaterThan("a", Double.NaN));
+    assertInvalidateNaNThrows("greaterThan", () -> greaterThan(self("a"), Double.NaN));
+
+    assertInvalidateNaNThrows("greaterThanOrEqual", () -> greaterThanOrEqual("a", Double.NaN));
+    assertInvalidateNaNThrows("greaterThanOrEqual", () -> greaterThanOrEqual(self("a"), Double.NaN));
+
+    assertInvalidateNaNThrows("equal", () -> equal("a", Double.NaN));
+    assertInvalidateNaNThrows("equal", () -> equal(self("a"), Double.NaN));
+
+    assertInvalidateNaNThrows("notEqual", () -> notEqual("a", Double.NaN));
+    assertInvalidateNaNThrows("notEqual", () -> notEqual(self("a"), Double.NaN));
+    // in/notIn are expected to report the Operation name (IN / NOT_IN) in the message
+    assertInvalidateNaNThrows("IN", () -> in("a", 1.0D, 2.0D, Double.NaN));
+    assertInvalidateNaNThrows("IN", () -> in(self("a"), 1.0D, 2.0D, Double.NaN));
+
+    assertInvalidateNaNThrows("NOT_IN", () -> notIn("a", 1.0D, 2.0D, Double.NaN));
+    assertInvalidateNaNThrows("NOT_IN", () -> notIn(self("a"), 1.0D, 2.0D, Double.NaN));
+    // the generic predicate(op, name, value) factory is also expected to report the Operation name
+    assertInvalidateNaNThrows("EQ", () -> predicate(Expression.Operation.EQ, "a", Double.NaN));
+  }
+
+  private void assertInvalidateNaNThrows(String operation, Callable<UnboundPredicate<Double>> callable) {
+    // every factory is expected to fail with the same message, parameterized by the operation name
+    String expected = String.format("Cannot create %s predicate with NaN", operation);
+    AssertHelpers.assertThrows("Should invalidate NaN input", IllegalArgumentException.class, expected, callable);
+  }
+
+  private <T> UnboundTerm<T> self(String name) {
+    return new UnboundTransform<>(ref(name), Transforms.identity(Types.DoubleType.get()));
+  }
+
 }
diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java
@@ -92,7 +92,7 @@ public class TestInclusiveManifestEvaluator {
           new TestHelpers.TestFieldSummary(false,
               toByteBuffer(Types.FloatType.get(), 0F),
               toByteBuffer(Types.FloatType.get(), 20F)),
-          new TestHelpers.TestFieldSummary(false, null, null)
+          new TestHelpers.TestFieldSummary(true, null, null)
       ));
 
   @Test