Skip to content

Commit fab4a5f

Browse files
authored
API: add isNaN and notNaN predicates (#1747)
1 parent 61702d1 commit fab4a5f

28 files changed

+937
-55
lines changed

api/src/main/java/org/apache/iceberg/expressions/BoundUnaryPredicate.java

+10
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919

2020
package org.apache.iceberg.expressions;
2121

22+
import org.apache.iceberg.util.NaNUtil;
23+
2224
public class BoundUnaryPredicate<T> extends BoundPredicate<T> {
2325
BoundUnaryPredicate(Operation op, BoundTerm<T> term) {
2426
super(op, term);
@@ -46,6 +48,10 @@ public boolean test(T value) {
4648
return value == null;
4749
case NOT_NULL:
4850
return value != null;
51+
case IS_NAN:
52+
return NaNUtil.isNaN(value);
53+
case NOT_NAN:
54+
return !NaNUtil.isNaN(value);
4955
default:
5056
throw new IllegalStateException("Invalid operation for BoundUnaryPredicate: " + op());
5157
}
@@ -58,6 +64,10 @@ public String toString() {
5864
return "is_null(" + term() + ")";
5965
case NOT_NULL:
6066
return "not_null(" + term() + ")";
67+
case IS_NAN:
68+
return "is_nan(" + term() + ")";
69+
case NOT_NAN:
70+
return "not_nan(" + term() + ")";
6171
default:
6272
return "Invalid unary predicate: operation = " + op();
6373
}

api/src/main/java/org/apache/iceberg/expressions/Evaluator.java

+11
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import org.apache.iceberg.StructLike;
2626
import org.apache.iceberg.expressions.ExpressionVisitors.BoundVisitor;
2727
import org.apache.iceberg.types.Types.StructType;
28+
import org.apache.iceberg.util.NaNUtil;
2829

2930
/**
3031
* Evaluates an {@link Expression} for data described by a {@link StructType}.
@@ -91,6 +92,16 @@ public <T> Boolean notNull(Bound<T> valueExpr) {
9192
return valueExpr.eval(struct) != null;
9293
}
9394

95+
@Override
96+
public <T> Boolean isNaN(Bound<T> valueExpr) {
97+
return NaNUtil.isNaN(valueExpr.eval(struct));
98+
}
99+
100+
@Override
101+
public <T> Boolean notNaN(Bound<T> valueExpr) {
102+
return !NaNUtil.isNaN(valueExpr.eval(struct));
103+
}
104+
94105
@Override
95106
public <T> Boolean lt(Bound<T> valueExpr, Literal<T> lit) {
96107
Comparator<T> cmp = lit.comparator();

api/src/main/java/org/apache/iceberg/expressions/Expression.java

+6
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ enum Operation {
3030
FALSE,
3131
IS_NULL,
3232
NOT_NULL,
33+
IS_NAN,
34+
NOT_NAN,
3335
LT,
3436
LT_EQ,
3537
GT,
@@ -52,6 +54,10 @@ public Operation negate() {
5254
return Operation.NOT_NULL;
5355
case NOT_NULL:
5456
return Operation.IS_NULL;
57+
case IS_NAN:
58+
return Operation.NOT_NAN;
59+
case NOT_NAN:
60+
return Operation.IS_NAN;
5561
case LT:
5662
return Operation.GT_EQ;
5763
case LT_EQ:

api/src/main/java/org/apache/iceberg/expressions/ExpressionVisitors.java

+24
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,14 @@ public <T> R notNull(BoundReference<T> ref) {
7575
return null;
7676
}
7777

78+
public <T> R isNaN(BoundReference<T> ref) {
79+
throw new UnsupportedOperationException(this.getClass().getName() + " does not implement isNaN");
80+
}
81+
82+
public <T> R notNaN(BoundReference<T> ref) {
83+
throw new UnsupportedOperationException(this.getClass().getName() + " does not implement notNaN");
84+
}
85+
7886
/**
 * Handles a less-than predicate on a bound reference; this default implementation returns {@code null}.
 */
public <T> R lt(BoundReference<T> ref, Literal<T> lit) {
7987
return null;
8088
}
@@ -143,6 +151,10 @@ public <T> R predicate(BoundPredicate<T> pred) {
143151
return isNull((BoundReference<T>) pred.term());
144152
case NOT_NULL:
145153
return notNull((BoundReference<T>) pred.term());
154+
case IS_NAN:
155+
return isNaN((BoundReference<T>) pred.term());
156+
case NOT_NAN:
157+
return notNaN((BoundReference<T>) pred.term());
146158
default:
147159
throw new IllegalStateException("Invalid operation for BoundUnaryPredicate: " + pred.op());
148160
}
@@ -176,6 +188,14 @@ public <T> R notNull(Bound<T> expr) {
176188
return null;
177189
}
178190

191+
public <T> R isNaN(Bound<T> expr) {
192+
throw new UnsupportedOperationException(this.getClass().getName() + " does not implement isNaN");
193+
}
194+
195+
public <T> R notNaN(Bound<T> expr) {
196+
throw new UnsupportedOperationException(this.getClass().getName() + " does not implement notNaN");
197+
}
198+
179199
/**
 * Handles a less-than predicate on a bound expression; this default implementation returns {@code null}.
 */
public <T> R lt(Bound<T> expr, Literal<T> lit) {
180200
return null;
181201
}
@@ -241,6 +261,10 @@ public <T> R predicate(BoundPredicate<T> pred) {
241261
return isNull(pred.term());
242262
case NOT_NULL:
243263
return notNull(pred.term());
264+
case IS_NAN:
265+
return isNaN(pred.term());
266+
case NOT_NAN:
267+
return notNaN(pred.term());
244268
default:
245269
throw new IllegalStateException("Invalid operation for BoundUnaryPredicate: " + pred.op());
246270
}

api/src/main/java/org/apache/iceberg/expressions/Expressions.java

+45-2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import org.apache.iceberg.transforms.Transform;
2727
import org.apache.iceberg.transforms.Transforms;
2828
import org.apache.iceberg.types.Types;
29+
import org.apache.iceberg.util.NaNUtil;
2930

3031
/**
3132
* Factory methods for creating {@link Expression expressions}.
@@ -123,51 +124,79 @@ public static <T> UnboundPredicate<T> notNull(UnboundTerm<T> expr) {
123124
return new UnboundPredicate<>(Expression.Operation.NOT_NULL, expr);
124125
}
125126

127+
public static <T> UnboundPredicate<T> isNaN(String name) {
128+
return new UnboundPredicate<>(Expression.Operation.IS_NAN, ref(name));
129+
}
130+
131+
public static <T> UnboundPredicate<T> isNaN(UnboundTerm<T> expr) {
132+
return new UnboundPredicate<>(Expression.Operation.IS_NAN, expr);
133+
}
134+
135+
public static <T> UnboundPredicate<T> notNaN(String name) {
136+
return new UnboundPredicate<>(Expression.Operation.NOT_NAN, ref(name));
137+
}
138+
139+
public static <T> UnboundPredicate<T> notNaN(UnboundTerm<T> expr) {
140+
return new UnboundPredicate<>(Expression.Operation.NOT_NAN, expr);
141+
}
142+
126143
public static <T> UnboundPredicate<T> lessThan(String name, T value) {
144+
validateInput("lessThan", value);
127145
return new UnboundPredicate<>(Expression.Operation.LT, ref(name), value);
128146
}
129147

130148
public static <T> UnboundPredicate<T> lessThan(UnboundTerm<T> expr, T value) {
149+
validateInput("lessThan", value);
131150
return new UnboundPredicate<>(Expression.Operation.LT, expr, value);
132151
}
133152

134153
public static <T> UnboundPredicate<T> lessThanOrEqual(String name, T value) {
154+
validateInput("lessThanOrEqual", value);
135155
return new UnboundPredicate<>(Expression.Operation.LT_EQ, ref(name), value);
136156
}
137157

138158
public static <T> UnboundPredicate<T> lessThanOrEqual(UnboundTerm<T> expr, T value) {
159+
validateInput("lessThanOrEqual", value);
139160
return new UnboundPredicate<>(Expression.Operation.LT_EQ, expr, value);
140161
}
141162

142163
public static <T> UnboundPredicate<T> greaterThan(String name, T value) {
164+
validateInput("greaterThan", value);
143165
return new UnboundPredicate<>(Expression.Operation.GT, ref(name), value);
144166
}
145167

146168
public static <T> UnboundPredicate<T> greaterThan(UnboundTerm<T> expr, T value) {
169+
validateInput("greaterThan", value);
147170
return new UnboundPredicate<>(Expression.Operation.GT, expr, value);
148171
}
149172

150173
public static <T> UnboundPredicate<T> greaterThanOrEqual(String name, T value) {
174+
validateInput("greaterThanOrEqual", value);
151175
return new UnboundPredicate<>(Expression.Operation.GT_EQ, ref(name), value);
152176
}
153177

154178
public static <T> UnboundPredicate<T> greaterThanOrEqual(UnboundTerm<T> expr, T value) {
179+
validateInput("greaterThanOrEqual", value);
155180
return new UnboundPredicate<>(Expression.Operation.GT_EQ, expr, value);
156181
}
157182

158183
public static <T> UnboundPredicate<T> equal(String name, T value) {
184+
validateInput("equal", value);
159185
return new UnboundPredicate<>(Expression.Operation.EQ, ref(name), value);
160186
}
161187

162188
public static <T> UnboundPredicate<T> equal(UnboundTerm<T> expr, T value) {
189+
validateInput("equal", value);
163190
return new UnboundPredicate<>(Expression.Operation.EQ, expr, value);
164191
}
165192

166193
public static <T> UnboundPredicate<T> notEqual(String name, T value) {
194+
validateInput("notEqual", value);
167195
return new UnboundPredicate<>(Expression.Operation.NOT_EQ, ref(name), value);
168196
}
169197

170198
public static <T> UnboundPredicate<T> notEqual(UnboundTerm<T> expr, T value) {
199+
validateInput("notEqual", value);
171200
return new UnboundPredicate<>(Expression.Operation.NOT_EQ, expr, value);
172201
}
173202

@@ -216,29 +245,43 @@ public static <T> UnboundPredicate<T> notIn(UnboundTerm<T> expr, Iterable<T> val
216245
}
217246

218247
public static <T> UnboundPredicate<T> predicate(Operation op, String name, T value) {
248+
validateInput(op.toString(), value);
219249
return predicate(op, name, Literals.from(value));
220250
}
221251

222252
public static <T> UnboundPredicate<T> predicate(Operation op, String name, Literal<T> lit) {
223-
Preconditions.checkArgument(op != Operation.IS_NULL && op != Operation.NOT_NULL,
253+
Preconditions.checkArgument(
254+
op != Operation.IS_NULL && op != Operation.NOT_NULL && op != Operation.IS_NAN && op != Operation.NOT_NAN,
224255
"Cannot create %s predicate inclusive a value", op);
225256
return new UnboundPredicate<T>(op, ref(name), lit);
226257
}
227258

228259
public static <T> UnboundPredicate<T> predicate(Operation op, String name, Iterable<T> values) {
260+
validateInput(op.toString(), values);
229261
return predicate(op, ref(name), values);
230262
}
231263

232264
public static <T> UnboundPredicate<T> predicate(Operation op, String name) {
233-
Preconditions.checkArgument(op == Operation.IS_NULL || op == Operation.NOT_NULL,
265+
Preconditions.checkArgument(
266+
op == Operation.IS_NULL || op == Operation.NOT_NULL || op == Operation.IS_NAN || op == Operation.NOT_NAN,
234267
"Cannot create %s predicate without a value", op);
235268
return new UnboundPredicate<>(op, ref(name));
236269
}
237270

238271
private static <T> UnboundPredicate<T> predicate(Operation op, UnboundTerm<T> expr, Iterable<T> values) {
272+
validateInput(op.toString(), values);
239273
return new UnboundPredicate<>(op, expr, values);
240274
}
241275

276+
private static <T> void validateInput(String op, T value) {
277+
Preconditions.checkArgument(!NaNUtil.isNaN(value), String.format("Cannot create %s predicate with NaN", op));
278+
}
279+
280+
private static <T> void validateInput(String op, Iterable<T> values) {
281+
Preconditions.checkArgument(Lists.newArrayList(values).stream().noneMatch(NaNUtil::isNaN),
282+
String.format("Cannot create %s predicate with NaN", op));
283+
}
284+
242285
/**
 * Returns the shared {@code True.INSTANCE} expression.
 */
public static True alwaysTrue() {
243286
return True.INSTANCE;
244287
}

api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java

+35
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ public boolean eval(ContentFile<?> file) {
7676
private class MetricsEvalVisitor extends BoundExpressionVisitor<Boolean> {
7777
private Map<Integer, Long> valueCounts = null;
7878
private Map<Integer, Long> nullCounts = null;
79+
private Map<Integer, Long> nanCounts = null;
7980
private Map<Integer, ByteBuffer> lowerBounds = null;
8081
private Map<Integer, ByteBuffer> upperBounds = null;
8182

@@ -93,6 +94,7 @@ private boolean eval(ContentFile<?> file) {
9394

9495
this.valueCounts = file.valueCounts();
9596
this.nullCounts = file.nullValueCounts();
97+
this.nanCounts = file.nanValueCounts();
9698
this.lowerBounds = file.lowerBounds();
9799
this.upperBounds = file.upperBounds();
98100

@@ -150,6 +152,34 @@ public <T> Boolean notNull(BoundReference<T> ref) {
150152
return ROWS_MIGHT_MATCH;
151153
}
152154

155+
@Override
156+
public <T> Boolean isNaN(BoundReference<T> ref) {
157+
Integer id = ref.fieldId();
158+
159+
if (nanCounts != null && nanCounts.containsKey(id) && nanCounts.get(id) == 0) {
160+
return ROWS_CANNOT_MATCH;
161+
}
162+
163+
// when there's no nanCounts information, but we already know the column only contains null,
164+
// it's guaranteed that there's no NaN value
165+
if (containsNullsOnly(id)) {
166+
return ROWS_CANNOT_MATCH;
167+
}
168+
169+
return ROWS_MIGHT_MATCH;
170+
}
171+
172+
@Override
173+
public <T> Boolean notNaN(BoundReference<T> ref) {
174+
Integer id = ref.fieldId();
175+
176+
if (containsNaNsOnly(id)) {
177+
return ROWS_CANNOT_MATCH;
178+
}
179+
180+
return ROWS_MIGHT_MATCH;
181+
}
182+
153183
@Override
154184
public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
155185
Integer id = ref.fieldId();
@@ -347,5 +377,10 @@ private boolean containsNullsOnly(Integer id) {
347377
nullCounts != null && nullCounts.containsKey(id) &&
348378
valueCounts.get(id) - nullCounts.get(id) == 0;
349379
}
380+
381+
private boolean containsNaNsOnly(Integer id) {
382+
return nanCounts != null && nanCounts.containsKey(id) &&
383+
valueCounts != null && nanCounts.get(id).equals(valueCounts.get(id));
384+
}
350385
}
351386
}

api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java

+19-2
Original file line numberDiff line numberDiff line change
@@ -134,14 +134,31 @@ public <T> Boolean notNull(BoundReference<T> ref) {
134134
int pos = Accessors.toPosition(ref.accessor());
135135
// containsNull encodes whether at least one partition value is null, lowerBound is null if
136136
// all partition values are null.
137-
ByteBuffer lowerBound = stats.get(pos).lowerBound();
138-
if (lowerBound == null) {
137+
if (stats.get(pos).containsNull() && stats.get(pos).lowerBound() == null) {
138+
return ROWS_CANNOT_MATCH; // all values are null
139+
}
140+
141+
return ROWS_MIGHT_MATCH;
142+
}
143+
144+
@Override
145+
public <T> Boolean isNaN(BoundReference<T> ref) {
146+
int pos = Accessors.toPosition(ref.accessor());
147+
// containsNull encodes whether at least one partition value is null, lowerBound is null if
148+
// all partition values are null.
149+
if (stats.get(pos).containsNull() && stats.get(pos).lowerBound() == null) {
139150
return ROWS_CANNOT_MATCH; // all values are null
140151
}
141152

142153
return ROWS_MIGHT_MATCH;
143154
}
144155

156+
/**
 * Always reports that rows might match a {@code notNaN} predicate.
 */
@Override
157+
public <T> Boolean notNaN(BoundReference<T> ref) {
158+
// we don't have enough information to tell whether every value is NaN, so the manifest cannot be skipped
159+
return ROWS_MIGHT_MATCH;
160+
}
161+
145162
@Override
146163
public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
147164
int pos = Accessors.toPosition(ref.accessor());

api/src/main/java/org/apache/iceberg/expressions/ResidualEvaluator.java

+11
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import org.apache.iceberg.StructLike;
2929
import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor;
3030
import org.apache.iceberg.transforms.Transform;
31+
import org.apache.iceberg.util.NaNUtil;
3132

3233
/**
3334
* Finds the residuals for an {@link Expression} the partitions in the given {@link PartitionSpec}.
@@ -152,6 +153,16 @@ public <T> Expression notNull(BoundReference<T> ref) {
152153
return (ref.eval(struct) != null) ? alwaysTrue() : alwaysFalse();
153154
}
154155

156+
@Override
157+
public <T> Expression isNaN(BoundReference<T> ref) {
158+
return NaNUtil.isNaN(ref.eval(struct)) ? alwaysTrue() : alwaysFalse();
159+
}
160+
161+
@Override
162+
public <T> Expression notNaN(BoundReference<T> ref) {
163+
return NaNUtil.isNaN(ref.eval(struct)) ? alwaysFalse() : alwaysTrue();
164+
}
165+
155166
@Override
156167
public <T> Expression lt(BoundReference<T> ref, Literal<T> lit) {
157168
Comparator<T> cmp = lit.comparator();

0 commit comments

Comments
 (0)