Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

column to column comparisons for filtering file scans and row data #11152

Draft
wants to merge 17 commits into
base: main
Choose a base branch
from
Draft
Prev Previous commit
Next Next commit
test cases for inclusive metrics evaluator
  • Loading branch information
Jennifer Baldwin committed Sep 2, 2024
commit 00e9411c33b1d4826f690fca9bb14ad15365854e
Original file line number Diff line number Diff line change
Expand Up @@ -439,11 +439,11 @@ public <T> Boolean eq(BoundReference<T> ref, BoundReference<T> ref2) {
return ROWS_MIGHT_MATCH;
}

if (checkLowerBounds(ref, ref2, id, id2, cmp -> cmp > 0)) {
if (checkLowerToUpperBounds(ref, ref2, id, id2, cmp -> cmp > 0)) {
return ROWS_CANNOT_MATCH;
}

if (checkUpperBounds(ref, ref2, id, id2, cmp -> cmp < 0)) {
if (checkUpperToLowerBounds(ref, ref2, id, id2, cmp -> cmp < 0)) {
return ROWS_CANNOT_MATCH;
}

Expand Down Expand Up @@ -537,30 +537,6 @@ private <T> boolean checkUpperToLowerBounds(
return false;
}

/**
 * Compares the lower bounds of two columns and applies a predicate to the comparison result.
 *
 * @param ref reference whose type is used to decode the first lower bound and pick the comparator
 * @param ref2 reference whose type is used to decode the second lower bound
 * @param id key of the first column in {@code lowerBounds}
 * @param id2 key of the second column in {@code lowerBounds}
 * @param compare predicate applied to {@code comparator.compare(lower, lower2)}
 * @return true only when both lower bounds are present, neither is NaN, and {@code compare}
 *     accepts the comparison result; false in every other case
 */
private <T> boolean checkLowerBounds(
    BoundReference<T> ref,
    BoundReference<T> ref2,
    Integer id,
    Integer id2,
    java.util.function.Predicate<Integer> compare) {
  boolean bothBoundsPresent =
      lowerBounds != null && lowerBounds.containsKey(id) && lowerBounds.containsKey(id2);
  if (!bothBoundsPresent) {
    // without a lower bound for each column there is nothing to compare
    return false;
  }

  T firstLower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
  T secondLower = Conversions.fromByteBuffer(ref2.type(), lowerBounds.get(id2));

  // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
  if (NaNUtil.isNaN(firstLower) || NaNUtil.isNaN(secondLower)) {
    return false;
  }

  Comparator<Object> comparator = Comparators.forType(ref.type().asPrimitiveType());
  return compare.test(comparator.compare(firstLower, secondLower));
}

@Override
public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) {
Integer id = ref.fieldId();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import static org.apache.iceberg.expressions.Expressions.notNull;
import static org.apache.iceberg.expressions.Expressions.notStartsWith;
import static org.apache.iceberg.expressions.Expressions.or;
import static org.apache.iceberg.expressions.Expressions.predicate;
import static org.apache.iceberg.expressions.Expressions.startsWith;
import static org.apache.iceberg.types.Conversions.toByteBuffer;
import static org.apache.iceberg.types.Types.NestedField.optional;
Expand Down Expand Up @@ -71,7 +72,8 @@ public class TestInclusiveMetricsEvaluator {
optional(11, "all_nans_v1_stats", Types.FloatType.get()),
optional(12, "nan_and_null_only", Types.DoubleType.get()),
optional(13, "no_nan_stats", Types.DoubleType.get()),
optional(14, "some_empty", Types.StringType.get()));
optional(14, "some_empty", Types.StringType.get()),
optional(15, "id2", Types.IntegerType.get()));

private static final int INT_MIN_VALUE = 30;
private static final int INT_MAX_VALUE = 79;
Expand Down Expand Up @@ -115,13 +117,15 @@ public class TestInclusiveMetricsEvaluator {
1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE),
11, toByteBuffer(Types.FloatType.get(), Float.NaN),
12, toByteBuffer(Types.DoubleType.get(), Double.NaN),
14, toByteBuffer(Types.StringType.get(), "")),
14, toByteBuffer(Types.StringType.get(), ""),
15, toByteBuffer(Types.IntegerType.get(), INT_MIN_VALUE - 25)),
// upper bounds
ImmutableMap.of(
1, toByteBuffer(IntegerType.get(), INT_MAX_VALUE),
11, toByteBuffer(Types.FloatType.get(), Float.NaN),
12, toByteBuffer(Types.DoubleType.get(), Double.NaN),
14, toByteBuffer(Types.StringType.get(), "房东整租霍营小区二层两居室")));
14, toByteBuffer(Types.StringType.get(), "房东整租霍营小区二层两居室"),
15, toByteBuffer(Types.IntegerType.get(), INT_MIN_VALUE + 25)));

private static final DataFile FILE_2 =
new TestDataFile(
Expand All @@ -135,9 +139,14 @@ public class TestInclusiveMetricsEvaluator {
// nan value counts
null,
// lower bounds
ImmutableMap.of(3, toByteBuffer(StringType.get(), "aa")),
ImmutableMap.of(1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE),
3, toByteBuffer(StringType.get(), "aa"),
15, toByteBuffer(Types.IntegerType.get(), INT_MIN_VALUE + 25)
),
// upper bounds
ImmutableMap.of(3, toByteBuffer(StringType.get(), "dC")));
ImmutableMap.of(1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE + 10),
3, toByteBuffer(StringType.get(), "dC"),
15, toByteBuffer(Types.IntegerType.get(), INT_MIN_VALUE + 50)));

private static final DataFile FILE_3 =
new TestDataFile(
Expand All @@ -151,9 +160,14 @@ public class TestInclusiveMetricsEvaluator {
// nan value counts
null,
// lower bounds
ImmutableMap.of(3, toByteBuffer(StringType.get(), "1str1")),
ImmutableMap.of(1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE),
3, toByteBuffer(StringType.get(), "1str1"),
15, toByteBuffer(Types.IntegerType.get(), INT_MIN_VALUE - 25)),
// upper bounds
ImmutableMap.of(3, toByteBuffer(StringType.get(), "3str3")));
ImmutableMap.of(1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE + 10),
3, toByteBuffer(StringType.get(), "3str3"),
15, toByteBuffer(Types.IntegerType.get(), INT_MIN_VALUE - 5)
));

private static final DataFile FILE_4 =
new TestDataFile(
Expand All @@ -167,9 +181,13 @@ public class TestInclusiveMetricsEvaluator {
// nan value counts
null,
// lower bounds
ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc")),
ImmutableMap.of(1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE),
3, toByteBuffer(StringType.get(), "abc"),
15, toByteBuffer(Types.IntegerType.get(), INT_MIN_VALUE)),
// upper bounds
ImmutableMap.of(3, toByteBuffer(StringType.get(), "イロハニホヘト")));
ImmutableMap.of(1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE),
3, toByteBuffer(StringType.get(), "イロハニホヘト"),
15, toByteBuffer(Types.IntegerType.get(), INT_MIN_VALUE)));

@Test
public void testAllNulls() {
Expand Down Expand Up @@ -430,6 +448,23 @@ public void testIntegerLt() {
assertThat(shouldRead).as("Should read: may possible ids").isTrue();
}

// Checks file pruning for the column-to-column predicate id < id2.
// Bounds per the fixtures (assumes field 1 is "id" and field 15 is "id2"):
//   FILE:   id [30, 79], id2 [5, 55]     FILE_2: id [30, 40], id2 [55, 80]
//   FILE_3: id [30, 40], id2 [5, 25]     FILE_4: id [30, 30], id2 [30, 30]
// A file can only be skipped when the lower bound of id is not below the upper bound of id2.
@Test
public void testRefCompareIntegerLt() {
  boolean shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.LT, "id", "id2"))
          .eval(FILE);
  assertThat(shouldRead)
      .as("Should read: id range lower bound (30) is below upper bound id2 range (55)")
      .isTrue();

  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.LT, "id", "id2"))
          .eval(FILE_2);
  assertThat(shouldRead)
      .as("Should read: id range lower bound (30) is below upper bound id2 range (80)")
      .isTrue();

  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.LT, "id", "id2"))
          .eval(FILE_3);
  assertThat(shouldRead)
      .as("Should not read: id range lower bound (30) is not below upper bound id2 range (25)")
      .isFalse();

  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.LT, "id", "id2"))
          .eval(FILE_4);
  assertThat(shouldRead)
      .as("Should not read: id range lower bound (30) is not below upper bound id2 range (30)")
      .isFalse();
}

@Test
public void testIntegerLtEq() {
boolean shouldRead =
Expand All @@ -449,6 +484,23 @@ public void testIntegerLtEq() {
assertThat(shouldRead).as("Should read: many possible ids").isTrue();
}

// Checks file pruning for the column-to-column predicate id <= id2.
// Bounds per the fixtures (assumes field 1 is "id" and field 15 is "id2"):
//   FILE:   id [30, 79], id2 [5, 55]     FILE_2: id [30, 40], id2 [55, 80]
//   FILE_3: id [30, 40], id2 [5, 25]     FILE_4: id [30, 30], id2 [30, 30]
// A file can only be skipped when the lower bound of id is above the upper bound of id2.
@Test
public void testRefCompareIntegerLtEq() {
  boolean shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.LT_EQ, "id", "id2"))
          .eval(FILE);
  assertThat(shouldRead)
      .as("Should read: id range lower bound (30) is below upper bound id2 range (55)")
      .isTrue();

  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.LT_EQ, "id", "id2"))
          .eval(FILE_2);
  assertThat(shouldRead)
      .as("Should read: id range lower bound (30) is below upper bound id2 range (80)")
      .isTrue();

  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.LT_EQ, "id", "id2"))
          .eval(FILE_3);
  assertThat(shouldRead)
      .as("Should not read: id range lower bound (30) is above upper bound id2 range (25)")
      .isFalse();

  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.LT_EQ, "id", "id2"))
          .eval(FILE_4);
  assertThat(shouldRead)
      .as("Should read: id range lower bound (30) can be equal to upper bound id2 range (30)")
      .isTrue();
}

@Test
public void testIntegerGt() {
boolean shouldRead =
Expand All @@ -469,6 +521,23 @@ public void testIntegerGt() {
assertThat(shouldRead).as("Should read: may possible ids").isTrue();
}

// Checks file pruning for the column-to-column predicate id > id2.
// Bounds per the fixtures (assumes field 1 is "id" and field 15 is "id2"):
//   FILE:   id [30, 79], id2 [5, 55]     FILE_2: id [30, 40], id2 [55, 80]
//   FILE_3: id [30, 40], id2 [5, 25]     FILE_4: id [30, 30], id2 [30, 30]
// A file can only be skipped when the upper bound of id is not above the lower bound of id2.
@Test
public void testRefCompareIntegerGt() {
  boolean shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.GT, "id", "id2"))
          .eval(FILE);
  assertThat(shouldRead)
      .as("Should read: id range upper bound (79) is greater than lower bound id2 range (5)")
      .isTrue();

  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.GT, "id", "id2"))
          .eval(FILE_2);
  assertThat(shouldRead)
      .as("Should not read: id range upper bound (40) is not greater than lower bound id2 range (55)")
      .isFalse();

  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.GT, "id", "id2"))
          .eval(FILE_3);
  assertThat(shouldRead)
      .as("Should read: id range upper bound (40) is greater than lower bound id2 range (5)")
      .isTrue();

  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.GT, "id", "id2"))
          .eval(FILE_4);
  assertThat(shouldRead)
      .as("Should not read: id range upper bound (30) is not greater than lower bound id2 range (30)")
      .isFalse();
}

@Test
public void testIntegerGtEq() {
boolean shouldRead =
Expand All @@ -491,6 +560,23 @@ public void testIntegerGtEq() {
assertThat(shouldRead).as("Should read: may possible ids").isTrue();
}

// Checks file pruning for the column-to-column predicate id >= id2.
// Bounds per the fixtures (assumes field 1 is "id" and field 15 is "id2"):
//   FILE:   id [30, 79], id2 [5, 55]     FILE_2: id [30, 40], id2 [55, 80]
//   FILE_3: id [30, 40], id2 [5, 25]     FILE_4: id [30, 30], id2 [30, 30]
// A file can only be skipped when the upper bound of id is below the lower bound of id2.
@Test
public void testRefCompareIntegerGtEq() {
  boolean shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.GT_EQ, "id", "id2"))
          .eval(FILE);
  assertThat(shouldRead)
      .as("Should read: id range upper bound (79) is greater than lower bound id2 range (5)")
      .isTrue();

  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.GT_EQ, "id", "id2"))
          .eval(FILE_2);
  assertThat(shouldRead)
      .as("Should not read: id range upper bound (40) is below lower bound id2 range (55)")
      .isFalse();

  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.GT_EQ, "id", "id2"))
          .eval(FILE_3);
  assertThat(shouldRead)
      .as("Should read: id range upper bound (40) is greater than lower bound id2 range (5)")
      .isTrue();

  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.GT_EQ, "id", "id2"))
          .eval(FILE_4);
  assertThat(shouldRead)
      .as("Should read: id range upper bound (30) is equal to lower bound id2 range (30)")
      .isTrue();
}

@Test
public void testIntegerEq() {
boolean shouldRead =
Expand All @@ -516,6 +602,25 @@ public void testIntegerEq() {
assertThat(shouldRead).as("Should not read: id above upper bound").isFalse();
}

// Checks file pruning for the column-to-column predicate id == id2.
// Bounds per the fixtures (assumes field 1 is "id" and field 15 is "id2"):
//   FILE:   id [30, 79], id2 [5, 55]     FILE_2: id [30, 40], id2 [55, 80]
//   FILE_3: id [30, 40], id2 [5, 25]     FILE_4: id [30, 30], id2 [30, 30]
// A file can only be skipped when the two ranges do not overlap.
@Test
public void testRefCompareIntegerEq() {
  boolean shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.EQ, "id", "id2"))
          .eval(FILE);
  assertThat(shouldRead)
      .as("Should read: id range (30,79) can be equal to id2 range (5,55)")
      .isTrue();

  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.EQ, "id", "id2"))
          .eval(FILE_2);
  assertThat(shouldRead)
      .as("Should not read: id range (30,40) can not be equal to id2 range (55,80)")
      .isFalse();

  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.EQ, "id", "id2"))
          .eval(FILE_3);
  assertThat(shouldRead)
      .as("Should not read: id range (30,40) can not be equal to id2 range (5,25)")
      .isFalse();

  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.EQ, "id", "id2"))
          .eval(FILE_4);
  assertThat(shouldRead)
      .as("Should read: id range (30,30) can be equal to id2 range (30,30)")
      .isTrue();
}

@Test
public void testIntegerNotEq() {
boolean shouldRead =
Expand Down Expand Up @@ -545,6 +650,25 @@ public void testIntegerNotEq() {
assertThat(shouldRead).as("Should read: id above upper bound").isTrue();
}

// Checks the column-to-column predicate id != id2; every file is expected to be read.
// Bounds per the fixtures (assumes field 1 is "id" and field 15 is "id2"):
//   FILE:   id [30, 79], id2 [5, 55]     FILE_2: id [30, 40], id2 [55, 80]
//   FILE_3: id [30, 40], id2 [5, 25]     FILE_4: id [30, 30], id2 [30, 30]
@Test
public void testRefCompareIntegerNotEq() {
  boolean shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.NOT_EQ, "id", "id2"))
          .eval(FILE);
  assertThat(shouldRead)
      .as("Should read: id range (30,79) can differ from id2 range (5,55)")
      .isTrue();

  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.NOT_EQ, "id", "id2"))
          .eval(FILE_2);
  assertThat(shouldRead)
      .as("Should read: id range (30,40) can differ from id2 range (55,80)")
      .isTrue();

  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.NOT_EQ, "id", "id2"))
          .eval(FILE_3);
  assertThat(shouldRead)
      .as("Should read: id range (30,40) can differ from id2 range (5,25)")
      .isTrue();

  // NOTE(review): this case passes the operands in the opposite order (id2, id) — confirm that
  // is intentional coverage of the reversed form rather than a typo.
  shouldRead =
      new InclusiveMetricsEvaluator(SCHEMA, predicate(Expression.Operation.NOT_EQ, "id2", "id"))
          .eval(FILE_4);
  assertThat(shouldRead)
      .as("Should read: notEq is not pruned when id2 range (30,30) equals id range (30,30)")
      .isTrue();
}

@Test
public void testIntegerNotEqRewritten() {
boolean shouldRead =
Expand Down