Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -424,10 +424,10 @@ protected long evaluateExpression(Statistics stats, ExprNodeDesc pred,
String colType = encd.getTypeString();
if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) {
ColStatistics cs = stats.getColumnStatisticsFromColName(colName);
if (cs != null) {
if (cs != null && cs.getNumTrues() >= 0) {
newNumRows = cs.getNumTrues();
} else {
// default
// default (no stats or numTrues unknown i.e. negative)
newNumRows = stats.getNumRows() / 2;
}
} else {
Expand Down Expand Up @@ -923,11 +923,11 @@ private long evaluateNotExpr(Statistics stats, ExprNodeDesc pred, long currNumRo
String colType = encd.getTypeString();
if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) {
ColStatistics cs = stats.getColumnStatisticsFromColName(colName);
if (cs != null) {
if (cs != null && cs.getNumFalses() >= 0) {
return cs.getNumFalses();
}
}
// if not boolean column return half the number of rows
// if not boolean column, or numFalses unknown (negative), return half the number of rows
return numRows / 2;
}
}
Expand All @@ -952,7 +952,7 @@ private long evaluateColEqualsNullExpr(Statistics stats, AnnotateStatsProcCtx as
aspCtx.addAffectedColumn(colDesc);
String colName = colDesc.getColumn();
ColStatistics cs = stats.getColumnStatisticsFromColName(colName);
if (cs != null) {
if (cs != null && cs.getNumNulls() >= 0) {
return cs.getNumNulls();
}
}
Expand Down Expand Up @@ -1783,9 +1783,11 @@ private static void computeAggregateColumnMinMax(ColStatistics cs, HiveConf conf
conf, parentStats, agg.getParameters().get(0));
if (parentCS != null && parentCS.getRange() != null &&
parentCS.getRange().minValue != null && parentCS.getRange().maxValue != null) {
// numNulls < 0 means "unknown" - treat as 0 for conservative COUNT estimate
long numNulls = parentCS.getNumNulls() < 0 ? 0 : parentCS.getNumNulls();
long valuesCount = agg.getDistinct() ?
parentCS.getCountDistint() :
parentStats.getNumRows() - parentCS.getNumNulls();
parentStats.getNumRows() - numNulls;
Range range = parentCS.getRange();
// Get the aggregate function matching the name in the query.
GenericUDAFResolver udaf =
Expand Down Expand Up @@ -2626,6 +2628,10 @@ private void updateNumNulls(ColStatistics colStats, long leftUnmatchedRows, long
}

long oldNumNulls = colStats.getNumNulls();
// numNulls < 0 means "unknown" - preserve the sentinel value
if (oldNumNulls < 0) {
return;
}
long newNumNulls = Math.min(newNumRows, oldNumNulls);

JoinCondDesc joinCond = jop.getConf().getConds()[0];
Expand Down
7 changes: 6 additions & 1 deletion ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,12 @@ public void addToColumnStats(List<ColStatistics> colStats) {
if (columnStats.containsKey(key) && columnStats.get(key) != null) {
updatedCS = columnStats.get(key);
updatedCS.setAvgColLen(Math.max(updatedCS.getAvgColLen(), cs.getAvgColLen()));
updatedCS.setNumNulls(StatsUtils.safeAdd(updatedCS.getNumNulls(), cs.getNumNulls()));
// numNulls < 0 means "unknown" - propagate unknown if either is unknown
if (cs.getNumNulls() < 0 || updatedCS.getNumNulls() < 0) {
updatedCS.setNumNulls(-1);
} else {
updatedCS.setNumNulls(StatsUtils.safeAdd(updatedCS.getNumNulls(), cs.getNumNulls()));
}
updatedCS.setCountDistint(Math.max(updatedCS.getCountDistint(), cs.getCountDistint()));
columnStats.put(key, updatedCS);
} else {
Expand Down
25 changes: 20 additions & 5 deletions ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,11 @@ public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String col
} else if (colTypeLowerCase.equals(serdeConstants.BOOLEAN_TYPE_NAME)) {
if (csd.getBooleanStats().getNumFalses() > 0 && csd.getBooleanStats().getNumTrues() > 0) {
cs.setCountDistint(2);
} else if (csd.getBooleanStats().getNumFalses() < 0 || csd.getBooleanStats().getNumTrues() < 0) {
// At least one is unknown (-1) - assume typical boolean has both values
cs.setCountDistint(2);
} else {
// Both >= 0 but not both > 0 - one value type confirmed absent (or all NULL)
cs.setCountDistint(1);
}
cs.setNumTrues(csd.getBooleanStats().getNumTrues());
Expand Down Expand Up @@ -2038,8 +2042,11 @@ public static void updateStats(Statistics stats, long newNumRows,
if (oldDV > newNumRows) {
cs.setCountDistint(newNumRows);
}
long newNumNulls = Math.round(ratio * cs.getNumNulls());
cs.setNumNulls(Math.min(newNumNulls, newNumRows));
// numNulls < 0 means "unknown" - preserve the sentinel value
if (cs.getNumNulls() >= 0) {
long newNumNulls = Math.round(ratio * cs.getNumNulls());
cs.setNumNulls(Math.min(newNumNulls, newNumRows));
}
}
stats.setColumnStats(colStats);
long newDataSize = StatsUtils.getDataSizeFromColumnStats(newNumRows, colStats);
Expand All @@ -2052,9 +2059,17 @@ public static void updateStats(Statistics stats, long newNumRows,

public static void scaleColStatistics(List<ColStatistics> colStats, double factor) {
for (ColStatistics cs : colStats) {
cs.setNumFalses(StatsUtils.safeMult(cs.getNumFalses(), factor));
cs.setNumTrues(StatsUtils.safeMult(cs.getNumTrues(), factor));
cs.setNumNulls(StatsUtils.safeMult(cs.getNumNulls(), factor));
// numTrues/numFalses < 0 means "unknown" - preserve the sentinel value
if (cs.getNumFalses() >= 0) {
cs.setNumFalses(StatsUtils.safeMult(cs.getNumFalses(), factor));
}
if (cs.getNumTrues() >= 0) {
cs.setNumTrues(StatsUtils.safeMult(cs.getNumTrues(), factor));
}
// numNulls < 0 means "unknown" - preserve the sentinel value
if (cs.getNumNulls() >= 0) {
cs.setNumNulls(StatsUtils.safeMult(cs.getNumNulls(), factor));
}
if (factor < 1.0) {
final double newNDV = Math.ceil(cs.getCountDistint() * factor);
cs.setCountDistint(newNDV > Long.MAX_VALUE ? Long.MAX_VALUE : (long) newNDV);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,22 @@ public void add(ColStatistics stat) {
if (stat.getCountDistint() > result.getCountDistint()) {
result.setCountDistint(stat.getCountDistint());
}
if (stat.getNumNulls() > result.getNumNulls()) {
// numNulls < 0 means "unknown" - propagate unknown if either is unknown
if (stat.getNumNulls() < 0 || result.getNumNulls() < 0) {
result.setNumNulls(-1);
} else if (stat.getNumNulls() > result.getNumNulls()) {
result.setNumNulls(stat.getNumNulls());
}
if (stat.getNumTrues() > result.getNumTrues()) {
// numTrues < 0 means "unknown" - propagate unknown if either is unknown
if (stat.getNumTrues() < 0 || result.getNumTrues() < 0) {
result.setNumTrues(-1);
} else if (stat.getNumTrues() > result.getNumTrues()) {
result.setNumTrues(stat.getNumTrues());
}
if (stat.getNumFalses() > result.getNumFalses()) {
// numFalses < 0 means "unknown" - propagate unknown if either is unknown
if (stat.getNumFalses() < 0 || result.getNumFalses() < 0) {
result.setNumFalses(-1);
} else if (stat.getNumFalses() > result.getNumFalses()) {
result.setNumFalses(stat.getNumFalses());
}
if (stat.isFilteredColumn()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,26 @@
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.junit.Test;

import java.lang.reflect.Constructor;
import java.lang.reflect.Method;
import java.util.Arrays;
import java.util.Collections;

import org.apache.hadoop.hive.ql.exec.CommonJoinOperator;
import org.apache.hadoop.hive.ql.plan.JoinCondDesc;
import org.apache.hadoop.hive.ql.plan.JoinDesc;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import static org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory.FilterStatsRule.extractFloatFromLiteralValue;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertThrows;

public class TestStatsRulesProcFactory {
Expand Down Expand Up @@ -631,4 +643,169 @@ private static ColStatistics createColStatistics(

return colStatistics;
}

/**
* Test that computeAggregateColumnMinMax properly handles numNulls=-1 (unknown).
* With the fix, numNulls=-1 should be treated as 0, giving valuesCount = numRows.
* Without the fix, valuesCount = numRows - (-1) = numRows + 1 (wrong).
*/
@Test
public void testComputeAggregateColumnMinMaxWithUnknownNumNulls() throws Exception {
// Get the private static method via reflection
Class<?> groupByStatsRuleClass = null;
for (Class<?> innerClass : StatsRulesProcFactory.class.getDeclaredClasses()) {
if (innerClass.getSimpleName().equals("GroupByStatsRule")) {
groupByStatsRuleClass = innerClass;
break;
}
}
assertNotNull("GroupByStatsRule class not found", groupByStatsRuleClass);

Method method = groupByStatsRuleClass.getDeclaredMethod(
"computeAggregateColumnMinMax",
ColStatistics.class, HiveConf.class, AggregationDesc.class, String.class, Statistics.class);
method.setAccessible(true);

// Create output ColStatistics for the COUNT result
ColStatistics cs = new ColStatistics("_col0", "bigint");

// Create HiveConf
HiveConf conf = new HiveConf();

// Create parent column stats with numNulls=-1 (unknown) and Range(1, 100)
ColStatistics parentColStats = new ColStatistics("val", "int");
parentColStats.setNumNulls(-1); // unknown numNulls - this is what we're testing
parentColStats.setCountDistint(100);
parentColStats.setRange(1, 100);

// Create parent Statistics with 100 rows
Statistics parentStats = new Statistics(100, 400, 400, 400);
parentStats.addToColumnStats(Collections.singletonList(parentColStats));

// Create ExprNodeColumnDesc for the "val" column
ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(
TypeInfoFactory.intTypeInfo, "val", "t", false);

// Create AggregationDesc for COUNT(val) using no-arg constructor to avoid NPE
AggregationDesc agg = new AggregationDesc();
agg.setGenericUDAFName("count");
agg.setParameters(Collections.singletonList(colExpr));
agg.setDistinct(false);
agg.setMode(GenericUDAFEvaluator.Mode.COMPLETE);

// Call the method
method.invoke(null, cs, conf, agg, "bigint", parentStats);

// Verify: With the fix, COUNT Range should be (0, 100)
// numNulls=-1 is treated as 0, so valuesCount = 100 - 0 = 100
// Without the fix, valuesCount = 100 - (-1) = 101 (WRONG)
assertNotNull("Range should be set on COUNT column", cs.getRange());
assertEquals("COUNT min should be 0", 0L, ((Number) cs.getRange().minValue).longValue());
assertEquals("COUNT max should be 100 (numRows), not 101",
100L, ((Number) cs.getRange().maxValue).longValue());
}

@Test
public void testComputeAggregateColumnMinMaxWithKnownNumNulls() throws Exception {
// Get the private static method via reflection
Class<?> groupByStatsRuleClass = null;
for (Class<?> innerClass : StatsRulesProcFactory.class.getDeclaredClasses()) {
if (innerClass.getSimpleName().equals("GroupByStatsRule")) {
groupByStatsRuleClass = innerClass;
break;
}
}
Method method = groupByStatsRuleClass.getDeclaredMethod(
"computeAggregateColumnMinMax",
ColStatistics.class, HiveConf.class, AggregationDesc.class, String.class, Statistics.class);
method.setAccessible(true);

ColStatistics cs = new ColStatistics("_col0", "bigint");
HiveConf conf = new HiveConf();

// Create parent column stats with numNulls=20 (known) and Range
ColStatistics parentColStats = new ColStatistics("val", "int");
parentColStats.setNumNulls(20); // known numNulls
parentColStats.setCountDistint(80);
parentColStats.setRange(1, 100);

Statistics parentStats = new Statistics(100, 400, 400, 400);
parentStats.addToColumnStats(Collections.singletonList(parentColStats));

ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(
TypeInfoFactory.intTypeInfo, "val", "t", false);
AggregationDesc agg = new AggregationDesc();
agg.setGenericUDAFName("count");
agg.setParameters(Collections.singletonList(colExpr));
agg.setDistinct(false);
agg.setMode(GenericUDAFEvaluator.Mode.COMPLETE);

method.invoke(null, cs, conf, agg, "bigint", parentStats);

// With known numNulls=20, valuesCount = 100 - 20 = 80
assertNotNull("Range should be set", cs.getRange());
assertEquals(0L, ((Number) cs.getRange().minValue).longValue());
assertEquals("COUNT max should be 80 (numRows - numNulls)",
80L, ((Number) cs.getRange().maxValue).longValue());
}

/**
* Test that JoinStatsRule.updateNumNulls preserves unknown numNulls (-1).
* With the fix, when numNulls is -1 (unknown), the method returns early without modification.
* Without the fix, LEFT_OUTER_JOIN would calculate: newNumNulls = oldNumNulls + leftUnmatchedRows = -1 + 100 = 99
*/
@Test
public void testUpdateNumNullsPreservesUnknownNumNulls() throws Exception {
// Get the private JoinStatsRule inner class
Class<?> joinStatsRuleClass = null;
for (Class<?> innerClass : StatsRulesProcFactory.class.getDeclaredClasses()) {
if (innerClass.getSimpleName().equals("JoinStatsRule")) {
joinStatsRuleClass = innerClass;
break;
}
}
assertNotNull("JoinStatsRule class not found", joinStatsRuleClass);

// Create an instance of JoinStatsRule
Constructor<?> ctor = joinStatsRuleClass.getDeclaredConstructor();
ctor.setAccessible(true);
Object joinStatsRule = ctor.newInstance();

// Get the updateNumNulls method
Method updateNumNulls = joinStatsRuleClass.getDeclaredMethod("updateNumNulls",
ColStatistics.class, long.class, long.class, long.class, long.class,
CommonJoinOperator.class);
updateNumNulls.setAccessible(true);

// Create ColStatistics with numNulls = -1 (unknown)
// Use a column name that won't be a join key
ColStatistics colStats = new ColStatistics("non_join_col", "int");
colStats.setNumNulls(-1);
colStats.setCountDistint(100);

// Create a mock JoinOperator with LEFT_OUTER_JOIN
JoinCondDesc joinCond = mock(JoinCondDesc.class);
when(joinCond.getType()).thenReturn(JoinDesc.LEFT_OUTER_JOIN);
when(joinCond.getRight()).thenReturn(0); // pos=0 will match getRight()

JoinDesc joinDesc = mock(JoinDesc.class);
when(joinDesc.getConds()).thenReturn(new JoinCondDesc[] { joinCond });
// Return empty join keys so our column won't be a join key
when(joinDesc.getJoinKeys()).thenReturn(new ExprNodeDesc[][] {});

@SuppressWarnings("unchecked")
CommonJoinOperator<JoinDesc> mockJop = mock(CommonJoinOperator.class);
when(mockJop.getConf()).thenReturn(joinDesc);

// Call updateNumNulls with:
// - leftUnmatchedRows=100 (without fix, this would be added to -1, giving 99)
// - pos=0 (matches joinCond.getRight())
// With the fix: should return early because numNulls is -1
// Without fix: numNulls would become Math.min(1000, -1 + 100) = 99
updateNumNulls.invoke(joinStatsRule, colStats, 100L, 100L, 1000L, 0L, mockJop);

// Assert that numNulls is still -1 (unchanged)
assertEquals("Unknown numNulls (-1) should be preserved after updateNumNulls",
-1L, colStats.getNumNulls());
}
}
Loading