Fix analyze empty table NaN bug.
Jibing-Li committed Dec 1, 2023
1 parent 776f020 commit 7206383
Showing 7 changed files with 56 additions and 32 deletions.
@@ -92,8 +92,8 @@ public abstract class BaseAnalysisTask {
+ "${rowCount} AS `row_count`, "
+ "${ndvFunction} as `ndv`, "
+ "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * ${scaleFactor} as `null_count`, "
+ "'${min}' AS `min`, "
+ "'${max}' AS `max`, "
+ "${min} AS `min`, "
+ "${max} AS `max`, "
+ "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
+ "NOW() "
+ "FROM ( "
@@ -115,8 +115,8 @@ public abstract class BaseAnalysisTask {
+ "${row_count} AS `row_count`, "
+ "${ndv} AS `ndv`, "
+ "${null_count} AS `null_count`, "
+ "'${min}' AS `min`, "
+ "'${max}' AS `max`, "
+ "${min} AS `min`, "
+ "${max} AS `max`, "
+ "${data_size} AS `data_size`, "
+ "NOW() ";

@@ -311,7 +311,7 @@ public void setJob(AnalysisJob job) {
this.job = job;
}

- protected void runQuery(String sql, boolean needEncode) {
+ protected void runQuery(String sql) {
long startTime = System.currentTimeMillis();
try (AutoCloseConnectContext a = StatisticsUtil.buildConnectContext()) {
stmtExecutor = new StmtExecutor(a.connectContext, sql);
@@ -159,7 +159,7 @@ private void getOrdinaryColumnStats() throws Exception {
}
stringSubstitutor = new StringSubstitutor(params);
String sql = stringSubstitutor.replace(sb.toString());
- runQuery(sql, true);
+ runQuery(sql);
}

// Collect the partition column stats through HMS metadata.
@@ -201,12 +201,12 @@ private void getPartitionColumnStats() throws Exception {
params.put("row_count", String.valueOf(count));
params.put("ndv", String.valueOf(ndv));
params.put("null_count", String.valueOf(numNulls));
params.put("min", min);
params.put("max", max);
params.put("min", StatisticsUtil.quote(min));
params.put("max", StatisticsUtil.quote(max));
params.put("data_size", String.valueOf(dataSize));
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql = stringSubstitutor.replace(ANALYZE_PARTITION_COLUMN_TEMPLATE);
- runQuery(sql, true);
+ runQuery(sql);
}

private String updateMinValue(String currentMin, String value) {
@@ -313,6 +313,9 @@ protected Pair<Double, Long> getSampleInfo() {
for (long size : chunkSizes) {
total += size;
}
+ if (total == 0) {
+ return Pair.of(1.0, 0L);
+ }
// Calculate the sample target size for percent and rows sample.
if (tableSample.isPercent()) {
target = total * tableSample.getSampleValue() / 100;
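The added total == 0 branch returns a neutral result (scale factor 1.0, zero rows to sample) before any sampling arithmetic runs, which is what the new testGetSampleInfo test below asserts. A simplified sketch of the guarded flow follows; the early return mirrors the diff, while the method signature, the percent math, and the total / target scale factor are assumptions about what HMSAnalysisTask computes, not its exact code.

    import org.apache.doris.common.Pair;

    import java.util.List;

    // Sketch only: guard the sample-size calculation against tables with no data.
    final class SampleInfoSketch {
        static Pair<Double, Long> sampleInfo(List<Long> chunkSizes, long samplePercent) {
            long total = 0;
            for (long size : chunkSizes) {
                total += size;
            }
            if (total == 0) {
                // Empty table: no rows to sample, and none of the divisions below can run.
                return Pair.of(1.0, 0L);
            }
            long target = Math.max(1, total * samplePercent / 100);  // sketch-level floor to avoid a zero target
            return Pair.of((double) total / target, target);         // assumed shape of the scale factor
        }
    }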
@@ -110,7 +110,7 @@ private void getColumnStats() throws Exception {
params.put("dataSizeFunction", getDataSizeFunction(col, false));
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql = stringSubstitutor.replace(sb.toString());
- runQuery(sql, true);
+ runQuery(sql);
}

private Map<String, String> buildTableStatsParams(String partId) {
@@ -59,7 +59,13 @@ public OlapAnalysisTask(AnalysisInfo info) {
}

public void doExecute() throws Exception {

+ Set<String> partitionNames = info.colToPartitions.get(info.colName);
+ if (partitionNames.isEmpty()) {
+ LOG.debug("Skip empty empty partition task for column {} in {}.{}.{}",
+ info.catalogId, info.dbId, info.tblId, info.colName);
+ job.appendBuf(this, Collections.emptyList());
+ return;
+ }
if (tableSample != null) {
doSample();
} else {
@@ -113,24 +119,25 @@ protected void doSample() throws Exception {
params.put("scaleFactor", String.valueOf(scaleFactor));
params.put("sampleHints", tabletStr.isEmpty() ? "" : String.format("TABLET(%s)", tabletStr));
params.put("ndvFunction", getNdvFunction(String.valueOf(rowCount)));
params.put("min", min);
params.put("max", max);
params.put("min", StatisticsUtil.quote(min));
params.put("max", StatisticsUtil.quote(max));
params.put("rowCount", String.valueOf(rowCount));
params.put("type", col.getType().toString());
params.put("limit", "");
if (needLimit()) {
// If the tablets to be sampled are too large, use limit to control the rows to read, and re-calculate
// the scaleFactor.
- limitFlag = true;
rowsToSample = Math.min(getSampleRows(), pair.second);
- params.put("limit", "limit " + rowsToSample);
- params.put("scaleFactor", String.valueOf(scaleFactor * (double) pair.second / rowsToSample));
+ // Empty table doesn't need to limit.
+ if (rowsToSample > 0) {
+ limitFlag = true;
+ params.put("limit", "limit " + rowsToSample);
+ params.put("scaleFactor", String.valueOf(scaleFactor * (double) pair.second / rowsToSample));
+ }
}
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql;
if (useLinearAnalyzeTemplate()) {
params.put("min", StatisticsUtil.quote(min));
params.put("max", StatisticsUtil.quote(max));
// For single unique key, use count as ndv.
if (isSingleUniqueKey()) {
params.put("ndvFunction", String.valueOf(rowCount));
@@ -148,7 +155,7 @@ protected void doSample() throws Exception {
col.getName(), params.get("rowCount"), rowsToSample, params.get("scaleFactor"),
limitFlag, tbl.isDistributionColumn(col.getName()),
tbl.isPartitionColumn(col.getName()), col.isKey(), isSingleUniqueKey());
- runQuery(sql, false);
+ runQuery(sql);
}
}

@@ -169,11 +176,6 @@ protected ResultRow collectBasicStat(AutoCloseConnectContext context) {
*/
protected void doFull() throws Exception {
LOG.debug("Will do full collection for column {}", col.getName());
- Set<String> partitionNames = info.colToPartitions.get(info.colName);
- if (partitionNames.isEmpty()) {
- job.appendBuf(this, Collections.emptyList());
- return;
- }
Map<String, String> params = new HashMap<>();
params.put("internalDB", FeConstants.INTERNAL_DB_NAME);
params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME);
@@ -189,7 +191,7 @@ protected void doFull() throws Exception {
params.put("tblName", String.valueOf(tbl.getName()));
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String collectColStats = stringSubstitutor.replace(COLLECT_COL_STATISTICS);
- runQuery(collectColStats, true);
+ runQuery(collectColStats);
}

// Get sample tablets id and scale up scaleFactor
@@ -158,7 +158,7 @@ public void execSQLs(List<String> partitionAnalysisSQLs, Map<String, String> par
new MockUp<BaseAnalysisTask>() {

@Mock
- protected void runQuery(String sql, boolean needEncode) {}
+ protected void runQuery(String sql) {}
};
HashMap<String, Set<String>> colToPartitions = Maps.newHashMap();
colToPartitions.put("col1", Collections.singleton("t1"));
@@ -21,6 +21,7 @@
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.PrimitiveType;
import org.apache.doris.catalog.external.HMSExternalTable;
+ import org.apache.doris.common.Pair;
import org.apache.doris.statistics.util.StatisticsUtil;

import com.google.common.collect.Lists;
@@ -138,4 +139,25 @@ public long getDataSize(boolean singleReplica) {
Assertions.assertEquals(1000, tableSample.getSampleValue());
}

+ @Test
+ public void testGetSampleInfo(@Mocked HMSExternalTable tableIf)
+ throws Exception {
+ new MockUp<HMSExternalTable>() {
+ @Mock
+ public List<Long> getChunkSizes() {
+ return Lists.newArrayList();
+ }
+ };
+ HMSAnalysisTask task = new HMSAnalysisTask();
+ task.setTable(tableIf);
+ task.tableSample = null;
+ Pair<Double, Long> info1 = task.getSampleInfo();
+ Assertions.assertEquals(1.0, info1.first);
+ Assertions.assertEquals(0, info1.second);
+ task.tableSample = new TableSample(false, 100L);
+ Pair<Double, Long> info2 = task.getSampleInfo();
+ Assertions.assertEquals(1.0, info2.first);
+ Assertions.assertEquals(0, info2.second);
+ }
+
}
@@ -149,8 +149,7 @@ public ResultRow collectBasicStat(AutoCloseConnectContext context) {
}

@Mock
- public void runQuery(String sql, boolean needEncode) {
- Assertions.assertFalse(needEncode);
+ public void runQuery(String sql) {
Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, '1' AS `min`, '2' AS `max`, SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql);
return;
}
@@ -216,8 +215,7 @@ public ResultRow collectBasicStat(AutoCloseConnectContext context) {
}

@Mock
- public void runQuery(String sql, boolean needEncode) {
- Assertions.assertFalse(needEncode);
+ public void runQuery(String sql) {
Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, '1' AS `min`, '2' AS `max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM `catalogName`.`${dbName}`.`${tblName}` limit 100", sql);
return;
}
@@ -290,8 +288,7 @@ public ResultRow collectBasicStat(AutoCloseConnectContext context) {
}

@Mock
- public void runQuery(String sql, boolean needEncode) {
- Assertions.assertFalse(needEncode);
+ public void runQuery(String sql) {
Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, '1' AS `min`, '2' AS `max`, SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql);
return;
}