Skip to content

Commit

Permalink
[opt](stats) Use escape rather than base64 for min/max value apache#2…
Browse files Browse the repository at this point in the history
  • Loading branch information
Kikyou1997 authored and gnehil committed Dec 4, 2023
1 parent e2a090a commit e1b14e2
Show file tree
Hide file tree
Showing 10 changed files with 27 additions and 31 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ protected String getDataSizeFunction(Column column, boolean useDuj1) {

protected String getMinFunction() {
if (tableSample == null) {
return "to_base64(CAST(MIN(`${colName}`) as ${type})) ";
return "CAST(MIN(`${colName}`) as ${type}) ";
} else {
// Min value is not accurate while sample, so set it to NULL to avoid optimizer generate bad plan.
return "NULL";
Expand All @@ -276,7 +276,7 @@ protected String getNdvFunction(String totalRows) {
// Max value is not accurate while sample, so set it to NULL to avoid optimizer generate bad plan.
protected String getMaxFunction() {
if (tableSample == null) {
return "to_base64(CAST(MAX(`${colName}`) as ${type})) ";
return "CAST(MAX(`${colName}`) as ${type}) ";
} else {
return "NULL";
}
Expand Down Expand Up @@ -315,7 +315,7 @@ protected void runQuery(String sql, boolean needEncode) {
long startTime = System.currentTimeMillis();
try (AutoCloseConnectContext a = StatisticsUtil.buildConnectContext()) {
stmtExecutor = new StmtExecutor(a.connectContext, sql);
ColStatsData colStatsData = new ColStatsData(stmtExecutor.executeInternalQuery().get(0), needEncode);
ColStatsData colStatsData = new ColStatsData(stmtExecutor.executeInternalQuery().get(0));
job.appendBuf(this, Collections.singletonList(colStatsData));
} finally {
LOG.debug("End cost time in secs: " + (System.currentTimeMillis() - startTime) / 1000);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@

import com.google.common.annotations.VisibleForTesting;

import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.StringJoiner;

/**
Expand Down Expand Up @@ -56,8 +54,6 @@ public class ColStatsData {

public final String updateTime;

public final boolean needEncode;

@VisibleForTesting
public ColStatsData() {
statsId = new StatsId();
Expand All @@ -68,10 +64,9 @@ public ColStatsData() {
maxLit = null;
dataSizeInBytes = 0;
updateTime = null;
needEncode = true;
}

public ColStatsData(ResultRow row, boolean needEncode) {
public ColStatsData(ResultRow row) {
this.statsId = new StatsId(row);
this.count = (long) Double.parseDouble(row.get(7));
this.ndv = (long) Double.parseDouble(row.getWithDefault(8, "0"));
Expand All @@ -80,7 +75,6 @@ public ColStatsData(ResultRow row, boolean needEncode) {
this.maxLit = row.get(11);
this.dataSizeInBytes = (long) Double.parseDouble(row.getWithDefault(12, "0"));
this.updateTime = row.get(13);
this.needEncode = needEncode;
}

public String toSQL(boolean roundByParentheses) {
Expand All @@ -93,12 +87,8 @@ public String toSQL(boolean roundByParentheses) {
sj.add(String.valueOf(count));
sj.add(String.valueOf(ndv));
sj.add(String.valueOf(nullCount));
sj.add(minLit == null ? "NULL" : needEncode
? "'" + Base64.getEncoder().encodeToString(minLit.getBytes(StandardCharsets.UTF_8)) + "'"
: "'" + minLit + "'");
sj.add(maxLit == null ? "NULL" : needEncode
? "'" + Base64.getEncoder().encodeToString(maxLit.getBytes(StandardCharsets.UTF_8)) + "'"
: "'" + maxLit + "'");
sj.add(minLit == null ? "NULL" : "'" + StatisticsUtil.escapeSQL(minLit) + "'");
sj.add(maxLit == null ? "NULL" : "'" + StatisticsUtil.escapeSQL(maxLit) + "'");
sj.add(String.valueOf(dataSizeInBytes));
sj.add(StatisticsUtil.quote(updateTime));
return sj.toString();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@
import org.apache.logging.log4j.Logger;
import org.json.JSONObject;

import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -175,7 +173,6 @@ public static ColumnStatistic fromResultRow(ResultRow row) {
String min = row.get(10);
String max = row.get(11);
if (min != null && !min.equalsIgnoreCase("NULL")) {
min = new String(Base64.getDecoder().decode(min), StandardCharsets.UTF_8);
// Internal catalog get the min/max value using a separate SQL,
// and the value is already encoded by base64. Need to handle internal and external catalog separately.
if (catalogId != InternalCatalog.INTERNAL_CATALOG_ID && min.equalsIgnoreCase("NULL")) {
Expand All @@ -193,7 +190,6 @@ public static ColumnStatistic fromResultRow(ResultRow row) {
columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
}
if (max != null && !max.equalsIgnoreCase("NULL")) {
max = new String(Base64.getDecoder().decode(max), StandardCharsets.UTF_8);
if (catalogId != InternalCatalog.INTERNAL_CATALOG_ID && max.equalsIgnoreCase("NULL")) {
columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ protected void doSample() throws Exception {
// Get basic stats, including min and max.
ResultRow basicStats = collectBasicStat(r);
long rowCount = tbl.getRowCount();
String min = StatisticsUtil.encodeValue(basicStats, 0);
String max = StatisticsUtil.encodeValue(basicStats, 1);
String min = StatisticsUtil.escapeSQL(basicStats.get(0));
String max = StatisticsUtil.escapeSQL(basicStats.get(1));

boolean limitFlag = false;
long rowsToSample = pair.second;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -269,8 +269,8 @@ public static void alterColumnStatistics(AlterColumnStatsStmt alterColumnStatsSt
params.put("count", String.valueOf(columnStatistic.count));
params.put("ndv", String.valueOf(columnStatistic.ndv));
params.put("nullCount", String.valueOf(columnStatistic.numNulls));
params.put("min", StatisticsUtil.encodeString(min));
params.put("max", StatisticsUtil.encodeString(max));
params.put("min", StatisticsUtil.escapeSQL(min));
params.put("max", StatisticsUtil.escapeSQL(max));
params.put("dataSize", String.valueOf(columnStatistic.dataSize));

if (partitionIds.isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -786,7 +786,9 @@ public static String escapeSQL(String str) {
if (str == null) {
return null;
}
return org.apache.commons.lang3.StringUtils.replace(str, "'", "''");
return str.replace("'", "''")
.replace("\\", "\\\\")
.replace("\"", "\"\"");
}

public static boolean isExternalTable(String catalogName, String dbName, String tblName) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,14 @@ public void testGetFunctions() {
Assertions.assertEquals("SUM(t1.count) * 4", dataSizeFunction);

String minFunction = olapAnalysisTask.getMinFunction();
Assertions.assertEquals("to_base64(CAST(MIN(`${colName}`) as ${type})) ", minFunction);
Assertions.assertEquals("CAST(MIN(`${colName}`) as ${type}) ", minFunction);
olapAnalysisTask.tableSample = new TableSample(true, 20L);
minFunction = olapAnalysisTask.getMinFunction();
Assertions.assertEquals("NULL", minFunction);

olapAnalysisTask.tableSample = null;
String maxFunction = olapAnalysisTask.getMaxFunction();
Assertions.assertEquals("to_base64(CAST(MAX(`${colName}`) as ${type})) ", maxFunction);
Assertions.assertEquals("CAST(MAX(`${colName}`) as ${type}) ", maxFunction);
olapAnalysisTask.tableSample = new TableSample(true, 20L);
maxFunction = olapAnalysisTask.getMaxFunction();
Assertions.assertEquals("NULL", maxFunction);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ public ResultRow collectBasicStat(AutoCloseConnectContext context) {
@Mock
public void runQuery(String sql, boolean needEncode) {
Assertions.assertFalse(needEncode);
Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, 'MQ==' AS `min`, 'Mg==' AS `max`, SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql);
Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, '1' AS `min`, '2' AS `max`, SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql);
return;
}
};
Expand Down Expand Up @@ -218,7 +218,7 @@ public ResultRow collectBasicStat(AutoCloseConnectContext context) {
@Mock
public void runQuery(String sql, boolean needEncode) {
Assertions.assertFalse(needEncode);
Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, 'MQ==' AS `min`, 'Mg==' AS `max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM `catalogName`.`${dbName}`.`${tblName}` limit 100", sql);
Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, '1' AS `min`, '2' AS `max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM `catalogName`.`${dbName}`.`${tblName}` limit 100", sql);
return;
}
};
Expand Down Expand Up @@ -292,7 +292,7 @@ public ResultRow collectBasicStat(AutoCloseConnectContext context) {
@Mock
public void runQuery(String sql, boolean needEncode) {
Assertions.assertFalse(needEncode);
Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, 'MQ==' AS `min`, 'Mg==' AS `max`, SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql);
Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, '1' AS `min`, '2' AS `max`, SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql);
return;
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public static ResultRow mockResultRow(boolean col) {
add("0");
add("10");
// 11
add("MTE=");
add("11");
add("12");
add(String.valueOf(System.currentTimeMillis()));
}};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,4 +142,12 @@ public void testEncodeValue() throws Exception {
.encodeToString("a".getBytes(StandardCharsets.UTF_8)), StatisticsUtil.encodeValue(row, 1));
Assertions.assertEquals("NULL", StatisticsUtil.encodeValue(row, 2));
}

@Test
public void testEscape() {
// \'"
String origin = "\\'\"";
// \\''""
Assertions.assertEquals("\\\\''\"\"", StatisticsUtil.escapeSQL(origin));
}
}

0 comments on commit e1b14e2

Please sign in to comment.