|
36 | 36 | import org.apache.logging.log4j.LogManager; |
37 | 37 | import org.apache.logging.log4j.Logger; |
38 | 38 |
|
| 39 | +import java.text.MessageFormat; |
39 | 40 | import java.util.Collections; |
40 | 41 | import java.util.concurrent.TimeUnit; |
41 | 42 |
|
42 | 43 | public abstract class BaseAnalysisTask { |
43 | 44 |
|
    // Shared logger for all analysis task implementations.
    public static final Logger LOG = LogManager.getLogger(BaseAnalysisTask.class);

    // Upper bound (in bytes) used when limiting how much data a sample scan reads.
    public static final long LIMIT_SIZE = 1024 * 1024 * 1024; // 1GB
    // Over-read factor applied on top of LIMIT_SIZE when computing the sample limit.
    // NOTE(review): exact usage is outside this view — presumably multiplied with
    // LIMIT_SIZE by callers; confirm at the use site.
    public static final double LIMIT_FACTOR = 1.2;

    // Full-scan statistics collection template. ${...} placeholders are substituted
    // before execution (presumably via a string substitutor — confirm at caller).
    // Produces one row per column matching the column_statistics schema:
    // id, catalog_id, db_id, tbl_id, idx_id, col_id, part_id, row_count, ndv,
    // null_count, min, max, data_size, update_time.
    protected static final String COLLECT_COL_STATISTICS =
            "SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS `id`, "
                    + " ${catalogId} AS `catalog_id`, "
                    + " ${dbId} AS `db_id`, "
                    + " ${tblId} AS `tbl_id`, "
                    + " ${idxId} AS `idx_id`, "
                    + " '${colId}' AS `col_id`, "
                    + " NULL AS `part_id`, "
                    + " COUNT(1) AS `row_count`, "
                    + " NDV(`${colName}`) AS `ndv`, "
                    + " COUNT(1) - COUNT(${colName}) AS `null_count`, "
                    + " CAST(MIN(${colName}) AS STRING) AS `min`, "
                    + " CAST(MAX(${colName}) AS STRING) AS `max`, "
                    + " ${dataSizeFunction} AS `data_size`, "
                    + " NOW() AS `update_time` "
                    + " FROM `${catalogName}`.`${dbName}`.`${tblName}`";

    // Sample-based template that linearly scales sampled aggregates (row count, NDV,
    // null count, data size) by ${scaleFactor}. ${sampleHints} and ${limit} bound
    // the scan; ${min}/${max} are injected expressions (see getMinFunction/getMaxFunction).
    protected static final String LINEAR_ANALYZE_TEMPLATE = " SELECT "
            + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS `id`, "
            + "${catalogId} AS `catalog_id`, "
            + "${dbId} AS `db_id`, "
            + "${tblId} AS `tbl_id`, "
            + "${idxId} AS `idx_id`, "
            + "'${colId}' AS `col_id`, "
            + "NULL AS `part_id`, "
            + "ROUND(COUNT(1) * ${scaleFactor}) AS `row_count`, "
            + "ROUND(NDV(`${colName}`) * ${scaleFactor}) as `ndv`, "
            + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS `null_count`, "
            + "${min} AS `min`, "
            + "${max} AS `max`, "
            + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
            + "NOW() "
            + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints} ${limit}";

    // Sample-based template using the DUJ1 NDV estimator (see getNdvFunction):
    // the inner sub-query t1 groups the sampled rows by value into
    // (column_key, count) pairs, over which ${ndvFunction}, null_count and
    // ${dataSizeFunction} aggregate.
    protected static final String DUJ1_ANALYZE_TEMPLATE = "SELECT "
            + "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, "
            + "${catalogId} AS `catalog_id`, "
            + "${dbId} AS `db_id`, "
            + "${tblId} AS `tbl_id`, "
            + "${idxId} AS `idx_id`, "
            + "'${colId}' AS `col_id`, "
            + "NULL AS `part_id`, "
            + "${rowCount} AS `row_count`, "
            + "${ndvFunction} as `ndv`, "
            + "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.count, 0)), 0) * ${scaleFactor} as `null_count`, "
            + "'${min}' AS `min`, "
            + "'${max}' AS `max`, "
            + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
            + "NOW() "
            + "FROM ( "
            + " SELECT t0.`${colName}` as column_key, COUNT(1) as `count` "
            + " FROM "
            + " (SELECT `${colName}` FROM `${catalogName}`.`${dbName}`.`${tblName}` "
            + " ${sampleHints} ${limit}) as `t0` "
            + " GROUP BY `t0`.`${colName}` "
            + ") as `t1` ";

    // Template for partition columns: all statistic values (${row_count}, ${ndv},
    // ${null_count}, ${min}, ${max}, ${data_size}) are pre-computed and substituted
    // in as literals, so no table scan happens here — it only shapes the result row.
    protected static final String ANALYZE_PARTITION_COLUMN_TEMPLATE = " SELECT "
            + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS `id`, "
            + "${catalogId} AS `catalog_id`, "
            + "${dbId} AS `db_id`, "
            + "${tblId} AS `tbl_id`, "
            + "${idxId} AS `idx_id`, "
            + "'${colId}' AS `col_id`, "
            + "NULL AS `part_id`, "
            + "${row_count} AS `row_count`, "
            + "${ndv} AS `ndv`, "
            + "${null_count} AS `null_count`, "
            + "'${min}' AS `min`, "
            + "'${max}' AS `max`, "
            + "${data_size} AS `data_size`, "
            + "NOW() ";

    // Metadata describing the analysis job this task belongs to.
    protected AnalysisInfo info;
@@ -199,29 +234,51 @@ public long getJobId() { |
199 | 234 | return info.jobId; |
200 | 235 | } |
201 | 236 |
|
202 | | - // TODO : time cost is intolerable when column is string type, return 0 directly for now. |
203 | | - protected String getDataSizeFunction(Column column) { |
204 | | - if (column.getType().isStringType()) { |
205 | | - return "SUM(LENGTH(`${colName}`))"; |
| 237 | + protected String getDataSizeFunction(Column column, boolean useDuj1) { |
| 238 | + if (useDuj1) { |
| 239 | + if (column.getType().isStringType()) { |
| 240 | + return "SUM(LENGTH(`column_key`) * count)"; |
| 241 | + } else { |
| 242 | + return "SUM(t1.count) * " + column.getType().getSlotSize(); |
| 243 | + } |
| 244 | + } else { |
| 245 | + if (column.getType().isStringType()) { |
| 246 | + return "SUM(LENGTH(`${colName}`))"; |
| 247 | + } else { |
| 248 | + return "COUNT(1) * " + column.getType().getSlotSize(); |
| 249 | + } |
206 | 250 | } |
207 | | - return "COUNT(1) * " + column.getType().getSlotSize(); |
208 | 251 | } |
209 | 252 |
|
210 | | - // Min value is not accurate while sample, so set it to NULL to avoid optimizer generate bad plan. |
211 | 253 | protected String getMinFunction() { |
212 | 254 | if (tableSample == null) { |
213 | | - return "CAST(MIN(`${colName}`) as ${type}) "; |
| 255 | + return "to_base64(CAST(MIN(`${colName}`) as ${type})) "; |
214 | 256 | } else { |
215 | | - return "NULL "; |
| 257 | + // Min value is not accurate while sample, so set it to NULL to avoid optimizer generate bad plan. |
| 258 | + return "NULL"; |
216 | 259 | } |
217 | 260 | } |
218 | 261 |
|
| 262 | + protected String getNdvFunction(String totalRows) { |
| 263 | + String sampleRows = "SUM(t1.count)"; |
| 264 | + String onceCount = "SUM(IF(t1.count = 1, 1, 0))"; |
| 265 | + String countDistinct = "COUNT(1)"; |
| 266 | + // DUJ1 estimator: n*d / (n - f1 + f1*n/N) |
| 267 | + // f1 is the count of element that appears only once in the sample. |
| 268 | + // (https://github.com/postgres/postgres/blob/master/src/backend/commands/analyze.c) |
| 269 | + // (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.93.8637&rep=rep1&type=pdf) |
| 270 | + // sample_row * count_distinct / ( sample_row - once_count + once_count * sample_row / total_row) |
| 271 | + String fn = MessageFormat.format("{0} * {1} / ({0} - {2} + {2} * {0} / {3})", sampleRows, |
| 272 | + countDistinct, onceCount, totalRows); |
| 273 | + return fn; |
| 274 | + } |
| 275 | + |
219 | 276 | // Max value is not accurate while sample, so set it to NULL to avoid optimizer generate bad plan. |
220 | 277 | protected String getMaxFunction() { |
221 | 278 | if (tableSample == null) { |
222 | | - return "CAST(MAX(`${colName}`) as ${type}) "; |
| 279 | + return "to_base64(CAST(MAX(`${colName}`) as ${type})) "; |
223 | 280 | } else { |
224 | | - return "NULL "; |
| 281 | + return "NULL"; |
225 | 282 | } |
226 | 283 | } |
227 | 284 |
|
@@ -254,12 +311,11 @@ public void setJob(AnalysisJob job) { |
254 | 311 | this.job = job; |
255 | 312 | } |
256 | 313 |
|
257 | | - protected void runQuery(String sql) { |
| 314 | + protected void runQuery(String sql, boolean needEncode) { |
258 | 315 | long startTime = System.currentTimeMillis(); |
259 | 316 | try (AutoCloseConnectContext a = StatisticsUtil.buildConnectContext()) { |
260 | 317 | stmtExecutor = new StmtExecutor(a.connectContext, sql); |
261 | | - stmtExecutor.executeInternalQuery(); |
262 | | - ColStatsData colStatsData = new ColStatsData(stmtExecutor.executeInternalQuery().get(0)); |
| 318 | + ColStatsData colStatsData = new ColStatsData(stmtExecutor.executeInternalQuery().get(0), needEncode); |
263 | 319 | job.appendBuf(this, Collections.singletonList(colStatsData)); |
264 | 320 | } finally { |
265 | 321 | LOG.debug("End cost time in secs: " + (System.currentTimeMillis() - startTime) / 1000); |
|
0 commit comments