Skip to content

Commit 526f29d

Browse files
Fixed bugs in CBO statistics calculation (#6537)
1 parent ec5a005 commit 526f29d

File tree

7 files changed

+1396
-69
lines changed

7 files changed

+1396
-69
lines changed

ydb/core/kqp/opt/kqp_statistics_transformer.cpp

Lines changed: 66 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -22,40 +22,53 @@ void InferStatisticsForReadTable(const TExprNode::TPtr& input, TTypeAnnotationCo
2222
const TKqpOptimizeContext& kqpCtx) {
2323

2424
auto inputNode = TExprBase(input);
25-
double nRows = 0;
26-
int nAttrs = 0;
25+
std::shared_ptr<TOptimizerStatistics> inputStats;
2726

28-
const TExprNode* path;
27+
int nAttrs = 0;
28+
bool readRange = false;
2929

3030
if (auto readTable = inputNode.Maybe<TKqlReadTableBase>()) {
31-
path = readTable.Cast().Table().Path().Raw();
31+
inputStats = typeCtx->GetStats(readTable.Cast().Table().Raw());
3232
nAttrs = readTable.Cast().Columns().Size();
33+
34+
auto range = readTable.Cast().Range();
35+
auto rangeFrom = range.From().Maybe<TKqlKeyTuple>();
36+
auto rangeTo = range.To().Maybe<TKqlKeyTuple>();
37+
if (rangeFrom && rangeTo) {
38+
readRange = true;
39+
}
3340
} else if (auto readRanges = inputNode.Maybe<TKqlReadTableRangesBase>()) {
34-
path = readRanges.Cast().Table().Path().Raw();
41+
inputStats = typeCtx->GetStats(readRanges.Cast().Table().Raw());
3542
nAttrs = readRanges.Cast().Columns().Size();
3643
} else {
3744
Y_ENSURE(false, "Invalid node type for InferStatisticsForReadTable");
3845
}
3946

40-
const auto& tableData = kqpCtx.Tables->ExistingTable(kqpCtx.Cluster, path->Content());
41-
int totalAttrs = tableData.Metadata->Columns.size();
42-
nRows = tableData.Metadata->RecordsCount;
43-
44-
double byteSize = tableData.Metadata->DataSize * (nAttrs / (double)totalAttrs);
45-
46-
auto keyColumns = TIntrusivePtr<TOptimizerStatistics::TKeyColumns>(new TOptimizerStatistics::TKeyColumns(tableData.Metadata->KeyColumnNames));
47-
auto stats = std::make_shared<TOptimizerStatistics>(EStatisticsType::BaseTable, nRows, nAttrs, byteSize, 0.0, keyColumns);
48-
if (kqpCtx.Config->OverrideStatistics.Get()) {
49-
stats = OverrideStatistics(*stats, path->Content(), *kqpCtx.Config->OverrideStatistics.Get());
47+
/**
48+
* We need index statistics to calculate this in the future
49+
* Right now we use very small estimates to make sure CBO picks Lookup Joins
50+
* I.e. there can be a chain of lookup joins in OLTP scenario and we want to make
51+
* sure the cardinality doesn't blow up and lookup joins are still being picked
52+
*/
53+
double inputRows = inputStats->Nrows;
54+
double nRows = inputRows;
55+
if (readRange) {
56+
nRows = 1;
5057
}
5158

52-
if (stats->ColumnStatistics) {
53-
for (const auto& [columnName, metaData]: tableData.Metadata->Columns) {
54-
stats->ColumnStatistics->Data[columnName].Type = metaData.Type;
55-
}
56-
}
59+
double sizePerRow = inputStats->ByteSize / (inputRows==0?1:inputRows);
60+
double byteSize = nRows * sizePerRow * (nAttrs / (double)inputStats->Ncols);
5761

58-
YQL_CLOG(TRACE, CoreDq) << "Infer statistics for read table, nrows: " << stats->Nrows << ", nattrs: " << stats->Ncols;
62+
auto stats = std::make_shared<TOptimizerStatistics>(
63+
EStatisticsType::BaseTable,
64+
nRows,
65+
nAttrs,
66+
byteSize,
67+
0.0,
68+
inputStats->KeyColumns,
69+
inputStats->ColumnStatistics);
70+
71+
YQL_CLOG(TRACE, CoreDq) << "Infer statistics for read table, nrows: " << stats->Nrows << ", nattrs: " << stats->Ncols << ", byteSize: " << stats->ByteSize;
5972

6073
typeCtx->SetStats(input.Get(), stats);
6174
}
@@ -81,7 +94,7 @@ void InferStatisticsForKqpTable(const TExprNode::TPtr& input, TTypeAnnotationCon
8194
stats = OverrideStatistics(*stats, path.Value(), *kqpCtx.Config->OverrideStatistics.Get());
8295
}
8396

84-
YQL_CLOG(TRACE, CoreDq) << "Infer statistics for table: " << path.Value() << ", nrows: " << stats->Nrows << ", nattrs: " << stats->Ncols << ", nKeyColumns: " << stats->KeyColumns->Data.size();
97+
YQL_CLOG(TRACE, CoreDq) << "Infer statistics for table: " << path.Value() << ", nrows: " << stats->Nrows << ", nattrs: " << stats->Ncols << ", byteSize: " << stats->ByteSize << ", nKeyColumns: " << stats->KeyColumns->Data.size();
8598

8699
typeCtx->SetStats(input.Get(), stats);
87100
}
@@ -103,7 +116,14 @@ void InferStatisticsForSteamLookup(const TExprNode::TPtr& input, TTypeAnnotation
103116
auto inputStats = typeCtx->GetStats(streamLookup.Table().Raw());
104117
auto byteSize = inputStats->ByteSize * (nAttrs / (double) inputStats->Ncols);
105118

106-
typeCtx->SetStats(input.Get(), std::make_shared<TOptimizerStatistics>(EStatisticsType::BaseTable, inputStats->Nrows, nAttrs, byteSize, 0, inputStats->KeyColumns));
119+
typeCtx->SetStats(input.Get(), std::make_shared<TOptimizerStatistics>(
120+
EStatisticsType::BaseTable,
121+
inputStats->Nrows,
122+
nAttrs,
123+
byteSize,
124+
0,
125+
inputStats->KeyColumns,
126+
inputStats->ColumnStatistics));
107127
}
108128

109129
/**
@@ -134,7 +154,14 @@ void InferStatisticsForLookupTable(const TExprNode::TPtr& input, TTypeAnnotation
134154
byteSize = 10;
135155
}
136156

137-
typeCtx->SetStats(input.Get(), std::make_shared<TOptimizerStatistics>(EStatisticsType::BaseTable, nRows, nAttrs, byteSize, 0, inputStats->KeyColumns));
157+
typeCtx->SetStats(input.Get(), std::make_shared<TOptimizerStatistics>(
158+
EStatisticsType::BaseTable,
159+
nRows,
160+
nAttrs,
161+
byteSize,
162+
0,
163+
inputStats->KeyColumns,
164+
inputStats->ColumnStatistics));
138165
}
139166

140167
/**
@@ -151,7 +178,8 @@ void InferStatisticsForRowsSourceSettings(const TExprNode::TPtr& input, TTypeAnn
151178
return;
152179
}
153180

154-
double nRows = inputStats->Nrows;
181+
double inputRows = inputStats->Nrows;
182+
double nRows = inputRows;
155183

156184
// Check if we have a range expression, in that case just assign a single row to this read
157185
// We don't currently check the size of an index lookup
@@ -165,10 +193,19 @@ void InferStatisticsForRowsSourceSettings(const TExprNode::TPtr& input, TTypeAnn
165193
}
166194

167195
int nAttrs = sourceSettings.Columns().Size();
196+
197+
double sizePerRow = inputStats->ByteSize / (inputRows==0?1:inputRows);
198+
double byteSize = nRows * sizePerRow * (nAttrs / (double)inputStats->Ncols);
168199
double cost = inputStats->Cost;
169-
double byteSize = inputStats->ByteSize * (nAttrs / (double)inputStats->Ncols);
170200

171-
typeCtx->SetStats(input.Get(), std::make_shared<TOptimizerStatistics>(EStatisticsType::BaseTable, nRows, nAttrs, byteSize, cost, inputStats->KeyColumns));
201+
typeCtx->SetStats(input.Get(), std::make_shared<TOptimizerStatistics>(
202+
EStatisticsType::BaseTable,
203+
nRows,
204+
nAttrs,
205+
byteSize,
206+
cost,
207+
inputStats->KeyColumns,
208+
inputStats->ColumnStatistics));
172209
}
173210

174211
/**
@@ -199,7 +236,8 @@ void InferStatisticsForReadTableIndexRanges(const TExprNode::TPtr& input, TTypeA
199236
inputStats->Ncols,
200237
inputStats->ByteSize,
201238
inputStats->Cost,
202-
indexColumnsPtr);
239+
indexColumnsPtr,
240+
inputStats->ColumnStatistics);
203241

204242
typeCtx->SetStats(input.Get(), stats);
205243

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{
2+
"op_name": "InnerJoin (Map)",
3+
"args": [
4+
{
5+
"op_name": "TableLookup",
6+
"table": "stock"
7+
},
8+
{
9+
"op_name": "TableRangeScan",
10+
"table": "order_line"
11+
}
12+
]
13+
}
Lines changed: 41 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,49 @@
11
{
2-
"op_name": "InnerJoin (Grace)",
2+
"op_name": "InnerJoin (MapJoin)",
33
"args": [
4-
{
5-
"op_name": "InnerJoin (Grace)",
6-
"args": [
7-
{
8-
"op_name": "TableFullScan",
9-
"table": "orders"
10-
},
11-
{
4+
{
125
"op_name": "InnerJoin (Grace)",
136
"args": [
14-
{
15-
"op_name": "TableFullScan",
16-
"table": "lineitem"
17-
},
18-
{
19-
"op_name": "InnerJoin (Grace)",
20-
"args": [
21-
{
7+
{
228
"op_name": "TableFullScan",
23-
"table": "partsupp"
24-
},
25-
{
9+
"table": "orders"
10+
},
11+
{
12+
"op_name": "InnerJoin (Grace)",
13+
"args": [
14+
{
15+
"op_name": "TableFullScan",
16+
"table": "lineitem"
17+
},
18+
{
19+
"op_name": "InnerJoin (MapJoin)",
20+
"args": [
21+
{
22+
"op_name": "TableFullScan",
23+
"table": "partsupp"
24+
},
25+
{
26+
"op_name": "TableFullScan",
27+
"table": "part"
28+
}
29+
]
30+
}
31+
]
32+
}
33+
]
34+
},
35+
{
36+
"op_name": "InnerJoin (MapJoin)",
37+
"args": [
38+
{
39+
"op_name": "TableFullScan",
40+
"table": "supplier"
41+
},
42+
{
2643
"op_name": "TableFullScan",
27-
"table": "part"
28-
}
29-
]
30-
}
44+
"table": "nation"
45+
}
3146
]
32-
}
33-
]
34-
},
35-
{
36-
"op_name": "InnerJoin (MapJoin)",
37-
"args": [
38-
{
39-
"op_name": "TableFullScan",
40-
"table": "supplier"
41-
},
42-
{
43-
"op_name": "TableFullScan",
44-
"table": "nation"
45-
}
46-
]
47-
}
47+
}
4848
]
49-
}
49+
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
SELECT COUNT(DISTINCT (s.S_I_ID)) AS STOCK_COUNT
2+
FROM `/Root/test/tpcc/order_line` as ol INNER JOIN `/Root/test/tpcc/stock` as s ON s.S_I_ID = ol.OL_I_ID
3+
WHERE ol.OL_W_ID = 1
4+
AND ol.OL_D_ID = 10
5+
AND ol.OL_O_ID < 3000
6+
AND ol.OL_O_ID >= 2900
7+
AND s.S_W_ID = 1
8+
AND s.S_QUANTITY < 15

0 commit comments

Comments
 (0)