Skip to content

Commit

Permalink
statistics: correctly check if a column histogram is loaded (#32764)
Browse files Browse the repository at this point in the history
close #31925, ref #32758, close #32833
  • Loading branch information
time-and-fate authored Mar 13, 2022
1 parent 51111b8 commit 9fcc6b0
Show file tree
Hide file tree
Showing 9 changed files with 53 additions and 22 deletions.
26 changes: 13 additions & 13 deletions cmd/explaintest/r/imdbload.result
Original file line number Diff line number Diff line change
Expand Up @@ -276,31 +276,31 @@ load stats 's/imdbload_stats/movie_info.json';
load stats 's/imdbload_stats/cast_info.json';
explain select * from char_name where ((imdb_index = 'I') and (surname_pcode < 'E436')) or ((imdb_index = 'L') and (surname_pcode < 'E436'));
id estRows task access object operator info
TableReader_7 804024.75 root data:Selection_6
└─Selection_6 804024.75 cop[tikv] or(and(eq(imdbload.char_name.imdb_index, "I"), lt(imdbload.char_name.surname_pcode, "E436")), and(eq(imdbload.char_name.imdb_index, "L"), lt(imdbload.char_name.surname_pcode, "E436")))
└─TableFullScan_5 4314864.00 cop[tikv] table:char_name keep order:false
IndexLookUp_10 1005030.94 root
├─IndexRangeScan_8(Build) 1005030.94 cop[tikv] table:char_name, index:itest2(imdb_index, surname_pcode, name_pcode_nf) range:["I" -inf,"I" "E436"), ["L" -inf,"L" "E436"), keep order:false
└─TableRowIDScan_9(Probe) 1005030.94 cop[tikv] table:char_name keep order:false
explain select * from char_name use index (itest2) where ((imdb_index = 'I') and (surname_pcode < 'E436')) or ((imdb_index = 'L') and (surname_pcode < 'E436'));
id estRows task access object operator info
IndexLookUp_7 2010061.87 root
├─IndexRangeScan_5(Build) 2010061.87 cop[tikv] table:char_name, index:itest2(imdb_index, surname_pcode, name_pcode_nf) range:["I" -inf,"I" "E436"), ["L" -inf,"L" "E436"), keep order:false
└─TableRowIDScan_6(Probe) 2010061.87 cop[tikv] table:char_name keep order:false
IndexLookUp_7 1005030.94 root
├─IndexRangeScan_5(Build) 1005030.94 cop[tikv] table:char_name, index:itest2(imdb_index, surname_pcode, name_pcode_nf) range:["I" -inf,"I" "E436"), ["L" -inf,"L" "E436"), keep order:false
└─TableRowIDScan_6(Probe) 1005030.94 cop[tikv] table:char_name keep order:false
trace plan target = 'estimation' select * from char_name where ((imdb_index = 'I') and (surname_pcode < 'E436')) or ((imdb_index = 'L') and (surname_pcode < 'E436'));
CE_trace
[{"table_name":"char_name","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":4314864},{"table_name":"char_name","type":"Index Stats-Range","expr":"((imdb_index = 'I') and (surname_pcode < 'E436')) or ((imdb_index = 'L') and (surname_pcode < 'E436'))","row_count":2010061},{"table_name":"char_name","type":"Index Stats-Range","expr":"((surname_pcode < 'E436'))","row_count":1005030},{"table_name":"char_name","type":"Table Stats-Expression-CNF","expr":"`or`(`and`(`eq`(imdbload.char_name.imdb_index, 'I'), `lt`(imdbload.char_name.surname_pcode, 'E436')), `and`(`eq`(imdbload.char_name.imdb_index, 'L'), `lt`(imdbload.char_name.surname_pcode, 'E436')))","row_count":804024}]
[{"table_name":"char_name","type":"Column Stats-Point","expr":"((imdb_index = 'I'))","row_count":0},{"table_name":"char_name","type":"Column Stats-Point","expr":"((imdb_index = 'L'))","row_count":0},{"table_name":"char_name","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":4314864},{"table_name":"char_name","type":"Index Stats-Range","expr":"((imdb_index = 'I') and (surname_pcode < 'E436')) or ((imdb_index = 'L') and (surname_pcode < 'E436'))","row_count":0},{"table_name":"char_name","type":"Index Stats-Range","expr":"((surname_pcode < 'E436'))","row_count":1005030},{"table_name":"char_name","type":"Table Stats-Expression-CNF","expr":"`or`(`and`(`eq`(imdbload.char_name.imdb_index, 'I'), `lt`(imdbload.char_name.surname_pcode, 'E436')), `and`(`eq`(imdbload.char_name.imdb_index, 'L'), `lt`(imdbload.char_name.surname_pcode, 'E436')))","row_count":804024}]

explain select * from char_name where ((imdb_index = 'V') and (surname_pcode < 'L3416'));
id estRows task access object operator info
TableReader_7 1927106.39 root data:Selection_6
└─Selection_6 1927106.39 cop[tikv] eq(imdbload.char_name.imdb_index, "V"), lt(imdbload.char_name.surname_pcode, "L3416")
└─TableFullScan_5 4314864.00 cop[tikv] table:char_name keep order:false
IndexLookUp_10 0.00 root
├─IndexRangeScan_8(Build) 0.00 cop[tikv] table:char_name, index:itest2(imdb_index, surname_pcode, name_pcode_nf) range:["V" -inf,"V" "L3416"), keep order:false
└─TableRowIDScan_9(Probe) 0.00 cop[tikv] table:char_name keep order:false
explain select * from char_name where imdb_index > 'V';
id estRows task access object operator info
IndexLookUp_10 0.00 root
├─IndexRangeScan_8(Build) 0.00 cop[tikv] table:char_name, index:itest2(imdb_index, surname_pcode, name_pcode_nf) range:("V",+inf], keep order:false
└─TableRowIDScan_9(Probe) 0.00 cop[tikv] table:char_name keep order:false
trace plan target = 'estimation' select * from char_name where imdb_index > 'V';
CE_trace
[{"table_name":"char_name","type":"Column Stats-Pseudo-Range","expr":"((imdb_index > 'V' and true))","row_count":1438288},{"table_name":"char_name","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":4314864},{"table_name":"char_name","type":"Index Stats-Range","expr":"((imdb_index > 'V' and true))","row_count":0},{"table_name":"char_name","type":"Table Stats-Expression-CNF","expr":"`gt`(imdbload.char_name.imdb_index, 'V')","row_count":0}]
[{"table_name":"char_name","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":4314864},{"table_name":"char_name","type":"Column Stats-Range","expr":"((imdb_index > 'V' and true))","row_count":0},{"table_name":"char_name","type":"Index Stats-Range","expr":"((imdb_index > 'V' and true))","row_count":0},{"table_name":"char_name","type":"Table Stats-Expression-CNF","expr":"`gt`(imdbload.char_name.imdb_index, 'V')","row_count":0}]

explain select * from movie_companies where company_type_id > 2;
id estRows task access object operator info
Expand All @@ -318,7 +318,7 @@ IndexLookUp_10 0.00 root
└─TableRowIDScan_9(Probe) 0.00 cop[tikv] table:char_name keep order:false
trace plan target = 'estimation' select * from char_name where imdb_index > 'I' and imdb_index < 'II';
CE_trace
[{"table_name":"char_name","type":"Column Stats-Pseudo-Range","expr":"((imdb_index > 'I' and imdb_index < 'II'))","row_count":107871},{"table_name":"char_name","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":4314864},{"table_name":"char_name","type":"Index Stats-Range","expr":"((imdb_index > 'I' and imdb_index < 'II'))","row_count":0},{"table_name":"char_name","type":"Table Stats-Expression-CNF","expr":"`and`(`gt`(imdbload.char_name.imdb_index, 'I'), `lt`(imdbload.char_name.imdb_index, 'II'))","row_count":0}]
[{"table_name":"char_name","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":4314864},{"table_name":"char_name","type":"Column Stats-Range","expr":"((imdb_index > 'I' and imdb_index < 'II'))","row_count":0},{"table_name":"char_name","type":"Index Stats-Range","expr":"((imdb_index > 'I' and imdb_index < 'II'))","row_count":0},{"table_name":"char_name","type":"Table Stats-Expression-CNF","expr":"`and`(`gt`(imdbload.char_name.imdb_index, 'I'), `lt`(imdbload.char_name.imdb_index, 'II'))","row_count":0}]

explain select * from char_name where imdb_index > 'I';
id estRows task access object operator info
Expand All @@ -327,7 +327,7 @@ IndexLookUp_10 0.00 root
└─TableRowIDScan_9(Probe) 0.00 cop[tikv] table:char_name keep order:false
trace plan target = 'estimation' select * from char_name where imdb_index > 'I';
CE_trace
[{"table_name":"char_name","type":"Column Stats-Pseudo-Range","expr":"((imdb_index > 'I' and true))","row_count":1438288},{"table_name":"char_name","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":4314864},{"table_name":"char_name","type":"Index Stats-Range","expr":"((imdb_index > 'I' and true))","row_count":0},{"table_name":"char_name","type":"Table Stats-Expression-CNF","expr":"`gt`(imdbload.char_name.imdb_index, 'I')","row_count":0}]
[{"table_name":"char_name","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":4314864},{"table_name":"char_name","type":"Column Stats-Range","expr":"((imdb_index > 'I' and true))","row_count":0},{"table_name":"char_name","type":"Index Stats-Range","expr":"((imdb_index > 'I' and true))","row_count":0},{"table_name":"char_name","type":"Table Stats-Expression-CNF","expr":"`gt`(imdbload.char_name.imdb_index, 'I')","row_count":0}]

explain select * from cast_info where nr_order < -2068070866;
id estRows task access object operator info
Expand Down
1 change: 1 addition & 0 deletions cmd/explaintest/t/imdbload.test
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ load stats 's/imdbload_stats/cast_info.json';
-- The statistics and actual row count are from the latest imdb dataset that is distributed as old text files.

-- Actual row count: 1
-- Index lookup on itest2 index is the best plan, runs <50ms for the first time. Table scan + Selection runs >800ms. (using 8 core tikv * 5, copr cache disabled)
explain select * from char_name where ((imdb_index = 'I') and (surname_pcode < 'E436')) or ((imdb_index = 'L') and (surname_pcode < 'E436'));
explain select * from char_name use index (itest2) where ((imdb_index = 'I') and (surname_pcode < 'E436')) or ((imdb_index = 'L') and (surname_pcode < 'E436'));
trace plan target = 'estimation' select * from char_name where ((imdb_index = 'I') and (surname_pcode < 'E436')) or ((imdb_index = 'L') and (surname_pcode < 'E436'));
Expand Down
1 change: 1 addition & 0 deletions statistics/handle/dump.go
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ func TableStatsFromJSON(tableInfo *model.TableInfo, physicalID int64, jsonTbl *J
Info: colInfo,
IsHandle: tableInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.Flag),
StatsVer: statsVer,
Loaded: true,
}
col.Count = int64(col.TotalRowCount())
tbl.Columns[col.ID] = col
Expand Down
6 changes: 4 additions & 2 deletions statistics/handle/handle.go
Original file line number Diff line number Diff line change
Expand Up @@ -613,7 +613,7 @@ func (h *Handle) LoadNeededHistograms() (err error) {
continue
}
c, ok := tbl.Columns[col.ColumnID]
if !ok || c.Len() > 0 {
if !ok || c.IsLoaded() {
statistics.HistogramNeededColumns.Delete(col)
continue
}
Expand Down Expand Up @@ -645,6 +645,7 @@ func (h *Handle) LoadNeededHistograms() (err error) {
FMSketch: fms,
IsHandle: c.IsHandle,
StatsVer: rows[0].GetInt64(0),
Loaded: true,
}
// Column.Count is calculated by Column.TotalRowCount(). Hence we don't set Column.Count when initializing colHist.
colHist.Count = int64(colHist.TotalRowCount())
Expand Down Expand Up @@ -791,7 +792,7 @@ func (h *Handle) columnStatsFromStorage(reader *statsReader, row chunk.Row, tabl
// 4. loadAll is false.
notNeedLoad := h.Lease() > 0 &&
!isHandle &&
(col == nil || col.Len() == 0 && col.LastUpdateVersion < histVer) &&
(col == nil || !col.IsLoaded() && col.LastUpdateVersion < histVer) &&
!loadAll
if notNeedLoad {
count, err := h.columnCountFromStorage(reader, table.PhysicalID, histID, statsVer)
Expand Down Expand Up @@ -833,6 +834,7 @@ func (h *Handle) columnStatsFromStorage(reader *statsReader, row chunk.Row, tabl
IsHandle: tableInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.Flag),
Flag: flag,
StatsVer: statsVer,
Loaded: true,
}
// Column.Count is calculated by Column.TotalRowCount(). Hence we don't set Column.Count when initializing col.
col.Count = int64(col.TotalRowCount())
Expand Down
1 change: 1 addition & 0 deletions statistics/handle/handle_hist.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ func (h *Handle) readStatsForOne(col model.TableColumnID, c *statistics.Column,
FMSketch: fms,
IsHandle: c.IsHandle,
StatsVer: rows[0].GetInt64(0),
Loaded: true,
}
// Column.Count is calculated by Column.TotalRowCount(). Hence, we don't set Column.Count when initializing colHist.
colHist.Count = int64(colHist.TotalRowCount())
Expand Down
23 changes: 19 additions & 4 deletions statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -1049,6 +1049,17 @@ type Column struct {
Flag int64
LastAnalyzePos types.Datum
StatsVer int64 // StatsVer is the version of the current stats, used to maintain compatibility

// Loaded indicates whether the histogram, the TopN and the CM sketch are fully loaded.
// Those three parts of a Column are loaded lazily: they are only loaded after something tries to use them.
// Note: for now, please use Column.IsLoaded() to check whether they are loaded.
Loaded bool
}

// IsLoaded reports whether the column's statistics are considered loaded.
// It returns true when c.Loaded is set or when c.notNullCount() > 0; the second
// condition is kept for safety while callers are being switched from the old
// `c.notNullCount() > 0` check to `c.Loaded`.
func (c *Column) IsLoaded() bool {
return c.Loaded || c.notNullCount() > 0
}

func (c *Column) String() string {
Expand Down Expand Up @@ -1108,20 +1119,23 @@ func (c *Column) IsInvalid(sctx sessionctx.Context, collPseudo bool) bool {
if stmtctx != nil && stmtctx.StatsLoad.Fallback {
return true
}
if c.Histogram.NDV > 0 && c.notNullCount() == 0 && stmtctx != nil {
if !c.IsLoaded() && stmtctx != nil {
if stmtctx.StatsLoad.Timeout > 0 {
logutil.BgLogger().Warn("Hist for column should already be loaded as sync but not found.",
zap.String(strconv.FormatInt(c.Info.ID, 10), c.Info.Name.O))
}
HistogramNeededColumns.insert(tableColumnID{TableID: c.PhysicalID, ColumnID: c.Info.ID})
// In some tests, the c.Info is not set, so we add this check here.
if c.Info != nil {
HistogramNeededColumns.insert(tableColumnID{TableID: c.PhysicalID, ColumnID: c.Info.ID})
}
}
}
return c.TotalRowCount() == 0 || (c.Histogram.NDV > 0 && c.notNullCount() == 0)
return c.TotalRowCount() == 0 || !c.IsLoaded()
}

// IsHistNeeded checks if this column needs histogram to be loaded
func (c *Column) IsHistNeeded(collPseudo bool) bool {
return (!collPseudo || !c.NotAccurate()) && c.Histogram.NDV > 0 && c.notNullCount() == 0
return (!collPseudo || !c.NotAccurate()) && !c.IsLoaded()
}

func (c *Column) equalRowCount(sctx sessionctx.Context, val types.Datum, encodedVal []byte, realtimeRowCount int64) (float64, error) {
Expand Down Expand Up @@ -1674,6 +1688,7 @@ func (coll *HistColl) NewHistCollBySelectivity(sctx sessionctx.Context, statsNod
zap.Error(err))
continue
}
newCol.Loaded = oldCol.Loaded
newColl.Columns[node.ID] = newCol
}
for id, idx := range coll.Indices {
Expand Down
2 changes: 2 additions & 0 deletions statistics/histogram_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ func TestNewHistogramBySelectivity(t *testing.T) {
intCol := &Column{}
intCol.Histogram = *NewHistogram(1, 30, 30, 0, types.NewFieldType(mysql.TypeLonglong), chunk.InitialCapacity, 0)
intCol.IsHandle = true
intCol.Loaded = true
for i := 0; i < 10; i++ {
intCol.Bounds.AppendInt64(0, int64(i*3))
intCol.Bounds.AppendInt64(0, int64(i*3+2))
Expand All @@ -61,6 +62,7 @@ num: 1 lower_bound: 12 upper_bound: 14 repeats: 0 ndv: 0
num: 30 lower_bound: 27 upper_bound: 29 repeats: 0 ndv: 0`

stringCol := &Column{}
stringCol.Loaded = true
stringCol.Histogram = *NewHistogram(2, 15, 30, 0, types.NewFieldType(mysql.TypeString), chunk.InitialCapacity, 0)
stringCol.Bounds.AppendString(0, "a")
stringCol.Bounds.AppendString(0, "aaaabbbb")
Expand Down
6 changes: 5 additions & 1 deletion statistics/selectivity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -866,7 +866,11 @@ func prepareSelectivity(testKit *testkit.TestKit, dom *domain.Domain) (*statisti
return nil, err
}
for i := 1; i <= 5; i++ {
statsTbl.Columns[int64(i)] = &statistics.Column{Histogram: *mockStatsHistogram(int64(i), colValues, 10, types.NewFieldType(mysql.TypeLonglong)), Info: tbl.Columns[i-1]}
statsTbl.Columns[int64(i)] = &statistics.Column{
Histogram: *mockStatsHistogram(int64(i), colValues, 10, types.NewFieldType(mysql.TypeLonglong)),
Info: tbl.Columns[i-1],
Loaded: true,
}
}

// Set the value of two indices' histograms.
Expand Down
9 changes: 7 additions & 2 deletions statistics/statistics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,12 @@ func SubTestColumnRange() func(*testing.T) {
hg, err := BuildColumn(ctx, bucketCount, 2, collector, types.NewFieldType(mysql.TypeLonglong))
hg.PreCalculateScalar()
require.NoError(t, err)
col := &Column{Histogram: *hg, CMSketch: buildCMSketch(s.rc.(*recordSet).data), Info: &model.ColumnInfo{}}
col := &Column{
Histogram: *hg,
CMSketch: buildCMSketch(s.rc.(*recordSet).data),
Info: &model.ColumnInfo{},
Loaded: true,
}
tbl := &Table{
HistColl: HistColl{
Count: int64(col.TotalRowCount()),
Expand Down Expand Up @@ -322,7 +327,7 @@ func SubTestIntColumnRanges() func(*testing.T) {
hg.PreCalculateScalar()
require.NoError(t, err)
require.Equal(t, int64(100000), rowCount)
col := &Column{Histogram: *hg, Info: &model.ColumnInfo{}}
col := &Column{Histogram: *hg, Info: &model.ColumnInfo{}, Loaded: true}
tbl := &Table{
HistColl: HistColl{
Count: int64(col.TotalRowCount()),
Expand Down

0 comments on commit 9fcc6b0

Please sign in to comment.