Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

statistics: change the upper bound of the out-of-range estimation to modify count | tidb-test=pr/2012 #39011

Merged
merged 8 commits into from
Jan 10, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions statistics/column.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ func (c *Column) equalRowCount(sctx sessionctx.Context, val types.Datum, encoded
}

// GetColumnRowCount estimates the row count by a slice of Range.
func (c *Column) GetColumnRowCount(sctx sessionctx.Context, ranges []*ranger.Range, realtimeRowCount int64, pkIsHandle bool) (float64, error) {
func (c *Column) GetColumnRowCount(sctx sessionctx.Context, ranges []*ranger.Range, realtimeRowCount, modifyCount int64, pkIsHandle bool) (float64, error) {
sc := sctx.GetSessionVars().StmtCtx
var rowCount float64
for _, rg := range ranges {
Expand Down Expand Up @@ -284,11 +284,7 @@ func (c *Column) GetColumnRowCount(sctx sessionctx.Context, ranges []*ranger.Ran

// handling the out-of-range part
if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) {
increaseCount := realtimeRowCount - int64(c.TotalRowCount())
if increaseCount < 0 {
increaseCount = 0
}
cnt += c.Histogram.outOfRangeRowCount(&lowVal, &highVal, increaseCount)
cnt += c.Histogram.outOfRangeRowCount(&lowVal, &highVal, modifyCount)
}

rowCount += cnt
Expand Down
4 changes: 2 additions & 2 deletions statistics/handle/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -1449,10 +1449,10 @@ func (h *Handle) RecalculateExpectCount(q *statistics.QueryFeedback, enablePseud
expected := 0.0
if isIndex {
idx := t.Indices[id]
expected, err = idx.GetRowCount(sctx, nil, ranges, t.Count)
expected, err = idx.GetRowCount(sctx, nil, ranges, t.Count, t.ModifyCount)
} else {
c := t.Columns[id]
expected, err = c.GetColumnRowCount(sctx, ranges, t.Count, true)
expected, err = c.GetColumnRowCount(sctx, ranges, t.Count, t.ModifyCount, true)
}
q.Expected = int64(expected)
return err
Expand Down
8 changes: 4 additions & 4 deletions statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -778,7 +778,7 @@ func (hg *Histogram) outOfRange(val types.Datum) bool {

// outOfRangeRowCount estimate the row count of part of [lDatum, rDatum] which is out of range of the histogram.
// Here we assume the density of data is decreasing from the lower/upper bound of the histogram toward outside.
// The maximum row count it can get is the increaseCount. It reaches the maximum when out-of-range width reaches histogram range width.
// The maximum row count it can get is the modifyCount. It reaches the maximum when out-of-range width reaches histogram range width.
// As it shows below. To calculate the out-of-range row count, we need to calculate the percentage of the shaded area.
// Note that we assume histL-boundL == histR-histL == boundR-histR here.
/*
Expand All @@ -795,7 +795,7 @@ func (hg *Histogram) outOfRange(val types.Datum) bool {
│ │
lDatum rDatum
*/
func (hg *Histogram) outOfRangeRowCount(lDatum, rDatum *types.Datum, increaseCount int64) float64 {
func (hg *Histogram) outOfRangeRowCount(lDatum, rDatum *types.Datum, modifyCount int64) float64 {
if hg.Len() == 0 {
return 0
}
Expand Down Expand Up @@ -879,8 +879,8 @@ func (hg *Histogram) outOfRangeRowCount(lDatum, rDatum *types.Datum, increaseCou
totalPercent = 1
}
rowCount := totalPercent * hg.notNullCount()
if rowCount > float64(increaseCount) {
return float64(increaseCount)
if rowCount > float64(modifyCount) {
return float64(modifyCount)
}
return rowCount
}
Expand Down
8 changes: 2 additions & 6 deletions statistics/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ func (idx *Index) QueryBytes(d []byte) uint64 {

// GetRowCount returns the row count of the given ranges.
// It uses the modifyCount to adjust the influence of modifications on the table.
func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRanges []*ranger.Range, realtimeRowCount int64) (float64, error) {
func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRanges []*ranger.Range, realtimeRowCount, modifyCount int64) (float64, error) {
idx.checkStats()
sc := sctx.GetSessionVars().StmtCtx
totalCount := float64(0)
Expand Down Expand Up @@ -309,11 +309,7 @@ func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRang

// handling the out-of-range part
if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) {
increaseCount := realtimeRowCount - int64(idx.TotalRowCount())
if increaseCount < 0 {
increaseCount = 0
}
totalCount += idx.Histogram.outOfRangeRowCount(&l, &r, increaseCount)
totalCount += idx.Histogram.outOfRangeRowCount(&l, &r, modifyCount)
}
}
totalCount = mathutil.Clamp(totalCount, 0, float64(realtimeRowCount))
Expand Down
8 changes: 5 additions & 3 deletions statistics/selectivity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ func TestOutOfRangeEstimation(t *testing.T) {
statsTbl := h.GetTableStats(table.Meta())
sctx := mock.NewContext()
col := statsTbl.Columns[table.Meta().Columns[0].ID]
count, err := col.GetColumnRowCount(sctx, getRange(900, 900), statsTbl.Count, false)
count, err := col.GetColumnRowCount(sctx, getRange(900, 900), statsTbl.Count, statsTbl.ModifyCount, false)
require.NoError(t, err)
// Because the ANALYZE collect data by random sampling, so the result is not an accurate value.
// so we use a range here.
Expand All @@ -146,8 +146,9 @@ func TestOutOfRangeEstimation(t *testing.T) {
statsSuiteData := statistics.GetStatsSuiteData()
statsSuiteData.LoadTestCases(t, &input, &output)
increasedTblRowCount := int64(float64(statsTbl.Count) * 1.5)
modifyCount := int64(float64(statsTbl.Count) * 0.5)
for i, ran := range input {
count, err = col.GetColumnRowCount(sctx, getRange(ran.Start, ran.End), increasedTblRowCount, false)
count, err = col.GetColumnRowCount(sctx, getRange(ran.Start, ran.End), increasedTblRowCount, modifyCount, false)
require.NoError(t, err)
testdata.OnRecord(func() {
output[i].Start = ran.Start
Expand Down Expand Up @@ -542,6 +543,7 @@ func TestSelectivity(t *testing.T) {
require.Truef(t, math.Abs(ratio-tt.selectivity) < eps, "for %s, needed: %v, got: %v", tt.exprs, tt.selectivity, ratio)

histColl.Count *= 10
histColl.ModifyCount = histColl.Count * 9
ratio, _, err = histColl.Selectivity(sctx, sel.Conditions, nil)
require.NoErrorf(t, err, "for %s", tt.exprs)
require.Truef(t, math.Abs(ratio-tt.selectivityAfterIncrease) < eps, "for %s, needed: %v, got: %v", tt.exprs, tt.selectivityAfterIncrease, ratio)
Expand Down Expand Up @@ -747,7 +749,7 @@ func TestSmallRangeEstimation(t *testing.T) {
statsSuiteData := statistics.GetStatsSuiteData()
statsSuiteData.LoadTestCases(t, &input, &output)
for i, ran := range input {
count, err := col.GetColumnRowCount(sctx, getRange(ran.Start, ran.End), statsTbl.Count, false)
count, err := col.GetColumnRowCount(sctx, getRange(ran.Start, ran.End), statsTbl.Count, statsTbl.ModifyCount, false)
require.NoError(t, err)
testdata.OnRecord(func() {
output[i].Start = ran.Start
Expand Down
10 changes: 5 additions & 5 deletions statistics/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ func (coll *HistColl) GetRowCountByIntColumnRanges(sctx sessionctx.Context, colI
}
return result, nil
}
result, err = c.GetColumnRowCount(sctx, intRanges, coll.Count, true)
result, err = c.GetColumnRowCount(sctx, intRanges, coll.Count, coll.ModifyCount, true)
if sc.EnableOptimizerCETrace {
CETraceRange(sctx, coll.PhysicalID, []string{c.Info.Name.O}, intRanges, "Column Stats", uint64(result))
}
Expand All @@ -587,7 +587,7 @@ func (coll *HistColl) GetRowCountByColumnRanges(sctx sessionctx.Context, colID i
}
return result, err
}
result, err := c.GetColumnRowCount(sctx, colRanges, coll.Count, false)
result, err := c.GetColumnRowCount(sctx, colRanges, coll.Count, coll.ModifyCount, false)
if sc.EnableOptimizerCETrace {
CETraceRange(sctx, coll.PhysicalID, []string{c.Info.Name.O}, colRanges, "Column Stats", uint64(result))
}
Expand Down Expand Up @@ -623,7 +623,7 @@ func (coll *HistColl) GetRowCountByIndexRanges(sctx sessionctx.Context, idxID in
if idx.CMSketch != nil && idx.StatsVer == Version1 {
result, err = coll.getIndexRowCount(sctx, idxID, indexRanges)
} else {
result, err = idx.GetRowCount(sctx, coll, indexRanges, coll.Count)
result, err = idx.GetRowCount(sctx, coll, indexRanges, coll.Count, coll.ModifyCount)
}
if sc.EnableOptimizerCETrace {
CETraceRange(sctx, coll.PhysicalID, colNames, indexRanges, "Index Stats", uint64(result))
Expand Down Expand Up @@ -959,7 +959,7 @@ func (coll *HistColl) crossValidationSelectivity(sctx sessionctx.Context, idx *I
Collators: []collate.Collator{idxPointRange.Collators[i]},
}

rowCount, err := col.GetColumnRowCount(sctx, []*ranger.Range{&rang}, coll.Count, col.IsHandle)
rowCount, err := col.GetColumnRowCount(sctx, []*ranger.Range{&rang}, coll.Count, coll.ModifyCount, col.IsHandle)
if err != nil {
return 0, 0, err
}
Expand Down Expand Up @@ -1031,7 +1031,7 @@ func (coll *HistColl) getIndexRowCount(sctx sessionctx.Context, idxID int64, ind
// on single-column index, use previous way as well, because CMSketch does not contain null
// values in this case.
if rangePosition == 0 || isSingleColIdxNullRange(idx, ran) {
count, err := idx.GetRowCount(sctx, nil, []*ranger.Range{ran}, coll.Count)
count, err := idx.GetRowCount(sctx, nil, []*ranger.Range{ran}, coll.Count, coll.ModifyCount)
if err != nil {
return 0, errors.Trace(err)
}
Expand Down