Skip to content

Commit

Permalink
statistics: change the upper bound of the out-of-range estimation to …
Browse files Browse the repository at this point in the history
…modify count | tidb-test=pr/2013 (#39011) (#40454)

close #39008
  • Loading branch information
ti-chi-bot authored Jan 10, 2023
1 parent fc46865 commit 99b2d4c
Show file tree
Hide file tree
Showing 6 changed files with 376 additions and 42 deletions.
4 changes: 2 additions & 2 deletions statistics/handle/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -1402,10 +1402,10 @@ func (h *Handle) RecalculateExpectCount(q *statistics.QueryFeedback) error {
expected := 0.0
if isIndex {
idx := t.Indices[id]
expected, err = idx.GetRowCount(sctx, nil, ranges, t.Count)
expected, err = idx.GetRowCount(sctx, nil, ranges, t.Count, t.ModifyCount)
} else {
c := t.Columns[id]
expected, err = c.GetColumnRowCount(sctx, ranges, t.Count, true)
expected, err = c.GetColumnRowCount(sctx, ranges, t.Count, t.ModifyCount, true)
}
q.Expected = int64(expected)
return err
Expand Down
58 changes: 26 additions & 32 deletions statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -879,27 +879,23 @@ func (hg *Histogram) outOfRange(val types.Datum) bool {

// outOfRangeRowCount estimates the row count of the part of [lDatum, rDatum] which is out of range of the histogram.
// Here we assume the density of data is decreasing from the lower/upper bound of the histogram toward outside.
// The maximum row count it can get is the increaseCount. It reaches the maximum when out-of-range width reaches histogram range width.
// The maximum row count it can get is the modifyCount. It reaches the maximum when out-of-range width reaches histogram range width.
// As it shows below. To calculate the out-of-range row count, we need to calculate the percentage of the shaded area.
// Note that we assume histL-boundL == histR-histL == boundR-histR here.
//
// /│ │\
// / │ │ \
// /x│ │◄─histogram─►│ \
// / xx│ │ range │ \
// / │xxx│ │ │ \
// / │xxx│ │ │ \
//
// ────┴────┴───┴──┴─────────────┴───────────┴─────
//
// ▲ ▲ ▲ ▲ ▲ ▲
// │ │ │ │ │ │
//
// boundL │ │histL histR boundR
//
// │ │
// lDatum rDatum
func (hg *Histogram) outOfRangeRowCount(lDatum, rDatum *types.Datum, increaseCount int64) float64 {
// /│ │\
// / │ │ \
// /x│ │◄─histogram─►│ \
// / xx│ │ range │ \
// / │xxx│ │ │ \
// / │xxx│ │ │ \
// ────┴────┴───┴──┴─────────────┴───────────┴─────
// ▲ ▲ ▲ ▲ ▲ ▲
// │ │ │ │ │ │
// boundL │ │histL histR boundR
// │ │
// lDatum rDatum
func (hg *Histogram) outOfRangeRowCount(lDatum, rDatum *types.Datum, modifyCount int64) float64 {
if hg.Len() == 0 {
return 0
}
Expand Down Expand Up @@ -983,8 +979,14 @@ func (hg *Histogram) outOfRangeRowCount(lDatum, rDatum *types.Datum, increaseCou
totalPercent = 1
}
rowCount := totalPercent * hg.notNullCount()
if rowCount > float64(increaseCount) {
return float64(increaseCount)

// Use the modifyCount as the upper bound. Note that modifyCount contains insert, delete and update. So this is
// a rather loose upper bound.
// There are some scenarios where we need to handle out-of-range estimation after both insert and delete happen.
// But we don't know how many increases are in the modifyCount. So we have to use this loose bound to ensure it
// can produce reasonable results in this scenario.
if rowCount > float64(modifyCount) {
return float64(modifyCount)
}
return rowCount
}
Expand Down Expand Up @@ -1209,7 +1211,7 @@ func (c *Column) equalRowCount(sctx sessionctx.Context, val types.Datum, encoded
}

// GetColumnRowCount estimates the row count by a slice of Range.
func (c *Column) GetColumnRowCount(sctx sessionctx.Context, ranges []*ranger.Range, realtimeRowCount int64, pkIsHandle bool) (float64, error) {
func (c *Column) GetColumnRowCount(sctx sessionctx.Context, ranges []*ranger.Range, realtimeRowCount, modifyCount int64, pkIsHandle bool) (float64, error) {
sc := sctx.GetSessionVars().StmtCtx
var rowCount float64
for _, rg := range ranges {
Expand Down Expand Up @@ -1306,11 +1308,7 @@ func (c *Column) GetColumnRowCount(sctx sessionctx.Context, ranges []*ranger.Ran

// handling the out-of-range part
if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) {
increaseCount := realtimeRowCount - int64(c.TotalRowCount())
if increaseCount < 0 {
increaseCount = 0
}
cnt += c.Histogram.outOfRangeRowCount(&lowVal, &highVal, increaseCount)
cnt += c.Histogram.outOfRangeRowCount(&lowVal, &highVal, modifyCount)
}

rowCount += cnt
Expand Down Expand Up @@ -1433,7 +1431,7 @@ func (idx *Index) QueryBytes(d []byte) uint64 {

// GetRowCount returns the row count of the given ranges.
// It uses the modifyCount to adjust the influence of modifications on the table.
func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRanges []*ranger.Range, realtimeRowCount int64) (float64, error) {
func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRanges []*ranger.Range, realtimeRowCount, modifyCount int64) (float64, error) {
sc := sctx.GetSessionVars().StmtCtx
totalCount := float64(0)
isSingleCol := len(idx.Info.Columns) == 1
Expand Down Expand Up @@ -1525,11 +1523,7 @@ func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRang

// handling the out-of-range part
if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) {
increaseCount := realtimeRowCount - int64(idx.TotalRowCount())
if increaseCount < 0 {
increaseCount = 0
}
totalCount += idx.Histogram.outOfRangeRowCount(&l, &r, increaseCount)
totalCount += idx.Histogram.outOfRangeRowCount(&l, &r, modifyCount)
}
}
totalCount = mathutil.Clamp(totalCount, 0, float64(realtimeRowCount))
Expand Down
92 changes: 89 additions & 3 deletions statistics/selectivity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ func TestOutOfRangeEstimation(t *testing.T) {
statsTbl := h.GetTableStats(table.Meta())
sctx := mock.NewContext()
col := statsTbl.Columns[table.Meta().Columns[0].ID]
count, err := col.GetColumnRowCount(sctx, getRange(900, 900), statsTbl.Count, false)
count, err := col.GetColumnRowCount(sctx, getRange(900, 900), statsTbl.Count, statsTbl.ModifyCount, false)
require.NoError(t, err)
// ANALYZE collects data by random sampling, so the result is not an exact value;
// we therefore check it against a range here.
Expand All @@ -147,8 +147,9 @@ func TestOutOfRangeEstimation(t *testing.T) {
statsSuiteData := statistics.GetStatsSuiteData()
statsSuiteData.GetTestCases(t, &input, &output)
increasedTblRowCount := int64(float64(statsTbl.Count) * 1.5)
modifyCount := int64(float64(statsTbl.Count) * 0.5)
for i, ran := range input {
count, err = col.GetColumnRowCount(sctx, getRange(ran.Start, ran.End), increasedTblRowCount, false)
count, err = col.GetColumnRowCount(sctx, getRange(ran.Start, ran.End), increasedTblRowCount, modifyCount, false)
require.NoError(t, err)
testdata.OnRecord(func() {
output[i].Start = ran.Start
Expand All @@ -160,6 +161,43 @@ func TestOutOfRangeEstimation(t *testing.T) {
}
}

// TestOutOfRangeEstimationAfterDelete tests the out-of-range estimation after deletions happen.
// The test result doesn't perfectly reflect the actual data distribution, but this is the expected behavior for now.
func TestOutOfRangeEstimationAfterDelete(t *testing.T) {
	store, dom, clean := testkit.CreateMockStoreAndDomain(t)
	defer clean()
	testKit := testkit.NewTestKit(t, store)
	h := dom.StatsHandle()
	testKit.MustExec("use test")
	testKit.MustExec("drop table if exists t")
	testKit.MustExec("create table t(a int unsigned)")
	require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh()))
	// Insert 3000 rows, five per distinct value, so the data covers [300, 900).
	for i := 0; i < 3000; i++ {
		testKit.MustExec(fmt.Sprintf("insert into t values (%v)", i/5+300)) // [300, 900)
	}
	require.NoError(t, h.DumpStatsDeltaToKV(handle.DumpAll))
	testKit.MustExec("analyze table t with 1 samplerate, 0 topn")
	// The delete runs after ANALYZE, so the statistics collected above were
	// built before these rows were removed.
	testKit.MustExec("delete from t where a < 500")
	require.NoError(t, h.DumpStatsDeltaToKV(handle.DumpAll))
	require.NoError(t, h.Update(dom.InfoSchema()))
	var (
		input  []string
		output []struct {
			SQL    string
			Result []string
		}
	)
	statsSuiteData := statistics.GetStatsSuiteData()
	statsSuiteData.GetTestCases(t, &input, &output)
	for i := range input {
		// NOTE(review): OnRecord presumably runs the callback only when the
		// suite is regenerating its expected output - confirm against testdata.
		testdata.OnRecord(func() {
			output[i].SQL = input[i]
			output[i].Result = testdata.ConvertRowsToStrings(testKit.MustQuery(input[i]).Rows())
		})
		testKit.MustQuery(input[i]).Check(testkit.Rows(output[i].Result...))
	}
}

func TestEstimationForUnknownValues(t *testing.T) {
store, dom, clean := testkit.CreateMockStoreAndDomain(t)
defer clean()
Expand Down Expand Up @@ -551,6 +589,7 @@ func TestSelectivity(t *testing.T) {
require.Truef(t, math.Abs(ratio-tt.selectivity) < eps, "for %s, needed: %v, got: %v", tt.exprs, tt.selectivity, ratio)

histColl.Count *= 10
histColl.ModifyCount = histColl.Count * 9
ratio, _, err = histColl.Selectivity(sctx, sel.Conditions, nil)
require.NoErrorf(t, err, "for %s", tt.exprs)
require.Truef(t, math.Abs(ratio-tt.selectivityAfterIncrease) < eps, "for %s, needed: %v, got: %v", tt.exprs, tt.selectivityAfterIncrease, ratio)
Expand Down Expand Up @@ -762,7 +801,7 @@ func TestSmallRangeEstimation(t *testing.T) {
statsSuiteData := statistics.GetStatsSuiteData()
statsSuiteData.GetTestCases(t, &input, &output)
for i, ran := range input {
count, err := col.GetColumnRowCount(sctx, getRange(ran.Start, ran.End), statsTbl.Count, false)
count, err := col.GetColumnRowCount(sctx, getRange(ran.Start, ran.End), statsTbl.Count, statsTbl.ModifyCount, false)
require.NoError(t, err)
testdata.OnRecord(func() {
output[i].Start = ran.Start
Expand Down Expand Up @@ -893,3 +932,50 @@ func TestSelectivityGreedyAlgo(t *testing.T) {
require.Equal(t, 1, len(usedSets))
require.Equal(t, int64(1), usedSets[0].ID)
}

// TestGlobalStatsOutOfRangeEstimationAfterDelete runs the out-of-range estimation cases against a
// range-partitioned table in dynamic prune mode after rows are deleted, and then checks that the
// same expected results still hold after re-analyzing only partition p4.
func TestGlobalStatsOutOfRangeEstimationAfterDelete(t *testing.T) {
	store, dom, clean := testkit.CreateMockStoreAndDomain(t)
	defer clean()
	testKit := testkit.NewTestKit(t, store)
	h := dom.StatsHandle()
	testKit.MustExec("use test")
	testKit.MustExec("set @@tidb_partition_prune_mode='dynamic'")
	testKit.MustExec("drop table if exists t")
	testKit.MustExec("create table t(a int unsigned) " +
		"partition by range (a) " +
		"(partition p0 values less than (400), " +
		"partition p1 values less than (600), " +
		"partition p2 values less than (800)," +
		"partition p3 values less than (1000)," +
		"partition p4 values less than (1200))")
	require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh()))
	// Insert 3000 rows, five per distinct value, so the data covers [300, 900);
	// no inserted value falls into p4's range [1000, 1200).
	for i := 0; i < 3000; i++ {
		testKit.MustExec(fmt.Sprintf("insert into t values (%v)", i/5+300)) // [300, 900)
	}
	require.NoError(t, h.DumpStatsDeltaToKV(handle.DumpAll))
	testKit.MustExec("analyze table t with 1 samplerate, 0 topn")
	// The delete runs after ANALYZE, so the statistics collected above were
	// built before these rows were removed.
	testKit.MustExec("delete from t where a < 500")
	require.NoError(t, h.DumpStatsDeltaToKV(handle.DumpAll))
	require.NoError(t, h.Update(dom.InfoSchema()))
	var (
		input  []string
		output []struct {
			SQL    string
			Result []string
		}
	)
	statsSuiteData := statistics.GetStatsSuiteData()
	statsSuiteData.GetTestCases(t, &input, &output)
	for i := range input {
		// NOTE(review): OnRecord presumably runs the callback only when the
		// suite is regenerating its expected output - confirm against testdata.
		testdata.OnRecord(func() {
			output[i].SQL = input[i]
			output[i].Result = testdata.ConvertRowsToStrings(testKit.MustQuery(input[i]).Rows())
		})
		testKit.MustQuery(input[i]).Check(testkit.Rows(output[i].Result...))
	}
	// Re-analyze only p4 and reload the stats; every query must still produce
	// the same expected plan rows as before.
	testKit.MustExec("analyze table t partition p4 with 1 samplerate, 0 topn")
	require.NoError(t, h.Update(dom.InfoSchema()))
	for i := range input {
		testKit.MustQuery(input[i]).Check(testkit.Rows(output[i].Result...))
	}
}
10 changes: 5 additions & 5 deletions statistics/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,7 @@ func (coll *HistColl) GetRowCountByIntColumnRanges(sctx sessionctx.Context, colI
}
return result, nil
}
result, err := c.GetColumnRowCount(sctx, intRanges, coll.Count, true)
result, err := c.GetColumnRowCount(sctx, intRanges, coll.Count, coll.ModifyCount, true)
if sc.EnableOptimizerCETrace {
CETraceRange(sctx, coll.PhysicalID, []string{c.Info.Name.O}, intRanges, "Column Stats", uint64(result))
}
Expand All @@ -484,7 +484,7 @@ func (coll *HistColl) GetRowCountByColumnRanges(sctx sessionctx.Context, colID i
}
return result, err
}
result, err := c.GetColumnRowCount(sctx, colRanges, coll.Count, false)
result, err := c.GetColumnRowCount(sctx, colRanges, coll.Count, coll.ModifyCount, false)
if sc.EnableOptimizerCETrace {
CETraceRange(sctx, coll.PhysicalID, []string{c.Info.Name.O}, colRanges, "Column Stats", uint64(result))
}
Expand Down Expand Up @@ -517,7 +517,7 @@ func (coll *HistColl) GetRowCountByIndexRanges(sctx sessionctx.Context, idxID in
if idx.CMSketch != nil && idx.StatsVer == Version1 {
result, err = coll.getIndexRowCount(sctx, idxID, indexRanges)
} else {
result, err = idx.GetRowCount(sctx, coll, indexRanges, coll.Count)
result, err = idx.GetRowCount(sctx, coll, indexRanges, coll.Count, coll.ModifyCount)
}
if sc.EnableOptimizerCETrace {
CETraceRange(sctx, coll.PhysicalID, colNames, indexRanges, "Index Stats", uint64(result))
Expand Down Expand Up @@ -708,7 +708,7 @@ func (coll *HistColl) crossValidationSelectivity(sctx sessionctx.Context, idx *I
Collators: []collate.Collator{idxPointRange.Collators[i]},
}

rowCount, err := col.GetColumnRowCount(sctx, []*ranger.Range{&rang}, coll.Count, col.IsHandle)
rowCount, err := col.GetColumnRowCount(sctx, []*ranger.Range{&rang}, coll.Count, coll.ModifyCount, col.IsHandle)
if err != nil {
return 0, 0, err
}
Expand Down Expand Up @@ -780,7 +780,7 @@ func (coll *HistColl) getIndexRowCount(sctx sessionctx.Context, idxID int64, ind
// on single-column index, use previous way as well, because CMSketch does not contain null
// values in this case.
if rangePosition == 0 || isSingleColIdxNullRange(idx, ran) {
count, err := idx.GetRowCount(sctx, nil, []*ranger.Range{ran}, coll.Count)
count, err := idx.GetRowCount(sctx, nil, []*ranger.Range{ran}, coll.Count, coll.ModifyCount)
if err != nil {
return 0, errors.Trace(err)
}
Expand Down
36 changes: 36 additions & 0 deletions statistics/testdata/stats_suite_in.json
Original file line number Diff line number Diff line change
Expand Up @@ -259,5 +259,41 @@
"End": 0
}
]
},
{
"name": "TestOutOfRangeEstimationAfterDelete",
"cases": [
"explain format = 'brief' select * from t where a <= 300",
"explain format = 'brief' select * from t where a < 300",
"explain format = 'brief' select * from t where a <= 500",
"explain format = 'brief' select * from t where a >= 300 and a <= 900",
"explain format = 'brief' select * from t where a >= 900",
"explain format = 'brief' select * from t where a > 900",
"explain format = 'brief' select * from t where a >= 300",
"explain format = 'brief' select * from t where a <= 900",
"explain format = 'brief' select * from t where a > 800 and a < 1000",
"explain format = 'brief' select * from t where a > 900 and a < 1000",
"explain format = 'brief' select * from t where a > 900 and a < 1100",
"explain format = 'brief' select * from t where a > 200 and a < 300",
"explain format = 'brief' select * from t where a > 100 and a < 300"
]
},
{
"name": "TestGlobalStatsOutOfRangeEstimationAfterDelete",
"cases": [
"explain format = 'brief' select * from t where a <= 300",
"explain format = 'brief' select * from t where a < 300",
"explain format = 'brief' select * from t where a <= 500",
"explain format = 'brief' select * from t where a >= 300 and a <= 900",
"explain format = 'brief' select * from t where a >= 900",
"explain format = 'brief' select * from t where a > 900",
"explain format = 'brief' select * from t where a >= 300",
"explain format = 'brief' select * from t where a <= 900",
"explain format = 'brief' select * from t where a > 800 and a < 1000",
"explain format = 'brief' select * from t where a > 900 and a < 1000",
"explain format = 'brief' select * from t where a > 900 and a < 1100",
"explain format = 'brief' select * from t where a > 200 and a < 300",
"explain format = 'brief' select * from t where a > 100 and a < 300"
]
}
]
Loading

0 comments on commit 99b2d4c

Please sign in to comment.