forked from pingcap/tidb
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
statistics: introduce selectivity calculation (pingcap#3161)
- Loading branch information
1 parent
4023bb8
commit 8072082
Showing
4 changed files
with
369 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
// Copyright 2017 PingCAP, Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package statistics | ||
|
||
import ( | ||
"math" | ||
|
||
"github.com/juju/errors" | ||
"github.com/pingcap/tidb/context" | ||
"github.com/pingcap/tidb/expression" | ||
"github.com/pingcap/tidb/mysql" | ||
"github.com/pingcap/tidb/sessionctx/variable" | ||
"github.com/pingcap/tidb/util/ranger" | ||
"github.com/pingcap/tidb/util/types" | ||
) | ||
|
||
// selectionFactor is the fallback selectivity used when statistics cannot give an answer. | ||
// Selectivity returns it directly for pseudo tables or when there are more than 63 expressions, | ||
// and multiplies it in once when some expressions are not covered by any column/index set. | ||
const selectionFactor = 0.8 | ||
|
||
// exprSet is used for calculating selectivity. | ||
// Each exprSet ties one column or index to the expressions it can cover. | ||
type exprSet struct { | ||
// tp is one of indexType, pkType or colType. | ||
tp int | ||
// ID is the ID of the column or index this set was built from. | ||
ID int64 | ||
// The ith bit of `mask` will tell whether the ith expression is covered by this index/column. | ||
mask int64 | ||
// This stores ranges we get. | ||
// They are produced by getMaskAndRanges for this column/index. | ||
ranges []types.Range | ||
} | ||
|
||
// The type of the exprSet. | ||
// The order matters: getUsableSetsByGreedy prefers any set whose type is | ||
// smaller than colType (i.e. indexType or pkType) over a plain column set. | ||
const ( | ||
indexType = iota | ||
pkType | ||
colType | ||
) | ||
|
||
// Selectivity is a function calculate the selectivity of the expressions. | ||
// The definition of selectivity is (row count after filter / row count before filter). | ||
// And exprs must be CNF now, in other words, `exprs[0] and exprs[1] and ... and exprs[len - 1]` should be held when you call this. | ||
// TODO: support expressions that the top layer is a DNF. | ||
// Currently the time complexity is o(n^2). | ||
func (t *Table) Selectivity(ctx context.Context, exprs []expression.Expression) (float64, error) { | ||
if t.Count == 0 { | ||
return 0, nil | ||
} | ||
// TODO: If len(exprs) is bigger than 63, we could use bitset structure to replace the int64. | ||
// This will simplify some code and speed up if we use this rather than a boolean slice. | ||
if t.Pseudo || len(exprs) > 63 { | ||
return selectionFactor, nil | ||
} | ||
var sets []*exprSet | ||
sc := ctx.GetSessionVars().StmtCtx | ||
extractedCols := expression.ExtractColumns(expression.ComposeCNFCondition(ctx, exprs...)) | ||
for _, colInfo := range t.Columns { | ||
col := expression.ColInfo2Col(extractedCols, colInfo.Info) | ||
// This column should have histogram. | ||
if col != nil && len(colInfo.Histogram.Buckets) > 0 { | ||
maskCovered, ranges, err := getMaskAndRanges(sc, exprs, ranger.ColumnRangeType, nil, col) | ||
if err != nil { | ||
return 0, errors.Trace(err) | ||
} | ||
sets = append(sets, &exprSet{tp: colType, ID: col.ID, mask: maskCovered, ranges: ranges}) | ||
if mysql.HasPriKeyFlag(colInfo.Info.Flag) { | ||
sets[len(sets)-1].tp = pkType | ||
} | ||
} | ||
} | ||
for _, idxInfo := range t.Indices { | ||
idxCols, lengths := expression.IndexInfo2Cols(extractedCols, idxInfo.Info) | ||
// This index should have histogram. | ||
if len(idxCols) > 0 && len(idxInfo.Histogram.Buckets) > 0 { | ||
maskCovered, ranges, err := getMaskAndRanges(sc, exprs, ranger.IndexRangeType, lengths, idxCols...) | ||
if err != nil { | ||
return 0, errors.Trace(err) | ||
} | ||
sets = append(sets, &exprSet{tp: indexType, ID: idxInfo.ID, mask: maskCovered, ranges: ranges}) | ||
} | ||
} | ||
sets = getUsableSetsByGreedy(sets) | ||
ret := 1.0 | ||
// Initialize the mask with the full set. | ||
mask := (int64(1) << uint(len(exprs))) - 1 | ||
for _, set := range sets { | ||
mask ^= set.mask | ||
var ( | ||
rowCount float64 | ||
err error | ||
) | ||
switch set.tp { | ||
case pkType, colType: | ||
ranges := ranger.Ranges2ColumnRanges(set.ranges) | ||
rowCount, err = t.GetRowCountByColumnRanges(sc, set.ID, ranges) | ||
case indexType: | ||
ranges := ranger.Ranges2IndexRanges(set.ranges) | ||
rowCount, err = t.GetRowCountByIndexRanges(sc, set.ID, ranges) | ||
} | ||
if err != nil { | ||
return 0, errors.Trace(err) | ||
} | ||
ret *= rowCount / float64(t.Count) | ||
} | ||
// If there's still conditions which cannot be calculated, we will multiply a selectionFactor. | ||
if mask > 0 { | ||
ret *= selectionFactor | ||
} | ||
return ret, nil | ||
} | ||
|
||
func getMaskAndRanges(sc *variable.StatementContext, exprs []expression.Expression, rangeType int, | ||
lengths []int, cols ...*expression.Column) (int64, []types.Range, error) { | ||
exprsClone := make([]expression.Expression, 0, len(exprs)) | ||
for _, expr := range exprs { | ||
exprsClone = append(exprsClone, expr.Clone()) | ||
} | ||
ranges, accessConds, _, err := ranger.BuildRange(sc, exprsClone, rangeType, cols, lengths) | ||
if err != nil { | ||
return 0, nil, errors.Trace(err) | ||
} | ||
mask := int64(0) | ||
for i := range exprs { | ||
for j := range accessConds { | ||
if exprs[i].Equal(accessConds[j], nil) { | ||
mask |= 1 << uint64(i) | ||
break | ||
} | ||
} | ||
} | ||
return mask, ranges, nil | ||
} | ||
|
||
// getUsableSetsByGreedy will select the indices and pk used for calculate selectivity by greedy algorithm. | ||
func getUsableSetsByGreedy(sets []*exprSet) (newBlocks []*exprSet) { | ||
mask := int64(math.MaxInt64) | ||
for { | ||
// Choose the index that covers most. | ||
bestID := -1 | ||
bestCount := 0 | ||
bestID, bestCount, bestTp := -1, 0, colType | ||
for i, set := range sets { | ||
set.mask &= mask | ||
bits := popCount(set.mask) | ||
if (bestTp == colType && set.tp < colType) || bestCount < bits { | ||
bestID, bestCount, bestTp = i, bits, set.tp | ||
} | ||
} | ||
if bestCount == 0 { | ||
break | ||
} else { | ||
// update the mask, remove the bit that sets[bestID].mask has. | ||
mask &^= sets[bestID].mask | ||
|
||
newBlocks = append(newBlocks, sets[bestID]) | ||
// remove the chosen one | ||
sets = append(sets[:bestID], sets[bestID+1:]...) | ||
} | ||
} | ||
return | ||
} | ||
|
||
// popCount returns the number of set bits in the two's-complement binary
// representation of x. It is correct for negative values as well.
func popCount(x int64) int {
	ret := 0
	// x &= x - 1 clears the lowest set bit of x,
	// e.g. the result will be 2 if x is 3.
	for ; x != 0; x &= x - 1 {
		ret++
	}
	return ret
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
// Copyright 2017 PingCAP, Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package statistics_test | ||
|
||
import ( | ||
"math" | ||
|
||
. "github.com/pingcap/check" | ||
"github.com/pingcap/tidb" | ||
"github.com/pingcap/tidb/context" | ||
"github.com/pingcap/tidb/model" | ||
"github.com/pingcap/tidb/plan" | ||
"github.com/pingcap/tidb/statistics" | ||
"github.com/pingcap/tidb/util/codec" | ||
"github.com/pingcap/tidb/util/testkit" | ||
"github.com/pingcap/tidb/util/testleak" | ||
"github.com/pingcap/tidb/util/types" | ||
) | ||
|
||
// eps is the maximum absolute difference tolerated between a computed and an expected selectivity. | ||
const eps = 1e-9 | ||
|
||
// Suite comes from the dot-imported check package; presumably this registers the suite with the test runner. | ||
var _ = Suite(&testSelectivitySuite{}) | ||
|
||
// testSelectivitySuite groups the selectivity tests and their helpers; it has no fields. | ||
type testSelectivitySuite struct { | ||
} | ||
|
||
// generateIntDatum will generate a datum slice, every dimension is begin from 0, end with num - 1. | ||
// If dimension is x, num is y, the total number of datum is y^x. And This slice is sorted. | ||
func (s *testSelectivitySuite) generateIntDatum(dimension, num int) ([]types.Datum, error) { | ||
len := int(math.Pow(float64(num), float64(dimension))) | ||
ret := make([]types.Datum, len) | ||
if dimension == 1 { | ||
for i := 0; i < num; i++ { | ||
ret[i] = types.NewIntDatum(int64(i)) | ||
} | ||
} else { | ||
// In this way, we can guarantee the datum is in order. | ||
for i := 0; i < len; i++ { | ||
data := make([]types.Datum, dimension) | ||
j := i | ||
for k := 0; k < dimension; k++ { | ||
data[dimension-k-1].SetInt64(int64(j % num)) | ||
j = j / num | ||
} | ||
bytes, err := codec.EncodeKey(nil, data...) | ||
if err != nil { | ||
return nil, err | ||
} | ||
ret[i].SetBytes(bytes) | ||
} | ||
} | ||
return ret, nil | ||
} | ||
|
||
// mockStatsHistogram will create a statistics.Histogram, of which the data is uniform distribution. | ||
func mockStatsHistogram(id int64, values []types.Datum, repeat int64) *statistics.Histogram { | ||
ndv := len(values) | ||
histogram := &statistics.Histogram{ | ||
ID: id, | ||
NDV: int64(ndv), | ||
Buckets: make([]statistics.Bucket, ndv), | ||
} | ||
for i := 0; i < ndv; i++ { | ||
histogram.Buckets[i].Repeats = repeat | ||
histogram.Buckets[i].Count = repeat * int64(i+1) | ||
histogram.Buckets[i].UpperBound = values[i] | ||
} | ||
return histogram | ||
} | ||
|
||
func mockStatsTable(tbl *model.TableInfo, rowCount int64) *statistics.Table { | ||
statsTbl := &statistics.Table{ | ||
TableID: tbl.ID, | ||
Count: rowCount, | ||
Columns: make(map[int64]*statistics.Column, len(tbl.Columns)), | ||
Indices: make(map[int64]*statistics.Index, len(tbl.Indices)), | ||
} | ||
return statsTbl | ||
} | ||
|
||
func (s *testSelectivitySuite) TestSelectivity(c *C) { | ||
defer testleak.AfterTest(c)() | ||
store, do, err := newStoreWithBootstrap() | ||
defer store.Close() | ||
c.Assert(err, IsNil) | ||
|
||
testKit := testkit.NewTestKit(c, store) | ||
testKit.MustExec("use test") | ||
testKit.MustExec("drop table if exists t") | ||
testKit.MustExec("create table t(a int primary key, b int, c int, d int, e int, index idx_cd(c, d), index idx_de(d, e))") | ||
|
||
is := do.InfoSchema() | ||
tb, err := is.TableByName(model.NewCIStr("test"), model.NewCIStr("t")) | ||
c.Assert(err, IsNil) | ||
tbl := tb.Meta() | ||
|
||
// mock the statistic table | ||
statsTbl := mockStatsTable(tbl, 540) | ||
|
||
// Set the value of columns' histogram. | ||
colValues, _ := s.generateIntDatum(1, 54) | ||
for i := 1; i <= 5; i++ { | ||
statsTbl.Columns[int64(i)] = &statistics.Column{Histogram: *mockStatsHistogram(int64(i), colValues, 10), Info: tbl.Columns[i-1]} | ||
} | ||
|
||
// Set the value of two indices' histograms. | ||
idxValues, err := s.generateIntDatum(2, 3) | ||
c.Assert(err, IsNil) | ||
statsTbl.Indices[1] = &statistics.Index{Histogram: *mockStatsHistogram(1, idxValues, 60), Info: tbl.Indices[0]} | ||
statsTbl.Indices[2] = &statistics.Index{Histogram: *mockStatsHistogram(2, idxValues, 60), Info: tbl.Indices[1]} | ||
|
||
tests := []struct { | ||
exprs string | ||
selectivity float64 | ||
}{ | ||
{ | ||
exprs: "a > 0 and a < 2", | ||
selectivity: 0.01851851851, | ||
}, | ||
{ | ||
exprs: "a >= 1 and a < 2", | ||
selectivity: 0.01851851851, | ||
}, | ||
{ | ||
exprs: "a >= 1 and b > 1 and a < 2", | ||
selectivity: 0.01783264746, | ||
}, | ||
{ | ||
exprs: "a >= 1 and c > 1 and a < 2", | ||
selectivity: 0.00617283950, | ||
}, | ||
{ | ||
exprs: "a >= 1 and c >= 1 and a < 2", | ||
selectivity: 0.01234567901, | ||
}, | ||
{ | ||
exprs: "d = 0 and e = 1", | ||
selectivity: 0.11111111111, | ||
}, | ||
{ | ||
exprs: "b > 1", | ||
selectivity: 0.96296296296, | ||
}, | ||
{ | ||
exprs: "a > 1 and b < 2 and c > 3 and d < 4 and e > 5", | ||
selectivity: 0.00123287439, | ||
}, | ||
} | ||
for _, tt := range tests { | ||
sql := "select * from t where " + tt.exprs | ||
comment := Commentf("for %s", tt.exprs) | ||
ctx := testKit.Se.(context.Context) | ||
stmts, err := tidb.Parse(ctx, sql) | ||
c.Assert(err, IsNil, Commentf("error %v, for expr %s", err, tt.exprs)) | ||
c.Assert(stmts, HasLen, 1) | ||
err = plan.ResolveName(stmts[0], is, ctx) | ||
|
||
p, err := plan.BuildLogicalPlan(ctx, stmts[0], is) | ||
c.Assert(err, IsNil, Commentf("error %v, for building plan, expr %s", err, tt.exprs)) | ||
var sel *plan.Selection | ||
for _, child := range p.Children() { | ||
p, ok := child.(*plan.Selection) | ||
if ok { | ||
sel = p | ||
break | ||
} | ||
} | ||
c.Assert(sel, NotNil, comment) | ||
ratio, err := statsTbl.Selectivity(ctx, sel.Conditions) | ||
c.Assert(err, IsNil, comment) | ||
c.Assert(math.Abs(ratio-tt.selectivity) < eps, IsTrue, comment) | ||
} | ||
} |
Oops, something went wrong.