statistics: introduce selectivity calculation (pingcap#3161)

blacktear23 · Jul 4, 2017 · 8072082 · 8072082
1 parent 4023bb8
commit 8072082
Show file tree

Hide file tree

Showing 4 changed files with 369 additions and 5 deletions.
diff --git a/statistics/histogram.go b/statistics/histogram.go
@@ -337,7 +337,7 @@ func (hg *Histogram) getIncreaseFactor(totalCount int64) float64 {
 // Column represents a column histogram.
 type Column struct {
 	Histogram
-	info *model.ColumnInfo
+	Info *model.ColumnInfo
 }
 
 func (c *Column) String() string {
@@ -429,7 +429,7 @@ func (c *Column) getColumnRowCount(sc *variable.StatementContext, ranges []*type
 // Index represents an index histogram.
 type Index struct {
 	Histogram
-	info *model.IndexInfo
+	Info *model.IndexInfo
 }
 
 func (idx *Index) String() string {
@@ -439,7 +439,7 @@ func (idx *Index) String() string {
 func (idx *Index) getRowCount(sc *variable.StatementContext, indexRanges []*types.IndexRange) (float64, error) {
 	totalCount := float64(0)
 	for _, indexRange := range indexRanges {
-		indexRange.Align(len(idx.info.Columns))
+		indexRange.Align(len(idx.Info.Columns))
 		lb, err := codec.EncodeKey(nil, indexRange.LowVal...)
 		if err != nil {
 			return 0, errors.Trace(err)

diff --git a/statistics/selectivity.go b/statistics/selectivity.go
@@ -0,0 +1,180 @@
+// Copyright 2017 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package statistics
+
+import (
+	"math"
+
+	"github.com/juju/errors"
+	"github.com/pingcap/tidb/context"
+	"github.com/pingcap/tidb/expression"
+	"github.com/pingcap/tidb/mysql"
+	"github.com/pingcap/tidb/sessionctx/variable"
+	"github.com/pingcap/tidb/util/ranger"
+	"github.com/pingcap/tidb/util/types"
+)
+
+// selectionFactor is the selectivity assumed when statistics cannot give an estimate.
+// Selectivity returns it directly for pseudo tables or when there are more than 63 expressions,
+// and multiplies the result by it once when some expressions are covered by no chosen column or index.
+const selectionFactor = 0.8
+
+// exprSet is used for calculating selectivity. It describes one statistics source
+// (an index, the primary key column or an ordinary column) together with the
+// expressions that source covers.
+type exprSet struct {
+	// tp is the kind of the source: indexType, pkType or colType.
+	tp int
+	// ID is the ID of the column or index; Selectivity passes it to the row count lookups.
+	ID int64
+	// The ith bit of `mask` will tell whether the ith expression is covered by this index/column.
+	mask int64
+	// ranges stores the ranges that ranger.BuildRange returned for this column/index.
+	ranges []types.Range
+}
+
+// The type of the exprSet.
+// The order matters: getUsableSetsByGreedy compares `set.tp < colType` to prefer
+// indices and the primary key over ordinary columns, so colType must stay the largest value.
+const (
+	indexType = iota
+	pkType
+	colType
+)
+
+// Selectivity estimates the selectivity of the expressions, which is defined as
+// (row count after filter / row count before filter).
+// exprs must be a CNF for now, in other words the condition being estimated is
+// `exprs[0] and exprs[1] and ... and exprs[len - 1]`.
+// It returns 0 for an empty table and selectionFactor for pseudo statistics or more than 63 expressions.
+// TODO: support expressions that the top layer is a DNF.
+// Currently the time complexity is O(n^2).
+func (t *Table) Selectivity(ctx context.Context, exprs []expression.Expression) (float64, error) {
+	if t.Count == 0 {
+		return 0, nil
+	}
+	// TODO: If len(exprs) is bigger than 63, we could use bitset structure to replace the int64.
+	// This will simplify some code and speed up if we use this rather than a boolean slice.
+	if t.Pseudo || len(exprs) > 63 {
+		return selectionFactor, nil
+	}
+	stmtCtx := ctx.GetSessionVars().StmtCtx
+	usedCols := expression.ExtractColumns(expression.ComposeCNFCondition(ctx, exprs...))
+	var candidates []*exprSet
+	for _, statsCol := range t.Columns {
+		col := expression.ColInfo2Col(usedCols, statsCol.Info)
+		// Skip columns that are not referenced by the expressions or that have no histogram.
+		if col == nil || len(statsCol.Histogram.Buckets) == 0 {
+			continue
+		}
+		covered, ranges, err := getMaskAndRanges(stmtCtx, exprs, ranger.ColumnRangeType, nil, col)
+		if err != nil {
+			return 0, errors.Trace(err)
+		}
+		tp := colType
+		if mysql.HasPriKeyFlag(statsCol.Info.Flag) {
+			tp = pkType
+		}
+		candidates = append(candidates, &exprSet{tp: tp, ID: col.ID, mask: covered, ranges: ranges})
+	}
+	for _, statsIdx := range t.Indices {
+		idxCols, lengths := expression.IndexInfo2Cols(usedCols, statsIdx.Info)
+		// Skip indices for which no column was found or that have no histogram.
+		if len(idxCols) == 0 || len(statsIdx.Histogram.Buckets) == 0 {
+			continue
+		}
+		covered, ranges, err := getMaskAndRanges(stmtCtx, exprs, ranger.IndexRangeType, lengths, idxCols...)
+		if err != nil {
+			return 0, errors.Trace(err)
+		}
+		candidates = append(candidates, &exprSet{tp: indexType, ID: statsIdx.ID, mask: covered, ranges: ranges})
+	}
+	selectivity := 1.0
+	// Start with every expression marked as not yet covered.
+	uncovered := (int64(1) << uint(len(exprs))) - 1
+	for _, set := range getUsableSetsByGreedy(candidates) {
+		uncovered ^= set.mask
+		var (
+			rowCount float64
+			err      error
+		)
+		switch set.tp {
+		case indexType:
+			rowCount, err = t.GetRowCountByIndexRanges(stmtCtx, set.ID, ranger.Ranges2IndexRanges(set.ranges))
+		case pkType, colType:
+			rowCount, err = t.GetRowCountByColumnRanges(stmtCtx, set.ID, ranger.Ranges2ColumnRanges(set.ranges))
+		}
+		if err != nil {
+			return 0, errors.Trace(err)
+		}
+		selectivity *= rowCount / float64(t.Count)
+	}
+	// If some conditions are still not covered by any chosen set, multiply by selectionFactor once.
+	if uncovered > 0 {
+		selectivity *= selectionFactor
+	}
+	return selectivity, nil
+}
+
+// getMaskAndRanges builds the ranges of the given columns from exprs and reports which
+// expressions were used for them: bit i of the returned mask is set when exprs[i] equals
+// one of the access conditions returned by ranger.BuildRange.
+// BuildRange is given clones of the expressions rather than exprs itself.
+func getMaskAndRanges(sc *variable.StatementContext, exprs []expression.Expression, rangeType int,
+	lengths []int, cols ...*expression.Column) (int64, []types.Range, error) {
+	cloned := make([]expression.Expression, len(exprs))
+	for i, expr := range exprs {
+		cloned[i] = expr.Clone()
+	}
+	ranges, accessConds, _, err := ranger.BuildRange(sc, cloned, rangeType, cols, lengths)
+	if err != nil {
+		return 0, nil, errors.Trace(err)
+	}
+	var mask int64
+	for i, expr := range exprs {
+		for _, cond := range accessConds {
+			if expr.Equal(cond, nil) {
+				mask |= int64(1) << uint(i)
+				break
+			}
+		}
+	}
+	return mask, ranges, nil
+}
+
+// getUsableSetsByGreedy selects, by a greedy algorithm, the indices, pk and columns used for
+// calculating selectivity.
+// In every round one of the remaining sets is chosen: a set replaces the current candidate when
+// the candidate is column-typed (or absent) and the set is an index or pk, or when the set covers
+// strictly more still-uncovered expressions. The selection stops once no remaining set covers
+// anything new.
+// The masks of the given sets are narrowed in place to the expressions that were still uncovered
+// when the set was last examined, so the masks of the returned sets do not overlap.
+func getUsableSetsByGreedy(sets []*exprSet) (newBlocks []*exprSet) {
+	mask := int64(math.MaxInt64)
+	for {
+		// Choose the set that covers most.
+		bestID, bestCount, bestTp := -1, 0, colType
+		for i, set := range sets {
+			set.mask &= mask
+			bits := popCount(set.mask)
+			// A set that covers nothing new must never be picked: otherwise the type
+			// preference below could replace a useful column with an empty index/pk,
+			// leaving bestCount at 0 and ending the selection too early.
+			if bits == 0 {
+				continue
+			}
+			if (bestTp == colType && set.tp < colType) || bestCount < bits {
+				bestID, bestCount, bestTp = i, bits, set.tp
+			}
+		}
+		if bestCount == 0 {
+			return newBlocks
+		}
+		// Update the mask, remove the bits that sets[bestID].mask has.
+		mask &^= sets[bestID].mask
+		newBlocks = append(newBlocks, sets[bestID])
+		// Remove the chosen one.
+		sets = append(sets[:bestID], sets[bestID+1:]...)
+	}
+}
+
+// popCount returns the number of bits set in the binary (two's complement) representation of x.
+func popCount(x int64) int {
+	ret := 0
+	// x &= x - 1 clears the lowest set bit of x, e.g. 3 (0b11) becomes 2 (0b10).
+	// Looping until x is zero, rather than while x is positive, also counts correctly
+	// when the sign bit is set.
+	for ; x != 0; x &= x - 1 {
+		ret++
+	}
+	return ret
+}
diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go
@@ -0,0 +1,184 @@
+// Copyright 2017 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package statistics_test
+
+import (
+	"math"
+
+	. "github.com/pingcap/check"
+	"github.com/pingcap/tidb"
+	"github.com/pingcap/tidb/context"
+	"github.com/pingcap/tidb/model"
+	"github.com/pingcap/tidb/plan"
+	"github.com/pingcap/tidb/statistics"
+	"github.com/pingcap/tidb/util/codec"
+	"github.com/pingcap/tidb/util/testkit"
+	"github.com/pingcap/tidb/util/testleak"
+	"github.com/pingcap/tidb/util/types"
+)
+
+// eps is the maximum absolute difference tolerated between an expected and a computed selectivity.
+const eps = 1e-9
+
+// Register the suite with the check framework.
+var _ = Suite(&testSelectivitySuite{})
+
+// testSelectivitySuite groups the selectivity tests and their helpers.
+type testSelectivitySuite struct {
+}
+
+// generateIntDatum generates a datum slice in which every dimension begins at 0 and ends at num - 1.
+// With dimension x and num y the slice holds y^x datums, generated in ascending order.
+// For a single dimension the datums are plain ints; otherwise every datum holds the bytes of
+// the codec-encoded key of one combination.
+func (s *testSelectivitySuite) generateIntDatum(dimension, num int) ([]types.Datum, error) {
+	total := int(math.Pow(float64(num), float64(dimension)))
+	datums := make([]types.Datum, total)
+	if dimension == 1 {
+		for i := range datums {
+			datums[i] = types.NewIntDatum(int64(i))
+		}
+		return datums, nil
+	}
+	// Treat i as a base-num number whose most significant digit is the first dimension;
+	// in this way, we can guarantee the datum is in order.
+	for i := range datums {
+		key := make([]types.Datum, dimension)
+		rest := i
+		for k := dimension - 1; k >= 0; k-- {
+			key[k].SetInt64(int64(rest % num))
+			rest /= num
+		}
+		encoded, err := codec.EncodeKey(nil, key...)
+		if err != nil {
+			return nil, err
+		}
+		datums[i].SetBytes(encoded)
+	}
+	return datums, nil
+}
+
+// mockStatsHistogram creates a statistics.Histogram whose data is uniformly distributed:
+// there is one bucket per value, each value is the upper bound of its bucket and repeats
+// `repeat` times, so the count of the ith bucket (0-based) is repeat * (i + 1).
+func mockStatsHistogram(id int64, values []types.Datum, repeat int64) *statistics.Histogram {
+	buckets := make([]statistics.Bucket, len(values))
+	for i, value := range values {
+		bucket := &buckets[i]
+		bucket.Repeats = repeat
+		bucket.Count = repeat * int64(i+1)
+		bucket.UpperBound = value
+	}
+	return &statistics.Histogram{
+		ID:      id,
+		NDV:     int64(len(values)),
+		Buckets: buckets,
+	}
+}
+
+// mockStatsTable creates a statistics.Table for tbl with the given row count and with empty
+// column and index maps, pre-sized for the columns and indices of tbl.
+func mockStatsTable(tbl *model.TableInfo, rowCount int64) *statistics.Table {
+	return &statistics.Table{
+		TableID: tbl.ID,
+		Count:   rowCount,
+		Columns: make(map[int64]*statistics.Column, len(tbl.Columns)),
+		Indices: make(map[int64]*statistics.Index, len(tbl.Indices)),
+	}
+}
+
+// TestSelectivity checks Table.Selectivity against mocked, uniformly distributed statistics of
+// the table t(a int primary key, b, c, d, e) with the indices idx_cd(c, d) and idx_de(d, e).
+func (s *testSelectivitySuite) TestSelectivity(c *C) {
+	defer testleak.AfterTest(c)()
+	store, do, err := newStoreWithBootstrap()
+	// Check the error before deferring Close, so that a failed bootstrap is reported as such
+	// instead of Close being called on a store that may not have been created.
+	c.Assert(err, IsNil)
+	defer store.Close()
+
+	testKit := testkit.NewTestKit(c, store)
+	testKit.MustExec("use test")
+	testKit.MustExec("drop table if exists t")
+	testKit.MustExec("create table t(a int primary key, b int, c int, d int, e int, index idx_cd(c, d), index idx_de(d, e))")
+
+	is := do.InfoSchema()
+	tb, err := is.TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
+	c.Assert(err, IsNil)
+	tbl := tb.Meta()
+
+	// mock the statistic table
+	statsTbl := mockStatsTable(tbl, 540)
+
+	// Set the value of columns' histogram: 54 distinct values, each repeated 10 times.
+	// NOTE(review): this assumes the columns a..e get the IDs 1..5 — confirm.
+	colValues, err := s.generateIntDatum(1, 54)
+	c.Assert(err, IsNil)
+	for i := 1; i <= 5; i++ {
+		statsTbl.Columns[int64(i)] = &statistics.Column{Histogram: *mockStatsHistogram(int64(i), colValues, 10), Info: tbl.Columns[i-1]}
+	}
+
+	// Set the value of two indices' histograms: 9 distinct keys, each repeated 60 times.
+	idxValues, err := s.generateIntDatum(2, 3)
+	c.Assert(err, IsNil)
+	statsTbl.Indices[1] = &statistics.Index{Histogram: *mockStatsHistogram(1, idxValues, 60), Info: tbl.Indices[0]}
+	statsTbl.Indices[2] = &statistics.Index{Histogram: *mockStatsHistogram(2, idxValues, 60), Info: tbl.Indices[1]}
+
+	tests := []struct {
+		exprs       string
+		selectivity float64
+	}{
+		{
+			exprs:       "a > 0 and a < 2",
+			selectivity: 0.01851851851,
+		},
+		{
+			exprs:       "a >= 1 and a < 2",
+			selectivity: 0.01851851851,
+		},
+		{
+			exprs:       "a >= 1 and b > 1 and a < 2",
+			selectivity: 0.01783264746,
+		},
+		{
+			exprs:       "a >= 1 and c > 1 and a < 2",
+			selectivity: 0.00617283950,
+		},
+		{
+			exprs:       "a >= 1 and c >= 1 and a < 2",
+			selectivity: 0.01234567901,
+		},
+		{
+			exprs:       "d = 0 and e = 1",
+			selectivity: 0.11111111111,
+		},
+		{
+			exprs:       "b > 1",
+			selectivity: 0.96296296296,
+		},
+		{
+			exprs:       "a > 1 and b < 2 and c > 3 and d < 4 and e > 5",
+			selectivity: 0.00123287439,
+		},
+	}
+	// The session context does not change between cases, so fetch it once.
+	ctx := testKit.Se.(context.Context)
+	for _, tt := range tests {
+		sql := "select * from t where " + tt.exprs
+		comment := Commentf("for %s", tt.exprs)
+		stmts, err := tidb.Parse(ctx, sql)
+		c.Assert(err, IsNil, Commentf("error %v, for expr %s", err, tt.exprs))
+		c.Assert(stmts, HasLen, 1)
+		// The error of name resolution must be checked before it is overwritten below.
+		err = plan.ResolveName(stmts[0], is, ctx)
+		c.Assert(err, IsNil, Commentf("error %v, for resolve name, expr %s", err, tt.exprs))
+
+		p, err := plan.BuildLogicalPlan(ctx, stmts[0], is)
+		c.Assert(err, IsNil, Commentf("error %v, for building plan, expr %s", err, tt.exprs))
+		// Find the selection among the children of the plan; its conditions are the filter to estimate.
+		var sel *plan.Selection
+		for _, child := range p.Children() {
+			if selection, ok := child.(*plan.Selection); ok {
+				sel = selection
+				break
+			}
+		}
+		c.Assert(sel, NotNil, comment)
+		ratio, err := statsTbl.Selectivity(ctx, sel.Conditions)
+		c.Assert(err, IsNil, comment)
+		c.Assert(math.Abs(ratio-tt.selectivity) < eps, IsTrue, comment)
+	}
+}