statistics: fix estimation error when ranges are too many and modify count is large (#40472)

* fix
* fmt
* bazel update
* update test result

Co-authored-by: Weizhen Wang <wangweizhen@pingcap.com>
Co-authored-by: Ti Chi Robot <ti-community-prow-bot@tidb.io>
pingcap · Jan 11, 2023 · 2f13578 · 2f13578
1 parent 2cf328b
commit 2f13578
Show file tree

Hide file tree

Showing 3 changed files with 90 additions and 5 deletions.
diff --git a/statistics/BUILD.bazel b/statistics/BUILD.bazel
@@ -112,6 +112,7 @@ go_test(
         "@com_github_pingcap_failpoint//:failpoint",
         "@com_github_pingcap_log//:log",
         "@com_github_stretchr_testify//require",
+        "@org_golang_x_exp//slices",
         "@org_uber_go_goleak//:goleak",
         "@org_uber_go_zap//:zap",
     ],

diff --git a/statistics/index.go b/statistics/index.go
@@ -222,6 +222,7 @@ func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRang
 	totalCount := float64(0)
 	isSingleCol := len(idx.Info.Columns) == 1
 	for _, indexRange := range indexRanges {
+		var count float64
 		lb, err := codec.EncodeKey(sc, nil, indexRange.LowVal...)
 		if err != nil {
 			return 0, err
@@ -242,7 +243,7 @@ func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRang
 					totalCount++
 					continue
 				}
-				count := idx.equalRowCount(lb, realtimeRowCount)
+				count = idx.equalRowCount(lb, realtimeRowCount)
 				// If the current table row count has changed, we should scale the row count accordingly.
 				count *= idx.GetIncreaseFactor(realtimeRowCount)
 				totalCount += count
@@ -262,7 +263,7 @@ func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRang
 		r := types.NewBytesDatum(rb)
 		lowIsNull := bytes.Equal(lb, nullKeyBytes)
 		if isSingleCol && lowIsNull {
-			totalCount += float64(idx.Histogram.NullCount)
+			count += float64(idx.Histogram.NullCount)
 		}
 		expBackoffSuccess := false
 		// Due to the limitation of calcFraction and convertDatumToScalar, the histogram actually won't estimate anything.
@@ -301,16 +302,17 @@ func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRang
 			}
 		}
 		if !expBackoffSuccess {
-			totalCount += idx.BetweenRowCount(l, r)
+			count += idx.BetweenRowCount(l, r)
 		}
 
 		// If the current table row count has changed, we should scale the row count accordingly.
-		totalCount *= idx.GetIncreaseFactor(realtimeRowCount)
+		count *= idx.GetIncreaseFactor(realtimeRowCount)
 
 		// handling the out-of-range part
 		if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) {
 			totalCount += idx.Histogram.outOfRangeRowCount(&l, &r, modifyCount)
 		}
+		totalCount += count
 	}
 	totalCount = mathutil.Clamp(totalCount, 0, float64(realtimeRowCount))
 	return totalCount, nil

diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go
@@ -44,6 +44,7 @@ import (
 	"github.com/pingcap/tidb/util/mock"
 	"github.com/pingcap/tidb/util/ranger"
 	"github.com/stretchr/testify/require"
+	"golang.org/x/exp/slices"
 )
 
 func TestCollationColumnEstimate(t *testing.T) {
@@ -891,7 +892,7 @@ func prepareSelectivity(testKit *testkit.TestKit, dom *domain.Domain) (*statisti
 	return statsTbl, nil
 }
 
-func getRange(start, end int64) []*ranger.Range {
+func getRange(start, end int64) ranger.Ranges {
 	ran := &ranger.Range{
 		LowVal:    []types.Datum{types.NewIntDatum(start)},
 		HighVal:   []types.Datum{types.NewIntDatum(end)},
@@ -900,6 +901,21 @@ func getRange(start, end int64) []*ranger.Range {
 	return []*ranger.Range{ran}
 }
 
+// getRanges builds one range per index i with LowVal start[i] and HighVal end[i]; it returns nil if the lengths differ.
+func getRanges(start, end []int64) (res ranger.Ranges) {
+	if len(start) != len(end) {
+		return nil
+	}
+	for i, lo := range start {
+		res = append(res, &ranger.Range{
+			LowVal:    []types.Datum{types.NewIntDatum(lo)},
+			HighVal:   []types.Datum{types.NewIntDatum(end[i])},
+			Collators: collate.GetBinaryCollatorSlice(1),
+		})
+	}
+	return res
+}
+
 func TestSelectivityGreedyAlgo(t *testing.T) {
 	nodes := make([]*statistics.StatsNode, 3)
 	nodes[0] = statistics.MockStatsNode(1, 3, 2)
@@ -1075,3 +1091,69 @@ func TestGlobalStatsOutOfRangeEstimationAfterDelete(t *testing.T) {
 		testKit.MustQuery(input[i]).Check(testkit.Rows(output[i].Result...))
 	}
 }
+
+// generateMapsForMockStatsTbl fills statsTbl.Idx2ColumnIDs (index ID -> column offsets of the index) and
+// statsTbl.ColID2IdxIDs (offset of an index's first column -> IDs of the indexes starting with it) from statsTbl.Indices.
+func generateMapsForMockStatsTbl(statsTbl *statistics.Table) {
+	colsByIdx, idxsByFirstCol := make(map[int64][]int64), make(map[int64][]int64)
+	for _, idx := range statsTbl.Indices {
+		offsets := make([]int64, len(idx.Info.Columns))
+		for i, col := range idx.Info.Columns {
+			offsets[i] = int64(col.Offset)
+		}
+		colsByIdx[idx.ID] = offsets
+		idxsByFirstCol[offsets[0]] = append(idxsByFirstCol[offsets[0]], idx.ID)
+	}
+	for firstCol := range idxsByFirstCol {
+		slices.Sort(idxsByFirstCol[firstCol]) // keep each ID list in ascending order
+	}
+	statsTbl.Idx2ColumnIDs, statsTbl.ColID2IdxIDs = colsByIdx, idxsByFirstCol
+}
+
+// TestIssue39593 checks GetRowCountByIndexRanges over 20 single-value ranges on index idx(a, b): it expects
+// 360 with the initial mock statistics and 3600 after the mock statistics table's Count is multiplied by 10.
+func TestIssue39593(t *testing.T) {
+	store, dom := testkit.CreateMockStoreAndDomain(t)
+	tk := testkit.NewTestKit(t, store)
+
+	tk.MustExec("use test")
+	tk.MustExec("drop table if exists t")
+	tk.MustExec("create table t(a int, b int, index idx(a, b))")
+	tbl, err := dom.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
+	require.NoError(t, err)
+	meta := tbl.Meta()
+
+	// mock the statistics.Table: histograms for both columns and for the index
+	mockTbl := mockStatsTable(meta, 540)
+	colDatums, err := generateIntDatum(1, 54)
+	require.NoError(t, err)
+	for colID := int64(1); colID <= 2; colID++ {
+		mockTbl.Columns[colID] = &statistics.Column{
+			Histogram:         *mockStatsHistogram(colID, colDatums, 10, types.NewFieldType(mysql.TypeLonglong)),
+			Info:              meta.Columns[colID-1],
+			StatsLoadedStatus: statistics.NewStatsFullLoadStatus(),
+			StatsVer:          2,
+		}
+	}
+	idxDatums, err := generateIntDatum(2, 3)
+	require.NoError(t, err)
+	mockTbl.Indices[1] = &statistics.Index{
+		Histogram: *mockStatsHistogram(1, idxDatums, 60, types.NewFieldType(mysql.TypeBlob)),
+		Info:      meta.Indices[0],
+		StatsVer:  2,
+	}
+	generateMapsForMockStatsTbl(mockTbl)
+
+	sctx, idxID := tk.Session(), meta.Indices[0].ID
+	points := []int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}
+	estimate := func() float64 {
+		got, err := mockTbl.GetRowCountByIndexRanges(sctx, idxID, getRanges(points, points))
+		require.NoError(t, err)
+		return got
+	}
+	// estimated row count without any changes
+	require.Equal(t, float64(360), estimate())
+	// estimated row count after multiplying the mock table's Count by 10
+	mockTbl.Count *= 10
+	require.Equal(t, float64(3600), estimate())
+}