diff --git a/statistics/handle/update.go b/statistics/handle/update.go index ab5fae00c3dd8..d66e5afab79f5 100644 --- a/statistics/handle/update.go +++ b/statistics/handle/update.go @@ -1402,10 +1402,10 @@ func (h *Handle) RecalculateExpectCount(q *statistics.QueryFeedback) error { expected := 0.0 if isIndex { idx := t.Indices[id] - expected, err = idx.GetRowCount(sctx, nil, ranges, t.Count) + expected, err = idx.GetRowCount(sctx, nil, ranges, t.Count, t.ModifyCount) } else { c := t.Columns[id] - expected, err = c.GetColumnRowCount(sctx, ranges, t.Count, true) + expected, err = c.GetColumnRowCount(sctx, ranges, t.Count, t.ModifyCount, true) } q.Expected = int64(expected) return err diff --git a/statistics/histogram.go b/statistics/histogram.go index 59571fbfbcc63..79bd45b512f7c 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -876,7 +876,7 @@ func (hg *Histogram) outOfRange(val types.Datum) bool { // outOfRangeRowCount estimate the row count of part of [lDatum, rDatum] which is out of range of the histogram. // Here we assume the density of data is decreasing from the lower/upper bound of the histogram toward outside. -// The maximum row count it can get is the increaseCount. It reaches the maximum when out-of-range width reaches histogram range width. +// The maximum row count it can get is the modifyCount. It reaches the maximum when out-of-range width reaches histogram range width. // As it shows below. To calculate the out-of-range row count, we need to calculate the percentage of the shaded area. // Note that we assume histL-boundL == histR-histL == boundR-histR here. // @@ -892,7 +892,7 @@ func (hg *Histogram) outOfRange(val types.Datum) bool { // boundL │ │histL histR boundR // │ │ // lDatum rDatum -func (hg *Histogram) outOfRangeRowCount(lDatum, rDatum *types.Datum, increaseCount int64) float64 { +func (hg *Histogram) outOfRangeRowCount(lDatum, rDatum *types.Datum, modifyCount int64) float64 { if hg.Len() == 0 { return 0 } @@ -976,8 +976,8 @@ func (hg *Histogram) outOfRangeRowCount(lDatum, rDatum *types.Datum, increaseCou totalPercent = 1 } rowCount := totalPercent * hg.notNullCount() - if rowCount > float64(increaseCount) { - return float64(increaseCount) + if rowCount > float64(modifyCount) { + return float64(modifyCount) } return rowCount } @@ -1202,7 +1202,7 @@ func (c *Column) equalRowCount(sctx sessionctx.Context, val types.Datum, encoded } // GetColumnRowCount estimates the row count by a slice of Range. -func (c *Column) GetColumnRowCount(sctx sessionctx.Context, ranges []*ranger.Range, realtimeRowCount int64, pkIsHandle bool) (float64, error) { +func (c *Column) GetColumnRowCount(sctx sessionctx.Context, ranges []*ranger.Range, realtimeRowCount, modifyCount int64, pkIsHandle bool) (float64, error) { sc := sctx.GetSessionVars().StmtCtx var rowCount float64 for _, rg := range ranges { @@ -1299,11 +1299,7 @@ func (c *Column) GetColumnRowCount(sctx sessionctx.Context, ranges []*ranger.Ran // handling the out-of-range part if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) { - increaseCount := realtimeRowCount - int64(c.TotalRowCount()) - if increaseCount < 0 { - increaseCount = 0 - } - cnt += c.Histogram.outOfRangeRowCount(&lowVal, &highVal, increaseCount) + cnt += c.Histogram.outOfRangeRowCount(&lowVal, &highVal, modifyCount) } rowCount += cnt @@ -1426,7 +1422,7 @@ func (idx *Index) QueryBytes(d []byte) uint64 { // GetRowCount returns the row count of the given ranges. // It uses the modifyCount to adjust the influence of modifications on the table. -func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRanges []*ranger.Range, realtimeRowCount int64) (float64, error) { +func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRanges []*ranger.Range, realtimeRowCount, modifyCount int64) (float64, error) { sc := sctx.GetSessionVars().StmtCtx totalCount := float64(0) isSingleCol := len(idx.Info.Columns) == 1 @@ -1518,11 +1514,7 @@ func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRang // handling the out-of-range part if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) { - increaseCount := realtimeRowCount - int64(idx.TotalRowCount()) - if increaseCount < 0 { - increaseCount = 0 - } - totalCount += idx.Histogram.outOfRangeRowCount(&l, &r, increaseCount) + totalCount += idx.Histogram.outOfRangeRowCount(&l, &r, modifyCount) } } totalCount = mathutil.Clamp(totalCount, 0, float64(realtimeRowCount)) diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go index 71f6c5f26808d..d5f1ffe913a64 100644 --- a/statistics/selectivity_test.go +++ b/statistics/selectivity_test.go @@ -128,7 +128,7 @@ func TestOutOfRangeEstimation(t *testing.T) { statsTbl := h.GetTableStats(table.Meta()) sctx := mock.NewContext() col := statsTbl.Columns[table.Meta().Columns[0].ID] - count, err := col.GetColumnRowCount(sctx, getRange(900, 900), statsTbl.Count, false) + count, err := col.GetColumnRowCount(sctx, getRange(900, 900), statsTbl.Count, statsTbl.ModifyCount, false) require.NoError(t, err) // Because the ANALYZE collect data by random sampling, so the result is not an accurate value. // so we use a range here. @@ -147,8 +147,9 @@ func TestOutOfRangeEstimation(t *testing.T) { statsSuiteData := statistics.GetStatsSuiteData() statsSuiteData.GetTestCases(t, &input, &output) increasedTblRowCount := int64(float64(statsTbl.Count) * 1.5) + modifyCount := int64(float64(statsTbl.Count) * 0.5) for i, ran := range input { - count, err = col.GetColumnRowCount(sctx, getRange(ran.Start, ran.End), increasedTblRowCount, false) + count, err = col.GetColumnRowCount(sctx, getRange(ran.Start, ran.End), increasedTblRowCount, modifyCount, false) require.NoError(t, err) testdata.OnRecord(func() { output[i].Start = ran.Start @@ -551,6 +552,7 @@ func TestSelectivity(t *testing.T) { require.Truef(t, math.Abs(ratio-tt.selectivity) < eps, "for %s, needed: %v, got: %v", tt.exprs, tt.selectivity, ratio) histColl.Count *= 10 + histColl.ModifyCount = histColl.Count * 9 ratio, _, err = histColl.Selectivity(sctx, sel.Conditions, nil) require.NoErrorf(t, err, "for %s", tt.exprs) require.Truef(t, math.Abs(ratio-tt.selectivityAfterIncrease) < eps, "for %s, needed: %v, got: %v", tt.exprs, tt.selectivityAfterIncrease, ratio) @@ -762,7 +764,7 @@ func TestSmallRangeEstimation(t *testing.T) { statsSuiteData := statistics.GetStatsSuiteData() statsSuiteData.GetTestCases(t, &input, &output) for i, ran := range input { - count, err := col.GetColumnRowCount(sctx, getRange(ran.Start, ran.End), statsTbl.Count, false) + count, err := col.GetColumnRowCount(sctx, getRange(ran.Start, ran.End), statsTbl.Count, statsTbl.ModifyCount, false) require.NoError(t, err) testdata.OnRecord(func() { output[i].Start = ran.Start diff --git a/statistics/table.go b/statistics/table.go index 134ea0ced5c7f..90351b3d0c78e 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -466,7 +466,7 @@ func (coll *HistColl) GetRowCountByIntColumnRanges(sctx sessionctx.Context, colI } return result, nil } - result, err := c.GetColumnRowCount(sctx, intRanges, coll.Count, true) + result, err := c.GetColumnRowCount(sctx, intRanges, coll.Count, coll.ModifyCount, true) if sc.EnableOptimizerCETrace { CETraceRange(sctx, coll.PhysicalID, []string{c.Info.Name.O}, intRanges, "Column Stats", uint64(result)) } @@ -484,7 +484,7 @@ func (coll *HistColl) GetRowCountByColumnRanges(sctx sessionctx.Context, colID i } return result, err } - result, err := c.GetColumnRowCount(sctx, colRanges, coll.Count, false) + result, err := c.GetColumnRowCount(sctx, colRanges, coll.Count, coll.ModifyCount, false) if sc.EnableOptimizerCETrace { CETraceRange(sctx, coll.PhysicalID, []string{c.Info.Name.O}, colRanges, "Column Stats", uint64(result)) } @@ -517,7 +517,7 @@ func (coll *HistColl) GetRowCountByIndexRanges(sctx sessionctx.Context, idxID in if idx.CMSketch != nil && idx.StatsVer == Version1 { result, err = coll.getIndexRowCount(sctx, idxID, indexRanges) } else { - result, err = idx.GetRowCount(sctx, coll, indexRanges, coll.Count) + result, err = idx.GetRowCount(sctx, coll, indexRanges, coll.Count, coll.ModifyCount) } if sc.EnableOptimizerCETrace { CETraceRange(sctx, coll.PhysicalID, colNames, indexRanges, "Index Stats", uint64(result)) @@ -708,7 +708,7 @@ func (coll *HistColl) crossValidationSelectivity(sctx sessionctx.Context, idx *I Collators: []collate.Collator{idxPointRange.Collators[i]}, } - rowCount, err := col.GetColumnRowCount(sctx, []*ranger.Range{&rang}, coll.Count, col.IsHandle) + rowCount, err := col.GetColumnRowCount(sctx, []*ranger.Range{&rang}, coll.Count, coll.ModifyCount, col.IsHandle) if err != nil { return 0, 0, err } @@ -780,7 +780,7 @@ func (coll *HistColl) getIndexRowCount(sctx sessionctx.Context, idxID int64, ind // on single-column index, use previous way as well, because CMSketch does not contain null // values in this case. if rangePosition == 0 || isSingleColIdxNullRange(idx, ran) { - count, err := idx.GetRowCount(sctx, nil, []*ranger.Range{ran}, coll.Count) + count, err := idx.GetRowCount(sctx, nil, []*ranger.Range{ran}, coll.Count, coll.ModifyCount) if err != nil { return 0, errors.Trace(err) }