Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

planner: fix the inappropriate heuristic rule to estimate the EQ selectivity when out of range (#18543) #18995

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions cmd/explaintest/r/explain_union_scan.result
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ insert into city values("06766b3ef41d484d8878606393f1ed0b", 88, "chongqing", "ch
begin;
update city set province_id = 77 where id="06766b3ef41d484d8878606393f1ed0b";
explain select t1.*, t2.province_id as provinceID, t2.city_name as cityName, t3.description as description from city t1 inner join city t2 on t1.id = t2.id left join city t3 on t1.province_id = t3.province_id where t1.province_id > 1 and t1.province_id < 100 limit 10;
<<<<<<< HEAD
id count task operator info
Projection_17 10.00 root test.t1.id, test.t1.province_id, test.t1.city_name, test.t1.description, test.t2.province_id, test.t2.city_name, test.t3.description
└─Limit_20 10.00 root offset:0, count:10
Expand All @@ -29,4 +30,23 @@ Projection_17 10.00 root test.t1.id, test.t1.province_id, test.t1.city_name, tes
└─TableReader_55 519304.44 root data:Selection_54
└─Selection_54 519304.44 cop[tikv] gt(test.t3.province_id, 1), lt(test.t3.province_id, 100), not(isnull(test.t3.province_id))
└─TableScan_53 536284.00 cop[tikv] table:t3, range:[-inf,+inf], keep order:false
=======
id estRows task access object operator info
Limit_20 10.00 root offset:0, count:10
└─HashJoin_22 10.00 root left outer join, equal:[eq(test.city.province_id, test.city.province_id)]
├─Limit_25(Build) 10.00 root offset:0, count:10
│ └─IndexJoin_38 10.00 root inner join, inner:UnionScan_37, outer key:test.city.id, inner key:test.city.id
│ ├─UnionScan_47(Build) 10.00 root
│ │ └─TableReader_49 10.00 root data:TableFullScan_48
│ │ └─TableFullScan_48 10.00 cop[tikv] table:t2 keep order:false
│ └─UnionScan_37(Probe) 1.00 root gt(test.city.province_id, 1), lt(test.city.province_id, 100)
│ └─IndexLookUp_36 1.00 root
│ ├─IndexRangeScan_33(Build) 1.00 cop[tikv] table:t1, index:PRIMARY(id) range: decided by [eq(test.city.id, test.city.id)], keep order:false
│ └─Selection_35(Probe) 1.00 cop[tikv] gt(test.city.province_id, 1), lt(test.city.province_id, 100)
│ └─TableRowIDScan_34 1.00 cop[tikv] table:t1 keep order:false
└─UnionScan_57(Probe) 536284.00 root gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id))
└─TableReader_60 536284.00 root data:Selection_59
└─Selection_59 536284.00 cop[tikv] gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id))
└─TableFullScan_58 536284.00 cop[tikv] table:t3 keep order:false
>>>>>>> aeee152... planner: fix the inappropriate heuristic rule to estimate the EQ selectivity when out of range (#18543)
commit;
6 changes: 6 additions & 0 deletions planner/core/cbo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -594,9 +594,15 @@ func (s *testAnalyzeSuite) TestOutdatedAnalyze(c *C) {
c.Assert(h.Update(dom.InfoSchema()), IsNil)
statistics.RatioOfPseudoEstimate.Store(10.0)
testKit.MustQuery("explain select * from t where a <= 5 and b <= 5").Check(testkit.Rows(
<<<<<<< HEAD
"TableReader_7 35.91 root data:Selection_6",
"└─Selection_6 35.91 cop[tikv] le(test.t.a, 5), le(test.t.b, 5)",
" └─TableScan_5 80.00 cop[tikv] table:t, range:[-inf,+inf], keep order:false",
=======
"TableReader_7 29.77 root data:Selection_6",
"└─Selection_6 29.77 cop[tikv] le(test.t.a, 5), le(test.t.b, 5)",
" └─TableFullScan_5 80.00 cop[tikv] table:t keep order:false",
>>>>>>> aeee152... planner: fix the inappropriate heuristic rule to estimate the EQ selectivity when out of range (#18543)
))
statistics.RatioOfPseudoEstimate.Store(0.7)
testKit.MustQuery("explain select * from t where a <= 5 and b <= 5").Check(testkit.Rows(
Expand Down
13 changes: 13 additions & 0 deletions planner/core/testdata/analyze_suite_out.json
Original file line number Diff line number Diff line change
Expand Up @@ -138,17 +138,30 @@
{
"SQL": "explain select * from t where a = 7639902",
"Plan": [
<<<<<<< HEAD
"IndexReader_6 2.03 root index:IndexScan_5",
"└─IndexScan_5 2.03 cop[tikv] table:t, index:a, c, b, range:[7639902,7639902], keep order:false"
=======
"IndexReader_6 6.68 root index:IndexRangeScan_5",
"└─IndexRangeScan_5 6.68 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false"
>>>>>>> aeee152... planner: fix the inappropriate heuristic rule to estimate the EQ selectivity when out of range (#18543)
]
},
{
"SQL": "explain select c, b from t where a = 7639902 order by b asc limit 6",
"Plan": [
<<<<<<< HEAD
"Projection_7 2.03 root test.t.c, test.t.b",
"└─TopN_10 2.03 root test.t.b:asc, offset:0, count:6",
" └─IndexReader_18 2.03 root index:IndexScan_17",
" └─IndexScan_17 2.03 cop[tikv] table:t, index:a, c, b, range:[7639902,7639902], keep order:false"
=======
"Projection_7 6.00 root test.t.c, test.t.b",
"└─TopN_8 6.00 root test.t.b, offset:0, count:6",
" └─IndexReader_16 6.00 root index:TopN_15",
" └─TopN_15 6.00 cop[tikv] test.t.b, offset:0, count:6",
" └─IndexRangeScan_14 6.68 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false"
>>>>>>> aeee152... planner: fix the inappropriate heuristic rule to estimate the EQ selectivity when out of range (#18543)
]
}
]
Expand Down
4 changes: 2 additions & 2 deletions statistics/handle/update_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1523,8 +1523,8 @@ func (s *testStatsSuite) TestAbnormalIndexFeedback(c *C) {
sql: "select * from t where a = 2 and b > 10",
hist: "column:2 ndv:20 totColSize:20\n" +
"num: 5 lower_bound: -9223372036854775808 upper_bound: 7 repeats: 0\n" +
"num: 6 lower_bound: 7 upper_bound: 14 repeats: 0\n" +
"num: 7 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0",
"num: 4 lower_bound: 7 upper_bound: 14 repeats: 0\n" +
"num: 5 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0",
rangeID: tblInfo.Columns[1].ID,
idxID: tblInfo.Indices[0].ID,
eqCount: 3,
Expand Down
12 changes: 9 additions & 3 deletions statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -728,7 +728,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo
return 0.0, nil
}
if c.NDV > 0 && c.outOfRange(val) {
return float64(modifyCount) / float64(c.NDV), nil
return outOfRangeEQSelectivity(c.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil
}
if c.CMSketch != nil {
count, err := c.CMSketch.queryValue(sc, val)
Expand Down Expand Up @@ -763,9 +763,15 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
continue
}
// The interval case.
<<<<<<< HEAD
cnt := c.BetweenRowCount(rg.LowVal[0], rg.HighVal[0])
if (c.outOfRange(rg.LowVal[0]) && !rg.LowVal[0].IsNull()) || c.outOfRange(rg.HighVal[0]) {
cnt += float64(modifyCount) / outOfRangeBetweenRate
=======
cnt := c.BetweenRowCount(lowVal, highVal)
if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) {
cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount()
>>>>>>> aeee152... planner: fix the inappropriate heuristic rule to estimate the EQ selectivity when out of range (#18543)
}
// `betweenRowCount` returns count for [l, h) range, we adjust cnt for boundaries here.
// Note that, `cnt` does not include null values, we need specially handle cases
Expand Down Expand Up @@ -827,7 +833,7 @@ func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte, modifyCo
}
val := types.NewBytesDatum(b)
if idx.NDV > 0 && idx.outOfRange(val) {
return float64(modifyCount) / (float64(idx.NDV)), nil
return outOfRangeEQSelectivity(idx.NDV, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount(), nil
}
if idx.CMSketch != nil {
return float64(idx.CMSketch.QueryBytes(b)), nil
Expand Down Expand Up @@ -879,7 +885,7 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, indexRanges []*range
totalCount += idx.BetweenRowCount(l, r)
lowIsNull := bytes.Equal(lb, nullKeyBytes)
if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) {
totalCount += float64(modifyCount) / outOfRangeBetweenRate
totalCount += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount()
}
if isSingleCol && lowIsNull {
totalCount += float64(idx.NullCount)
Expand Down
34 changes: 31 additions & 3 deletions statistics/selectivity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,34 @@ func getRange(start, end int64) []*ranger.Range {
return []*ranger.Range{ran}
}

// TestOutOfRangeEQEstimation checks the estimated row count of an equality
// predicate on a value that lies beyond the analyzed data.
//
// The table holds the values 0..249, four rows each, and is analyzed before
// the estimation. The value 250 is then queried with different modify counts:
// with no modification the estimate must be 0, and otherwise it must equal
// the modify count capped at 4.
func (s *testStatsSuite) TestOutOfRangeEQEstimation(c *C) {
	defer cleanEnv(c, s.store, s.do)
	tk := testkit.NewTestKit(c, s.store)
	for _, stmt := range []string{
		"use test",
		"drop table if exists t",
		"create table t(a int)",
	} {
		tk.MustExec(stmt)
	}
	// Insert every value in 0 ~ 249 four times, 1000 rows in total.
	for val := 0; val < 250; val++ {
		for dup := 0; dup < 4; dup++ {
			tk.MustExec(fmt.Sprintf("insert into t values (%v)", val))
		}
	}
	tk.MustExec("analyze table t")

	handle := s.do.StatsHandle()
	tbl, err := s.do.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
	c.Assert(err, IsNil)
	tblStats := handle.GetTableStats(tbl.Meta())
	stmtCtx := &stmtctx.StatementContext{}
	colStats := tblStats.Columns[tbl.Meta().Columns[0].ID]

	// Without any modification the estimate for the unseen value is zero.
	rows, err := colStats.GetColumnRowCount(stmtCtx, getRange(250, 250), 0, false)
	c.Assert(err, IsNil)
	c.Assert(rows, Equals, float64(0))

	// The estimate must not exceed the modify count, and stops growing at 4.
	for modifyCnt := int64(1); modifyCnt <= 8; modifyCnt++ {
		est, err := colStats.GetColumnRowCount(stmtCtx, getRange(250, 250), modifyCnt, false)
		c.Assert(err, IsNil)
		c.Assert(est, Equals, math.Min(float64(modifyCnt), 4))
	}
}

func (s *testStatsSuite) TestEstimationForUnknownValues(c *C) {
defer cleanEnv(c, s.store, s.do)
testKit := testkit.NewTestKit(c, s.store)
Expand All @@ -366,15 +394,15 @@ func (s *testStatsSuite) TestEstimationForUnknownValues(c *C) {
colID := table.Meta().Columns[0].ID
count, err := statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(30, 30))
c.Assert(err, IsNil)
c.Assert(count, Equals, 2.0)
c.Assert(count, Equals, 0.2)

count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, 30))
c.Assert(err, IsNil)
c.Assert(count, Equals, 4.2)
c.Assert(count, Equals, 2.4000000000000004)

count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, math.MaxInt64))
c.Assert(err, IsNil)
c.Assert(count, Equals, 4.2)
c.Assert(count, Equals, 2.4000000000000004)

idxID := table.Meta().Indices[0].ID
count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(30, 30))
Expand Down
26 changes: 20 additions & 6 deletions statistics/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,24 @@ func isSingleColIdxNullRange(idx *Index, ran *ranger.Range) bool {
return false
}

// outOfRangeEQSelectivity returns the estimated selectivity of an equality
// condition whose value lies outside the range covered by the statistics.
//
// The heuristic treats every modification as an insertion and assumes the
// newly inserted rows are uniformly distributed in the same way as the
// analyzed rows, i.e. each distinct value owns about totalRows/ndv rows.
//
// ndv is the number of distinct values, modifyRows the number of modified
// rows, and totalRows the row count the selectivity is relative to. The
// result is capped at modifyRows/totalRows, so the implied row count does
// not exceed the number of modified rows.
func outOfRangeEQSelectivity(ndv, modifyRows, totalRows int64) float64 {
	// With no modifications the histogram contains the whole data, so an
	// out-of-range value cannot match any row.
	if modifyRows == 0 {
		return 0
	}
	// Raise a small NDV to outOfRangeBetweenRate to avoid an inaccurate selectivity.
	distinct := ndv
	if distinct < outOfRangeBetweenRate {
		distinct = outOfRangeBetweenRate
	}
	// TODO: After extracting TopN from histograms, we can minus the TopN fraction here.
	perValue := 1 / float64(distinct)
	total, modified := float64(totalRows), float64(modifyRows)
	if perValue*total > modified {
		return modified / total
	}
	return perValue
}

// getEqualCondSelectivity gets the selectivity of the equal conditions.
func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedColsLen int) float64 {
coverAll := len(idx.Info.Columns) == usedColsLen
Expand All @@ -377,8 +395,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols
// When the value is out of range, we cannot find this value in the CM Sketch,
// so we use heuristic methods to estimate the selectivity.
if idx.NDV > 0 && coverAll {
// for equality queries
return float64(coll.ModifyCount) / float64(idx.NDV) / idx.TotalRowCount()
return outOfRangeEQSelectivity(idx.NDV, coll.ModifyCount, int64(idx.TotalRowCount()))
}
// The equal condition only uses prefix columns of the index.
colIDs := coll.Idx2ColumnIDs[idx.ID]
Expand All @@ -389,10 +406,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols
}
ndv = mathutil.MaxInt64(ndv, coll.Columns[colID].NDV)
}
if ndv > 0 {
return float64(coll.ModifyCount) / float64(ndv) / idx.TotalRowCount()
}
return float64(coll.ModifyCount) / outOfRangeBetweenRate / idx.TotalRowCount()
return outOfRangeEQSelectivity(ndv, coll.ModifyCount, int64(idx.TotalRowCount()))
}
return float64(idx.CMSketch.QueryBytes(bytes)) / float64(idx.TotalRowCount())
}
Expand Down