Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix inappropriate heuristic method of estimating out-of-range values #19200

Merged
merged 3 commits into from
Aug 14, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions cmd/explaintest/r/explain_union_scan.result
Original file line number Diff line number Diff line change
@@ -20,13 +20,13 @@ Projection_17 10.00 root test.t1.id, test.t1.province_id, test.t1.city_name, tes
│ ├─UnionScan_38 10.00 root gt(test.t1.province_id, 1), lt(test.t1.province_id, 100)
│ │ └─TableReader_41 10.00 root data:Selection_40
│ │ └─Selection_40 10.00 cop gt(test.t1.province_id, 1), lt(test.t1.province_id, 100)
│ │ └─TableScan_39 14.30 cop table:city, range:[-inf,+inf], keep order:false
│ │ └─TableScan_39 13.96 cop table:city, range:[-inf,+inf], keep order:false
│ └─UnionScan_28 1.00 root
│ └─IndexLookUp_27 1.00 root
│ ├─IndexScan_25 1.00 cop table:city, index:id, range: decided by [eq(test.t2.id, test.t1.id)], keep order:false
│ └─TableScan_26 1.00 cop table:city, keep order:false
└─UnionScan_52 519304.44 root gt(test.t3.province_id, 1), lt(test.t3.province_id, 100), not(isnull(test.t3.province_id))
└─TableReader_55 519304.44 root data:Selection_54
└─Selection_54 519304.44 cop gt(test.t3.province_id, 1), lt(test.t3.province_id, 100), not(isnull(test.t3.province_id))
└─UnionScan_52 532199.19 root gt(test.t3.province_id, 1), lt(test.t3.province_id, 100), not(isnull(test.t3.province_id))
└─TableReader_55 532199.19 root data:Selection_54
└─Selection_54 532199.19 cop gt(test.t3.province_id, 1), lt(test.t3.province_id, 100), not(isnull(test.t3.province_id))
└─TableScan_53 536284.00 cop table:city, range:[-inf,+inf], keep order:false
commit;
12 changes: 6 additions & 6 deletions planner/core/testdata/analyze_suite_out.json
Original file line number Diff line number Diff line change
@@ -138,17 +138,17 @@
{
"SQL": "explain select * from t where a = 7639902",
"Plan": [
"IndexReader_6 2.03 root index:IndexScan_5",
"└─IndexScan_5 2.03 cop table:t, index:a, c, b, range:[7639902,7639902], keep order:false"
"IndexReader_6 6.68 root index:IndexScan_5",
"└─IndexScan_5 6.68 cop table:t, index:a, c, b, range:[7639902,7639902], keep order:false"
]
},
{
"SQL": "explain select c, b from t where a = 7639902 order by b asc limit 6",
"Plan": [
"Projection_7 2.03 root test.t.c, test.t.b",
"└─TopN_10 2.03 root test.t.b:asc, offset:0, count:6",
" └─IndexReader_18 2.03 root index:IndexScan_17",
" └─IndexScan_17 2.03 cop table:t, index:a, c, b, range:[7639902,7639902], keep order:false"
"Projection_7 6.00 root test.t.c, test.t.b",
"└─TopN_10 6.00 root test.t.b:asc, offset:0, count:6",
" └─IndexReader_18 6.68 root index:IndexScan_17",
" └─IndexScan_17 6.68 cop table:t, index:a, c, b, range:[7639902,7639902], keep order:false"
]
}
]
4 changes: 2 additions & 2 deletions statistics/handle/update_test.go
Original file line number Diff line number Diff line change
@@ -1523,8 +1523,8 @@ func (s *testStatsSuite) TestAbnormalIndexFeedback(c *C) {
sql: "select * from t where a = 2 and b > 10",
hist: "column:2 ndv:20 totColSize:20\n" +
"num: 5 lower_bound: -9223372036854775808 upper_bound: 7 repeats: 0\n" +
"num: 6 lower_bound: 7 upper_bound: 14 repeats: 0\n" +
"num: 7 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0",
"num: 4 lower_bound: 7 upper_bound: 14 repeats: 0\n" +
"num: 6 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0",
rangeID: tblInfo.Columns[1].ID,
idxID: tblInfo.Indices[0].ID,
eqCount: 3,
4 changes: 2 additions & 2 deletions statistics/histogram.go
Original file line number Diff line number Diff line change
@@ -719,7 +719,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo
return 0.0, nil
}
if c.NDV > 0 && c.outOfRange(val) {
return float64(modifyCount) / float64(c.NDV), nil
return outOfRangeEQSelectivity(c.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil
}
if c.CMSketch != nil {
count, err := c.CMSketch.queryValue(sc, val)
@@ -818,7 +818,7 @@ func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte, modifyCo
}
val := types.NewBytesDatum(b)
if idx.NDV > 0 && idx.outOfRange(val) {
return float64(modifyCount) / (float64(idx.NDV)), nil
return outOfRangeEQSelectivity(idx.NDV, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount(), nil
}
if idx.CMSketch != nil {
return float64(idx.CMSketch.QueryBytes(b)), nil
32 changes: 29 additions & 3 deletions statistics/selectivity_test.go
Original file line number Diff line number Diff line change
@@ -366,15 +366,15 @@ func (s *testStatsSuite) TestEstimationForUnknownValues(c *C) {
colID := table.Meta().Columns[0].ID
count, err := statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(30, 30))
c.Assert(err, IsNil)
c.Assert(count, Equals, 2.0)
c.Assert(count, Equals, 0.2)

count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, 30))
c.Assert(err, IsNil)
c.Assert(count, Equals, 4.2)
c.Assert(count, Equals, 2.4000000000000004)

count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, math.MaxInt64))
c.Assert(err, IsNil)
c.Assert(count, Equals, 4.2)
c.Assert(count, Equals, 2.4000000000000004)

idxID := table.Meta().Indices[0].ID
count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(30, 30))
@@ -579,3 +579,29 @@ func (s *testStatsSuite) TestSelectivityGreedyAlgo(c *C) {
c.Assert(len(usedSets), Equals, 1)
c.Assert(usedSets[0].ID, Equals, int64(1))
}

// TestOutOfRangeEQEstimation verifies the row-count estimate of an equality
// condition on a value that is not covered by the analyzed column statistics.
// With a modify count of zero the estimate must be zero; otherwise it must be
// capped by the modify count and by 4, the number of rows inserted per value.
func (s *testStatsSuite) TestOutOfRangeEQEstimation(c *C) {
	defer cleanEnv(c, s.store, s.do)
	tk := testkit.NewTestKit(c, s.store)
	tk.MustExec("use test")
	tk.MustExec("drop table if exists t")
	tk.MustExec("create table t(a int)")
	// Insert 1000 rows, four rows per value, so the values are 0 ~ 249.
	for row := 0; row < 1000; row++ {
		insertSQL := fmt.Sprintf("insert into t values (%v)", row/4)
		tk.MustExec(insertSQL)
	}
	tk.MustExec("analyze table t")

	statsHandle := s.do.StatsHandle()
	tbl, err := s.do.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
	c.Assert(err, IsNil)
	tblStats := statsHandle.GetTableStats(tbl.Meta())
	sc := new(stmtctx.StatementContext)
	colStats := tblStats.Columns[tbl.Meta().Columns[0].ID]

	// 250 was never inserted; without any modification the estimate is 0.
	unchanged, err := colStats.GetColumnRowCount(sc, getRange(250, 250), 0, false)
	c.Assert(err, IsNil)
	c.Assert(unchanged, Equals, 0.0)

	// estRows must not exceed modifyCnt, nor the per-value row count of 4.
	for modifyCnt := int64(1); modifyCnt <= 8; modifyCnt++ {
		estRows, err := colStats.GetColumnRowCount(sc, getRange(250, 250), modifyCnt, false)
		c.Assert(err, IsNil)
		c.Assert(estRows, Equals, math.Min(float64(modifyCnt), 4))
	}
}
25 changes: 20 additions & 5 deletions statistics/table.go
Original file line number Diff line number Diff line change
@@ -375,7 +375,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols
// so we use heuristic methods to estimate the selectivity.
if idx.NDV > 0 && coverAll {
// for equality queries
return float64(coll.ModifyCount) / float64(idx.NDV) / idx.TotalRowCount()
return outOfRangeEQSelectivity(idx.NDV, coll.ModifyCount, int64(idx.TotalRowCount()))
}
// The equal condition only uses prefix columns of the index.
colIDs := coll.Idx2ColumnIDs[idx.ID]
@@ -386,10 +386,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols
}
ndv = mathutil.MaxInt64(ndv, coll.Columns[colID].NDV)
}
if ndv > 0 {
return float64(coll.ModifyCount) / float64(ndv) / idx.TotalRowCount()
}
return float64(coll.ModifyCount) / outOfRangeBetweenRate / idx.TotalRowCount()
return outOfRangeEQSelectivity(ndv, coll.ModifyCount, int64(idx.TotalRowCount()))
}
return float64(idx.CMSketch.QueryBytes(bytes)) / float64(idx.TotalRowCount())
}
@@ -640,3 +637,21 @@ func getPseudoRowCountByUnsignedIntRanges(intRanges []*ranger.Range, tableRowCou
}
return rowCount
}

// outOfRangeEQSelectivity estimates the selectivity of an equality condition
// whose value is out of the range covered by the statistics.
//
// It assumes that all modifications are insertions and that the newly inserted
// rows are uniformly distributed with the same distribution as the analyzed
// rows, which means each unique value owns the same number of rows
// (totalRows/ndv).
//
// ndv is the number of distinct values, modifyRows is the number of rows
// modified since the statistics were collected and totalRows is the total row
// count. The returned selectivity is never negative, and the row count it
// implies (selectivity*totalRows) never exceeds modifyRows.
func outOfRangeEQSelectivity(ndv, modifyRows, totalRows int64) float64 {
	if modifyRows <= 0 {
		// With no modification the histogram contains the whole data, so an
		// out-of-range value cannot match any row. A negative count is treated
		// the same way, otherwise the cap below would yield a negative
		// selectivity.
		return 0
	}
	if ndv < outOfRangeBetweenRate {
		ndv = outOfRangeBetweenRate // avoid inaccurate selectivity caused by small NDV
	}
	selectivity := 1 / float64(ndv) // TODO: After extracting TopN from histograms, we can minus the TopN fraction here.
	// The estimated row count cannot exceed the number of modified rows.
	// modifyRows is positive here, so the condition can only hold when
	// totalRows is positive as well and the division below is safe.
	if selectivity*float64(totalRows) > float64(modifyRows) {
		selectivity = float64(modifyRows) / float64(totalRows)
	}
	return selectivity
}