diff --git a/cmd/explaintest/r/explain_union_scan.result b/cmd/explaintest/r/explain_union_scan.result index 75b1e85280c52..fde3e85d6469d 100644 --- a/cmd/explaintest/r/explain_union_scan.result +++ b/cmd/explaintest/r/explain_union_scan.result @@ -20,13 +20,13 @@ Projection_17 10.00 root test.t1.id, test.t1.province_id, test.t1.city_name, tes │ ├─UnionScan_38 10.00 root gt(test.t1.province_id, 1), lt(test.t1.province_id, 100) │ │ └─TableReader_41 10.00 root data:Selection_40 │ │ └─Selection_40 10.00 cop gt(test.t1.province_id, 1), lt(test.t1.province_id, 100) - │ │ └─TableScan_39 14.30 cop table:city, range:[-inf,+inf], keep order:false + │ │ └─TableScan_39 13.96 cop table:city, range:[-inf,+inf], keep order:false │ └─UnionScan_28 1.00 root │ └─IndexLookUp_27 1.00 root │ ├─IndexScan_25 1.00 cop table:city, index:id, range: decided by [eq(test.t2.id, test.t1.id)], keep order:false │ └─TableScan_26 1.00 cop table:city, keep order:false - └─UnionScan_52 519304.44 root gt(test.t3.province_id, 1), lt(test.t3.province_id, 100), not(isnull(test.t3.province_id)) - └─TableReader_55 519304.44 root data:Selection_54 - └─Selection_54 519304.44 cop gt(test.t3.province_id, 1), lt(test.t3.province_id, 100), not(isnull(test.t3.province_id)) + └─UnionScan_52 532199.19 root gt(test.t3.province_id, 1), lt(test.t3.province_id, 100), not(isnull(test.t3.province_id)) + └─TableReader_55 532199.19 root data:Selection_54 + └─Selection_54 532199.19 cop gt(test.t3.province_id, 1), lt(test.t3.province_id, 100), not(isnull(test.t3.province_id)) └─TableScan_53 536284.00 cop table:city, range:[-inf,+inf], keep order:false commit; diff --git a/planner/core/testdata/analyze_suite_out.json b/planner/core/testdata/analyze_suite_out.json index d046cb1e868c2..488c28a5d2a8e 100644 --- a/planner/core/testdata/analyze_suite_out.json +++ b/planner/core/testdata/analyze_suite_out.json @@ -138,17 +138,17 @@ { "SQL": "explain select * from t where a = 7639902", "Plan": [ - "IndexReader_6 2.03 root 
index:IndexScan_5", - "└─IndexScan_5 2.03 cop table:t, index:a, c, b, range:[7639902,7639902], keep order:false" + "IndexReader_6 6.68 root index:IndexScan_5", + "└─IndexScan_5 6.68 cop table:t, index:a, c, b, range:[7639902,7639902], keep order:false" ] }, { "SQL": "explain select c, b from t where a = 7639902 order by b asc limit 6", "Plan": [ - "Projection_7 2.03 root test.t.c, test.t.b", - "└─TopN_10 2.03 root test.t.b:asc, offset:0, count:6", - " └─IndexReader_18 2.03 root index:IndexScan_17", - " └─IndexScan_17 2.03 cop table:t, index:a, c, b, range:[7639902,7639902], keep order:false" + "Projection_7 6.00 root test.t.c, test.t.b", + "└─TopN_10 6.00 root test.t.b:asc, offset:0, count:6", + " └─IndexReader_18 6.68 root index:IndexScan_17", + " └─IndexScan_17 6.68 cop table:t, index:a, c, b, range:[7639902,7639902], keep order:false" ] } ] diff --git a/statistics/handle/update_test.go b/statistics/handle/update_test.go index 1c86e60e5813c..6800406f4cc3a 100644 --- a/statistics/handle/update_test.go +++ b/statistics/handle/update_test.go @@ -1523,8 +1523,8 @@ func (s *testStatsSuite) TestAbnormalIndexFeedback(c *C) { sql: "select * from t where a = 2 and b > 10", hist: "column:2 ndv:20 totColSize:20\n" + "num: 5 lower_bound: -9223372036854775808 upper_bound: 7 repeats: 0\n" + - "num: 6 lower_bound: 7 upper_bound: 14 repeats: 0\n" + - "num: 7 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0", + "num: 4 lower_bound: 7 upper_bound: 14 repeats: 0\n" + + "num: 6 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0", rangeID: tblInfo.Columns[1].ID, idxID: tblInfo.Indices[0].ID, eqCount: 3, diff --git a/statistics/histogram.go b/statistics/histogram.go index 3414f80da1020..d38bc73db2fa2 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -719,7 +719,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo return 0.0, nil } if c.NDV > 0 && c.outOfRange(val) { - return float64(modifyCount) / float64(c.NDV), 
nil + return outOfRangeEQSelectivity(c.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil } if c.CMSketch != nil { count, err := c.CMSketch.queryValue(sc, val) @@ -818,7 +818,7 @@ func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte, modifyCo } val := types.NewBytesDatum(b) if idx.NDV > 0 && idx.outOfRange(val) { - return float64(modifyCount) / (float64(idx.NDV)), nil + return outOfRangeEQSelectivity(idx.NDV, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount(), nil } if idx.CMSketch != nil { return float64(idx.CMSketch.QueryBytes(b)), nil diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go index f79821b17124a..d258403004c5c 100644 --- a/statistics/selectivity_test.go +++ b/statistics/selectivity_test.go @@ -366,15 +366,15 @@ func (s *testStatsSuite) TestEstimationForUnknownValues(c *C) { colID := table.Meta().Columns[0].ID count, err := statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(30, 30)) c.Assert(err, IsNil) - c.Assert(count, Equals, 2.0) + c.Assert(count, Equals, 0.2) count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, 30)) c.Assert(err, IsNil) - c.Assert(count, Equals, 4.2) + c.Assert(count, Equals, 2.4000000000000004) count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, math.MaxInt64)) c.Assert(err, IsNil) - c.Assert(count, Equals, 4.2) + c.Assert(count, Equals, 2.4000000000000004) idxID := table.Meta().Indices[0].ID count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(30, 30)) @@ -579,3 +579,29 @@ func (s *testStatsSuite) TestSelectivityGreedyAlgo(c *C) { c.Assert(len(usedSets), Equals, 1) c.Assert(usedSets[0].ID, Equals, int64(1)) } + +func (s *testStatsSuite) TestOutOfRangeEQEstimation(c *C) { + defer cleanEnv(c, s.store, s.do) + testKit := testkit.NewTestKit(c, s.store) + testKit.MustExec("use test") + testKit.MustExec("drop table if exists t") + testKit.MustExec("create table t(a int)") + for i := 0; i < 1000; i++ { +
testKit.MustExec(fmt.Sprintf("insert into t values (%v)", i/4)) // inserts values 0 ~ 249, four rows per value + } + testKit.MustExec("analyze table t") + h := s.do.StatsHandle() + table, err := s.do.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t")) + c.Assert(err, IsNil) + statsTbl := h.GetTableStats(table.Meta()) + sc := &stmtctx.StatementContext{} + col := statsTbl.Columns[table.Meta().Columns[0].ID] + count, err := col.GetColumnRowCount(sc, getRange(250, 250), 0, false) + c.Assert(err, IsNil) + c.Assert(count, Equals, float64(0)) + for i := 0; i < 8; i++ { + count, err := col.GetColumnRowCount(sc, getRange(250, 250), int64(i+1), false) + c.Assert(err, IsNil) + c.Assert(count, Equals, math.Min(float64(i+1), 4)) // estRows must not exceed modifyCnt; the other bound, 4, equals the rows per value inserted above + } +} diff --git a/statistics/table.go b/statistics/table.go index 8b464727ed5d7..1f51228d38691 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -375,7 +375,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols // so we use heuristic methods to estimate the selectivity. if idx.NDV > 0 && coverAll { // for equality queries - return float64(coll.ModifyCount) / float64(idx.NDV) / idx.TotalRowCount() + return outOfRangeEQSelectivity(idx.NDV, coll.ModifyCount, int64(idx.TotalRowCount())) } // The equal condition only uses prefix columns of the index.
colIDs := coll.Idx2ColumnIDs[idx.ID] @@ -386,10 +386,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols } ndv = mathutil.MaxInt64(ndv, coll.Columns[colID].NDV) } - if ndv > 0 { - return float64(coll.ModifyCount) / float64(ndv) / idx.TotalRowCount() - } - return float64(coll.ModifyCount) / outOfRangeBetweenRate / idx.TotalRowCount() + return outOfRangeEQSelectivity(ndv, coll.ModifyCount, int64(idx.TotalRowCount())) } return float64(idx.CMSketch.QueryBytes(bytes)) / float64(idx.TotalRowCount()) } @@ -640,3 +637,21 @@ func getPseudoRowCountByUnsignedIntRanges(intRanges []*ranger.Range, tableRowCou } return rowCount } + +// outOfRangeEQSelectivity estimates the selectivity of an equality condition on an out-of-range value. +// It assumes all modifications are insertions and that newly inserted rows are uniformly distributed +// with the same distribution as the analyzed rows, which means each unique value should own the +// same number of rows (Tot/NDV). The resulting row estimate is capped at modifyRows. +func outOfRangeEQSelectivity(ndv, modifyRows, totalRows int64) float64 { + if modifyRows == 0 { + return 0 // it must be 0 since the histogram contains the whole data + } + if ndv < outOfRangeBetweenRate { + ndv = outOfRangeBetweenRate // avoid inaccurate selectivity caused by small NDV + } + selectivity := 1 / float64(ndv) // TODO: After extracting TopN from histograms, we can subtract the TopN fraction here. + if selectivity*float64(totalRows) > float64(modifyRows) { + selectivity = float64(modifyRows) / float64(totalRows) + } + return selectivity +}