fix inappropriate heuristic method of estimating out-of-range values (pingcap#19200)
ti-srebot · Aug 14, 2020 · 6446961 · 6446961
1 parent dab3d93
commit 6446961
Show file tree

Hide file tree

Showing 6 changed files with 63 additions and 22 deletions.
diff --git a/cmd/explaintest/r/explain_union_scan.result b/cmd/explaintest/r/explain_union_scan.result
@@ -20,13 +20,13 @@ Projection_17	10.00	root	test.t1.id, test.t1.province_id, test.t1.city_name, tes
     │   ├─UnionScan_38	10.00	root	gt(test.t1.province_id, 1), lt(test.t1.province_id, 100)
     │   │ └─TableReader_41	10.00	root	data:Selection_40
     │   │   └─Selection_40	10.00	cop	gt(test.t1.province_id, 1), lt(test.t1.province_id, 100)
-    │   │     └─TableScan_39	14.30	cop	table:city, range:[-inf,+inf], keep order:false
+    │   │     └─TableScan_39	13.96	cop	table:city, range:[-inf,+inf], keep order:false
     │   └─UnionScan_28	1.00	root	
     │     └─IndexLookUp_27	1.00	root	
     │       ├─IndexScan_25	1.00	cop	table:city, index:id, range: decided by [eq(test.t2.id, test.t1.id)], keep order:false
     │       └─TableScan_26	1.00	cop	table:city, keep order:false
-    └─UnionScan_52	519304.44	root	gt(test.t3.province_id, 1), lt(test.t3.province_id, 100), not(isnull(test.t3.province_id))
-      └─TableReader_55	519304.44	root	data:Selection_54
-        └─Selection_54	519304.44	cop	gt(test.t3.province_id, 1), lt(test.t3.province_id, 100), not(isnull(test.t3.province_id))
+    └─UnionScan_52	532199.19	root	gt(test.t3.province_id, 1), lt(test.t3.province_id, 100), not(isnull(test.t3.province_id))
+      └─TableReader_55	532199.19	root	data:Selection_54
+        └─Selection_54	532199.19	cop	gt(test.t3.province_id, 1), lt(test.t3.province_id, 100), not(isnull(test.t3.province_id))
           └─TableScan_53	536284.00	cop	table:city, range:[-inf,+inf], keep order:false
 commit;
diff --git a/planner/core/testdata/analyze_suite_out.json b/planner/core/testdata/analyze_suite_out.json
@@ -138,17 +138,17 @@
       {
         "SQL": "explain select * from t where a = 7639902",
         "Plan": [
-          "IndexReader_6 2.03 root index:IndexScan_5",
-          "└─IndexScan_5 2.03 cop table:t, index:a, c, b, range:[7639902,7639902], keep order:false"
+          "IndexReader_6 6.68 root index:IndexScan_5",
+          "└─IndexScan_5 6.68 cop table:t, index:a, c, b, range:[7639902,7639902], keep order:false"
         ]
       },
       {
         "SQL": "explain select c, b from t where a = 7639902 order by b asc limit 6",
         "Plan": [
-          "Projection_7 2.03 root test.t.c, test.t.b",
-          "└─TopN_10 2.03 root test.t.b:asc, offset:0, count:6",
-          "  └─IndexReader_18 2.03 root index:IndexScan_17",
-          "    └─IndexScan_17 2.03 cop table:t, index:a, c, b, range:[7639902,7639902], keep order:false"
+          "Projection_7 6.00 root test.t.c, test.t.b",
+          "└─TopN_10 6.00 root test.t.b:asc, offset:0, count:6",
+          "  └─IndexReader_18 6.68 root index:IndexScan_17",
+          "    └─IndexScan_17 6.68 cop table:t, index:a, c, b, range:[7639902,7639902], keep order:false"
         ]
       }
     ]

diff --git a/statistics/handle/update_test.go b/statistics/handle/update_test.go
@@ -1523,8 +1523,8 @@ func (s *testStatsSuite) TestAbnormalIndexFeedback(c *C) {
 			sql: "select * from t where a = 2 and b > 10",
 			hist: "column:2 ndv:20 totColSize:20\n" +
 				"num: 5 lower_bound: -9223372036854775808 upper_bound: 7 repeats: 0\n" +
-				"num: 6 lower_bound: 7 upper_bound: 14 repeats: 0\n" +
-				"num: 7 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0",
+				"num: 4 lower_bound: 7 upper_bound: 14 repeats: 0\n" +
+				"num: 6 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0",
 			rangeID: tblInfo.Columns[1].ID,
 			idxID:   tblInfo.Indices[0].ID,
 			eqCount: 3,

diff --git a/statistics/histogram.go b/statistics/histogram.go
@@ -719,7 +719,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo
 		return 0.0, nil
 	}
 	if c.NDV > 0 && c.outOfRange(val) {
-		return float64(modifyCount) / float64(c.NDV), nil
+		return outOfRangeEQSelectivity(c.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil
 	}
 	if c.CMSketch != nil {
 		count, err := c.CMSketch.queryValue(sc, val)
@@ -818,7 +818,7 @@ func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte, modifyCo
 	}
 	val := types.NewBytesDatum(b)
 	if idx.NDV > 0 && idx.outOfRange(val) {
-		return float64(modifyCount) / (float64(idx.NDV)), nil
+		return outOfRangeEQSelectivity(idx.NDV, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount(), nil
 	}
 	if idx.CMSketch != nil {
 		return float64(idx.CMSketch.QueryBytes(b)), nil

diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go
@@ -366,15 +366,15 @@ func (s *testStatsSuite) TestEstimationForUnknownValues(c *C) {
 	colID := table.Meta().Columns[0].ID
 	count, err := statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(30, 30))
 	c.Assert(err, IsNil)
-	c.Assert(count, Equals, 2.0)
+	c.Assert(count, Equals, 0.2)
 
 	count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, 30))
 	c.Assert(err, IsNil)
-	c.Assert(count, Equals, 4.2)
+	c.Assert(count, Equals, 2.4000000000000004)
 
 	count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, math.MaxInt64))
 	c.Assert(err, IsNil)
-	c.Assert(count, Equals, 4.2)
+	c.Assert(count, Equals, 2.4000000000000004)
 
 	idxID := table.Meta().Indices[0].ID
 	count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(30, 30))
@@ -579,3 +579,29 @@ func (s *testStatsSuite) TestSelectivityGreedyAlgo(c *C) {
 	c.Assert(len(usedSets), Equals, 1)
 	c.Assert(usedSets[0].ID, Equals, int64(1))
 }
+
+func (s *testStatsSuite) TestOutOfRangeEQEstimation(c *C) {
+	defer cleanEnv(c, s.store, s.do)
+	testKit := testkit.NewTestKit(c, s.store)
+	testKit.MustExec("use test")
+	testKit.MustExec("drop table if exists t")
+	testKit.MustExec("create table t(a int)")
+	for i := 0; i < 1000; i++ {
+		testKit.MustExec(fmt.Sprintf("insert into t values (%v)", i/4)) // values 0..249, four rows each
+	}
+	testKit.MustExec("analyze table t")
+	h := s.do.StatsHandle()
+	table, err := s.do.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
+	c.Assert(err, IsNil)
+	statsTbl := h.GetTableStats(table.Meta())
+	sc := &stmtctx.StatementContext{}
+	col := statsTbl.Columns[table.Meta().Columns[0].ID]
+	count, err := col.GetColumnRowCount(sc, getRange(250, 250), 0, false) // 250 lies outside the inserted values 0..249
+	c.Assert(err, IsNil)
+	c.Assert(count, Equals, float64(0)) // with 0 passed as the modify count the estimate must be exactly 0
+	for i := 0; i < 8; i++ {
+		count, err := col.GetColumnRowCount(sc, getRange(250, 250), int64(i+1), false)
+		c.Assert(err, IsNil)
+		c.Assert(count, Equals, math.Min(float64(i+1), 4)) // estRows is capped by modifyCnt and by 4, the rows per inserted value
+	}
+}
diff --git a/statistics/table.go b/statistics/table.go
@@ -375,7 +375,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols
 		// so we use heuristic methods to estimate the selectivity.
 		if idx.NDV > 0 && coverAll {
 			// for equality queries
-			return float64(coll.ModifyCount) / float64(idx.NDV) / idx.TotalRowCount()
+			return outOfRangeEQSelectivity(idx.NDV, coll.ModifyCount, int64(idx.TotalRowCount()))
 		}
 		// The equal condition only uses prefix columns of the index.
 		colIDs := coll.Idx2ColumnIDs[idx.ID]
@@ -386,10 +386,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols
 			}
 			ndv = mathutil.MaxInt64(ndv, coll.Columns[colID].NDV)
 		}
-		if ndv > 0 {
-			return float64(coll.ModifyCount) / float64(ndv) / idx.TotalRowCount()
-		}
-		return float64(coll.ModifyCount) / outOfRangeBetweenRate / idx.TotalRowCount()
+		return outOfRangeEQSelectivity(ndv, coll.ModifyCount, int64(idx.TotalRowCount()))
 	}
 	return float64(idx.CMSketch.QueryBytes(bytes)) / float64(idx.TotalRowCount())
 }
@@ -640,3 +637,21 @@ func getPseudoRowCountByUnsignedIntRanges(intRanges []*ranger.Range, tableRowCou
 	}
 	return rowCount
 }
+
+// outOfRangeEQSelectivity estimates the selectivity of an equality condition on an out-of-range value.
+// It assumes that all modifications are insertions and that the newly inserted rows follow the same
+// uniform distribution as the analyzed rows, i.e. each distinct value owns totalRows/ndv rows. It returns
+// 0 when modifyRows is 0, and caps the result so that selectivity*totalRows does not exceed modifyRows.
+func outOfRangeEQSelectivity(ndv, modifyRows, totalRows int64) float64 {
+	if modifyRows == 0 {
+		return 0 // it must be 0: with no modifications the histogram contains the whole data
+	}
+	if ndv < outOfRangeBetweenRate {
+		ndv = outOfRangeBetweenRate // clamp ndv from below to avoid an inflated selectivity caused by a small NDV
+	}
+	selectivity := 1 / float64(ndv) // TODO: After extracting TopN from histograms, we can subtract the TopN fraction here.
+	if selectivity*float64(totalRows) > float64(modifyRows) { // the estimated row count may not exceed the modified rows
+		selectivity = float64(modifyRows) / float64(totalRows)
+	}
+	return selectivity
+}