From b14b6f75980e4f5886f0d52a4f3cf411d6389068 Mon Sep 17 00:00:00 2001 From: Yiding Cui Date: Tue, 9 Apr 2024 00:51:37 +0800 Subject: [PATCH 1/5] planner, statistics: fix the inconsistent est when table has no stats --- pkg/planner/cardinality/selectivity_test.go | 28 +++++++++++++++++++ pkg/statistics/handle/storage/read.go | 8 ++++-- .../handle/syncload/stats_syncload.go | 18 ++++++++++-- pkg/statistics/table.go | 19 +++++++++---- 4 files changed, 63 insertions(+), 10 deletions(-) diff --git a/pkg/planner/cardinality/selectivity_test.go b/pkg/planner/cardinality/selectivity_test.go index b9503ef045462..da2eec6a8e46e 100644 --- a/pkg/planner/cardinality/selectivity_test.go +++ b/pkg/planner/cardinality/selectivity_test.go @@ -1310,3 +1310,31 @@ func TestSubsetIdxCardinality(t *testing.T) { testKit.MustQuery(input[i]).Check(testkit.Rows(output[i].Result...)) } } + +func TestBuiltinInEstWithoutStats(t *testing.T) { + store, dom := testkit.CreateMockStoreAndDomain(t) + tk := testkit.NewTestKit(t, store) + h := dom.StatsHandle() + + tk.MustExec("use test") + tk.MustExec("create table t(a int)") + require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh())) + tk.MustExec("insert into t values(1), (2), (3), (4), (5), (6), (7), (8), (9), (10)") + require.NoError(t, h.DumpStatsDeltaToKV(true)) + is := dom.InfoSchema() + require.NoError(t, h.Update(is)) + + tk.MustQuery("explain format='brief' select * from t where a in (1, 2, 3, 4, 5, 6, 7, 8)").Check(testkit.Rows( + "TableReader 0.08 root data:Selection", + "└─Selection 0.08 cop[tikv] in(test.t.a, 1, 2, 3, 4, 5, 6, 7, 8)", + " └─TableFullScan 10.00 cop[tikv] table:t keep order:false, stats:pseudo", + )) + + h.Clear() + require.NoError(t, h.InitStatsLite(is)) + tk.MustQuery("explain format='brief' select * from t where a in (1, 2, 3, 4, 5, 6, 7, 8)").Check(testkit.Rows( + "TableReader 0.08 root data:Selection", + "└─Selection 0.08 cop[tikv] in(test.t.a, 1, 2, 3, 4, 5, 6, 7, 8)", + " └─TableFullScan 10.00 cop[tikv] table:t keep order:false, stats:pseudo", + )) +} diff --git a/pkg/statistics/handle/storage/read.go b/pkg/statistics/handle/storage/read.go index b49b4e559299d..7586726d3c5f8 100644 --- a/pkg/statistics/handle/storage/read.go +++ b/pkg/statistics/handle/storage/read.go @@ -572,7 +572,9 @@ func CleanFakeItemsForShowHistInFlights(statsCache statstypes.StatsCache) int { if item.IsIndex { _, loadNeeded = tbl.IndexIsLoadNeeded(item.ID) } else { - _, loadNeeded = tbl.ColumnIsLoadNeeded(item.ID, true) + var analyzed bool + _, loadNeeded, analyzed = tbl.ColumnIsLoadNeeded(item.ID, true) + loadNeeded = loadNeeded && analyzed } if !loadNeeded { statistics.HistogramNeededItems.Delete(item) @@ -589,8 +591,8 @@ func loadNeededColumnHistograms(sctx sessionctx.Context, statsCache statstypes.S return nil } var colInfo *model.ColumnInfo - _, loadNeeded := tbl.ColumnIsLoadNeeded(col.ID, true) - if !loadNeeded { + _, loadNeeded, analyzed := tbl.ColumnIsLoadNeeded(col.ID, true) + if !loadNeeded || !analyzed { statistics.HistogramNeededItems.Delete(col) return nil } diff --git a/pkg/statistics/handle/syncload/stats_syncload.go b/pkg/statistics/handle/syncload/stats_syncload.go index ec89ed7b52361..3e9db7716182e 100644 --- a/pkg/statistics/handle/syncload/stats_syncload.go +++ b/pkg/statistics/handle/syncload/stats_syncload.go @@ -155,7 +155,7 @@ func (s *statsSyncLoad) removeHistLoadedColumns(neededItems []model.StatsLoadIte } continue } - _, loadNeeded := tbl.ColumnIsLoadNeeded(item.ID, item.FullLoad) + _, loadNeeded, _ := tbl.ColumnIsLoadNeeded(item.ID, item.FullLoad) if loadNeeded { remainedItems = append(remainedItems, item) } @@ -270,7 +270,7 @@ func (s *statsSyncLoad) handleOneItemTask(sctx sessionctx.Context, task *statsty wrapper.idxInfo = tbl.ColAndIdxExistenceMap.GetIndex(item.ID) } } else { - col, loadNeeded := tbl.ColumnIsLoadNeeded(item.ID, task.Item.FullLoad) + col, loadNeeded, analyzed := tbl.ColumnIsLoadNeeded(item.ID, task.Item.FullLoad) if !loadNeeded { return result, nil } @@ -279,6 +279,20 @@ func (s *statsSyncLoad) handleOneItemTask(sctx sessionctx.Context, task *statsty } else { wrapper.colInfo = tbl.ColAndIdxExistenceMap.GetCol(item.ID) } + // If this column is not analyzed yet and we don't have it in memory. + // We create a fake one for the pseudo estimation. + if loadNeeded && !analyzed { + wrapper.col = &statistics.Column{ + PhysicalID: item.TableID, + Info: wrapper.colInfo, + Histogram: *statistics.NewHistogram(item.ID, 0, 0, 0, &wrapper.colInfo.FieldType, 0, 0), + IsHandle: tbl.IsPkIsHandle && mysql.HasPriKeyFlag(wrapper.colInfo.GetFlag()), + } + if s.updateCachedItem(item, wrapper.col, wrapper.idx, task.Item.FullLoad) { + return result, nil + } + return nil, nil + } } t := time.Now() needUpdate := false diff --git a/pkg/statistics/table.go b/pkg/statistics/table.go index f954dc98c7057..ad1e2c1124d4d 100644 --- a/pkg/statistics/table.go +++ b/pkg/statistics/table.go @@ -626,13 +626,22 @@ func (t *Table) GetStatsHealthy() (int64, bool) { // The Column should be visible in the table and really has analyzed statistics in the stroage. // Also, if the stats has been loaded into the memory, we also don't need to load it. // We return the Column together with the checking result, to avoid accessing the map multiple times. -func (t *Table) ColumnIsLoadNeeded(id int64, fullLoad bool) (*Column, bool) { +// The first bool is whether we have it in memory. The second bool is whether this column has stats or not. +func (t *Table) ColumnIsLoadNeeded(id int64, fullLoad bool) (*Column, bool, bool) { + if t.Pseudo { + return nil, false, false + } col, ok := t.Columns[id] hasAnalyzed := t.ColAndIdxExistenceMap.HasAnalyzed(id, false) - // If it's not analyzed yet. Don't need to load it. + // If it's not analyzed yet. if !hasAnalyzed { - return nil, false + // If we don't have it in memory, we create a fake hist for pseudo estimation. + if !ok { + return nil, true, false + } + // Otherwise we don't need to load it. + return nil, false, false } // Restore the condition from the simplified form: @@ -640,11 +649,11 @@ func (t *Table) ColumnIsLoadNeeded(id int64, fullLoad bool) (*Column, bool) { // 2. ok && hasAnalyzed && fullLoad && !col.IsFullLoad => need load // 3. ok && hasAnalyzed && !fullLoad && !col.statsInitialized => need load if !ok || (fullLoad && !col.IsFullLoad()) || (!fullLoad && !col.statsInitialized) { - return col, true + return col, true, true } // Otherwise don't need load it. - return col, false + return col, false, true } // IndexIsLoadNeeded checks whether the index needs trigger the async/sync load. From 2257f977caa8f3ebe80c54ba19a9166765c1b1d9 Mon Sep 17 00:00:00 2001 From: Yiding Cui Date: Tue, 9 Apr 2024 01:04:01 +0800 Subject: [PATCH 2/5] add non-lite in test --- pkg/planner/cardinality/selectivity_test.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pkg/planner/cardinality/selectivity_test.go b/pkg/planner/cardinality/selectivity_test.go index da2eec6a8e46e..2c0ce384f49af 100644 --- a/pkg/planner/cardinality/selectivity_test.go +++ b/pkg/planner/cardinality/selectivity_test.go @@ -1337,4 +1337,12 @@ func TestBuiltinInEstWithoutStats(t *testing.T) { "└─Selection 0.08 cop[tikv] in(test.t.a, 1, 2, 3, 4, 5, 6, 7, 8)", " └─TableFullScan 10.00 cop[tikv] table:t keep order:false, stats:pseudo", )) + + h.Clear() + require.NoError(t, h.InitStats(is)) + tk.MustQuery("explain format='brief' select * from t where a in (1, 2, 3, 4, 5, 6, 7, 8)").Check(testkit.Rows( + "TableReader 0.08 root data:Selection", + "└─Selection 0.08 cop[tikv] in(test.t.a, 1, 2, 3, 4, 5, 6, 7, 8)", + " └─TableFullScan 10.00 cop[tikv] table:t keep order:false, stats:pseudo", + )) } From 9c49cdadb3d7013ece9648e2ca5d02a3edc84e09 Mon Sep 17 00:00:00 2001 From: Yiding Cui Date: Tue, 9 Apr 2024 01:27:56 +0800 Subject: [PATCH 3/5] fix bazel_prepare --- pkg/planner/cardinality/BUILD.bazel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/planner/cardinality/BUILD.bazel b/pkg/planner/cardinality/BUILD.bazel index af8f1416a89a4..1ce4d3e41b678 100644 --- a/pkg/planner/cardinality/BUILD.bazel +++ b/pkg/planner/cardinality/BUILD.bazel @@ -59,7 +59,7 @@ go_test( data = glob(["testdata/**"]), embed = [":cardinality"], flaky = True, - shard_count = 27, + shard_count = 28, deps = [ "//pkg/config", "//pkg/domain", From 04b75b0efc963cd798501e5f0bbc3dc7b62851f3 Mon Sep 17 00:00:00 2001 From: Yiding Cui Date: Tue, 9 Apr 2024 02:34:44 +0800 Subject: [PATCH 4/5] fix tests --- pkg/planner/core/collect_column_stats_usage.go | 10 ++++++++++ pkg/statistics/table.go | 5 ++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pkg/planner/core/collect_column_stats_usage.go b/pkg/planner/core/collect_column_stats_usage.go index 684ccc86deb4a..0d166d380192b 100644 --- a/pkg/planner/core/collect_column_stats_usage.go +++ b/pkg/planner/core/collect_column_stats_usage.go @@ -195,11 +195,21 @@ func (c *columnStatsUsageCollector) addHistNeededColumns(ds *DataSource) { colIDSet := intset.NewFastIntSet() for _, col := range columns { + // If the column is plan-generated one, Skip it. + // TODO: we may need to consider the ExtraHandle. + if col.ID < 0 { + continue + } tblColID := model.TableItemID{TableID: ds.physicalTableID, ID: col.ID, IsIndex: false} colIDSet.Insert(int(col.ID)) c.histNeededCols[tblColID] = true } for _, col := range ds.Columns { + // If the column is plan-generated one, Skip it. + // TODO: we may need to consider the ExtraHandle. + if col.ID < 0 { + continue + } if !colIDSet.Has(int(col.ID)) && !col.Hidden { tblColID := model.TableItemID{TableID: ds.physicalTableID, ID: col.ID, IsIndex: false} if _, ok := c.histNeededCols[tblColID]; ok { diff --git a/pkg/statistics/table.go b/pkg/statistics/table.go index ad1e2c1124d4d..163faa69da769 100644 --- a/pkg/statistics/table.go +++ b/pkg/statistics/table.go @@ -638,7 +638,10 @@ func (t *Table) ColumnIsLoadNeeded(id int64, fullLoad bool) (*Column, bool, bool if !hasAnalyzed { // If we don't have it in memory, we create a fake hist for pseudo estimation. if !ok { - return nil, true, false + // If we don't have this column. We skip it. + // It's something ridiculous. But it's possible that the stats don't have some ColumnInfo. + // We need to find a way to maintain it more correctly. + return nil, t.ColAndIdxExistenceMap.Has(id, false), false } // Otherwise we don't need to load it. return nil, false, false From d33f702b899d9a5ca0d1031e4664e660c1ece967 Mon Sep 17 00:00:00 2001 From: Yiding Cui Date: Mon, 15 Apr 2024 18:54:08 +0800 Subject: [PATCH 5/5] Apply suggestions from code review Co-authored-by: Zhou Kunqin <25057648+time-and-fate@users.noreply.github.com> --- pkg/statistics/table.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/statistics/table.go b/pkg/statistics/table.go index 163faa69da769..83b649631cd54 100644 --- a/pkg/statistics/table.go +++ b/pkg/statistics/table.go @@ -626,7 +626,7 @@ func (t *Table) GetStatsHealthy() (int64, bool) { // The Column should be visible in the table and really has analyzed statistics in the stroage. // Also, if the stats has been loaded into the memory, we also don't need to load it. // We return the Column together with the checking result, to avoid accessing the map multiple times. -// The first bool is whether we have it in memory. The second bool is whether this column has stats or not. +// The first bool is whether we have it in memory. The second bool is whether this column has stats in the system table or not. func (t *Table) ColumnIsLoadNeeded(id int64, fullLoad bool) (*Column, bool, bool) { if t.Pseudo { return nil, false, false @@ -636,7 +636,7 @@ func (t *Table) ColumnIsLoadNeeded(id int64, fullLoad bool) (*Column, bool, bool // If it's not analyzed yet. if !hasAnalyzed { - // If we don't have it in memory, we create a fake hist for pseudo estimation. + // If we don't have it in memory, we create a fake hist for pseudo estimation (see handleOneItemTask()). if !ok { // If we don't have this column. We skip it. // It's something ridiculous. But it's possible that the stats don't have some ColumnInfo.