Merge #44239

44239: sql: optimize NeededColumnFamilyIDs r=solongordon a=solongordon

I made the logic for determining which column families need to be scanned more clever in two ways:

- Previously we were always including column family 0 as a sentinel, since other column families have no KV entry if all their columns are null. This is not necessary if any of the column families being scanned have a NOT NULL column.
- If a needed column is indexed and not composite, it can be decoded from the key, so we don't need to take it into account when determining the needed column families.

Release note: None

Co-authored-by: Solon Gordon <[email protected]>
cockroachdb · Feb 11, 2020 · 5035d70 · 5035d70
2 parents 58d1a57 + 2ba11ea
commit 5035d70
Show file tree

Hide file tree

Showing 14 changed files with 308 additions and 44 deletions.
diff --git a/pkg/sql/backfill/backfill.go b/pkg/sql/backfill/backfill.go
@@ -338,7 +338,8 @@ func (ib *IndexBackfiller) Init(desc *sqlbase.ImmutableTableDescriptor) error {
 			ib.added = append(ib.added, *idx)
 			for i := range cols {
 				id := cols[i].ID
-				if idx.ContainsColumnID(id) || idx.EncodingType == sqlbase.PrimaryIndexEncoding {
+				if idx.ContainsColumnID(id) ||
+					idx.GetEncodingType(desc.PrimaryIndex.ID) == sqlbase.PrimaryIndexEncoding {
 					valNeededForCol.Add(i)
 				}
 			}

diff --git a/pkg/sql/logictest/testdata/logic_test/exec_merge_join_dist b/pkg/sql/logictest/testdata/logic_test/exec_merge_join_dist
@@ -3,10 +3,10 @@
 # Regression test for #39317.
 
 statement ok
-CREATE TABLE l (a INT PRIMARY KEY, b INT)
+CREATE TABLE l (a INT PRIMARY KEY, b INT, FAMILY (a, b))
 
 statement ok
-CREATE TABLE r (a INT PRIMARY KEY, b INT)
+CREATE TABLE r (a INT PRIMARY KEY, b INT, FAMILY (a, b))
 
 statement ok
 INSERT INTO l VALUES (1, 10), (2, 20), (3, 30)

diff --git a/pkg/sql/logictest/testdata/logic_test/family b/pkg/sql/logictest/testdata/logic_test/family
@@ -275,3 +275,156 @@ query I
 SELECT xyz.z FROM y INNER LOOKUP JOIN xyz ON y.y = xyz.y
 ----
 NULL
+
+# Tests for NeededColumnFamilyIDs logic. This function is used for point lookups
+# to determine the minimal set of column families which need to be scanned.
+subtest needed_column_families
+
+statement ok
+CREATE TABLE t1 (
+  a INT PRIMARY KEY, b INT NOT NULL, c INT, d INT,
+  FAMILY (d), FAMILY (c), FAMILY (b), FAMILY (a)
+);
+INSERT INTO t1 VALUES (10, 20, 30, 40)
+
+# A point lookup on the primary key column should use family 0 (even if the
+# column is not in that family) because the column can be decoded from the key.
+query I
+SELECT a FROM t1 WHERE a = 10
+----
+10
+
+query TT
+SELECT field, description FROM [EXPLAIN SELECT a FROM t1 WHERE a = 10] WHERE field IN ('table', 'spans')
+----
+table  t1@primary
+spans  /10/0-/10/1
+
+# A point lookup on a non-nullable column allows us to scan only that column's
+# family.
+query I
+SELECT b FROM t1 WHERE a = 10
+----
+20
+
+query TT
+SELECT field, description FROM [EXPLAIN SELECT b FROM t1 WHERE a = 10] WHERE field IN ('table', 'spans')
+----
+table  t1@primary
+spans  /10/2/1-/10/2/2
+
+# Even if we also select the primary key column, we can still scan the single
+# column family because that column can be decoded from the key.
+query II
+SELECT a, b FROM t1 WHERE a = 10
+----
+10  20
+
+query TT
+SELECT field, description FROM [EXPLAIN SELECT a, b FROM t1 WHERE a = 10] WHERE field IN ('table', 'spans')
+----
+table  t1@primary
+spans  /10/2/1-/10/2/2
+
+# A point lookup on a nullable column requires also scanning column family 0 as
+# a sentinel.
+query I
+SELECT c FROM t1 WHERE a = 10
+----
+30
+
+query TT
+SELECT field, description FROM [EXPLAIN SELECT c FROM t1 WHERE a = 10] WHERE field IN ('table', 'spans')
+----
+table  t1@primary
+spans  /10/0-/10/1/2
+
+# A point lookup on two columns in non-adjacent column families results in two
+# spans.
+query II
+SELECT b, d FROM t1 WHERE a = 10
+----
+20  40
+
+query TT
+SELECT field, description FROM [EXPLAIN SELECT b, d FROM t1 WHERE a = 10] WHERE field IN ('table', 'spans')
+----
+table  t1@primary
+spans  /10/0-/10/1 /10/2/1-/10/2/2
+
+# Unique secondary indexes store non-indexed primary key columns in column
+# family 0.
+statement ok
+CREATE UNIQUE INDEX b_idx ON t1 (b) STORING (c, d)
+
+query I
+SELECT a FROM t1 WHERE b = 20
+----
+10
+
+query TT
+SELECT field, description FROM [EXPLAIN SELECT a FROM t1 WHERE b = 20] WHERE field IN ('table', 'spans')
+----
+table  t1@b_idx
+spans  /20/0-/20/1
+
+# If the primary key column is composite, we do need to scan its column family
+# to retrieve its value.
+statement ok
+CREATE TABLE t2 (
+  a DECIMAL PRIMARY KEY, b INT, c INT NOT NULL, d INT,
+  FAMILY (d), FAMILY (c), FAMILY (b), FAMILY (a)
+);
+INSERT INTO t2 VALUES (10.00, 20, 30, 40)
+
+# A point lookup on the primary key column should use its family.
+query T
+SELECT a FROM t2 WHERE a = 10
+----
+10.00
+
+query TT
+SELECT field, description FROM [EXPLAIN SELECT a FROM t2 WHERE a = 10] WHERE field IN ('table', 'spans')
+----
+table  t2@primary
+spans  /1E+1/3/1-/1E+1/3/2
+
+# A point lookup on `a` and `b` should scan both of their families.
+query TI
+SELECT a, b FROM t2 WHERE a = 10
+----
+10.00  20
+
+query TT
+SELECT field, description FROM [EXPLAIN SELECT a, b FROM t2 WHERE a = 10] WHERE field IN ('table', 'spans')
+----
+table  t2@primary
+spans  /1E+1/2/1-/1E+1/3/2
+
+# Secondary indexes always store their composite values in column family 0.
+statement ok
+CREATE UNIQUE INDEX a_idx ON t2 (a) STORING (b, c, d)
+
+# A point lookup on the composite column should use family 0.
+query T
+SELECT a FROM t2@a_idx WHERE a = 10
+----
+10.00
+
+query TT
+SELECT field, description FROM [EXPLAIN SELECT a FROM t2@a_idx WHERE a = 10] WHERE field IN ('table', 'spans')
+----
+table  t2@a_idx
+spans  /1E+1/0-/1E+1/1
+
+# A point lookup on `a` and `b` should use column family 0 and b's family.
+query TI
+SELECT a, b FROM t2@a_idx WHERE a = 10
+----
+10.00  20
+
+query TT
+SELECT field, description FROM [EXPLAIN SELECT a, b FROM t2@a_idx WHERE a = 10] WHERE field IN ('table', 'spans')
+----
+table  t2@a_idx
+spans  /1E+1/0-/1E+1/1 /1E+1/2/1-/1E+1/2/2
diff --git a/pkg/sql/logictest/testdata/logic_test/interleaved_join b/pkg/sql/logictest/testdata/logic_test/interleaved_join
@@ -29,8 +29,9 @@
 # Create tables #
 #################
 
+# TODO(solon): Remove the FAMILY declarations when #44699 is resolved.
 statement ok
-CREATE TABLE parent1 (pid1 INT PRIMARY KEY, pa1 INT)
+CREATE TABLE parent1 (pid1 INT PRIMARY KEY, pa1 INT, FAMILY (pid1), FAMILY (pa1))
 
 statement ok
 CREATE TABLE parent2 (pid2 INT PRIMARY KEY, pa2 INT)

diff --git a/pkg/sql/logictest/testdata/logic_test/vectorize b/pkg/sql/logictest/testdata/logic_test/vectorize
@@ -703,7 +703,10 @@ SET tracing=off
 # Making sure that colBatchScan operator can parallelize scans.
 # This test is similar to that in testplannerlogic/select
 statement ok
-CREATE TABLE tpar (a INT PRIMARY KEY, item STRING, price FLOAT, UNIQUE INDEX item (item), UNIQUE INDEX p (price))
+CREATE TABLE tpar (
+    a INT PRIMARY KEY, item STRING, price FLOAT, FAMILY (a, item, price),
+    UNIQUE INDEX item (item), UNIQUE INDEX p (price)
+)
 
 statement ok
 ALTER TABLE tpar SPLIT AT VALUES(5)

diff --git a/pkg/sql/opt/exec/execbuilder/testdata/fk_opt b/pkg/sql/opt/exec/execbuilder/testdata/fk_opt
@@ -10,10 +10,10 @@ SET enable_insert_fast_path = false
 # -- Tests with INSERT --
 
 statement ok
-CREATE TABLE parent (p INT PRIMARY KEY, other INT UNIQUE)
+CREATE TABLE parent (p INT PRIMARY KEY, other INT UNIQUE, FAMILY (p, other))
 
 statement ok
-CREATE TABLE child (c INT PRIMARY KEY, p INT NOT NULL REFERENCES parent(p))
+CREATE TABLE child (c INT PRIMARY KEY, p INT NOT NULL REFERENCES parent(p), FAMILY (c, p))
 
 query TTT
 EXPLAIN INSERT INTO child VALUES (1,1), (2,2)

diff --git a/pkg/sql/opt/exec/execbuilder/testdata/interleaved b/pkg/sql/opt/exec/execbuilder/testdata/interleaved
@@ -31,6 +31,7 @@ CREATE TABLE level4 (
   k1 INT,
   k2 INT,
   k3 INT,
+  FAMILY (k1, k2, k3),
   PRIMARY KEY (k1, k2, k3),
   CONSTRAINT fk3 FOREIGN KEY (k1, k2, k3) REFERENCES level3
 ) INTERLEAVE IN PARENT level3 (k1, k2, k3)

diff --git a/pkg/sql/opt/exec/execbuilder/testdata/join_order b/pkg/sql/opt/exec/execbuilder/testdata/join_order
@@ -23,7 +23,8 @@ CREATE TABLE abc (
   a INT PRIMARY KEY,
   b INT,
   c INT,
-  d INT
+  d INT,
+  FAMILY (a, b, c, d)
 )
 
 statement ok

diff --git a/pkg/sql/opt/exec/execbuilder/testdata/select b/pkg/sql/opt/exec/execbuilder/testdata/select
@@ -269,7 +269,7 @@ EXPLAIN (VERBOSE) SELECT * FROM [53(1) AS num_ref_alias]
 # Basic filter combinations.
 # ------------------------------------------------------------------------------
 statement ok
-CREATE TABLE a (x INT PRIMARY KEY, y INT);
+CREATE TABLE a (x INT PRIMARY KEY, y INT, FAMILY (x, y));
 
 query TTTTT
 EXPLAIN (VERBOSE) SELECT * FROM a WHERE x > 1
@@ -1237,7 +1237,7 @@ filter     ·            ·                  (x, y, z)  +x,+z
 # Verify that multi-span point lookups are parallelized.
 # ------------------------------------------------------
 statement ok
-CREATE TABLE a (a INT PRIMARY KEY, item STRING, price FLOAT, UNIQUE INDEX item (item), UNIQUE INDEX p (price))
+CREATE TABLE a (a INT PRIMARY KEY, item STRING, price FLOAT, FAMILY (a, item, price), UNIQUE INDEX item (item), UNIQUE INDEX p (price))
 
 statement ok
 CREATE TABLE b (a INT, b INT, c INT NULL, d INT NULL, PRIMARY KEY (a, b), FAMILY (a, b, c, d))

diff --git a/pkg/sql/opt/exec/execbuilder/testdata/upsert b/pkg/sql/opt/exec/execbuilder/testdata/upsert
@@ -123,6 +123,7 @@ CREATE TABLE indexed (
   b INT,
   c INT DEFAULT(10),
   d INT AS (a + c) STORED,
+  FAMILY (a, b, c, d),
   UNIQUE INDEX secondary (d, b),
   CHECK (c > 0)
 )

diff --git a/pkg/sql/row/fetcher.go b/pkg/sql/row/fetcher.go
@@ -893,7 +893,7 @@ func (rf *Fetcher) processKV(
 	}
 
 	// For covering secondary indexes, allow for decoding as a primary key.
-	if (!table.isSecondaryIndex || table.index.EncodingType == sqlbase.PrimaryIndexEncoding) &&
+	if table.index.GetEncodingType(table.desc.PrimaryIndex.ID) == sqlbase.PrimaryIndexEncoding &&
 		len(rf.keyRemainingBytes) > 0 {
 		// If familyID is 0, kv.Value contains values for composite key columns.
 		// These columns already have a table.row value assigned above, but that value

diff --git a/pkg/sql/span/span_builder.go b/pkg/sql/span/span_builder.go
@@ -100,7 +100,7 @@ func MakeBuilder(table *sqlbase.TableDescriptor, index *sqlbase.IndexDescriptor)
 // SetNeededColumns sets the needed columns on the Builder. This information
 // is used by MaybeSplitSpanIntoSeparateFamilies.
 func (s *Builder) SetNeededColumns(neededCols util.FastIntSet) {
-	s.neededFamilies = sqlbase.NeededColumnFamilyIDs(s.table.ColumnIdxMap(), s.table.Families, neededCols)
+	s.neededFamilies = sqlbase.NeededColumnFamilyIDs(neededCols, s.table, s.index)
 }
 
 // UnsetNeededColumns resets the needed columns for column family specific optimizations
@@ -259,7 +259,7 @@ func (s *Builder) appendSpansFromConstraintSpan(
 	// families, only scan the relevant column families. This is disabled for
 	// deletions to ensure that the entire row is deleted.
 	if !forDelete && needed.Len() > 0 && span.Key.Equal(span.EndKey) {
-		neededFamilyIDs := sqlbase.NeededColumnFamilyIDs(s.table.ColumnIdxMap(), s.table.Families, needed)
+		neededFamilyIDs := sqlbase.NeededColumnFamilyIDs(needed, s.table, s.index)
 		if s.CanSplitSpanIntoSeparateFamilies(len(neededFamilyIDs), cs.StartKey().Length(), containsNull) {
 			return sqlbase.SplitSpanIntoSeparateFamilies(appendTo, span, neededFamilyIDs), nil
 		}
-Original file line number
+Diff line change
@@ Expand Up / @@ -23,7 +23,8 @@ CREATE TABLE abc ( @@
       a INT PRIMARY KEY,
       b INT,
       c INT,
-      d INT
+      d INT,
+      FAMILY (a, b, c, d)
     )
     statement ok
@@ Expand Down @@