Merge #128020

128020: tenantcostmodel: update cost model to reflect batching changes r=andy-kimball a=andy-kimball A previous change updated the eCPU model to count all requests in read/write batches, rather than a subset of requests. This PR updates the cost of each request to reflect that change, based on a suite of tests. Changing the cost of requests has a ripple effect across other costs, so they changed as well. The resulting cost model has fewer outliers in testing and should be more resilient to future changes to KV batches/requests (e.g. adding a new request type). Epic: https://cockroachlabs.atlassian.net/browse/CC-28471 Release note: None Co-authored-by: Andrew Kimball <[email protected]>
cockroachdb · Jul 31, 2024 · 7a1bf52 · 7a1bf52
2 parents d4faecd + ffcc2b0
commit 7a1bf52
Show file tree

Hide file tree

Showing 6 changed files with 111 additions and 77 deletions.
diff --git a/pkg/ccl/multitenantccl/tenantcostclient/tenant_side_test.go b/pkg/ccl/multitenantccl/tenantcostclient/tenant_side_test.go
@@ -1822,7 +1822,10 @@ func TestCPUModelSettingsChanged(t *testing.T) {
 	newModel := `
 	{
 	  "ReadBatchCost": 1,
-	  "ReadRequestCost": 2,
+	  "ReadRequestCost": {
+		"BatchSize": [1, 2, 3],
+		"CPUPerRequest": [0.1, 0.2, 0.3]
+	  },
 	  "ReadBytesCost": {
 		"PayloadSize": [1, 2, 3],
 		"CPUPerByte": [0.5, 1, 1.5]

diff --git a/pkg/ccl/multitenantccl/tenantcostclient/testdata/estimated-cpu b/pkg/ccl/multitenantccl/tenantcostclient/testdata/estimated-cpu
@@ -18,10 +18,10 @@ token-bucket
 write repeat=35 count=6 bytes=2048 localities=same-zone
 ----
 
-# Expect ~283 tokens to be consumed.
+# Expect ~235 tokens to be consumed.
 token-bucket
 ----
-4717.33 tokens filling @ 0.00 tokens/s
+4764.99 tokens filling @ 0.00 tokens/s
 
 metrics
 ----
@@ -38,8 +38,8 @@ tenant.sql_usage.pgwire_egress_bytes: 0
 tenant.sql_usage.external_io_ingress_bytes: 0
 tenant.sql_usage.external_io_egress_bytes: 0
 tenant.sql_usage.cross_region_network_ru: 0.00
-tenant.sql_usage.estimated_kv_cpu_seconds: 0.28
-tenant.sql_usage.estimated_cpu_seconds: 0.28
+tenant.sql_usage.estimated_kv_cpu_seconds: 0.24
+tenant.sql_usage.estimated_cpu_seconds: 0.24
 tenant.sql_usage.estimated_replication_bytes: 145460
 tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 145460
 
@@ -56,10 +56,10 @@ advance wait=true
 ----
 00:00:01.000
 
-# ~31 tokens removed from bucket to account for background CPU.
+# ~25 tokens removed from bucket to account for background CPU.
 token-bucket
 ----
-4686.71 tokens filling @ 0.00 tokens/s
+4739.54 tokens filling @ 0.00 tokens/s
 
 metrics
 ----
@@ -76,8 +76,8 @@ tenant.sql_usage.pgwire_egress_bytes: 0
 tenant.sql_usage.external_io_ingress_bytes: 0
 tenant.sql_usage.external_io_egress_bytes: 0
 tenant.sql_usage.cross_region_network_ru: 0.00
-tenant.sql_usage.estimated_kv_cpu_seconds: 0.28
-tenant.sql_usage.estimated_cpu_seconds: 0.31
+tenant.sql_usage.estimated_kv_cpu_seconds: 0.24
+tenant.sql_usage.estimated_cpu_seconds: 0.26
 tenant.sql_usage.estimated_replication_bytes: 145460
 tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 145460
 
@@ -109,10 +109,10 @@ advance wait=true
 ----
 00:00:42.000
 
-# Expect ~254 tokens to be removed, as compared to ~314 above (283 + 31).
+# Expect ~208 tokens to be removed, as compared to ~260 above (235 + 25).
 token-bucket
 ----
-4432.93 tokens filling @ 0.00 tokens/s
+4531.69 tokens filling @ 0.00 tokens/s
 
 metrics
 ----
@@ -129,8 +129,8 @@ tenant.sql_usage.pgwire_egress_bytes: 0
 tenant.sql_usage.external_io_ingress_bytes: 0
 tenant.sql_usage.external_io_egress_bytes: 0
 tenant.sql_usage.cross_region_network_ru: 0.00
-tenant.sql_usage.estimated_kv_cpu_seconds: 0.51
-tenant.sql_usage.estimated_cpu_seconds: 0.57
+tenant.sql_usage.estimated_kv_cpu_seconds: 0.42
+tenant.sql_usage.estimated_cpu_seconds: 0.47
 tenant.sql_usage.estimated_replication_bytes: 290920
 tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 218190
 tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az2"}: 72730
@@ -163,10 +163,10 @@ advance wait=true
 ----
 00:00:53.000
 
-# Expect ~254 tokens to be consumed, like above.
+# Expect ~208 tokens to be consumed, like above.
 token-bucket
 ----
-4179.15 tokens filling @ 0.00 tokens/s
+4323.84 tokens filling @ 0.00 tokens/s
 
 metrics
 ----
@@ -183,8 +183,8 @@ tenant.sql_usage.pgwire_egress_bytes: 0
 tenant.sql_usage.external_io_ingress_bytes: 0
 tenant.sql_usage.external_io_egress_bytes: 0
 tenant.sql_usage.cross_region_network_ru: 0.00
-tenant.sql_usage.estimated_kv_cpu_seconds: 0.74
-tenant.sql_usage.estimated_cpu_seconds: 0.82
+tenant.sql_usage.estimated_kv_cpu_seconds: 0.61
+tenant.sql_usage.estimated_cpu_seconds: 0.68
 tenant.sql_usage.estimated_replication_bytes: 436380
 tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 290920
 tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az2"}: 145460
@@ -201,7 +201,7 @@ advance wait=true
 
 token-bucket
 ----
-2787.54 tokens filling @ 0.00 tokens/s
+2466.99 tokens filling @ 0.00 tokens/s
 
 metrics
 ----
@@ -218,8 +218,8 @@ tenant.sql_usage.pgwire_egress_bytes: 0
 tenant.sql_usage.external_io_ingress_bytes: 0
 tenant.sql_usage.external_io_egress_bytes: 0
 tenant.sql_usage.cross_region_network_ru: 0.00
-tenant.sql_usage.estimated_kv_cpu_seconds: 2.00
-tenant.sql_usage.estimated_cpu_seconds: 2.21
+tenant.sql_usage.estimated_kv_cpu_seconds: 2.29
+tenant.sql_usage.estimated_cpu_seconds: 2.53
 tenant.sql_usage.estimated_replication_bytes: 436380
 tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 290920
 tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az2"}: 145460
@@ -237,7 +237,7 @@ advance wait=true
 
 token-bucket
 ----
-1690.29 tokens filling @ 0.00 tokens/s
+1369.74 tokens filling @ 0.00 tokens/s
 
 metrics
 ----
@@ -254,8 +254,8 @@ tenant.sql_usage.pgwire_egress_bytes: 0
 tenant.sql_usage.external_io_ingress_bytes: 0
 tenant.sql_usage.external_io_egress_bytes: 0
 tenant.sql_usage.cross_region_network_ru: 0.00
-tenant.sql_usage.estimated_kv_cpu_seconds: 2.00
-tenant.sql_usage.estimated_cpu_seconds: 3.31
+tenant.sql_usage.estimated_kv_cpu_seconds: 2.29
+tenant.sql_usage.estimated_cpu_seconds: 3.63
 tenant.sql_usage.estimated_replication_bytes: 436380
 tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 290920
 tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az2"}: 145460
@@ -274,7 +274,7 @@ advance wait=true
 
 token-bucket
 ----
-1690.29 tokens filling @ 0.00 tokens/s
+1369.74 tokens filling @ 0.00 tokens/s
 
 metrics
 ----
@@ -291,8 +291,8 @@ tenant.sql_usage.pgwire_egress_bytes: 0
 tenant.sql_usage.external_io_ingress_bytes: 1024000
 tenant.sql_usage.external_io_egress_bytes: 1024000
 tenant.sql_usage.cross_region_network_ru: 0.00
-tenant.sql_usage.estimated_kv_cpu_seconds: 2.00
-tenant.sql_usage.estimated_cpu_seconds: 3.31
+tenant.sql_usage.estimated_kv_cpu_seconds: 2.29
+tenant.sql_usage.estimated_cpu_seconds: 3.63
 tenant.sql_usage.estimated_replication_bytes: 436380
 tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 290920
 tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az2"}: 145460
@@ -309,7 +309,7 @@ advance wait=true
 
 token-bucket
 ----
-1690.29 tokens filling @ 0.00 tokens/s
+1369.74 tokens filling @ 0.00 tokens/s
 
 metrics
 ----
@@ -326,8 +326,8 @@ tenant.sql_usage.pgwire_egress_bytes: 12345
 tenant.sql_usage.external_io_ingress_bytes: 1024000
 tenant.sql_usage.external_io_egress_bytes: 1024000
 tenant.sql_usage.cross_region_network_ru: 0.00
-tenant.sql_usage.estimated_kv_cpu_seconds: 2.00
-tenant.sql_usage.estimated_cpu_seconds: 3.31
+tenant.sql_usage.estimated_kv_cpu_seconds: 2.29
+tenant.sql_usage.estimated_cpu_seconds: 3.63
 tenant.sql_usage.estimated_replication_bytes: 436380
 tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 290920
 tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az2"}: 145460
@@ -345,7 +345,7 @@ token-bucket-response
 
 token-bucket
 ----
-1690.29 tokens filling @ 0.00 tokens/s
+1369.74 tokens filling @ 0.00 tokens/s
 
 # Perform cross-region write request.
 write count=100 bytes=1000 localities=cross-region
@@ -359,7 +359,7 @@ advance wait=true
 
 token-bucket
 ----
-1652.30 tokens filling @ 0.00 tokens/s
+1335.90 tokens filling @ 0.00 tokens/s
 
 metrics
 ----
@@ -376,8 +376,8 @@ tenant.sql_usage.pgwire_egress_bytes: 12345
 tenant.sql_usage.external_io_ingress_bytes: 1024000
 tenant.sql_usage.external_io_egress_bytes: 1024000
 tenant.sql_usage.cross_region_network_ru: 0.00
-tenant.sql_usage.estimated_kv_cpu_seconds: 2.03
-tenant.sql_usage.estimated_cpu_seconds: 3.35
+tenant.sql_usage.estimated_kv_cpu_seconds: 2.32
+tenant.sql_usage.estimated_cpu_seconds: 3.66
 tenant.sql_usage.estimated_replication_bytes: 441980
 tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="",to_region="europe-west1",to_zone=""}: 2800
 tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 292320
@@ -397,4 +397,4 @@ advance wait=true
 
 token-bucket
 ----
-1652.30 tokens filling @ 0.00 tokens/s
+1335.90 tokens filling @ 0.00 tokens/s
diff --git a/pkg/multitenant/tenantcostmodel/ecpu_model.go b/pkg/multitenant/tenantcostmodel/ecpu_model.go
@@ -36,9 +36,14 @@ type EstimatedCPUModel struct {
 	// ReadBatchCost is the amount of KV CPU needed to process 1 read batch
 	// containing 1 read request with a 1-byte payload.
 	ReadBatchCost EstimatedCPU
-	// ReadRequestCost is the amount of KV CPU needed to process each additional
-	// read request in a batch, beyond the first.
-	ReadRequestCost EstimatedCPU
+	// ReadRequestCost is a lookup table that maps from the number of requests
+	// in a read batch, to the amount of KV CPU used to process each additional
+	// read request in a batch, beyond the first. As the batch size increases,
+	// each KV CPU can process more read requests.
+	ReadRequestCost struct {
+		BatchSize     []float64
+		CPUPerRequest []EstimatedCPU
+	}
 	// ReadBytesCost is a lookup table that maps from the total payload size of
 	// a read batch, to the amount of KV CPU needed to process those bytes. As
 	// the payload size increases, each KV CPU can process more bytes.
@@ -56,8 +61,10 @@ type EstimatedCPUModel struct {
 	}
 	// WriteRequestCost is a lookup table that maps from the number of requests
 	// in a write batch, to the amount of KV CPU used to process each additional
-	// write request in a batch, beyond the first. As the batch size increases,
-	// each KV CPU can process more write requests.
+	// write request in a batch, beyond the second. The second request in simple
+	// write batches is often an EndTxn request, which has near-zero cost in the
+	// fast path. As the batch size increases, each KV CPU can process more write
+	// requests.
 	WriteRequestCost struct {
 		BatchSize     []float64
 		CPUPerRequest []EstimatedCPU
@@ -85,20 +92,28 @@ type EstimatedCPUModel struct {
 // DefaultEstimatedCPUModel is the default model that is used if the
 // tenant_cost_model.estimated_cpu cluster setting is not specified.
 var DefaultEstimatedCPUModel = EstimatedCPUModel{
-	ReadBatchCost:   1.0 / 3500,
-	ReadRequestCost: 1.0 / 45000,
+	ReadBatchCost: 1.0 / 3500,
+	ReadRequestCost: struct {
+		BatchSize     []float64
+		CPUPerRequest []EstimatedCPU
+	}{
+		BatchSize: []float64{8, 16, 32, 64},
+		CPUPerRequest: []EstimatedCPU{
+			1.0 / 16500, 1.0 / 26700, 1.0 / 35000, 1.0 / 40400,
+		},
+	},
 	ReadBytesCost: struct {
 		PayloadSize []float64
 		CPUPerByte  []EstimatedCPU
 	}{
 		PayloadSize: []float64{256, 1024, 4 * 1024, 16 * 1024, 64 * 1024, 256 * 1024},
 		CPUPerByte: []EstimatedCPU{
-			1.0 / 1.5 / 1024 / 1024,
-			1.0 / 5.5 / 1024 / 1024,
-			1.0 / 12 / 1024 / 1024,
-			1.0 / 34 / 1024 / 1024,
-			1.0 / 64 / 1024 / 1024,
-			1.0 / 89 / 1024 / 1024,
+			1.0 / 3 / 1024 / 1024,
+			1.0 / 5 / 1024 / 1024,
+			1.0 / 9 / 1024 / 1024,
+			1.0 / 27 / 1024 / 1024,
+			1.0 / 62 / 1024 / 1024,
+			1.0 / 106 / 1024 / 1024,
 		},
 	},
 	WriteBatchCost: struct {
@@ -107,28 +122,29 @@ var DefaultEstimatedCPUModel = EstimatedCPUModel{
 	}{
 		RatePerNode: []float64{100, 200, 400, 800, 1600, 3200, 6400, 12800},
 		CPUPerBatch: []EstimatedCPU{
-			1.0 / 660, 1.0 / 850, 1.0 / 1090, 1.0 / 1400, 1.0 / 1790, 1.0 / 2290, 1.0 / 2930, 1.0 / 3150,
+			1.0 / 700, 1.0 / 900, 1.0 / 1100, 1.0 / 1300, 1.0 / 1700, 1.0 / 2200, 1.0 / 2700, 1.0 / 3150,
 		},
 	},
 	WriteRequestCost: struct {
 		BatchSize     []float64
 		CPUPerRequest []EstimatedCPU
 	}{
-		BatchSize: []float64{3, 6, 12, 25, 50, 100, 200},
+		BatchSize: []float64{2, 3, 6, 11, 22, 43, 84},
 		CPUPerRequest: []EstimatedCPU{
-			1.0 / 2500, 1.0 / 5250, 1.0 / 9050, 1.0 / 11900, 1.0 / 15400, 1.0 / 17400, 1.0 / 19000},
+			1.0 / 1100, 1.0 / 2700, 1.0 / 6400, 1.0 / 10200, 1.0 / 14600, 1.0 / 18500, 1.0 / 19600,
+		},
 	},
 	WriteBytesCost: struct {
 		PayloadSize []float64
 		CPUPerByte  []EstimatedCPU
 	}{
 		PayloadSize: []float64{256, 1024, 4 * 1024, 16 * 1024, 64 * 1024},
 		CPUPerByte: []EstimatedCPU{
-			1.0 / 3.75 / 1024 / 1024,
-			1.0 / 8 / 1024 / 1024,
-			1.0 / 11 / 1024 / 1024,
-			1.0 / 14 / 1024 / 1024,
-			1.0 / 18.5 / 1024 / 1024,
+			1.0 / 9 / 1024 / 1024,
+			1.0 / 10 / 1024 / 1024,
+			1.0 / 12.5 / 1024 / 1024,
+			1.0 / 15 / 1024 / 1024,
+			1.0 / 19 / 1024 / 1024,
 		},
 	},
 	BackgroundCPU: struct {
@@ -183,7 +199,9 @@ func (m *EstimatedCPUModel) BatchCost(
 
 		// Add cost for additional requests in the batch, beyond the first.
 		if bi.ReadCount > 1 {
-			readCPU += m.ReadRequestCost * EstimatedCPU(bi.ReadCount-1)
+			ecpuPerRequest := m.lookupCost(
+				m.ReadRequestCost.BatchSize, m.ReadRequestCost.CPUPerRequest, float64(bi.ReadCount))
+			readCPU += ecpuPerRequest * EstimatedCPU(bi.ReadCount-1)
 		}
 
 		// Add cost for bytes in the requests.
@@ -196,11 +214,12 @@ func (m *EstimatedCPUModel) BatchCost(
 		// Add cost for the batch.
 		writeCPU = m.lookupCost(m.WriteBatchCost.RatePerNode, m.WriteBatchCost.CPUPerBatch, ratePerNode)
 
-		// Add cost for additional requests in the batch, beyond the first.
-		if bi.WriteCount > 1 {
+		// Add cost for additional requests in the batch, beyond the second (see
+		// EstimatedCPUModel.WriteRequestCost comment for further details).
+		if bi.WriteCount > 2 {
 			ecpuPerRequest := m.lookupCost(
 				m.WriteRequestCost.BatchSize, m.WriteRequestCost.CPUPerRequest, float64(bi.WriteCount))
-			writeCPU += ecpuPerRequest * EstimatedCPU(bi.WriteCount-1)
+			writeCPU += ecpuPerRequest * EstimatedCPU(bi.WriteCount-2)
 		}
 
 		// Add cost for bytes in the requests.