Skip to content

Commit

Permalink
Merge #128020
Browse files Browse the repository at this point in the history
128020: tenantcostmodel: update cost model to reflect batching changes r=andy-kimball a=andy-kimball

A previous change updated the eCPU model to count all requests in read/write batches, rather than a subset of requests. This PR updates the cost of each request to reflect that change, based on a suite of tests. Changing the cost of requests has a ripple effect across other costs, so they changed as well. The resulting cost model has fewer outliers in testing and should be more resilient to future changes to KV batches/requests (e.g. adding a new request type).

Epic: https://cockroachlabs.atlassian.net/browse/CC-28471

Release note: None

Co-authored-by: Andrew Kimball <[email protected]>
  • Loading branch information
craig[bot] and andy-kimball committed Jul 31, 2024
2 parents d4faecd + ffcc2b0 commit 7a1bf52
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 77 deletions.
5 changes: 4 additions & 1 deletion pkg/ccl/multitenantccl/tenantcostclient/tenant_side_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1822,7 +1822,10 @@ func TestCPUModelSettingsChanged(t *testing.T) {
newModel := `
{
"ReadBatchCost": 1,
"ReadRequestCost": 2,
"ReadRequestCost": {
"BatchSize": [1, 2, 3],
"CPUPerRequest": [0.1, 0.2, 0.3]
},
"ReadBytesCost": {
"PayloadSize": [1, 2, 3],
"CPUPerByte": [0.5, 1, 1.5]
Expand Down
66 changes: 33 additions & 33 deletions pkg/ccl/multitenantccl/tenantcostclient/testdata/estimated-cpu
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ token-bucket
write repeat=35 count=6 bytes=2048 localities=same-zone
----

# Expect ~283 tokens to be consumed.
# Expect ~235 tokens to be consumed.
token-bucket
----
4717.33 tokens filling @ 0.00 tokens/s
4764.99 tokens filling @ 0.00 tokens/s

metrics
----
Expand All @@ -38,8 +38,8 @@ tenant.sql_usage.pgwire_egress_bytes: 0
tenant.sql_usage.external_io_ingress_bytes: 0
tenant.sql_usage.external_io_egress_bytes: 0
tenant.sql_usage.cross_region_network_ru: 0.00
tenant.sql_usage.estimated_kv_cpu_seconds: 0.28
tenant.sql_usage.estimated_cpu_seconds: 0.28
tenant.sql_usage.estimated_kv_cpu_seconds: 0.24
tenant.sql_usage.estimated_cpu_seconds: 0.24
tenant.sql_usage.estimated_replication_bytes: 145460
tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 145460

Expand All @@ -56,10 +56,10 @@ advance wait=true
----
00:00:01.000

# ~31 tokens removed from bucket to account for background CPU.
# ~25 tokens removed from bucket to account for background CPU.
token-bucket
----
4686.71 tokens filling @ 0.00 tokens/s
4739.54 tokens filling @ 0.00 tokens/s

metrics
----
Expand All @@ -76,8 +76,8 @@ tenant.sql_usage.pgwire_egress_bytes: 0
tenant.sql_usage.external_io_ingress_bytes: 0
tenant.sql_usage.external_io_egress_bytes: 0
tenant.sql_usage.cross_region_network_ru: 0.00
tenant.sql_usage.estimated_kv_cpu_seconds: 0.28
tenant.sql_usage.estimated_cpu_seconds: 0.31
tenant.sql_usage.estimated_kv_cpu_seconds: 0.24
tenant.sql_usage.estimated_cpu_seconds: 0.26
tenant.sql_usage.estimated_replication_bytes: 145460
tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 145460

Expand Down Expand Up @@ -109,10 +109,10 @@ advance wait=true
----
00:00:42.000

# Expect ~254 tokens to be removed, as compared to ~314 above (283 + 31).
# Expect ~208 tokens to be removed, as compared to ~260 above (235 + 25).
token-bucket
----
4432.93 tokens filling @ 0.00 tokens/s
4531.69 tokens filling @ 0.00 tokens/s

metrics
----
Expand All @@ -129,8 +129,8 @@ tenant.sql_usage.pgwire_egress_bytes: 0
tenant.sql_usage.external_io_ingress_bytes: 0
tenant.sql_usage.external_io_egress_bytes: 0
tenant.sql_usage.cross_region_network_ru: 0.00
tenant.sql_usage.estimated_kv_cpu_seconds: 0.51
tenant.sql_usage.estimated_cpu_seconds: 0.57
tenant.sql_usage.estimated_kv_cpu_seconds: 0.42
tenant.sql_usage.estimated_cpu_seconds: 0.47
tenant.sql_usage.estimated_replication_bytes: 290920
tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 218190
tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az2"}: 72730
Expand Down Expand Up @@ -163,10 +163,10 @@ advance wait=true
----
00:00:53.000

# Expect ~254 tokens to be consumed, like above.
# Expect ~208 tokens to be consumed, like above.
token-bucket
----
4179.15 tokens filling @ 0.00 tokens/s
4323.84 tokens filling @ 0.00 tokens/s

metrics
----
Expand All @@ -183,8 +183,8 @@ tenant.sql_usage.pgwire_egress_bytes: 0
tenant.sql_usage.external_io_ingress_bytes: 0
tenant.sql_usage.external_io_egress_bytes: 0
tenant.sql_usage.cross_region_network_ru: 0.00
tenant.sql_usage.estimated_kv_cpu_seconds: 0.74
tenant.sql_usage.estimated_cpu_seconds: 0.82
tenant.sql_usage.estimated_kv_cpu_seconds: 0.61
tenant.sql_usage.estimated_cpu_seconds: 0.68
tenant.sql_usage.estimated_replication_bytes: 436380
tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 290920
tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az2"}: 145460
Expand All @@ -201,7 +201,7 @@ advance wait=true

token-bucket
----
2787.54 tokens filling @ 0.00 tokens/s
2466.99 tokens filling @ 0.00 tokens/s

metrics
----
Expand All @@ -218,8 +218,8 @@ tenant.sql_usage.pgwire_egress_bytes: 0
tenant.sql_usage.external_io_ingress_bytes: 0
tenant.sql_usage.external_io_egress_bytes: 0
tenant.sql_usage.cross_region_network_ru: 0.00
tenant.sql_usage.estimated_kv_cpu_seconds: 2.00
tenant.sql_usage.estimated_cpu_seconds: 2.21
tenant.sql_usage.estimated_kv_cpu_seconds: 2.29
tenant.sql_usage.estimated_cpu_seconds: 2.53
tenant.sql_usage.estimated_replication_bytes: 436380
tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 290920
tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az2"}: 145460
Expand All @@ -237,7 +237,7 @@ advance wait=true

token-bucket
----
1690.29 tokens filling @ 0.00 tokens/s
1369.74 tokens filling @ 0.00 tokens/s

metrics
----
Expand All @@ -254,8 +254,8 @@ tenant.sql_usage.pgwire_egress_bytes: 0
tenant.sql_usage.external_io_ingress_bytes: 0
tenant.sql_usage.external_io_egress_bytes: 0
tenant.sql_usage.cross_region_network_ru: 0.00
tenant.sql_usage.estimated_kv_cpu_seconds: 2.00
tenant.sql_usage.estimated_cpu_seconds: 3.31
tenant.sql_usage.estimated_kv_cpu_seconds: 2.29
tenant.sql_usage.estimated_cpu_seconds: 3.63
tenant.sql_usage.estimated_replication_bytes: 436380
tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 290920
tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az2"}: 145460
Expand All @@ -274,7 +274,7 @@ advance wait=true

token-bucket
----
1690.29 tokens filling @ 0.00 tokens/s
1369.74 tokens filling @ 0.00 tokens/s

metrics
----
Expand All @@ -291,8 +291,8 @@ tenant.sql_usage.pgwire_egress_bytes: 0
tenant.sql_usage.external_io_ingress_bytes: 1024000
tenant.sql_usage.external_io_egress_bytes: 1024000
tenant.sql_usage.cross_region_network_ru: 0.00
tenant.sql_usage.estimated_kv_cpu_seconds: 2.00
tenant.sql_usage.estimated_cpu_seconds: 3.31
tenant.sql_usage.estimated_kv_cpu_seconds: 2.29
tenant.sql_usage.estimated_cpu_seconds: 3.63
tenant.sql_usage.estimated_replication_bytes: 436380
tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 290920
tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az2"}: 145460
Expand All @@ -309,7 +309,7 @@ advance wait=true

token-bucket
----
1690.29 tokens filling @ 0.00 tokens/s
1369.74 tokens filling @ 0.00 tokens/s

metrics
----
Expand All @@ -326,8 +326,8 @@ tenant.sql_usage.pgwire_egress_bytes: 12345
tenant.sql_usage.external_io_ingress_bytes: 1024000
tenant.sql_usage.external_io_egress_bytes: 1024000
tenant.sql_usage.cross_region_network_ru: 0.00
tenant.sql_usage.estimated_kv_cpu_seconds: 2.00
tenant.sql_usage.estimated_cpu_seconds: 3.31
tenant.sql_usage.estimated_kv_cpu_seconds: 2.29
tenant.sql_usage.estimated_cpu_seconds: 3.63
tenant.sql_usage.estimated_replication_bytes: 436380
tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 290920
tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az2"}: 145460
Expand All @@ -345,7 +345,7 @@ token-bucket-response

token-bucket
----
1690.29 tokens filling @ 0.00 tokens/s
1369.74 tokens filling @ 0.00 tokens/s

# Perform cross-region write request.
write count=100 bytes=1000 localities=cross-region
Expand All @@ -359,7 +359,7 @@ advance wait=true

token-bucket
----
1652.30 tokens filling @ 0.00 tokens/s
1335.90 tokens filling @ 0.00 tokens/s

metrics
----
Expand All @@ -376,8 +376,8 @@ tenant.sql_usage.pgwire_egress_bytes: 12345
tenant.sql_usage.external_io_ingress_bytes: 1024000
tenant.sql_usage.external_io_egress_bytes: 1024000
tenant.sql_usage.cross_region_network_ru: 0.00
tenant.sql_usage.estimated_kv_cpu_seconds: 2.03
tenant.sql_usage.estimated_cpu_seconds: 3.35
tenant.sql_usage.estimated_kv_cpu_seconds: 2.32
tenant.sql_usage.estimated_cpu_seconds: 3.66
tenant.sql_usage.estimated_replication_bytes: 441980
tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="",to_region="europe-west1",to_zone=""}: 2800
tenant.sql_usage.estimated_replication_bytes{from_region="us-central1",from_zone="az1",to_region="us-central1",to_zone="az1"}: 292320
Expand All @@ -397,4 +397,4 @@ advance wait=true

token-bucket
----
1652.30 tokens filling @ 0.00 tokens/s
1335.90 tokens filling @ 0.00 tokens/s
69 changes: 44 additions & 25 deletions pkg/multitenant/tenantcostmodel/ecpu_model.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,14 @@ type EstimatedCPUModel struct {
// ReadBatchCost is the amount of KV CPU needed to process 1 read batch
// containing 1 read request with a 1-byte payload.
ReadBatchCost EstimatedCPU
// ReadRequestCost is the amount of KV CPU needed to process each additional
// read request in a batch, beyond the first.
ReadRequestCost EstimatedCPU
// ReadRequestCost is a lookup table that maps from the number of requests
// in a read batch, to the amount of KV CPU used to process each additional
// read request in a batch, beyond the first. As the batch size increases,
// each KV CPU can process more read requests.
ReadRequestCost struct {
BatchSize []float64
CPUPerRequest []EstimatedCPU
}
// ReadBytesCost is a lookup table that maps from the total payload size of
// a read batch, to the amount of KV CPU needed to process those bytes. As
// the payload size increases, each KV CPU can process more bytes.
Expand All @@ -56,8 +61,10 @@ type EstimatedCPUModel struct {
}
// WriteRequestCost is a lookup table that maps from the number of requests
// in a write batch, to the amount of KV CPU used to process each additional
// write request in a batch, beyond the first. As the batch size increases,
// each KV CPU can process more write requests.
// write request in a batch, beyond the second. The second request in simple
// write batches is often an EndTxn request, which has near-zero cost in the
// fast path. As the batch size increases, each KV CPU can process more write
// requests.
WriteRequestCost struct {
BatchSize []float64
CPUPerRequest []EstimatedCPU
Expand Down Expand Up @@ -85,20 +92,28 @@ type EstimatedCPUModel struct {
// DefaultEstimatedCPUModel is the default model that is used if the
// tenant_cost_model.estimated_cpu cluster setting is not specified.
var DefaultEstimatedCPUModel = EstimatedCPUModel{
ReadBatchCost: 1.0 / 3500,
ReadRequestCost: 1.0 / 45000,
ReadBatchCost: 1.0 / 3500,
ReadRequestCost: struct {
BatchSize []float64
CPUPerRequest []EstimatedCPU
}{
BatchSize: []float64{8, 16, 32, 64},
CPUPerRequest: []EstimatedCPU{
1.0 / 16500, 1.0 / 26700, 1.0 / 35000, 1.0 / 40400,
},
},
ReadBytesCost: struct {
PayloadSize []float64
CPUPerByte []EstimatedCPU
}{
PayloadSize: []float64{256, 1024, 4 * 1024, 16 * 1024, 64 * 1024, 256 * 1024},
CPUPerByte: []EstimatedCPU{
1.0 / 1.5 / 1024 / 1024,
1.0 / 5.5 / 1024 / 1024,
1.0 / 12 / 1024 / 1024,
1.0 / 34 / 1024 / 1024,
1.0 / 64 / 1024 / 1024,
1.0 / 89 / 1024 / 1024,
1.0 / 3 / 1024 / 1024,
1.0 / 5 / 1024 / 1024,
1.0 / 9 / 1024 / 1024,
1.0 / 27 / 1024 / 1024,
1.0 / 62 / 1024 / 1024,
1.0 / 106 / 1024 / 1024,
},
},
WriteBatchCost: struct {
Expand All @@ -107,28 +122,29 @@ var DefaultEstimatedCPUModel = EstimatedCPUModel{
}{
RatePerNode: []float64{100, 200, 400, 800, 1600, 3200, 6400, 12800},
CPUPerBatch: []EstimatedCPU{
1.0 / 660, 1.0 / 850, 1.0 / 1090, 1.0 / 1400, 1.0 / 1790, 1.0 / 2290, 1.0 / 2930, 1.0 / 3150,
1.0 / 700, 1.0 / 900, 1.0 / 1100, 1.0 / 1300, 1.0 / 1700, 1.0 / 2200, 1.0 / 2700, 1.0 / 3150,
},
},
WriteRequestCost: struct {
BatchSize []float64
CPUPerRequest []EstimatedCPU
}{
BatchSize: []float64{3, 6, 12, 25, 50, 100, 200},
BatchSize: []float64{2, 3, 6, 11, 22, 43, 84},
CPUPerRequest: []EstimatedCPU{
1.0 / 2500, 1.0 / 5250, 1.0 / 9050, 1.0 / 11900, 1.0 / 15400, 1.0 / 17400, 1.0 / 19000},
1.0 / 1100, 1.0 / 2700, 1.0 / 6400, 1.0 / 10200, 1.0 / 14600, 1.0 / 18500, 1.0 / 19600,
},
},
WriteBytesCost: struct {
PayloadSize []float64
CPUPerByte []EstimatedCPU
}{
PayloadSize: []float64{256, 1024, 4 * 1024, 16 * 1024, 64 * 1024},
CPUPerByte: []EstimatedCPU{
1.0 / 3.75 / 1024 / 1024,
1.0 / 8 / 1024 / 1024,
1.0 / 11 / 1024 / 1024,
1.0 / 14 / 1024 / 1024,
1.0 / 18.5 / 1024 / 1024,
1.0 / 9 / 1024 / 1024,
1.0 / 10 / 1024 / 1024,
1.0 / 12.5 / 1024 / 1024,
1.0 / 15 / 1024 / 1024,
1.0 / 19 / 1024 / 1024,
},
},
BackgroundCPU: struct {
Expand Down Expand Up @@ -183,7 +199,9 @@ func (m *EstimatedCPUModel) BatchCost(

// Add cost for additional requests in the batch, beyond the first.
if bi.ReadCount > 1 {
readCPU += m.ReadRequestCost * EstimatedCPU(bi.ReadCount-1)
ecpuPerRequest := m.lookupCost(
m.ReadRequestCost.BatchSize, m.ReadRequestCost.CPUPerRequest, float64(bi.ReadCount))
readCPU += ecpuPerRequest * EstimatedCPU(bi.ReadCount-1)
}

// Add cost for bytes in the requests.
Expand All @@ -196,11 +214,12 @@ func (m *EstimatedCPUModel) BatchCost(
// Add cost for the batch.
writeCPU = m.lookupCost(m.WriteBatchCost.RatePerNode, m.WriteBatchCost.CPUPerBatch, ratePerNode)

// Add cost for additional requests in the batch, beyond the first.
if bi.WriteCount > 1 {
// Add cost for additional requests in the batch, beyond the second (see
// EstimatedCPUModel.WriteRequestCost comment for further details).
if bi.WriteCount > 2 {
ecpuPerRequest := m.lookupCost(
m.WriteRequestCost.BatchSize, m.WriteRequestCost.CPUPerRequest, float64(bi.WriteCount))
writeCPU += ecpuPerRequest * EstimatedCPU(bi.WriteCount-1)
writeCPU += ecpuPerRequest * EstimatedCPU(bi.WriteCount-2)
}

// Add cost for bytes in the requests.
Expand Down
Loading

0 comments on commit 7a1bf52

Please sign in to comment.