release-22.1: backupccl: add 8TB TPCE restore nightly roachtest #91511

Merged
Changes from all commits
153 changes: 143 additions & 10 deletions pkg/cmd/roachtest/tests/restore.go
@@ -37,6 +37,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/httputil"
"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
@@ -414,23 +415,147 @@ func (tpccIncData) runRestoreDetached(
return jobID, nil
}

// This data set restores a backup created from a 500k-customer tpce fixture.
// The backed-up cluster had around 7.6 TB of data on disk, and the restored
// cluster will have around 8.5 TB on disk.
//
// Backup Fixture description: This fixture contains two full backups and hourly
// incremental backups taken over 24 hours, with revision history. The cluster
// was running the tpce workload, initialized with 500k customers and running
// with 100k active customers. The init step created 7.6 TB of data, and the
// running step only created another 100 GB of data. The backups did not begin
// until after tpce init ended.
//
// Fixture recreation steps:
Contributor:

💭 do you want to check in the regeneration script so it's trivial for future authors to regenerate it? Docs get moved and edited so I think this would be more future-proof.

Collaborator (Author):

hrm, that's a good idea. There are bugs in my current script and I plan to fix it and check it in when I create 22.2 fixtures.

//
// 1) Create a roachprod cluster with the same topology as
// this test.
//
// 2) Set up the cluster to run tpc-e init with 500k customers, using
// the repro steps in the appendix of Nathan's v22.2
// Scalability & Efficiency Evaluation using TPC-E doc
// https://docs.google.com/document/d/1wzkBXaA3Ap_daMV1oY1AhQqlnAjO3pIVLZTXY53m0Xk/edit
//
// 3) Set the gc ttl for the tpce database to 25 hrs.
//
// 4) Create a backup schedule with revision history and hourly incremental
// backups. (Steps 3 and 4 are sketched below.)
//
// 5) Run the tpce workload for 24 hours with 100k active customers.
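//
// A rough sketch of steps 3 and 4, assuming a v22.1 cluster. The schedule
// label and the elided collection URI are illustrative, not taken from this
// patch:
//
//   -- 25 hr GC TTL, so a full day of revision history stays restorable.
//   ALTER DATABASE tpce CONFIGURE ZONE USING gc.ttlseconds = 90000;
//
//   -- Hourly incrementals with revision history on top of full backups.
//   CREATE SCHEDULE 'tpce_fixture'
//   FOR BACKUP INTO 'gs://cockroach-fixtures/backups/tpc-e/...'
//   WITH revision_history
//   RECURRING '@hourly'
//   FULL BACKUP '@weekly';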
type tpce10TB struct{}

func (tpce10TB) name() string {
return "TPCE10TB"
}

func (tpce10TB) runRestore(ctx context.Context, c cluster.Cluster) {
// Restore from the first full backup in the chain, as of a system time
// covered by the last incremental backup in the chain.
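// The full backup subdirectories available in a collection (such as
// '/2022/11/06-124208.52' here) can be listed with, for example:
//
//   SHOW BACKUPS IN 'gs://cockroach-fixtures/backups/tpc-e/...?AUTH=implicit';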
c.Run(ctx, c.Node(1), `./cockroach sql --insecure -e "
RESTORE DATABASE tpce FROM '/2022/11/06-124208.52' IN
'gs://cockroach-fixtures/backups/tpc-e/rev-history=true,inc-count=23,cluster/customers=500k/22.1.8?AUTH=implicit'
AS OF SYSTEM TIME '2022-11-06 23:40:22'"`)
}

func (tpce10TB) runRestoreDetached(
ctx context.Context, t test.Test, c cluster.Cluster,
) (jobspb.JobID, error) {
// Restore from the first full backup in the chain, as of a system time
// covered by the last incremental backup in the chain.
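// WITH detached makes the RESTORE statement return as soon as the job has
// been created, instead of blocking until the restore completes.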
c.Run(ctx, c.Node(1), `./cockroach sql --insecure -e "
RESTORE DATABASE tpce FROM '/2022/11/06-124208.52' IN
'gs://cockroach-fixtures/backups/tpc-e/rev-history=true,inc-count=23,cluster/customers=500k/22.1.8?AUTH=implicit'
AS OF SYSTEM TIME '2022-11-06 23:40:22' WITH detached"`)
db, err := c.ConnE(ctx, t.L(), c.Node(1)[0])
if err != nil {
return 0, errors.Wrap(err, "failed to connect to node 1; running restore detached")
}

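// Fetch the ID of the restore job just created. This assumes that the most
// recently created RESTORE job on the cluster is the one issued above.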
var jobID jobspb.JobID
if err := db.QueryRow(`SELECT job_id FROM [SHOW JOBS] WHERE job_type = 'RESTORE' ORDER BY created DESC LIMIT 1`).Scan(
&jobID); err != nil {
return 0, err
}

return jobID, nil
}

var _ testDataSet = tpccIncData{}

// checkDetachedRestore polls the detached restore job and returns once it
// has completed, or an error if the job fails.
func checkDetachedRestore(
ctx context.Context, t test.Test, c cluster.Cluster, jobID jobspb.JobID,
) error {
checkJobTick := time.NewTicker(time.Minute * 1)
defer checkJobTick.Stop()
done := ctx.Done()
for {
select {
case <-done:
return ctx.Err()
case <-checkJobTick.C:
checkSucceeded := func() (bool, error) {
// Open a new connection for every check to prevent connection flakes.
conn, err := c.ConnE(ctx, t.L(), c.Node(1)[0])
if err != nil {
return false, errors.Wrapf(err, "failed to open connection to sql server")
}
defer conn.Close()
var status string
var payloadBytes []byte
if err = conn.QueryRowContext(ctx,
`SELECT status, payload FROM [SHOW JOBS] WHERE job_id = $1`,
jobID).Scan(&status, &payloadBytes); err != nil {
return false, errors.Wrapf(err, "failed to check restore job status")
}
switch status {
case string(jobs.StatusSucceeded):
return true, nil
case string(jobs.StatusRunning):
return false, nil
case string(jobs.StatusFailed):
var payload jobspb.Payload
require.NoError(t, protoutil.Unmarshal(payloadBytes, &payload))
return false, errors.Newf("job failed: %s", payload.Error)
default:
return false, errors.Newf("job unexpectedly found in %s state", status)
}
}
succeeded, err := checkSucceeded()
if err != nil {
return err
}
if succeeded {
return nil
}
}
}
}

func registerRestore(r registry.Registry) {

for _, item := range []struct {
nodes int
cpus int

// pdVolumeSize specifies the size, in GB, of the pd-ssd volume attached
// to each node. If not specified, each node uses a local SSD, which on
// GCP has 375 GB of capacity.
pdVolumeSize int
dataSet testDataSet

timeout time.Duration

// detached runs a detached restore, which will cause the roachperf time to
// be off by at most 1 minute. Use this setting for especially large
// roachtests where an ssh connection disruption could occur.
detached bool

// parallelize bumps the restore node and addsstable request concurrency.
parallelize bool
}{
{dataSet: dataBank2TB{}, nodes: 10, timeout: 6 * time.Hour},
{dataSet: dataBank2TB{}, nodes: 32, timeout: 3 * time.Hour},
{dataSet: dataBank2TB{}, nodes: 6, timeout: 4 * time.Hour, cpus: 8, pdVolumeSize: 2500,
parallelize: true},
{dataSet: tpccIncData{}, nodes: 10, timeout: 6 * time.Hour},
{dataSet: tpce10TB{}, nodes: 10, timeout: 10 * time.Hour, cpus: 8, pdVolumeSize: 1500,
detached: true},
} {
item := item
clusterOpts := make([]spec.Option, 0)
@@ -439,9 +564,9 @@ func registerRestore(r registry.Registry) {
clusterOpts = append(clusterOpts, spec.CPU(item.cpus))
testName += fmt.Sprintf("/cpus=%d", item.cpus)
}
if item.pdVolumeSize != 0 {
clusterOpts = append(clusterOpts, spec.VolumeSize(item.pdVolumeSize))
testName += fmt.Sprintf("/pd-volume=%dGB", item.pdVolumeSize)
}
// Has been seen to OOM: https://github.com/cockroachdb/cockroach/issues/71805
clusterOpts = append(clusterOpts, spec.HighMem(true))
@@ -484,7 +609,7 @@ func registerRestore(r registry.Registry) {
// capture the total elapsed time. This is used by
// roachperf to compute and display the average MB/sec per
// node.
if item.parallelize {
// If the nodes are large enough (specifically, if they
// have enough memory), we can increase the parallelism
// of restore. Machines with 16 vCPUs typically have
@@ -495,7 +620,14 @@ func registerRestore(r registry.Registry) {
`./cockroach sql --insecure -e "SET CLUSTER SETTING kv.bulk_io_write.concurrent_addsstable_requests = 5"`)
}
tick()
if item.detached {
jobID, err := item.dataSet.runRestoreDetached(ctx, t, c)
require.NoError(t, err)
err = checkDetachedRestore(ctx, t, c, jobID)
require.NoError(t, err)
} else {
item.dataSet.runRestore(ctx, c)
}
tick()

// Upload the perf artifacts to any one of the nodes so that the test
@@ -540,6 +672,7 @@ func registerRestore(r registry.Registry) {
// Wait until the restore job has been created.
conn, err := c.ConnE(ctx, t.L(), c.Node(1)[0])
require.NoError(t, err)
defer conn.Close()

// The job should be created fairly quickly once the roachtest starts.
done := ctx.Done()