loqrecovery: use captured meta range content for LOQ plans #94239

Merged · 1 commit · Jan 17, 2023
4 changes: 3 additions & 1 deletion pkg/cli/debug.go
@@ -1432,7 +1432,9 @@ func init() {
f.StringVarP(&debugRecoverPlanOpts.outputFileName, "plan", "o", "",
"filename to write plan to")
f.IntSliceVar(&debugRecoverPlanOpts.deadStoreIDs, "dead-store-ids", nil,
"list of dead store IDs")
"list of dead store IDs (can't be used together with dead-node-ids)")
Contributor:

Should we just get rid of dead-store-ids, to avoid the complexity of supporting both?

Contributor (Author):

I think we should get rid of dead-store-ids in 23.2, just to avoid any confusion on a mixed cluster where the old option, which used to be the primary means, disappears after upgrade. It should be less of an issue for the 23.1 to 23.2 transition, where we don't expect people to use dead-store-ids anymore.

f.IntSliceVar(&debugRecoverPlanOpts.deadNodeIDs, "dead-node-ids", nil,
"list of dead node IDs (can't be used together with dead-store-ids)")
f.VarP(&debugRecoverPlanOpts.confirmAction, cliflags.ConfirmActions.Name, cliflags.ConfirmActions.Shorthand,
cliflags.ConfirmActions.Usage())
f.BoolVar(&debugRecoverPlanOpts.force, "force", false,
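For illustration, a hedged sketch of a make-plan invocation using the new flag, here with replica info files collected beforehand (the file names, node IDs, and output path are placeholders; --dead-store-ids remains available but cannot be combined with --dead-node-ids):

    cockroach debug recover make-plan \
      --dead-node-ids=4,5 \
      -o recovery-plan.json \
      node1.json node2.json node3.json
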
149 changes: 129 additions & 20 deletions pkg/cli/debug_recover_loss_of_quorum.go
@@ -16,6 +16,7 @@ import (
"fmt"
"io"
"os"
"path"
"strings"

"github.com/cockroachdb/cockroach/pkg/base"
@@ -32,6 +33,7 @@ import (
"github.com/cockroachdb/errors"
"github.com/cockroachdb/errors/hintdetail"
"github.com/spf13/cobra"
"github.com/spf13/pflag"
)

// confirmActionFlag defines a pflag to parse a confirm option.
@@ -291,61 +293,117 @@ var debugRecoverPlanCmd = &cobra.Command{
Long: `
Devise a plan to restore ranges that lost a quorum.
This command will read files with information about replicas collected from all
surviving nodes of a cluster and make a decision which replicas should be survivors
for the ranges where quorum was lost.
Decision is then written into a file or stdout.
The command analyzes information about replicas from all surviving nodes of a
cluster, finds ranges that lost quorum and makes decisions about which replicas
should act as survivors to restore quorum.
Information about replicas can be collected directly by connecting to the
cluster or read from files generated by the collect-info command. In the former
case, cluster connection parameters must be specified; in the latter case, file
names should be provided as arguments.
After the data is analyzed, a recovery plan is written into a file or stdout.
This command only creates a plan and doesn't change any data.
See debug recover command help for more details on how to use this command.
`,
Args: cobra.MinimumNArgs(1),
Args: cobra.MinimumNArgs(0),
RunE: runDebugPlanReplicaRemoval,
}

var debugRecoverPlanOpts struct {
outputFileName string
deadStoreIDs []int
deadNodeIDs []int
confirmAction confirmActionFlag
force bool
}

var planSpecificFlags = map[string]struct{}{
"plan": {},
"dead-store-ids": {},
"dead-node-ids": {},
"force": {},
"confirm": {},
}

func runDebugPlanReplicaRemoval(cmd *cobra.Command, args []string) error {
replicas, err := readReplicaInfoData(args)
if err != nil {
return err
ctx, cancel := context.WithCancel(cmd.Context())
defer cancel()

var replicas loqrecoverypb.ClusterReplicaInfo
var err error

if debugRecoverPlanOpts.deadStoreIDs != nil && debugRecoverPlanOpts.deadNodeIDs != nil {
return errors.New("debug recover make-plan command accepts either --dead-node-ids or --dead-store-ids")
}

var stats loqrecovery.CollectionStats
if len(args) == 0 {
// If no replica info files are provided, try to connect to the cluster
// (default or explicitly provided connection) to retrieve replica info.
c, finish, err := getAdminClient(ctx, serverCfg)
if err != nil {
return errors.Wrapf(err, "failed to get admin connection to cluster")
}
defer finish()
replicas, stats, err = loqrecovery.CollectRemoteReplicaInfo(ctx, c)
if err != nil {
return errors.Wrapf(err, "failed to retrieve replica info from cluster")
}
} else {
replicas, err = readReplicaInfoData(args)
if err != nil {
return err
}
}

var deadStoreIDs []roachpb.StoreID
for _, id := range debugRecoverPlanOpts.deadStoreIDs {
deadStoreIDs = append(deadStoreIDs, roachpb.StoreID(id))
}

plan, report, err := loqrecovery.PlanReplicas(cmd.Context(), replicas.LocalInfo, deadStoreIDs)
var deadNodeIDs []roachpb.NodeID
for _, id := range debugRecoverPlanOpts.deadNodeIDs {
deadNodeIDs = append(deadNodeIDs, roachpb.NodeID(id))
}

plan, report, err := loqrecovery.PlanReplicas(
ctx,
replicas,
deadStoreIDs,
deadNodeIDs)
if err != nil {
return err
}

if stats.Nodes > 0 {
_, _ = fmt.Fprintf(stderr, `Nodes scanned: %d
`, stats.Nodes)
}
_, _ = fmt.Fprintf(stderr, `Total replicas analyzed: %d
Ranges without quorum: %d
Discarded live replicas: %d
`, report.TotalReplicas, len(report.PlannedUpdates), report.DiscardedNonSurvivors)
_, _ = fmt.Fprintf(stderr, "Proposed changes:\n")
for _, r := range report.PlannedUpdates {
_, _ = fmt.Fprintf(stderr, "Recovering range r%d:%s updating replica %s to %s. "+
_, _ = fmt.Fprintf(stderr, " range r%d:%s updating replica %s to %s. "+
"Discarding available replicas: [%s], discarding dead replicas: [%s].\n",
r.RangeID, r.StartKey, r.OldReplica, r.Replica,
r.RangeID, r.StartKey, r.OldReplica, r.NewReplica,
r.DiscardedAvailableReplicas, r.DiscardedDeadReplicas)
}

deadStoreMsg := fmt.Sprintf("\nDiscovered dead stores from provided files: %s",
joinStoreIDs(report.MissingStores))
argStoresMsg := ""
if len(deadStoreIDs) > 0 {
_, _ = fmt.Fprintf(stderr, "%s, (matches --dead-store-ids)\n\n", deadStoreMsg)
} else {
_, _ = fmt.Fprintf(stderr, "%s\n\n", deadStoreMsg)
argStoresMsg = ", (matches --dead-store-ids)"
}
if len(deadNodeIDs) > 0 {
argStoresMsg = ", (matches --dead-node-ids)"
}
_, _ = fmt.Fprintf(stderr, "\nDiscovered dead nodes, will be marked as decommissioned:\n%s\n%s\n\n",
formatNodeStores(report.MissingNodes, " "), argStoresMsg)

planningErr := report.Error()
if planningErr != nil {
@@ -406,6 +464,7 @@ Discarded live replicas: %d
return nil
}

planFile := "<plan file>"
var writer io.Writer = os.Stdout
if len(debugRecoverPlanOpts.outputFileName) > 0 {
if _, err = os.Stat(debugRecoverPlanOpts.outputFileName); err == nil {
@@ -417,6 +476,7 @@ Discarded live replicas: %d
}
defer outFile.Close()
writer = outFile
planFile = path.Base(debugRecoverPlanOpts.outputFileName)
}

jsonpb := protoutil.JSONPb{Indent: " "}
@@ -428,10 +488,22 @@ Discarded live replicas: %d
return errors.Wrap(err, "failed to write recovery plan")
}

_, _ = fmt.Fprint(stderr, "Plan created\nTo complete recovery, distribute the plan to the"+
" below nodes and invoke `debug recover apply-plan` on:\n")
for node, stores := range report.UpdatedNodes {
_, _ = fmt.Fprintf(stderr, "- node n%d, store(s) %s\n", node, joinStoreIDs(stores))
// No args means we collected connection info from cluster and need to
// preserve flags for subsequent invocation.
remoteArgs := getCLIClusterFlags(len(args) == 0, cmd, func(flag string) bool {
_, filter := planSpecificFlags[flag]
return filter
})

_, _ = fmt.Fprintf(stderr, `Plan created.
To stage recovery application in half-online mode invoke:
'cockroach debug recover apply-plan %s %s'
Alternatively distribute plan to below nodes and invoke 'debug recover apply-plan --store=<store-dir> %s' on:
`, remoteArgs, planFile, planFile)
for _, node := range report.UpdatedNodes {
_, _ = fmt.Fprintf(stderr, "- node n%d, store(s) %s\n", node.NodeID, joinStoreIDs(node.StoreIDs))
}

return nil
@@ -593,13 +665,50 @@ func joinStoreIDs(storeIDs []roachpb.StoreID) string {
return strings.Join(storeNames, ", ")
}

func formatNodeStores(locations []loqrecovery.NodeStores, indent string) string {
hasMultiStore := false
for _, v := range locations {
hasMultiStore = hasMultiStore || len(v.StoreIDs) > 1
}
if !hasMultiStore {
// we only have a single store per node, no need to list stores.
nodeNames := make([]string, 0, len(locations))
for _, node := range locations {
nodeNames = append(nodeNames, fmt.Sprintf("n%d", node.NodeID))
}
return indent + strings.Join(nodeNames, ", ")
}
nodeDetails := make([]string, 0, len(locations))
for _, node := range locations {
nodeDetails = append(nodeDetails,
indent+fmt.Sprintf("n%d: store(s): %s", node.NodeID, joinStoreIDs(node.StoreIDs)))
}
return strings.Join(nodeDetails, "\n")
}

// getCLIClusterFlags recreates command line flags from current command
// discarding any flags that filter returns true for.
func getCLIClusterFlags(fromCfg bool, cmd *cobra.Command, filter func(flag string) bool) string {
if !fromCfg {
return " --host <node-hostname>[:<port>] [--certs-dir <certificates-dir>|--insecure]"
}
var buf strings.Builder
cmd.Flags().VisitAll(func(f *pflag.Flag) {
if f.Changed && !filter(f.Name) {
_, _ = fmt.Fprintf(&buf, " --%s=%v", f.Name, f.Value.String())
}
})
return buf.String()
}

// setDebugRecoverContextDefaults resets values of command line flags to
// their default values to ensure tests don't interfere with each other.
func setDebugRecoverContextDefaults() {
debugRecoverCollectInfoOpts.Stores.Specs = nil
debugRecoverPlanOpts.outputFileName = ""
debugRecoverPlanOpts.confirmAction = prompt
debugRecoverPlanOpts.deadStoreIDs = nil
debugRecoverPlanOpts.deadNodeIDs = nil
debugRecoverExecuteOpts.Stores.Specs = nil
debugRecoverExecuteOpts.confirmAction = prompt
}
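
Taken together, the changes in this file add a half-online planning flow: with no file arguments, make-plan collects replica info over an admin connection and then prints a ready-to-run apply-plan command that reuses the same connection flags. A hedged sketch of that sequence (host, certificates directory, and plan file name are placeholders):

    # Generate a plan directly from the surviving nodes; no replica-info files needed.
    cockroach debug recover make-plan --host=<node-hostname>:26257 --certs-dir=<certificates-dir> -o recovery-plan.json

    # Stage the plan in half-online mode, reusing the same connection flags,
    # as suggested by the hint that make-plan prints above.
    cockroach debug recover apply-plan --host=<node-hostname>:26257 --certs-dir=<certificates-dir> recovery-plan.json
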
4 changes: 2 additions & 2 deletions pkg/kv/kvserver/loqrecovery/apply.go
@@ -97,7 +97,7 @@ func PrepareUpdateReplicas(

// Map contains a set of store names that were found in plan for this node,
// but were not configured in this command invocation.
missing := make(map[roachpb.StoreID]struct{})
missing := make(storeIDSet)
for _, update := range plan.Updates {
if nodeID != update.NodeID() {
continue
@@ -132,7 +132,7 @@ func PrepareUpdateReplicas(
}

if len(missing) > 0 {
report.MissingStores = storeSliceFromSet(missing)
report.MissingStores = missing.storeSliceFromSet()
}
return report, nil
}
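
The refactor above switches PrepareUpdateReplicas to a storeIDSet helper with a storeSliceFromSet method; the type's definition is not part of this diff. A minimal sketch of what such a helper could look like, assuming it lives in the loqrecovery package (the real definition may differ):

    package loqrecovery

    import (
    	"sort"

    	"github.com/cockroachdb/cockroach/pkg/roachpb"
    )

    // storeIDSet is a set of store IDs used while matching a plan against local stores.
    type storeIDSet map[roachpb.StoreID]struct{}

    // storeSliceFromSet returns the members of the set as a sorted slice of store IDs.
    func (s storeIDSet) storeSliceFromSet() []roachpb.StoreID {
    	storeIDs := make([]roachpb.StoreID, 0, len(s))
    	for id := range s {
    		storeIDs = append(storeIDs, id)
    	}
    	sort.Slice(storeIDs, func(i, j int) bool { return storeIDs[i] < storeIDs[j] })
    	return storeIDs
    }
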
3 changes: 3 additions & 0 deletions pkg/kv/kvserver/loqrecovery/loqrecoverypb/recovery.proto
@@ -103,11 +103,14 @@ message ReplicaUpdatePlan {
// PlanID contains ID generated by cli when generating recovery plan and is subsequently
// used for status checks and auditing purposes.
bytes plan_id = 2 [(gogoproto.customname) = "PlanID",
(gogoproto.nullable) = false,
(gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID"];
// DecommissionedNodeIDs is a set of node IDs that need to be marked as decommissioned as a
// part of loss of quorum recovery process.
repeated int32 decommissioned_node_ids = 3 [(gogoproto.customname) = "DecommissionedNodeIDs",
(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.NodeID"];
// ClusterID contains id of the cluster from which info was collected.
string cluster_id = 4 [(gogoproto.customname) = "ClusterID"];
}

// ReplicaRecoveryRecord is a struct that loss of quorum recovery commands