Vault 6815/respect min quorum #23

Merged: 15 commits, Nov 5, 2022
1 change: 1 addition & 0 deletions go.mod
@@ -4,6 +4,7 @@ go 1.14

require (
github.com/hashicorp/go-hclog v0.14.1
github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/hashicorp/raft v1.2.0
github.com/stretchr/testify v1.6.1
go.uber.org/goleak v1.1.10
4 changes: 4 additions & 0 deletions go.sum
@@ -11,6 +11,8 @@ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs
github.com/fatih/color v1.7.0 h1:DkWD4oS2D8LGGgTQ6IvwJJXSL5Vp2ffcQg58nFV38Ys=
github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA=
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/go-cleanhttp v0.5.0/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80=
github.com/hashicorp/go-hclog v0.9.1 h1:9PZfAcVEvez4yhLH2TBU64/h/z4xlFI80cWXRrxuKuM=
github.com/hashicorp/go-hclog v0.9.1/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrjA0H7acj2lQ=
@@ -20,6 +22,8 @@ github.com/hashicorp/go-immutable-radix v1.0.0 h1:AKDB1HM5PWEA7i4nhcpwOrO2byshxB
github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60=
github.com/hashicorp/go-msgpack v0.5.5 h1:i9R9JSrqIz0QVLz3sz+i3YJdT7TTSLcfLLzJi9aZTuI=
github.com/hashicorp/go-msgpack v0.5.5/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM=
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
github.com/hashicorp/go-retryablehttp v0.5.3/go.mod h1:9B5zBasrRhHXnJnui7y6sL7es7NDiJgTc6Er0maI1Xs=
github.com/hashicorp/go-uuid v1.0.0 h1:RS8zrF7PhGwyNPOtxSClXXj9HA8feRnJzgnI1RJCSnM=
github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
14 changes: 14 additions & 0 deletions mock_promoter.go

Some generated files are not rendered by default.

144 changes: 101 additions & 43 deletions reconcile.go
@@ -4,6 +4,7 @@ import (
"fmt"
"sort"

"github.com/hashicorp/go-multierror"
"github.com/hashicorp/raft"
)

@@ -22,7 +23,7 @@ func (a *Autopilot) reconcile() error {
state := a.GetState()

if state == nil || state.Leader == "" {
return fmt.Errorf("Cannote reconcile Raft server voting rights without a valid autopilot state")
return fmt.Errorf("cannot reconcile Raft server voting rights without a valid autopilot state")
}

// have the promoter calculate the required Raft changeset.
@@ -104,7 +105,7 @@ func (a *Autopilot) applyPromotions(state *State, changes RaftChanges) (bool, er

// when we promoted anything we return true to indicate that the promotion/demotion applying
// process is finished to prevent promotions and demotions in the same round. This is what
// autopilot within Consul used to do so I am keeping the behavior the same for now.
// autopilot within Consul used to do, so I am keeping the behavior the same for now.
return promoted, nil
}

@@ -151,36 +152,45 @@ func (a *Autopilot) applyDemotions(state *State, changes RaftChanges) (bool, err
return demoted, nil
}

// getFailedServers aggregates all of the information about servers that the consuming application believes are in
// getFailedServers aggregates all the information about servers that the consuming application believes are in
// a failed/left state (indicated by the NodeStatus field on the Server type) as well as stale servers that are
in the raft configuration but not known to the consuming application. This function will do nothing with
// that information and is purely to collect the data.
func (a *Autopilot) getFailedServers() (*FailedServers, int, error) {
func (a *Autopilot) getFailedServers() (*FailedServers, *VoterRegistry, error) {
staleRaftServers := make(map[raft.ServerID]raft.Server)
raftConfig, err := a.getRaftConfiguration()
if err != nil {
return nil, 0, err
return nil, nil, err
}

// Populate a map of all the raft servers. We will
// remove some later on from the map leaving us with
// just the stale servers.
var voters int
registry := NewVoterRegistry()

for _, server := range raftConfig.Servers {
staleRaftServers[server.ID] = server

if server.Suffrage == raft.Voter {
voters++
registry.Eligibility[server.ID] = &VoterEligibility{
currentVoter: server.Suffrage == raft.Voter,
}
}

var failed FailedServers

for id, srv := range a.delegate.KnownServers() {
raftSrv, found := staleRaftServers[id]
if found {
delete(staleRaftServers, id)
} else {
// This server was known to the application,
// but not in the Raft config, so will be ignored
continue
}

// Update the potential suffrage using the supplied predicate.
v := registry.Eligibility[id]
v.SetPotentialVoter(a.promoter.IsPotentialVoter(srv.NodeType))

if srv.NodeStatus != NodeAlive {
if found && raftSrv.Suffrage == raft.Voter {
failed.FailedVoters = append(failed.FailedVoters, srv)
Expand Down Expand Up @@ -211,7 +221,7 @@ func (a *Autopilot) getFailedServers() (*FailedServers, int, error) {
return failed.FailedVoters[i].ID < failed.FailedVoters[j].ID
})

return &failed, voters, nil
return &failed, registry, nil
}
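For context while reading this diff: the VoterRegistry and VoterEligibility types used above are introduced elsewhere in this PR and are not shown in this excerpt. A minimal sketch of what they might look like, inferred purely from how they are used here (field and method names taken from the diff, everything else assumed; raft.ServerID comes from github.com/hashicorp/raft):

type VoterEligibility struct {
	currentVoter   bool
	potentialVoter bool
}

func (v *VoterEligibility) IsCurrentVoter() bool     { return v.currentVoter }
func (v *VoterEligibility) IsPotentialVoter() bool   { return v.potentialVoter }
func (v *VoterEligibility) SetPotentialVoter(p bool) { v.potentialVoter = p }

// VoterRegistry tracks, per raft server ID, whether the server currently votes
// and whether the promoter considers it a potential voter.
type VoterRegistry struct {
	Eligibility map[raft.ServerID]*VoterEligibility
}

func NewVoterRegistry() *VoterRegistry {
	return &VoterRegistry{Eligibility: make(map[raft.ServerID]*VoterEligibility)}
}

// PotentialVoters counts the servers that could hold voting rights.
func (vr *VoterRegistry) PotentialVoters() int {
	count := 0
	for _, v := range vr.Eligibility {
		if v.IsPotentialVoter() {
			count++
		}
	}
	return count
}

// RemoveAll drops the given IDs from the registry after they have been removed
// from the cluster, so later quorum calculations stay accurate.
func (vr *VoterRegistry) RemoveAll(ids []raft.ServerID) {
	for _, id := range ids {
		delete(vr.Eligibility, id)
	}
}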

// pruneDeadServers will find stale raft servers and failed servers as indicated by the consuming application
@@ -220,7 +230,7 @@ func (a *Autopilot) getFailedServers() (*FailedServers, int, error) {
// removed first. Then stale voters and finally failed servers. For servers with voting rights we will
// cap the number removed so that we do not remove too many at a time and do not remove nodes to the
// point where the number of voters would be below the MinQuorum value from the autopilot config.
// Additionally the delegate will be consulted to determine if all of the removals should be done and
// Additionally, the delegate will be consulted to determine if all the removals should be done and
// can filter the failed servers listings if need be.
func (a *Autopilot) pruneDeadServers() error {
if !a.ReconciliationEnabled() {
@@ -234,54 +244,102 @@ func (a *Autopilot) pruneDeadServers() error {

state := a.GetState()

failed, voters, err := a.getFailedServers()
failed, vr, err := a.getFailedServers()
if err != nil || failed == nil {
return err
}

failed = a.promoter.FilterFailedServerRemovals(conf, state, failed)

// remove failed non voting servers
for _, srv := range failed.FailedNonVoters {
a.logger.Info("Attempting removal of failed server node", "id", srv.ID, "name", srv.Name, "address", srv.Address)
a.delegate.RemoveFailedServer(srv)
// Remove servers in order of increasing precedence
// Update the voter registry after each batch is processed
Contributor:

Why are we changing the order here, e.g. before FailedNonVoters were considered first, now we're looking at them after the stale servers?

Contributor:

I guess what I'm asking is: what's the basis for the "precedence" ordering you're using?

Author:

I believe (and may well be totally wrong) it should be that stale servers are the lowest 'value', since to the application those servers don't exist (in its view).

So removing stale servers (non-voting and then voting), when possible, should be done before we attempt to remove failed servers (which the app does know about and which could potentially become healthy again).

A failed non-voter is still (most likely) a potential voter; a stale voter or non-voter won't be.

Might be one which requires more thought/conversation :D

Contributor:

I'm not sure that's true in Vault's case at least. A stale server could simply be one whose echo message we haven't seen yet. So it might be stale now, and in 10s it won't be.

Author (peteski22, Nov 2, 2022):

Vault would be the 'application' in this scenario, and Vault's chance to say which servers should/shouldn't be part of the raft config is via KnownServers, which it supplies, naming both healthy and failed servers.

So if a server isn't in that collection of known servers but is in the Raft configuration, then it's stale (from autopilot's perspective) and needs to be removed.

I may have been in this code for too long on my own, but that was how it seemed to me to work.

Author:

So here in Vault's code a server is either healthy (alive) or failed (left): https://github.com/hashicorp/vault/blob/main/physical/raft/raft_autopilot.go#L428. If the server isn't in the map of servers, it's stale to Autopilot.

Member:

I went back and forth on this for a while. There are tradeoffs here. If we remove stale servers first then we are moving things closer to the application's desired cluster config. However, this does involve messing with the raft config, which is inherently a bit riskier than telling the application to remove a failed server. That said, at least in Consul's usage, the request to remove the failed server will almost always result in the server being removed from the raft config very quickly after.

For Consul's case, the only time one of those operations will not result in a raft config update is when the failed server isn't actually in the raft config yet. This seems like an extreme edge case and not something we need to explicitly account for.

So in the end I think we should follow two rules:

  1. Deal with non-voters first, as their removal shouldn't impact cluster stability.
  2. Handle stale servers before failed ones, so as to move things towards the application's desired server set.

The code as implemented does follow those rules. It would be great if you could add a comment about the ordering somewhere in here though (which I should have done when originally writing the library).

Author:

I've added the rules you've mentioned in a comment in pruneDeadServers now, so hopefully the intent is documented to some extent.
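For reference, the ordering comment being described would presumably read along these lines (wording assumed, not quoted from the merged change):

// Remove servers in order of increasing precedence:
//  1. Deal with non-voters first, since removing them shouldn't impact cluster stability.
//  2. Handle stale servers before failed ones, to move the cluster towards the
//     application's desired server set.
// Resulting order: stale non-voters, stale voters, failed non-voters, failed voters.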


// remove stale non-voters
toRemove := a.adjudicateRemoval(failed.StaleNonVoters, vr)
Contributor:

I guess in principle you could now collapse these 4 calls into 2, e.g.

stale := append(append([]raft.ServerID{}, failed.StaleNonVoters...), failed.StaleVoters...)
toRemove = a.adjudicateRemoval(stale, vr)
if err = a.removeStaleServers(toRemove); err != nil {
	return err
}
vr.RemoveAll(toRemove)

and similarly for the failed servers. Up to you, it's also fine as you have it.

if err = a.removeStaleServers(toRemove); err != nil {
return err
}
vr.RemoveAll(toRemove)

// remove stale non voters
for _, id := range failed.StaleNonVoters {
a.logger.Debug("removing stale raft server from configuration", "id", id)
if err := a.removeServer(id); err != nil {
return err
}
// Remove stale voters
toRemove = a.adjudicateRemoval(failed.StaleVoters, vr)
if err = a.removeStaleServers(toRemove); err != nil {
return err
}
vr.RemoveAll(toRemove)

maxRemoval := (voters - 1) / 2
// remove failed non-voters
failedNonVoters := vr.Filter(failed.FailedNonVoters)
Contributor:

What is the purpose of this filter call?

Author:

Just to filter the []*Server objects to only return the raft.ServerIDs of the ones we want.

Contributor:

Which ones do we want? I'm confused because it's just ensuring that these nodes are present in the map, and I'm not sure why that's needed.

Author:

We want the collection of server IDs ([]raft.ServerID). On the FailedServers object, the 'stale' fields, StaleNonVoters and StaleVoters, are both just collections (slices) of IDs, but FailedNonVoters and FailedVoters are collections of server objects. So that filter is just a way to get the IDs while ensuring the voter registry knows about them (as we pull voter eligibility info from the registry inside the adjudicate func).
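A rough sketch of what a Filter helper along those lines could look like (the real implementation lives in the registry code added by this PR, not shown in this excerpt; treat the details as assumed):

// Filter maps failed *Server entries to their raft.ServerIDs, keeping only
// IDs the registry already knows about so the later eligibility lookup in
// adjudicateRemoval can succeed.
func (vr *VoterRegistry) Filter(servers []*Server) []raft.ServerID {
	var ids []raft.ServerID
	for _, srv := range servers {
		if _, ok := vr.Eligibility[srv.ID]; ok {
			ids = append(ids, srv.ID)
		}
	}
	return ids
}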

toRemove = a.adjudicateRemoval(failedNonVoters, vr)
a.removeFailedServers(failed.GetFailed(toRemove, false))
vr.RemoveAll(toRemove)

// remove failed voters
failedVoters := vr.Filter(failed.FailedVoters)
toRemove = a.adjudicateRemoval(failedVoters, vr)
a.removeFailedServers(failed.GetFailed(toRemove, true))
vr.RemoveAll(toRemove)

return nil
}

func (a *Autopilot) adjudicateRemoval(ids []raft.ServerID, vr *VoterRegistry) []raft.ServerID {
var result []raft.ServerID
initialPotentialVoters := vr.PotentialVoters()
removedPotentialVoters := 0
maxRemoval := (initialPotentialVoters - 1) / 2
minQuorum := a.delegate.AutopilotConfig().MinQuorum

for _, id := range ids {
v, found := vr.Eligibility[id]
if !found {

for _, id := range failed.StaleVoters {
if voters-1 < int(conf.MinQuorum) {
a.logger.Debug("will not remove server as it would leave less voters than the minimum number allowed", "id", id, "min", conf.MinQuorum)
} else if maxRemoval < 1 {
a.logger.Debug("will not remove server as removal of a majority of servers is not safe", "id", id)
} else if err := a.removeServer(id); err != nil {
return err
} else {
maxRemoval--
voters--
}
}

for _, srv := range failed.FailedVoters {
if voters-1 < int(conf.MinQuorum) {
a.logger.Debug("will not remove server as it would leave less voters than the minimum number allowed", "id", srv.ID, "min", conf.MinQuorum)
} else if maxRemoval < 1 {
a.logger.Debug("will not remove server as a removal of a majority of servers is not safe", "id", srv.ID)
} else {
a.logger.Info("Attempting removal of failed server node", "id", srv.ID, "name", srv.Name, "address", srv.Address)
a.delegate.RemoveFailedServer(srv)
if v != nil && v.IsPotentialVoter() && initialPotentialVoters-removedPotentialVoters-1 < int(minQuorum) {
a.logger.Debug("will not remove server node as it would leave less voters than the minimum number allowed", "id", id, "min", minQuorum)
} else if v.IsCurrentVoter() && maxRemoval < 1 {
a.logger.Debug("will not remove server node as removal of a majority of voting servers is not safe", "id", id)
} else if v != nil && v.IsCurrentVoter() {
maxRemoval--
voters--
// We need to track how many voters we have removed from the registry
// to ensure the total remaining potential voters is accurate
removedPotentialVoters++
result = append(result, id)
} else {
result = append(result, id)
}
}

return result
}
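To make the arithmetic concrete, here is a small standalone illustration of how the two guards interact (numbers are purely illustrative, not taken from the PR):

package main

import "fmt"

func main() {
	// Say the registry reports 5 potential voters and the autopilot config has MinQuorum = 3.
	initialPotentialVoters := 5
	minQuorum := 3
	removedPotentialVoters := 0
	maxRemoval := (initialPotentialVoters - 1) / 2 // 2: never remove a majority of voters in one pass

	for i := 0; i < 4; i++ { // pretend four failed voters are candidates for removal
		switch {
		case initialPotentialVoters-removedPotentialVoters-1 < minQuorum:
			fmt.Println("skip: removal would leave fewer potential voters than MinQuorum")
		case maxRemoval < 1:
			fmt.Println("skip: removing a majority of voters in one pass is not safe")
		default:
			maxRemoval--
			removedPotentialVoters++
			fmt.Println("remove")
		}
	}
	// Prints: remove, remove, then two skips once the MinQuorum guard kicks in.
}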

func (a *Autopilot) removeStaleServer(id raft.ServerID) error {
a.logger.Debug("removing server by ID", "id", id)
future := a.raft.RemoveServer(id, 0, 0)
if err := future.Error(); err != nil {
a.logger.Error("failed to remove raft server", "id", id, "error", err)
return err
}
a.logger.Info("removed server", "id", id)
return nil
}

func (a *Autopilot) removeStaleServers(toRemove []raft.ServerID) error {
var result error

for _, id := range toRemove {
err := a.removeStaleServer(id)
if err != nil {
result = multierror.Append(result, err)
}
}

return result
}

func (a *Autopilot) removeFailedServers(toRemove []*Server) {
for _, srv := range toRemove {
a.delegate.RemoveFailedServer(srv)
}
}