Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vault 6815/respect min quorum #23

Merged
merged 15 commits into from
Nov 5, 2022
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ go 1.14

require (
github.com/hashicorp/go-hclog v0.14.1
github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/hashicorp/raft v1.2.0
github.com/stretchr/testify v1.6.1
go.uber.org/goleak v1.1.10
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs
github.com/fatih/color v1.7.0 h1:DkWD4oS2D8LGGgTQ6IvwJJXSL5Vp2ffcQg58nFV38Ys=
github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA=
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/go-cleanhttp v0.5.0/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80=
github.com/hashicorp/go-hclog v0.9.1 h1:9PZfAcVEvez4yhLH2TBU64/h/z4xlFI80cWXRrxuKuM=
github.com/hashicorp/go-hclog v0.9.1/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrjA0H7acj2lQ=
Expand All @@ -20,6 +22,8 @@ github.com/hashicorp/go-immutable-radix v1.0.0 h1:AKDB1HM5PWEA7i4nhcpwOrO2byshxB
github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60=
github.com/hashicorp/go-msgpack v0.5.5 h1:i9R9JSrqIz0QVLz3sz+i3YJdT7TTSLcfLLzJi9aZTuI=
github.com/hashicorp/go-msgpack v0.5.5/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM=
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
github.com/hashicorp/go-retryablehttp v0.5.3/go.mod h1:9B5zBasrRhHXnJnui7y6sL7es7NDiJgTc6Er0maI1Xs=
github.com/hashicorp/go-uuid v1.0.0 h1:RS8zrF7PhGwyNPOtxSClXXj9HA8feRnJzgnI1RJCSnM=
github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
Expand Down
14 changes: 14 additions & 0 deletions mock_promoter.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

148 changes: 105 additions & 43 deletions reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"sort"

"github.com/hashicorp/go-multierror"
"github.com/hashicorp/raft"
)

Expand All @@ -22,7 +23,7 @@ func (a *Autopilot) reconcile() error {
state := a.GetState()

if state == nil || state.Leader == "" {
return fmt.Errorf("Cannote reconcile Raft server voting rights without a valid autopilot state")
return fmt.Errorf("cannot reconcile Raft server voting rights without a valid autopilot state")
}

// have the promoter calculate the required Raft changeset.
Expand Down Expand Up @@ -104,7 +105,7 @@ func (a *Autopilot) applyPromotions(state *State, changes RaftChanges) (bool, er

// when we promoted anything we return true to indicate that the promotion/demotion applying
// process is finished to prevent promotions and demotions in the same round. This is what
// autopilot within Consul used to do so I am keeping the behavior the same for now.
// autopilot within Consul used to do, so I am keeping the behavior the same for now.
return promoted, nil
}

Expand Down Expand Up @@ -151,36 +152,44 @@ func (a *Autopilot) applyDemotions(state *State, changes RaftChanges) (bool, err
return demoted, nil
}

// getFailedServers aggregates all of the information about servers that the consuming application believes are in
// getFailedServers aggregates all the information about servers that the consuming application believes are in
// a failed/left state (indicated by the NodeStatus field on the Server type) as well as stale servers that are
// in the raft configuration but not know to the consuming application. This function will do nothing with
// that information and is purely to collect the data.
func (a *Autopilot) getFailedServers() (*FailedServers, int, error) {
func (a *Autopilot) getFailedServers() (*FailedServers, VoterRegistry, error) {
staleRaftServers := make(map[raft.ServerID]raft.Server)
raftConfig, err := a.getRaftConfiguration()
if err != nil {
return nil, 0, err
return nil, nil, err
}

// Populate a map of all the raft servers. We will
// remove some later on from the map leaving us with
// just the stale servers.
var voters int
registry := make(VoterRegistry)
for _, server := range raftConfig.Servers {
staleRaftServers[server.ID] = server

if server.Suffrage == raft.Voter {
voters++
registry[server.ID] = &VoterEligibility{
currentVoter: server.Suffrage == raft.Voter,
}
}

var failed FailedServers

for id, srv := range a.delegate.KnownServers() {
raftSrv, found := staleRaftServers[id]
if found {
delete(staleRaftServers, id)
} else {
// This server was known to the application,
// but not in the Raft config, so will be ignored
continue
}

// Update the potential suffrage using the supplied predicate.
v, _ := registry[id]
peteski22 marked this conversation as resolved.
Show resolved Hide resolved
v.SetPotentialVoter(a.promoter.PotentialVoterPredicate(srv.NodeType))

if srv.NodeStatus != NodeAlive {
if found && raftSrv.Suffrage == raft.Voter {
failed.FailedVoters = append(failed.FailedVoters, srv)
Expand Down Expand Up @@ -211,7 +220,7 @@ func (a *Autopilot) getFailedServers() (*FailedServers, int, error) {
return failed.FailedVoters[i].ID < failed.FailedVoters[j].ID
})

return &failed, voters, nil
return &failed, registry, nil
}

// pruneDeadServers will find stale raft servers and failed servers as indicated by the consuming application
Expand All @@ -220,7 +229,7 @@ func (a *Autopilot) getFailedServers() (*FailedServers, int, error) {
// removed first. Then stale voters and finally failed servers. For servers with voting rights we will
// cap the number removed so that we do not remove too many at a time and do not remove nodes to the
// point where the number of voters would be below the MinQuorum value from the autopilot config.
// Additionally the delegate will be consulted to determine if all of the removals should be done and
// Additionally, the delegate will be consulted to determine if all the removals should be done and
// can filter the failed servers listings if need be.
func (a *Autopilot) pruneDeadServers() error {
if !a.ReconciliationEnabled() {
Expand All @@ -234,54 +243,107 @@ func (a *Autopilot) pruneDeadServers() error {

state := a.GetState()

failed, voters, err := a.getFailedServers()
failed, vr, err := a.getFailedServers()
if err != nil || failed == nil {
return err
}

failed = a.promoter.FilterFailedServerRemovals(conf, state, failed)

// remove failed non voting servers
for _, srv := range failed.FailedNonVoters {
a.logger.Info("Attempting removal of failed server node", "id", srv.ID, "name", srv.Name, "address", srv.Address)
a.delegate.RemoveFailedServer(srv)
// remove stale non-voters
staleNonVoters := vr.FilterStale(failed.StaleNonVoters)
toRemove := a.adjudicateRemoval(staleNonVoters, vr.PotentialVoters)
if err = a.removeStaleServers(toRemove); err != nil {
return err
}
vr.Remove(toRemove)

// remove stale non voters
for _, id := range failed.StaleNonVoters {
a.logger.Debug("removing stale raft server from configuration", "id", id)
if err := a.removeServer(id); err != nil {
return err
}
// Remove stale voters
staleVoters := vr.FilterStale(failed.StaleVoters)
toRemove = a.adjudicateRemoval(staleVoters, vr.PotentialVoters)
if err = a.removeStaleServers(toRemove); err != nil {
return err
}
vr.Remove(toRemove)

maxRemoval := (voters - 1) / 2
// remove failed non-voters
failedNonVoters := vr.FilterFailed(failed.FailedNonVoters)
toRemove = a.adjudicateRemoval(failedNonVoters, vr.PotentialVoters)
a.removeFailedServers(failed.Get(toRemove, false))
vr.Remove(toRemove)

for _, id := range failed.StaleVoters {
if voters-1 < int(conf.MinQuorum) {
a.logger.Debug("will not remove server as it would leave less voters than the minimum number allowed", "id", id, "min", conf.MinQuorum)
} else if maxRemoval < 1 {
a.logger.Debug("will not remove server as removal of a majority of servers is not safe", "id", id)
} else if err := a.removeServer(id); err != nil {
return err
} else {
// remove failed voters
failedVoters := vr.FilterFailed(failed.FailedVoters)
toRemove = a.adjudicateRemoval(failedVoters, vr.PotentialVoters)
a.removeFailedServers(failed.Get(toRemove, true))
vr.Remove(toRemove)

return nil
}

func (a *Autopilot) adjudicateRemoval(s VoterRegistry, voterCountProvider func() int) []raft.ServerID {
peteski22 marked this conversation as resolved.
Show resolved Hide resolved
var ids []raft.ServerID
maxRemoval := (voterCountProvider() - 1) / 2
minQuorum := a.delegate.AutopilotConfig().MinQuorum

for id, v := range s {
if v != nil && v.IsPotentialVoter() && voterCountProvider()-1 < int(minQuorum) {
a.logger.Debug("will not remove server node as it would leave less voters than the minimum number allowed", "id", id, "min", minQuorum)
} else if v.IsCurrentVoter() && maxRemoval < 1 {
a.logger.Debug("will not remove server node as removal of a majority of voting servers is not safe", "id", id)
} else if v != nil && v.IsCurrentVoter() {
maxRemoval--
voters--
delete(s, id)
peteski22 marked this conversation as resolved.
Show resolved Hide resolved
ids = append(ids, id)
} else {
delete(s, id)
ids = append(ids, id)
}
}

for _, srv := range failed.FailedVoters {
if voters-1 < int(conf.MinQuorum) {
a.logger.Debug("will not remove server as it would leave less voters than the minimum number allowed", "id", srv.ID, "min", conf.MinQuorum)
} else if maxRemoval < 1 {
a.logger.Debug("will not remove server as a removal of a majority of servers is not safe", "id", srv.ID)
} else {
a.logger.Info("Attempting removal of failed server node", "id", srv.ID, "name", srv.Name, "address", srv.Address)
a.delegate.RemoveFailedServer(srv)
maxRemoval--
voters--
return ids
}

func (a *Autopilot) removeStaleServer(id raft.ServerID) error {
a.logger.Debug("removing server by ID", "id", id)
future := a.raft.RemoveServer(id, 0, 0)
if err := future.Error(); err != nil {
a.logger.Error("failed to remove raft server", "id", id, "error", err)
return err
}
a.logger.Info("removed server", "id", id)
return nil
}

func (a *Autopilot) removeStaleServers(toRemove []raft.ServerID) error {
var result error

for _, id := range toRemove {
err := a.removeStaleServer(id)
if err != nil {
result = multierror.Append(result, err)
}
}

return nil
return result
}

func (a *Autopilot) removeFailedServers(toRemove []*Server) {
for _, srv := range toRemove {
a.delegate.RemoveFailedServer(srv)
}
}

func Remove(removeFrom []raft.ServerID, toRemove []raft.ServerID) []raft.ServerID {
var result []raft.ServerID

for _, toRemoveId := range toRemove {
for i, id := range removeFrom {
if removeFrom[i] != toRemoveId {
result = append(result, id)
}
}
}

return result
}
Loading