From 7005cc31185d3eaad5808a0450eb09ec6b4d7ef3 Mon Sep 17 00:00:00 2001 From: peteski22 Date: Tue, 1 Nov 2022 15:36:01 +0000 Subject: [PATCH 01/15] Fixed red squigglies on comments --- reconcile.go | 6 +++--- state.go | 24 ++++++++++++------------ types.go | 2 +- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/reconcile.go b/reconcile.go index 0c2d2b5..a88830d 100644 --- a/reconcile.go +++ b/reconcile.go @@ -22,7 +22,7 @@ func (a *Autopilot) reconcile() error { state := a.GetState() if state == nil || state.Leader == "" { - return fmt.Errorf("Cannote reconcile Raft server voting rights without a valid autopilot state") + return fmt.Errorf("cannot reconcile Raft server voting rights without a valid autopilot state") } // have the promoter calculate the required Raft changeset. @@ -220,7 +220,7 @@ func (a *Autopilot) getFailedServers() (*FailedServers, int, error) { // removed first. Then stale voters and finally failed servers. For servers with voting rights we will // cap the number removed so that we do not remove too many at a time and do not remove nodes to the // point where the number of voters would be below the MinQuorum value from the autopilot config. -// Additionally the delegate will be consulted to determine if all of the removals should be done and +// Additionally, the delegate will be consulted to determine if all the removals should be done and // can filter the failed servers listings if need be. func (a *Autopilot) pruneDeadServers() error { if !a.ReconciliationEnabled() { @@ -241,7 +241,7 @@ func (a *Autopilot) pruneDeadServers() error { failed = a.promoter.FilterFailedServerRemovals(conf, state, failed) - // remove failed non voting servers + // remove failed non-voting servers for _, srv := range failed.FailedNonVoters { a.logger.Info("Attempting removal of failed server node", "id", srv.ID, "name", srv.Name, "address", srv.Address) a.delegate.RemoveFailedServer(srv) diff --git a/state.go b/state.go index 4f1ffb7..bbd502a 100644 --- a/state.go +++ b/state.go @@ -9,7 +9,7 @@ import ( "github.com/hashicorp/raft" ) -// aliveServers will filter the input map of servers and output one with all of the +// aliveServers will filter the input map of servers and output one with all the // servers in a Left state removed. func aliveServers(servers map[raft.ServerID]*Server) map[raft.ServerID]*Server { serverMap := make(map[raft.ServerID]*Server) @@ -74,10 +74,10 @@ func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs // This is really only important for when autopilot is first started. We will use the // first state's time when determining if a server is stable. Under normal circumstances // we need to just check that the current time - the servers StableSince time is greater - // than the configured stabilization time. However while autopilot has been running for + // than the configured stabilization time. However, while autopilot has been running for // less time than the stabilization time we need to consider all servers as stable - // to prevent unnecessary leader elections. Therefore its important to track the first - // time a state was generated so we know if we have a state old enough where there is + // to prevent unnecessary leader elections. Therefore, it's important to track the first + // time a state was generated, so we know if we have a state old enough where there is // any chance of seeing servers as stable based off that configured threshold. var firstStateTime time.Time currentState := a.GetState() @@ -302,7 +302,7 @@ func (a *Autopilot) nextServers(inputs *nextStateInputs) map[raft.ServerID]*Serv func buildServerState(inputs *nextStateInputs, srv raft.Server) ServerState { // Note that the ordering of operations in this method are very important. // We are building up the ServerState from the least important sources - // and overriding them with more up to date values. + // and overriding them with more up-to-date values. // build the basic state from the Raft server state := ServerState{ @@ -337,8 +337,8 @@ func buildServerState(inputs *nextStateInputs, srv raft.Server) ServerState { state.Health = existing.Health previousHealthy = &state.Health.Healthy - // it is is important to note that the map values we retrieved this from are - // stored by value. Therefore we are modifying a copy of what is in the existing + // it is important to note that the map values we retrieved this from are + // stored by value. Therefore, we are modifying a copy of what is in the existing // state and not the actual state itself. We want to ensure that the Address // is what Raft will know about. state.Server = existing.Server @@ -350,7 +350,7 @@ func buildServerState(inputs *nextStateInputs, srv raft.Server) ServerState { if known, found := inputs.KnownServers[srv.ID]; found { // it is important to note that we are modifying a copy of a Server as the // map we retrieved this from has a non-pointer type value. We definitely - // do not want to modify the current known servers but we do want to ensure + // do not want to modify the current known servers, but we do want to ensure // that we do not overwrite the Address state.Server = *known state.Server.Address = srv.Address @@ -367,7 +367,7 @@ func buildServerState(inputs *nextStateInputs, srv raft.Server) ServerState { state.Server.IsLeader = true } - // override the Stats if any where in the fetched results + // override the Stats if any were in the fetched results if stats, found := inputs.FetchedStats[srv.ID]; found { state.Stats = *stats } @@ -421,13 +421,13 @@ func SortServers(ids []raft.ServerID, s *State) { }) } -// ServerLessThan will lookup both servers in the given State and return +// ServerLessThan will look up both servers in the given State and return // true if the first id corresponds to a server that is logically less than // lower than, better than etc. the second server. The following criteria // are considered in order of most important to least important // // 1. A Leader server is always less than all others -// 2. A voter is less than non voters +// 2. A voter is less than non-voters // 3. Healthy servers are less than unhealthy servers // 4. Servers that have been stable longer are consider less than. func ServerLessThan(id1 raft.ServerID, id2 raft.ServerID, s *State) bool { @@ -449,7 +449,7 @@ func ServerLessThan(id1 raft.ServerID, id2 raft.ServerID, s *State) bool { } // at this point we know that the raft state of both nodes is roughly - // equivalent so we want to now sort based on health + // equivalent, so we want to now sort based on health if srvI.Health.Healthy == srvJ.Health.Healthy { if srvI.Health.StableSince.Before(srvJ.Health.StableSince) { return srvI.Health.Healthy diff --git a/types.go b/types.go index 5092bd5..baa8e52 100644 --- a/types.go +++ b/types.go @@ -137,7 +137,7 @@ func (s *ServerState) isHealthy(lastTerm uint64, leaderLastIndex uint64, conf *C } type ServerHealth struct { - // Healthy is whether or not the server is healthy according to the current + // Healthy is whether the server is healthy according to the current // Autopilot config. Healthy bool From 874cbcc3852fc12816bd201995a5a2233f723393 Mon Sep 17 00:00:00 2001 From: peteski22 Date: Tue, 1 Nov 2022 20:58:22 +0000 Subject: [PATCH 02/15] Added additional map to track voter eligibility and tried to centralise logic to handle juding whether a server can be removed or not --- go.mod | 1 + go.sum | 4 ++ mock_promoter.go | 14 +++++ reconcile.go | 144 ++++++++++++++++++++++++++++++++------------- reconcile_test.go | 35 ++++++++++- stable_promoter.go | 4 ++ types.go | 90 ++++++++++++++++++++++++++++ 7 files changed, 248 insertions(+), 44 deletions(-) diff --git a/go.mod b/go.mod index 2dd9ca4..333227b 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.14 require ( github.com/hashicorp/go-hclog v0.14.1 + github.com/hashicorp/go-multierror v1.1.1 // indirect github.com/hashicorp/raft v1.2.0 github.com/stretchr/testify v1.6.1 go.uber.org/goleak v1.1.10 diff --git a/go.sum b/go.sum index e0f4afa..d9f2495 100644 --- a/go.sum +++ b/go.sum @@ -11,6 +11,8 @@ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/fatih/color v1.7.0 h1:DkWD4oS2D8LGGgTQ6IvwJJXSL5Vp2ffcQg58nFV38Ys= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA= +github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/go-cleanhttp v0.5.0/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80= github.com/hashicorp/go-hclog v0.9.1 h1:9PZfAcVEvez4yhLH2TBU64/h/z4xlFI80cWXRrxuKuM= github.com/hashicorp/go-hclog v0.9.1/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrjA0H7acj2lQ= @@ -20,6 +22,8 @@ github.com/hashicorp/go-immutable-radix v1.0.0 h1:AKDB1HM5PWEA7i4nhcpwOrO2byshxB github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60= github.com/hashicorp/go-msgpack v0.5.5 h1:i9R9JSrqIz0QVLz3sz+i3YJdT7TTSLcfLLzJi9aZTuI= github.com/hashicorp/go-msgpack v0.5.5/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM= +github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= +github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= github.com/hashicorp/go-retryablehttp v0.5.3/go.mod h1:9B5zBasrRhHXnJnui7y6sL7es7NDiJgTc6Er0maI1Xs= github.com/hashicorp/go-uuid v1.0.0 h1:RS8zrF7PhGwyNPOtxSClXXj9HA8feRnJzgnI1RJCSnM= github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= diff --git a/mock_promoter.go b/mock_promoter.go index 2e928de..2d95fca 100644 --- a/mock_promoter.go +++ b/mock_promoter.go @@ -90,6 +90,20 @@ func (_m *MockPromoter) GetStateExt(_a0 *Config, _a1 *State) interface{} { return r0 } +// PotentialVoterPredicate provides a mock function with given fields: _a0 +func (_m *MockPromoter) PotentialVoterPredicate(_a0 NodeType) bool { + ret := _m.Called(_a0) + + var r0 bool + if rf, ok := ret.Get(0).(func(NodeType) bool); ok { + r0 = rf(_a0) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + type mockConstructorTestingTNewMockPromoter interface { mock.TestingT Cleanup(func()) diff --git a/reconcile.go b/reconcile.go index a88830d..6502606 100644 --- a/reconcile.go +++ b/reconcile.go @@ -4,6 +4,7 @@ import ( "fmt" "sort" + "github.com/hashicorp/go-multierror" "github.com/hashicorp/raft" ) @@ -104,7 +105,7 @@ func (a *Autopilot) applyPromotions(state *State, changes RaftChanges) (bool, er // when we promoted anything we return true to indicate that the promotion/demotion applying // process is finished to prevent promotions and demotions in the same round. This is what - // autopilot within Consul used to do so I am keeping the behavior the same for now. + // autopilot within Consul used to do, so I am keeping the behavior the same for now. return promoted, nil } @@ -151,36 +152,44 @@ func (a *Autopilot) applyDemotions(state *State, changes RaftChanges) (bool, err return demoted, nil } -// getFailedServers aggregates all of the information about servers that the consuming application believes are in +// getFailedServers aggregates all the information about servers that the consuming application believes are in // a failed/left state (indicated by the NodeStatus field on the Server type) as well as stale servers that are // in the raft configuration but not know to the consuming application. This function will do nothing with // that information and is purely to collect the data. -func (a *Autopilot) getFailedServers() (*FailedServers, int, error) { +func (a *Autopilot) getFailedServers() (*FailedServers, VoterRegistry, error) { staleRaftServers := make(map[raft.ServerID]raft.Server) raftConfig, err := a.getRaftConfiguration() if err != nil { - return nil, 0, err + return nil, nil, err } // Populate a map of all the raft servers. We will // remove some later on from the map leaving us with // just the stale servers. - var voters int + registry := make(VoterRegistry) for _, server := range raftConfig.Servers { staleRaftServers[server.ID] = server - - if server.Suffrage == raft.Voter { - voters++ + registry[server.ID] = &VoterEligibility{ + currentVoter: server.Suffrage == raft.Voter, } } var failed FailedServers + for id, srv := range a.delegate.KnownServers() { raftSrv, found := staleRaftServers[id] if found { delete(staleRaftServers, id) + } else { + // This server was known to the application, + // but not in the Raft config, so will be ignored + continue } + // Update the potential suffrage using the supplied predicate. + v, _ := registry[id] + v.SetPotentialVoter(a.promoter.PotentialVoterPredicate(srv.NodeType)) + if srv.NodeStatus != NodeAlive { if found && raftSrv.Suffrage == raft.Voter { failed.FailedVoters = append(failed.FailedVoters, srv) @@ -211,7 +220,7 @@ func (a *Autopilot) getFailedServers() (*FailedServers, int, error) { return failed.FailedVoters[i].ID < failed.FailedVoters[j].ID }) - return &failed, voters, nil + return &failed, registry, nil } // pruneDeadServers will find stale raft servers and failed servers as indicated by the consuming application @@ -234,54 +243,107 @@ func (a *Autopilot) pruneDeadServers() error { state := a.GetState() - failed, voters, err := a.getFailedServers() + failed, vr, err := a.getFailedServers() if err != nil || failed == nil { return err } failed = a.promoter.FilterFailedServerRemovals(conf, state, failed) - // remove failed non-voting servers - for _, srv := range failed.FailedNonVoters { - a.logger.Info("Attempting removal of failed server node", "id", srv.ID, "name", srv.Name, "address", srv.Address) - a.delegate.RemoveFailedServer(srv) + // remove stale non-voters + staleNonVoters := vr.FilterStale(failed.StaleNonVoters) + toRemove := a.adjudicateRemoval(staleNonVoters, vr.PotentialVoters) + if err = a.removeStaleServers(toRemove); err != nil { + return err } + vr.Remove(toRemove) - // remove stale non voters - for _, id := range failed.StaleNonVoters { - a.logger.Debug("removing stale raft server from configuration", "id", id) - if err := a.removeServer(id); err != nil { - return err - } + // Remove stale voters + staleVoters := vr.FilterStale(failed.StaleVoters) + toRemove = a.adjudicateRemoval(staleVoters, vr.PotentialVoters) + if err = a.removeStaleServers(toRemove); err != nil { + return err } + vr.Remove(toRemove) - maxRemoval := (voters - 1) / 2 + // remove failed non-voters + failedNonVoters := vr.FilterFailed(failed.FailedNonVoters) + toRemove = a.adjudicateRemoval(failedNonVoters, vr.PotentialVoters) + a.removeFailedServers(failed.Get(toRemove, false)) + vr.Remove(toRemove) - for _, id := range failed.StaleVoters { - if voters-1 < int(conf.MinQuorum) { - a.logger.Debug("will not remove server as it would leave less voters than the minimum number allowed", "id", id, "min", conf.MinQuorum) - } else if maxRemoval < 1 { - a.logger.Debug("will not remove server as removal of a majority of servers is not safe", "id", id) - } else if err := a.removeServer(id); err != nil { - return err - } else { + // remove failed voters + failedVoters := vr.FilterFailed(failed.FailedVoters) + toRemove = a.adjudicateRemoval(failedVoters, vr.PotentialVoters) + a.removeFailedServers(failed.Get(toRemove, true)) + vr.Remove(toRemove) + + return nil +} + +func (a *Autopilot) adjudicateRemoval(s VoterRegistry, voterCountProvider func() int) []raft.ServerID { + var ids []raft.ServerID + maxRemoval := (voterCountProvider() - 1) / 2 + minQuorum := a.delegate.AutopilotConfig().MinQuorum + + for id, v := range s { + if v != nil && v.IsPotentialVoter() && voterCountProvider()-1 < int(minQuorum) { + a.logger.Debug("will not remove server node as it would leave less voters than the minimum number allowed", "id", id, "min", minQuorum) + } else if v.IsCurrentVoter() && maxRemoval < 1 { + a.logger.Debug("will not remove server node as removal of a majority of voting servers is not safe", "id", id) + } else if v != nil && v.IsCurrentVoter() { maxRemoval-- - voters-- + delete(s, id) + ids = append(ids, id) + } else { + delete(s, id) + ids = append(ids, id) } } - for _, srv := range failed.FailedVoters { - if voters-1 < int(conf.MinQuorum) { - a.logger.Debug("will not remove server as it would leave less voters than the minimum number allowed", "id", srv.ID, "min", conf.MinQuorum) - } else if maxRemoval < 1 { - a.logger.Debug("will not remove server as a removal of a majority of servers is not safe", "id", srv.ID) - } else { - a.logger.Info("Attempting removal of failed server node", "id", srv.ID, "name", srv.Name, "address", srv.Address) - a.delegate.RemoveFailedServer(srv) - maxRemoval-- - voters-- + return ids +} + +func (a *Autopilot) removeStaleServer(id raft.ServerID) error { + a.logger.Debug("removing server by ID", "id", id) + future := a.raft.RemoveServer(id, 0, 0) + if err := future.Error(); err != nil { + a.logger.Error("failed to remove raft server", "id", id, "error", err) + return err + } + a.logger.Info("removed server", "id", id) + return nil +} + +func (a *Autopilot) removeStaleServers(toRemove []raft.ServerID) error { + var result error + + for _, id := range toRemove { + err := a.removeStaleServer(id) + if err != nil { + result = multierror.Append(result, err) } } - return nil + return result +} + +func (a *Autopilot) removeFailedServers(toRemove []*Server) { + for _, srv := range toRemove { + a.delegate.RemoveFailedServer(srv) + } +} + +func Remove(removeFrom []raft.ServerID, toRemove []raft.ServerID) []raft.ServerID { + var result []raft.ServerID + + for _, toRemoveId := range toRemove { + for i, id := range removeFrom { + if removeFrom[i] != toRemoveId { + result = append(result, id) + } + } + } + + return result } diff --git a/reconcile_test.go b/reconcile_test.go index 466e246..68ae083 100644 --- a/reconcile_test.go +++ b/reconcile_test.go @@ -421,6 +421,7 @@ func TestPruneDeadServers(t *testing.T) { Name: "node6", Address: "198.18.0.6:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }, }, FailedVoters: []*Server{ @@ -429,6 +430,7 @@ func TestPruneDeadServers(t *testing.T) { Name: "node3", Address: "198.18.0.3:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }, }, }, @@ -438,30 +440,35 @@ func TestPruneDeadServers(t *testing.T) { Name: "node1", Address: "198.18.0.1:8300", NodeStatus: NodeAlive, + NodeType: NodeVoter, }, "51fb4248-be6a-43e5-b47f-c089818e2010": { ID: "51fb4248-be6a-43e5-b47f-c089818e2010", Name: "node2", Address: "198.18.0.2:8300", NodeStatus: NodeAlive, + NodeType: NodeVoter, }, "a227f9a9-f55e-4321-b959-5afdcc63c6d4": { ID: "a227f9a9-f55e-4321-b959-5afdcc63c6d4", Name: "node7", Address: "198.18.0.7:8300", NodeStatus: NodeAlive, + NodeType: NodeVoter, }, "3857f1d4-5c23-4016-9078-fee502c0d1be": { ID: "3857f1d4-5c23-4016-9078-fee502c0d1be", Name: "node3", Address: "198.18.0.3:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }, "8830c599-04cc-4b28-9b75-173355d49ab1": { ID: "8830c599-04cc-4b28-9b75-173355d49ab1", Name: "node6", Address: "198.18.0.6:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }, }, state: State{ @@ -472,6 +479,7 @@ func TestPruneDeadServers(t *testing.T) { Name: "node1", Address: "198.18.0.1:8300", NodeStatus: NodeAlive, + NodeType: NodeVoter, }, }, "51fb4248-be6a-43e5-b47f-c089818e2010": { @@ -480,6 +488,7 @@ func TestPruneDeadServers(t *testing.T) { Name: "node2", Address: "198.18.0.2:8300", NodeStatus: NodeAlive, + NodeType: NodeVoter, }, }, "a227f9a9-f55e-4321-b959-5afdcc63c6d4": { @@ -488,6 +497,7 @@ func TestPruneDeadServers(t *testing.T) { Name: "node7", Address: "198.18.0.7:8300", NodeStatus: NodeAlive, + NodeType: NodeVoter, }, }, "3857f1d4-5c23-4016-9078-fee502c0d1be": { @@ -496,6 +506,7 @@ func TestPruneDeadServers(t *testing.T) { Name: "node3", Address: "198.18.0.3:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }, }, "8830c599-04cc-4b28-9b75-173355d49ab1": { @@ -504,6 +515,7 @@ func TestPruneDeadServers(t *testing.T) { Name: "node6", Address: "198.18.0.6:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }, }, }, @@ -524,12 +536,14 @@ func TestPruneDeadServers(t *testing.T) { Name: "node6", Address: "198.18.0.6:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }).Once() mapp.On("RemoveFailedServer", &Server{ ID: "3857f1d4-5c23-4016-9078-fee502c0d1be", Name: "node3", Address: "198.18.0.3:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }).Once() }, }, @@ -569,7 +583,7 @@ func TestPruneDeadServers(t *testing.T) { ID: "0aacc844-1d0a-4ba7-bbc2-bd88d51cb231", Address: "198.18.0.5:8300", }, - // this is going to be our failed non voter + // this is going to be our failed non-voter { Suffrage: raft.Nonvoter, ID: "8830c599-04cc-4b28-9b75-173355d49ab1", @@ -586,6 +600,7 @@ func TestPruneDeadServers(t *testing.T) { Name: "node6", Address: "198.18.0.6:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }, }, FailedVoters: []*Server{ @@ -594,12 +609,14 @@ func TestPruneDeadServers(t *testing.T) { Name: "node3", Address: "198.18.0.3:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }, { ID: "51fb4248-be6a-43e5-b47f-c089818e2010", Name: "node2", Address: "198.18.0.2:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }, }, }, @@ -609,30 +626,35 @@ func TestPruneDeadServers(t *testing.T) { Name: "node1", Address: "198.18.0.1:8300", NodeStatus: NodeAlive, + NodeType: NodeVoter, }, "51fb4248-be6a-43e5-b47f-c089818e2010": { ID: "51fb4248-be6a-43e5-b47f-c089818e2010", Name: "node2", Address: "198.18.0.2:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }, "a227f9a9-f55e-4321-b959-5afdcc63c6d4": { ID: "a227f9a9-f55e-4321-b959-5afdcc63c6d4", Name: "node7", Address: "198.18.0.7:8300", NodeStatus: NodeAlive, + NodeType: NodeVoter, }, "3857f1d4-5c23-4016-9078-fee502c0d1be": { ID: "3857f1d4-5c23-4016-9078-fee502c0d1be", Name: "node3", Address: "198.18.0.3:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }, "8830c599-04cc-4b28-9b75-173355d49ab1": { ID: "8830c599-04cc-4b28-9b75-173355d49ab1", Name: "node6", Address: "198.18.0.6:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }, }, state: State{ @@ -643,6 +665,7 @@ func TestPruneDeadServers(t *testing.T) { Name: "node1", Address: "198.18.0.1:8300", NodeStatus: NodeAlive, + NodeType: NodeVoter, }, State: RaftLeader, }, @@ -652,6 +675,7 @@ func TestPruneDeadServers(t *testing.T) { Name: "node2", Address: "198.18.0.2:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }, State: RaftVoter, }, @@ -661,6 +685,7 @@ func TestPruneDeadServers(t *testing.T) { Name: "node7", Address: "198.18.0.7:8300", NodeStatus: NodeAlive, + NodeType: NodeVoter, }, State: RaftVoter, }, @@ -670,6 +695,7 @@ func TestPruneDeadServers(t *testing.T) { Name: "node3", Address: "198.18.0.3:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }, State: RaftVoter, }, @@ -679,6 +705,7 @@ func TestPruneDeadServers(t *testing.T) { Name: "node6", Address: "198.18.0.6:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }, State: RaftNonVoter, }, @@ -700,12 +727,14 @@ func TestPruneDeadServers(t *testing.T) { Name: "node6", Address: "198.18.0.6:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }).Once() mapp.On("RemoveFailedServer", &Server{ ID: "3857f1d4-5c23-4016-9078-fee502c0d1be", Name: "node3", Address: "198.18.0.3:8300", NodeStatus: NodeFailed, + NodeType: NodeVoter, }).Once() }, }, @@ -719,9 +748,9 @@ func TestPruneDeadServers(t *testing.T) { } mpromoter := NewMockPromoter(t) mpromoter.On("FilterFailedServerRemovals", conf, &tcase.state, &tcase.expectedFailed).Return(&tcase.expectedFailed).Once() - + mpromoter.On("PotentialVoterPredicate", NodeVoter).Return(true) mapp := NewMockApplicationIntegration(t) - mapp.On("AutopilotConfig").Return(conf).Once() + mapp.On("AutopilotConfig").Return(conf).Times(5) mapp.On("KnownServers").Return(tcase.knownServers).Once() mraft := NewMockRaft(t) diff --git a/stable_promoter.go b/stable_promoter.go index 9e8b725..8d982a5 100644 --- a/stable_promoter.go +++ b/stable_promoter.go @@ -52,3 +52,7 @@ func (_ *StablePromoter) CalculatePromotionsAndDemotions(c *Config, s *State) Ra return changes } + +func (_ *StablePromoter) PotentialVoterPredicate(nodeType NodeType) bool { + return nodeType == NodeVoter +} diff --git a/types.go b/types.go index baa8e52..08fd8b0 100644 --- a/types.go +++ b/types.go @@ -291,6 +291,10 @@ type Promoter interface { // failed/stale servers and will return those failed servers which the promoter thinks // should be allowed to be removed. FilterFailedServerRemovals(*Config, *State, *FailedServers) *FailedServers + + // PotentialVoterPredicate takes a NodeType and returns whether that type represents + // a potential voter, based on a predicate implemented by the promoter. + PotentialVoterPredicate(NodeType) bool } // TimeProvider is an interface for getting a local time. This is mainly useful for testing @@ -304,3 +308,89 @@ type runtimeTimeProvider struct{} func (_ *runtimeTimeProvider) Now() time.Time { return time.Now() } + +func (v *VoterEligibility) IsCurrentVoter() bool { + return v.currentVoter +} + +func (v *VoterEligibility) IsPotentialVoter() bool { + return v.potentialVoter +} + +func (v *VoterEligibility) SetPotentialVoter(isVoter bool) { + v.potentialVoter = isVoter +} + +// VoterEligibility represents whether a node can currently vote, +// and if it could potentially vote in the future. +type VoterEligibility struct { + currentVoter bool + potentialVoter bool +} + +type VoterRegistry map[raft.ServerID]*VoterEligibility + +func (vr *VoterRegistry) PotentialVoters() int { + potentialVoters := 0 + + for _, v := range *vr { + if v.IsPotentialVoter() { + potentialVoters++ + } + } + + return potentialVoters +} + +func (vr *VoterRegistry) FilterStale(ids []raft.ServerID) VoterRegistry { + result := make(VoterRegistry) + + for _, id := range ids { + if ve, ok := (*vr)[id]; ok { + result[id] = ve + } + } + + return result +} + +func (vr *VoterRegistry) FilterFailed(ids []*Server) VoterRegistry { + result := make(VoterRegistry) + + for _, srv := range ids { + if ve, ok := (*vr)[srv.ID]; ok { + result[srv.ID] = ve + } + } + + return result +} + +func (f *FailedServers) Get(ids []raft.ServerID, isVoter bool) []*Server { + var servers []*Server + var result []*Server + + if isVoter { + servers = f.FailedVoters + } else { + servers = f.FailedNonVoters + } + + for _, id := range ids { + for _, srv := range servers { + if srv.ID == id { + result = append(result, srv) + } + } + } + + return result +} + +func (vr *VoterRegistry) Remove(ids []raft.ServerID) *VoterRegistry { + for _, id := range ids { + delete(*vr, id) + } + + return vr +} From 32d3f81052987765100c9e5499eb61003e5aa0fb Mon Sep 17 00:00:00 2001 From: Peter Wilson Date: Wed, 2 Nov 2022 13:09:19 +0000 Subject: [PATCH 03/15] Update reconcile.go Co-authored-by: Nick Cabatoff --- reconcile.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reconcile.go b/reconcile.go index 6502606..1c8a7d4 100644 --- a/reconcile.go +++ b/reconcile.go @@ -187,7 +187,7 @@ func (a *Autopilot) getFailedServers() (*FailedServers, VoterRegistry, error) { } // Update the potential suffrage using the supplied predicate. - v, _ := registry[id] + v := registry[id] v.SetPotentialVoter(a.promoter.PotentialVoterPredicate(srv.NodeType)) if srv.NodeStatus != NodeAlive { From e4d643c5983afb0975cc5460428f2a51f6f1c699 Mon Sep 17 00:00:00 2001 From: peteski22 Date: Wed, 2 Nov 2022 13:43:29 +0000 Subject: [PATCH 04/15] PR feedback updates --- mock_promoter.go | 4 +-- reconcile.go | 77 +++++++++++++++++++------------------------ reconcile_test.go | 2 +- stable_promoter.go | 2 +- types.go | 82 ++++++++++++++++++++++++---------------------- 5 files changed, 80 insertions(+), 87 deletions(-) diff --git a/mock_promoter.go b/mock_promoter.go index 2d95fca..7c49635 100644 --- a/mock_promoter.go +++ b/mock_promoter.go @@ -90,8 +90,8 @@ func (_m *MockPromoter) GetStateExt(_a0 *Config, _a1 *State) interface{} { return r0 } -// PotentialVoterPredicate provides a mock function with given fields: _a0 -func (_m *MockPromoter) PotentialVoterPredicate(_a0 NodeType) bool { +// IsPotentialVoter provides a mock function with given fields: _a0 +func (_m *MockPromoter) IsPotentialVoter(_a0 NodeType) bool { ret := _m.Called(_a0) var r0 bool diff --git a/reconcile.go b/reconcile.go index 1c8a7d4..ab6a7fc 100644 --- a/reconcile.go +++ b/reconcile.go @@ -156,7 +156,7 @@ func (a *Autopilot) applyDemotions(state *State, changes RaftChanges) (bool, err // a failed/left state (indicated by the NodeStatus field on the Server type) as well as stale servers that are // in the raft configuration but not know to the consuming application. This function will do nothing with // that information and is purely to collect the data. -func (a *Autopilot) getFailedServers() (*FailedServers, VoterRegistry, error) { +func (a *Autopilot) getFailedServers() (*FailedServers, *VoterRegistry, error) { staleRaftServers := make(map[raft.ServerID]raft.Server) raftConfig, err := a.getRaftConfiguration() if err != nil { @@ -166,10 +166,11 @@ func (a *Autopilot) getFailedServers() (*FailedServers, VoterRegistry, error) { // Populate a map of all the raft servers. We will // remove some later on from the map leaving us with // just the stale servers. - registry := make(VoterRegistry) + registry := NewVoterRegistry() + for _, server := range raftConfig.Servers { staleRaftServers[server.ID] = server - registry[server.ID] = &VoterEligibility{ + registry.Eligibility[server.ID] = &VoterEligibility{ currentVoter: server.Suffrage == raft.Voter, } } @@ -187,8 +188,8 @@ func (a *Autopilot) getFailedServers() (*FailedServers, VoterRegistry, error) { } // Update the potential suffrage using the supplied predicate. - v := registry[id] - v.SetPotentialVoter(a.promoter.PotentialVoterPredicate(srv.NodeType)) + v := registry.Eligibility[id] + v.SetPotentialVoter(a.promoter.IsPotentialVoter(srv.NodeType)) if srv.NodeStatus != NodeAlive { if found && raftSrv.Suffrage == raft.Voter { @@ -251,57 +252,61 @@ func (a *Autopilot) pruneDeadServers() error { failed = a.promoter.FilterFailedServerRemovals(conf, state, failed) // remove stale non-voters - staleNonVoters := vr.FilterStale(failed.StaleNonVoters) - toRemove := a.adjudicateRemoval(staleNonVoters, vr.PotentialVoters) + toRemove := a.adjudicateRemoval(failed.StaleNonVoters, vr) if err = a.removeStaleServers(toRemove); err != nil { return err } - vr.Remove(toRemove) + vr.RemoveAll(toRemove) // Remove stale voters - staleVoters := vr.FilterStale(failed.StaleVoters) - toRemove = a.adjudicateRemoval(staleVoters, vr.PotentialVoters) + toRemove = a.adjudicateRemoval(failed.StaleVoters, vr) if err = a.removeStaleServers(toRemove); err != nil { return err } - vr.Remove(toRemove) + vr.RemoveAll(toRemove) // remove failed non-voters - failedNonVoters := vr.FilterFailed(failed.FailedNonVoters) - toRemove = a.adjudicateRemoval(failedNonVoters, vr.PotentialVoters) - a.removeFailedServers(failed.Get(toRemove, false)) - vr.Remove(toRemove) + failedNonVoters := vr.Filter(failed.FailedNonVoters) + toRemove = a.adjudicateRemoval(failedNonVoters, vr) + a.removeFailedServers(failed.GetFailed(toRemove, false)) + vr.RemoveAll(toRemove) // remove failed voters - failedVoters := vr.FilterFailed(failed.FailedVoters) - toRemove = a.adjudicateRemoval(failedVoters, vr.PotentialVoters) - a.removeFailedServers(failed.Get(toRemove, true)) - vr.Remove(toRemove) + failedVoters := vr.Filter(failed.FailedVoters) + toRemove = a.adjudicateRemoval(failedVoters, vr) + a.removeFailedServers(failed.GetFailed(toRemove, true)) + vr.RemoveAll(toRemove) return nil } -func (a *Autopilot) adjudicateRemoval(s VoterRegistry, voterCountProvider func() int) []raft.ServerID { - var ids []raft.ServerID - maxRemoval := (voterCountProvider() - 1) / 2 +func (a *Autopilot) adjudicateRemoval(ids []raft.ServerID, vr *VoterRegistry) []raft.ServerID { + var result []raft.ServerID + maxRemoval := (vr.PotentialVoters() - 1) / 2 minQuorum := a.delegate.AutopilotConfig().MinQuorum - for id, v := range s { - if v != nil && v.IsPotentialVoter() && voterCountProvider()-1 < int(minQuorum) { + for _, id := range ids { + v, found := vr.Eligibility[id] + if !found { + + } + + if v != nil && v.IsPotentialVoter() && vr.PotentialVoters()-1 < int(minQuorum) { a.logger.Debug("will not remove server node as it would leave less voters than the minimum number allowed", "id", id, "min", minQuorum) } else if v.IsCurrentVoter() && maxRemoval < 1 { a.logger.Debug("will not remove server node as removal of a majority of voting servers is not safe", "id", id) } else if v != nil && v.IsCurrentVoter() { maxRemoval-- - delete(s, id) - ids = append(ids, id) + // We need to update the removal in the registry so the next call for potential voters is accurate + vr.Remove(id) + result = append(result, id) } else { - delete(s, id) - ids = append(ids, id) + delete(vr.Eligibility, id) + result = append(result, id) } } - return ids + return result } func (a *Autopilot) removeStaleServer(id raft.ServerID) error { @@ -333,17 +338,3 @@ func (a *Autopilot) removeFailedServers(toRemove []*Server) { a.delegate.RemoveFailedServer(srv) } } - -func Remove(removeFrom []raft.ServerID, toRemove []raft.ServerID) []raft.ServerID { - var result []raft.ServerID - - for _, toRemoveId := range toRemove { - for i, id := range removeFrom { - if removeFrom[i] != toRemoveId { - result = append(result, id) - } - } - } - - return result -} diff --git a/reconcile_test.go b/reconcile_test.go index 68ae083..4b1f3d7 100644 --- a/reconcile_test.go +++ b/reconcile_test.go @@ -748,7 +748,7 @@ func TestPruneDeadServers(t *testing.T) { } mpromoter := NewMockPromoter(t) mpromoter.On("FilterFailedServerRemovals", conf, &tcase.state, &tcase.expectedFailed).Return(&tcase.expectedFailed).Once() - mpromoter.On("PotentialVoterPredicate", NodeVoter).Return(true) + mpromoter.On("IsPotentialVoter", NodeVoter).Return(true) mapp := NewMockApplicationIntegration(t) mapp.On("AutopilotConfig").Return(conf).Times(5) mapp.On("KnownServers").Return(tcase.knownServers).Once() diff --git a/stable_promoter.go b/stable_promoter.go index 8d982a5..08500be 100644 --- a/stable_promoter.go +++ b/stable_promoter.go @@ -53,6 +53,6 @@ func (_ *StablePromoter) CalculatePromotionsAndDemotions(c *Config, s *State) Ra return changes } -func (_ *StablePromoter) PotentialVoterPredicate(nodeType NodeType) bool { +func (_ *StablePromoter) IsPotentialVoter(nodeType NodeType) bool { return nodeType == NodeVoter } diff --git a/types.go b/types.go index 08fd8b0..2a288cb 100644 --- a/types.go +++ b/types.go @@ -258,6 +258,27 @@ type FailedServers struct { FailedVoters []*Server } +func (f *FailedServers) GetFailed(ids []raft.ServerID, isVoter bool) []*Server { + var servers []*Server + var result []*Server + + if isVoter { + servers = f.FailedVoters + } else { + servers = f.FailedNonVoters + } + + for _, id := range ids { + for _, srv := range servers { + if srv.ID == id { + result = append(result, srv) + } + } + } + + return result +} + // Promoter is an interface to provide promotion/demotion algorithms to the core autopilot type. // The BasicPromoter satisfies this interface and will promote any stable servers but other // algorithms could be implemented. The implementation of these methods shouldn't "block". @@ -292,9 +313,9 @@ type Promoter interface { // should be allowed to be removed. FilterFailedServerRemovals(*Config, *State, *FailedServers) *FailedServers - // PotentialVoterPredicate takes a NodeType and returns whether that type represents + // IsPotentialVoter takes a NodeType and returns whether that type represents // a potential voter, based on a predicate implemented by the promoter. - PotentialVoterPredicate(NodeType) bool + IsPotentialVoter(NodeType) bool } // TimeProvider is an interface for getting a local time. This is mainly useful for testing @@ -328,12 +349,20 @@ type VoterEligibility struct { potentialVoter bool } -type VoterRegistry map[raft.ServerID]*VoterEligibility +type VoterRegistry struct { + Eligibility map[raft.ServerID]*VoterEligibility +} + +func NewVoterRegistry() *VoterRegistry { + var result VoterRegistry + result.Eligibility = make(map[raft.ServerID]*VoterEligibility) + return &result +} func (vr *VoterRegistry) PotentialVoters() int { potentialVoters := 0 - for _, v := range *vr { + for _, v := range vr.Eligibility { if v.IsPotentialVoter() { potentialVoters++ } @@ -342,55 +371,28 @@ func (vr *VoterRegistry) PotentialVoters() int { return potentialVoters } -func (vr *VoterRegistry) FilterStale(ids []raft.ServerID) VoterRegistry { - result := make(VoterRegistry) - - for _, id := range ids { - if ve, ok := (*vr)[id]; ok { - result[id] = ve - } - } - - return result -} - -func (vr *VoterRegistry) FilterFailed(ids []*Server) VoterRegistry { - result := make(VoterRegistry) +func (vr *VoterRegistry) Filter(ids []*Server) []raft.ServerID { + var result []raft.ServerID for _, srv := range ids { - if ve, ok := (*vr)[srv.ID]; ok { - result[srv.ID] = ve + if _, ok := vr.Eligibility[srv.ID]; ok { + result = append(result, srv.ID) } } return result } -func (f *FailedServers) Get(ids []raft.ServerID, isVoter bool) []*Server { - var servers []*Server - var result []*Server - - if isVoter { - servers = f.FailedVoters - } else { - servers = f.FailedNonVoters - } - +func (vr *VoterRegistry) RemoveAll(ids []raft.ServerID) *VoterRegistry { for _, id := range ids { - for _, srv := range servers { - if srv.ID == id { - result = append(result, srv) - } - } + vr.Remove(id) } - return result + return vr } -func (vr *VoterRegistry) Remove(ids []raft.ServerID) *VoterRegistry { - for _, id := range ids { - delete(*vr, id) - } +func (vr *VoterRegistry) Remove(id raft.ServerID) *VoterRegistry { + delete(vr.Eligibility, id) return vr } From 08bf949704ddf6d6e22787d7f2e605f2c7dbefa8 Mon Sep 17 00:00:00 2001 From: peteski22 Date: Wed, 2 Nov 2022 14:21:05 +0000 Subject: [PATCH 05/15] Removed call to delete on the underlying map during the adjudication --- reconcile.go | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/reconcile.go b/reconcile.go index ab6a7fc..a8ec48a 100644 --- a/reconcile.go +++ b/reconcile.go @@ -251,6 +251,9 @@ func (a *Autopilot) pruneDeadServers() error { failed = a.promoter.FilterFailedServerRemovals(conf, state, failed) + // Remove servers in order of increasing precedence + // Update the voter registry after each batch is processed + // remove stale non-voters toRemove := a.adjudicateRemoval(failed.StaleNonVoters, vr) if err = a.removeStaleServers(toRemove); err != nil { @@ -282,7 +285,9 @@ func (a *Autopilot) pruneDeadServers() error { func (a *Autopilot) adjudicateRemoval(ids []raft.ServerID, vr *VoterRegistry) []raft.ServerID { var result []raft.ServerID - maxRemoval := (vr.PotentialVoters() - 1) / 2 + initialPotentialVoters := vr.PotentialVoters() + removedPotentialVoters := 0 + maxRemoval := (initialPotentialVoters - 1) / 2 minQuorum := a.delegate.AutopilotConfig().MinQuorum for _, id := range ids { @@ -291,17 +296,17 @@ func (a *Autopilot) adjudicateRemoval(ids []raft.ServerID, vr *VoterRegistry) [] } - if v != nil && v.IsPotentialVoter() && vr.PotentialVoters()-1 < int(minQuorum) { + if v != nil && v.IsPotentialVoter() && initialPotentialVoters-removedPotentialVoters-1 < int(minQuorum) { a.logger.Debug("will not remove server node as it would leave less voters than the minimum number allowed", "id", id, "min", minQuorum) } else if v.IsCurrentVoter() && maxRemoval < 1 { a.logger.Debug("will not remove server node as removal of a majority of voting servers is not safe", "id", id) } else if v != nil && v.IsCurrentVoter() { maxRemoval-- - // We need to update the removal in the registry so the next call for potential voters is accurate - vr.Remove(id) + // We need to track how many voters we have removed from the registry + // to ensure the total remaining potential voters is accurate + removedPotentialVoters++ result = append(result, id) } else { - delete(vr.Eligibility, id) result = append(result, id) } } From e9f4b47d58969dd0c6ab2a947c5d6e455d14325e Mon Sep 17 00:00:00 2001 From: peteski22 Date: Wed, 2 Nov 2022 15:01:34 +0000 Subject: [PATCH 06/15] Added test, made most things private for the package --- reconcile.go | 38 +++++------ reconcile_test.go | 157 +++++++++++++++++++++++++++++++++++++++++++++- types.go | 40 ++++++------ 3 files changed, 195 insertions(+), 40 deletions(-) diff --git a/reconcile.go b/reconcile.go index a8ec48a..118b0c6 100644 --- a/reconcile.go +++ b/reconcile.go @@ -156,7 +156,7 @@ func (a *Autopilot) applyDemotions(state *State, changes RaftChanges) (bool, err // a failed/left state (indicated by the NodeStatus field on the Server type) as well as stale servers that are // in the raft configuration but not know to the consuming application. This function will do nothing with // that information and is purely to collect the data. -func (a *Autopilot) getFailedServers() (*FailedServers, *VoterRegistry, error) { +func (a *Autopilot) getFailedServers() (*FailedServers, *voterRegistry, error) { staleRaftServers := make(map[raft.ServerID]raft.Server) raftConfig, err := a.getRaftConfiguration() if err != nil { @@ -166,11 +166,11 @@ func (a *Autopilot) getFailedServers() (*FailedServers, *VoterRegistry, error) { // Populate a map of all the raft servers. We will // remove some later on from the map leaving us with // just the stale servers. - registry := NewVoterRegistry() + registry := newVoterRegistry() for _, server := range raftConfig.Servers { staleRaftServers[server.ID] = server - registry.Eligibility[server.ID] = &VoterEligibility{ + registry.eligibility[server.ID] = &voterEligibility{ currentVoter: server.Suffrage == raft.Voter, } } @@ -188,8 +188,8 @@ func (a *Autopilot) getFailedServers() (*FailedServers, *VoterRegistry, error) { } // Update the potential suffrage using the supplied predicate. - v := registry.Eligibility[id] - v.SetPotentialVoter(a.promoter.IsPotentialVoter(srv.NodeType)) + v := registry.eligibility[id] + v.setPotentialVoter(a.promoter.IsPotentialVoter(srv.NodeType)) if srv.NodeStatus != NodeAlive { if found && raftSrv.Suffrage == raft.Voter { @@ -259,48 +259,48 @@ func (a *Autopilot) pruneDeadServers() error { if err = a.removeStaleServers(toRemove); err != nil { return err } - vr.RemoveAll(toRemove) + vr.removeAll(toRemove) // Remove stale voters toRemove = a.adjudicateRemoval(failed.StaleVoters, vr) if err = a.removeStaleServers(toRemove); err != nil { return err } - vr.RemoveAll(toRemove) + vr.removeAll(toRemove) // remove failed non-voters - failedNonVoters := vr.Filter(failed.FailedNonVoters) + failedNonVoters := vr.filter(failed.FailedNonVoters) toRemove = a.adjudicateRemoval(failedNonVoters, vr) - a.removeFailedServers(failed.GetFailed(toRemove, false)) - vr.RemoveAll(toRemove) + a.removeFailedServers(failed.getFailed(toRemove, false)) + vr.removeAll(toRemove) // remove failed voters - failedVoters := vr.Filter(failed.FailedVoters) + failedVoters := vr.filter(failed.FailedVoters) toRemove = a.adjudicateRemoval(failedVoters, vr) - a.removeFailedServers(failed.GetFailed(toRemove, true)) - vr.RemoveAll(toRemove) + a.removeFailedServers(failed.getFailed(toRemove, true)) + vr.removeAll(toRemove) return nil } -func (a *Autopilot) adjudicateRemoval(ids []raft.ServerID, vr *VoterRegistry) []raft.ServerID { +func (a *Autopilot) adjudicateRemoval(ids []raft.ServerID, vr *voterRegistry) []raft.ServerID { var result []raft.ServerID - initialPotentialVoters := vr.PotentialVoters() + initialPotentialVoters := vr.potentialVoters() removedPotentialVoters := 0 maxRemoval := (initialPotentialVoters - 1) / 2 minQuorum := a.delegate.AutopilotConfig().MinQuorum for _, id := range ids { - v, found := vr.Eligibility[id] + v, found := vr.eligibility[id] if !found { } - if v != nil && v.IsPotentialVoter() && initialPotentialVoters-removedPotentialVoters-1 < int(minQuorum) { + if v != nil && v.isPotentialVoter() && initialPotentialVoters-removedPotentialVoters-1 < int(minQuorum) { a.logger.Debug("will not remove server node as it would leave less voters than the minimum number allowed", "id", id, "min", minQuorum) - } else if v.IsCurrentVoter() && maxRemoval < 1 { + } else if v.isCurrentVoter() && maxRemoval < 1 { a.logger.Debug("will not remove server node as removal of a majority of voting servers is not safe", "id", id) - } else if v != nil && v.IsCurrentVoter() { + } else if v != nil && v.isCurrentVoter() { maxRemoval-- // We need to track how many voters we have removed from the registry // to ensure the total remaining potential voters is accurate diff --git a/reconcile_test.go b/reconcile_test.go index 4b1f3d7..80a302f 100644 --- a/reconcile_test.go +++ b/reconcile_test.go @@ -738,6 +738,161 @@ func TestPruneDeadServers(t *testing.T) { }).Once() }, }, + "ignore-stabilizing-nodes": { + raftConfig: raft.Configuration{ + Servers: []raft.Server{ + { + Suffrage: raft.Voter, + ID: "51b2d56e-816e-409a-8b8e-afef2cf49661", + Address: "198.18.0.1:8300", + }, + { + Suffrage: raft.Voter, + ID: "51fb4248-be6a-43e5-b47f-c089818e2012", + Address: "198.18.0.2:8300", + }, + // this is going to be our failed voter + { + Suffrage: raft.Voter, + ID: "a227f9a9-f55e-4321-b959-5afdcc63c6d3", + Address: "198.18.0.3:8300", + }, + // this is going to be our stale non-voter + // (it won't be known to the delegate that supplies 'KnownServers' + { + Suffrage: raft.Nonvoter, + ID: "db877f23-3e0a-4107-8ed8-bd7c3d710944", + Address: "198.18.0.4:8300", + }, + // this is going to be our failed non-voter + { + Suffrage: raft.Nonvoter, + ID: "8830c599-04cc-4b28-9b75-173355d49ab5", + Address: "198.18.0.5:8300", + }, + }, + }, + expectedFailed: FailedServers{ + StaleNonVoters: []raft.ServerID{ + "db877f23-3e0a-4107-8ed8-bd7c3d710944", + }, + FailedNonVoters: []*Server{ + { + ID: "8830c599-04cc-4b28-9b75-173355d49ab5", + Name: "node5", + Address: "198.18.0.5:8300", + NodeStatus: NodeFailed, + NodeType: NodeVoter, + }, + }, + FailedVoters: []*Server{ + { + ID: "a227f9a9-f55e-4321-b959-5afdcc63c6d3", + Name: "node3", + Address: "198.18.0.3:8300", + NodeStatus: NodeFailed, + NodeType: NodeVoter, + }, + }, + }, + knownServers: map[raft.ServerID]*Server{ + "51b2d56e-816e-409a-8b8e-afef2cf49661": { + ID: "51b2d56e-816e-409a-8b8e-afef2cf49661", + Name: "node1", + Address: "198.18.0.1:8300", + NodeStatus: NodeAlive, + NodeType: NodeVoter, + }, + "51fb4248-be6a-43e5-b47f-c089818e2012": { + ID: "51fb4248-be6a-43e5-b47f-c089818e2012", + Name: "node2", + Address: "198.18.0.2:8300", + NodeStatus: NodeAlive, + NodeType: NodeVoter, + }, + "a227f9a9-f55e-4321-b959-5afdcc63c6d3": { + ID: "a227f9a9-f55e-4321-b959-5afdcc63c6d3", + Name: "node3", + Address: "198.18.0.3:8300", + NodeStatus: NodeFailed, + NodeType: NodeVoter, + }, + "8830c599-04cc-4b28-9b75-173355d49ab5": { + ID: "8830c599-04cc-4b28-9b75-173355d49ab5", + Name: "node5", + Address: "198.18.0.5:8300", + NodeStatus: NodeFailed, + NodeType: NodeVoter, + }, + }, + state: State{ + Servers: map[raft.ServerID]*ServerState{ + "51b2d56e-816e-409a-8b8e-afef2cf49661": { + Server: Server{ + ID: "51b2d56e-816e-409a-8b8e-afef2cf49661", + Name: "node1", + Address: "198.18.0.1:8300", + NodeStatus: NodeAlive, + NodeType: NodeVoter, + }, State: RaftVoter, + }, + "51fb4248-be6a-43e5-b47f-c089818e2012": { + Server: Server{ + ID: "51fb4248-be6a-43e5-b47f-c089818e2012", + Name: "node2", + Address: "198.18.0.2:8300", + NodeStatus: NodeAlive, + NodeType: NodeVoter, + }, State: RaftVoter, + }, + "a227f9a9-f55e-4321-b959-5afdcc63c6d3": { + Server: Server{ + ID: "a227f9a9-f55e-4321-b959-5afdcc63c6d3", + Name: "node3", + Address: "198.18.0.3:8300", + NodeStatus: NodeFailed, + NodeType: NodeVoter, + }, State: RaftVoter, + }, + // Stale non-voter + "db877f23-3e0a-4107-8ed8-bd7c3d710944": { + Server: Server{ + ID: "db877f23-3e0a-4107-8ed8-bd7c3d710944", + Name: "node4", + Address: "198.18.0.4:8300", + NodeStatus: NodeLeft, + NodeType: NodeVoter, + }, State: RaftNonVoter, + }, + // Failed non-voter + "8830c599-04cc-4b28-9b75-173355d49ab5": { + Server: Server{ + ID: "8830c599-04cc-4b28-9b75-173355d49ab5", + Name: "node5", + Address: "198.18.0.5:8300", + NodeStatus: NodeFailed, + NodeType: NodeVoter, + }, State: RaftNonVoter, + }, + }, + }, + setupExpectations: func(mraft *MockRaft, mapp *MockApplicationIntegration) { + // Stale non-voter + mraft.On("RemoveServer", + raft.ServerID("db877f23-3e0a-4107-8ed8-bd7c3d710944"), + uint64(0), + time.Duration(0), + ).Return(&raftIndexFuture{}).Once() + // Failed non-voter + mapp.On("RemoveFailedServer", &Server{ + ID: "8830c599-04cc-4b28-9b75-173355d49ab5", + Name: "node5", + Address: "198.18.0.5:8300", + NodeStatus: NodeFailed, + NodeType: NodeVoter, + }).Once() + }, + }, } for name, tcase := range cases { @@ -748,7 +903,7 @@ func TestPruneDeadServers(t *testing.T) { } mpromoter := NewMockPromoter(t) mpromoter.On("FilterFailedServerRemovals", conf, &tcase.state, &tcase.expectedFailed).Return(&tcase.expectedFailed).Once() - mpromoter.On("IsPotentialVoter", NodeVoter).Return(true) + mpromoter.On("isPotentialVoter", NodeVoter).Return(true) mapp := NewMockApplicationIntegration(t) mapp.On("AutopilotConfig").Return(conf).Times(5) mapp.On("KnownServers").Return(tcase.knownServers).Once() diff --git a/types.go b/types.go index 2a288cb..0e12d2e 100644 --- a/types.go +++ b/types.go @@ -258,7 +258,7 @@ type FailedServers struct { FailedVoters []*Server } -func (f *FailedServers) GetFailed(ids []raft.ServerID, isVoter bool) []*Server { +func (f *FailedServers) getFailed(ids []raft.ServerID, isVoter bool) []*Server { var servers []*Server var result []*Server @@ -330,40 +330,40 @@ func (_ *runtimeTimeProvider) Now() time.Time { return time.Now() } -func (v *VoterEligibility) IsCurrentVoter() bool { +func (v *voterEligibility) isCurrentVoter() bool { return v.currentVoter } -func (v *VoterEligibility) IsPotentialVoter() bool { +func (v *voterEligibility) isPotentialVoter() bool { return v.potentialVoter } -func (v *VoterEligibility) SetPotentialVoter(isVoter bool) { +func (v *voterEligibility) setPotentialVoter(isVoter bool) { v.potentialVoter = isVoter } -// VoterEligibility represents whether a node can currently vote, +// voterEligibility represents whether a node can currently vote, // and if it could potentially vote in the future. -type VoterEligibility struct { +type voterEligibility struct { currentVoter bool potentialVoter bool } -type VoterRegistry struct { - Eligibility map[raft.ServerID]*VoterEligibility +type voterRegistry struct { + eligibility map[raft.ServerID]*voterEligibility } -func NewVoterRegistry() *VoterRegistry { - var result VoterRegistry - result.Eligibility = make(map[raft.ServerID]*VoterEligibility) +func newVoterRegistry() *voterRegistry { + var result voterRegistry + result.eligibility = make(map[raft.ServerID]*voterEligibility) return &result } -func (vr *VoterRegistry) PotentialVoters() int { +func (vr *voterRegistry) potentialVoters() int { potentialVoters := 0 - for _, v := range vr.Eligibility { - if v.IsPotentialVoter() { + for _, v := range vr.eligibility { + if v.isPotentialVoter() { potentialVoters++ } } @@ -371,11 +371,11 @@ func (vr *VoterRegistry) PotentialVoters() int { return potentialVoters } -func (vr *VoterRegistry) Filter(ids []*Server) []raft.ServerID { +func (vr *voterRegistry) filter(ids []*Server) []raft.ServerID { var result []raft.ServerID for _, srv := range ids { - if _, ok := vr.Eligibility[srv.ID]; ok { + if _, ok := vr.eligibility[srv.ID]; ok { result = append(result, srv.ID) } } @@ -383,16 +383,16 @@ func (vr *VoterRegistry) Filter(ids []*Server) []raft.ServerID { return result } -func (vr *VoterRegistry) RemoveAll(ids []raft.ServerID) *VoterRegistry { +func (vr *voterRegistry) removeAll(ids []raft.ServerID) *voterRegistry { for _, id := range ids { - vr.Remove(id) + vr.remove(id) } return vr } -func (vr *VoterRegistry) Remove(id raft.ServerID) *VoterRegistry { - delete(vr.Eligibility, id) +func (vr *voterRegistry) remove(id raft.ServerID) *voterRegistry { + delete(vr.eligibility, id) return vr } From 7123c5cc5041def17836b05bf77fe36391f84ea7 Mon Sep 17 00:00:00 2001 From: peteski22 Date: Wed, 2 Nov 2022 15:05:16 +0000 Subject: [PATCH 07/15] Remove spurious check for finding the voter eligibility info --- reconcile.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/reconcile.go b/reconcile.go index 118b0c6..35123c1 100644 --- a/reconcile.go +++ b/reconcile.go @@ -291,10 +291,7 @@ func (a *Autopilot) adjudicateRemoval(ids []raft.ServerID, vr *voterRegistry) [] minQuorum := a.delegate.AutopilotConfig().MinQuorum for _, id := range ids { - v, found := vr.eligibility[id] - if !found { - - } + v := vr.eligibility[id] if v != nil && v.isPotentialVoter() && initialPotentialVoters-removedPotentialVoters-1 < int(minQuorum) { a.logger.Debug("will not remove server node as it would leave less voters than the minimum number allowed", "id", id, "min", minQuorum) From 0ef5700432775b07bff07ef78beb3db0a8f6274c Mon Sep 17 00:00:00 2001 From: peteski22 Date: Wed, 2 Nov 2022 15:44:47 +0000 Subject: [PATCH 08/15] Bad refactor - corrected method name in test --- reconcile_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reconcile_test.go b/reconcile_test.go index 80a302f..72aff01 100644 --- a/reconcile_test.go +++ b/reconcile_test.go @@ -903,7 +903,7 @@ func TestPruneDeadServers(t *testing.T) { } mpromoter := NewMockPromoter(t) mpromoter.On("FilterFailedServerRemovals", conf, &tcase.state, &tcase.expectedFailed).Return(&tcase.expectedFailed).Once() - mpromoter.On("isPotentialVoter", NodeVoter).Return(true) + mpromoter.On("IsPotentialVoter", NodeVoter).Return(true) mapp := NewMockApplicationIntegration(t) mapp.On("AutopilotConfig").Return(conf).Times(5) mapp.On("KnownServers").Return(tcase.knownServers).Once() From 1bfc30290bffc9ce74f93c4b8d589b4a32f4f1da Mon Sep 17 00:00:00 2001 From: peteski22 Date: Wed, 2 Nov 2022 15:50:49 +0000 Subject: [PATCH 09/15] Another test --- reconcile_test.go | 106 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/reconcile_test.go b/reconcile_test.go index 72aff01..0da1e1a 100644 --- a/reconcile_test.go +++ b/reconcile_test.go @@ -893,6 +893,112 @@ func TestPruneDeadServers(t *testing.T) { }).Once() }, }, + "stale-non-voters": { + // 2 working nodes and 2 stale servers - should only remove stale + // non-voter and refuse to remove anything else + raftConfig: raft.Configuration{ + Servers: []raft.Server{ + { + Suffrage: raft.Voter, + ID: "51b2d56e-816e-409a-8b8e-afef2cf49661", + Address: "198.18.0.1:8300", + }, + { + Suffrage: raft.Voter, + ID: "51fb4248-be6a-43e5-b47f-c089818e2012", + Address: "198.18.0.2:8300", + }, + // this is going to be our stale voter + { + Suffrage: raft.Voter, + ID: "3857f1d4-5c23-4016-9078-fee502c0d1b4", + Address: "198.18.0.4:8300", + }, + // this is going to be our stale non-voter + { + Suffrage: raft.Nonvoter, + ID: "db877f23-3e0a-4107-8ed8-bd7c3d710945", + Address: "198.18.0.5:8300", + }, + }, + }, + knownServers: map[raft.ServerID]*Server{ + "51b2d56e-816e-409a-8b8e-afef2cf49661": { + ID: "51b2d56e-816e-409a-8b8e-afef2cf49661", + Name: "node1", + Address: "198.18.0.1:8300", + NodeStatus: NodeAlive, + NodeType: NodeVoter, + }, + "51fb4248-be6a-43e5-b47f-c089818e2012": { + ID: "51fb4248-be6a-43e5-b47f-c089818e2012", + Name: "node2", + Address: "198.18.0.2:8300", + NodeStatus: NodeAlive, + NodeType: NodeVoter, + }, + }, + state: State{ + Servers: map[raft.ServerID]*ServerState{ + "51b2d56e-816e-409a-8b8e-afef2cf49661": { + Server: Server{ + ID: "51b2d56e-816e-409a-8b8e-afef2cf49661", + Name: "node1", + Address: "198.18.0.1:8300", + NodeStatus: NodeAlive, + NodeType: NodeVoter, + }, + State: RaftLeader, + }, + "51fb4248-be6a-43e5-b47f-c089818e2012": { + Server: Server{ + ID: "51fb4248-be6a-43e5-b47f-c089818e2012", + Name: "node2", + Address: "198.18.0.2:8300", + NodeStatus: NodeAlive, + NodeType: NodeVoter, + }, + State: RaftVoter, + }, + "3857f1d4-5c23-4016-9078-fee502c0d1b4": { + Server: Server{ + ID: "3857f1d4-5c23-4016-9078-fee502c0d1b4", + Name: "node4", + Address: "198.18.0.4:8300", + NodeStatus: NodeFailed, + NodeType: NodeVoter, + }, + State: RaftVoter, + }, + "db877f23-3e0a-4107-8ed8-bd7c3d710945": { + Server: Server{ + ID: "db877f23-3e0a-4107-8ed8-bd7c3d710945", + Name: "node5", + Address: "198.18.0.5:8300", + NodeStatus: NodeFailed, + NodeType: NodeVoter, + }, + State: RaftNonVoter, + }, + }, + }, + expectedFailed: FailedServers{ + StaleNonVoters: []raft.ServerID{ + "db877f23-3e0a-4107-8ed8-bd7c3d710945", + }, + StaleVoters: []raft.ServerID{ + "3857f1d4-5c23-4016-9078-fee502c0d1b4", + }, + }, + setupExpectations: func(mraft *MockRaft, mapp *MockApplicationIntegration) { + // Stale non-voter + mraft.On("RemoveServer", + raft.ServerID("db877f23-3e0a-4107-8ed8-bd7c3d710945"), + uint64(0), + time.Duration(0), + ).Return(&raftIndexFuture{}).Once() + }, + }, } for name, tcase := range cases { From 029e90d63bb6e4d97e9c1ec32ab4c43c499b583e Mon Sep 17 00:00:00 2001 From: Peter Wilson Date: Thu, 3 Nov 2022 15:16:27 +0000 Subject: [PATCH 10/15] Update reconcile.go Great catch, thanks for that. Co-authored-by: Matt Keeler --- reconcile.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reconcile.go b/reconcile.go index 35123c1..d34d948 100644 --- a/reconcile.go +++ b/reconcile.go @@ -297,7 +297,7 @@ func (a *Autopilot) adjudicateRemoval(ids []raft.ServerID, vr *voterRegistry) [] a.logger.Debug("will not remove server node as it would leave less voters than the minimum number allowed", "id", id, "min", minQuorum) } else if v.isCurrentVoter() && maxRemoval < 1 { a.logger.Debug("will not remove server node as removal of a majority of voting servers is not safe", "id", id) - } else if v != nil && v.isCurrentVoter() { + } else if v != nil && v.isPotentialVoter() { maxRemoval-- // We need to track how many voters we have removed from the registry // to ensure the total remaining potential voters is accurate From cd5e495ef39d191a5861ea362345d3290f4b3796 Mon Sep 17 00:00:00 2001 From: Peter Wilson Date: Thu, 3 Nov 2022 15:17:01 +0000 Subject: [PATCH 11/15] Update reconcile_test.go Co-authored-by: Matt Keeler --- reconcile_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/reconcile_test.go b/reconcile_test.go index 0da1e1a..3be74c0 100644 --- a/reconcile_test.go +++ b/reconcile_test.go @@ -895,7 +895,8 @@ func TestPruneDeadServers(t *testing.T) { }, "stale-non-voters": { // 2 working nodes and 2 stale servers - should only remove stale - // non-voter and refuse to remove anything else + // non-voter and refuse to remove anything else due to restrictions + // preventing removal half or more of the servers in the cluster at once. raftConfig: raft.Configuration{ Servers: []raft.Server{ { From dbaeaff58aad7e61d208c4dfedcfbb088500e079 Mon Sep 17 00:00:00 2001 From: peteski22 Date: Thu, 3 Nov 2022 15:47:53 +0000 Subject: [PATCH 12/15] Tests for min quorum --- reconcile_test.go | 124 +++++++++++++++++++++++----------------------- 1 file changed, 63 insertions(+), 61 deletions(-) diff --git a/reconcile_test.go b/reconcile_test.go index 3be74c0..0463b8b 100644 --- a/reconcile_test.go +++ b/reconcile_test.go @@ -738,7 +738,7 @@ func TestPruneDeadServers(t *testing.T) { }).Once() }, }, - "ignore-stabilizing-nodes": { + "dont-go-under-min-quorum-with-3": { raftConfig: raft.Configuration{ Servers: []raft.Server{ { @@ -757,34 +757,9 @@ func TestPruneDeadServers(t *testing.T) { ID: "a227f9a9-f55e-4321-b959-5afdcc63c6d3", Address: "198.18.0.3:8300", }, - // this is going to be our stale non-voter - // (it won't be known to the delegate that supplies 'KnownServers' - { - Suffrage: raft.Nonvoter, - ID: "db877f23-3e0a-4107-8ed8-bd7c3d710944", - Address: "198.18.0.4:8300", - }, - // this is going to be our failed non-voter - { - Suffrage: raft.Nonvoter, - ID: "8830c599-04cc-4b28-9b75-173355d49ab5", - Address: "198.18.0.5:8300", - }, }, }, expectedFailed: FailedServers{ - StaleNonVoters: []raft.ServerID{ - "db877f23-3e0a-4107-8ed8-bd7c3d710944", - }, - FailedNonVoters: []*Server{ - { - ID: "8830c599-04cc-4b28-9b75-173355d49ab5", - Name: "node5", - Address: "198.18.0.5:8300", - NodeStatus: NodeFailed, - NodeType: NodeVoter, - }, - }, FailedVoters: []*Server{ { ID: "a227f9a9-f55e-4321-b959-5afdcc63c6d3", @@ -817,13 +792,6 @@ func TestPruneDeadServers(t *testing.T) { NodeStatus: NodeFailed, NodeType: NodeVoter, }, - "8830c599-04cc-4b28-9b75-173355d49ab5": { - ID: "8830c599-04cc-4b28-9b75-173355d49ab5", - Name: "node5", - Address: "198.18.0.5:8300", - NodeStatus: NodeFailed, - NodeType: NodeVoter, - }, }, state: State{ Servers: map[raft.ServerID]*ServerState{ @@ -854,48 +822,82 @@ func TestPruneDeadServers(t *testing.T) { NodeType: NodeVoter, }, State: RaftVoter, }, - // Stale non-voter - "db877f23-3e0a-4107-8ed8-bd7c3d710944": { + }, + }, + setupExpectations: func(mraft *MockRaft, mapp *MockApplicationIntegration) { + }, + }, + "dont-go-under-min-quorum-with-2": { + raftConfig: raft.Configuration{ + Servers: []raft.Server{ + { + Suffrage: raft.Voter, + ID: "51b2d56e-816e-409a-8b8e-afef2cf49661", + Address: "198.18.0.1:8300", + }, + // this is going to be our failed voter + { + Suffrage: raft.Voter, + ID: "a227f9a9-f55e-4321-b959-5afdcc63c6d3", + Address: "198.18.0.3:8300", + }, + }, + }, + expectedFailed: FailedServers{ + FailedVoters: []*Server{ + { + ID: "a227f9a9-f55e-4321-b959-5afdcc63c6d3", + Name: "node3", + Address: "198.18.0.3:8300", + NodeStatus: NodeFailed, + NodeType: NodeVoter, + }, + }, + }, + knownServers: map[raft.ServerID]*Server{ + "51b2d56e-816e-409a-8b8e-afef2cf49661": { + ID: "51b2d56e-816e-409a-8b8e-afef2cf49661", + Name: "node1", + Address: "198.18.0.1:8300", + NodeStatus: NodeAlive, + NodeType: NodeVoter, + }, + "a227f9a9-f55e-4321-b959-5afdcc63c6d3": { + ID: "a227f9a9-f55e-4321-b959-5afdcc63c6d3", + Name: "node3", + Address: "198.18.0.3:8300", + NodeStatus: NodeFailed, + NodeType: NodeVoter, + }, + }, + state: State{ + Servers: map[raft.ServerID]*ServerState{ + "51b2d56e-816e-409a-8b8e-afef2cf49661": { Server: Server{ - ID: "db877f23-3e0a-4107-8ed8-bd7c3d710944", - Name: "node4", - Address: "198.18.0.4:8300", - NodeStatus: NodeLeft, + ID: "51b2d56e-816e-409a-8b8e-afef2cf49661", + Name: "node1", + Address: "198.18.0.1:8300", + NodeStatus: NodeAlive, NodeType: NodeVoter, - }, State: RaftNonVoter, + }, State: RaftVoter, }, - // Failed non-voter - "8830c599-04cc-4b28-9b75-173355d49ab5": { + "a227f9a9-f55e-4321-b959-5afdcc63c6d3": { Server: Server{ - ID: "8830c599-04cc-4b28-9b75-173355d49ab5", - Name: "node5", - Address: "198.18.0.5:8300", + ID: "a227f9a9-f55e-4321-b959-5afdcc63c6d3", + Name: "node3", + Address: "198.18.0.3:8300", NodeStatus: NodeFailed, NodeType: NodeVoter, - }, State: RaftNonVoter, + }, State: RaftVoter, }, }, }, setupExpectations: func(mraft *MockRaft, mapp *MockApplicationIntegration) { - // Stale non-voter - mraft.On("RemoveServer", - raft.ServerID("db877f23-3e0a-4107-8ed8-bd7c3d710944"), - uint64(0), - time.Duration(0), - ).Return(&raftIndexFuture{}).Once() - // Failed non-voter - mapp.On("RemoveFailedServer", &Server{ - ID: "8830c599-04cc-4b28-9b75-173355d49ab5", - Name: "node5", - Address: "198.18.0.5:8300", - NodeStatus: NodeFailed, - NodeType: NodeVoter, - }).Once() }, }, "stale-non-voters": { // 2 working nodes and 2 stale servers - should only remove stale - // non-voter and refuse to remove anything else due to restrictions + // non-voter and refuse to remove anything else due to restrictions // preventing removal half or more of the servers in the cluster at once. raftConfig: raft.Configuration{ Servers: []raft.Server{ From 2ff6a1865c9a6848ad6b0d500d553fa2fdd1eb7f Mon Sep 17 00:00:00 2001 From: peteski22 Date: Thu, 3 Nov 2022 15:54:31 +0000 Subject: [PATCH 13/15] Fix added test --- reconcile_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/reconcile_test.go b/reconcile_test.go index 0463b8b..e1bfce0 100644 --- a/reconcile_test.go +++ b/reconcile_test.go @@ -751,16 +751,16 @@ func TestPruneDeadServers(t *testing.T) { ID: "51fb4248-be6a-43e5-b47f-c089818e2012", Address: "198.18.0.2:8300", }, - // this is going to be our failed voter + // this is going to be our failed non-voter { - Suffrage: raft.Voter, + Suffrage: raft.Nonvoter, ID: "a227f9a9-f55e-4321-b959-5afdcc63c6d3", Address: "198.18.0.3:8300", }, }, }, expectedFailed: FailedServers{ - FailedVoters: []*Server{ + FailedNonVoters: []*Server{ { ID: "a227f9a9-f55e-4321-b959-5afdcc63c6d3", Name: "node3", @@ -820,7 +820,7 @@ func TestPruneDeadServers(t *testing.T) { Address: "198.18.0.3:8300", NodeStatus: NodeFailed, NodeType: NodeVoter, - }, State: RaftVoter, + }, State: RaftNonVoter, }, }, }, From e1db596dca153531c384276d3b73d63e18f06bf5 Mon Sep 17 00:00:00 2001 From: peteski22 Date: Thu, 3 Nov 2022 15:58:21 +0000 Subject: [PATCH 14/15] Try to make it clear the rules we're basing the code on --- reconcile.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/reconcile.go b/reconcile.go index d34d948..7a8fa88 100644 --- a/reconcile.go +++ b/reconcile.go @@ -251,8 +251,10 @@ func (a *Autopilot) pruneDeadServers() error { failed = a.promoter.FilterFailedServerRemovals(conf, state, failed) - // Remove servers in order of increasing precedence - // Update the voter registry after each batch is processed + // Remove servers in order of increasing precedence (and update the registry) + // Rules: + // 1. Deal with non-voters first as their removal shouldn't impact cluster stability. + // 2. Handle 'stale' before 'failed' in order to make progress towards the applications desired server set. // remove stale non-voters toRemove := a.adjudicateRemoval(failed.StaleNonVoters, vr) From 47a597d4c70aa3847cb3cc069fccfc1a5c0625b2 Mon Sep 17 00:00:00 2001 From: peteski22 Date: Thu, 3 Nov 2022 16:06:40 +0000 Subject: [PATCH 15/15] TIL: Variadic functions/ellipses in Go --- reconcile.go | 8 ++++---- types.go | 10 ++-------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/reconcile.go b/reconcile.go index 7a8fa88..a314492 100644 --- a/reconcile.go +++ b/reconcile.go @@ -261,26 +261,26 @@ func (a *Autopilot) pruneDeadServers() error { if err = a.removeStaleServers(toRemove); err != nil { return err } - vr.removeAll(toRemove) + vr.remove(toRemove...) // Remove stale voters toRemove = a.adjudicateRemoval(failed.StaleVoters, vr) if err = a.removeStaleServers(toRemove); err != nil { return err } - vr.removeAll(toRemove) + vr.remove(toRemove...) // remove failed non-voters failedNonVoters := vr.filter(failed.FailedNonVoters) toRemove = a.adjudicateRemoval(failedNonVoters, vr) a.removeFailedServers(failed.getFailed(toRemove, false)) - vr.removeAll(toRemove) + vr.remove(toRemove...) // remove failed voters failedVoters := vr.filter(failed.FailedVoters) toRemove = a.adjudicateRemoval(failedVoters, vr) a.removeFailedServers(failed.getFailed(toRemove, true)) - vr.removeAll(toRemove) + vr.remove(toRemove...) return nil } diff --git a/types.go b/types.go index 0e12d2e..2943768 100644 --- a/types.go +++ b/types.go @@ -383,16 +383,10 @@ func (vr *voterRegistry) filter(ids []*Server) []raft.ServerID { return result } -func (vr *voterRegistry) removeAll(ids []raft.ServerID) *voterRegistry { +func (vr *voterRegistry) remove(ids ...raft.ServerID) *voterRegistry { for _, id := range ids { - vr.remove(id) + delete(vr.eligibility, id) } return vr } - -func (vr *voterRegistry) remove(id raft.ServerID) *voterRegistry { - delete(vr.eligibility, id) - - return vr -}