Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backport of VAULT-14048: raft-autopilot appears to refuse to remove a node which has left and wouldn't impact stability into release/1.13.x #20436

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions changelog/19472.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
autopilot: Update version to v0.2.0 to add better support for respecting min quorum
```
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ require (
github.com/hashicorp/hcp-sdk-go v0.23.0
github.com/hashicorp/nomad/api v0.0.0-20220707195938-75f4c2237b28
github.com/hashicorp/raft v1.3.10
github.com/hashicorp/raft-autopilot v0.1.6
github.com/hashicorp/raft-autopilot v0.2.0
github.com/hashicorp/raft-boltdb/v2 v2.0.0-20210421194847-a7e34179d62c
github.com/hashicorp/raft-snapshot v1.0.4
github.com/hashicorp/vault-plugin-auth-alicloud v0.14.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -1129,8 +1129,8 @@ github.com/hashicorp/raft v1.1.2-0.20191002163536-9c6bd3e3eb17/go.mod h1:vPAJM8A
github.com/hashicorp/raft v1.2.0/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
github.com/hashicorp/raft v1.3.10 h1:LR5QZX1VQd0DFWZfeCwWawyeKfpS/Tm1yjnJIY5X4Tw=
github.com/hashicorp/raft v1.3.10/go.mod h1:J8naEwc6XaaCfts7+28whSeRvCqTd6e20BlCU3LtEO4=
github.com/hashicorp/raft-autopilot v0.1.6 h1:C1q3RNF2FfXNZfHWbvVAu0QixaQK8K5pX4O5lh+9z4I=
github.com/hashicorp/raft-autopilot v0.1.6/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw=
github.com/hashicorp/raft-autopilot v0.2.0 h1:2/R2RPgamgRKgNWGQioULZvjeKXQZmDuw5Ty+6c+H7Y=
github.com/hashicorp/raft-autopilot v0.2.0/go.mod h1:q6tZ8UAZ5xio2gv2JvjgmtOlh80M6ic8xQYBe2Egkg8=
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk=
github.com/hashicorp/raft-boltdb/v2 v2.0.0-20210421194847-a7e34179d62c h1:oiKun9QlrOz5yQxMZJ3tf1kWtFYuKSJzxzEDxDPevj4=
github.com/hashicorp/raft-boltdb/v2 v2.0.0-20210421194847-a7e34179d62c/go.mod h1:kiPs9g148eLShc2TYagUAyKDnD+dH9U+CQKsXzlY9xo=
Expand Down
31 changes: 31 additions & 0 deletions helper/testhelpers/testhelpers_oss.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,34 @@ func WaitForActiveNodeAndStandbys(t testing.T, cluster *vault.TestCluster) {
}
}
}

// WaitForNodesExcludingSelectedStandbys is variation on WaitForActiveNodeAndStandbys.
// It waits for the active node before waiting for standby nodes, however
// it will not wait for cores with indexes that match those specified as arguments.
// Whilst you could specify index 0 which is likely to be the leader node, the function
// checks for the leader first regardless of the indexes to skip, so it would be redundant to do so.
// The intention/use case for this function is to allow a cluster to start and become active with one
// or more nodes not joined, so that we can test scenarios where a node joins later.
// e.g. 4 nodes in the cluster, only 3 nodes in cluster 'active', 1 node can be joined later in tests.
func WaitForNodesExcludingSelectedStandbys(t testing.T, cluster *vault.TestCluster, indexesToSkip ...int) {
WaitForActiveNode(t, cluster)

contains := func(elems []int, e int) bool {
for _, v := range elems {
if v == e {
return true
}
}

return false
}
for i, core := range cluster.Cores {
if contains(indexesToSkip, i) {
continue
}

if standby, _ := core.Core.Standby(); standby {
WaitForStandbyNode(t, core)
}
}
}
19 changes: 17 additions & 2 deletions physical/raft/raft_autopilot.go
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ type Delegate struct {
emptyVersionLogs map[raft.ServerID]struct{}
}

func newDelegate(b *RaftBackend) *Delegate {
func NewDelegate(b *RaftBackend) *Delegate {
return &Delegate{
RaftBackend: b,
inflightRemovals: make(map[raft.ServerID]bool),
Expand Down Expand Up @@ -382,6 +382,7 @@ func (d *Delegate) KnownServers() map[raft.ServerID]*autopilot.Server {
return nil
}

apServerStates := d.autopilot.GetState().Servers
servers := future.Configuration().Servers
serverIDs := make([]string, 0, len(servers))
for _, server := range servers {
Expand Down Expand Up @@ -425,6 +426,19 @@ func (d *Delegate) KnownServers() map[raft.ServerID]*autopilot.Server {
Ext: d.autopilotServerExt(state),
}

// As KnownServers is a delegate called by autopilot let's check if we already
// had this data in the correct format and use it. If we don't (which sounds a
// bit sad, unless this ISN'T a voter) then as a fail-safe, let's try what we've
// done elsewhere in code to check the desired suffrage and manually set NodeType
// based on whether that's a voter or not. If we don't do either of these
// things, NodeType isn't set which means technically it's not a voter.
// It shouldn't be a voter and end up in this state.
if apServerState, found := apServerStates[raft.ServerID(id)]; found && apServerState.Server.NodeType != "" {
server.NodeType = apServerState.Server.NodeType
} else if state.DesiredSuffrage == "voter" {
server.NodeType = autopilot.NodeVoter
}

switch state.IsDead.Load() {
case true:
d.logger.Debug("informing autopilot that the node left", "id", id)
Expand All @@ -442,6 +456,7 @@ func (d *Delegate) KnownServers() map[raft.ServerID]*autopilot.Server {
Name: d.localID,
RaftVersion: raft.ProtocolVersionMax,
NodeStatus: autopilot.NodeAlive,
NodeType: autopilot.NodeVoter, // The leader must be a voter
Meta: d.meta(&FollowerState{
UpgradeVersion: d.EffectiveVersion(),
RedundancyZone: d.RedundancyZone(),
Expand Down Expand Up @@ -817,7 +832,7 @@ func (b *RaftBackend) SetupAutopilot(ctx context.Context, storageConfig *Autopil
if b.autopilotUpdateInterval != 0 {
options = append(options, autopilot.WithUpdateInterval(b.autopilotUpdateInterval))
}
b.autopilot = autopilot.New(b.raft, newDelegate(b), options...)
b.autopilot = autopilot.New(b.raft, NewDelegate(b), options...)
b.followerStates = followerStates
b.followerHeartbeatTicker = time.NewTicker(1 * time.Second)

Expand Down
Loading