mirror of
https://github.com/hashicorp/vault.git
synced 2026-01-09 18:51:12 +01:00
Backport of VAULT-37080: Maintain redundancy zones for unhealthy nodes into release/1.20.x (#31445)
* backport of commit beeb8c20325f01370e523ccc6033a377e7086ce6 * VAULT-37080: Fix changelog and comment (#31449) --------- Co-authored-by: miagilepner <mia.epner@hashicorp.com>
This commit is contained in:
parent
4b31e3e68a
commit
10576ee3d1
3
changelog/31443.txt
Normal file
3
changelog/31443.txt
Normal file
@ -0,0 +1,3 @@
|
||||
```release-note:bug
|
||||
raft/autopilot: Fixes an issue with enterprise redundancy zones where, if the leader was in a redundancy zone and became unavailable, that node would become an unzoned voter. This could artificially inflate the number of nodes required for quorum, leading to a situation where the cluster cannot recover if another leader subsequently becomes unavailable. Vault will now keep an unavailable node in its last known redundancy zone as a non-voter.
|
||||
```
|
||||
@ -31,6 +31,7 @@ const (
|
||||
CleanupDeadServersFalse CleanupDeadServersValue = 2
|
||||
AutopilotUpgradeVersionTag string = "upgrade_version"
|
||||
AutopilotRedundancyZoneTag string = "redundancy_zone"
|
||||
emptyRedundancyZone = ""
|
||||
)
|
||||
|
||||
func (c CleanupDeadServersValue) Value() bool {
|
||||
@ -211,6 +212,14 @@ type FollowerState struct {
|
||||
RedundancyZone string
|
||||
}
|
||||
|
||||
func (f *FollowerState) neverSentHeartbeat() bool {
|
||||
// We can't use LastHeartbeat to determine if the node has been contacted,
|
||||
// since we set it for every node in the cluster during post-unseal.
|
||||
// Instead, check if the node has reported any applied index or term.
|
||||
// These will both be 0 if the node has never sent an echo.
|
||||
return f.LastTerm == 0 && f.AppliedIndex == 0
|
||||
}
|
||||
|
||||
// partialCopy returns a partial copy of the follower state.
|
||||
// This copy uses the same pointer to the IsDead
|
||||
// atomic field. We need to do this to ensure that
|
||||
@ -231,8 +240,9 @@ func (f *FollowerState) partialCopy() *FollowerState {
|
||||
|
||||
// PersistedFollowerState holds the information that gets persisted to storage
|
||||
type PersistedFollowerState struct {
|
||||
Version string `json:"version"`
|
||||
UpgradeVersion string `json:"upgrade_version"`
|
||||
Version string `json:"version"`
|
||||
UpgradeVersion string `json:"upgrade_version"`
|
||||
RedundancyZone *string `json:"redundancy_zone"`
|
||||
}
|
||||
|
||||
type PersistedFollowerStates struct {
|
||||
@ -257,7 +267,9 @@ func (p *PersistedFollowerStates) shouldUpdate(state *autopilot.State, grabLock
|
||||
return true
|
||||
}
|
||||
if server.Server.Version != persistedServer.Version ||
|
||||
server.Server.Meta[AutopilotUpgradeVersionTag] != persistedServer.UpgradeVersion {
|
||||
server.Server.Meta[AutopilotUpgradeVersionTag] != persistedServer.UpgradeVersion ||
|
||||
persistedServer.RedundancyZone == nil ||
|
||||
server.Server.Meta[AutopilotRedundancyZoneTag] != *persistedServer.RedundancyZone {
|
||||
return true
|
||||
}
|
||||
}
|
||||
@ -272,9 +284,11 @@ func (d *Delegate) updatePersistedState(state *autopilot.State) error {
|
||||
}
|
||||
newStates := make(map[string]PersistedFollowerState)
|
||||
for id, server := range state.Servers {
|
||||
redundancyZone := server.Server.Meta[AutopilotRedundancyZoneTag]
|
||||
newStates[string(id)] = PersistedFollowerState{
|
||||
Version: server.Server.Version,
|
||||
UpgradeVersion: server.Server.Meta[AutopilotUpgradeVersionTag],
|
||||
RedundancyZone: &redundancyZone,
|
||||
}
|
||||
}
|
||||
d.persistedState.l.Lock()
|
||||
@ -524,11 +538,13 @@ func (d *Delegate) KnownServers() map[raft.ServerID]*autopilot.Server {
|
||||
|
||||
currentServerID := raft.ServerID(id)
|
||||
followerVersion, upgradeVersion := d.determineFollowerVersions(id, state)
|
||||
if state.UpgradeVersion != upgradeVersion {
|
||||
// we only have a read lock on state, so we can't modify it
|
||||
redundancyZone := d.determineRedundancyZone(id, state)
|
||||
if state.UpgradeVersion != upgradeVersion || state.RedundancyZone != redundancyZone {
|
||||
// We only have a read lock on state, so we can't modify it
|
||||
// safely. Instead, copy it to override the upgrade version and redundancy zone
|
||||
state = state.partialCopy()
|
||||
state.UpgradeVersion = upgradeVersion
|
||||
state.RedundancyZone = redundancyZone
|
||||
}
|
||||
|
||||
server := &autopilot.Server{
|
||||
@ -583,6 +599,39 @@ func (d *Delegate) KnownServers() map[raft.ServerID]*autopilot.Server {
|
||||
return ret
|
||||
}
|
||||
|
||||
// determineRedundancyZone will return the correct redundancy zone to report for
|
||||
// the given follower
|
||||
func (d *Delegate) determineRedundancyZone(id string, state *FollowerState) string {
|
||||
// If the follower has a non-empty redundancy zone, use that
|
||||
if state.RedundancyZone != emptyRedundancyZone {
|
||||
return state.RedundancyZone
|
||||
}
|
||||
|
||||
// If we don't have any persisted states (which can happen on an upgrade to
|
||||
// 1.18) we don't have any other option; return an empty zone (which is the
|
||||
// same as the follower's reported state)
|
||||
if len(d.persistedState.States) == 0 {
|
||||
return emptyRedundancyZone
|
||||
}
|
||||
|
||||
// If we've ever gotten a heartbeat from this follower, then trust that
|
||||
// it reported its empty zone correctly
|
||||
if !state.neverSentHeartbeat() {
|
||||
return emptyRedundancyZone
|
||||
}
|
||||
|
||||
// Check if we have any persisted state for this follower, and if that
|
||||
// state includes a redundancy zone. If either of these doesn't exist, we
|
||||
// can't know the follower's zone. Assume it's empty.
|
||||
persistedState, ok := d.persistedState.States[id]
|
||||
if !ok || persistedState.RedundancyZone == nil {
|
||||
return emptyRedundancyZone
|
||||
}
|
||||
|
||||
d.logger.Debug("using redundancy zone from persisted states", "id", id, "redundancy_zone", *persistedState.RedundancyZone)
|
||||
return *persistedState.RedundancyZone
|
||||
}
|
||||
|
||||
// determineFollowerVersions uses the following logic:
|
||||
// - if the version and upgrade version are present in the follower state,
|
||||
// return those.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user