From cc6409222ce246ed72d067debe6ffeb8f62f9dad Mon Sep 17 00:00:00 2001 From: Mike Palmiotto Date: Mon, 18 Jul 2022 12:37:12 -0600 Subject: [PATCH] Vault 6773/raft rejoin nonvoter (#16324) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * raft: Ensure init before setting suffrage As reported in https://hashicorp.atlassian.net/browse/VAULT-6773: The /sys/storage/raft/join endpoint is intended to be unauthenticated. We rely on the seal to manage trust. It’s possible to use multiple join requests to switch nodes from voter to non-voter. The screenshot shows a 3 node cluster where vault_2 is the leader, and vault_3 and vault_4 are followers with non-voters set to false. sent two requests to the raft join endpoint to have vault_3 and vault_4 join the cluster with non_voters:true. This commit fixes the issue by delaying the call to SetDesiredSuffrage until after the initialization check, preventing unauthenticated mangling of voter status. Tested locally using https://github.com/hashicorp/vault-tools/blob/main/users/ncabatoff/cluster/raft.sh and the reproducer outlined in VAULT-6773. * raft: Return join err on failure This is necessary to correctly distinguish errors returned from the Join workflow. Previously, errors were being masked as timeouts. * raft: Default autopilot parameters in teststorage Change some defaults so we don't have to pass in parameters or set them in the originating tests. These storage types are only used in two places: 1) Raft HA testing 2) Seal migration testing Both consumers have been tested and pass with this change. * changelog: Unauthn voter status change bugfix --- changelog/16324.txt | 3 +++ helper/testhelpers/teststorage/teststorage.go | 8 ++++--- .../teststorage/teststorage_reusable.go | 9 ++++---- vault/raft.go | 21 ++++++++++++------- 4 files changed, 27 insertions(+), 14 deletions(-) create mode 100644 changelog/16324.txt diff --git a/changelog/16324.txt b/changelog/16324.txt new file mode 100644 index 0000000000..3d4cfbf2a5 --- /dev/null +++ b/changelog/16324.txt @@ -0,0 +1,3 @@ +```release-note:bug +storage/raft (enterprise): Prevent unauthenticated voter status change with rejoin +``` diff --git a/helper/testhelpers/teststorage/teststorage.go b/helper/testhelpers/teststorage/teststorage.go index 915d435dab..bf342e11eb 100644 --- a/helper/testhelpers/teststorage/teststorage.go +++ b/helper/testhelpers/teststorage/teststorage.go @@ -137,9 +137,11 @@ func RaftHAFactory(f PhysicalBackendBundler) func(t testing.T, coreIdx int, logg nodeID := fmt.Sprintf("core-%d", coreIdx) backendConf := map[string]string{ - "path": raftDir, - "node_id": nodeID, - "performance_multiplier": "8", + "path": raftDir, + "node_id": nodeID, + "performance_multiplier": "8", + "autopilot_reconcile_interval": "300ms", + "autopilot_update_interval": "100ms", } // Create and set the HA Backend diff --git a/helper/testhelpers/teststorage/teststorage_reusable.go b/helper/testhelpers/teststorage/teststorage_reusable.go index 69f3ec310d..257a5a0184 100644 --- a/helper/testhelpers/teststorage/teststorage_reusable.go +++ b/helper/testhelpers/teststorage/teststorage_reusable.go @@ -18,7 +18,6 @@ import ( // seal migration, wherein a given physical backend must be re-used as several // test clusters are sequentially created, tested, and discarded. type ReusableStorage struct { - // IsRaft specifies whether the storage is using a raft backend. IsRaft bool @@ -169,9 +168,11 @@ func makeRaftDir(t testing.T) string { func makeReusableRaftBackend(t testing.T, coreIdx int, logger hclog.Logger, raftDir string, addressProvider raftlib.ServerAddressProvider, ha bool) *vault.PhysicalBackendBundle { nodeID := fmt.Sprintf("core-%d", coreIdx) conf := map[string]string{ - "path": raftDir, - "node_id": nodeID, - "performance_multiplier": "8", + "path": raftDir, + "node_id": nodeID, + "performance_multiplier": "8", + "autopilot_reconcile_interval": "300ms", + "autopilot_update_interval": "100ms", } backend, err := raft.NewRaftBackend(conf, logger) diff --git a/vault/raft.go b/vault/raft.go index 89397e18db..7799f4a0bb 100644 --- a/vault/raft.go +++ b/vault/raft.go @@ -835,11 +835,6 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo return false, errors.New("raft backend not in use") } - if err := raftBackend.SetDesiredSuffrage(nonVoter); err != nil { - c.logger.Error("failed to set desired suffrage for this node", "error", err) - return false, nil - } - init, err := c.InitializedLocally(ctx) if err != nil { return false, fmt.Errorf("failed to check if core is initialized: %w", err) @@ -963,10 +958,21 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo if err != nil { return fmt.Errorf("failed to check if core is initialized: %w", err) } - if init && !isRaftHAOnly { + + // InitializedLocally will return non-nil before HA backends are + // initialized. c.Initialized(ctx) checks InitializedLocally first, so + // we can't use that generically for both cases. Instead check + // raftBackend.Initialized() directly for the HA-Only case. + if (!isRaftHAOnly && init) || (isRaftHAOnly && raftBackend.Initialized()) { c.logger.Info("returning from raft join as the node is initialized") return nil } + + if err := raftBackend.SetDesiredSuffrage(nonVoter); err != nil { + c.logger.Error("failed to set desired suffrage for this node", "error", err) + return nil + } + challengeCh := make(chan *raftInformation) var expandedJoinInfos []*raft.LeaderJoinInfo for _, leaderInfo := range leaderInfos { @@ -1001,6 +1007,7 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo select { case <-ctx.Done(): + err = ctx.Err() case raftInfo := <-challengeCh: if raftInfo != nil { err = answerChallenge(ctx, raftInfo) @@ -1009,7 +1016,7 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo } } } - return fmt.Errorf("timed out on raft join: %w", ctx.Err()) + return err } switch retryFailures {