From c8c51b1b9dc358ad0e1b2ed6e7dbbf3697ec81d3 Mon Sep 17 00:00:00 2001 From: Ryan Cragun Date: Tue, 24 Sep 2024 09:13:40 -0600 Subject: [PATCH] VAULT-30819: verify DR secondary leader before unsealing followers (#28459) * VAULT-30819: verify DR secondary leader before unsealing followers After we've enabled DR replication on the secondary leader, the existing cluster followers will be resealed with the primary cluster's encryption keys. We have to unseal the followers to make them available. To ensure that we absolutely take every precaution before attempting to unseal the followers we now verify that the secondary leader is the cluster leader, has a valid merkle tree, and is streaming WALs from the primary cluster before we attempt to unseal the secondary followers. Signed-off-by: Ryan Cragun --- enos/enos-dev-scenario-pr-replication.hcl | 2 +- enos/enos-modules.hcl | 4 +- enos/enos-scenario-dr-replication.hcl | 14 +++- enos/enos-scenario-pr-replication.hcl | 2 +- enos/modules/vault_setup_dr_primary/main.tf | 6 +- ...onfigure-vault-dr-primary.sh => enable.sh} | 0 .../vault_setup_replication_secondary/main.tf | 42 +++++++++++- .../scripts/wait-for-leader-ready.sh | 65 +++++++++++++++++++ .../main.tf | 0 .../scripts/unseal-node.sh | 0 .../scripts/wait-until-sealed.sh | 0 11 files changed, 125 insertions(+), 10 deletions(-) rename enos/modules/vault_setup_dr_primary/scripts/{configure-vault-dr-primary.sh => enable.sh} (100%) create mode 100644 enos/modules/vault_setup_replication_secondary/scripts/wait-for-leader-ready.sh rename enos/modules/{vault_unseal_nodes => vault_unseal_replication_followers}/main.tf (100%) rename enos/modules/{vault_unseal_nodes => vault_unseal_replication_followers}/scripts/unseal-node.sh (100%) rename enos/modules/{vault_unseal_nodes => vault_unseal_replication_followers}/scripts/wait-until-sealed.sh (100%) diff --git a/enos/enos-dev-scenario-pr-replication.hcl b/enos/enos-dev-scenario-pr-replication.hcl index 4f3797c3c8..70765182d5 100644 
--- a/enos/enos-dev-scenario-pr-replication.hcl +++ b/enos/enos-dev-scenario-pr-replication.hcl @@ -815,7 +815,7 @@ scenario "dev_pr_replication" { Depending on how we're configured we'll pass the unseal keys according to this guide: https://developer.hashicorp.com/vault/docs/enterprise/replication#seals EOF - module = module.vault_unseal_nodes + module = module.vault_unseal_replication_followers depends_on = [ step.create_primary_cluster, step.create_secondary_cluster, diff --git a/enos/enos-modules.hcl b/enos/enos-modules.hcl index d9fb53c470..9a11042d5f 100644 --- a/enos/enos-modules.hcl +++ b/enos/enos-modules.hcl @@ -256,8 +256,8 @@ module "vault_test_ui" { ui_run_tests = var.ui_run_tests } -module "vault_unseal_nodes" { - source = "./modules/vault_unseal_nodes" +module "vault_unseal_replication_followers" { + source = "./modules/vault_unseal_replication_followers" vault_install_dir = var.vault_install_dir } diff --git a/enos/enos-scenario-dr-replication.hcl b/enos/enos-scenario-dr-replication.hcl index e39e6b0c04..20b8836dca 100644 --- a/enos/enos-scenario-dr-replication.hcl +++ b/enos/enos-scenario-dr-replication.hcl @@ -814,7 +814,11 @@ scenario "dr_replication" { enos = local.enos_provider[matrix.distro] } - verifies = quality.vault_api_sys_replication_dr_secondary_enable_write + verifies = [ + quality.vault_api_sys_leader_read, + quality.vault_api_sys_replication_dr_secondary_enable_write, + quality.vault_api_sys_replication_dr_status_read, + ] variables { ip_version = matrix.ip_version @@ -834,7 +838,7 @@ scenario "dr_replication" { type combinations. See the guide for more information: https://developer.hashicorp.com/vault/docs/enterprise/replication#seals EOF - module = module.vault_unseal_nodes + module = module.vault_unseal_replication_followers depends_on = [ step.configure_dr_replication_secondary ] @@ -883,7 +887,11 @@ scenario "dr_replication" { and ensuring that all secondary nodes are unsealed. 
EOF module = module.vault_verify_dr_replication - depends_on = [step.configure_dr_replication_secondary] + depends_on = [ + step.configure_dr_replication_secondary, + step.unseal_secondary_followers, + step.verify_secondary_cluster_is_unsealed_after_enabling_replication, + ] providers = { enos = local.enos_provider[matrix.distro] diff --git a/enos/enos-scenario-pr-replication.hcl b/enos/enos-scenario-pr-replication.hcl index 93001420bd..1f8aa8682f 100644 --- a/enos/enos-scenario-pr-replication.hcl +++ b/enos/enos-scenario-pr-replication.hcl @@ -820,7 +820,7 @@ scenario "pr_replication" { type combinations. See the guide for more information: https://developer.hashicorp.com/vault/docs/enterprise/replication#seals EOF - module = module.vault_unseal_nodes + module = module.vault_unseal_replication_followers depends_on = [ step.create_primary_cluster, step.create_secondary_cluster, diff --git a/enos/modules/vault_setup_dr_primary/main.tf b/enos/modules/vault_setup_dr_primary/main.tf index 440517b6ed..69e29e6d03 100644 --- a/enos/modules/vault_setup_dr_primary/main.tf +++ b/enos/modules/vault_setup_dr_primary/main.tf @@ -42,14 +42,16 @@ variable "vault_root_token" { type = string description = "The vault root token" } -resource "enos_remote_exec" "configure_dr_primary" { + +// Enable DR replication on the primary. This will immediately clear all data in the secondary. 
+resource "enos_remote_exec" "enable_dr_replication" { environment = { VAULT_ADDR = var.vault_addr VAULT_TOKEN = var.vault_root_token VAULT_INSTALL_DIR = var.vault_install_dir } - scripts = [abspath("${path.module}/scripts/configure-vault-dr-primary.sh")] + scripts = [abspath("${path.module}/scripts/enable.sh")] transport = { ssh = { diff --git a/enos/modules/vault_setup_dr_primary/scripts/configure-vault-dr-primary.sh b/enos/modules/vault_setup_dr_primary/scripts/enable.sh similarity index 100% rename from enos/modules/vault_setup_dr_primary/scripts/configure-vault-dr-primary.sh rename to enos/modules/vault_setup_dr_primary/scripts/enable.sh diff --git a/enos/modules/vault_setup_replication_secondary/main.tf b/enos/modules/vault_setup_replication_secondary/main.tf index fd891e3555..ec1ae6425a 100644 --- a/enos/modules/vault_setup_replication_secondary/main.tf +++ b/enos/modules/vault_setup_replication_secondary/main.tf @@ -58,7 +58,7 @@ variable "wrapping_token" { description = "The wrapping token created on primary cluster" } -resource "enos_remote_exec" "configure_pr_secondary" { +resource "enos_remote_exec" "enable_replication" { environment = { VAULT_ADDR = var.vault_addr VAULT_TOKEN = var.vault_root_token @@ -72,3 +72,43 @@ resource "enos_remote_exec" "configure_pr_secondary" { } } } + +// Wait for our secondary leader host to be the "leader", which means it's running and all "setup" tasks +// have been completed. We'll have to unseal our follower nodes after this has occurred. +module "wait_for_leader" { + source = "../vault_wait_for_leader" + + depends_on = [ + enos_remote_exec.enable_replication + ] + + hosts = { "0" : var.secondary_leader_host } + ip_version = var.ip_version + vault_addr = var.vault_addr + vault_install_dir = var.vault_install_dir + vault_root_token = var.vault_root_token +} + +// Ensure that our leader is ready for us to unseal follower nodes. 
+resource "enos_remote_exec" "wait_for_leader_ready" {
+  depends_on = [
+    module.wait_for_leader,
+  ]
+
+  environment = {
+    REPLICATION_TYPE  = var.replication_type
+    RETRY_INTERVAL    = 3  // seconds
+    TIMEOUT_SECONDS   = 60 // seconds
+    VAULT_ADDR        = var.vault_addr
+    VAULT_TOKEN       = var.vault_root_token
+    VAULT_INSTALL_DIR = var.vault_install_dir
+  }
+
+  scripts = [abspath("${path.module}/scripts/wait-for-leader-ready.sh")]
+
+  transport = {
+    ssh = {
+      host = var.secondary_leader_host.public_ip
+    }
+  }
+}
diff --git a/enos/modules/vault_setup_replication_secondary/scripts/wait-for-leader-ready.sh b/enos/modules/vault_setup_replication_secondary/scripts/wait-for-leader-ready.sh
new file mode 100644
index 0000000000..09837c610f
--- /dev/null
+++ b/enos/modules/vault_setup_replication_secondary/scripts/wait-for-leader-ready.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+set -eo pipefail
+
+fail() {
+  echo "$1" 1>&2
+  return 1
+}
+
+[[ -z "$REPLICATION_TYPE" ]] && fail "REPLICATION_TYPE env variable has not been set"
+[[ -z "$RETRY_INTERVAL" ]] && fail "RETRY_INTERVAL env variable has not been set"
+[[ -z "$TIMEOUT_SECONDS" ]] && fail "TIMEOUT_SECONDS env variable has not been set"
+[[ -z "$VAULT_ADDR" ]] && fail "VAULT_ADDR env variable has not been set"
+[[ -z "$VAULT_INSTALL_DIR" ]] && fail "VAULT_INSTALL_DIR env variable has not been set"
+[[ -z "$VAULT_TOKEN" ]] && fail "VAULT_TOKEN env variable has not been set"
+
+binpath=${VAULT_INSTALL_DIR}/vault
+test -x "$binpath" || fail "unable to locate vault binary at $binpath"
+
+export VAULT_FORMAT=json
+
+# Print the .data object of the ${REPLICATION_TYPE} replication status; fails if the read fails.
+replicationStatus() {
+  "$binpath" read "sys/replication/${REPLICATION_TYPE}/status" | jq .data
+}
+
+isReady() {
+  # Ready means: state is stream-wals, mode is secondary, and the merkle tree is not corrupted.
+  local status
+  if ! status=$(replicationStatus); then
+    return 1
+  fi
+
+  if ! jq -eMc '.state == "stream-wals"' &> /dev/null <<< "$status"; then
+    echo "${REPLICATION_TYPE} replication is not yet streaming WALs, got: $(jq '.state' <<< "$status")" 1>&2
+    return 1
+  fi
+
+  if ! jq -eMc '.mode == "secondary"' &> /dev/null <<< "$status"; then
+    echo "${REPLICATION_TYPE} replication mode is not yet secondary, got: $(jq '.mode' <<< "$status")" 1>&2
+    return 1
+  fi
+
+  if ! jq -eMc '.corrupted_merkle_tree == false' &> /dev/null <<< "$status"; then
+    echo "${REPLICATION_TYPE} replication merkle tree is corrupted" 1>&2
+    return 1
+  fi
+
+  echo "${REPLICATION_TYPE} secondary leader is ready for followers to be unsealed!" 1>&2
+  return 0
+}
+
+begin_time=$(date +%s)
+end_time=$((begin_time + TIMEOUT_SECONDS))
+while [ "$(date +%s)" -lt "$end_time" ]; do
+  if isReady; then
+    exit 0
+  fi
+
+  sleep "$RETRY_INTERVAL"
+done
+
+fail "Timed out waiting for ${REPLICATION_TYPE} secondary leader to be ready: $(replicationStatus)"
diff --git a/enos/modules/vault_unseal_nodes/main.tf b/enos/modules/vault_unseal_replication_followers/main.tf
similarity index 100%
rename from enos/modules/vault_unseal_nodes/main.tf
rename to enos/modules/vault_unseal_replication_followers/main.tf
diff --git a/enos/modules/vault_unseal_nodes/scripts/unseal-node.sh b/enos/modules/vault_unseal_replication_followers/scripts/unseal-node.sh
similarity index 100%
rename from enos/modules/vault_unseal_nodes/scripts/unseal-node.sh
rename to enos/modules/vault_unseal_replication_followers/scripts/unseal-node.sh
diff --git a/enos/modules/vault_unseal_nodes/scripts/wait-until-sealed.sh b/enos/modules/vault_unseal_replication_followers/scripts/wait-until-sealed.sh
similarity index 100%
rename from enos/modules/vault_unseal_nodes/scripts/wait-until-sealed.sh
rename to enos/modules/vault_unseal_replication_followers/scripts/wait-until-sealed.sh