From 684a372589fb9dec29ef7d80397938bbfa1fa456 Mon Sep 17 00:00:00 2001 From: Vault Automation Date: Wed, 29 Apr 2026 07:25:06 -0600 Subject: [PATCH] feat(enos): migrate vault_verify_undo_logs to vault_run_blackbox_test module (#14170) (#14374) - Migrate undo logs verification from shell script to Go blackbox test - Add session_metrics.go and session_remote.go helpers to blackbox SDK - Create undo_logs_test.go in vault/external_tests/blackbox/verify package - Update autopilot scenario to use vault_run_blackbox_test module - Remove deprecated vault_verify_undo_logs module - Update vault_run_blackbox_test module to support test environment variables This change improves test maintainability and consistency by using the standardized blackbox testing framework instead of custom shell scripts. Co-authored-by: brewgator <12831681+brewgator@users.noreply.github.com> --- enos/enos-modules.hcl | 6 -- enos/enos-scenario-autopilot.hcl | 38 +++++--- enos/modules/vault_run_blackbox_test/main.tf | 3 +- .../vault_run_blackbox_test/variables.tf | 6 ++ enos/modules/vault_verify_undo_logs/main.tf | 77 ---------------- .../scripts/smoke-verify-undo-logs.sh | 35 ------- .../testcluster/blackbox/session_metrics.go | 92 +++++++++++++++++++ .../testcluster/blackbox/session_remote.go | 70 ++++++++++++++ .../blackbox/verify/undo_logs_test.go | 63 +++++++++++++ 9 files changed, 259 insertions(+), 131 deletions(-) delete mode 100644 enos/modules/vault_verify_undo_logs/main.tf delete mode 100644 enos/modules/vault_verify_undo_logs/scripts/smoke-verify-undo-logs.sh create mode 100644 sdk/helper/testcluster/blackbox/session_metrics.go create mode 100644 sdk/helper/testcluster/blackbox/session_remote.go create mode 100644 vault/external_tests/blackbox/verify/undo_logs_test.go diff --git a/enos/enos-modules.hcl b/enos/enos-modules.hcl index ff088a793f..9714126ab7 100644 --- a/enos/enos-modules.hcl +++ b/enos/enos-modules.hcl @@ -396,12 +396,6 @@ module "vault_verify_ui" { source = "./modules/vault_verify_ui" } -module "vault_verify_undo_logs" { - source = "./modules/vault_verify_undo_logs" - - vault_install_dir = var.vault_install_dir -} - module "vault_wait_for_cluster_unsealed" { source = "./modules/vault_wait_for_cluster_unsealed" diff --git a/enos/enos-scenario-autopilot.hcl b/enos/enos-scenario-autopilot.hcl index 4dc52aa39d..ed04ea1da4 100644 --- a/enos/enos-scenario-autopilot.hcl +++ b/enos/enos-scenario-autopilot.hcl @@ -905,7 +905,7 @@ scenario "autopilot" { step "verify_undo_logs_enabled_on_primary" { skip_step = semverconstraint(var.vault_product_version, "<1.13.0-0") - module = module.vault_verify_undo_logs + module = module.vault_run_blackbox_test description = <<-EOF Verifies that undo logs is correctly enabled on newly upgraded target hosts. For this it will query the metrics system backend for the vault.core.replication.write_undo_logs gauge. @@ -925,18 +925,25 @@ scenario "autopilot" { } variables { - expected_state = 1 # Enabled - hosts = step.get_updated_vault_cluster_ips.leader_hosts - timeout = 180 # Seconds - vault_addr = step.upgrade_vault_cluster_with_autopilot.api_addr_localhost - vault_install_dir = local.vault_install_dir + leader_host = step.get_updated_vault_cluster_ips.leader_host + leader_public_ip = step.get_updated_vault_cluster_ips.leader_public_ip vault_root_token = step.create_vault_cluster.root_token + test_package = "./vault/external_tests/blackbox/verify" + test_names = ["TestVaultUndoLogsMetric"] + vault_edition = matrix.edition + vault_install_dir = local.vault_install_dir + vault_addr = step.upgrade_vault_cluster_with_autopilot.api_addr_localhost + test_env_vars = { + EXPECTED_STATE = "1" + TIMEOUT_SECONDS = "180" + RETRY_INTERVAL = "5" + } } } step "verify_undo_logs_disabled_on_followers" { skip_step = semverconstraint(var.vault_product_version, "<1.13.0-0") - module = module.vault_verify_undo_logs + module = module.vault_run_blackbox_test depends_on = [step.verify_undo_logs_enabled_on_primary] providers = { @@ -944,12 +951,19 @@ scenario "autopilot" { } variables { - expected_state = 0 # Disabled - hosts = step.get_updated_vault_cluster_ips.follower_hosts - timeout = 10 # Seconds - vault_addr = step.upgrade_vault_cluster_with_autopilot.api_addr_localhost - vault_install_dir = local.vault_install_dir + leader_host = step.get_updated_vault_cluster_ips.follower_hosts[0] + leader_public_ip = step.get_updated_vault_cluster_ips.follower_hosts[0].public_ip vault_root_token = step.create_vault_cluster.root_token + test_package = "./vault/external_tests/blackbox/verify" + test_names = ["TestVaultUndoLogsMetric"] + vault_edition = matrix.edition + vault_install_dir = local.vault_install_dir + vault_addr = step.upgrade_vault_cluster_with_autopilot.api_addr_localhost + test_env_vars = { + EXPECTED_STATE = "0" + TIMEOUT_SECONDS = "10" + RETRY_INTERVAL = "2" + } } } diff --git a/enos/modules/vault_run_blackbox_test/main.tf b/enos/modules/vault_run_blackbox_test/main.tf index 700549f0f5..1c5f173969 100644 --- a/enos/modules/vault_run_blackbox_test/main.tf +++ b/enos/modules/vault_run_blackbox_test/main.tf @@ -51,7 +51,8 @@ resource "enos_local_exec" "run_blackbox_test" { var.vault_install_dir != null ? { VAULT_INSTALL_DIR = var.vault_install_dir } : {}, local.ldap_environment, local.postgres_environment, - local.mongodb_environment + local.mongodb_environment, + var.test_env_vars ) } diff --git a/enos/modules/vault_run_blackbox_test/variables.tf b/enos/modules/vault_run_blackbox_test/variables.tf index 62814e18b7..65e3a16ddc 100644 --- a/enos/modules/vault_run_blackbox_test/variables.tf +++ b/enos/modules/vault_run_blackbox_test/variables.tf @@ -77,3 +77,9 @@ variable "vault_install_dir" { description = "The directory where Vault is installed" default = null } + +variable "test_env_vars" { + type = map(string) + description = "Additional environment variables to pass to the test" + default = {} +} diff --git a/enos/modules/vault_verify_undo_logs/main.tf b/enos/modules/vault_verify_undo_logs/main.tf deleted file mode 100644 index d2e917918d..0000000000 --- a/enos/modules/vault_verify_undo_logs/main.tf +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright IBM Corp. 2016, 2025 -# SPDX-License-Identifier: BUSL-1.1 - -terraform { - required_providers { - enos = { - source = "registry.terraform.io/hashicorp-forge/enos" - } - } -} - -variable "expected_state" { - type = number - description = "The expected state to have in vault.core.replication.write_undo_logs telemetry. Must be either 1 for enabled or 0 for disabled." - - validation { - condition = contains([0, 1], var.expected_state) - error_message = "The expected_state must be either 0 or 1" - } -} - -variable "hosts" { - type = map(object({ - ipv6 = string - private_ip = string - public_ip = string - })) - description = "The vault cluster target hosts to check" -} - -variable "retry_interval" { - type = number - description = "How many seconds to wait between each retry" - default = 2 -} - -variable "timeout" { - type = number - description = "The max number of seconds to wait before timing out" - default = 60 -} - -variable "vault_addr" { - type = string - description = "The local vault API listen address" -} - -variable "vault_install_dir" { - type = string - description = "The directory where the Vault binary will be installed" -} - -variable "vault_root_token" { - type = string - description = "The vault root token" -} - -resource "enos_remote_exec" "smoke-verify-undo-logs" { - for_each = var.hosts - - environment = { - EXPECTED_STATE = var.expected_state - RETRY_INTERVAL = var.retry_interval - TIMEOUT_SECONDS = var.timeout - VAULT_ADDR = var.vault_addr - VAULT_INSTALL_DIR = var.vault_install_dir - VAULT_TOKEN = var.vault_root_token - } - - scripts = [abspath("${path.module}/scripts/smoke-verify-undo-logs.sh")] - - transport = { - ssh = { - host = each.value.public_ip - } - } -} diff --git a/enos/modules/vault_verify_undo_logs/scripts/smoke-verify-undo-logs.sh b/enos/modules/vault_verify_undo_logs/scripts/smoke-verify-undo-logs.sh deleted file mode 100644 index 7af19f6b72..0000000000 --- a/enos/modules/vault_verify_undo_logs/scripts/smoke-verify-undo-logs.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash -# Copyright IBM Corp. 2016, 2025 -# SPDX-License-Identifier: BUSL-1.1 - -function fail() { - echo "$1" 1>&2 - exit 1 -} - -[[ -z "$EXPECTED_STATE" ]] && fail "EXPECTED_STAE env variable has not been set" -[[ -z "$RETRY_INTERVAL" ]] && fail "RETRY_INTERVAL env variable has not been set" -[[ -z "$TIMEOUT_SECONDS" ]] && fail "TIMEOUT_SECONDS env variable has not been set" -[[ -z "$VAULT_ADDR" ]] && fail "VAULT_ADDR env variable has not been set" -[[ -z "$VAULT_INSTALL_DIR" ]] && fail "VAULT_INSTALL_DIR env variable has not been set" -[[ -z "$VAULT_TOKEN" ]] && fail "VAULT_TOKEN env variable has not been set" - -binpath=${VAULT_INSTALL_DIR}/vault -test -x "$binpath" || fail "unable to locate vault binary at $binpath" - -begin_time=$(date +%s) -end_time=$((begin_time + TIMEOUT_SECONDS)) -while [ "$(date +%s)" -lt "$end_time" ]; do - state=$($binpath read sys/metrics -format=json | jq -r '.data.Gauges[] | select(.Name == "vault.core.replication.write_undo_logs")') - target_undo_logs_status="$(jq -r '.Value' <<< "$state")" - - if [ "$target_undo_logs_status" == "$EXPECTED_STATE" ]; then - echo "vault.core.replication.write_undo_logs has expected Value: \"${EXPECTED_STATE}\"" - exit 0 - fi - - echo "Waiting for vault.core.replication.write_undo_logs to have Value: \"${EXPECTED_STATE}\"" - sleep "$RETRY_INTERVAL" -done - -fail "Timed out waiting for vault.core.replication.write_undo_logs to have Value: \"${EXPECTED_STATE}\"" diff --git a/sdk/helper/testcluster/blackbox/session_metrics.go b/sdk/helper/testcluster/blackbox/session_metrics.go new file mode 100644 index 0000000000..f69128f5f9 --- /dev/null +++ b/sdk/helper/testcluster/blackbox/session_metrics.go @@ -0,0 +1,92 @@ +// Copyright IBM Corp. 2025, 2026 +// SPDX-License-Identifier: BUSL-1.1 + +package blackbox + +import ( + "encoding/json" + "fmt" + "time" +) + +// MetricsResponse represents the response from sys/metrics endpoint +type MetricsResponse struct { + Data struct { + Gauges []struct { + Name string `json:"Name"` + Value float64 `json:"Value"` + Labels map[string]string `json:"Labels"` + } `json:"Gauges"` + Counters []struct { + Name string `json:"Name"` + Count int `json:"Count"` + Sum float64 `json:"Sum"` + Labels map[string]string `json:"Labels"` + } `json:"Counters"` + Samples []struct { + Name string `json:"Name"` + Count int `json:"Count"` + Sum float64 `json:"Sum"` + Labels map[string]string `json:"Labels"` + } `json:"Samples"` + } `json:"data"` +} + +// AssertMetricGaugeValue verifies that a specific gauge metric has the expected value +// This method includes retry logic with configurable timeout +// Note: retryInterval parameter is ignored as the SDK uses a fixed 200ms interval +func (s *Session) AssertMetricGaugeValue(gaugeName string, expectedValue float64, timeout time.Duration, retryInterval time.Duration) { + s.t.Helper() + + s.EventuallyWithTimeout(func() error { + // Read sys/metrics endpoint + secret, err := s.Client.Logical().Read("sys/metrics") + if err != nil { + return fmt.Errorf("failed to read sys/metrics: %w", err) + } + + if secret == nil || secret.Data == nil { + return fmt.Errorf("sys/metrics returned nil data") + } + + // Marshal and unmarshal to get proper structure + dataBytes, err := json.Marshal(secret.Data) + if err != nil { + return fmt.Errorf("failed to marshal metrics data: %w", err) + } + + var metricsData struct { + Gauges []struct { + Name string `json:"Name"` + Value float64 `json:"Value"` + Labels map[string]string `json:"Labels"` + } `json:"Gauges"` + } + + if err := json.Unmarshal(dataBytes, &metricsData); err != nil { + return fmt.Errorf("failed to unmarshal metrics data: %w", err) + } + + // Find the gauge by name + var found bool + var actualValue float64 + for _, gauge := range metricsData.Gauges { + if gauge.Name == gaugeName { + found = true + actualValue = gauge.Value + break + } + } + + if !found { + return fmt.Errorf("gauge metric %q not found in sys/metrics response", gaugeName) + } + + if actualValue != expectedValue { + return fmt.Errorf("gauge %q has value %.0f, expected %.0f", gaugeName, actualValue, expectedValue) + } + + s.t.Logf("Gauge metric %q has expected value: %.0f", gaugeName, expectedValue) + return nil + }, timeout) +} diff --git a/sdk/helper/testcluster/blackbox/session_remote.go b/sdk/helper/testcluster/blackbox/session_remote.go new file mode 100644 index 0000000000..25e3283936 --- /dev/null +++ b/sdk/helper/testcluster/blackbox/session_remote.go @@ -0,0 +1,70 @@ +// Copyright IBM Corp. 2025, 2026 +// SPDX-License-Identifier: BUSL-1.1 + +package blackbox + +import ( + "bytes" + "fmt" + "os/exec" + "strings" +) + +// RemoteHost represents a remote host configuration +type RemoteHost struct { + PublicIP string `json:"public_ip"` + PrivateIP string `json:"private_ip"` +} + +// AssertRemoteCLIVersion verifies the Vault CLI version on a remote host via SSH +// This method SSHs to the remote host and runs the vault version command +func (s *Session) AssertRemoteCLIVersion(host RemoteHost, vaultInstallDir, version, sha, buildDate, edition string) { + s.t.Helper() + + // Build the vault version command + vaultBinary := fmt.Sprintf("%s/vault", vaultInstallDir) + remoteCmd := fmt.Sprintf("%s version", vaultBinary) + + // Execute SSH command + cmd := exec.Command("ssh", + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + "-o", "LogLevel=ERROR", + host.PublicIP, + remoteCmd, + ) + + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + if err != nil { + s.t.Fatalf("Failed to execute vault version on remote host %s: %v\nStderr: %s", host.PublicIP, err, stderr.String()) + } + + output := strings.TrimSpace(stdout.String()) + + // Build expected version string + expectedVersion := fmt.Sprintf("Vault v%s ('%s'), built %s", version, sha, buildDate) + + switch edition { + case "ce", "ent": + // No additional suffix + case "ent.hsm", "ent.fips1403", "ent.hsm.fips1403": + expectedVersion += " (cgo)" + default: + s.t.Fatalf("unknown Vault edition: %s", edition) + } + + // Also check version without SHA (some builds may not include it) + expectedVersionNoSHA := strings.Replace(expectedVersion, fmt.Sprintf("('%s') ", sha), "", 1) + expectedVersionNoSHA = strings.TrimSpace(strings.Replace(expectedVersionNoSHA, " ", " ", -1)) + + if output != expectedVersion && output != expectedVersionNoSHA { + s.t.Fatalf("CLI version mismatch on host %s.\nExpected: %s\nor: %s\nGot: %s", + host.PublicIP, expectedVersion, expectedVersionNoSHA, output) + } + + s.t.Logf("CLI version verification succeeded on host %s: %s", host.PublicIP, output) +} diff --git a/vault/external_tests/blackbox/verify/undo_logs_test.go b/vault/external_tests/blackbox/verify/undo_logs_test.go new file mode 100644 index 0000000000..a4b29ceeae --- /dev/null +++ b/vault/external_tests/blackbox/verify/undo_logs_test.go @@ -0,0 +1,63 @@ +// Copyright IBM Corp. 2025, 2026 +// SPDX-License-Identifier: BUSL-1.1 + +package verify + +import ( + "os" + "strconv" + "testing" + "time" + + "github.com/hashicorp/vault/sdk/helper/testcluster/blackbox" +) + +// TestVaultUndoLogsMetric verifies the vault.core.replication.write_undo_logs gauge metric +// This test runs from CI/GitHub runners and connects to the Vault cluster via API +func TestVaultUndoLogsMetric(t *testing.T) { + t.Parallel() + + // Read required environment variables + expectedStateStr := os.Getenv("EXPECTED_STATE") + if expectedStateStr == "" { + t.Fatal("EXPECTED_STATE environment variable is required") + } + + expectedState, err := strconv.ParseFloat(expectedStateStr, 64) + if err != nil { + t.Fatalf("Failed to parse EXPECTED_STATE: %v", err) + } + + // Validate expected state is 0 or 1 + if expectedState != 0 && expectedState != 1 { + t.Fatalf("EXPECTED_STATE must be 0 or 1, got: %.0f", expectedState) + } + + timeoutStr := os.Getenv("TIMEOUT_SECONDS") + if timeoutStr == "" { + t.Fatal("TIMEOUT_SECONDS environment variable is required") + } + + timeoutSeconds, err := strconv.Atoi(timeoutStr) + if err != nil { + t.Fatalf("Failed to parse TIMEOUT_SECONDS: %v", err) + } + + retryIntervalStr := os.Getenv("RETRY_INTERVAL") + if retryIntervalStr == "" { + t.Fatal("RETRY_INTERVAL environment variable is required") + } + + retryIntervalSeconds, err := strconv.Atoi(retryIntervalStr) + if err != nil { + t.Fatalf("Failed to parse RETRY_INTERVAL: %v", err) + } + + timeout := time.Duration(timeoutSeconds) * time.Second + retryInterval := time.Duration(retryIntervalSeconds) * time.Second + + v := blackbox.New(t) + + // Verify the undo logs metric has the expected value + v.AssertMetricGaugeValue("vault.core.replication.write_undo_logs", expectedState, timeout, retryInterval) +}