feat(enos): migrate vault_verify_undo_logs to vault_run_blackbox_test module (#14170) (#14374)

- Migrate undo logs verification from shell script to Go blackbox test
- Add session_metrics.go and session_remote.go helpers to blackbox SDK
- Create undo_logs_test.go in vault/external_tests/blackbox/verify package
- Update autopilot scenario to use vault_run_blackbox_test module
- Remove deprecated vault_verify_undo_logs module
- Update vault_run_blackbox_test module to support test environment variables

This change improves test maintainability and consistency by using the
standardized blackbox testing framework instead of custom shell scripts.

Co-authored-by: brewgator <12831681+brewgator@users.noreply.github.com>
This commit is contained in:
Vault Automation 2026-04-29 07:25:06 -06:00 committed by GitHub
parent c9430538b3
commit 684a372589
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 259 additions and 131 deletions

View File

@ -396,12 +396,6 @@ module "vault_verify_ui" {
source = "./modules/vault_verify_ui"
}
# Registers the module found at ./modules/vault_verify_undo_logs and forwards
# the Vault install directory (var.vault_install_dir) to it.
module "vault_verify_undo_logs" {
source = "./modules/vault_verify_undo_logs"
vault_install_dir = var.vault_install_dir
}
module "vault_wait_for_cluster_unsealed" {
source = "./modules/vault_wait_for_cluster_unsealed"

View File

@ -905,7 +905,7 @@ scenario "autopilot" {
step "verify_undo_logs_enabled_on_primary" {
skip_step = semverconstraint(var.vault_product_version, "<1.13.0-0")
module = module.vault_verify_undo_logs
module = module.vault_run_blackbox_test
description = <<-EOF
Verifies that undo logs is correctly enabled on newly upgraded target hosts. For this it will
query the metrics system backend for the vault.core.replication.write_undo_logs gauge.
@ -925,18 +925,25 @@ scenario "autopilot" {
}
variables {
expected_state = 1 # Enabled
hosts = step.get_updated_vault_cluster_ips.leader_hosts
timeout = 180 # Seconds
vault_addr = step.upgrade_vault_cluster_with_autopilot.api_addr_localhost
vault_install_dir = local.vault_install_dir
leader_host = step.get_updated_vault_cluster_ips.leader_host
leader_public_ip = step.get_updated_vault_cluster_ips.leader_public_ip
vault_root_token = step.create_vault_cluster.root_token
test_package = "./vault/external_tests/blackbox/verify"
test_names = ["TestVaultUndoLogsMetric"]
vault_edition = matrix.edition
vault_install_dir = local.vault_install_dir
vault_addr = step.upgrade_vault_cluster_with_autopilot.api_addr_localhost
test_env_vars = {
EXPECTED_STATE = "1"
TIMEOUT_SECONDS = "180"
RETRY_INTERVAL = "5"
}
}
}
step "verify_undo_logs_disabled_on_followers" {
skip_step = semverconstraint(var.vault_product_version, "<1.13.0-0")
module = module.vault_verify_undo_logs
module = module.vault_run_blackbox_test
depends_on = [step.verify_undo_logs_enabled_on_primary]
providers = {
@ -944,12 +951,19 @@ scenario "autopilot" {
}
variables {
expected_state = 0 # Disabled
hosts = step.get_updated_vault_cluster_ips.follower_hosts
timeout = 10 # Seconds
vault_addr = step.upgrade_vault_cluster_with_autopilot.api_addr_localhost
vault_install_dir = local.vault_install_dir
leader_host = step.get_updated_vault_cluster_ips.follower_hosts[0]
leader_public_ip = step.get_updated_vault_cluster_ips.follower_hosts[0].public_ip
vault_root_token = step.create_vault_cluster.root_token
test_package = "./vault/external_tests/blackbox/verify"
test_names = ["TestVaultUndoLogsMetric"]
vault_edition = matrix.edition
vault_install_dir = local.vault_install_dir
vault_addr = step.upgrade_vault_cluster_with_autopilot.api_addr_localhost
test_env_vars = {
EXPECTED_STATE = "0"
TIMEOUT_SECONDS = "10"
RETRY_INTERVAL = "2"
}
}
}

View File

@ -51,7 +51,8 @@ resource "enos_local_exec" "run_blackbox_test" {
var.vault_install_dir != null ? { VAULT_INSTALL_DIR = var.vault_install_dir } : {},
local.ldap_environment,
local.postgres_environment,
local.mongodb_environment
local.mongodb_environment,
var.test_env_vars
)
}

View File

@ -77,3 +77,9 @@ variable "vault_install_dir" {
description = "The directory where Vault is installed"
default = null
}
# Optional extra environment for the test run: a map of string keys to string
# values. Defaults to an empty map, so callers that do not set it are unaffected.
variable "test_env_vars" {
type = map(string)
description = "Additional environment variables to pass to the test"
default = {}
}

View File

@ -1,77 +0,0 @@
# Copyright IBM Corp. 2016, 2025
# SPDX-License-Identifier: BUSL-1.1
# Declares the providers this module needs: only enos, sourced from
# registry.terraform.io/hashicorp-forge/enos.
terraform {
required_providers {
enos = {
source = "registry.terraform.io/hashicorp-forge/enos"
}
}
}
# Value the vault.core.replication.write_undo_logs gauge is expected to report.
# Required (no default); exported to the script as EXPECTED_STATE.
variable "expected_state" {
type = number
description = "The expected state to have in vault.core.replication.write_undo_logs telemetry. Must be either 1 for enabled or 0 for disabled."
validation {
# Reject anything other than the two documented states.
condition = contains([0, 1], var.expected_state)
error_message = "The expected_state must be either 0 or 1"
}
}
# Hosts to check, as a map of objects carrying ipv6, private_ip and public_ip.
# The resource in this module iterates over the map and connects to each
# entry's public_ip.
variable "hosts" {
type = map(object({
ipv6 = string
private_ip = string
public_ip = string
}))
description = "The vault cluster target hosts to check"
}
# Seconds to wait between checks; exported to the script as RETRY_INTERVAL.
# Defaults to 2.
variable "retry_interval" {
type = number
description = "How many seconds to wait between each retry"
default = 2
}
# Upper bound, in seconds, on how long to keep retrying; exported to the script
# as TIMEOUT_SECONDS. Defaults to 60.
variable "timeout" {
type = number
description = "The max number of seconds to wait before timing out"
default = 60
}
# Vault API address as seen from the target host; exported to the script as
# VAULT_ADDR. Required (no default).
variable "vault_addr" {
type = string
description = "The local vault API listen address"
}
# Directory containing the vault binary on the target host; exported to the
# script as VAULT_INSTALL_DIR. Required (no default).
variable "vault_install_dir" {
type = string
description = "The directory where the Vault binary will be installed"
}
# Root token used to authenticate the metrics read; exported to the script as
# VAULT_TOKEN. Required (no default).
variable "vault_root_token" {
type = string
description = "The vault root token"
}
# Runs scripts/smoke-verify-undo-logs.sh once per entry in var.hosts, over SSH
# to that host's public IP. All module inputs are handed to the script as
# environment variables.
resource "enos_remote_exec" "smoke-verify-undo-logs" {
# One remote execution per target host.
for_each = var.hosts
environment = {
EXPECTED_STATE = var.expected_state
RETRY_INTERVAL = var.retry_interval
TIMEOUT_SECONDS = var.timeout
VAULT_ADDR = var.vault_addr
VAULT_INSTALL_DIR = var.vault_install_dir
VAULT_TOKEN = var.vault_root_token
}
# The script path is resolved relative to this module's directory.
scripts = [abspath("${path.module}/scripts/smoke-verify-undo-logs.sh")]
transport = {
ssh = {
host = each.value.public_ip
}
}
}

View File

@ -1,35 +0,0 @@
#!/usr/bin/env bash
# Copyright IBM Corp. 2016, 2025
# SPDX-License-Identifier: BUSL-1.1
# Print the given message on stderr and exit with status 1.
function fail() {
  echo "$1" 1>&2
  exit 1
}

# All inputs arrive as environment variables; refuse to run if any is empty.
[[ -z "$EXPECTED_STATE" ]] && fail "EXPECTED_STATE env variable has not been set"
[[ -z "$RETRY_INTERVAL" ]] && fail "RETRY_INTERVAL env variable has not been set"
[[ -z "$TIMEOUT_SECONDS" ]] && fail "TIMEOUT_SECONDS env variable has not been set"
[[ -z "$VAULT_ADDR" ]] && fail "VAULT_ADDR env variable has not been set"
[[ -z "$VAULT_INSTALL_DIR" ]] && fail "VAULT_INSTALL_DIR env variable has not been set"
[[ -z "$VAULT_TOKEN" ]] && fail "VAULT_TOKEN env variable has not been set"

binpath=${VAULT_INSTALL_DIR}/vault
test -x "$binpath" || fail "unable to locate vault binary at $binpath"

# Poll sys/metrics until the gauge reports the expected value or the deadline
# (start time + TIMEOUT_SECONDS) passes.
begin_time=$(date +%s)
end_time=$((begin_time + TIMEOUT_SECONDS))
while [ "$(date +%s)" -lt "$end_time" ]; do
  # Select the write_undo_logs gauge object from the metrics JSON, then pull
  # out its Value. A failed read leaves both empty and simply triggers a retry.
  state=$("$binpath" read sys/metrics -format=json | jq -r '.data.Gauges[] | select(.Name == "vault.core.replication.write_undo_logs")')
  target_undo_logs_status="$(jq -r '.Value' <<< "$state")"

  if [ "$target_undo_logs_status" == "$EXPECTED_STATE" ]; then
    echo "vault.core.replication.write_undo_logs has expected Value: \"${EXPECTED_STATE}\""
    exit 0
  fi

  echo "Waiting for vault.core.replication.write_undo_logs to have Value: \"${EXPECTED_STATE}\""
  sleep "$RETRY_INTERVAL"
done

fail "Timed out waiting for vault.core.replication.write_undo_logs to have Value: \"${EXPECTED_STATE}\""

View File

@ -0,0 +1,92 @@
// Copyright IBM Corp. 2025, 2026
// SPDX-License-Identifier: BUSL-1.1
package blackbox
import (
"encoding/json"
"fmt"
"time"
)
// MetricsResponse represents the response from sys/metrics endpoint.
//
// The payload is decoded from the top-level "data" key and is split into three
// lists: Gauges, Counters and Samples. Every entry carries a Name and a set of
// string Labels.
type MetricsResponse struct {
Data struct {
// Gauges report a single floating-point Value per metric.
Gauges []struct {
Name string `json:"Name"`
Value float64 `json:"Value"`
Labels map[string]string `json:"Labels"`
} `json:"Gauges"`
// Counters report how many times a metric was recorded (Count) and the
// running total (Sum).
Counters []struct {
Name string `json:"Name"`
Count int `json:"Count"`
Sum float64 `json:"Sum"`
Labels map[string]string `json:"Labels"`
} `json:"Counters"`
// Samples have the same shape as Counters: a Count and a Sum per metric.
Samples []struct {
Name string `json:"Name"`
Count int `json:"Count"`
Sum float64 `json:"Sum"`
Labels map[string]string `json:"Labels"`
} `json:"Samples"`
} `json:"data"`
}
// AssertMetricGaugeValue verifies that the gauge named gaugeName in the
// sys/metrics response has the value expectedValue.
//
// The check is retried through EventuallyWithTimeout until it succeeds or
// timeout elapses. Each attempt reads sys/metrics with the session client and
// compares the first gauge whose Name equals gaugeName. A read error, an empty
// response, a missing gauge, or a different value all count as a failed
// attempt and are reported as that attempt's error.
//
// Note: retryInterval is ignored. It is never forwarded to
// EventuallyWithTimeout; the SDK uses a fixed 200ms interval.
func (s *Session) AssertMetricGaugeValue(gaugeName string, expectedValue float64, timeout time.Duration, retryInterval time.Duration) {
	s.t.Helper()

	s.EventuallyWithTimeout(func() error {
		secret, err := s.Client.Logical().Read("sys/metrics")
		if err != nil {
			return fmt.Errorf("failed to read sys/metrics: %w", err)
		}
		if secret == nil || secret.Data == nil {
			return fmt.Errorf("sys/metrics returned nil data")
		}

		// Re-encode the response data and decode it into a typed view. Only
		// the fields this assertion needs are declared, so unrelated parts of
		// the payload cannot cause a decode failure.
		raw, err := json.Marshal(secret.Data)
		if err != nil {
			return fmt.Errorf("failed to marshal metrics data: %w", err)
		}
		var metrics struct {
			Gauges []struct {
				Name  string  `json:"Name"`
				Value float64 `json:"Value"`
			} `json:"Gauges"`
		}
		if err := json.Unmarshal(raw, &metrics); err != nil {
			return fmt.Errorf("failed to unmarshal metrics data: %w", err)
		}

		for _, gauge := range metrics.Gauges {
			if gauge.Name != gaugeName {
				continue
			}
			// %v keeps fractional digits, so a mismatch such as 1.4 vs 1 is
			// not rounded into a misleading "has value 1, expected 1".
			if gauge.Value != expectedValue {
				return fmt.Errorf("gauge %q has value %v, expected %v", gaugeName, gauge.Value, expectedValue)
			}
			s.t.Logf("Gauge metric %q has expected value: %v", gaugeName, expectedValue)
			return nil
		}

		return fmt.Errorf("gauge metric %q not found in sys/metrics response", gaugeName)
	}, timeout)
}

View File

@ -0,0 +1,70 @@
// Copyright IBM Corp. 2025, 2026
// SPDX-License-Identifier: BUSL-1.1
package blackbox
import (
"bytes"
"fmt"
"os/exec"
"strings"
)
// RemoteHost represents a remote host configuration.
//
// Both addresses are plain strings; the JSON keys are "public_ip" and
// "private_ip".
type RemoteHost struct {
// PublicIP is the address used as the SSH destination by
// AssertRemoteCLIVersion.
PublicIP string `json:"public_ip"`
// PrivateIP is the host's private address.
PrivateIP string `json:"private_ip"`
}
// AssertRemoteCLIVersion verifies the Vault CLI version on a remote host via
// SSH.
//
// It runs "<vaultInstallDir>/vault version" on host.PublicIP using the local
// ssh binary (host key checking disabled) and compares the trimmed stdout with
// the version string expected for version, sha, buildDate and edition. Output
// that omits the SHA is accepted as well. The test is failed immediately when
// the edition is unknown, the ssh command fails, or the output matches neither
// accepted form.
func (s *Session) AssertRemoteCLIVersion(host RemoteHost, vaultInstallDir, version, sha, buildDate, edition string) {
	s.t.Helper()

	// Work out what we expect before touching the network so that an invalid
	// edition is reported without an SSH round-trip.
	expectedVersion, expectedVersionNoSHA, err := expectedRemoteCLIVersions(version, sha, buildDate, edition)
	if err != nil {
		s.t.Fatalf("%v", err)
	}

	remoteCmd := fmt.Sprintf("%s/vault version", vaultInstallDir)
	cmd := exec.Command("ssh",
		"-o", "StrictHostKeyChecking=no",
		"-o", "UserKnownHostsFile=/dev/null",
		"-o", "LogLevel=ERROR",
		host.PublicIP,
		remoteCmd,
	)

	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		s.t.Fatalf("Failed to execute vault version on remote host %s: %v\nStderr: %s", host.PublicIP, err, stderr.String())
	}

	output := strings.TrimSpace(stdout.String())
	if output != expectedVersion && output != expectedVersionNoSHA {
		s.t.Fatalf("CLI version mismatch on host %s.\nExpected: %s\nor: %s\nGot: %s",
			host.PublicIP, expectedVersion, expectedVersionNoSHA, output)
	}

	s.t.Logf("CLI version verification succeeded on host %s: %s", host.PublicIP, output)
}

// expectedRemoteCLIVersions returns the two "vault version" outputs accepted
// for a build: one that includes the SHA and one that omits it. It returns an
// error for an edition it does not recognise.
func expectedRemoteCLIVersions(version, sha, buildDate, edition string) (withSHA, withoutSHA string, err error) {
	var suffix string
	switch edition {
	case "ce", "ent":
		// No additional suffix.
	case "ent.hsm", "ent.fips1403", "ent.hsm.fips1403":
		suffix = " (cgo)"
	default:
		return "", "", fmt.Errorf("unknown Vault edition: %s", edition)
	}

	// NOTE(review): the SHA is wrapped in single quotes here, as in the
	// original code. Confirm this matches real `vault version` output, which
	// may print the revision without quotes.
	withSHA = fmt.Sprintf("Vault v%s ('%s'), built %s%s", version, sha, buildDate, suffix)

	// Build the SHA-less form directly. Deriving it from withSHA by string
	// replacement never matched, because the SHA group is followed by a comma
	// rather than a space.
	// NOTE(review): assumes a build without a SHA prints
	// "Vault v<version>, built <date>" - confirm against the CLI.
	withoutSHA = fmt.Sprintf("Vault v%s, built %s%s", version, buildDate, suffix)

	return withSHA, withoutSHA, nil
}

View File

@ -0,0 +1,63 @@
// Copyright IBM Corp. 2025, 2026
// SPDX-License-Identifier: BUSL-1.1
package verify
import (
"os"
"strconv"
"testing"
"time"
"github.com/hashicorp/vault/sdk/helper/testcluster/blackbox"
)
// TestVaultUndoLogsMetric checks the vault.core.replication.write_undo_logs
// gauge against the value requested through the environment.
//
// Required environment variables:
//   - EXPECTED_STATE: the gauge value to wait for; must parse to 0 or 1.
//   - TIMEOUT_SECONDS: whole seconds to keep retrying.
//   - RETRY_INTERVAL: whole seconds between retries, handed to the assertion.
//
// The test fails immediately if any variable is missing or cannot be parsed.
func TestVaultUndoLogsMetric(t *testing.T) {
	t.Parallel()

	// mustEnv returns a required environment variable, aborting the test when
	// it is unset or empty.
	mustEnv := func(name string) string {
		value := os.Getenv(name)
		if value == "" {
			t.Fatal(name + " environment variable is required")
		}
		return value
	}

	// mustSeconds reads a required variable holding an integer number of
	// seconds and converts it to a duration.
	mustSeconds := func(name string) time.Duration {
		seconds, err := strconv.Atoi(mustEnv(name))
		if err != nil {
			t.Fatalf("Failed to parse %s: %v", name, err)
		}
		return time.Duration(seconds) * time.Second
	}

	want, err := strconv.ParseFloat(mustEnv("EXPECTED_STATE"), 64)
	if err != nil {
		t.Fatalf("Failed to parse EXPECTED_STATE: %v", err)
	}
	// The gauge is binary, so only 0 and 1 are meaningful expectations.
	if !(want == 0 || want == 1) {
		t.Fatalf("EXPECTED_STATE must be 0 or 1, got: %.0f", want)
	}

	timeout := mustSeconds("TIMEOUT_SECONDS")
	retryInterval := mustSeconds("RETRY_INTERVAL")

	session := blackbox.New(t)
	session.AssertMetricGaugeValue("vault.core.replication.write_undo_logs", want, timeout, retryInterval)
}