feat(enos): migrate vault_verify_undo_logs to vault_run_blackbox_test module (#14170) (#14374)

- Migrate undo logs verification from shell script to Go blackbox test
- Add session_metrics.go and session_remote.go helpers to blackbox SDK
- Create undo_logs_test.go in vault/external_tests/blackbox/verify package
- Update autopilot scenario to use vault_run_blackbox_test module
- Remove deprecated vault_verify_undo_logs module
- Update vault_run_blackbox_test module to support test environment variables

This change improves test maintainability and consistency by using the standardized blackbox testing framework instead of custom shell scripts.

Co-authored-by: brewgator <12831681+brewgator@users.noreply.github.com>
2026-05-05 04:16:31 +02:00 · 2026-04-29 07:25:06 -06:00 · 2026-04-29 07:25:06 -06:00 · 684a372589
commit 684a372589
parent c9430538b3
9 changed files with 259 additions and 131 deletions
--- a/enos/enos-modules.hcl
+++ b/enos/enos-modules.hcl
@ -396,12 +396,6 @@ module "vault_verify_ui" {
  source = "./modules/vault_verify_ui"
 }

-module "vault_verify_undo_logs" {
-  source = "./modules/vault_verify_undo_logs"
-
-  vault_install_dir = var.vault_install_dir
-}
-
 module "vault_wait_for_cluster_unsealed" {
  source = "./modules/vault_wait_for_cluster_unsealed"

--- a/enos/enos-scenario-autopilot.hcl
+++ b/enos/enos-scenario-autopilot.hcl
@ -905,7 +905,7 @@ scenario "autopilot" {

  step "verify_undo_logs_enabled_on_primary" {
    skip_step   = semverconstraint(var.vault_product_version, "<1.13.0-0")
-    module      = module.vault_verify_undo_logs
+    module      = module.vault_run_blackbox_test
    description = <<-EOF
      Verifies that undo logs is correctly enabled on newly upgraded target hosts. For this it will
      query the metrics system backend for the vault.core.replication.write_undo_logs gauge.
@ -925,18 +925,25 @@ scenario "autopilot" {
    }

    variables {
-      expected_state    = 1 # Enabled
-      hosts             = step.get_updated_vault_cluster_ips.leader_hosts
-      timeout           = 180 # Seconds
-      vault_addr        = step.upgrade_vault_cluster_with_autopilot.api_addr_localhost
-      vault_install_dir = local.vault_install_dir
+      leader_host       = step.get_updated_vault_cluster_ips.leader_host
+      leader_public_ip  = step.get_updated_vault_cluster_ips.leader_public_ip
      vault_root_token  = step.create_vault_cluster.root_token
+      test_package      = "./vault/external_tests/blackbox/verify"
+      test_names        = ["TestVaultUndoLogsMetric"]
+      vault_edition     = matrix.edition
+      vault_install_dir = local.vault_install_dir
+      vault_addr        = step.upgrade_vault_cluster_with_autopilot.api_addr_localhost
+      test_env_vars = {
+        EXPECTED_STATE  = "1"
+        TIMEOUT_SECONDS = "180"
+        RETRY_INTERVAL  = "5"
+      }
    }
  }

  step "verify_undo_logs_disabled_on_followers" {
    skip_step  = semverconstraint(var.vault_product_version, "<1.13.0-0")
-    module     = module.vault_verify_undo_logs
+    module     = module.vault_run_blackbox_test
    depends_on = [step.verify_undo_logs_enabled_on_primary]

    providers = {
@ -944,12 +951,19 @@ scenario "autopilot" {
    }

    variables {
-      expected_state    = 0 # Disabled
-      hosts             = step.get_updated_vault_cluster_ips.follower_hosts
-      timeout           = 10 # Seconds
-      vault_addr        = step.upgrade_vault_cluster_with_autopilot.api_addr_localhost
-      vault_install_dir = local.vault_install_dir
+      leader_host       = step.get_updated_vault_cluster_ips.follower_hosts[0]
+      leader_public_ip  = step.get_updated_vault_cluster_ips.follower_hosts[0].public_ip
      vault_root_token  = step.create_vault_cluster.root_token
+      test_package      = "./vault/external_tests/blackbox/verify"
+      test_names        = ["TestVaultUndoLogsMetric"]
+      vault_edition     = matrix.edition
+      vault_install_dir = local.vault_install_dir
+      vault_addr        = step.upgrade_vault_cluster_with_autopilot.api_addr_localhost
+      test_env_vars = {
+        EXPECTED_STATE  = "0"
+        TIMEOUT_SECONDS = "10"
+        RETRY_INTERVAL  = "2"
+      }
    }
  }

--- a/enos/modules/vault_run_blackbox_test/main.tf
+++ b/enos/modules/vault_run_blackbox_test/main.tf
@ -51,7 +51,8 @@ resource "enos_local_exec" "run_blackbox_test" {
    var.vault_install_dir != null ? { VAULT_INSTALL_DIR = var.vault_install_dir } : {},
    local.ldap_environment,
    local.postgres_environment,
-    local.mongodb_environment
+    local.mongodb_environment,
+    var.test_env_vars
  )
 }

--- a/enos/modules/vault_run_blackbox_test/variables.tf
+++ b/enos/modules/vault_run_blackbox_test/variables.tf
@ -77,3 +77,9 @@ variable "vault_install_dir" {
  description = "The directory where Vault is installed"
  default     = null
 }
+
+variable "test_env_vars" {
+  type        = map(string)
+  description = "Additional environment variables to pass to the test"
+  default     = {}
+}
--- a/enos/modules/vault_verify_undo_logs/main.tf
+++ b/enos/modules/vault_verify_undo_logs/main.tf
@ -1,77 +0,0 @@
-# Copyright IBM Corp. 2016, 2025
-# SPDX-License-Identifier: BUSL-1.1
-
-terraform {
-  required_providers {
-    enos = {
-      source = "registry.terraform.io/hashicorp-forge/enos"
-    }
-  }
-}
-
-variable "expected_state" {
-  type        = number
-  description = "The expected state to have in vault.core.replication.write_undo_logs telemetry. Must be either 1 for enabled or 0 for disabled."
-
-  validation {
-    condition     = contains([0, 1], var.expected_state)
-    error_message = "The expected_state must be either 0 or 1"
-  }
-}
-
-variable "hosts" {
-  type = map(object({
-    ipv6       = string
-    private_ip = string
-    public_ip  = string
-  }))
-  description = "The vault cluster target hosts to check"
-}
-
-variable "retry_interval" {
-  type        = number
-  description = "How many seconds to wait between each retry"
-  default     = 2
-}
-
-variable "timeout" {
-  type        = number
-  description = "The max number of seconds to wait before timing out"
-  default     = 60
-}
-
-variable "vault_addr" {
-  type        = string
-  description = "The local vault API listen address"
-}
-
-variable "vault_install_dir" {
-  type        = string
-  description = "The directory where the Vault binary will be installed"
-}
-
-variable "vault_root_token" {
-  type        = string
-  description = "The vault root token"
-}
-
-resource "enos_remote_exec" "smoke-verify-undo-logs" {
-  for_each = var.hosts
-
-  environment = {
-    EXPECTED_STATE    = var.expected_state
-    RETRY_INTERVAL    = var.retry_interval
-    TIMEOUT_SECONDS   = var.timeout
-    VAULT_ADDR        = var.vault_addr
-    VAULT_INSTALL_DIR = var.vault_install_dir
-    VAULT_TOKEN       = var.vault_root_token
-  }
-
-  scripts = [abspath("${path.module}/scripts/smoke-verify-undo-logs.sh")]
-
-  transport = {
-    ssh = {
-      host = each.value.public_ip
-    }
-  }
-}
--- a/enos/modules/vault_verify_undo_logs/scripts/smoke-verify-undo-logs.sh
+++ b/enos/modules/vault_verify_undo_logs/scripts/smoke-verify-undo-logs.sh
@ -1,35 +0,0 @@
-#!/usr/bin/env bash
-# Copyright IBM Corp. 2016, 2025
-# SPDX-License-Identifier: BUSL-1.1
-
-function fail() {
-  echo "$1" 1>&2
-  exit 1
-}
-
-[[ -z "$EXPECTED_STATE" ]] && fail "EXPECTED_STAE env variable has not been set"
-[[ -z "$RETRY_INTERVAL" ]] && fail "RETRY_INTERVAL env variable has not been set"
-[[ -z "$TIMEOUT_SECONDS" ]] && fail "TIMEOUT_SECONDS env variable has not been set"
-[[ -z "$VAULT_ADDR" ]] && fail "VAULT_ADDR env variable has not been set"
-[[ -z "$VAULT_INSTALL_DIR" ]] && fail "VAULT_INSTALL_DIR env variable has not been set"
-[[ -z "$VAULT_TOKEN" ]] && fail "VAULT_TOKEN env variable has not been set"
-
-binpath=${VAULT_INSTALL_DIR}/vault
-test -x "$binpath" || fail "unable to locate vault binary at $binpath"
-
-begin_time=$(date +%s)
-end_time=$((begin_time + TIMEOUT_SECONDS))
-while [ "$(date +%s)" -lt "$end_time" ]; do
-  state=$($binpath read sys/metrics -format=json | jq -r '.data.Gauges[] | select(.Name == "vault.core.replication.write_undo_logs")')
-  target_undo_logs_status="$(jq -r '.Value' <<< "$state")"
-
-  if [ "$target_undo_logs_status" == "$EXPECTED_STATE" ]; then
-    echo "vault.core.replication.write_undo_logs has expected Value: \"${EXPECTED_STATE}\""
-    exit 0
-  fi
-
-  echo "Waiting for vault.core.replication.write_undo_logs to have Value: \"${EXPECTED_STATE}\""
-  sleep "$RETRY_INTERVAL"
-done
-
-fail "Timed out waiting for vault.core.replication.write_undo_logs to have Value: \"${EXPECTED_STATE}\""
--- a/sdk/helper/testcluster/blackbox/session_metrics.go
+++ b/sdk/helper/testcluster/blackbox/session_metrics.go
@ -0,0 +1,92 @@
+// Copyright IBM Corp. 2025, 2026
+// SPDX-License-Identifier: BUSL-1.1
+
+package blackbox
+
+import (
+	"encoding/json"
+	"fmt"
+	"time"
+)
+
+// MetricsResponse models the JSON body of the sys/metrics endpoint: a top-level "data" object holding the Gauges, Counters and Samples lists, each entry carrying a Name and Labels.
+type MetricsResponse struct {
+	Data struct {
+		Gauges []struct {
+			Name   string            `json:"Name"`
+			Value  float64           `json:"Value"`
+			Labels map[string]string `json:"Labels"`
+		} `json:"Gauges"`
+		Counters []struct {
+			Name   string            `json:"Name"`
+			Count  int               `json:"Count"`
+			Sum    float64           `json:"Sum"`
+			Labels map[string]string `json:"Labels"`
+		} `json:"Counters"`
+		Samples []struct {
+			Name   string            `json:"Name"`
+			Count  int               `json:"Count"`
+			Sum    float64           `json:"Sum"`
+			Labels map[string]string `json:"Labels"`
+		} `json:"Samples"`
+	} `json:"data"`
+}
+
+// AssertMetricGaugeValue reads sys/metrics and checks that the gauge named gaugeName reports exactly expectedValue, within the given timeout.
+// Each attempt returns an error (read failure, nil data, JSON failure, gauge missing, value mismatch) or nil to s.EventuallyWithTimeout, which is expected to retry until timeout.
+// Note: retryInterval is accepted but never referenced in this function; per the original note the SDK uses a fixed 200ms interval.
+func (s *Session) AssertMetricGaugeValue(gaugeName string, expectedValue float64, timeout time.Duration, retryInterval time.Duration) {
+	s.t.Helper()
+
+	s.EventuallyWithTimeout(func() error {
+		// Fetch the current metrics snapshot through the logical API.
+		secret, err := s.Client.Logical().Read("sys/metrics")
+		if err != nil {
+			return fmt.Errorf("failed to read sys/metrics: %w", err)
+		}
+
+		if secret == nil || secret.Data == nil {
+			return fmt.Errorf("sys/metrics returned nil data")
+		}
+
+		// Round-trip secret.Data through JSON to decode it into the typed gauge list declared below.
+		dataBytes, err := json.Marshal(secret.Data)
+		if err != nil {
+			return fmt.Errorf("failed to marshal metrics data: %w", err)
+		}
+
+		var metricsData struct {
+			Gauges []struct {
+				Name   string            `json:"Name"`
+				Value  float64           `json:"Value"`
+				Labels map[string]string `json:"Labels"`
+			} `json:"Gauges"`
+		}
+
+		if err := json.Unmarshal(dataBytes, &metricsData); err != nil {
+			return fmt.Errorf("failed to unmarshal metrics data: %w", err)
+		}
+
+		// Take the first gauge whose Name matches; later entries with the same name are not examined.
+		var found bool
+		var actualValue float64
+		for _, gauge := range metricsData.Gauges {
+			if gauge.Name == gaugeName {
+				found = true
+				actualValue = gauge.Value
+				break
+			}
+		}
+
+		if !found {
+			return fmt.Errorf("gauge metric %q not found in sys/metrics response", gaugeName)
+		}
+
+		if actualValue != expectedValue {
+			return fmt.Errorf("gauge %q has value %.0f, expected %.0f", gaugeName, actualValue, expectedValue)
+		}
+
+		s.t.Logf("Gauge metric %q has expected value: %.0f", gaugeName, expectedValue)
+		return nil
+	}, timeout)
+}
--- a/sdk/helper/testcluster/blackbox/session_remote.go
+++ b/sdk/helper/testcluster/blackbox/session_remote.go
@ -0,0 +1,70 @@
+// Copyright IBM Corp. 2025, 2026
+// SPDX-License-Identifier: BUSL-1.1
+
+package blackbox
+
+import (
+	"bytes"
+	"fmt"
+	"os/exec"
+	"strings"
+)
+
+// RemoteHost holds the public and private IP addresses of a remote host; AssertRemoteCLIVersion uses PublicIP as its SSH target.
+type RemoteHost struct {
+	PublicIP  string `json:"public_ip"`
+	PrivateIP string `json:"private_ip"`
+}
+
+// AssertRemoteCLIVersion runs "<vaultInstallDir>/vault version" on host.PublicIP over ssh and fails the test unless the trimmed stdout equals the expected version string.
+// Expected: "Vault v<version> ('<sha>'), built <buildDate>", plus " (cgo)" for the hsm/fips1403 editions; the same string without the " ('<sha>')" segment is also accepted. Unknown editions and ssh failures fail the test.
+func (s *Session) AssertRemoteCLIVersion(host RemoteHost, vaultInstallDir, version, sha, buildDate, edition string) {
+	s.t.Helper()
+
+	// Command line executed on the remote host.
+	vaultBinary := fmt.Sprintf("%s/vault", vaultInstallDir)
+	remoteCmd := fmt.Sprintf("%s version", vaultBinary)
+
+	// Run it through the local ssh client with host-key checking off and nothing written to known_hosts.
+	cmd := exec.Command("ssh",
+		"-o", "StrictHostKeyChecking=no",
+		"-o", "UserKnownHostsFile=/dev/null",
+		"-o", "LogLevel=ERROR",
+		host.PublicIP,
+		remoteCmd,
+	)
+
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	err := cmd.Run()
+	if err != nil {
+		s.t.Fatalf("Failed to execute vault version on remote host %s: %v\nStderr: %s", host.PublicIP, err, stderr.String())
+	}
+
+	output := strings.TrimSpace(stdout.String())
+
+	// Expected output including the SHA. NOTE(review): confirm the ('sha') quoting matches what the deployed vault binary prints.
+	expectedVersion := fmt.Sprintf("Vault v%s ('%s'), built %s", version, sha, buildDate)
+
+	switch edition {
+	case "ce", "ent":
+		// No additional suffix
+	case "ent.hsm", "ent.fips1403", "ent.hsm.fips1403":
+		expectedVersion += " (cgo)"
+	default:
+		s.t.Fatalf("unknown Vault edition: %s", edition)
+	}
+
+	// Also accept the output without the SHA (some builds may not include it): drop the whole " ('<sha>')" segment, leading space included, so the result reads "Vault v<version>, built ...".
+	expectedVersionNoSHA := strings.Replace(expectedVersion, fmt.Sprintf(" ('%s')", sha), "", 1)
+	expectedVersionNoSHA = strings.TrimSpace(strings.Replace(expectedVersionNoSHA, "  ", " ", -1))
+
+	if output != expectedVersion && output != expectedVersionNoSHA {
+		s.t.Fatalf("CLI version mismatch on host %s.\nExpected: %s\nor: %s\nGot: %s",
+			host.PublicIP, expectedVersion, expectedVersionNoSHA, output)
+	}
+
+	s.t.Logf("CLI version verification succeeded on host %s: %s", host.PublicIP, output)
+}
--- a/vault/external_tests/blackbox/verify/undo_logs_test.go
+++ b/vault/external_tests/blackbox/verify/undo_logs_test.go
@ -0,0 +1,63 @@
+// Copyright IBM Corp. 2025, 2026
+// SPDX-License-Identifier: BUSL-1.1
+
+package verify
+
+import (
+	"os"
+	"strconv"
+	"testing"
+	"time"
+
+	"github.com/hashicorp/vault/sdk/helper/testcluster/blackbox"
+)
+
+// TestVaultUndoLogsMetric asserts that the vault.core.replication.write_undo_logs gauge equals EXPECTED_STATE (0 or 1) within TIMEOUT_SECONDS.
+// EXPECTED_STATE, TIMEOUT_SECONDS and RETRY_INTERVAL (seconds) must all be set and numeric, otherwise the test fails before contacting Vault. Per the original note it runs from CI/GitHub runners and reaches the cluster via the API.
+func TestVaultUndoLogsMetric(t *testing.T) {
+	t.Parallel()
+
+	// EXPECTED_STATE is required; it is parsed as a float so it can be passed straight to the float64 gauge comparison.
+	expectedStateStr := os.Getenv("EXPECTED_STATE")
+	if expectedStateStr == "" {
+		t.Fatal("EXPECTED_STATE environment variable is required")
+	}
+
+	expectedState, err := strconv.ParseFloat(expectedStateStr, 64)
+	if err != nil {
+		t.Fatalf("Failed to parse EXPECTED_STATE: %v", err)
+	}
+
+	// Reject anything other than 0 or 1.
+	if expectedState != 0 && expectedState != 1 {
+		t.Fatalf("EXPECTED_STATE must be 0 or 1, got: %.0f", expectedState)
+	}
+
+	timeoutStr := os.Getenv("TIMEOUT_SECONDS")
+	if timeoutStr == "" {
+		t.Fatal("TIMEOUT_SECONDS environment variable is required")
+	}
+
+	timeoutSeconds, err := strconv.Atoi(timeoutStr)
+	if err != nil {
+		t.Fatalf("Failed to parse TIMEOUT_SECONDS: %v", err)
+	}
+
+	retryIntervalStr := os.Getenv("RETRY_INTERVAL")
+	if retryIntervalStr == "" {
+		t.Fatal("RETRY_INTERVAL environment variable is required")
+	}
+
+	retryIntervalSeconds, err := strconv.Atoi(retryIntervalStr)
+	if err != nil {
+		t.Fatalf("Failed to parse RETRY_INTERVAL: %v", err)
+	}
+
+	timeout := time.Duration(timeoutSeconds) * time.Second
+	retryInterval := time.Duration(retryIntervalSeconds) * time.Second
+
+	v := blackbox.New(t)
+
+	// Check the gauge via the blackbox session helper; retryInterval is forwarded as an argument alongside the timeout.
+	v.AssertMetricGaugeValue("vault.core.replication.write_undo_logs", expectedState, timeout, retryInterval)
+}