Josh Black 5e90024b26
Add Enos benchmark scenario (#30675)
* Add Enos benchmark scenario

* add docs on how to run the scenario

* update description again

* see if this works better if we return an empty map

* hopefully disabling telemetry doesn't crash everything now

* yet another try at making telemetry configurable

* swap Consul nodes over to be the same as the Vault ones

* adjust IOPS up and add a note about it to the docs

* fix missing variables in the ec2 shim

* randomly pick an az for k6 and metrics instances

* enos(benchmark): further modularize and make target infra cloud-agnostic

The initial goal of this was to resolve an issue where the one or more
target instances would sometimes attempt to provision in an availability
zone that doesn't offer the requested instance type. The
target_ec2_instances module already supports assigning instances based on
instance type offerings, so I wanted to use it for all instances. It also
has the side effect of provisioning instances in parallel, which speeds up
the overall scenario time.

I ended up further modularizing the `benchmark` module into several
sub-modules that each perform a single task well and rely on provisioning
in the root module. This will let us use the module in other clouds more
easily should we want to in the future.

Signed-off-by: Ryan Cragun <me@ryan.ec>

* add copywrite headers

Signed-off-by: Ryan Cragun <me@ryan.ec>

* address some feedback and limit disk iops to 16k by default

Signed-off-by: Ryan Cragun <me@ryan.ec>

---------

Signed-off-by: Ryan Cragun <me@ryan.ec>
Co-authored-by: Ryan Cragun <me@ryan.ec>
2025-06-20 09:40:21 -07:00

# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
terraform {
  required_providers {
    # We need to specify the provider source in each module until we publish it
    # to the public registry
    enos = {
      source  = "registry.terraform.io/hashicorp-forge/enos"
      version = ">= 0.5.3"
    }
  }
}
locals {
  api_addr_localhost = var.ip_version == 4 ? "http://127.0.0.1:${var.listener_port}" : "http://[::1]:${var.listener_port}"
  api_addrs = tolist([for h in var.hosts : {
    4 : "http://${h.public_ip}:${var.listener_port}",
    6 : "http://[${h.ipv6}]:${var.listener_port}",
  }])
  api_addrs_internal = tolist([for h in var.hosts : {
    4 : "http://${h.private_ip}:${var.listener_port}",
    6 : "http://[${h.ipv6}]:${var.listener_port}",
  }])
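  // Each address list is keyed by IP version so call sites can select the right
  // address with var.ip_version, e.g. local.api_addrs_internal[idx][var.ip_version].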
  bin_path = "${var.install_dir}/vault"
  cluster_addrs = tolist([for h in var.hosts : {
    4 : "http://${h.public_ip}:${var.cluster_port}",
    6 : "http://[${h.ipv6}]:${var.cluster_port}",
  }])
  cluster_addrs_internal = tolist([for h in var.hosts : {
    4 : "http://${h.private_ip}:${var.cluster_port}",
    6 : "http://[${h.ipv6}]:${var.cluster_port}",
  }])
  // In order to get Terraform to plan we have to use collections with keys that are known at plan
  // time. Here we're creating locals that keep track of index values that point to our target hosts.
  followers = toset(slice(local.instances, 1, length(local.instances)))
  instances = [for idx in range(length(var.hosts)) : tostring(idx)]
  leader    = toset(slice(local.instances, 0, 1))
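  // For example, with three hosts: instances = ["0", "1", "2"], so leader is the
  // set {"0"} and followers is the set {"1", "2"}.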
  listener_address          = var.ip_version == 4 ? "0.0.0.0:${var.listener_port}" : "[::]:${var.listener_port}"
  prometheus_retention_time = var.enable_telemetry ? "24h" : "0"
  // Handle cases where we might have to distribute HSM tokens for the pkcs11 seal before starting
  // Vault.
  token_base64           = try(lookup(var.seal_attributes, "token_base64", ""), "")
  token_base64_secondary = try(lookup(var.seal_attributes_secondary, "token_base64", ""), "")
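  // When either seal is pkcs11, these base64 blobs are handed to the
  // softhsm_distribute_vault_keys sub-modules below so that every node receives
  // the same software HSM token before Vault starts.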
  // This module currently supports up to two defined seals. Most of our locals logic here is for
  // creating the correct seal configuration.
  seals = {
    primary   = local.seal_primary
    secondary = local.seal_secondary
  }
  seals_primary = {
    awskms = {
      type = "awskms"
      attributes = merge(
        {
          name     = var.seal_alias
          priority = var.seal_priority
        }, var.seal_attributes
      )
    }
    pkcs11 = {
      type = "pkcs11"
      attributes = merge(
        {
          name     = var.seal_alias
          priority = var.seal_priority
        },
        // Strip out attributes that aren't supposed to be in the seal stanza, like our base64
        // encoded softhsm blob and the token directory. We'll also inject the shared object
        // library location that we detect on the target machines. This allows us to create the
        // token and keys on machines that have different shared object locations.
        merge(
          try({ for key, val in var.seal_attributes : key => val if key != "token_base64" && key != "token_dir" }, {}),
          # Note: the below reference has to point to a specific instance of the maybe_configure_hsm
          # module (in this case [0]) because the maybe_configure_hsm module call uses `count` to
          # control whether it runs or not.
          try({ lib = module.maybe_configure_hsm[0].lib }, {})
        ),
      )
    }
    shamir = {
      type       = "shamir"
      attributes = null
    }
  }
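  // For reference, with seal_type = "awskms" the rendered primary seal passed to
  // enos_vault_start looks roughly like:
  //   { type = "awskms", attributes = { name = ..., priority = ..., <kms attributes> } }
  // where everything beyond name and priority comes straight from var.seal_attributes.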
  seal_primary = local.seals_primary[var.seal_type]
  seals_secondary = {
    awskms = {
      type = "awskms"
      attributes = merge(
        {
          name     = var.seal_alias_secondary
          priority = var.seal_priority_secondary
        }, var.seal_attributes_secondary
      )
    }
    pkcs11 = {
      type = "pkcs11"
      attributes = merge(
        {
          name     = var.seal_alias_secondary
          priority = var.seal_priority_secondary
        },
        merge(
          try({ for key, val in var.seal_attributes_secondary : key => val if key != "token_base64" && key != "token_dir" }, {}),
          # Note: the below reference has to point to a specific instance of the maybe_configure_hsm_secondary
          # module (in this case [0]) because the maybe_configure_hsm_secondary module call uses `count` to
          # control whether it runs or not.
          try({ lib = module.maybe_configure_hsm_secondary[0].lib }, {})
        ),
      )
    }
    none = {
      type       = "none"
      attributes = null
    }
  }
  seal_secondary  = local.seals_secondary[var.seal_type_secondary]
  storage_address = var.ip_version == 4 ? "0.0.0.0:${var.external_storage_port}" : "[::]:${var.external_storage_port}"
  storage_attributes = [for idx, host in var.hosts : (var.storage_backend == "raft" ?
    merge(
      {
        node_id = "${var.storage_node_prefix}_${idx}"
      },
      var.storage_backend_attrs
    ) :
    {
      address = local.storage_address
      path    = "vault"
    })
  ]
  storage_retry_join = {
    "raft" : {
      auto_join : "provider=aws addr_type=${var.ip_version == 4 ? "private_v4" : "public_v6"} tag_key=${var.cluster_tag_key} tag_value=${var.cluster_name}",
      auto_join_scheme : "http",
    },
  }
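  // For example, with ip_version = 4 the rendered auto_join string looks like
  //   "provider=aws addr_type=private_v4 tag_key=<cluster_tag_key> tag_value=<cluster_name>",
  // so raft nodes discover one another via AWS instance tags (cloud auto-join).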
}
# You might be wondering why our start_vault module, which supports shamir, awskms, and pkcs11 seal
# types, contains sub-modules that are only used for HSM. Well, each of those seal devices has
# different requirements, and as such we have some seal-specific work to do before starting Vault.
#
# A shamir seal key cannot exist until Vault has already started, so this module's responsibility
# for shamir seals is ensuring that the seal type is passed to the enos_vault_start resource.
# That's it.
#
# Auto-unseal with a KMS requires that we configure the enos_vault_start resource with the correct
# seal type and the attributes necessary to know which KMS key to use. Vault should automatically
# unseal if we've given it the correct configuration. As long as Vault is able to access the key
# in the KMS it should be able to start. That's normally done via roles associated with the target
# machines, which is outside the scope of this module.
#
# Auto-unseal with an HSM and PKCS#11 is more complicated because a shared object library, which is
# how we interface with the HSM, must be present on each node in order to start Vault. In the real
# world this means an actual HSM in the same rack or data center as every node in the Vault cluster,
# but in our case we're creating ephemeral infrastructure for these test scenarios and don't have a
# real HSM available. We could use CloudHSM or the like, but at the time of writing CloudHSM
# provisioning takes anywhere from 30 to 60 minutes and costs upwards of $2 an hour. That's
# far too long and expensive for scenarios we'll run fairly frequently. Instead, we test using a
# software HSM. Using a software HSM solves the cost and speed problems but creates a new set of
# problems. We need to ensure every node in the cluster has access to the same "HSM", and with
# softhsm that means the same software, configuration, tokens, and keys. Our `seal_pkcs11` module
# takes care of creating the token and keys, but that's the end of the road for that module. It's
# our job to ensure that when we start Vault with a software HSM, the correct software,
# configuration, and data are available on the nodes. That's where the following two modules come
# in. They handle installing the required software, configuring it, and distributing the key data
# that was passed in via seal attributes.
module "maybe_configure_hsm" {
source = "../softhsm_distribute_vault_keys"
count = (var.seal_type == "pkcs11" || var.seal_type_secondary == "pkcs11") ? 1 : 0
hosts = var.hosts
token_base64 = local.token_base64
}
module "maybe_configure_hsm_secondary" {
source = "../softhsm_distribute_vault_keys"
depends_on = [module.maybe_configure_hsm]
count = (var.seal_type == "pkcs11" || var.seal_type_secondary == "pkcs11") ? 1 : 0
hosts = var.hosts
token_base64 = local.token_base64_secondary
}
resource "enos_vault_start" "leader" {
for_each = local.leader
depends_on = [
module.maybe_configure_hsm_secondary,
]
bin_path = local.bin_path
config_dir = var.config_dir
config_mode = var.config_mode
environment = merge(var.environment, {
VAULT_DISABLE_MLOCK = var.disable_mlock
})
config = {
api_addr = local.api_addrs_internal[tonumber(each.value)][var.ip_version]
cluster_addr = local.cluster_addrs_internal[tonumber(each.value)][var.ip_version]
cluster_name = var.cluster_name
listener = {
type = "tcp"
attributes = {
address = local.listener_address
tls_disable = "true"
}
telemetry = {
unauthenticated_metrics_access = var.enable_telemetry
}
}
log_level = var.log_level
storage = {
type = var.storage_backend
attributes = local.storage_attributes[each.key]
retry_join = try(local.storage_retry_join[var.storage_backend], null)
}
seals = local.seals
ui = true
telemetry = {
prometheus_retention_time = local.prometheus_retention_time
disable_hostname = true
}
}
license = var.license
manage_service = var.manage_service
username = var.service_username
unit_name = "vault"
transport = {
ssh = {
host = var.hosts[each.value].public_ip
}
}
}
resource "enos_vault_start" "followers" {
depends_on = [
enos_vault_start.leader,
]
for_each = local.followers
bin_path = local.bin_path
config_dir = var.config_dir
config_mode = var.config_mode
environment = merge(var.environment, {
VAULT_DISABLE_MLOCK = var.disable_mlock
})
config = {
api_addr = local.api_addrs_internal[tonumber(each.value)][var.ip_version]
cluster_addr = local.cluster_addrs_internal[tonumber(each.value)][var.ip_version]
cluster_name = var.cluster_name
listener = {
type = "tcp"
attributes = {
address = local.listener_address
tls_disable = "true"
}
telemetry = {
unauthenticated_metrics_access = var.enable_telemetry
}
}
log_level = var.log_level
storage = {
type = var.storage_backend
attributes = { for key, value in local.storage_attributes[each.key] : key => value }
retry_join = try(local.storage_retry_join[var.storage_backend], null)
}
seals = local.seals
ui = true
telemetry = {
prometheus_retention_time = local.prometheus_retention_time
disable_hostname = true
}
}
license = var.license
manage_service = var.manage_service
username = var.service_username
unit_name = "vault"
transport = {
ssh = {
host = var.hosts[each.value].public_ip
}
}
}
output "token_base64" {
value = local.token_base64
}
output "token_base64_secondary" {
value = local.token_base64_secondary
}
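
# A minimal, hypothetical sketch of wiring this module into a scenario's root
# module. The module path, host addresses, and variable values below are
# illustrative assumptions, not taken from the actual benchmark scenario:
#
#   module "start_vault" {
#     source = "./modules/start_vault"
#
#     # Host objects must expose public_ip, private_ip, and ipv6, since the
#     # address locals above interpolate all three.
#     hosts = {
#       "0" = { public_ip = "203.0.113.10", private_ip = "10.0.1.10", ipv6 = "" }
#       "1" = { public_ip = "203.0.113.11", private_ip = "10.0.1.11", ipv6 = "" }
#       "2" = { public_ip = "203.0.113.12", private_ip = "10.0.1.12", ipv6 = "" }
#     }
#     cluster_name    = "vault-benchmark"
#     ip_version      = 4
#     seal_type       = "awskms"
#     seal_attributes = { kms_key_id = "<kms-key-id>" } # assumed awskms attribute
#     storage_backend = "raft"
#   }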