Ryan Cragun 0764d7d177
enos: poweroff and terminate instances when shutting them down (#28316)
Previously our `shutdown_nodes` modules would halt the machine. While
this is useful for simulating a failure it makes cleaning up the halted
machines very slow in AWS.

Instead, we now poweroff the machines and utilize EC2's instance
poweroff handling to immediately terminate the instances.

I've test both scenarios locally utilizing the change and both still
work as expected. I also timed before and after and this change saves 5
MINUTES in total runtime (~40%) for the PR replication scenario. I assume
it yields similar results for autopilot.

Signed-off-by: Ryan Cragun <me@ryan.ec>
2024-09-09 13:22:41 -06:00

215 lines
5.0 KiB
HCL

# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
terraform {
required_providers {
# We need to specify the provider source in each module until we publish it
# to the public registry
enos = {
source = "registry.terraform.io/hashicorp-forge/enos"
version = ">= 0.3.24"
}
}
}
data "aws_vpc" "vpc" {
id = var.vpc_id
}
data "aws_ami" "ami" {
filter {
name = "image-id"
values = [var.ami_id]
}
}
data "aws_ec2_instance_type_offerings" "instance" {
filter {
name = "instance-type"
values = [local.instance_type]
}
location_type = "availability-zone"
}
data "aws_availability_zones" "available" {
state = "available"
filter {
name = "zone-name"
values = data.aws_ec2_instance_type_offerings.instance.locations
}
}
data "aws_subnets" "vpc" {
filter {
name = "availability-zone"
values = data.aws_availability_zones.available.names
}
filter {
name = "vpc-id"
values = [var.vpc_id]
}
}
data "aws_iam_policy_document" "target" {
statement {
resources = ["*"]
actions = [
"ec2:DescribeInstances",
"secretsmanager:*"
]
}
dynamic "statement" {
for_each = var.seal_key_names
content {
resources = [statement.value]
actions = [
"kms:DescribeKey",
"kms:ListKeys",
"kms:Encrypt",
"kms:Decrypt",
"kms:GenerateDataKey"
]
}
}
}
data "aws_iam_policy_document" "target_instance_role" {
statement {
actions = ["sts:AssumeRole"]
principals {
type = "Service"
identifiers = ["ec2.amazonaws.com"]
}
}
}
data "enos_environment" "localhost" {}
locals {
cluster_name = coalesce(var.cluster_name, random_string.cluster_name.result)
instance_type = local.instance_types[data.aws_ami.ami.architecture]
instance_types = {
"arm64" = var.instance_types["arm64"]
"x86_64" = var.instance_types["amd64"]
}
instances = toset([for idx in range(var.instance_count) : tostring(idx)])
name_prefix = "${var.project_name}-${local.cluster_name}-${random_string.unique_id.result}"
}
resource "random_string" "cluster_name" {
length = 8
lower = true
upper = false
numeric = false
special = false
}
resource "random_string" "unique_id" {
length = 4
lower = true
upper = false
numeric = false
special = false
}
resource "aws_iam_role" "target_instance_role" {
name = "${local.name_prefix}-instance-role"
assume_role_policy = data.aws_iam_policy_document.target_instance_role.json
}
resource "aws_iam_instance_profile" "target" {
name = "${local.name_prefix}-instance-profile"
role = aws_iam_role.target_instance_role.name
}
resource "aws_iam_role_policy" "target" {
name = "${local.name_prefix}-role-policy"
role = aws_iam_role.target_instance_role.id
policy = data.aws_iam_policy_document.target.json
}
resource "aws_security_group" "target" {
name = "${local.name_prefix}-sg"
description = "Target instance security group"
vpc_id = var.vpc_id
# External ingress
dynamic "ingress" {
for_each = var.ports_ingress
content {
from_port = ingress.value.port
to_port = ingress.value.port
protocol = ingress.value.protocol
cidr_blocks = flatten([
formatlist("%s/32", data.enos_environment.localhost.public_ipv4_addresses),
join(",", data.aws_vpc.vpc.cidr_block_associations.*.cidr_block),
formatlist("%s/32", var.ssh_allow_ips)
])
ipv6_cidr_blocks = data.aws_vpc.vpc.ipv6_cidr_block != "" ? [data.aws_vpc.vpc.ipv6_cidr_block] : null
}
}
# Internal traffic
ingress {
from_port = 0
to_port = 0
protocol = "-1"
self = true
}
# External traffic
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
ipv6_cidr_blocks = ["::/0"]
}
tags = merge(
var.common_tags,
{
Name = "${local.name_prefix}-sg"
},
)
}
resource "aws_instance" "targets" {
for_each = local.instances
ami = var.ami_id
iam_instance_profile = aws_iam_instance_profile.target.name
// Some scenarios (autopilot, pr_replication) shutdown instances to simulate failure. In those
// cases we should terminate the instance entirely rather than get stuck in stopped limbo.
instance_initiated_shutdown_behavior = "terminate"
instance_type = local.instance_type
key_name = var.ssh_keypair
subnet_id = data.aws_subnets.vpc.ids[tonumber(each.key) % length(data.aws_subnets.vpc.ids)]
vpc_security_group_ids = [aws_security_group.target.id]
tags = merge(
var.common_tags,
{
Name = "${local.name_prefix}-${var.cluster_tag_key}-instance-target"
"${var.cluster_tag_key}" = local.cluster_name
},
)
}
module "disable_selinux" {
depends_on = [aws_instance.targets]
source = "../disable_selinux"
count = var.disable_selinux == true ? 1 : 0
hosts = local.hosts
}