Mirror of https://github.com/hashicorp/vault.git (synced 2025-08-22 23:21:08 +02:00)
* [VAULT-27917] fix(enos): handle SLES guestregister.service unreliability

  The SLES-provided `guestregister.service` systemd unit is unreliable enough that it fails roughly one in nine times when provisioning SLES instances. When that happens the machine never successfully execs SUSEConnect to enroll, we get no access to the SLES repositories, and our scenarios subsequently break. I resolved this by restructuring our `install_packages` module to separate repository synchronization, repository addition, and package installation into different scripts and resources, and by adding special-case handling for SLES and the `guestregister.service`. I also make a distinction between `dnf` and `yum` because, while they are more or less the same thing on RHEL, that is not the case on Amazon Linux 2. I also shimmed out the rest of the Apt support in case we ever need to add repos there.

* Revert "Temporarily remove SLES from samples (#27378)"

  This reverts commit 490cdd90661a57cf849c7d64aec545e87fb393c8.

Signed-off-by: Ryan Cragun <me@ryan.ec>
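For context, the checks this change automates can be run by hand on an affected SLES instance. The commands below are purely illustrative and simply mirror what the script further down performs; none of them are specific to Vault or Enos:

    systemctl is-active guestregister.service             # typically "inactive" once the oneshot has finished
    systemctl is-failed guestregister.service             # "failed" when the SUSEConnect enrollment did not succeed
    sudo systemctl restart --wait guestregister.service   # re-run the registration oneshot
    zypper lr                                             # only lists repositories once registration has worked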
119 lines
4.0 KiB
Bash
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

set -e

fail() {
  echo "$1" 1>&2
  exit 1
}

[[ -z "${PACKAGE_MANAGER}" ]] && fail "PACKAGE_MANAGER env variable has not been set"
[[ -z "${RETRY_INTERVAL}" ]] && fail "RETRY_INTERVAL env variable has not been set"
[[ -z "${TIMEOUT_SECONDS}" ]] && fail "TIMEOUT_SECONDS env variable has not been set"


# The SLES AMIs do not come configured with Zypper repositories by default. To get them you
# have to run SUSEConnect to register the instance with SUSE. On the AMI this is handled
# automatically by a oneshot systemd unit called guestregister.service. This oneshot service needs
# to complete before any other repo or package steps are performed. At the time of writing it's very
# unreliable, so we have to verify ourselves that it executed correctly, or restart it. We do this
# by checking whether guestregister.service has reached the "inactive" state that we need.
# If it hasn't reached that state it's usually in some sort of active state, i.e. running, or it has
# failed. If it's in one of the active states we let it continue and check the status again when
# it completes. If it has completed but failed, we restart the service to re-run the script that
# executes SUSEConnect.
sles_check_guestregister_service_and_restart_if_failed() {
  local active_state
  local failed_state

  # systemctl returns non-zero exit codes. We rely on the output here because not every state has
  # its own exit code.
  set +e
  active_state=$(sudo systemctl is-active guestregister.service)
  failed_state=$(sudo systemctl is-failed guestregister.service)
  set -e

case "$active_state" in
|
|
active|activating|deactivating)
|
|
# It's running so we'll return 1 and get retried by the caller
|
|
echo "the guestregister.service is still in the ${active_state} state" 1>&2
|
|
return 1
|
|
;;
|
|
*)
|
|
if [ "$active_state" == "inactive" ] && [ "$failed_state" == "inactive" ]; then
|
|
# The oneshot has completed and hasn't "failed"
|
|
echo "the guestregister.service is 'inactive' for both active and failed states"
|
|
return 0
|
|
fi
|
|
|
|
# Our service is stopped and failed, restart it and hope it works the next time
|
|
sudo systemctl restart --wait guestregister.service
|
|
;;
|
|
esac
|
|
}
|
|
|
|
# Check the guestregister service and restart it if it has failed. If the check passes, do another
# check to make sure that the zypper repository list isn't empty.
sles_ensure_suseconnect() {
  local health_output
  local lr_output

  if ! health_output=$(sles_check_guestregister_service_and_restart_if_failed); then
    echo "the guestregister.service failed to reach a healthy state: ${health_output}" 1>&2
    return 1
  fi

  # Make sure Zypper has repositories.
  if ! lr_output=$(zypper lr); then
    echo "The guestregister.service failed. We're unable to SUSEConnect and thus have no Zypper repositories: ${lr_output}: ${health_output}" 1>&2
    return 1
  fi

  return 0
}

# Synchronize our repositories so that further installation steps are working with an updated cache
# and repo metadata.
synchronize_repos() {
  case $PACKAGE_MANAGER in
    apt)
      sudo apt update
      ;;
    dnf)
      sudo dnf makecache
      ;;
    yum)
      sudo yum makecache
      ;;
    zypper)
      if [ "$DISTRO" == "sles" ]; then
        if ! sles_ensure_suseconnect; then
          return 1
        fi
      fi
      sudo zypper --gpg-auto-import-keys --non-interactive ref
      sudo zypper --gpg-auto-import-keys --non-interactive refs
      ;;
    *)
      return 0
      ;;
  esac
}

# Before we start to modify repositories and install packages we'll wait for cloud-init to finish
# so it doesn't race with any of our package installations.
# We run as sudo because Amazon Linux 2 throws Python 2.7 errors when running `cloud-init status`
# as a non-root user (known bug).
sudo cloud-init status --wait

# Retry repository synchronization until it succeeds or we exceed TIMEOUT_SECONDS.
begin_time=$(date +%s)
end_time=$((begin_time + TIMEOUT_SECONDS))
while [ "$(date +%s)" -lt "$end_time" ]; do
  if synchronize_repos; then
    exit 0
  fi

  sleep "$RETRY_INTERVAL"
done

fail "Timed out waiting for distro repos to be set up"