vault/enos/modules/install_packages/scripts/synchronize-repos.sh
Ryan Cragun 77e48e837e
enos(cloud-init): fix synchronize-repos (#30013)
`$?` in bash is wonky. When you evaluate a command as the condition of an
`if` statement, `$?` only holds that command's actual exit status inside
the branches of that statement. Therefore, since we rely on it in
synchronize-repos, we have to evaluate the rest of the function within
the scope of that statement.

Signed-off-by: Ryan Cragun <me@ryan.ec>
2025-03-24 16:02:24 -06:00

152 lines
4.9 KiB
Bash

#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -e
# Print an error message to stderr and abort the script.
# Arguments: $1 - the message to print
# Outputs:   writes the message, followed by a newline, to stderr
# Exits:     always exits the current shell with status 1
fail() {
  # printf rather than echo: echo would treat a message such as "-n" or "-e" as an option and
  # print nothing.
  printf '%s\n' "$1" 1>&2
  exit 1
}
# Refuse to run unless every required environment variable is set and non-empty. The variables are
# checked in a fixed order so the first missing one is the one that gets reported.
for required_var in PACKAGE_MANAGER RETRY_INTERVAL TIMEOUT_SECONDS; do
  # ${!required_var} is an indirect expansion: the value of the variable named by required_var.
  if [[ -z "${!required_var}" ]]; then
    fail "${required_var} env variable has not been set"
  fi
done
# The SLES AMIs do not come configured with Zypper repositories by default. To get them you have to
# run SUSEConnect to register the instance with SUSE. On the AMI this is handled automatically by a
# oneshot systemd unit called guestregister.service, which needs to complete before any other repo
# or package steps are run. At the time of writing it's very unreliable, so we have to verify
# ourselves that it has executed correctly, or restart it.
#
# We do this by checking whether guestregister.service has reached the "inactive" state for both
# `systemctl is-active` and `systemctl is-failed`:
#   - active, activating or deactivating: it is still running; let it continue and have the caller
#     check again later.
#   - "inactive" for both queries: the oneshot has completed without failing.
#   - anything else: the service is stopped and failed; restart it to re-run the script that
#     executes SUSEConnect.
#
# Outputs: a status line on stdout when the service is healthy, on stderr when it is still running
# Returns: 0 when the service is healthy, 1 when it is still running, otherwise the exit status of
#          `systemctl restart --wait guestregister.service`
sles_check_guestregister_service_and_restart_if_failed() {
  local active_state
  local failed_state

  # systemctl returns non-zero exit codes for these queries. We rely on the printed state because
  # not every state has its own exit code. `|| true` discards the exit status without toggling
  # errexit, so the shell options of the caller are left exactly as they were.
  active_state=$(sudo systemctl is-active guestregister.service) || true
  failed_state=$(sudo systemctl is-failed guestregister.service) || true

  case "$active_state" in
    active | activating | deactivating)
      # It's running so we'll return 1 and get retried by the caller
      echo "the guestregister.service is still in the ${active_state} state" 1>&2
      return 1
      ;;
    inactive)
      if [[ "$failed_state" == "inactive" ]]; then
        # The oneshot has completed and hasn't "failed"
        echo "the guestregister.service is 'inactive' for both active and failed states"
        return 0
      fi
      ;;
  esac

  # Our service is stopped and failed, restart it and hope it works the next time. The exit status
  # of the restart is the return value of this function.
  sudo systemctl restart --wait guestregister.service
}
# Check the guestregister service, restarting it if it has failed. If that passes, run `zypper lr`
# as a second check that Zypper actually has repositories.
# NOTE(review): this relies on `zypper lr` exiting non-zero when no repositories are defined;
# confirm against the zypper exit codes.
#
# Outputs: a diagnostic on stderr when either check fails
# Returns: 0 when both checks pass, 1 otherwise
sles_ensure_suseconnect() {
  local health_output
  # Declared local so the captured repository listing does not leak into the global scope.
  local lr_output

  if ! health_output=$(sles_check_guestregister_service_and_restart_if_failed); then
    echo "the guestregister.service failed to reach a healthy state: ${health_output}" 1>&2
    return 1
  fi

  # Make sure Zypper has repositories.
  if ! lr_output=$(zypper lr); then
    echo "The guestregister.service failed. Unable to SUSEConnect and thus have no Zypper repositories: ${lr_output}: ${health_output}." 1>&2
    return 1
  fi

  return 0
}
# Synchronize our repositories so that further installation steps are working with updated cache
# and repo metadata.
#
# Globals: PACKAGE_MANAGER (read) - apt, dnf, yum or zypper; any other value is a no-op
#          DISTRO (read)          - when "sles", SUSEConnect registration is verified first
# Returns: 0 on success or for an unrecognized package manager, non-zero when any step fails
synchronize_repos() {
  case "$PACKAGE_MANAGER" in
    apt)
      sudo apt update
      ;;
    dnf)
      sudo dnf makecache
      ;;
    yum)
      sudo yum makecache
      ;;
    zypper)
      if [[ "${DISTRO:-}" == "sles" ]]; then
        if ! sles_ensure_suseconnect; then
          return 1
        fi
      fi
      # Return on a failed `ref` explicitly. This function is called as an `if` condition, where
      # errexit is ignored, so without the check a failed `ref` would be hidden by a successful
      # `refs` and the caller would believe the repositories were synchronized.
      sudo zypper --gpg-auto-import-keys --non-interactive ref || return
      sudo zypper --gpg-auto-import-keys --non-interactive refs
      ;;
    *)
      return 0
      ;;
  esac
}
# Wait for cloud-init to finish and report how it went.
# Before we start to modify repositories and install packages we'll wait for cloud-init to finish
# so it doesn't race with any of our package installations.
# We run as sudo because Amazon Linux 2 throws Python 2.7 errors when running `cloud-init status` as
# non-root user (known bug).
#
# Outputs: on any non-zero cloud-init exit status, the status, the captured output and the
#          cloud-init logs are written to stderr
# Returns: 0 when `cloud-init status --wait` exits 0, or exits 2 (which this script treats as
#          "did not complete successfully but recovered"); 1 for any other exit status
wait_for_cloud_init() {
  local output
  local res=0
  local summary

  # Capture the exit status on the same line as the command. `$?` is only trustworthy immediately
  # after the command, and `|| res=$?` also keeps errexit from aborting before we can report.
  output=$(sudo cloud-init status --wait) || res=$?
  if ((res == 0)); then
    return 0
  fi

  if ((res == 2)); then
    summary="WARNING: cloud-init did not complete successfully but recovered."
  else
    summary="cloud-init did not complete successfully."
  fi

  {
    echo "$summary"
    echo "Exit code: $res"
    echo "Output: $output"
    echo "Here are the logs for the failure:"
    # Dumping the logs is best effort: a missing or unreadable log must not abort the script under
    # errexit, in particular not in the recovered case where we want to carry on.
    cat /var/log/cloud-init-* || true
  } 1>&2

  if ((res == 2)); then
    return 0
  fi
  return 1
}
# Wait for cloud-init, but only on hosts where the cloud-init command exists.
if type cloud-init; then
  wait_for_cloud_init
fi

# Keep trying to synchronize the repositories, pausing RETRY_INTERVAL seconds between attempts,
# until an attempt succeeds or TIMEOUT_SECONDS have elapsed since the first attempt was due.
deadline=$(($(date +%s) + TIMEOUT_SECONDS))
while (($(date +%s) < deadline)); do
  synchronize_repos && exit 0
  sleep "$RETRY_INTERVAL"
done

fail "Timed out waiting for distro repos to be set up"