flatcar-scripts/ci-automation/test.sh

#!/bin/bash
#
# Copyright (c) 2021 The Flatcar Maintainers.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# >>> This file is supposed to be SOURCED from the repository ROOT. <<<
#
# test_run() should be called w/ the positional INPUT parameters below.

# Test scenarios runner stub.
#   This script will run test scenarios for a single image type.
#   Tests will be started inside the SDK container.
#   This script is generic and will use a vendor-specific test runner from
#    "ci-automation/vendor-testing/<image>.sh".
#
# PREREQUISITES:
#
#   1. SDK version and OS image version are recorded in sdk_container/.repo/manifests/version.txt
#   2. Scripts repo version tag of OS image version to be built is available and checked out.
#   3. SDK container is either
#       - available via ghcr.io/flatcar-linux/flatcar-sdk-[ARCH]:[VERSION] (official SDK release)
#       OR
#       - available via build cache server "/containers/[VERSION]/flatcar-sdk-[ARCH]-[VERSION].tar.gz"
#         (dev SDK)
#   4. Vendor image and torcx docker tarball + manifest to run tests for are available on buildcache
#         ( images/[ARCH]/[FLATCAR_VERSION]/ )
#
# INPUT:
#
#   1. Architecture (ARCH) of the TARGET vm images ("arm64", "amd64").
#   2. Image type to be tested. One of:
#      ami, azure, azure_pro, digitalocean, gce, gce_pro, packet, qemu, qemu_uefi, vmware
#
# OPTIONAL INPUT:
#
#   3. List of tests / test patterns. Defaults to "*" (all tests).
#      All positional arguments after the first 2 (see above) are tests / patterns of tests to run.
#
#   MAX_RETRIES. Environment variable. Number of re-runs to overcome transient failures. Defaults to 20.
#
# OUTPUT:
#
#   1. 2 merged TAP reports with all test runs / vendors.
#        - a "summary" report which contains error messages only for tests which never succeeded (per vendor).
#        - a "detailed" report which also contains error messages of transient failures which succeeded after re-runs.
#        These reports will be updated after each (re-)run of each vendor, making the test job safe
#          to abort at any point - the previous runs' results won't be lost.
#   2. All intermediate kola tap reports, kola debug output, and merged tap reports (from 1.) published
#        to buildcache at testing/[VERSION]/[ARCH]/[IMAGE]
#   3. "./ci-cleanup.sh" with commands to clean up temporary build resources,
#        to be run after this step finishes / when this step is aborted.
#
#
# LOW-LEVEL / VENDOR SPECIFIC scripts API
#
# Vendor scripts are provided with their own sub-directory and are expected to CD into there before
#  creating any artifacts (see vendor script argument 1 below).
# The torcx manifest is supplied in
#   ../torcx_manifest.json
# relative to the vendor sub-directory. The manifest is updated to include a URL pointing to the docker
#  torcx tarball on the build cache (for the docker.torcx-manifest-pkgs test).
#
# Vendor specific scripts are called with the following positional arguments:
# 1 - working directory for the tests.
#     The vendor script is expected to keep all artifacts it produces in that directory.
# 2 - Architecture to test.
# 3 - version number to test.
# 4 - output TAP file.
# All following arguments specify test cases / test case patterns to run.

# Strict mode: abort on command failures (-e), on use of unset variables (-u),
#  and on a failure in any stage of a pipeline (pipefail).
# NOTE: this file is meant to be sourced, so these options are set in the
#  sourcing shell, too.
set -euo pipefail

# Download the torcx manifest from the build cache and add the build cache URL
#  of the docker torcx package to it, so the docker.torcx-manifest-pkgs test
#  can use it.
#
# Arguments:
#   1 - architecture of the images ("images/<arch>/..." on the build cache).
#   2 - version number of the images.
#   3 - working directory. The updated manifest is stored there as
#        "torcx_manifest.json", the unmodified download is kept as
#        "torcx_manifest.json.original".
#
# Globals:
#   BUILDCACHE_SERVER (read) - host name used to build the package URL.
#
# Requires copy_from_buildcache() to be defined by the sourcing environment
#  and "jq" to be available.
function __prepare_torcx() {
    local arch="$1"
    local vernum="$2"
    local workdir="$3"

    local manifest="${workdir}/torcx_manifest.json"
    local manifest_new="${workdir}/torcx_manifest_new.json"

    copy_from_buildcache "images/${arch}/${vernum}/torcx/torcx_manifest.json" "${workdir}"

    # Extract the package path in a separate step (instead of nesting it into
    #  the basename call) so a jq failure is not hidden behind basename's
    #  exit status.
    local docker_path
    docker_path="$(jq -r '.value.packages[0].versions[0].locations[0].path' \
                        "${manifest}")"

    local docker_pkg
    docker_pkg="$(basename -- "${docker_path}")"

    # Add docker package URL on build cache to manifest.
    # The URL is handed to jq as a variable (--arg) instead of being pasted
    #  into the jq program text, so special characters cannot break the filter.
    local docker_url="http://${BUILDCACHE_SERVER}/images/${arch}/${vernum}/torcx/${docker_pkg}"
    jq --arg url "${docker_url}" \
        '.value.packages[0].versions[0].locations += [{"url" : $url}]' \
        "${manifest}" \
        > "${manifest_new}"

    # Keep the pristine download around, then put the updated manifest in place.
    mv "${manifest}" "${manifest}.original"
    mv "${manifest_new}" "${manifest}"
}
# --

# Run the test scenarios of a single image type inside the SDK container,
#  re-run failed tests until all tests pass or the maximum number of runs is
#  reached, and publish the results to the build cache.
#
# Arguments:
#   1   - architecture of the target VM images.
#   2   - image type to test; selects "ci-automation/vendor-testing/<image>.sh".
#   3.. - tests / test patterns to run (optional; defaults to '*').
#         Each positional argument is handed to the vendor script as one word.
#
# Environment:
#   MAX_RETRIES - maximum number of test runs (optional; defaults to 20).
#
# Must be called from the repository root: helper libraries, version.txt,
#  "run_sdk_container" and "ci-cleanup.sh" are all referenced relative to the
#  current directory.
function test_run() {
    local arch="$1" ; shift
    local image="$1"; shift

    # default to all tests
    if [ $# -le 0 ] ; then
        set -- '*'
    fi

    local retries="${MAX_RETRIES:-20}"

    source ci-automation/tapfile_helper_lib.sh
    source ci-automation/ci_automation_common.sh
    init_submodules

    source sdk_container/.repo/manifests/version.txt
    local vernum="${FLATCAR_VERSION}"
    local docker_vernum
    docker_vernum="$(vernum_to_docker_image_version "${vernum}")"

    # Get SDK from either the registry or import from build cache.
    # Declarations and command substitutions are kept separate so a failing
    #  helper is not masked by the exit status of "local".
    local sdk_version="${FLATCAR_SDK_VERSION}"
    local sdk_name="flatcar-sdk-${arch}"
    local docker_sdk_vernum
    docker_sdk_vernum="$(vernum_to_docker_image_version "${sdk_version}")"

    docker_image_from_registry_or_buildcache "${sdk_name}" "${docker_sdk_vernum}"
    local sdk_image
    sdk_image="$(docker_image_fullname "${sdk_name}" "${docker_sdk_vernum}")"
    # Register removal of the SDK image with the clean-up script.
    echo "docker image rm -f '${sdk_image}'" >> ./ci-cleanup.sh

    local work_dir="__TESTS__"
    local tests_dir="${work_dir}/${image}"
    mkdir -p "${tests_dir}"

    local container_name="flatcar-tests-${arch}-${docker_vernum}-${image}"

    # Make the torcx artifacts available to test implementation
    __prepare_torcx "${arch}" "${vernum}" "${work_dir}"

    local tap_merged_summary="results-${image}.tap"
    local tap_merged_detailed="results-${image}-detailed.tap"
    local retry=""
    local success=false
    for retry in $(seq "${retries}"); do
        local tapfile="results-run-${retry}.tap"
        local failfile="failed-run-${retry}.txt"

        # Ignore retcode since tests are flaky. We'll re-run failed tests and
        #  determine success based on test results (tapfile).
        # "$@" is quoted so every test pattern reaches the vendor script as
        #  exactly one, unexpanded word.
        set +e
        ./run_sdk_container -x ./ci-cleanup.sh \
            -n "${container_name}" -C "${sdk_image}" -v "${vernum}" \
            ci-automation/vendor-testing/"${image}".sh \
                "${tests_dir}" \
                "${arch}" \
                "${vernum}" \
                "${tapfile}" \
                "$@"
        set -e

        # Process the results of this run. The list of failed tests is read
        #  from the fail file below; the merged reports are passed by name.
        ./run_sdk_container -x ./ci-cleanup.sh \
            -n "${container_name}" -C "${sdk_image}" -v "${vernum}" \
            ci-automation/test_update_reruns.sh \
                "${arch}" "${vernum}" "${image}" "${retry}" \
                "${tests_dir}/${tapfile}" \
                "${tests_dir}/${failfile}" \
                "${tap_merged_summary}" \
                "${tap_merged_detailed}"

        local failed_tests
        failed_tests="$(cat "${tests_dir}/${failfile}")"
        if [ -z "${failed_tests}" ] ; then
            echo "########### All tests succeeded. ###########"
            success=true
            break
        fi

        echo "########### Some tests failed and will be re-run (${retry} / ${retries}). ###########"
        echo "Failed tests: $failed_tests"
        echo "-----------"

        # Only the failed tests are run next time. Word splitting is intended
        #  here (one test per word); pathname expansion is not, so globbing is
        #  switched off while the list is split.
        set -o noglob
        set -- ${failed_tests}
        set +o noglob
    done


    if [ "${success}" != true ]; then
        echo "########### All re-runs exhausted ($retries). Giving up. ###########"
    fi

    # publish kola output, TAP files to build cache
    copy_to_buildcache "testing/${vernum}/${arch}/${image}" \
        "${tests_dir}/_kola_temp"
    copy_to_buildcache "testing/${vernum}/${arch}/${image}" \
        "${tests_dir}/"*.tap
    copy_to_buildcache "testing/${vernum}/${arch}/${image}" \
        "${tap_merged_summary}"
    copy_to_buildcache "testing/${vernum}/${arch}/${image}" \
        "${tap_merged_detailed}"
}
# --