From f6f44e2ca8b5cbf2b6e2026efde421d8c76b8aa6 Mon Sep 17 00:00:00 2001
From: Thilo Fromm <thilo@kinvolk.io>
Date: Wed, 16 Feb 2022 19:17:42 +0100
Subject: [PATCH 01/12] ci-automation: first stab at adding testing

Signed-off-by: Thilo Fromm <thilo@kinvolk.io>
---
 ci-automation/ci-config.env          |   4 +
 ci-automation/tapfile_helper_lib.sh  | 281 +++++++++++++++++++++++++++
 ci-automation/test.sh                | 122 ++++++++++++
 ci-automation/test_update_reruns.sh  |  20 ++
 ci-automation/vendor-testing/qemu.sh |  38 ++++
 5 files changed, 465 insertions(+)
 create mode 100644 ci-automation/tapfile_helper_lib.sh
 create mode 100644 ci-automation/test.sh
 create mode 100755 ci-automation/test_update_reruns.sh
 create mode 100644 ci-automation/vendor-testing/qemu.sh
diff --git a/ci-automation/ci-config.env b/ci-automation/ci-config.env
index 40f6a6a959..3e4f347802 100644
--- a/ci-automation/ci-config.env
+++ b/ci-automation/ci-config.env
@@ -22,3 +22,7 @@ CI_GIT_EMAIL="infra+ci@flatcar-linux.org"
 # build artifacts go here (in container)
 CONTAINER_TORCX_ROOT="/home/sdk/build/torcx"
 CONTAINER_IMAGE_ROOT="/home/sdk/build/images"
+
+# Image / vendor tests settings
+QEMU_IMAGE_NAME="flatcar_production_image.bin"
+QEMU_PARALLEL=4
diff --git a/ci-automation/tapfile_helper_lib.sh b/ci-automation/tapfile_helper_lib.sh
new file mode 100644
index 0000000000..0cd7f3efa4
--- /dev/null
+++ b/ci-automation/tapfile_helper_lib.sh
@@ -0,0 +1,281 @@
+#!/bin/bash
+#
+# Copyright (c) 2021 The Flatcar Maintainers.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# Helper script for extracting information from TAP files and for merging multiple
+#  TAP files into one report.
+# The script uses a temporary SQLite DB for querying and for result generation.
+#
+# Brief usage overview (scroll down for parameters etc.):
+#   tap_ingest_tapfile - add test results from tap file to the DB 
+#   tap_list_vendors   - list all vendors TAP files have been ingested for
+#   tap_failed_tests_for_vendor - list all tests that never succeeded even once, per vendor
+#   tap_generate_report - generate a merged test report
+
+
+TAPFILE_HELPER_DBNAME="results.sqlite3"
+
+# wrapper around sqlite3 w/ retries if DB is locked
+function __sqlite3_wrapper() {
+    local dbfile="$1"
+    shift
+
+    while true; do
+        sqlite3 "${dbfile}" "$@"
+        if [ $? -ne 5 ] ; then
+            return $?
+        fi
+        local sleep="$((1 + $RANDOM % 5))"
+        echo "Retrying in ${sleep} seconds." >&2
+        sleep "${sleep}"
+    done
+}
+# --
+
+# Initialise the DB if it wasn't yet.
+function __db_init() {
+    local dbname="${TAPFILE_HELPER_DBNAME}"
+
+    __sqlite3_wrapper "${dbname}" '
+    CREATE TABLE IF NOT EXISTS "test_case" (
+        "id"    INTEGER,
+        "name"  TEXT UNIQUE,
+        PRIMARY KEY("id")
+    );
+    CREATE TABLE IF NOT EXISTS "vendor" (
+        "id"    INTEGER,
+        "name"  TEXT UNIQUE,
+        PRIMARY KEY("id")
+    );
+    CREATE TABLE IF NOT EXISTS "test_run" (
+        "id"        INTEGER NOT NULL,
+        "result"    INTEGER NOT NULL,
+        "output"    TEXT,
+        "case_id"   INTEGER NOT NULL,
+        "run"       INTEGER NOT NULL,
+        "vendor_id" INTEGER,
+        PRIMARY KEY("id"),
+        FOREIGN KEY("case_id") REFERENCES "test_case"("id"),
+        FOREIGN KEY("vendor_id") REFERENCES "vendor"("id"),
+        UNIQUE (case_id, run, vendor_id)
+    );
+'
+}
+# --
+
+# Read tapfile into temporary DB.
+# INPUT:
+# 1: <tapfile> - tapfile to ingest
+# 2: <vendor>  - vendor (qemu, azure, aws, etc...)
+# 3: <run>     - re-run iteration
+
+function tap_ingest_tapfile() {
+    local tapfile="${1}"
+    local vendor="${2}"
+    local run="${3}"
+
+    local dbname="${TAPFILE_HELPER_DBNAME}"
+
+    local result=""
+    local test_name=""
+    local error_message=""
+    local in_error_message=false
+
+    if ! [ -f "${TAPFILE_HELPER_DBNAME}" ] ; then
+       __db_init
+    fi
+
+    # Wrap all SQL commands in a transaction to speed up INSERTs
+    local SQL="BEGIN TRANSACTION;"
+
+    # Example TAP input:
+    # ok - coreos.auth.verify
+    # ok - coreos.locksmith.tls
+    # not ok - cl.filesystem
+    #   ---
+    #   Error: "--- FAIL: cl.filesystem/deadlinks (1.86s)\n            files.go:90: Dead symbolic links found: [/var/lib/flatcar-oem-gce/usr/lib64/python3.9/site-packages/certifi-3021.3.16-py3.9.egg-info]"
+    #   ...
+    # ok - cl.cloudinit.script
+    # ok - kubeadm.v1.22.0.flannel.base
+    while read -r line; do
+        if [[ "${line}" == "1.."* ]] ; then continue; fi
+        if [ "${line}" = "---" ] ; then  # note: read removes leading whitespaces
+            in_error_message=true
+            continue
+        fi
+
+        if $in_error_message ; then
+            if [ "${line}" = "..." ] ; then
+                in_error_message=false
+            else
+                error_message="$(echo -e "$line" \
+                                    | sed -e 's/^Error: "--- FAIL: /"/' -e 's/^[[:space:]]*//' \
+                                    | sed -e "s/[>\"']/_/g" -e 's/[[:space:]]/ /g')"
+                continue
+            fi
+        else
+            test_name="$(echo "${line}" | sed 's/^[^-]* - //')"
+            local result_string="$(echo "${line}" | sed 's/ - .*//')"
+            result=0
+            if [ "${result_string}" = "ok" ] ; then
+                result=1
+            fi
+        fi
+
+        SQL="${SQL}INSERT OR IGNORE INTO test_case(name) VALUES ('${test_name}');"
+        SQL="${SQL}INSERT OR IGNORE INTO vendor(name) VALUES ('${vendor}');"
+
+        SQL="${SQL}INSERT OR REPLACE INTO test_run(run,result,output,case_id,vendor_id)
+                             VALUES ('${run}','${result}', '${error_message}',
+                                     (SELECT id FROM test_case WHERE name='${test_name}'),
+                                     (SELECT id FROM vendor WHERE name='${vendor}'));"
+        error_message=""
+    done < "$tapfile"
+
+    local SQL="${SQL}COMMIT;"
+
+    __sqlite3_wrapper "${dbname}" "${SQL}"
+}
+# --
+
+# Print a list of all vendors we've seen so far.
+function tap_list_vendors() {
+    local dbname="${TAPFILE_HELPER_DBNAME}"
+
+    __sqlite3_wrapper "${dbname}" 'SELECT DISTINCT name from vendor;'
+}
+# --
+
+# List tests that never succeeded for a given vendor.
+# INPUT:
+# 1: <vendor> - Vendor name to check for failed test runs
+function tap_failed_tests_for_vendor() {
+    local vendor="$1"
+
+    local dbname="${TAPFILE_HELPER_DBNAME}"
+
+    __sqlite3_wrapper "${dbname}" "
+		SELECT failed.name FROM test_case AS failed
+		WHERE EXISTS (
+				SELECT * FROM test_run AS t, vendor AS v, test_case AS c
+				WHERE t.vendor_id=v.id AND t.case_id=c.id               
+					AND v.name='${vendor}'
+					AND c.name=failed.name
+			)
+			AND NOT exists (
+				SELECT * FROM test_run AS t, vendor AS v, test_case AS c
+				WHERE t.vendor_id=v.id AND t.case_id=c.id               
+					AND v.name='${vendor}'
+					AND c.name=failed.name
+					AND t.result=1 );"
+}
+# --
+
+# Print the tap file from contents of the database.
+# INPUT:
+# 1: <arch>    - Architecture to be included in the first line of the report
+# 2: <version> - OS version tested, to be included in the first line of the report
+# 3: <include_transient_errors> - If set to "true" then debug output of transient test failures
+#                   is included in the result report.
+function tap_generate_report() {
+    local arch="$1"
+    local version="$2"
+    local full_error_report="${3:-false}"
+
+    local dbname="${TAPFILE_HELPER_DBNAME}"
+
+    local count="$(__sqlite3_wrapper "${dbname}" 'SELECT count(name) FROM test_case;')"
+    local vendors="$(__sqlite3_wrapper "${dbname}" 'SELECT name FROM vendor;' | tr '\n' ' ')"
+
+    echo "1..$((count+1))"
+    echo "ok - Version: ${version}, Architecture: ${arch}" 
+    echo "   ---"
+    echo "   Platforms tested: ${vendors}"
+    echo "   ..."
+
+    # Print result line for every test, including platforms it succeeded on
+    #  and transient failed runs.
+    __sqlite3_wrapper "${dbname}" 'SELECT DISTINCT name from test_case;' | \
+    while read -r test_name; do
+
+        # "ok" if the test succeeded at least once for all vendors that run the test,
+        #   "not ok" otherwise.
+        local verdict="$(__sqlite3_wrapper "${dbname}" "
+        SELECT failed.name FROM vendor AS failed
+        WHERE EXISTS (
+                SELECT * FROM test_run AS t, vendor AS v, test_case AS c
+                WHERE t.vendor_id=v.id AND t.case_id=c.id
+                    AND v.name=failed.name
+                    AND c.name='${test_name}'
+            )
+            AND NOT exists (
+                SELECT * FROM test_run AS t, vendor AS v, test_case AS c
+                WHERE t.vendor_id=v.id AND t.case_id=c.id
+                    AND v.name=failed.name
+                    AND c.name='${test_name}'
+                    AND t.result=1 );
+        ")"
+        if [ -n "${verdict}" ] ; then
+            verdict="not ok"
+        else
+            verdict="ok"
+        fi
+
+        # Generate a list of vendors and respective runs, in a single line.
+        function list_runs() {
+            local res="$1"
+            __sqlite3_wrapper -csv "${dbname}" "
+                SELECT v.name, t.run FROM test_run AS t, vendor AS v, test_case AS c
+                WHERE t.vendor_id=v.id AND t.case_id=c.id
+                    AND c.name='${test_name}'
+                    AND t.result=${res}
+                    ORDER BY v.name;" \
+                | awk -F, '{ if (t && (t != $1)) {
+                                printf t " " r "); "
+                                r="";}
+                             t=$1
+                             if (r)
+                                r=r ", " $2
+                             else
+                                r="(" $2 ; }
+                            END { if (t) print t r ")"; }'
+        }
+
+        local succeded="$(list_runs 1)"
+        local failed="$(list_runs 0)"
+
+        echo "${verdict} - ${test_name}"
+        echo "   ---"
+        if [ -n "${succeded}" ] ; then
+            echo "   Succeeded: ${succeded}"
+        fi
+        if [ -n "${failed}" ] ; then
+            echo "   Failed: ${failed}"
+            if [ "${verdict}" = "not ok" -o "${full_error_report}" = "true" ] ; then
+                # generate diagnostic output, per failed run.
+                __sqlite3_wrapper -csv "${dbname}" "
+                SELECT v.name, t.run
+                    FROM test_run AS t, vendor AS v, test_case AS c
+                    WHERE t.vendor_id=v.id AND t.case_id=c.id
+                    AND c.name='${test_name}'
+                    AND t.result=0
+                    ORDER BY t.run DESC;" | \
+                sed 's/,/ /' | \
+                while read -r vendor run; do
+                    echo "   Error messages for ${vendor}, run ${run}:"
+                    __sqlite3_wrapper -csv "${dbname}" "
+                    SELECT t.output FROM test_run AS t, test_case AS c
+                        WHERE t.case_id=c.id
+                        AND c.name='${test_name}'
+                        AND t.run='${run}';" | \
+                    sed 's/"/ /' | \
+                    awk '{print "      LINE " NR":" $0}'
+                done
+            fi
+        fi
+        echo "   ..."
+    done
+}
+# --
diff --git a/ci-automation/test.sh b/ci-automation/test.sh
new file mode 100644
index 0000000000..95197dc736
--- /dev/null
+++ b/ci-automation/test.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+#
+# Copyright (c) 2021 The Flatcar Maintainers.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# >>> This file is supposed to be SOURCED from the repository ROOT. <<<
+#
+# test_run() should be called w/ the positional INPUT parameters below.
+
+# Test scenarios runner stub.
+#   This script will run test scenarios for a single image type.
+#   Tests will be started inside a container based on the packages container image
+#    (which contains the torcx manifest).
+#   This script is generic and will use a vendor-specific test runner from
+#    "ci-automation/vendor-testing/<image>.sh".
+#
+# PREREQUISITES:
+#
+#   1. SDK version and OS image version are recorded in sdk_container/.repo/manifests/version.txt
+#   2. Scripts repo version tag of OS image version to be built is available and checked out.
+#   3. Flatcar packages container is available via build cache server
+#       from "/containers/[VERSION]/flatcar-packages-[ARCH]-[FLATCAR_VERSION].tar.gz"
+#       or present locally. Container must contain binary packages and torcx artefacts.
+#   4. Vendor image(s) to run tests for are available on buildcache ( images/[ARCH]/[FLATCAR_VERSION]/ )
+#
+# INPUT:
+#
+#   1. Architecture (ARCH) of the TARGET vm images ("arm64", "amd64").
+#   2. Image type to be tested. One of:
+#      ami, azure, azure_pro, digitalocean, gce, gce_pro, packet, qemu, qemu_uefi, vmware
+#
+# OPTIONAL INPUT:
+#
+#   3. List of tests / test patterns. Defaults to "*" (all tests).
+#      All positional arguments after the first 2 (see above) are tests / patterns of tests to run.
+#
+#   MAX_RETRIES. Environment variable. Number of re-runs to overcome transient failures. Defaults to 999.
+#
+# OUTPUT:
+#
+#   1. 2 merged TAP reports with all test runs / vendors.
+#        - a "summary" report which contains error messages only for tests which never succeeded (per vendor).
+#        - a "detailed" report which also contains error messages of transient failures which succeeded after re-runs.
+#        These reports will be updated after each (re-)run of each vendor, making the test job safe
+#          to abort at any point - the previous runs' results won't be lost.
+#   2. "./ci-cleanup.sh" with commands to clean up temporary build resources,
+#        to be run after this step finishes / when this step is aborted.
+
+set -eu
+
+function test_run() {
+    local arch="$1" ; shift
+    local image="$2"; shift
+
+    # default to all tests
+    if [ $# -le 0 ] ; then
+        set -- *
+    fi
+
+    source ci-automation/tapfile_helper_lib.sh
+    source ci-automation/ci_automation_common.sh
+    init_submodules
+
+    source sdk_container/.repo/manifests/version.txt
+    local vernum="${FLATCAR_VERSION}"
+    local docker_vernum="$(vernum_to_docker_image_version "${vernum}")"
+
+    local packages="flatcar-packages-${arch}"
+    local packages_image="${packages}:${docker_vernum}"
+
+    docker_image_from_buildcache "${packages}" "${docker_vernum}"
+
+    local tests_dir="__TESTS__/${image}"
+    mkdir -p "${tests_dir}"
+
+    local container_name="flatcar-tests-${arch}-${docker_vernum}-${image}"
+
+    local retry=""
+    local success=false
+    for retry in $(seq "${retries}"); do
+        local tapfile="results-run-${retry}.tap"
+        local failfile="failed-run-${retry}."
+
+        set -o noglob
+        ./run_sdk_container -n "${container_name}" -C "${packages_image}" -v "${vernum}" \
+            ci-automation/vendor/testing/"${image}".sh \
+                "${tests_dir}" \
+                "${arch}" \
+                "${vernum}" \
+                "${tapfile}" \
+                $@
+        set +o noglob
+
+        ./run_sdk_container -n "${container_name}" -C "${packages_image}" -v "${vernum}" \
+            ci-automation/test_update_reruns.sh \
+                "${tests_dir}/${tapfile}" "${image}" "${retry}" \
+                "${tests_dir}/failed-run-${retry}.txt"
+
+        local failed_tests="$(cat "${tests_dir}/failed-run-${retry}.txt")"
+        if [ -z "$failed_tests" ] ; then
+            echo "########### All tests succeeded. ###########"
+            success=true
+            break
+        fi
+
+        echo "########### Some tests failed and will be re-run. ###########"
+        echo "Failed tests: $failed_tests"
+        echo "-----------"
+        set -- $failed_tests
+    done
+
+    if ! $success; then
+        echo "########### All re-runs exhausted ($retries). Giving up. ###########"
+    fi
+
+    # TODO: publish to bincache?
+    # "${tests_dir}/"*.tap
+    # "${tests_dir}/_kola_temp.tar.xz"
+
+}
+# --
diff --git a/ci-automation/test_update_reruns.sh b/ci-automation/test_update_reruns.sh
new file mode 100755
index 0000000000..e559158f31
--- /dev/null
+++ b/ci-automation/test_update_reruns.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright (c) 2021 The Flatcar Maintainers.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# Helper script for test.sh to update the test failures text file.
+# test.sh uses this to determine which tests need to re-run.
+# This script is run within the SDK container.
+
+set -eu
+
+tapfile="$1"
+image="$2"
+retry="$3"
+outfile="$4"
+
+source ci-automation/tapfile_helper_lib.sh
+tap_ingest_tapfile "${tapfile}" "${image}" "${retry}"
+tap_failed_tests_for_vendor "${image}" | tee "${outfile}"
diff --git a/ci-automation/vendor-testing/qemu.sh b/ci-automation/vendor-testing/qemu.sh
new file mode 100644
index 0000000000..3f2dc95cff
--- /dev/null
+++ b/ci-automation/vendor-testing/qemu.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+#
+# Copyright (c) 2021 The Flatcar Maintainers.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# Test execution script for the qemu vendor image.
+# This script is supposed to run in the SDK container.
+
+function run_testsuite() {
+    local work_dir="$1"; shift
+    local arch="$2"; shift
+    local vernum="$3"; shift
+    local tapfile="$4"; shift
+
+    # $@ now contains tests / test patterns to run
+
+    source ci-automation/ci_automation_common.sh
+
+    mkdir -p "${work_dir}"
+    cd "${work_dir}"
+
+    copy_from_buildcache "images/${arch}/${vernum}/${QEMU_IMAGE_NAME}" .
+
+    set -o noglob
+
+    sudo kola run
+        --board="${arch}-usr" \
+        --parallel="${QEMU_PARALLEL}" \
+        --platform=qemu \
+        --qemu-bios=/usr/share/qemu/bios-256k.bin \
+        --qemu-image="${QEMU_IMAGE_NAME}" \
+        --tapfile="${tapfile}" \
+        --torcx-manifest="${CONTAINER_TORCX_ROOT}/${arch}-usr/latest/torcx_manifest.json"
+        $@
+
+    set +o noglob
+}

From 5bfe2f395c5c5c60289430c0935cad7923460beb Mon Sep 17 00:00:00 2001
From: Thilo Fromm <thilo@kinvolk.io>
Date: Thu, 17 Feb 2022 10:29:05 +0100
Subject: [PATCH 02/12] Apply @pothos' suggestions from code review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Kai Lüke <pothos@users.noreply.github.com>
---
 ci-automation/tapfile_helper_lib.sh  | 15 ++++++++++-----
 ci-automation/test.sh                |  8 +++++---
 ci-automation/test_update_reruns.sh  |  2 +-
 ci-automation/vendor-testing/qemu.sh |  2 +-
 4 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/ci-automation/tapfile_helper_lib.sh b/ci-automation/tapfile_helper_lib.sh
index 0cd7f3efa4..35fc12be17 100644
--- a/ci-automation/tapfile_helper_lib.sh
+++ b/ci-automation/tapfile_helper_lib.sh
@@ -117,7 +117,8 @@ function tap_ingest_tapfile() {
             fi
         else
             test_name="$(echo "${line}" | sed 's/^[^-]* - //')"
-            local result_string="$(echo "${line}" | sed 's/ - .*//')"
+            local result_string
+            result_string="$(echo "${line}" | sed 's/ - .*//')"
             result=0
             if [ "${result_string}" = "ok" ] ; then
                 result=1
@@ -186,8 +187,10 @@ function tap_generate_report() {
 
     local dbname="${TAPFILE_HELPER_DBNAME}"
 
-    local count="$(__sqlite3_wrapper "${dbname}" 'SELECT count(name) FROM test_case;')"
-    local vendors="$(__sqlite3_wrapper "${dbname}" 'SELECT name FROM vendor;' | tr '\n' ' ')"
+    local count
+    count="$(__sqlite3_wrapper "${dbname}" 'SELECT count(name) FROM test_case;')"
+    local vendors
+    vendors="$(__sqlite3_wrapper "${dbname}" 'SELECT name FROM vendor;' | tr '\n' ' ')"
 
     echo "1..$((count+1))"
     echo "ok - Version: ${version}, Architecture: ${arch}" 
@@ -243,8 +246,10 @@ function tap_generate_report() {
                             END { if (t) print t r ")"; }'
         }
 
-        local succeded="$(list_runs 1)"
-        local failed="$(list_runs 0)"
+        local succeded
+        succeded="$(list_runs 1)"
+        local failed
+        failed="$(list_runs 0)"
 
         echo "${verdict} - ${test_name}"
         echo "   ---"
diff --git a/ci-automation/test.sh b/ci-automation/test.sh
index 95197dc736..84e06860ba 100644
--- a/ci-automation/test.sh
+++ b/ci-automation/test.sh
@@ -47,7 +47,7 @@
 #   2. "./ci-cleanup.sh" with commands to clean up temporary build resources,
 #        to be run after this step finishes / when this step is aborted.
 
-set -eu
+set -euo pipefail
 
 function test_run() {
     local arch="$1" ; shift
@@ -64,7 +64,8 @@ function test_run() {
 
     source sdk_container/.repo/manifests/version.txt
     local vernum="${FLATCAR_VERSION}"
-    local docker_vernum="$(vernum_to_docker_image_version "${vernum}")"
+    local docker_vernum
+   docker_vernum="$(vernum_to_docker_image_version "${vernum}")"
 
     local packages="flatcar-packages-${arch}"
     local packages_image="${packages}:${docker_vernum}"
@@ -97,7 +98,8 @@ function test_run() {
                 "${tests_dir}/${tapfile}" "${image}" "${retry}" \
                 "${tests_dir}/failed-run-${retry}.txt"
 
-        local failed_tests="$(cat "${tests_dir}/failed-run-${retry}.txt")"
+        local failed_tests
+        failed_tests="$(cat "${tests_dir}/failed-run-${retry}.txt")"
         if [ -z "$failed_tests" ] ; then
             echo "########### All tests succeeded. ###########"
             success=true
diff --git a/ci-automation/test_update_reruns.sh b/ci-automation/test_update_reruns.sh
index e559158f31..8a3eb078a7 100755
--- a/ci-automation/test_update_reruns.sh
+++ b/ci-automation/test_update_reruns.sh
@@ -8,7 +8,7 @@
 # test.sh uses this to determine which tests need to re-run.
 # This script is run within the SDK container.
 
-set -eu
+set -euo pipefail
 
 tapfile="$1"
 image="$2"
diff --git a/ci-automation/vendor-testing/qemu.sh b/ci-automation/vendor-testing/qemu.sh
index 3f2dc95cff..852e386899 100644
--- a/ci-automation/vendor-testing/qemu.sh
+++ b/ci-automation/vendor-testing/qemu.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-#
+set -euo pipefail
 # Copyright (c) 2021 The Flatcar Maintainers.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

From 6c76bfa1cd8ad6d7816d26ccf372c4b8d808458b Mon Sep 17 00:00:00 2001
From: Thilo Fromm <thilo@kinvolk.io>
Date: Thu, 17 Feb 2022 10:45:10 +0100
Subject: [PATCH 03/12] ci-automation/tapfile_helper_lib.sh: add @pothos'
 retcode fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Kai Lüke <pothos@users.noreply.github.com>
---
 ci-automation/tapfile_helper_lib.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ci-automation/tapfile_helper_lib.sh b/ci-automation/tapfile_helper_lib.sh
index 35fc12be17..7503550cca 100644
--- a/ci-automation/tapfile_helper_lib.sh
+++ b/ci-automation/tapfile_helper_lib.sh
@@ -205,7 +205,8 @@ function tap_generate_report() {
 
         # "ok" if the test succeeded at least once for all vendors that run the test,
         #   "not ok" otherwise.
-        local verdict="$(__sqlite3_wrapper "${dbname}" "
+        local verdict
+        verdict="$(__sqlite3_wrapper "${dbname}" "
         SELECT failed.name FROM vendor AS failed
         WHERE EXISTS (
                 SELECT * FROM test_run AS t, vendor AS v, test_case AS c

From 3a416fbf32699a3e6720a2365b45b6c7088c3e46 Mon Sep 17 00:00:00 2001
From: Thilo Fromm <thilo@kinvolk.io>
Date: Thu, 17 Feb 2022 10:48:49 +0100
Subject: [PATCH 04/12] ci-automation testing: address PR review comments

- add cleanup script to test.sh
- remove wrapper function from qemu test

Signed-off-by: Thilo Fromm <thilo@kinvolk.io>
---
 ci-automation/ci_automation_common.sh |  8 +++--
 ci-automation/tapfile_helper_lib.sh   |  5 +--
 ci-automation/test.sh                 | 17 ++++++----
 ci-automation/vendor-testing/qemu.sh  | 46 +++++++++++++--------------
 4 files changed, 43 insertions(+), 33 deletions(-)
 mode change 100644 => 100755 ci-automation/vendor-testing/qemu.sh

diff --git a/ci-automation/ci_automation_common.sh b/ci-automation/ci_automation_common.sh
index 9818f147f1..871f8cfbfe 100644
--- a/ci-automation/ci_automation_common.sh
+++ b/ci-automation/ci_automation_common.sh
@@ -10,8 +10,12 @@ source ci-automation/ci-config.env
 : ${PIGZ:=pigz}
 
 # set up author and email so git does not complain when tagging
-git -C . config user.name "${CI_GIT_AUTHOR}"  
-git -C . config user.email "${CI_GIT_EMAIL}"
+if ! git config --get user.name ; then
+    git -C . config user.name "${CI_GIT_AUTHOR}"
+fi
+if ! git config --get user.email ; then
+    git -C . config user.email "${CI_GIT_EMAIL}"
+fi
 
 function init_submodules() {
     git submodule init
diff --git a/ci-automation/tapfile_helper_lib.sh b/ci-automation/tapfile_helper_lib.sh
index 7503550cca..bf121937bf 100644
--- a/ci-automation/tapfile_helper_lib.sh
+++ b/ci-automation/tapfile_helper_lib.sh
@@ -24,8 +24,9 @@ function __sqlite3_wrapper() {
 
     while true; do
         sqlite3 "${dbfile}" "$@"
-        if [ $? -ne 5 ] ; then
-            return $?
+        local ret="$?"
+        if [ "$ret" -ne 5 ] ; then
+            return $ret
         fi
         local sleep="$((1 + $RANDOM % 5))"
         echo "Retrying in ${sleep} seconds." >&2
diff --git a/ci-automation/test.sh b/ci-automation/test.sh
index 84e06860ba..9fbc760fc8 100644
--- a/ci-automation/test.sh
+++ b/ci-automation/test.sh
@@ -51,13 +51,15 @@ set -euo pipefail
 
 function test_run() {
     local arch="$1" ; shift
-    local image="$2"; shift
+    local image="$1"; shift
 
     # default to all tests
     if [ $# -le 0 ] ; then
-        set -- *
+        set -- '*'
     fi
 
+    local retries="${MAX_RETRIES:-999}"
+
     source ci-automation/tapfile_helper_lib.sh
     source ci-automation/ci_automation_common.sh
     init_submodules
@@ -65,7 +67,7 @@ function test_run() {
     source sdk_container/.repo/manifests/version.txt
     local vernum="${FLATCAR_VERSION}"
     local docker_vernum
-   docker_vernum="$(vernum_to_docker_image_version "${vernum}")"
+    docker_vernum="$(vernum_to_docker_image_version "${vernum}")"
 
     local packages="flatcar-packages-${arch}"
     local packages_image="${packages}:${docker_vernum}"
@@ -74,6 +76,7 @@ function test_run() {
 
     local tests_dir="__TESTS__/${image}"
     mkdir -p "${tests_dir}"
+    echo "sudo rm -rf '${tests_dir}'" >> ci-cleanup.sh
 
     local container_name="flatcar-tests-${arch}-${docker_vernum}-${image}"
 
@@ -84,8 +87,9 @@ function test_run() {
         local failfile="failed-run-${retry}."
 
         set -o noglob
-        ./run_sdk_container -n "${container_name}" -C "${packages_image}" -v "${vernum}" \
-            ci-automation/vendor/testing/"${image}".sh \
+        ./run_sdk_container -x ./ci-cleanup.sh \
+            -n "${container_name}" -C "${packages_image}" -v "${vernum}" \
+            ci-automation/vendor-testing/"${image}".sh \
                 "${tests_dir}" \
                 "${arch}" \
                 "${vernum}" \
@@ -93,7 +97,8 @@ function test_run() {
                 $@
         set +o noglob
 
-        ./run_sdk_container -n "${container_name}" -C "${packages_image}" -v "${vernum}" \
+        ./run_sdk_container -x ./ci-cleanup.sh \
+            -n "${container_name}" -C "${packages_image}" -v "${vernum}" \
             ci-automation/test_update_reruns.sh \
                 "${tests_dir}/${tapfile}" "${image}" "${retry}" \
                 "${tests_dir}/failed-run-${retry}.txt"
diff --git a/ci-automation/vendor-testing/qemu.sh b/ci-automation/vendor-testing/qemu.sh
old mode 100644
new mode 100755
index 852e386899..b66daa2064
--- a/ci-automation/vendor-testing/qemu.sh
+++ b/ci-automation/vendor-testing/qemu.sh
@@ -1,38 +1,38 @@
 #!/bin/bash
-set -euo pipefail
 # Copyright (c) 2021 The Flatcar Maintainers.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 
+set -euo pipefail
+
 # Test execution script for the qemu vendor image.
 # This script is supposed to run in the SDK container.
 
-function run_testsuite() {
-    local work_dir="$1"; shift
-    local arch="$2"; shift
-    local vernum="$3"; shift
-    local tapfile="$4"; shift
+work_dir="$1"; shift
+arch="$1"; shift
+vernum="$1"; shift
+tapfile="$1"; shift
 
-    # $@ now contains tests / test patterns to run
+# $@ now contains tests / test patterns to run
 
-    source ci-automation/ci_automation_common.sh
+source ci-automation/ci_automation_common.sh
 
-    mkdir -p "${work_dir}"
-    cd "${work_dir}"
+mkdir -p "${work_dir}"
+cd "${work_dir}"
 
-    copy_from_buildcache "images/${arch}/${vernum}/${QEMU_IMAGE_NAME}" .
+echo "++++ QEMU test: downloading ${QEMU_IMAGE_NAME} for ${vernum} (${arch}) ++++"
+copy_from_buildcache "images/${arch}/${vernum}/${QEMU_IMAGE_NAME}" .
 
-    set -o noglob
+set -o noglob
 
-    sudo kola run
-        --board="${arch}-usr" \
-        --parallel="${QEMU_PARALLEL}" \
-        --platform=qemu \
-        --qemu-bios=/usr/share/qemu/bios-256k.bin \
-        --qemu-image="${QEMU_IMAGE_NAME}" \
-        --tapfile="${tapfile}" \
-        --torcx-manifest="${CONTAINER_TORCX_ROOT}/${arch}-usr/latest/torcx_manifest.json"
-        $@
+sudo kola run \
+    --board="${arch}-usr" \
+    --parallel="${QEMU_PARALLEL}" \
+    --platform=qemu \
+    --qemu-bios=/usr/share/qemu/bios-256k.bin \
+    --qemu-image="${QEMU_IMAGE_NAME}" \
+    --tapfile="${tapfile}" \
+    --torcx-manifest="${CONTAINER_TORCX_ROOT}/${arch}-usr/latest/torcx_manifest.json" \
+    $@
 
-    set +o noglob
-}
+set +o noglob

From 95ef0b73226aa7bc40f44fef5e7fa828b8b0c1c9 Mon Sep 17 00:00:00 2001
From: Thilo Fromm <thilo@kinvolk.io>
Date: Thu, 17 Feb 2022 12:12:08 +0100
Subject: [PATCH 05/12] ci-automation: git author and curl verboseness

- Git author configuration moves into the tagging function and is only
  applied when unset, so as to not pollute people's workspaces.
- curl is now less verbose since it was spamming logs with TLS debug
  information.

Signed-off-by: Thilo Fromm <thilo@kinvolk.io>
---
 ci-automation/ci_automation_common.sh | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/ci-automation/ci_automation_common.sh b/ci-automation/ci_automation_common.sh
index 871f8cfbfe..e98a44ca45 100644
--- a/ci-automation/ci_automation_common.sh
+++ b/ci-automation/ci_automation_common.sh
@@ -9,14 +9,6 @@
 source ci-automation/ci-config.env
 : ${PIGZ:=pigz}
 
-# set up author and email so git does not complain when tagging
-if ! git config --get user.name ; then
-    git -C . config user.name "${CI_GIT_AUTHOR}"
-fi
-if ! git config --get user.email ; then
-    git -C . config user.email "${CI_GIT_EMAIL}"
-fi
-
 function init_submodules() {
     git submodule init
     git submodule update
@@ -31,7 +23,6 @@ function update_submodule() {
     git fetch --all --tags
     git checkout "${commit_ish}"
     cd -
-
 }
 # --
 
@@ -58,6 +49,14 @@ function update_submodules() {
 function update_and_push_version() {
     local version="$1"
 
+    # set up author and email so git does not complain when tagging
+    if ! git config --get user.name >/dev/null 2>&1 ; then
+        git -C . config user.name "${CI_GIT_AUTHOR}"
+    fi
+    if ! git config --get user.email >/dev/null 2>&1 ; then
+        git -C . config user.email "${CI_GIT_EMAIL}"
+    fi
+
     # Add and commit local changes
     git add "sdk_container/src/third_party/coreos-overlay"
     git add "sdk_container/src/third_party/portage-stable"
@@ -102,7 +101,7 @@ function copy_from_buildcache() {
     local where_to="$2"
 
     mkdir -p "$where_to"
-    curl --verbose --fail --silent --show-error --location --retry-delay 1 --retry 60 \
+    curl --fail --silent --show-error --location --retry-delay 1 --retry 60 \
         --retry-connrefused --retry-max-time 60 --connect-timeout 20 \
         --remote-name --output-dir "${where_to}" "https://${BUILDCACHE_SERVER}/${what}" 
 }
@@ -194,7 +193,7 @@ function docker_image_from_buildcache() {
 
     local url="https://${BUILDCACHE_SERVER}/containers/${version}/${tgz}"
 
-    curl --verbose --fail --silent --show-error --location --retry-delay 1 --retry 60 \
+    curl --fail --silent --show-error --location --retry-delay 1 --retry 60 \
         --retry-connrefused --retry-max-time 60 --connect-timeout 20 \
         --remote-name "${url}"
 

From a5b958fd0763cf372fb31da4fa02acc24f05780e Mon Sep 17 00:00:00 2001
From: Thilo Fromm <thilo@kinvolk.io>
Date: Fri, 18 Feb 2022 14:40:18 +0100
Subject: [PATCH 06/12] ci-automation/test.sh: fix reruns, set retry to 20

Signed-off-by: Thilo Fromm <thilo@kinvolk.io>
---
 ci-automation/ci-config.env |  2 +-
 ci-automation/test.sh       | 16 +++++++++-------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/ci-automation/ci-config.env b/ci-automation/ci-config.env
index 3e4f347802..34ca3eaf21 100644
--- a/ci-automation/ci-config.env
+++ b/ci-automation/ci-config.env
@@ -25,4 +25,4 @@ CONTAINER_IMAGE_ROOT="/home/sdk/build/images"
 
 # Image / vendor tests settings
 QEMU_IMAGE_NAME="flatcar_production_image.bin"
-QEMU_PARALLEL=4
+QEMU_PARALLEL=20
diff --git a/ci-automation/test.sh b/ci-automation/test.sh
index 9fbc760fc8..b488ea3f0c 100644
--- a/ci-automation/test.sh
+++ b/ci-automation/test.sh
@@ -35,7 +35,7 @@
 #   3. List of tests / test patterns. Defaults to "*" (all tests).
 #      All positional arguments after the first 2 (see above) are tests / patterns of tests to run.
 #
-#   MAX_RETRIES. Environment variable. Number of re-runs to overcome transient failures. Defaults to 999.
+#   MAX_RETRIES. Environment variable. Number of re-runs to overcome transient failures. Defaults to 20.
 #
 # OUTPUT:
 #
@@ -58,7 +58,7 @@ function test_run() {
         set -- '*'
     fi
 
-    local retries="${MAX_RETRIES:-999}"
+    local retries="${MAX_RETRIES:-20}"
 
     source ci-automation/tapfile_helper_lib.sh
     source ci-automation/ci_automation_common.sh
@@ -84,9 +84,11 @@ function test_run() {
     local success=false
     for retry in $(seq "${retries}"); do
         local tapfile="results-run-${retry}.tap"
-        local failfile="failed-run-${retry}."
+        local failfile="failed-run-${retry}.txt"
 
-        set -o noglob
+        # Ignore retcode since tests are flaky. We'll re-run failed tests and
+        #  determine success based on test results (tapfile).
+        set +e -o noglob
         ./run_sdk_container -x ./ci-cleanup.sh \
             -n "${container_name}" -C "${packages_image}" -v "${vernum}" \
             ci-automation/vendor-testing/"${image}".sh \
@@ -95,16 +97,16 @@ function test_run() {
                 "${vernum}" \
                 "${tapfile}" \
                 $@
-        set +o noglob
+        set -e +o noglob
 
         ./run_sdk_container -x ./ci-cleanup.sh \
             -n "${container_name}" -C "${packages_image}" -v "${vernum}" \
             ci-automation/test_update_reruns.sh \
                 "${tests_dir}/${tapfile}" "${image}" "${retry}" \
-                "${tests_dir}/failed-run-${retry}.txt"
+                "${tests_dir}/${failfile}"
 
         local failed_tests
-        failed_tests="$(cat "${tests_dir}/failed-run-${retry}.txt")"
+        failed_tests="$(cat "${tests_dir}/${failfile}")"
         if [ -z "$failed_tests" ] ; then
             echo "########### All tests succeeded. ###########"
             success=true

From cafa385164c0ade0387b6f4ab9595f130bdddae3 Mon Sep 17 00:00:00 2001
From: Thilo Fromm <thilo@kinvolk.io>
Date: Fri, 18 Feb 2022 15:35:49 +0100
Subject: [PATCH 07/12] ci-automation: publish torcx json and use in tests

This change updates the package build script to publish the torcx
manifest file to the build cache so it can be used by tests.
It also updates the generic test script to use the SDK container instead
of the packages container image, and to download and use the torcx
manifest from the build cache.

Signed-off-by: Thilo Fromm <thilo@kinvolk.io>
---
 ci-automation/packages.sh            | 11 +++++++++-
 ci-automation/test.sh                | 31 ++++++++++++++++++----------
 ci-automation/vendor-testing/qemu.sh |  2 +-
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/ci-automation/packages.sh b/ci-automation/packages.sh
index a3786bd21e..9e5efe743c 100644
--- a/ci-automation/packages.sh
+++ b/ci-automation/packages.sh
@@ -48,7 +48,8 @@
 # OUTPUT:
 #
 #   1. Exported container image "flatcar-packages-[ARCH]-[VERSION].tar.gz" with binary packages
-#       pushed to buildcache.
+#       pushed to buildcache, and torcx_manifest.json pushed to "images/${arch}/${vernum}/"
+#       (for use with tests).
 #   2. Updated scripts repository
 #        - version tag w/ submodules
 #        - sdk_container/.repo/manifests/version.txt denotes new FLATCAR OS version
@@ -102,6 +103,11 @@ function packages_build() {
         ./build_packages --board="${arch}-usr" \
             --torcx_output_root="${CONTAINER_TORCX_ROOT}"
 
+    # copy torcx manifest for publishing
+    ./run_sdk_container -n "${packages_container}" -v "${version}" \
+        -C "${sdk_image}" \
+        cp "${CONTAINER_TORCX_ROOT}/amd64-usr/latest/torcx_manifest.json" __build__/
+
     # run_sdk_container updates the version file, use that version from here on
     source sdk_container/.repo/manifests/version.txt
     local vernum="${FLATCAR_VERSION}"
@@ -111,6 +117,9 @@ function packages_build() {
     # generate image + push to build cache
     docker_commit_to_buildcache "${packages_container}" "${packages_image}" "${docker_vernum}"
 
+    # Publish torcx manifest to "images" cache so tests can pull it later.
+    copy_to_buildcache "images/${arch}/${vernum}/" __build__/torcx_manifest.json
+
     update_and_push_version "${version}"
 }
 # --
diff --git a/ci-automation/test.sh b/ci-automation/test.sh
index b488ea3f0c..924a791f4b 100644
--- a/ci-automation/test.sh
+++ b/ci-automation/test.sh
@@ -10,8 +10,7 @@
 
 # Test scenarios runner stub.
 #   This script will run test scenarios for a single image type.
-#   Tests will be started inside a container based on the packages container image
-#    (which contains the torcx manifest).
+#   Tests will be started inside the SDK container.
 #   This script is generic and will use a vendor-specific test runner from
 #    "ci-automation/vendor-testing/<image>.sh.
 #
@@ -19,10 +18,13 @@
 #
 #   1. SDK version and OS image version are recorded in sdk_container/.repo/manifests/version.txt
 #   2. Scripts repo version tag of OS image version to be built is available and checked out.
-#   3. Flatcar packages container is available via build cache server
-#       from "/containers/[VERSION]/flatcar-packages-[ARCH]-[FLATCAR_VERSION].tar.gz"
-#       or present locally. Container must contain binary packages and torcx artefacts.
-#   4. Vendor image(s) to run tests for are available on buildcache ( images/[ARCH]/[FLATCAR_VERSION]/ )
+#   3. SDK container is either
+#       - available via ghcr.io/flatcar-linux/flatcar-sdk-[ARCH]:[VERSION] (official SDK release)
+#       OR
+#       - available via build cache server "/containers/[VERSION]/flatcar-sdk-[ARCH]-[VERSION].tar.gz"
+#         (dev SDK)
+#   4. Vendor image and torcx manifest to run tests for are available on buildcache
+#         ( images/[ARCH]/[FLATCAR_VERSION]/ )
 #
 # INPUT:
 #
@@ -69,10 +71,14 @@ function test_run() {
     local docker_vernum
     docker_vernum="$(vernum_to_docker_image_version "${vernum}")"
 
-    local packages="flatcar-packages-${arch}"
-    local packages_image="${packages}:${docker_vernum}"
+    # Get SDK from either the registry or import from build cache
+    local sdk_version="${FLATCAR_SDK_VERSION}"
+    local sdk_name="flatcar-sdk-${arch}"
+    local docker_sdk_vernum="$(vernum_to_docker_image_version "${sdk_version}")"
 
-    docker_image_from_buildcache "${packages}" "${docker_vernum}"
+    docker_image_from_registry_or_buildcache "${sdk_name}" "${docker_sdk_vernum}"
+    local sdk_image="$(docker_image_fullname "${sdk_name}" "${docker_sdk_vernum}")"
+    echo "docker image rm -f '${sdk_image}'" >> ./ci-cleanup.sh
 
     local tests_dir="__TESTS__/${image}"
     mkdir -p "${tests_dir}"
@@ -80,6 +86,9 @@ function test_run() {
 
     local container_name="flatcar-tests-${arch}-${docker_vernum}-${image}"
 
+    # Make the torcx manifest available to test implementation
+    copy_from_buildcache "images/${arch}/${vernum}/torcx_manifest.json" "${tests_dir}"
+
     local retry=""
     local success=false
     for retry in $(seq "${retries}"); do
@@ -90,7 +99,7 @@ function test_run() {
         #  determine success based on test results (tapfile).
         set +e -o noglob
         ./run_sdk_container -x ./ci-cleanup.sh \
-            -n "${container_name}" -C "${packages_image}" -v "${vernum}" \
+            -n "${container_name}" -C "${sdk_image}" -v "${vernum}" \
             ci-automation/vendor-testing/"${image}".sh \
                 "${tests_dir}" \
                 "${arch}" \
@@ -100,7 +109,7 @@ function test_run() {
         set -e +o noglob
 
         ./run_sdk_container -x ./ci-cleanup.sh \
-            -n "${container_name}" -C "${packages_image}" -v "${vernum}" \
+            -n "${container_name}" -C "${sdk_image}" -v "${vernum}" \
             ci-automation/test_update_reruns.sh \
                 "${tests_dir}/${tapfile}" "${image}" "${retry}" \
                 "${tests_dir}/${failfile}"
diff --git a/ci-automation/vendor-testing/qemu.sh b/ci-automation/vendor-testing/qemu.sh
index b66daa2064..8c40b87159 100755
--- a/ci-automation/vendor-testing/qemu.sh
+++ b/ci-automation/vendor-testing/qemu.sh
@@ -32,7 +32,7 @@ sudo kola run \
     --qemu-bios=/usr/share/qemu/bios-256k.bin \
     --qemu-image="${QEMU_IMAGE_NAME}" \
     --tapfile="${tapfile}" \
-    --torcx-manifest="${CONTAINER_TORCX_ROOT}/${arch}-usr/latest/torcx_manifest.json" \
+    --torcx-manifest=torcx_manifest.json \
     $@
 
 set +o noglob

From bee5ac7f747a36f956ad90b775cc22f69bf4ba86 Mon Sep 17 00:00:00 2001
From: Thilo Fromm <thilo@kinvolk.io>
Date: Mon, 21 Feb 2022 12:56:45 +0100
Subject: [PATCH 08/12] ci-automation/tapfile: enforce foreign keys, simplify

---
 ci-automation/README.md             | 60 ++++++++++++++++++++++++++++-
 ci-automation/tapfile_helper_lib.sh | 42 +++++++++-----------
 ci-automation/test.sh               |  2 +-
 3 files changed, 79 insertions(+), 25 deletions(-)

diff --git a/ci-automation/README.md b/ci-automation/README.md
index f7c61d551e..7db3f97654 100644
--- a/ci-automation/README.md
+++ b/ci-automation/README.md
@@ -2,16 +2,39 @@
 
 Scripts in this directory aim to ease automation of Flatcar builds in continuous integration systems.
 
+Design goal of the automation scripts is to provide self-contained, context-aware automation with a low integration overhead.
+Each step takes its context from the repository (version to build etc.) and from the artifact of the previous build, with the aim of reducing the number of arguments to an absolute minimum.
+
 Each script represents a distinct build step; each step ingests the container image of the previous step and produces a new container image for the next step.
 Notable exceptions are "SDK Bootstrap" (`sdk.sh`) which only creates an SDK tarball, and "VMs build" which does not output a container but only VM (vendor) images.
+The container images are self-contained and aim for ease of reproducibility.
+All steps make use of a "build cache" server for pulling (https) build inputs and for pushing (rsync) artifacts.
+
+Test automation is provided alongside build automation, following the same design principles.
 
 Please refer to the individual scripts for prerequisites, input parameters, and outputs.
 
-All steps make use of a "build cache" server for pulling (https) and pushing (rsync) build inputs and artifacts.
 
 ## Build steps
 
 The build pipeline can be used to build everything from scratch, including the SDK (starting from 1. below) or to build a new OS image (starting from 3.).
+"From scratch" builds (i.e. builds which include a new SDK) are usually only done for the `main` branch (`main` can be considered `alpha-next`).
+Release / maintenance branches in the majority of cases do not build a new SDK but start with the OS image build.
+Release branches usually use the SDK introduced when the new major version was branched off `main` throughout the lifetime of the major version; i.e. release `stable-MMMM.mm.pp` would use `SDK-MMMM.0.0`.
+
+To reproduce any given build step, follow this pattern:
+```
+./checkout <build-tag> # Build tag from either SDK bootstrap or Packages step
+source ci-automation/<step-script>.sh
+<step_function> <parameters>
+```
+
+For example, to rebuild the AMD64 OS image of build `main-3145.0.0-nightly-20220209-0139`, do
+```
+./checkout main-3145.0.0-nightly-20220209-0139
+source ci-automation/image.sh
+image_build amd64
+```
 
 ### SDK bootstrap build
 
@@ -69,6 +92,7 @@ The build pipeline can be used to build everything from scratch, including the S
             |                             `--------´                   |
             |<-- tag: alpha-3499.0.0-dev23 --´|`- sdk + OS packages -->|
             |                                 |    container image     |
+            |                                 |    torcx manifest      |
             |                           ______v_______                 |
             |                          ( publish pkgs )                |
             |                           `------------´                 |
@@ -82,3 +106,37 @@ The build pipeline can be used to build everything from scratch, including the S
                  alpha-3499.0.0-dev23       `---´                      |
                                               `- vendor OS images ---->|
 ```
+
+## Testing
+
+Testing follows the same design principles build automation adheres to - it's self-contained and context-aware, reducing required parameters to a minimum.
+The `test.sh` script needs exactly two parameters: the architecture, and the image type to be tested.
+Optionally, patterns matching a group of tests can be supplied (or simply a list of tests); this defaults to "all tests" of a given vendor / image.
+`test.sh` also supports re-running failed tests automatically to reduce the need for human interaction on flaky tests.
+
+Testing is implemented in two layers:
+1. `ci-automation/test.sh` is a generic test wrapper / stub to be called from CI.
+2. `ci-automation/vendor-testing/` contains low-level vendor-specific test wrappers around [`kola`](https://github.com/flatcar-linux/mantle/tree/flatcar-master/kola/), our test scenario orchestrator.
+
+Testing relies on the SDK container and will use tools / test suites from the SDK.
+The low-level vendor / image specific script (layer 2. in the list above) is run inside the SDK.
+Testing will use the vendor image published by `vms.sh` from buildcache, and the torcx manifest published by `packages`.
+
+Additionally, a script library is provided (at `ci-automation/tapfile_helper_lib.sh`) to help handling `.tap` test result files produced by test runs.
+Library functions may be used to merge the result of multiple test runs (e.g. for multiple image types / vendors) into a single test result report.
+The test runs are considered successful only if all tests succeeded for all vendors / images at least once.
+
+**Usage**
+```
+./checkout <version-to-test>
+source ci-automation/test.sh
+test_run <arch> <image-type>
+```
+
+E.g. for running qemu / amd64 tests on `main-3145.0.0-nightly-20220209-0139`:
+```
+./checkout main-3145.0.0-nightly-20220209-0139
+source ci-automation/test.sh
+test_run amd64 qemu
+```
+
diff --git a/ci-automation/tapfile_helper_lib.sh b/ci-automation/tapfile_helper_lib.sh
index bf121937bf..650f11441e 100644
--- a/ci-automation/tapfile_helper_lib.sh
+++ b/ci-automation/tapfile_helper_lib.sh
@@ -19,11 +19,16 @@ TAPFILE_HELPER_DBNAME="results.sqlite3"
 
 # wrapper around sqlite3 w/ retries if DB is locked
 function __sqlite3_wrapper() {
-    local dbfile="$1"
-    shift
+    local dbfile="${TAPFILE_HELPER_DBNAME}"
+
+    local params=""
+    while [[ "$1" == -* ]] ; do
+        params="$params $1"
+        shift
+    done
 
     while true; do
-        sqlite3 "${dbfile}" "$@"
+        sqlite3 "${dbfile}" $params "PRAGMA foreign_keys = ON;$@"
         local ret="$?"
         if [ "$ret" -ne 5 ] ; then
             return $ret
@@ -37,9 +42,8 @@ function __sqlite3_wrapper() {
 
 # Initialise the DB if it wasn't yet.
 function __db_init() {
-    local dbname="${TAPFILE_HELPER_DBNAME}"
 
-    __sqlite3_wrapper "${dbname}" '
+    __sqlite3_wrapper '
     CREATE TABLE IF NOT EXISTS "test_case" (
         "id"    INTEGER,
         "name"  TEXT UNIQUE,
@@ -77,8 +81,6 @@ function tap_ingest_tapfile() {
     local vendor="${2}"
     local run="${3}"
 
-    local dbname="${TAPFILE_HELPER_DBNAME}"
-
     local result=""
     local test_name=""
     local error_message=""
@@ -138,15 +140,13 @@ function tap_ingest_tapfile() {
 
     local SQL="${SQL}COMMIT;"
 
-    __sqlite3_wrapper "${dbname}" "${SQL}"
+    __sqlite3_wrapper "${SQL}"
 }
 # --
 
 # Print a list of all vendors we've seen so far.
 function tap_list_vendors() {
-    local dbname="${TAPFILE_HELPER_DBNAME}"
-
-    __sqlite3_wrapper "${dbname}" 'SELECT DISTINCT name from vendor;'
+    __sqlite3_wrapper 'SELECT DISTINCT name from vendor;'
 }
 # --
 
@@ -156,9 +156,7 @@ function tap_list_vendors() {
 function tap_failed_tests_for_vendor() {
     local vendor="$1"
 
-    local dbname="${TAPFILE_HELPER_DBNAME}"
-
-    __sqlite3_wrapper "${dbname}" "
+    __sqlite3_wrapper "
 		SELECT failed.name FROM test_case AS failed
 		WHERE EXISTS (
 				SELECT * FROM test_run AS t, vendor AS v, test_case AS c
@@ -186,12 +184,10 @@ function tap_generate_report() {
     local version="$2"
     local full_error_report="${3:-false}"
 
-    local dbname="${TAPFILE_HELPER_DBNAME}"
-
     local count
-    count="$(__sqlite3_wrapper "${dbname}" 'SELECT count(name) FROM test_case;')"
+    count="$(__sqlite3_wrapper 'SELECT count(name) FROM test_case;')"
     local vendors
-    vendors="$(__sqlite3_wrapper "${dbname}" 'SELECT name FROM vendor;' | tr '\n' ' ')"
+    vendors="$(__sqlite3_wrapper 'SELECT name FROM vendor;' | tr '\n' ' ')"
 
     echo "1..$((count+1))"
     echo "ok - Version: ${version}, Architecture: ${arch}" 
@@ -201,13 +197,13 @@ function tap_generate_report() {
 
     # Print result line for every test, including platforms it succeeded on
     #  and transient failed runs.
-    __sqlite3_wrapper "${dbname}" 'SELECT DISTINCT name from test_case;' | \
+    __sqlite3_wrapper 'SELECT DISTINCT name from test_case;' | \
     while read -r test_name; do
 
         # "ok" if the test succeeded at least once for all vendors that run the test,
         #   "not ok" otherwise.
         local verdict
-        verdict="$(__sqlite3_wrapper "${dbname}" "
+        verdict="$(__sqlite3_wrapper "
         SELECT failed.name FROM vendor AS failed
         WHERE EXISTS (
                 SELECT * FROM test_run AS t, vendor AS v, test_case AS c
@@ -231,7 +227,7 @@ function tap_generate_report() {
         # Generate a list of vendors and respective runs, in a single line.
         function list_runs() {
             local res="$1"
-            __sqlite3_wrapper -csv "${dbname}" "
+            __sqlite3_wrapper -csv "
                 SELECT v.name, t.run FROM test_run AS t, vendor AS v, test_case AS c
                 WHERE t.vendor_id=v.id AND t.case_id=c.id
                     AND c.name='${test_name}'
@@ -262,7 +258,7 @@ function tap_generate_report() {
             echo "   Failed: ${failed}"
             if [ "${verdict}" = "not ok" -o "${full_error_report}" = "true" ] ; then
                 # generate diagnostic output, per failed run.
-                __sqlite3_wrapper -csv "${dbname}" "
+                __sqlite3_wrapper -csv "
                 SELECT v.name, t.run
                     FROM test_run AS t, vendor AS v, test_case AS c
                     WHERE t.vendor_id=v.id AND t.case_id=c.id
@@ -272,7 +268,7 @@ function tap_generate_report() {
                 sed 's/,/ /' | \
                 while read -r vendor run; do
                     echo "   Error messages for ${vendor}, run ${run}:"
-                    __sqlite3_wrapper -csv "${dbname}" "
+                    __sqlite3_wrapper -csv "
                     SELECT t.output FROM test_run AS t, test_case AS c
                         WHERE t.case_id=c.id
                         AND c.name='${test_name}'
diff --git a/ci-automation/test.sh b/ci-automation/test.sh
index 924a791f4b..7e948bf721 100644
--- a/ci-automation/test.sh
+++ b/ci-automation/test.sh
@@ -122,7 +122,7 @@ function test_run() {
             break
         fi
 
-        echo "########### Some tests failed and will be re-run. ###########"
+        echo "########### Some tests failed and will be re-run (${retry} / ${retries}). ###########"
         echo "Failed tests: $failed_tests"
         echo "-----------"
         set -- $failed_tests

From 1045fd5ac8567646db678ef0df4e48292db3caeb Mon Sep 17 00:00:00 2001
From: Thilo Fromm <thilo@kinvolk.io>
Date: Mon, 21 Feb 2022 13:53:19 +0100
Subject: [PATCH 09/12] ci-automation/README.md: add docs for qemu test

---
 ci-automation/README.md | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/ci-automation/README.md b/ci-automation/README.md
index 7db3f97654..87beaf09e0 100644
--- a/ci-automation/README.md
+++ b/ci-automation/README.md
@@ -119,13 +119,14 @@ Testing is implemented in two layers:
 2. `ci-automation/vendor-testing/` contains low-level vendor-specific test wrappers around [`kola`](https://github.com/flatcar-linux/mantle/tree/flatcar-master/kola/), our test scenario orchestrator.
 
 Testing relies on the SDK container and will use tools / test suites from the SDK.
-The low-level vendor / image specific script (layer 2. in the list above) is run inside the SDK.
-Testing will use the vendor image published by `vms.sh` from buildcache, and the torcx manifest published by `packages`.
+The low-level vendor / image specific script (layer 2. in the list above) runs inside the SDK.
+Testing will use the vendor image published by `vms.sh` from buildcache, and the torcx manifest published by `packages.sh`.
 
 Additionally, a script library is provided (at `ci-automation/tapfile_helper_lib.sh`) to help handling `.tap` test result files produced by test runs.
 Library functions may be used to merge the result of multiple test runs (e.g. for multiple image types / vendors) into a single test result report.
 The test runs are considered successful only if all tests succeeded for all vendors / images at least once.
 
+
 **Usage**
 ```
 ./checkout <version-to-test>
@@ -140,3 +141,20 @@ source ci-automation/test.sh
 test_run amd64 qemu
 ```
 
+### QEmu test
+
+`ci-automation/vendor-testing/qemu.sh` implements a `kola` wrapper for testing the `qemu` image.
+The wrapper is a straightforward call to `kola` and does not have any additional requirements.
+
+**NOTE** that the generic image (`flatcar_production_image.bin`) is used for the test instead of the QEmu vendor image.
+
+**NOTE on host firewalling** The test automation uses bridged networking and will handle forwarding and NAT.
+However, we experienced test failures from lack of internet access with several firewall implementations.
+It is recommended to stop firewalling on the host the tests are run on (for example, use `systemctl stop firewalld` if the host used `firewalld`).
+
+**Settings**
+
+* `QEMU_IMAGE_NAME` - file name of the QEmu image to fetch from bincache.
+* `QEMU_PARALLEL` - Number of parallel test cases to run.
+                  Note that test cases may involve launching multiple QEmu VMs (network testing etc.).
+                  Tests are memory bound, not CPU bound; e.g. `20` is a sensible value for a 6 core / 12 threads system w/ 32 GB RAM.

From 0fa985b8721855e1b7b4dc651b3e6f121488edbb Mon Sep 17 00:00:00 2001
From: Thilo Fromm <thilo@kinvolk.io>
Date: Mon, 21 Feb 2022 16:30:30 +0100
Subject: [PATCH 10/12] ci-automation/test.sh: stage torcx manifest

Signed-off-by: Thilo Fromm <thilo@kinvolk.io>
---
 ci-automation/packages.sh            | 11 ++++--
 ci-automation/test.sh                | 51 +++++++++++++++++++++++++---
 ci-automation/vendor-testing/qemu.sh | 10 ++++--
 3 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/ci-automation/packages.sh b/ci-automation/packages.sh
index 9e5efe743c..5e6a7fce15 100644
--- a/ci-automation/packages.sh
+++ b/ci-automation/packages.sh
@@ -103,10 +103,14 @@ function packages_build() {
         ./build_packages --board="${arch}-usr" \
             --torcx_output_root="${CONTAINER_TORCX_ROOT}"
 
-    # copy torcx manifest for publishing
+    # copy torcx manifest and docker tarball for publishing
+    local torcx_tmp="__build__/torcx_tmp"
+    rm -rf "${torcx_tmp}"
+    mkdir "${torcx_tmp}"
     ./run_sdk_container -n "${packages_container}" -v "${version}" \
         -C "${sdk_image}" \
-        cp "${CONTAINER_TORCX_ROOT}/amd64-usr/latest/torcx_manifest.json" __build__/
+        cp -r "${CONTAINER_TORCX_ROOT}/" \
+        "${torcx_tmp}"
 
     # run_sdk_container updates the version file, use that version from here on
     source sdk_container/.repo/manifests/version.txt
@@ -118,7 +122,8 @@ function packages_build() {
     docker_commit_to_buildcache "${packages_container}" "${packages_image}" "${docker_vernum}"
 
     # Publish torcx manifest to "images" cache so tests can pull it later.
-    copy_to_buildcache "images/${arch}/${vernum}/" __build__/torcx_manifest.json
+    copy_to_buildcache "images/${arch}/${vernum}/torcx" \
+        __build__/torcx_tmp/pkgs/${arch}-usr/docker/*/*.torcx.tgz
 
     update_and_push_version "${version}"
 }
diff --git a/ci-automation/test.sh b/ci-automation/test.sh
index 7e948bf721..86b3077cfe 100644
--- a/ci-automation/test.sh
+++ b/ci-automation/test.sh
@@ -23,7 +23,7 @@
 #       OR
 #       - available via build cache server "/containers/[VERSION]/flatcar-sdk-[ARCH]-[VERSION].tar.gz"
 #         (dev SDK)
-#   4. Vendor image and torcx manifest to run tests for are available on buildcache
+#   4. Vendor image and torcx docker tarball + manifest to run tests for are available on buildcache
 #         ( images/[ARCH]/[FLATCAR_VERSION]/ )
 #
 # INPUT:
@@ -48,9 +48,51 @@
 #          to abort at any point - the previous runs' results won't be lost.
 #   2. "./ci-cleanup.sh" with commands to clean up temporary build resources,
 #        to be run after this step finishes / when this step is aborted.
+#
+#
+# LOW-LEVEL / VENDOR SPECIFIC scripts API
+#
+# Vendor scripts are provided with their own sub-directory and are expected to CD into there before
+#  creating any artifacts (see vendor script argument 1 below).
+# The torcx manifest is supplied in
+#   ../
+# relative to the vendor sub-directory. The manifest is updated to include a URL pointing to the docker
+#  torcx tarball on the build cache (for the docker.torcx-manifest-pkgs test).
+#
+# Vendor specific scripts are called with the following positional arguments:
+# 1 - working directory for the tests.
+#     The vendor script is expected to keep all artifacts it produces in that directory.
+# 2 - Architecture to test.
+# 3 - version number to test.
+# 4 - output TAP file.
+# All following arguments specify test cases / test case patterns to run.
 
 set -euo pipefail
 
+# Download torcx package and manifest, add build cache URL to manifest
+#  so the docker.torcx-manifest-pkgs test can use it.
+function __prepare_torcx() {
+    local arch="$1"
+    local vernum="$2"
+    local workdir="$3"
+
+    copy_from_buildcache "images/${arch}/${vernum}/torcx/torcx_manifest.json" "${workdir}"
+
+    local docker_pkg
+    docker_pkg="$(basename \
+                        "$(jq -r ".value.packages[0].versions[0].locations[0].path" \
+                        ${workdir}/torcx_manifest.json)")"
+
+    # Add docker package URL on build cache to manifest
+    jq ".value.packages[0].versions[0].locations += [{\"url\" : \"https://${BUILDCACHE_SERVER}/images/${arch}/${vernum}/${docker_pkg}\"}]" \
+        "${workdir}/torcx_manifest.json" \
+        > "${workdir}/torcx_manifest_new.json"
+
+    mv "${workdir}/torcx_manifest.json" "${workdir}/torcx_manifest.json.original"
+    mv "${workdir}/torcx_manifest_new.json" "${workdir}/torcx_manifest.json"
+}
+# --
+
 function test_run() {
     local arch="$1" ; shift
     local image="$1"; shift
@@ -80,14 +122,15 @@ function test_run() {
     local sdk_image="$(docker_image_fullname "${sdk_name}" "${docker_sdk_vernum}")"
     echo "docker image rm -f '${sdk_image}'" >> ./ci-cleanup.sh
 
-    local tests_dir="__TESTS__/${image}"
+    local work_dir="__TESTS__"
+    local tests_dir="${work_dir}/${image}"
     mkdir -p "${tests_dir}"
     echo "sudo rm -rf '${tests_dir}'" >> ci-cleanup.sh
 
     local container_name="flatcar-tests-${arch}-${docker_vernum}-${image}"
 
-    # Make the torcx manifest available to test implementation
-    copy_from_buildcache "images/${arch}/${vernum}/torcx_manifest.json" "${tests_dir}"
+    # Make the torcx artifacts available to test implementation
+    __prepare_torcx "${arch}" "${vernum}" "${work_dir}"
 
     local retry=""
     local success=false
diff --git a/ci-automation/vendor-testing/qemu.sh b/ci-automation/vendor-testing/qemu.sh
index 8c40b87159..9f174a127c 100755
--- a/ci-automation/vendor-testing/qemu.sh
+++ b/ci-automation/vendor-testing/qemu.sh
@@ -20,8 +20,12 @@ source ci-automation/ci_automation_common.sh
 mkdir -p "${work_dir}"
 cd "${work_dir}"
 
-echo "++++ QEMU test: downloading ${QEMU_IMAGE_NAME} for ${vernum} (${arch}) ++++"
-copy_from_buildcache "images/${arch}/${vernum}/${QEMU_IMAGE_NAME}" .
+if [ -f "${QEMU_IMAGE_NAME}" ] ; then
+    echo "++++ QEMU test: Using existing ${work_dir}/${QEMU_IMAGE_NAME} for testing ${vernum} (${arch}) ++++"
+else
+    echo "++++ QEMU test: downloading ${QEMU_IMAGE_NAME} for ${vernum} (${arch}) ++++"
+    copy_from_buildcache "images/${arch}/${vernum}/${QEMU_IMAGE_NAME}" .
+fi
 
 set -o noglob
 
@@ -32,7 +36,7 @@ sudo kola run \
     --qemu-bios=/usr/share/qemu/bios-256k.bin \
     --qemu-image="${QEMU_IMAGE_NAME}" \
     --tapfile="${tapfile}" \
-    --torcx-manifest=torcx_manifest.json \
+    --torcx-manifest=../torcx_manifest.json \
     $@
 
 set +o noglob

From 4f39e0112ff7bf75c2b56a3444d50b6ae809c4f3 Mon Sep 17 00:00:00 2001
From: Thilo Fromm <thilo@kinvolk.io>
Date: Mon, 21 Feb 2022 17:23:53 +0100
Subject: [PATCH 11/12] ci-automation/test.sh: use http in torcx manifest

Use HTTP instead of https because Ignition does not recognise
letsencrypt certificates, leading to test breakage in
docker.torcx-manifest-pkgs.

Add a note in ci-config.env to explicitly call out the HTTP requirement
of the build cache server.

Signed-off-by: Thilo Fromm <thilo@kinvolk.io>
---
 ci-automation/ci-config.env | 4 ++++
 ci-automation/test.sh       | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/ci-automation/ci-config.env b/ci-automation/ci-config.env
index 34ca3eaf21..deac6763cf 100644
--- a/ci-automation/ci-config.env
+++ b/ci-automation/ci-config.env
@@ -4,6 +4,10 @@
 
 # Flatcar CI static configuration
 
+# Build cache server for build artifacts.
+#  Required services:
+#  - http and https (WITHOUT auto-redirect)
+#  - ssh for BUILDCACHE_USER
 BUILDCACHE_SERVER="bincache.flatcar-linux.net"
 BUILDCACHE_PATH_PREFIX="/srv/bincache"
 BUILDCACHE_USER="bincache"
diff --git a/ci-automation/test.sh b/ci-automation/test.sh
index 86b3077cfe..07df9ca2fc 100644
--- a/ci-automation/test.sh
+++ b/ci-automation/test.sh
@@ -84,7 +84,8 @@ function __prepare_torcx() {
                         ${workdir}/torcx_manifest.json)")"
 
     # Add docker package URL on build cache to manifest
-    jq ".value.packages[0].versions[0].locations += [{\"url\" : \"https://${BUILDCACHE_SERVER}/images/${arch}/${vernum}/${docker_pkg}\"}]" \
+    local docker_url="http://${BUILDCACHE_SERVER}/images/${arch}/${vernum}/torcx/${docker_pkg}"
+    jq ".value.packages[0].versions[0].locations += [{\"url\" : \"${docker_url}\"}]" \
         "${workdir}/torcx_manifest.json" \
         > "${workdir}/torcx_manifest_new.json"
 

From 081df6cd2c0b3bcc863f2da4b0a1d1bb219d3f26 Mon Sep 17 00:00:00 2001
From: Thilo Fromm <thilo@kinvolk.io>
Date: Tue, 22 Feb 2022 15:44:04 +0100
Subject: [PATCH 12/12] ci-automation/packages.sh: fix torcx URL, add manifest

Signed-off-by: Thilo Fromm <thilo@kinvolk.io>
---
 ci-automation/packages.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ci-automation/packages.sh b/ci-automation/packages.sh
index 5e6a7fce15..adad0e0b70 100644
--- a/ci-automation/packages.sh
+++ b/ci-automation/packages.sh
@@ -121,9 +121,11 @@ function packages_build() {
     # generate image + push to build cache
     docker_commit_to_buildcache "${packages_container}" "${packages_image}" "${docker_vernum}"
 
-    # Publish torcx manifest to "images" cache so tests can pull it later.
+    # Publish torcx manifest and docker tarball to "images" cache so tests can pull it later.
     copy_to_buildcache "images/${arch}/${vernum}/torcx" \
-        __build__/torcx_tmp/pkgs/${arch}-usr/docker/*/*.torcx.tgz
+        "${torcx_tmp}/torcx/amd64-usr/latest/torcx_manifest.json"
+    copy_to_buildcache "images/${arch}/${vernum}/torcx" \
+        "${torcx_tmp}/torcx/pkgs/${arch}-usr/docker/"*/*.torcx.tgz
 
     update_and_push_version "${version}"
 }