From f6f44e2ca8b5cbf2b6e2026efde421d8c76b8aa6 Mon Sep 17 00:00:00 2001
From: Thilo Fromm <thilo@kinvolk.io>
Date: Wed, 16 Feb 2022 19:17:42 +0100
Subject: [PATCH] ci-automation: first stab at adding testing

Signed-off-by: Thilo Fromm <thilo@kinvolk.io>
---
 ci-automation/ci-config.env          |   4 +
 ci-automation/tapfile_helper_lib.sh  | 281 +++++++++++++++++++++++++++
 ci-automation/test.sh                | 122 ++++++++++++
 ci-automation/test_update_reruns.sh  |  20 ++
 ci-automation/vendor-testing/qemu.sh |  38 ++++
 5 files changed, 465 insertions(+)
 create mode 100644 ci-automation/tapfile_helper_lib.sh
 create mode 100644 ci-automation/test.sh
 create mode 100755 ci-automation/test_update_reruns.sh
 create mode 100644 ci-automation/vendor-testing/qemu.sh
diff --git a/ci-automation/ci-config.env b/ci-automation/ci-config.env
index 40f6a6a959..3e4f347802 100644
--- a/ci-automation/ci-config.env
+++ b/ci-automation/ci-config.env
@@ -22,3 +22,7 @@ CI_GIT_EMAIL="infra+ci@flatcar-linux.org"
 # build artifacts go here (in container)
 CONTAINER_TORCX_ROOT="/home/sdk/build/torcx"
 CONTAINER_IMAGE_ROOT="/home/sdk/build/images"
+
+# Image / vendor tests settings
+QEMU_IMAGE_NAME="flatcar_production_image.bin"
+QEMU_PARALLEL=4
diff --git a/ci-automation/tapfile_helper_lib.sh b/ci-automation/tapfile_helper_lib.sh
new file mode 100644
index 0000000000..0cd7f3efa4
--- /dev/null
+++ b/ci-automation/tapfile_helper_lib.sh
@@ -0,0 +1,281 @@
+#!/bin/bash
+#
+# Copyright (c) 2021 The Flatcar Maintainers.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# Helper script for extracting information from TAP files and for merging multiple
+#  TAP files into one report.
+# The script uses a temporary SQLite DB for querying and for result generation.
+#
+# Brief usage overview (scroll down for parameters etc.):
+#   tap_ingest_tapfile - add test results from tap file to the DB 
+#   tap_list_vendors   - list all vendors TAP files have been ingested for
+#   tap_failed_tests_for_vendor - list all tests that never succeeded even once, per vendor
+#   tap_generate_report - generate a merged test report
+
+
+TAPFILE_HELPER_DBNAME="results.sqlite3"  # relative path, handed to sqlite3 as is by the functions below
+
+# Run sqlite3 on DB file $1 with all remaining arguments. Retries while sqlite3
+# exits with status 5 (presumably SQLITE_BUSY / DB locked); returns its status.
+function __sqlite3_wrapper() {
+    local dbfile="$1"
+    shift
+    while true; do
+        # Capture the status right away: a later "[ ... ]" overwrites $?, and
+        # the "||" keeps callers using "set -e" from exiting before the retry.
+        local ret=0; sqlite3 "${dbfile}" "$@" || ret=$?
+        if [ "${ret}" -ne 5 ] ; then return "${ret}"; fi
+        local sleep="$((1 + RANDOM % 5))"
+        echo "Retrying in ${sleep} seconds." >&2
+        sleep "${sleep}"
+    done
+}
+# --
+
+# Create the schema of the results DB named by TAPFILE_HELPER_DBNAME:
+# tables "test_case", "vendor" and "test_run". Every statement uses
+# IF NOT EXISTS, so tables that are already present are left alone.
+function __db_init() {
+    __sqlite3_wrapper "${TAPFILE_HELPER_DBNAME}" '
+    CREATE TABLE IF NOT EXISTS "test_case" (
+        "id"    INTEGER,
+        "name"  TEXT UNIQUE,
+        PRIMARY KEY("id")
+    );
+    CREATE TABLE IF NOT EXISTS "vendor" (
+        "id"    INTEGER,
+        "name"  TEXT UNIQUE,
+        PRIMARY KEY("id")
+    );
+    CREATE TABLE IF NOT EXISTS "test_run" (
+        "id"        INTEGER NOT NULL,
+        "result"    INTEGER NOT NULL,
+        "output"    TEXT,
+        "case_id"   INTEGER NOT NULL,
+        "run"       INTEGER NOT NULL,
+        "vendor_id" INTEGER,
+        PRIMARY KEY("id"),
+        FOREIGN KEY("case_id") REFERENCES "test_case"("id"),
+        FOREIGN KEY("vendor_id") REFERENCES "vendor"("id"),
+        UNIQUE (case_id, run, vendor_id)
+    );
+'
+}
+# --
+
+# Read a TAP file into the temporary results DB (the DB is created on first use).
+# INPUT:
+# 1: <tapfile> - tapfile to ingest
+# 2: <vendor>  - vendor (qemu, azure, aws, etc...)
+# 3: <run>     - re-run iteration
+# Single quotes in vendor and test names are doubled so they stay valid SQL literals.
+function tap_ingest_tapfile() {
+    local tapfile="${1}"
+    local vendor="${2}"
+    local run="${3}"
+    local dbname="${TAPFILE_HELPER_DBNAME}"
+
+    local result=""
+    local test_name=""
+    local error_message=""
+    local in_error_message=false
+    local sq="'"
+    local sql_vendor="${vendor//${sq}/${sq}${sq}}"
+
+    if ! [ -f "${dbname}" ] ; then
+       __db_init
+    fi
+
+    # Wrap all SQL commands in a transaction to speed up INSERTs
+    local SQL="BEGIN TRANSACTION;"
+
+    # Example TAP input:
+    # ok - coreos.auth.verify
+    # ok - coreos.locksmith.tls
+    # not ok - cl.filesystem
+    #   ---
+    #   Error: "--- FAIL: cl.filesystem/deadlinks (1.86s)\n            files.go:90: Dead symbolic links found: [/var/lib/flatcar-oem-gce/usr/lib64/python3.9/site-packages/certifi-3021.3.16-py3.9.egg-info]"
+    #   ...
+    # ok - cl.cloudinit.script
+    # ok - kubeadm.v1.22.0.flannel.base
+    # The "|| [ -n ... ]" part also processes a last line lacking a trailing newline.
+    while read -r line || [ -n "${line}" ]; do
+        if [[ "${line}" == "1.."* ]] ; then continue; fi
+        if [ "${line}" = "---" ] ; then  # note: read removes leading whitespaces
+            in_error_message=true
+            continue
+        fi
+
+        if $in_error_message ; then
+            if [ "${line}" = "..." ] ; then
+                in_error_message=false
+            else
+                error_message="$(echo -e "$line" \
+                                    | sed -e 's/^Error: "--- FAIL: /"/' -e 's/^[[:space:]]*//' \
+                                    | sed -e "s/[>\"']/_/g" -e 's/[[:space:]]/ /g')"
+                continue
+            fi
+        else
+            test_name="$(echo "${line}" | sed 's/^[^-]* - //')"
+            test_name="${test_name//${sq}/${sq}${sq}}"  # escape for SQL
+            local result_string="$(echo "${line}" | sed 's/ - .*//')"
+            result=0
+            if [ "${result_string}" = "ok" ] ; then
+                result=1
+            fi
+        fi
+
+        SQL="${SQL}INSERT OR IGNORE INTO test_case(name) VALUES ('${test_name}');"
+        SQL="${SQL}INSERT OR IGNORE INTO vendor(name) VALUES ('${sql_vendor}');"
+        SQL="${SQL}INSERT OR REPLACE INTO test_run(run,result,output,case_id,vendor_id)
+                             VALUES ('${run}','${result}', '${error_message}',
+                                     (SELECT id FROM test_case WHERE name='${test_name}'),
+                                     (SELECT id FROM vendor WHERE name='${sql_vendor}'));"
+        error_message=""
+    done < "$tapfile"
+
+    __sqlite3_wrapper "${dbname}" "${SQL}COMMIT;"
+}
+# --
+
+# Print a list of all vendors we've seen so far.
+function tap_list_vendors() {
+    local dbname="${TAPFILE_HELPER_DBNAME}"
+
+    __sqlite3_wrapper "${dbname}" 'SELECT DISTINCT name from vendor;'
+}
+# --
+
+# List tests that never succeeded for a given vendor.
+# INPUT:
+# 1: <vendor> - Vendor name to check for failed test runs
+# OUTPUT: names of tests with at least one run, but no successful run, on <vendor>.
+function tap_failed_tests_for_vendor() {
+    local sq="'"
+    local vendor="${1//${sq}/${sq}${sq}}"  # single quotes doubled for use in SQL literals
+
+    __sqlite3_wrapper "${TAPFILE_HELPER_DBNAME}" "
+        SELECT failed.name FROM test_case AS failed
+        WHERE EXISTS (
+                SELECT * FROM test_run AS t, vendor AS v, test_case AS c
+                WHERE t.vendor_id=v.id AND t.case_id=c.id
+                    AND v.name='${vendor}'
+                    AND c.name=failed.name
+            )
+            AND NOT exists (
+                SELECT * FROM test_run AS t, vendor AS v, test_case AS c
+                WHERE t.vendor_id=v.id AND t.case_id=c.id
+                    AND v.name='${vendor}'
+                    AND c.name=failed.name
+                    AND t.result=1 );"
+}
+# --
+
+# Print the tap file from contents of the database.
+# INPUT:
+# 1: <arch>    - Architecture to be included in the first line of the report
+# 2: <version> - OS version tested, to be included in the first line of the report
+# 3: <include_transient_errors> - If set to "true" then debug output of transient test failures
+#                   is included in the result report.
+function tap_generate_report() {
+    local arch="$1"
+    local version="$2"
+    local full_error_report="${3:-false}"
+
+    local dbname="${TAPFILE_HELPER_DBNAME}"
+
+    local count="$(__sqlite3_wrapper "${dbname}" 'SELECT count(name) FROM test_case;')"
+    local vendors="$(__sqlite3_wrapper "${dbname}" 'SELECT name FROM vendor;' | tr '\n' ' ')"
+
+    echo "1..$((count+1))"
+    echo "ok - Version: ${version}, Architecture: ${arch}" 
+    echo "   ---"
+    echo "   Platforms tested: ${vendors}"
+    echo "   ..."
+
+    # Print result line for every test, including platforms it succeeded on
+    #  and transient failed runs.
+    __sqlite3_wrapper "${dbname}" 'SELECT DISTINCT name from test_case;' | \
+    while read -r test_name; do
+
+        # "ok" if the test succeeded at least once for all vendors that run the test,
+        #   "not ok" otherwise.
+        local verdict="$(__sqlite3_wrapper "${dbname}" "
+        SELECT failed.name FROM vendor AS failed
+        WHERE EXISTS (
+                SELECT * FROM test_run AS t, vendor AS v, test_case AS c
+                WHERE t.vendor_id=v.id AND t.case_id=c.id
+                    AND v.name=failed.name
+                    AND c.name='${test_name}'
+            )
+            AND NOT exists (
+                SELECT * FROM test_run AS t, vendor AS v, test_case AS c
+                WHERE t.vendor_id=v.id AND t.case_id=c.id
+                    AND v.name=failed.name
+                    AND c.name='${test_name}'
+                    AND t.result=1 );
+        ")"
+        if [ -n "${verdict}" ] ; then
+            verdict="not ok"
+        else
+            verdict="ok"
+        fi
+
+        # Generate a list of vendors and respective runs, in a single line.
+        function list_runs() {
+            local res="$1"
+            __sqlite3_wrapper -csv "${dbname}" "
+                SELECT v.name, t.run FROM test_run AS t, vendor AS v, test_case AS c
+                WHERE t.vendor_id=v.id AND t.case_id=c.id
+                    AND c.name='${test_name}'
+                    AND t.result=${res}
+                    ORDER BY v.name;" \
+                | awk -F, '{ if (t && (t != $1)) {
+                                printf t " " r "); "
+                                r="";}
+                             t=$1
+                             if (r)
+                                r=r ", " $2
+                             else
+                                r="(" $2 ; }
+                            END { if (t) print t r ")"; }'
+        }
+
+        local succeded="$(list_runs 1)"
+        local failed="$(list_runs 0)"
+
+        echo "${verdict} - ${test_name}"
+        echo "   ---"
+        if [ -n "${succeded}" ] ; then
+            echo "   Succeeded: ${succeded}"
+        fi
+        if [ -n "${failed}" ] ; then
+            echo "   Failed: ${failed}"
+            if [ "${verdict}" = "not ok" -o "${full_error_report}" = "true" ] ; then
+                # generate diagnostic output, per failed run.
+                __sqlite3_wrapper -csv "${dbname}" "
+                SELECT v.name, t.run
+                    FROM test_run AS t, vendor AS v, test_case AS c
+                    WHERE t.vendor_id=v.id AND t.case_id=c.id
+                    AND c.name='${test_name}'
+                    AND t.result=0
+                    ORDER BY t.run DESC;" | \
+                sed 's/,/ /' | \
+                while read -r vendor run; do
+                    echo "   Error messages for ${vendor}, run ${run}:"
+                    __sqlite3_wrapper -csv "${dbname}" "
+                    SELECT t.output FROM test_run AS t, test_case AS c
+                        WHERE t.case_id=c.id
+                        AND c.name='${test_name}'
+                        AND t.run='${run}';" | \
+                    sed 's/"/ /' | \
+                    awk '{print "      LINE " NR":" $0}'
+                done
+            fi
+        fi
+        echo "   ..."
+    done
+}
+# --
diff --git a/ci-automation/test.sh b/ci-automation/test.sh
new file mode 100644
index 0000000000..95197dc736
--- /dev/null
+++ b/ci-automation/test.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+#
+# Copyright (c) 2021 The Flatcar Maintainers.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# >>> This file is supposed to be SOURCED from the repository ROOT. <<<
+#
+# test_run() should be called w/ the positional INPUT parameters below.
+
+# Test scenarios runner stub.
+#   This script will run test scenarios for a single image type.
+#   Tests will be started inside a container based on the packages container image
+#    (which contains the torcx manifest).
+#   This script is generic and will use a vendor-specific test runner from
+#    "ci-automation/vendor-testing/<image>.sh".
+#
+# PREREQUISITES:
+#
+#   1. SDK version and OS image version are recorded in sdk_container/.repo/manifests/version.txt
+#   2. Scripts repo version tag of OS image version to be built is available and checked out.
+#   3. Flatcar packages container is available via build cache server
+#       from "/containers/[VERSION]/flatcar-packages-[ARCH]-[FLATCAR_VERSION].tar.gz"
+#       or present locally. Container must contain binary packages and torcx artefacts.
+#   4. Vendor image(s) to run tests for are available on buildcache ( images/[ARCH]/[FLATCAR_VERSION]/ )
+#
+# INPUT:
+#
+#   1. Architecture (ARCH) of the TARGET vm images ("arm64", "amd64").
+#   2. Image type to be tested. One of:
+#      ami, azure, azure_pro, digitalocean, gce, gce_pro, packet, qemu, qemu_uefi, vmware
+#
+# OPTIONAL INPUT:
+#
+#   3. List of tests / test patterns. Defaults to "*" (all tests).
+#      All positional arguments after the first 2 (see above) are tests / patterns of tests to run.
+#
+#   MAX_RETRIES. Environment variable. Number of re-runs to overcome transient failures. Defaults to 999.
+#
+# OUTPUT:
+#
+#   1. 2 merged TAP reports with all test runs / vendors.
+#        - a "summary" report which contains error messages only for tests which never succeeded (per vendor).
+#        - a "detailed" report which also contains error messages of transient failures which succeeded after re-runs.
+#        These reports will be updated after each (re-)run of each vendor, making the test job safe
+#          to abort at any point - the previous runs' results won't be lost.
+#   2. "./ci-cleanup.sh" with commands to clean up temporary build resources,
+#        to be run after this step finishes / when this step is aborted.
+
+set -eu  # NOTE: this file is meant to be sourced, so these options also take effect in the sourcing shell
+
+# Run the tests for one image type, re-running failed tests up to MAX_RETRIES (default 999) times.
+function test_run() {
+    local arch="$1" ; shift
+    local image="$1"; shift  # "$1" again: the shift above already dropped <arch>
+    local retries="${MAX_RETRIES:-999}"
+    # default to all tests (quoted, so the shell does not expand it to file names)
+    if [ $# -le 0 ] ; then
+        set -- '*'
+    fi
+
+    source ci-automation/tapfile_helper_lib.sh
+    source ci-automation/ci_automation_common.sh
+    init_submodules
+
+    source sdk_container/.repo/manifests/version.txt
+    local vernum="${FLATCAR_VERSION}"
+    local docker_vernum="$(vernum_to_docker_image_version "${vernum}")"
+
+    local packages="flatcar-packages-${arch}"
+    local packages_image="${packages}:${docker_vernum}"
+
+    docker_image_from_buildcache "${packages}" "${docker_vernum}"
+
+    local tests_dir="__TESTS__/${image}"
+    mkdir -p "${tests_dir}"
+
+    local container_name="flatcar-tests-${arch}-${docker_vernum}-${image}"
+
+    local retry=""
+    local success=false
+    for retry in $(seq "${retries}"); do
+        local tapfile="results-run-${retry}.tap"
+        local failfile="failed-run-${retry}.txt"
+        # "$@" is quoted so that test patterns reach the vendor script without globbing.
+        ./run_sdk_container -n "${container_name}" -C "${packages_image}" -v "${vernum}" \
+            ci-automation/vendor-testing/"${image}".sh \
+                "${tests_dir}" \
+                "${arch}" \
+                "${vernum}" \
+                "${tapfile}" \
+                "$@"
+
+        ./run_sdk_container -n "${container_name}" -C "${packages_image}" -v "${vernum}" \
+            ci-automation/test_update_reruns.sh \
+                "${tests_dir}/${tapfile}" "${image}" "${retry}" \
+                "${tests_dir}/${failfile}"
+
+        local failed_tests="$(cat "${tests_dir}/${failfile}")"
+        if [ -z "$failed_tests" ] ; then
+            echo "########### All tests succeeded. ###########"
+            success=true
+            break
+        fi
+
+        echo "########### Some tests failed and will be re-run. ###########"
+        echo "Failed tests: $failed_tests"
+        echo "-----------"
+        # The file lists one test per line; re-run only those, without globbing the names.
+        local -a rerun_tests; mapfile -t rerun_tests <<< "${failed_tests}"
+        set -- "${rerun_tests[@]}"
+    done
+
+    if ! $success; then
+        echo "########### All re-runs exhausted ($retries). Giving up. ###########"
+    fi
+
+    # TODO: publish to bincache?
+    # "${tests_dir}/"*.tap
+    # "${tests_dir}/_kola_temp.tar.xz"
+}
+# --
diff --git a/ci-automation/test_update_reruns.sh b/ci-automation/test_update_reruns.sh
new file mode 100755
index 0000000000..e559158f31
--- /dev/null
+++ b/ci-automation/test_update_reruns.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Copyright (c) 2021 The Flatcar Maintainers.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# Helper script for test.sh to update the test failures text file.
+# test.sh uses this to determine which tests need to re-run.
+# This script is run within the SDK container.
+
+# pipefail: a failing DB query must not hide behind "tee"; test.sh reads an empty outfile as success.
+set -euo pipefail
+tapfile="$1"
+image="$2"
+retry="$3"
+outfile="$4"
+
+source ci-automation/tapfile_helper_lib.sh
+tap_ingest_tapfile "${tapfile}" "${image}" "${retry}"
+tap_failed_tests_for_vendor "${image}" | tee "${outfile}"
diff --git a/ci-automation/vendor-testing/qemu.sh b/ci-automation/vendor-testing/qemu.sh
new file mode 100644
index 0000000000..3f2dc95cff
--- /dev/null
+++ b/ci-automation/vendor-testing/qemu.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+#
+# Copyright (c) 2021 The Flatcar Maintainers.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# Test execution script for the qemu vendor image.
+# This script is supposed to run in the SDK container.
+
+# Run kola tests against the qemu image; kola writes the results to a TAP file.
+# INPUT: 1: <work_dir>  2: <arch>  3: <vernum>  4: <tapfile>  5..: tests / test patterns
+# NOTE(review): nothing in this file calls run_testsuite, yet test.sh executes the file - confirm.
+function run_testsuite() {
+    # Every "shift" moves the next argument to $1, so each parameter is read from $1.
+    local work_dir="$1"; shift
+    local arch="$1"; shift
+    local vernum="$1"; shift
+    local tapfile="$1"; shift
+
+    # $@ now contains tests / test patterns to run
+    source ci-automation/ci_automation_common.sh
+
+    mkdir -p "${work_dir}"
+    cd "${work_dir}" || return 1
+
+    copy_from_buildcache "images/${arch}/${vernum}/${QEMU_IMAGE_NAME}" .
+
+    # Quoted "$@" hands the patterns to kola verbatim, so no noglob toggling is needed.
+    sudo kola run \
+        --board="${arch}-usr" \
+        --parallel="${QEMU_PARALLEL}" \
+        --platform=qemu \
+        --qemu-bios=/usr/share/qemu/bios-256k.bin \
+        --qemu-image="${QEMU_IMAGE_NAME}" \
+        --tapfile="${tapfile}" \
+        --torcx-manifest="${CONTAINER_TORCX_ROOT}/${arch}-usr/latest/torcx_manifest.json" \
+        "$@"
+}