omni/internal/integration/kubernetes_node_audit_test.go
Artem Chernyshev c9c4c8e10d
test: use go test to build and run Omni integration tests
All test modules were moved under the `integration` build tag and now
live in the `internal/integration` folder: there is no separate
`cmd/integration-test` executable anymore.

The new Kres version can build the same executable from the tests
directory instead.

All Omni-related flags were renamed, for example `--endpoint` ->
`--omni.endpoint`.
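
A manual run now goes through `go test` directly; roughly something
like the following (the endpoint value is illustrative only, and the
exact set of flags depends on the environment):

  go test -tags integration ./internal/integration/... --omni.endpoint=<omni-endpoint>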

Two more functional changes:

- Enabled `--test.failfast` for all test runs.
- Removed the finalizers that were run when a test failed.

Both of these changes should make it easier to understand a test
failure: the Talos node logs won't be cluttered by the finalizer
tearing down the cluster.

Fixes: https://github.com/siderolabs/omni/issues/1171

Signed-off-by: Artem Chernyshev <artem.chernyshev@talos-systems.com>
2025-06-03 15:07:00 +03:00

// Copyright (c) 2025 Sidero Labs, Inc.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.

//go:build integration

package integration_test

import (
	"context"
	"testing"
	"time"

	"github.com/cosi-project/runtime/pkg/resource"
	"github.com/cosi-project/runtime/pkg/resource/rtestutils"
	"github.com/cosi-project/runtime/pkg/safe"
	"github.com/cosi-project/runtime/pkg/state"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/zap"
	"go.uber.org/zap/zaptest"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/siderolabs/omni/client/pkg/omni/resources/omni"
)

// AssertKubernetesNodeAudit tests the Kubernetes node audit feature (KubernetesNodeAuditController) by doing the following:
// 1. Freeze the whole control plane so that kube-apiserver is not accessible to the ClusterMachineTeardownController
// and it is not able to remove the node from Kubernetes at the moment of the node deletion.
// 2. Freeze & force-delete a worker node. It won't be removed from Kubernetes due to the control plane being frozen.
// 3. Assert that the ClusterMachine resource is deleted - the ClusterMachineTeardownController did not block its deletion despite failing to remove the node from Kubernetes.
// 4. Wake the control plane back up.
// 5. Assert that the worker node eventually gets removed from Kubernetes due to node audit.
func AssertKubernetesNodeAudit(ctx context.Context, clusterName string, options *TestOptions) TestFunc {
	st := options.omniClient.Omni().State()

	return func(t *testing.T) {
		if options.FreezeAMachineFunc == nil || options.RestartAMachineFunc == nil {
			t.Skip("skip the test as FreezeAMachineFunc or RestartAMachineFunc is not set")
		}

		logger := zaptest.NewLogger(t)
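
		// collect the control plane and worker machine set nodes of the cluster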
		cpIDs := rtestutils.ResourceIDs[*omni.MachineSetNode](ctx, t, st, state.WithLabelQuery(
			resource.LabelEqual(omni.LabelCluster, clusterName),
			resource.LabelExists(omni.LabelControlPlaneRole),
		))

		require.NotEmpty(t, cpIDs, "no control plane nodes found")

		workerIDs := rtestutils.ResourceIDs[*omni.MachineSetNode](ctx, t, st, state.WithLabelQuery(
			resource.LabelEqual(omni.LabelCluster, clusterName),
			resource.LabelExists(omni.LabelWorkerRole),
		))

		require.NotEmpty(t, workerIDs, "no worker nodes found")
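
		// freezing the control plane makes kube-apiserver unreachable, so the teardown of the worker below
		// cannot remove its Kubernetes node - this is exactly what the node audit has to clean up later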
logger.Info("freeze control plane")
freezeMachinesOfType(ctx, t, st, clusterName, options.FreezeAMachineFunc, omni.LabelControlPlaneRole)
workerID := workerIDs[0]
workerIdentity, err := safe.StateGetByID[*omni.ClusterMachineIdentity](ctx, st, workerID)
require.NoError(t, err)
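
		// remember the worker's Kubernetes node name to later verify that the node gets removed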
		workerNodeName := workerIdentity.TypedSpec().Value.Nodename

		logger.Info("freeze the worker node", zap.String("id", workerID))

		err = options.FreezeAMachineFunc(ctx, workerID)
		require.NoError(t, err)

		logger.Info("force delete & wipe the worker node", zap.String("id", workerID))

		wipeMachine(ctx, t, st, workerID, options.WipeAMachineFunc)
		// Assert that the ClusterMachine is deleted.
		// Here, the ClusterMachineTeardownController will fail to remove the node from Kubernetes, as the control plane is frozen,
		// but it should not block the deletion of the ClusterMachine resource.
		rtestutils.AssertNoResource[*omni.ClusterMachine](ctx, t, st, workerID)
logger.Info("wake the control plane back up")
for _, id := range cpIDs {
require.NoError(t, options.RestartAMachineFunc(ctx, id))
}
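
		// with kube-apiserver reachable again, the node audit is expected to remove the orphaned node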
		kubernetesClient := getKubernetesClient(ctx, t, options.omniClient.Management(), clusterName)

		logger.Info("assert that the node is removed from Kubernetes due to node audit")
		count := 0

		require.EventuallyWithT(t, func(collect *assert.CollectT) {
			require.NoError(collect, ctx.Err()) // if the context is done, fail immediately

			count++

			log := count%6 == 0 // log at most once every 30 seconds
			if log {
				logger.Info("list nodes in Kubernetes to check if the worker node is removed")
			}

			nodeList, listErr := kubernetesClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
			if !assert.NoError(collect, listErr) && log {
				logger.Error("failed to list nodes in Kubernetes", zap.Error(listErr))
			}

			nodeNames := make([]string, 0, len(nodeList.Items))

			for _, k8sNode := range nodeList.Items {
				nodeNames = append(nodeNames, k8sNode.Name)
			}

			if !assert.NotContains(collect, nodeNames, workerNodeName, "worker node should not be present in the list of nodes in Kubernetes") && log {
				logger.Error("worker node is still present in the list of nodes in Kubernetes", zap.String("node", workerNodeName), zap.Strings("nodes", nodeNames))
			}
		}, 10*time.Minute, 5*time.Second)
	}
}