Mirror of https://github.com/siderolabs/omni.git (synced 2025-08-06 01:27:02 +02:00)
All test modules were moved under the `integration` build tag and now live in the `internal/integration` folder, so there is no longer a `cmd/integration-test` executable. The new Kres version builds the same executable from the tests directory instead. All Omni-related flags were renamed, for example `--endpoint` -> `--omni.endpoint`.

Two more functional changes:
- Enabled `--test.failfast` for all test runs.
- Removed the finalizers that used to run when a test failed.

Both changes should make test failures easier to understand: Talos node logs won't be cluttered by the finalizer tearing down the cluster.

Fixes: https://github.com/siderolabs/omni/issues/1171

Signed-off-by: Artem Chernyshev <artem.chernyshev@talos-systems.com>
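For reference, a rough sketch of the new build-and-run flow (the output name and endpoint below are placeholders; only the `integration` tag, `--test.failfast`, and the `--omni.` flag prefix come from this change):

    go test -c -tags integration -o omni-integration-test ./internal/integration
    ./omni-integration-test --test.failfast --omni.endpoint=https://<omni-instance>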
121 lines
4.6 KiB
Go
// Copyright (c) 2025 Sidero Labs, Inc.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.

//go:build integration

package integration_test
import (
	"context"
	"testing"
	"time"

	"github.com/cosi-project/runtime/pkg/resource"
	"github.com/cosi-project/runtime/pkg/resource/rtestutils"
	"github.com/cosi-project/runtime/pkg/safe"
	"github.com/cosi-project/runtime/pkg/state"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/zap"
	"go.uber.org/zap/zaptest"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/siderolabs/omni/client/pkg/omni/resources/omni"
)

// AssertKubernetesNodeAudit tests the Kubernetes node audit feature (KubernetesNodeAuditController) by doing the following:
// 1. Freeze the whole control plane, so that kube-apiserver won't be accessible to the ClusterMachineTeardownController
//    and it won't be able to remove the node from Kubernetes at the moment of the node deletion.
// 2. Freeze & force-delete a worker node. It won't be removed from Kubernetes due to the control plane being frozen.
// 3. Assert that the ClusterMachine resource is deleted - the ClusterMachineTeardownController did not block its deletion
//    despite failing to remove the node from Kubernetes.
// 4. Wake the control plane back up.
// 5. Assert that the worker node eventually gets removed from Kubernetes due to node audit.
func AssertKubernetesNodeAudit(ctx context.Context, clusterName string, options *TestOptions) TestFunc {
	st := options.omniClient.Omni().State()

	return func(t *testing.T) {
		if options.FreezeAMachineFunc == nil || options.RestartAMachineFunc == nil {
			t.Skip("skip the test as FreezeAMachineFunc or RestartAMachineFunc is not set")
		}

		logger := zaptest.NewLogger(t)

		cpIDs := rtestutils.ResourceIDs[*omni.MachineSetNode](ctx, t, st, state.WithLabelQuery(
			resource.LabelEqual(omni.LabelCluster, clusterName),
			resource.LabelExists(omni.LabelControlPlaneRole),
		))
		require.NotEmpty(t, cpIDs, "no control plane nodes found")

		workerIDs := rtestutils.ResourceIDs[*omni.MachineSetNode](ctx, t, st, state.WithLabelQuery(
			resource.LabelEqual(omni.LabelCluster, clusterName),
			resource.LabelExists(omni.LabelWorkerRole),
		))
		require.NotEmpty(t, workerIDs, "no worker nodes found")

		logger.Info("freeze control plane")

		freezeMachinesOfType(ctx, t, st, clusterName, options.FreezeAMachineFunc, omni.LabelControlPlaneRole)

		workerID := workerIDs[0]

		workerIdentity, err := safe.StateGetByID[*omni.ClusterMachineIdentity](ctx, st, workerID)
		require.NoError(t, err)

		workerNodeName := workerIdentity.TypedSpec().Value.Nodename

		logger.Info("freeze the worker node", zap.String("id", workerID))

		err = options.FreezeAMachineFunc(ctx, workerID)
		require.NoError(t, err)

		logger.Info("force delete & wipe the worker node", zap.String("id", workerID))

		wipeMachine(ctx, t, st, workerID, options.WipeAMachineFunc)

		// assert that the ClusterMachine is deleted.
		// here, the ClusterMachineTeardownController will fail to remove the node from Kubernetes, as the control plane is frozen.
		// but it should not block the deletion of the ClusterMachine resource.
		rtestutils.AssertNoResource[*omni.ClusterMachine](ctx, t, st, workerID)

		logger.Info("wake the control plane back up")

		for _, id := range cpIDs {
			require.NoError(t, options.RestartAMachineFunc(ctx, id))
		}

		kubernetesClient := getKubernetesClient(ctx, t, options.omniClient.Management(), clusterName)

		logger.Info("assert that the node is removed from Kubernetes due to node audit")

		count := 0

		require.EventuallyWithT(t, func(collect *assert.CollectT) {
			require.NoError(collect, ctx.Err()) // if the context is done, fail immediately

			count++

			log := count%6 == 0 // log at most once every 30 seconds

			if log {
				logger.Info("list nodes in Kubernetes to check if the worker node is removed")
			}

			nodeList, listErr := kubernetesClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
			if !assert.NoError(collect, listErr) {
				if log {
					logger.Error("failed to list nodes in Kubernetes", zap.Error(listErr))
				}

				return // nodeList is nil when listing fails, so don't dereference it below
			}

			nodeNames := make([]string, 0, len(nodeList.Items))

			for _, k8sNode := range nodeList.Items {
				nodeNames = append(nodeNames, k8sNode.Name)
			}

			if !assert.NotContains(collect, nodeNames, workerNodeName, "worker node should not be present in the list of nodes in Kubernetes") && log {
				logger.Error("worker node is still present in the list of nodes in Kubernetes", zap.String("node", workerNodeName), zap.Strings("nodes", nodeNames))
			}
		}, 10*time.Minute, 5*time.Second)
	}
}
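
// Illustrative wiring sketch: AssertKubernetesNodeAudit returns a TestFunc, so a suite
// would typically register it as a subtest roughly like the commented call below. The
// subtest name and the way ctx, clusterName and options are obtained are assumptions
// about the surrounding harness, not taken from this file.
//
//	t.Run("KubernetesNodeAudit", AssertKubernetesNodeAudit(ctx, clusterName, options))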