omni/internal/integration/kubernetes_node_audit_test.go
Utku Ozdemir 7c19c318e8
test: improve workload proxying tests
Add many more services and test scenarios to the workload proxying feature tests:
- Use two clusters: a 1+1 and a 1+2 (control plane + worker counts).
- Use multiple nginx workloads.
- Each workload serves its name in its `index.html`, and the test asserts on that body (i.e., we verify that we hit the correct service); a sketch of that assertion follows this list.
- Multiple exposed services per workload.
- Multiple parallel requests per exposed service.
- Toggle the feature off and on, assert service accessibility.
- Toggle an exposed service off and on by removing/re-adding the k8s service annotation, assert accessibility.
- Test explicit prefixes.
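
The per-workload assertion amounts to something like the following minimal sketch. The package name, helper name, and URL handling are illustrative assumptions, not the actual test code:

```go
package workloadproxy_test

import (
	"context"
	"io"
	"net/http"
	"strings"
	"testing"

	"github.com/stretchr/testify/require"
)

// assertExposedServiceServesName is illustrative only: it fetches an exposed
// service URL and asserts that the body is the workload name served from the
// workload's index.html, i.e. that the proxy routed us to the correct backend.
func assertExposedServiceServesName(ctx context.Context, t *testing.T, client *http.Client, url, workloadName string) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	require.NoError(t, err)

	resp, err := client.Do(req)
	require.NoError(t, err)

	defer resp.Body.Close() //nolint:errcheck

	body, err := io.ReadAll(resp.Body)
	require.NoError(t, err)

	require.Equal(t, http.StatusOK, resp.StatusCode)

	// each nginx workload serves its own name, so being routed to the wrong
	// service shows up here as a mismatched response body
	require.Equal(t, workloadName, strings.TrimSpace(string(body)))
}
```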

Additionally:
- Fix two bugs in workload services:
  - Check the cookies before returning 404 for a non-existent exposed service prefix.
  - Add timeouts to the `inmem` proxy transport so that requests cannot hang forever (a sketch of such timeouts follows this list).
- Bring back the logic that saves a support bundle when an integration test fails, and fix its save path.
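
The transport timeouts are along these lines; this is a minimal sketch using standard `net/http` knobs with made-up values, and the real `inmem` transport in Omni may wire this differently:

```go
package workloadproxy

import (
	"net"
	"net/http"
	"time"
)

// newTransportWithTimeouts illustrates the kind of limits meant above: without
// explicit dial, TLS handshake, and response header timeouts, a proxied request
// to an unresponsive backend can block indefinitely. Values are illustrative.
func newTransportWithTimeouts() *http.Transport {
	return &http.Transport{
		DialContext: (&net.Dialer{
			Timeout: 15 * time.Second, // give up on unreachable backends quickly
		}).DialContext,
		TLSHandshakeTimeout:   15 * time.Second,
		ResponseHeaderTimeout: 30 * time.Second, // bound how long we wait for the backend to start answering
		IdleConnTimeout:       90 * time.Second,
	}
}
```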

Signed-off-by: Utku Ozdemir <utku.ozdemir@siderolabs.com>
2025-06-04 17:41:30 +02:00

// Copyright (c) 2025 Sidero Labs, Inc.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.

//go:build integration

package integration_test

import (
	"context"
	"testing"
	"time"

	"github.com/cosi-project/runtime/pkg/resource"
	"github.com/cosi-project/runtime/pkg/resource/rtestutils"
	"github.com/cosi-project/runtime/pkg/safe"
	"github.com/cosi-project/runtime/pkg/state"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/zap"
	"go.uber.org/zap/zaptest"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/siderolabs/omni/client/pkg/omni/resources/omni"
	"github.com/siderolabs/omni/internal/integration/kubernetes"
)

// AssertKubernetesNodeAudit tests the Kubernetes node audit feature (KubernetesNodeAuditController) by doing the following:
// 1. Freeze the whole control plane, so that kube-apiserver won't be accessible to the ClusterMachineTeardownController,
// so it won't be able to remove the node from Kubernetes at the moment of the node deletion.
// 2. Freeze & force-delete a worker node. It won't be removed from Kubernetes due to the control plane being frozen.
// 3. Assert that the ClusterMachine resource is deleted - the ClusterMachineTeardownController did not block its deletion despite failing to remove the node from Kubernetes.
// 4. Wake the control plane back up.
// 5. Assert that the worker node eventually gets removed from Kubernetes due to node audit.
func AssertKubernetesNodeAudit(testCtx context.Context, clusterName string, options *TestOptions) TestFunc {
	st := options.omniClient.Omni().State()

	return func(t *testing.T) {
		ctx := kubernetes.WrapContext(testCtx, t)

		if options.FreezeAMachineFunc == nil || options.RestartAMachineFunc == nil {
			t.Skip("skip the test as FreezeAMachineFunc or RestartAMachineFunc is not set")
		}

		logger := zaptest.NewLogger(t)

		cpIDs := rtestutils.ResourceIDs[*omni.MachineSetNode](ctx, t, st, state.WithLabelQuery(
			resource.LabelEqual(omni.LabelCluster, clusterName),
			resource.LabelExists(omni.LabelControlPlaneRole),
		))

		require.NotEmpty(t, cpIDs, "no control plane nodes found")

		workerIDs := rtestutils.ResourceIDs[*omni.MachineSetNode](ctx, t, st, state.WithLabelQuery(
			resource.LabelEqual(omni.LabelCluster, clusterName),
			resource.LabelExists(omni.LabelWorkerRole),
		))

		require.NotEmpty(t, workerIDs, "no worker nodes found")

		logger.Info("freeze control plane")

		freezeMachinesOfType(ctx, t, st, clusterName, options.FreezeAMachineFunc, omni.LabelControlPlaneRole)

		workerID := workerIDs[0]

		workerIdentity, err := safe.StateGetByID[*omni.ClusterMachineIdentity](ctx, st, workerID)
		require.NoError(t, err)

		workerNodeName := workerIdentity.TypedSpec().Value.Nodename

		logger.Info("freeze the worker node", zap.String("id", workerID))

		err = options.FreezeAMachineFunc(ctx, workerID)
		require.NoError(t, err)

		logger.Info("force delete & wipe the worker node", zap.String("id", workerID))

		wipeMachine(ctx, t, st, workerID, options.WipeAMachineFunc)

		// assert that the ClusterMachine is deleted.
		// here, the ClusterMachineTeardownController will fail to remove the node from Kubernetes, as the control plane is frozen.
		// but it should not block the deletion of the ClusterMachine resource.
		rtestutils.AssertNoResource[*omni.ClusterMachine](ctx, t, st, workerID)

		logger.Info("wake the control plane back up")

		for _, id := range cpIDs {
			require.NoError(t, options.RestartAMachineFunc(ctx, id))
		}

		kubernetesClient := kubernetes.GetClient(ctx, t, options.omniClient.Management(), clusterName)

		logger.Info("assert that the node is removed from Kubernetes due to node audit")

		count := 0

		require.EventuallyWithT(t, func(collect *assert.CollectT) {
			require.NoError(collect, ctx.Err()) // if the context is done, fail immediately

			count++

			log := count%6 == 0 // log at most once every 30 seconds
			if log {
				logger.Info("list nodes in Kubernetes to check if the worker node is removed")
			}
			nodeList, listErr := kubernetesClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
			if !assert.NoError(collect, listErr) {
				if log {
					logger.Error("failed to list nodes in Kubernetes", zap.Error(listErr))
				}

				return // nodeList is nil on error, retry on the next tick
			}
			nodeNames := make([]string, 0, len(nodeList.Items))

			for _, k8sNode := range nodeList.Items {
				nodeNames = append(nodeNames, k8sNode.Name)
			}

			if !assert.NotContains(collect, nodeNames, workerNodeName, "worker node should not be present in the list of nodes in Kubernetes") && log {
				logger.Error("worker node is still present in the list of nodes in Kubernetes", zap.String("node", workerNodeName), zap.Strings("nodes", nodeNames))
			}
		}, 10*time.Minute, 5*time.Second)
	}
}