diff --git a/.drone.jsonnet b/.drone.jsonnet
index 148f2b3f0..b7517437d 100644
--- a/.drone.jsonnet
+++ b/.drone.jsonnet
@@ -384,7 +384,7 @@ local ExtensionsStep(with_e2e=true) =
     QEMU_MEMORY_WORKERS: '4096',
     WITH_CONFIG_PATCH_WORKER: '@_out/extensions-patch.json',
     IMAGE_REGISTRY: local_registry,
-    QEMU_EXTRA_DISKS: '1',
+    QEMU_EXTRA_DISKS: '3',
     SHORT_INTEGRATION_TEST: 'yes',
     EXTRA_TEST_ARGS: '-talos.extensions.qemu',
   });
diff --git a/internal/integration/api/extensions_qemu.go b/internal/integration/api/extensions_qemu.go
index 3eaa00bef..36d7875b1 100644
--- a/internal/integration/api/extensions_qemu.go
+++ b/internal/integration/api/extensions_qemu.go
@@ -8,6 +8,7 @@ package api
 
 import (
     "context"
+    "crypto/rand"
     "fmt"
     "io"
     "net"
@@ -271,7 +272,9 @@ func (suite *ExtensionsSuiteQEMU) TestExtensionsISCSI() {
         suite.Require().Contains(stdout, "successful.")
     }
 
-    suite.Assert().True(iscsiTargetExists())
+    suite.Eventually(func() bool {
+        return iscsiTargetExists()
+    }, 5*time.Second, 1*time.Second)
 }
 
 // TestExtensionsNutClient verifies nut client is working.
@@ -416,6 +419,127 @@ func (suite *ExtensionsSuiteQEMU) TestExtensionsStargz() {
     suite.Require().NoError(suite.WaitForPodToBeRunning(suite.ctx, 5*time.Minute, "default", "stargz-hello"))
 }
 
+// TestExtensionsMdADM verifies mdadm is working, udev rules work and the RAID array is still assembled after a reboot.
+func (suite *ExtensionsSuiteQEMU) TestExtensionsMdADM() {
+    node := suite.RandomDiscoveredNodeInternalIP(machine.TypeWorker)
+
+    var mdADMArrayExists bool
+
+    uuid := suite.mdADMScan()
+    if uuid != "" {
+        mdADMArrayExists = true
+    }
+
+    if !mdADMArrayExists {
+        userDisks, err := suite.UserDisks(suite.ctx, node, 4)
+        suite.Require().NoError(err)
+
+        suite.Require().GreaterOrEqual(len(userDisks), 2, "expected at least two user disks with size greater than 4GB to be available")
+
+        _, err = suite.Clientset.CoreV1().Pods("kube-system").Create(suite.ctx, &corev1.Pod{
+            ObjectMeta: metav1.ObjectMeta{
+                Name: "mdadm-create",
+            },
+            Spec: corev1.PodSpec{
+                Containers: []corev1.Container{
+                    {
+                        Name: "mdadm-create",
+                        Image: "alpine",
+                        Command: []string{
+                            "tail",
+                            "-f",
+                            "/dev/null",
+                        },
+                        SecurityContext: &corev1.SecurityContext{
+                            Privileged: pointer.To(true),
+                        },
+                    },
+                },
+                HostNetwork: true,
+                HostPID: true,
+            },
+        }, metav1.CreateOptions{})
+        defer suite.Clientset.CoreV1().Pods("kube-system").Delete(suite.ctx, "mdadm-create", metav1.DeleteOptions{}) //nolint:errcheck
+
+        suite.Require().NoError(err)
+
+        // wait for the pod to be ready
+        suite.Require().NoError(suite.WaitForPodToBeRunning(suite.ctx, 5*time.Minute, "kube-system", "mdadm-create"))
+
+        _, stderr, err := suite.ExecuteCommandInPod(
+            suite.ctx,
+            "kube-system",
+            "mdadm-create",
+            fmt.Sprintf("nsenter --mount=/proc/1/ns/mnt -- mdadm --create --verbose /dev/md0 --metadata=0.90 --level=1 --raid-devices=2 %s", strings.Join(userDisks[:2], " ")),
+        )
+        suite.Require().NoError(err)
+
+        suite.Require().Contains(stderr, "mdadm: array /dev/md0 started.")
+    }
+
+    // now we want to reboot the node and make sure the array is still assembled
+    suite.AssertRebooted(
+        suite.ctx, node, func(nodeCtx context.Context) error {
+            return base.IgnoreGRPCUnavailable(suite.Client.Reboot(nodeCtx))
+        }, 5*time.Minute,
+    )
+
+    suite.Require().NotEmpty(suite.mdADMScan())
+}
+
+func (suite *ExtensionsSuiteQEMU) mdADMScan() string {
+    // create a random suffix for the mdadm-scan pod
+    randomSuffix := make([]byte, 4)
+    _, err := rand.Read(randomSuffix)
+    suite.Require().NoError(err)
+
+    podName := fmt.Sprintf("mdadm-scan-%x", randomSuffix)
+
+    _, err = suite.Clientset.CoreV1().Pods("kube-system").Create(suite.ctx, &corev1.Pod{
+        ObjectMeta: metav1.ObjectMeta{
+            Name: podName,
+        },
+        Spec: corev1.PodSpec{
+            Containers: []corev1.Container{
+                {
+                    Name: podName,
+                    Image: "alpine",
+                    Command: []string{
+                        "tail",
+                        "-f",
+                        "/dev/null",
+                    },
+                    SecurityContext: &corev1.SecurityContext{
+                        Privileged: pointer.To(true),
+                    },
+                },
+            },
+            HostNetwork: true,
+            HostPID: true,
+        },
+    }, metav1.CreateOptions{})
+    defer suite.Clientset.CoreV1().Pods("kube-system").Delete(suite.ctx, podName, metav1.DeleteOptions{}) //nolint:errcheck
+
+    suite.Require().NoError(err)
+
+    // wait for the pod to be ready
+    suite.Require().NoError(suite.WaitForPodToBeRunning(suite.ctx, 5*time.Minute, "kube-system", podName))
+
+    stdout, stderr, err := suite.ExecuteCommandInPod(
+        suite.ctx,
+        "kube-system",
+        podName,
+        "nsenter --mount=/proc/1/ns/mnt -- mdadm --detail --scan",
+    )
+    suite.Require().NoError(err)
+
+    suite.Require().Equal("", stderr)
+
+    stdOutSplit := strings.Split(stdout, " ")
+
+    return strings.TrimPrefix(stdOutSplit[len(stdOutSplit)-1], "UUID=")
+}
+
 // TestExtensionsZFS verifies zfs is working, udev rules work and the pool is mounted on reboot.
 func (suite *ExtensionsSuiteQEMU) TestExtensionsZFS() {
     node := suite.RandomDiscoveredNodeInternalIP(machine.TypeWorker)
@@ -425,21 +549,57 @@ func (suite *ExtensionsSuiteQEMU) TestExtensionsZFS() {
 
     var zfsPoolExists bool
 
-    userDisks, err := suite.UserDisks(suite.ctx, node, 4)
+    _, err := suite.Clientset.CoreV1().Pods("kube-system").Create(suite.ctx, &corev1.Pod{
+        ObjectMeta: metav1.ObjectMeta{
+            Name: "zpool-list",
+        },
+        Spec: corev1.PodSpec{
+            Containers: []corev1.Container{
+                {
+                    Name: "zpool-list",
+                    Image: "alpine",
+                    Command: []string{
+                        "tail",
+                        "-f",
+                        "/dev/null",
+                    },
+                    SecurityContext: &corev1.SecurityContext{
+                        Privileged: pointer.To(true),
+                    },
+                },
+            },
+            HostNetwork: true,
+            HostPID: true,
+        },
+    }, metav1.CreateOptions{})
+    defer suite.Clientset.CoreV1().Pods("kube-system").Delete(suite.ctx, "zpool-list", metav1.DeleteOptions{}) //nolint:errcheck
+
     suite.Require().NoError(err)
 
-    suite.Require().NotEmpty(userDisks, "expected at least one user disk with size greater than 4GB to be available")
+    // wait for the pod to be ready
+    suite.Require().NoError(suite.WaitForPodToBeRunning(suite.ctx, 5*time.Minute, "kube-system", "zpool-list"))
 
-    resp, err := suite.Client.LS(ctx, &machineapi.ListRequest{
-        Root: fmt.Sprintf("/dev/%s1", userDisks[0]),
-    })
+    stdout, stderr, err := suite.ExecuteCommandInPod(
+        suite.ctx,
+        "kube-system",
+        "zpool-list",
+        "nsenter --mount=/proc/1/ns/mnt -- zpool list",
+    )
     suite.Require().NoError(err)
 
-    if _, err = resp.Recv(); err == nil {
+    suite.Require().Equal("", stderr)
+    suite.Require().NotEmpty(stdout)
+
+    if stdout != "no pools available\n" {
         zfsPoolExists = true
     }
 
     if !zfsPoolExists {
+        userDisks, err := suite.UserDisks(suite.ctx, node, 4)
+        suite.Require().NoError(err)
+
+        suite.Require().NotEmpty(userDisks, "expected at least one user disk with size greater than 4GB to be available")
+
         _, err = suite.Clientset.CoreV1().Pods("kube-system").Create(suite.ctx, &corev1.Pod{
             ObjectMeta: metav1.ObjectMeta{
                 Name: "zpool-create",
diff --git a/internal/integration/base/api.go b/internal/integration/base/api.go
index 39972987e..7da9a5bde 100644
--- a/internal/integration/base/api.go
+++ b/internal/integration/base/api.go
@@ -25,6 +25,7 @@ import (
     "github.com/stretchr/testify/suite"
     "google.golang.org/grpc/backoff"
 
+    "github.com/siderolabs/talos/cmd/talosctl/pkg/talos/helpers"
     "github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
     "github.com/siderolabs/talos/pkg/cluster"
     "github.com/siderolabs/talos/pkg/cluster/check"
@@ -454,7 +455,9 @@ func (apiSuite *APISuite) ReadConfigFromNode(nodeCtx context.Context) (config.Pr
     return provider, nil
 }
 
-// UserDisks returns list of user disks on the with size greater than sizeGreaterThanGB.
+// UserDisks returns a list of user disks with size greater than sizeGreaterThanGB which are not already in use (e.g. held by an md RAID array).
+//
+//nolint:gocyclo
 func (apiSuite *APISuite) UserDisks(ctx context.Context, node string, sizeGreaterThanGB int) ([]string, error) {
     nodeCtx := client.WithNodes(ctx, node)
 
@@ -465,13 +468,47 @@ func (apiSuite *APISuite) UserDisks(ctx context.Context, node string, sizeGreate
 
     var disks []string
 
+    blockDeviceInUse := func(deviceName string) (bool, error) {
+        devicePart := strings.Split(deviceName, "/dev/")[1]
+
+        // https://unix.stackexchange.com/questions/111779/how-to-find-out-easily-whether-a-block-device-or-a-part-of-it-is-mounted-someh
+        // listing /sys/block/<device>/holders is an easy way to check whether the block device is already in use, e.g. by an md RAID array
+        stream, err := apiSuite.Client.LS(nodeCtx, &machineapi.ListRequest{
+            Root: fmt.Sprintf("/sys/block/%s/holders", devicePart),
+        })
+        if err != nil {
+            return false, err
+        }
+
+        counter := 0
+
+        if err = helpers.ReadGRPCStream(stream, func(info *machineapi.FileInfo, node string, multipleNodes bool) error {
+            counter++
+
+            return nil
+        }); err != nil {
+            return false, err
+        }
+
+        if counter > 1 {
+            return true, nil
+        }
+
+        return false, nil
+    }
+
     for _, msg := range resp.Messages {
         for _, disk := range msg.Disks {
             if disk.SystemDisk {
                 continue
             }
 
-            if disk.Size > uint64(sizeGreaterThanGB)*1024*1024*1024 {
+            blockDeviceUsed, err := blockDeviceInUse(disk.DeviceName)
+            if err != nil {
+                return nil, err
+            }
+
+            if disk.Size > uint64(sizeGreaterThanGB)*1024*1024*1024 && !blockDeviceUsed {
                 disks = append(disks, disk.DeviceName)
             }
         }