mirror of
https://github.com/siderolabs/talos.git
synced 2025-10-25 22:41:10 +02:00
Fixes #7137

The `umount` syscall might hang "forever" if the underlying network filesystem endpoint is down.

To be on the safe side, add a timeout around unmount operations, and try to umount with force as a last resort.

Sample log:

```
[14795.458779] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/plugins/kubernetes.io/csi/rook-ceph.rbd.csi.ceph.com/dbe8d7f58e21d06cbef1ae0849317661eba4e82776722e7db5c65194ad73e916/globalmount/0001-0009-rook-ceph-0000000000000001-1051beb3-8d7a-4291-bf45-5711c13523d1
[14795.459797] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount
[14795.460555] EXT4-fs (rbd0): unmounting filesystem.
[14813.461319] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount is taking longer than expected, still waiting for 1m11.999162834s
[14831.460813] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount is taking longer than expected, still waiting for 53.999567033s
[14849.461336] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount is taking longer than expected, still waiting for 35.998979117s
[14867.460748] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount is taking longer than expected, still waiting for 17.999502128s
[14885.461123] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount with force
[14885.462395] [talos] ignoring unmount error /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount: invalid argument
[14885.463529] [talos] task unmountPodMounts (2/2): unmounting /var/run/netns/cni-0888dc71-ba9e-af8a-d322-074f654561e5
[14885.464267] [talos] task unmountPodMounts (2/2): done, 1m30.028862262s
```

Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
78 lines
1.7 KiB
Go
78 lines
1.7 KiB
Go
// This Source Code Form is subject to the terms of the Mozilla Public
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
package mount
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log"
|
|
"time"
|
|
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
func unmountLoop(ctx context.Context, logger *log.Logger, target string, flags int, timeout time.Duration, extraMessage string) (bool, error) {
|
|
errCh := make(chan error, 1)
|
|
|
|
go func() {
|
|
errCh <- unix.Unmount(target, flags)
|
|
}()
|
|
|
|
start := time.Now()
|
|
|
|
progessTicker := time.NewTicker(timeout / 5)
|
|
defer progessTicker.Stop()
|
|
|
|
unmountLoop:
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return true, ctx.Err()
|
|
case err := <-errCh:
|
|
return true, err
|
|
case <-progessTicker.C:
|
|
timeLeft := timeout - time.Since(start)
|
|
|
|
if timeLeft <= 0 {
|
|
break unmountLoop
|
|
}
|
|
|
|
if logger != nil {
|
|
logger.Printf("unmounting %s%s is taking longer than expected, still waiting for %s", target, extraMessage, timeLeft)
|
|
}
|
|
}
|
|
}
|
|
|
|
return false, nil
|
|
}
|
|
|
|
// SafeUnmount unmounts the target path, first without force, then with force if the first attempt fails.
|
|
//
|
|
// It makes sure that unmounting has a finite operation timeout.
|
|
func SafeUnmount(ctx context.Context, logger *log.Logger, target string) error {
|
|
const (
|
|
unmountTimeout = 90 * time.Second
|
|
unmountForceTimeout = 10 * time.Second
|
|
)
|
|
|
|
ok, err := unmountLoop(ctx, logger, target, 0, unmountTimeout, "")
|
|
|
|
if ok {
|
|
return err
|
|
}
|
|
|
|
if logger != nil {
|
|
logger.Printf("unmounting %s with force", target)
|
|
}
|
|
|
|
ok, err = unmountLoop(ctx, logger, target, unix.MNT_FORCE, unmountTimeout, " with force flag")
|
|
|
|
if ok {
|
|
return err
|
|
}
|
|
|
|
return fmt.Errorf("unmounting %s with force flag timed out", target)
|
|
}
|