mirror of
https://github.com/siderolabs/talos.git
synced 2025-10-28 23:11:37 +01:00
Fixes #7137

The `umount` syscall might hang "forever" if the underlying network filesystem endpoint is down.

To be on the safe side, add a timeout around unmount operations, and try to umount with force as a last resort.

Sample log:

```
[14795.458779] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/plugins/kubernetes.io/csi/rook-ceph.rbd.csi.ceph.com/dbe8d7f58e21d06cbef1ae0849317661eba4e82776722e7db5c65194ad73e916/globalmount/0001-0009-rook-ceph-0000000000000001-1051beb3-8d7a-4291-bf45-5711c13523d1
[14795.459797] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount
[14795.460555] EXT4-fs (rbd0): unmounting filesystem.
[14813.461319] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount is taking longer than expected, still waiting for 1m11.999162834s
[14831.460813] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount is taking longer than expected, still waiting for 53.999567033s
[14849.461336] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount is taking longer than expected, still waiting for 35.998979117s
[14867.460748] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount is taking longer than expected, still waiting for 17.999502128s
[14885.461123] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount with force
[14885.462395] [talos] ignoring unmount error /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount: invalid argument
[14885.463529] [talos] task unmountPodMounts (2/2): unmounting /var/run/netns/cni-0888dc71-ba9e-af8a-d322-074f654561e5
[14885.464267] [talos] task unmountPodMounts (2/2): done, 1m30.028862262s
```

Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
124 lines
2.2 KiB
Go
124 lines
2.2 KiB
Go
// This Source Code Form is subject to the terms of the Mozilla Public
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
package mount
|
|
|
|
import (
|
|
"bufio"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"strings"
|
|
"time"
|
|
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
func unmountWithTimeout(target string, flags int, timeout time.Duration) error {
|
|
errCh := make(chan error, 1)
|
|
|
|
go func() {
|
|
errCh <- unix.Unmount(target, flags)
|
|
}()
|
|
|
|
timer := time.NewTimer(timeout)
|
|
defer timer.Stop()
|
|
|
|
select {
|
|
case <-timer.C:
|
|
return fmt.Errorf("unmounting %s timed out after %s", target, timeout)
|
|
case err := <-errCh:
|
|
return err
|
|
}
|
|
}
|
|
|
|
// UnmountAll attempts to unmount all the mounted filesystems via "self" mountinfo.
|
|
func UnmountAll() error {
|
|
// timeout in seconds
|
|
const timeout = 10
|
|
|
|
ticker := time.NewTicker(time.Second)
|
|
defer ticker.Stop()
|
|
|
|
for i := 0; i < timeout; i++ {
|
|
mounts, err := readMountInfo()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
failedUnmounts := 0
|
|
|
|
for _, mountInfo := range mounts {
|
|
if mountInfo.MountPoint == "" {
|
|
continue
|
|
}
|
|
|
|
if strings.HasPrefix(mountInfo.MountSource, "/dev/") {
|
|
err = unmountWithTimeout(mountInfo.MountPoint, 0, time.Second)
|
|
|
|
if err == nil {
|
|
log.Printf("unmounted %s (%s)", mountInfo.MountPoint, mountInfo.MountSource)
|
|
} else {
|
|
log.Printf("failed unmounting %s: %s", mountInfo.MountPoint, err)
|
|
failedUnmounts++
|
|
}
|
|
}
|
|
}
|
|
|
|
if failedUnmounts == 0 {
|
|
break
|
|
}
|
|
|
|
log.Printf("retrying %d unmount operations...", failedUnmounts)
|
|
|
|
<-ticker.C
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// mountInfo holds the two fields of a mountinfo line that this package uses:
// MountPoint is the path the filesystem is mounted at, and MountSource is the
// source of the mount (for example a "/dev/..." device path). Either field is
// left empty when the corresponding column is missing from the parsed line.
type mountInfo struct {
	MountPoint, MountSource string
}
|
|
|
|
func readMountInfo() ([]mountInfo, error) {
|
|
f, err := os.Open("/proc/self/mountinfo")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
defer f.Close() //nolint:errcheck
|
|
|
|
var mounts []mountInfo
|
|
|
|
scanner := bufio.NewScanner(f)
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
|
|
parts := strings.SplitN(line, " - ", 2)
|
|
|
|
if len(parts) < 2 {
|
|
continue
|
|
}
|
|
|
|
var mntInfo mountInfo
|
|
|
|
pre := strings.Fields(parts[0])
|
|
post := strings.Fields(parts[1])
|
|
|
|
if len(pre) >= 5 {
|
|
mntInfo.MountPoint = pre[4]
|
|
}
|
|
|
|
if len(post) >= 1 {
|
|
mntInfo.MountSource = post[1]
|
|
}
|
|
|
|
mounts = append(mounts, mntInfo)
|
|
}
|
|
|
|
return mounts, scanner.Err()
|
|
}
|