Andrey Smirnov cbf6dc1009
fix: set timeout for unmount calls
Fixes #7137

The `umount` syscall might hang "forever" if the underlying network
filesystem endpoint is down.

To be on the safe side, add a timeout around unmount operations, and try
to umount with force as a last resort.

Sample log:

```
[14795.458779] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/plugins/kubernetes.io/csi/rook-ceph.rbd.csi.ceph.com/dbe8d7f58e21d06cbef1ae0849317661eba4e82776722e7db5c65194ad73e916/globalmount/0001-0009-rook-ceph-0000000000000001-1051beb3-8d7a-4291-bf45-5711c13523d1
[14795.459797] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount
[14795.460555] EXT4-fs (rbd0): unmounting filesystem.
[14813.461319] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount is taking longer than expected, still waiting for 1m11.999162834s
[14831.460813] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount is taking longer than expected, still waiting for 53.999567033s
[14849.461336] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount is taking longer than expected, still waiting for 35.998979117s
[14867.460748] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount is taking longer than expected, still waiting for 17.999502128s
[14885.461123] [talos] task unmountPodMounts (2/2): unmounting /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount with force
[14885.462395] [talos] ignoring unmount error /var/lib/kubelet/pods/f3f4d789-7f48-4dd9-9ef5-649b002c8f9c/volumes/kubernetes.io~csi/pvc-a4e72749-a8a1-43d9-9152-5bc1f757c924/mount: invalid argument
[14885.463529] [talos] task unmountPodMounts (2/2): unmounting /var/run/netns/cni-0888dc71-ba9e-af8a-d322-074f654561e5
[14885.464267] [talos] task unmountPodMounts (2/2): done, 1m30.028862262s
```

Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
2023-05-03 23:32:23 +04:00

124 lines
2.2 KiB
Go

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
package mount
import (
"bufio"
"fmt"
"log"
"os"
"strings"
"time"
"golang.org/x/sys/unix"
)
// unmountWithTimeout calls unix.Unmount(target, flags) in a separate goroutine
// and waits at most timeout for it to return.
//
// It returns the result of unix.Unmount if that arrives before the timer
// fires, otherwise an error stating that the unmount timed out. The unmount
// call itself is not interrupted on timeout; the result channel has capacity
// one so the goroutine can still deliver its result and exit once the call
// eventually returns, even though nobody is receiving anymore.
func unmountWithTimeout(target string, flags int, timeout time.Duration) error {
	done := make(chan error, 1)

	go func(path string, unmountFlags int) {
		done <- unix.Unmount(path, unmountFlags)
	}(target, flags)

	deadline := time.NewTimer(timeout)
	defer deadline.Stop()

	select {
	case result := <-done:
		return result
	case <-deadline.C:
		return fmt.Errorf("unmounting %s timed out after %s", target, timeout)
	}
}
// UnmountAll attempts to unmount all the mounted filesystems via "self" mountinfo.
//
// Only entries with a non-empty mount point whose mount source starts with
// "/dev/" are unmounted. The mount table is re-read on every attempt, and each
// individual unmount is given one second before it is counted as failed.
//
// The only error returned is a failure to read the mount table; unmounts that
// still fail after the last attempt are logged but not reported to the caller.
func UnmountAll() error {
	// Maximum number of passes over the mount table; passes are paced by a
	// one-second ticker.
	const attempts = 10

	pace := time.NewTicker(time.Second)
	defer pace.Stop()

	for attempt := 0; attempt < attempts; attempt++ {
		entries, err := readMountInfo()
		if err != nil {
			return err
		}

		var failed int

		for _, entry := range entries {
			if entry.MountPoint == "" || !strings.HasPrefix(entry.MountSource, "/dev/") {
				continue
			}

			if unmountErr := unmountWithTimeout(entry.MountPoint, 0, time.Second); unmountErr != nil {
				log.Printf("failed unmounting %s: %s", entry.MountPoint, unmountErr)

				failed++

				continue
			}

			log.Printf("unmounted %s (%s)", entry.MountPoint, entry.MountSource)
		}

		// Everything that could be unmounted is gone; nothing left to retry.
		if failed == 0 {
			return nil
		}

		log.Printf("retrying %d unmount operations...", failed)

		<-pace.C
	}

	return nil
}
// mountInfo holds the two fields of a /proc/self/mountinfo entry that this
// package uses.
type mountInfo struct {
	// MountPoint is the fifth whitespace-separated field before the " - "
	// separator; empty if the line has fewer than five such fields.
	MountPoint string
	// MountSource is the second whitespace-separated field after the " - "
	// separator; empty if the line has fewer than two such fields.
	MountSource string
}

// parseMountInfoLine extracts the mount point and mount source from one mountinfo line.
//
// It reports false when the line contains no " - " separator. Fields that are
// missing from an otherwise well-formed line are left empty instead of being
// treated as an error.
func parseMountInfoLine(line string) (mountInfo, bool) {
	parts := strings.SplitN(line, " - ", 2)
	if len(parts) < 2 {
		return mountInfo{}, false
	}

	var info mountInfo

	if pre := strings.Fields(parts[0]); len(pre) >= 5 {
		info.MountPoint = pre[4]
	}

	// The source is the second field after the separator, so at least two
	// fields must be present; checking for one would panic on post[1].
	if post := strings.Fields(parts[1]); len(post) >= 2 {
		info.MountSource = post[1]
	}

	return info, true
}

// readMountInfo reads /proc/self/mountinfo and returns one mountInfo per line
// that contains the " - " separator; other lines are skipped.
//
// It returns an error if the file cannot be opened or if scanning it fails.
func readMountInfo() ([]mountInfo, error) {
	f, err := os.Open("/proc/self/mountinfo")
	if err != nil {
		return nil, err
	}

	defer f.Close() //nolint:errcheck

	var mounts []mountInfo

	scanner := bufio.NewScanner(f)

	for scanner.Scan() {
		if info, ok := parseMountInfoLine(scanner.Text()); ok {
			mounts = append(mounts, info)
		}
	}

	return mounts, scanner.Err()
}