diff --git a/internal/app/machined/main.go b/internal/app/machined/main.go index 127b5bdf6..e1e9743f7 100644 --- a/internal/app/machined/main.go +++ b/internal/app/machined/main.go @@ -99,8 +99,7 @@ func syncNonVolatileStorageBuffers() { func handle(ctx context.Context, err error) { rebootCmd := int(emergency.RebootCmd.Load()) - var rebootErr runtime.RebootError - if errors.As(err, &rebootErr) { + if rebootErr, ok := errors.AsType[runtime.RebootError](err); ok { // not a failure, but wrapped reboot command rebootCmd = rebootErr.Cmd @@ -195,7 +194,7 @@ func run() error { } //nolint:gocyclo -func runEntrypoint(ctx context.Context, c *v1alpha1runtime.Controller) error { +func runEntrypoint(ctx context.Context, c *v1alpha1runtime.Controller) (returnErr error) { errCh := make(chan error) var controllerWaitGroup sync.WaitGroup @@ -217,6 +216,29 @@ func runEntrypoint(ctx context.Context, c *v1alpha1runtime.Controller) error { go runDebugServer(ctx) + // Run emergency volume cleanup on fatal errors before canceling the context, + // so that the COSI controller runtime is still alive and can react to + // volume lifecycle teardown. + defer func() { + if returnErr == nil { + return + } + + if _, ok := errors.AsType[runtime.RebootError](returnErr); ok { //nolint:errcheck + // successful reboot/shutdown sequences already performed volume cleanup + return + } + + log.Printf("running emergency volume cleanup") + + emergencyCtx, emergencyCancel := context.WithTimeout(context.Background(), constants.EmergencyCleanupTimeout) + defer emergencyCancel() + + if e := c.Run(emergencyCtx, runtime.SequenceEmergencyVolumeCleanup, nil, runtime.WithTakeover()); e != nil { + log.Printf("WARNING: emergency volume cleanup failed: %s", e) + } + }() + // Schedule service shutdown on any return. defer system.Services(c.Runtime()).Shutdown(ctx) diff --git a/internal/app/machined/pkg/runtime/sequencer.go b/internal/app/machined/pkg/runtime/sequencer.go index ef4b23c6f..80a3ef7ee 100644 --- a/internal/app/machined/pkg/runtime/sequencer.go +++ b/internal/app/machined/pkg/runtime/sequencer.go @@ -35,30 +35,35 @@ const ( SequenceReset // SequenceReboot is the reboot sequence. SequenceReboot + // SequenceEmergencyVolumeCleanup is the emergency volume cleanup sequence. + SequenceEmergencyVolumeCleanup ) const ( - boot = "boot" - initialize = "initialize" - install = "install" - shutdown = "shutdown" - upgrade = "upgrade" - stageUpgrade = "stageUpgrade" - maintenanceUpgrade = "maintenanceUpgrade" - reset = "reset" - reboot = "reboot" - noop = "noop" + boot = "boot" + initialize = "initialize" + install = "install" + shutdown = "shutdown" + upgrade = "upgrade" + stageUpgrade = "stageUpgrade" + maintenanceUpgrade = "maintenanceUpgrade" + reset = "reset" + reboot = "reboot" + emergencyVolumeCleanup = "emergencyVolumeCleanup" + noop = "noop" ) var sequenceTakeOver = map[Sequence]map[Sequence]struct{}{ SequenceInitialize: { - SequenceMaintenanceUpgrade: {}, - SequenceReboot: {}, + SequenceMaintenanceUpgrade: {}, + SequenceReboot: {}, + SequenceEmergencyVolumeCleanup: {}, }, SequenceBoot: { - SequenceReboot: {}, - SequenceReset: {}, - SequenceUpgrade: {}, + SequenceReboot: {}, + SequenceReset: {}, + SequenceUpgrade: {}, + SequenceEmergencyVolumeCleanup: {}, }, SequenceReboot: { SequenceReboot: {}, @@ -70,7 +75,7 @@ var sequenceTakeOver = map[Sequence]map[Sequence]struct{}{ // String returns the string representation of a `Sequence`. func (s Sequence) String() string { - return [...]string{noop, boot, initialize, install, shutdown, upgrade, stageUpgrade, maintenanceUpgrade, reset, reboot}[s] + return [...]string{noop, boot, initialize, install, shutdown, upgrade, stageUpgrade, maintenanceUpgrade, reset, reboot, emergencyVolumeCleanup}[s] } // CanTakeOver defines sequences priority. @@ -117,6 +122,8 @@ func ParseSequence(s string) (seq Sequence, err error) { seq = SequenceReset case reboot: seq = SequenceReboot + case emergencyVolumeCleanup: + seq = SequenceEmergencyVolumeCleanup case noop: seq = SequenceNoop default: @@ -154,6 +161,7 @@ type Sequencer interface { StageUpgrade(Runtime, *machine.UpgradeRequest) []Phase Upgrade(Runtime, *machine.UpgradeRequest) []Phase MaintenanceUpgrade(Runtime, *machine.UpgradeRequest) []Phase + EmergencyVolumeCleanup(Runtime) []Phase } // EventSequenceStart represents the sequence start event. diff --git a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_controller.go b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_controller.go index 0a9b7eed2..c0b6b8d59 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_controller.go +++ b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_controller.go @@ -417,6 +417,8 @@ func (c *Controller) phases(seq runtime.Sequence, data any) ([]runtime.Phase, er } phases = c.s.Reset(c.r, in) + case runtime.SequenceEmergencyVolumeCleanup: + phases = c.s.EmergencyVolumeCleanup(c.r) case runtime.SequenceNoop: default: return nil, fmt.Errorf("sequence not implemented: %q", seq) diff --git a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_controller_test.go b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_controller_test.go index 2b41511b3..e66907ee0 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_controller_test.go +++ b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_controller_test.go @@ -68,6 +68,10 @@ func (m *mockSequencer) Upgrade(r runtime.Runtime, req *machine.UpgradeRequest) return m.phases[runtime.SequenceUpgrade] } +func (m *mockSequencer) EmergencyVolumeCleanup(r runtime.Runtime) []runtime.Phase { + return m.phases[runtime.SequenceEmergencyVolumeCleanup] +} + func (m *mockSequencer) trackCall(name string, doneCh chan struct{}) func(runtime.Sequence, any) (runtime.TaskExecutionFunc, string) { return func(seq runtime.Sequence, data any) (runtime.TaskExecutionFunc, string) { return func(ctx context.Context, logger *log.Logger, r runtime.Runtime) error { diff --git a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer.go b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer.go index 7c4cf3f3e..56f8e9c11 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer.go +++ b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer.go @@ -514,6 +514,36 @@ func stopAllPhaselist(r runtime.Runtime, enableKexec bool) PhaseList { return phases } +// EmergencyVolumeCleanup is the emergency volume cleanup sequence. +// +// This sequence runs the essential volume cleanup phases with a short timeout +// to properly unmount and tear down volumes before an emergency reboot. +// It is invoked when a fatal error occurs during normal sequence execution. +func (*Sequencer) EmergencyVolumeCleanup(r runtime.Runtime) []runtime.Phase { + phases := PhaseList{} + + switch r.State().Platform().Mode() { //nolint:exhaustive + case runtime.ModeContainer: + // no volume cleanup needed in container mode + default: + phases = phases.Append( + "umount", + UnmountPodMounts, + ).Append( + "unmountBind", + UnmountSystemDiskBindMounts, + ).Append( + "unmountSystem", + UnmountEphemeralPartition, + ).Append( + "volumeFinalize", + TeardownVolumeLifecycle, + ) + } + + return phases +} + func bootPartitionInTargets(targets []runtime.PartitionTarget) bool { for _, target := range targets { if target.GetLabel() == constants.BootPartitionLabel { diff --git a/internal/pkg/mount/v3/all.go b/internal/pkg/mount/v3/all.go index d23b9ba9f..0b6937937 100644 --- a/internal/pkg/mount/v3/all.go +++ b/internal/pkg/mount/v3/all.go @@ -35,6 +35,8 @@ func unmountWithTimeout(target string, flags int, timeout time.Duration) error { } // UnmountAll attempts to unmount all the mounted filesystems via "self" mountinfo. +// +//nolint:gocyclo func UnmountAll() error { // timeout in seconds const timeout = 10 @@ -42,7 +44,7 @@ func UnmountAll() error { ticker := time.NewTicker(time.Second) defer ticker.Stop() - for range timeout { + for iteration := range timeout { mounts, err := readMountInfo() if err != nil { return err @@ -50,6 +52,8 @@ func UnmountAll() error { failedUnmounts := 0 + var failedMountPoints []string + for _, mountInfo := range mounts { if mountInfo.MountPoint == "" { continue @@ -63,6 +67,8 @@ func UnmountAll() error { log.Printf("failed unmounting %s: %s", mountInfo.MountPoint, err) failedUnmounts++ + + failedMountPoints = append(failedMountPoints, mountInfo.MountPoint) } } } @@ -71,6 +77,13 @@ func UnmountAll() error { break } + // Log mount users on first and last failure to help diagnose busy mounts. + if iteration == 0 || iteration == timeout-1 { + for _, mp := range failedMountPoints { + logMountUsers(log.Printf, mp) + } + } + log.Printf("retrying %d unmount operations...", failedUnmounts) <-ticker.C diff --git a/internal/pkg/mount/v3/helpers.go b/internal/pkg/mount/v3/helpers.go index 082a17492..ca0bd3d99 100644 --- a/internal/pkg/mount/v3/helpers.go +++ b/internal/pkg/mount/v3/helpers.go @@ -210,6 +210,7 @@ func PseudoLate(printer func(string, ...any)) Managers { WithTarget("/run"), WithMountAttributes(unix.MOUNT_ATTR_NOSUID|unix.MOUNT_ATTR_NOEXEC|unix.MOUNT_ATTR_RELATIME), WithSelinuxLabel(constants.RunSelinuxLabel), + WithRecursiveUnmount(), WithFsopen( "tmpfs", fsopen.WithStringParameter("mode", "0755"), diff --git a/internal/pkg/mount/v3/point.go b/internal/pkg/mount/v3/point.go index 66d497abe..c020e646e 100644 --- a/internal/pkg/mount/v3/point.go +++ b/internal/pkg/mount/v3/point.go @@ -146,9 +146,17 @@ func (p *Point) Unmount(opts UnmountOptions) error { return nil } - return p.retry(func() error { + err := p.retry(func() error { return SafeUnmount(context.Background(), opts.Printer, p.target, opts.Recursive) }, true) + if err != nil { + logSubmounts(opts.Printer, p.target) + logMountUsers(opts.Printer, p.target) + + return err + } + + return nil } // IsMounted checks if the mount point is mounted by checking the mount on the target. diff --git a/internal/pkg/mount/v3/unmount.go b/internal/pkg/mount/v3/unmount.go index 1cc9c5f76..718279724 100644 --- a/internal/pkg/mount/v3/unmount.go +++ b/internal/pkg/mount/v3/unmount.go @@ -5,18 +5,31 @@ package mount import ( + "bufio" "context" + "errors" "fmt" + "os" + "path/filepath" + "strconv" + "strings" "time" "golang.org/x/sys/unix" ) -func trySyncMount(target string, printer func(string, ...any)) error { - // open the mountpoint directory to get an fd on the fs +func trySyncMount(target string) error { + // Try the directory path first, then fall back to a file mountpoint. fd, err := unix.Open(target, unix.O_RDONLY|unix.O_DIRECTORY|unix.O_CLOEXEC, 0) if err != nil { - return fmt.Errorf("open %q: %w", target, err) + if errors.Is(err, unix.ENOTDIR) { + fd, err = unix.Open(target, unix.O_RDONLY|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("open %q as file: %w", target, err) + } + } else { + return fmt.Errorf("open %q as directory: %w", target, err) + } } defer unix.Close(fd) //nolint:errcheck @@ -25,8 +38,6 @@ func trySyncMount(target string, printer func(string, ...any)) error { return fmt.Errorf("SYS_SYNCFS %q: %w", target, err) } - printer("syncfs(%s) ok", target) - return nil } @@ -34,7 +45,7 @@ func unmountLoop(ctx context.Context, printer func(string, ...any), target strin errCh := make(chan error, 1) // we need to try to sync fs before - if err := trySyncMount(target, printer); err != nil { + if err := trySyncMount(target); err != nil { printer("sync failed: %s", err) } @@ -103,8 +114,6 @@ func SafeUnmount(ctx context.Context, printer func(string, ...any), target strin return err } - logSubmounts(printer, target) - printer("unmounting %s with force", target) ok, err = unmountLoop(ctx, printer, target, unix.MNT_FORCE, unmountForceTimeout, " with force flag") @@ -113,8 +122,6 @@ func SafeUnmount(ctx context.Context, printer func(string, ...any), target strin return err } - logSubmounts(printer, target) - return fmt.Errorf("unmounting %s with force flag timed out", target) } @@ -139,3 +146,109 @@ func logSubmounts(printer func(string, ...any), target string) { printer("submounts on %s: %v", target, submounts) } } + +// logMountUsers scans /proc to find processes that have open file descriptors, +// working directories, or memory-mapped files under the given mount target. +// This helps diagnose "device or resource busy" errors during unmount. +// +//nolint:gocyclo,cyclop +func logMountUsers(printer func(string, ...any), target string) { + entries, err := os.ReadDir("/proc") + if err != nil { + printer("failed to read /proc: %v", err) + + return + } + + targetWithSlash := target + "/" + + found := false + + for _, entry := range entries { + if !entry.IsDir() { + continue + } + + pid, err := strconv.Atoi(entry.Name()) + if err != nil { + continue // not a PID directory + } + + procPath := filepath.Join("/proc", entry.Name()) + + var offendingPaths []string + + // Check cwd. + if cwd, err := os.Readlink(filepath.Join(procPath, "cwd")); err == nil { + if cwd == target || strings.HasPrefix(cwd, targetWithSlash) { + offendingPaths = append(offendingPaths, "cwd="+cwd) + } + } + + // Check root. + if root, err := os.Readlink(filepath.Join(procPath, "root")); err == nil { + if root == target || strings.HasPrefix(root, targetWithSlash) { + offendingPaths = append(offendingPaths, "root="+root) + } + } + + // Check open file descriptors. + if fds, err := os.ReadDir(filepath.Join(procPath, "fd")); err == nil { + for _, fd := range fds { + if link, err := os.Readlink(filepath.Join(procPath, "fd", fd.Name())); err == nil { + if link == target || strings.HasPrefix(link, targetWithSlash) { + offendingPaths = append(offendingPaths, "fd/"+fd.Name()+"="+link) + } + } + } + } + + // Check memory-mapped files. + if f, err := os.Open(filepath.Join(procPath, "maps")); err == nil { + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + // maps format: address perms offset dev inode pathname + // pathname starts after the 5th field + fields := strings.Fields(line) + if len(fields) >= 6 { + mappedPath := fields[len(fields)-1] + if mappedPath == target || strings.HasPrefix(mappedPath, targetWithSlash) { + offendingPaths = append(offendingPaths, "mmap="+mappedPath) + } + } + } + + f.Close() //nolint:errcheck + } + + if len(offendingPaths) == 0 { + continue + } + + found = true + + // Read process identity. + comm := "" + + if data, err := os.ReadFile(filepath.Join(procPath, "comm")); err == nil { + comm = strings.TrimSpace(string(data)) + } + + cmdline := "" + + if data, err := os.ReadFile(filepath.Join(procPath, "cmdline")); err == nil { + // cmdline uses null bytes as separators + cmdline = strings.ReplaceAll(strings.TrimRight(string(data), "\x00"), "\x00", " ") + } + + printer("mount %s held by pid %d (%s) cmdline=[%s]: %s", + target, pid, comm, cmdline, strings.Join(offendingPaths, ", ")) + } + + if !found { + printer("mount %s is busy but no processes found holding it (may be held by kernel references)", target) + } else { + printer("if you see this message, report a bug with the above information to help us identify and fix the issue") + } +} diff --git a/pkg/machinery/constants/constants.go b/pkg/machinery/constants/constants.go index b46fdd873..9ba14525b 100644 --- a/pkg/machinery/constants/constants.go +++ b/pkg/machinery/constants/constants.go @@ -908,6 +908,9 @@ const ( // BootTimeout is the timeout to run all services. BootTimeout = 70 * time.Minute + // EmergencyCleanupTimeout is the timeout to wait for the volume cleanup in case of terminal failure during booting. + EmergencyCleanupTimeout = 30 * time.Second + // FailurePauseTimeout is the timeout for the sequencer failures which can be fixed by updating the machine config. FailurePauseTimeout = 35 * time.Minute