diff --git a/nsenter/event.go b/nsenter/event.go index aa779ca..e7481a5 100644 --- a/nsenter/event.go +++ b/nsenter/event.go @@ -640,30 +640,17 @@ func (e *NSenterEvent) SendRequest() error { return errors.New("Error copying payload to pipe") } - // Wait for sysbox-fs' first child process to finish. - status, err := cmd.Process.Wait() - if err != nil { - logrus.Warnf("Error waiting for sysbox-fs first child process: %d, status: %s, error: %s", - cmd.Process.Pid, status.String(), err) - if !e.Async { - e.reaper.nsenterReapReq() - } - return err - } - if !status.Success() { - logrus.Warnf("Sysbox-fs first child process error status: %s, pid: %d", - status.String(), cmd.Process.Pid) - if !e.Async { - e.reaper.nsenterReapReq() - } - return errors.New("Error waiting for sysbox-fs first child process") - } - - // Receive sysbox-fs' first-child pid. + // Receive sysbox-fs' first-child pid. This must happen before waiting on + // the PARENT process, because the PARENT writes the pid to the pipe before + // exiting. Reading the pipe first avoids holding an RLock while blocked on + // Process.Wait() for a child whose exit may be delayed by fuse_flush. var pid pid decoder := json.NewDecoder(e.parentPipe) if err := decoder.Decode(&pid); err != nil { logrus.Warnf("Error receiving first-child pid: %s", err) + if !e.Async { + e.reaper.nsenterReapReq() + } return errors.New("Error receiving first-child pid") } @@ -673,9 +660,12 @@ func (e *NSenterEvent) SendRequest() error { return err } - // Wait for sysbox-fs' second child process to finish. Ignore the error in - // case the child has already been reaped for any reason. - _, _ = firstChildProcess.Wait() + // Wait for the PARENT and CHILD nsenter processes asynchronously. Their + // exits can be delayed by the kernel's fuse_flush on inherited FUSE file + // descriptors; waiting synchronously here would hold the reaper RLock and + // block both the reaper and new FUSE handlers. + go cmd.Process.Wait() + go firstChildProcess.Wait() // Sysbox-fs' third child (grand-child) process remains and will enter the // go runtime. This is the nsenter agent process. @@ -745,7 +735,7 @@ func (e *NSenterEvent) SendRequest() error { return ierr } - e.Process.Wait() + e.reaper.reapProcessAsync(e.Process) return nil } diff --git a/nsenter/reaper.go b/nsenter/reaper.go index c42cf97..fd7287a 100644 --- a/nsenter/reaper.go +++ b/nsenter/reaper.go @@ -17,6 +17,7 @@ package nsenter import ( + "os" "sync" "syscall" "time" @@ -56,6 +57,17 @@ func (zr *zombieReaper) nsenterReapReq() { } } +// reapProcessAsync waits for the given process in a background goroutine +// and triggers a reaper sweep when it exits. Use this at the end of an +// nsenter operation so the caller can release the RLock without blocking +// on a child whose exit may be delayed by fuse_flush. +func (zr *zombieReaper) reapProcessAsync(p *os.Process) { + go func() { + p.Wait() + zr.nsenterReapReq() + }() +} + // Go-routine that performs reaping func reaper(signal chan bool, mu *sync.RWMutex) { var wstatus syscall.WaitStatus @@ -63,30 +75,26 @@ func reaper(signal chan bool, mu *sync.RWMutex) { for { <-signal - // Without this delay, sysbox-fs sometimes hangs the FUSE request that generates an - // nsenter event that requires reaping. It's not clear why, but the tell-tale sign - // of the hang is that the reaper is signaled but finds nothing to reap. This delay - // mitigates this condition and the reaper finds something to reap. - // - // The delay chosen is one that allows nsenter agents to complete their tasks before - // reaping occurs. Since the reaper runs in its own goroutine, this delay only - // affects it (there is no undesired side-effect on nsenters). - - time.Sleep(time.Second) - + // Use TryLock instead of Lock to avoid blocking subsequent RLock + // callers. Go's sync.RWMutex implements writer-preference: a pending + // Lock() blocks all new RLock() calls. If an nsenter child triggers + // a FUSE request back to sysbox-fs, the FUSE handler needs RLock to + // start a new nsenter. A blocking Lock() here would prevent that + // RLock, causing a deadlock. TryLock avoids this by simply skipping + // the reap cycle when nsenters are active + for !mu.TryLock() { + time.Sleep(time.Millisecond * 100) + } for { - mu.Lock() - // WNOHANG: if there is no child to reap, don't block wpid, err := syscall.Wait4(-1, &wstatus, syscall.WNOHANG, nil) if err != nil || wpid == 0 { logrus.Infof("reaper: nothing to reap") - mu.Unlock() break } logrus.Infof("reaper: reaped pid %d", wpid) - mu.Unlock() } + mu.Unlock() } } diff --git a/nsenter/reaper_test.go b/nsenter/reaper_test.go new file mode 100644 index 0000000..d19a664 --- /dev/null +++ b/nsenter/reaper_test.go @@ -0,0 +1,101 @@ +package nsenter + +import ( + "os/exec" + "syscall" + "testing" + "time" +) + +// TestReaperDoesNotBlockNewNsenter verifies that signaling the zombie reaper +// while an nsenter is active does not prevent a new nsenter from starting. +// +// This is a regression test for a deadlock caused by Go's sync.RWMutex +// writer-preference: when the reaper's Lock() was pending (waiting for an +// active nsenter's RLock to be released), all new RLock() callers — including +// FUSE handlers that needed to start their own nsenter — were blocked. If the +// active nsenter depended on one of those FUSE handlers to complete, the +// system deadlocked. +// +// The fix replaced Lock() with TryLock() in the reaper goroutine so it never +// registers a pending writer. +func TestReaperDoesNotBlockNewNsenter(t *testing.T) { + zr := newZombieReaper() + + // Simulate an active nsenter holding the read lock. + zr.nsenterStarted() + + // Signal the reaper multiple times over a few seconds to ensure at least + // one signal arrives after the reaper's internal delay and the goroutine + // reaches its lock attempt. + for i := 0; i < 5; i++ { + zr.nsenterReapReq() + time.Sleep(500 * time.Millisecond) + } + + // Try to start a second nsenter from a new goroutine. This calls RLock(). + // If the reaper registered a pending writer via Lock(), this blocks + // indefinitely due to writer-preference. + done := make(chan struct{}) + go func() { + zr.nsenterStarted() + zr.nsenterEnded() + close(done) + }() + + select { + case <-done: + // New nsenter started successfully — no writer-preference deadlock. + case <-time.After(3 * time.Second): + t.Fatal("nsenterStarted() blocked: reaper's pending write lock is starving new readers") + } + + zr.nsenterEnded() +} + +// TestReapProcessAsyncReleasesCallerPromptly verifies that reapProcessAsync +// does not block the caller. This is a regression test for a deadlock where +// SendRequest() called Process.Wait() synchronously while holding the reaper +// RLock. If the child's exit was delayed (e.g., by the kernel's fuse_flush on +// inherited FUSE file descriptors), the RLock was held indefinitely, preventing +// both the reaper and new FUSE handlers from making progress. +// +// The fix moved Process.Wait() into reapProcessAsync, which waits in a +// background goroutine so the caller can release the RLock immediately. +func TestReapProcessAsyncReleasesCallerPromptly(t *testing.T) { + cmd := exec.Command("sleep", "3") + if err := cmd.Start(); err != nil { + t.Fatalf("failed to start child: %v", err) + } + defer cmd.Process.Kill() + + zr := newZombieReaper() + + // Simulate SendRequest's locking pattern: hold RLock, schedule async + // reap, release RLock. + zr.nsenterStarted() + zr.reapProcessAsync(cmd.Process) + zr.nsenterEnded() + + // The reaper must be able to acquire Lock immediately. If + // reapProcessAsync blocked the caller (old behavior), the RLock would + // still be held and this would time out. + locked := make(chan struct{}) + go func() { + zr.mu.Lock() + zr.mu.Unlock() + close(locked) + }() + + select { + case <-locked: + // RLock was released before the child exited. + case <-time.After(2 * time.Second): + t.Fatal("reaper Lock blocked: reapProcessAsync is not releasing the caller promptly") + } + + // The child must still be alive, proving we did not wait synchronously. + if err := cmd.Process.Signal(syscall.Signal(0)); err != nil { + t.Fatalf("child exited prematurely: reapProcessAsync should not wait synchronously: %v", err) + } +}