From 8808abf8d11753c60f4f36641afdd68d02bf525d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?okhowang=28=E7=8E=8B=E6=B2=9B=E6=96=87=29?= Date: Mon, 13 Apr 2026 12:46:25 +0800 Subject: [PATCH 1/2] nsenter: use TryLock in reaper to prevent cascading FUSE deadlock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace mu.Lock() with mu.TryLock() in the zombie reaper goroutine. Go's sync.RWMutex implements writer-preference: a pending Lock() blocks all subsequent RLock() callers. When the reaper's Lock() is pending while an nsenter child holds RLock, all new FUSE Lookup handlers are blocked from acquiring RLock to start their own nsenter processes. If the nsenter child's operation triggers a FUSE request back to sysbox-fs, the FUSE handler cannot proceed, creating a deadlock. TryLock() does not register a pending writer, so FUSE handlers can still acquire RLock. If TryLock fails, the reaper sleep a second to retry. Fixes: nestybox/sysbox#1002 Signed-off-by: okhowang(王沛文) --- nsenter/reaper.go | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/nsenter/reaper.go b/nsenter/reaper.go index c42cf979..73ea6bb7 100644 --- a/nsenter/reaper.go +++ b/nsenter/reaper.go @@ -63,30 +63,26 @@ func reaper(signal chan bool, mu *sync.RWMutex) { for { <-signal - // Without this delay, sysbox-fs sometimes hangs the FUSE request that generates an - // nsenter event that requires reaping. It's not clear why, but the tell-tale sign - // of the hang is that the reaper is signaled but finds nothing to reap. This delay - // mitigates this condition and the reaper finds something to reap. - // - // The delay chosen is one that allows nsenter agents to complete their tasks before - // reaping occurs. Since the reaper runs in its own goroutine, this delay only - // affects it (there is no undesired side-effect on nsenters). - - time.Sleep(time.Second) - + // Use TryLock instead of Lock to avoid blocking subsequent RLock + // callers. Go's sync.RWMutex implements writer-preference: a pending + // Lock() blocks all new RLock() calls. If an nsenter child triggers + // a FUSE request back to sysbox-fs, the FUSE handler needs RLock to + // start a new nsenter. A blocking Lock() here would prevent that + // RLock, causing a deadlock. TryLock avoids this by simply skipping + // the reap cycle when nsenters are active + for !mu.TryLock() { + time.Sleep(time.Millisecond * 100) + } for { - mu.Lock() - // WNOHANG: if there is no child to reap, don't block wpid, err := syscall.Wait4(-1, &wstatus, syscall.WNOHANG, nil) if err != nil || wpid == 0 { logrus.Infof("reaper: nothing to reap") - mu.Unlock() break } logrus.Infof("reaper: reaped pid %d", wpid) - mu.Unlock() } + mu.Unlock() } } From 8ddc7c643626fd3097c6a75976d92808c03ca298 Mon Sep 17 00:00:00 2001 From: Ben Burns <803016+benjamincburns@users.noreply.github.com> Date: Tue, 26 May 2026 04:47:37 +0000 Subject: [PATCH 2/2] nsenter: async child waits to prevent FUSE deadlock SendRequest() held the reaper RLock while synchronously waiting for nsenter child processes via Process.Wait(). If a child's exit was delayed by the kernel's fuse_flush on inherited FUSE file descriptors, the RLock was held indefinitely, preventing both the reaper and new FUSE handlers from making progress. Move the pipe read before the PARENT wait (the PARENT writes the pid before exiting) and replace synchronous Process.Wait() calls with background goroutines so the caller releases the RLock promptly. Fixes: nestybox/sysbox#998 Signed-off-by: Ben Burns <803016+benjamincburns@users.noreply.github.com> --- nsenter/event.go | 38 ++++++---------- nsenter/reaper.go | 12 +++++ nsenter/reaper_test.go | 101 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+), 24 deletions(-) create mode 100644 nsenter/reaper_test.go diff --git a/nsenter/event.go b/nsenter/event.go index aa779caa..e7481a59 100644 --- a/nsenter/event.go +++ b/nsenter/event.go @@ -640,30 +640,17 @@ func (e *NSenterEvent) SendRequest() error { return errors.New("Error copying payload to pipe") } - // Wait for sysbox-fs' first child process to finish. - status, err := cmd.Process.Wait() - if err != nil { - logrus.Warnf("Error waiting for sysbox-fs first child process: %d, status: %s, error: %s", - cmd.Process.Pid, status.String(), err) - if !e.Async { - e.reaper.nsenterReapReq() - } - return err - } - if !status.Success() { - logrus.Warnf("Sysbox-fs first child process error status: %s, pid: %d", - status.String(), cmd.Process.Pid) - if !e.Async { - e.reaper.nsenterReapReq() - } - return errors.New("Error waiting for sysbox-fs first child process") - } - - // Receive sysbox-fs' first-child pid. + // Receive sysbox-fs' first-child pid. This must happen before waiting on + // the PARENT process, because the PARENT writes the pid to the pipe before + // exiting. Reading the pipe first avoids holding an RLock while blocked on + // Process.Wait() for a child whose exit may be delayed by fuse_flush. var pid pid decoder := json.NewDecoder(e.parentPipe) if err := decoder.Decode(&pid); err != nil { logrus.Warnf("Error receiving first-child pid: %s", err) + if !e.Async { + e.reaper.nsenterReapReq() + } return errors.New("Error receiving first-child pid") } @@ -673,9 +660,12 @@ func (e *NSenterEvent) SendRequest() error { return err } - // Wait for sysbox-fs' second child process to finish. Ignore the error in - // case the child has already been reaped for any reason. - _, _ = firstChildProcess.Wait() + // Wait for the PARENT and CHILD nsenter processes asynchronously. Their + // exits can be delayed by the kernel's fuse_flush on inherited FUSE file + // descriptors; waiting synchronously here would hold the reaper RLock and + // block both the reaper and new FUSE handlers. + go cmd.Process.Wait() + go firstChildProcess.Wait() // Sysbox-fs' third child (grand-child) process remains and will enter the // go runtime. This is the nsenter agent process. @@ -745,7 +735,7 @@ func (e *NSenterEvent) SendRequest() error { return ierr } - e.Process.Wait() + e.reaper.reapProcessAsync(e.Process) return nil } diff --git a/nsenter/reaper.go b/nsenter/reaper.go index 73ea6bb7..fd7287ae 100644 --- a/nsenter/reaper.go +++ b/nsenter/reaper.go @@ -17,6 +17,7 @@ package nsenter import ( + "os" "sync" "syscall" "time" @@ -56,6 +57,17 @@ func (zr *zombieReaper) nsenterReapReq() { } } +// reapProcessAsync waits for the given process in a background goroutine +// and triggers a reaper sweep when it exits. Use this at the end of an +// nsenter operation so the caller can release the RLock without blocking +// on a child whose exit may be delayed by fuse_flush. +func (zr *zombieReaper) reapProcessAsync(p *os.Process) { + go func() { + p.Wait() + zr.nsenterReapReq() + }() +} + // Go-routine that performs reaping func reaper(signal chan bool, mu *sync.RWMutex) { var wstatus syscall.WaitStatus diff --git a/nsenter/reaper_test.go b/nsenter/reaper_test.go new file mode 100644 index 00000000..d19a664b --- /dev/null +++ b/nsenter/reaper_test.go @@ -0,0 +1,101 @@ +package nsenter + +import ( + "os/exec" + "syscall" + "testing" + "time" +) + +// TestReaperDoesNotBlockNewNsenter verifies that signaling the zombie reaper +// while an nsenter is active does not prevent a new nsenter from starting. +// +// This is a regression test for a deadlock caused by Go's sync.RWMutex +// writer-preference: when the reaper's Lock() was pending (waiting for an +// active nsenter's RLock to be released), all new RLock() callers — including +// FUSE handlers that needed to start their own nsenter — were blocked. If the +// active nsenter depended on one of those FUSE handlers to complete, the +// system deadlocked. +// +// The fix replaced Lock() with TryLock() in the reaper goroutine so it never +// registers a pending writer. +func TestReaperDoesNotBlockNewNsenter(t *testing.T) { + zr := newZombieReaper() + + // Simulate an active nsenter holding the read lock. + zr.nsenterStarted() + + // Signal the reaper multiple times over a few seconds to ensure at least + // one signal arrives after the reaper's internal delay and the goroutine + // reaches its lock attempt. + for i := 0; i < 5; i++ { + zr.nsenterReapReq() + time.Sleep(500 * time.Millisecond) + } + + // Try to start a second nsenter from a new goroutine. This calls RLock(). + // If the reaper registered a pending writer via Lock(), this blocks + // indefinitely due to writer-preference. + done := make(chan struct{}) + go func() { + zr.nsenterStarted() + zr.nsenterEnded() + close(done) + }() + + select { + case <-done: + // New nsenter started successfully — no writer-preference deadlock. + case <-time.After(3 * time.Second): + t.Fatal("nsenterStarted() blocked: reaper's pending write lock is starving new readers") + } + + zr.nsenterEnded() +} + +// TestReapProcessAsyncReleasesCallerPromptly verifies that reapProcessAsync +// does not block the caller. This is a regression test for a deadlock where +// SendRequest() called Process.Wait() synchronously while holding the reaper +// RLock. If the child's exit was delayed (e.g., by the kernel's fuse_flush on +// inherited FUSE file descriptors), the RLock was held indefinitely, preventing +// both the reaper and new FUSE handlers from making progress. +// +// The fix moved Process.Wait() into reapProcessAsync, which waits in a +// background goroutine so the caller can release the RLock immediately. +func TestReapProcessAsyncReleasesCallerPromptly(t *testing.T) { + cmd := exec.Command("sleep", "3") + if err := cmd.Start(); err != nil { + t.Fatalf("failed to start child: %v", err) + } + defer cmd.Process.Kill() + + zr := newZombieReaper() + + // Simulate SendRequest's locking pattern: hold RLock, schedule async + // reap, release RLock. + zr.nsenterStarted() + zr.reapProcessAsync(cmd.Process) + zr.nsenterEnded() + + // The reaper must be able to acquire Lock immediately. If + // reapProcessAsync blocked the caller (old behavior), the RLock would + // still be held and this would time out. + locked := make(chan struct{}) + go func() { + zr.mu.Lock() + zr.mu.Unlock() + close(locked) + }() + + select { + case <-locked: + // RLock was released before the child exited. + case <-time.After(2 * time.Second): + t.Fatal("reaper Lock blocked: reapProcessAsync is not releasing the caller promptly") + } + + // The child must still be alive, proving we did not wait synchronously. + if err := cmd.Process.Signal(syscall.Signal(0)); err != nil { + t.Fatalf("child exited prematurely: reapProcessAsync should not wait synchronously: %v", err) + } +}