Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 14 additions & 24 deletions nsenter/event.go
Original file line number Diff line number Diff line change
Expand Up @@ -640,30 +640,17 @@ func (e *NSenterEvent) SendRequest() error {
return errors.New("Error copying payload to pipe")
}

// Wait for sysbox-fs' first child process to finish.
status, err := cmd.Process.Wait()
if err != nil {
logrus.Warnf("Error waiting for sysbox-fs first child process: %d, status: %s, error: %s",
cmd.Process.Pid, status.String(), err)
if !e.Async {
e.reaper.nsenterReapReq()
}
return err
}
if !status.Success() {
logrus.Warnf("Sysbox-fs first child process error status: %s, pid: %d",
status.String(), cmd.Process.Pid)
if !e.Async {
e.reaper.nsenterReapReq()
}
return errors.New("Error waiting for sysbox-fs first child process")
}

// Receive sysbox-fs' first-child pid.
// Receive sysbox-fs' first-child pid. This must happen before waiting on
// the PARENT process, because the PARENT writes the pid to the pipe before
// exiting. Reading the pipe first avoids holding an RLock while blocked on
// Process.Wait() for a child whose exit may be delayed by fuse_flush.
var pid pid
decoder := json.NewDecoder(e.parentPipe)
if err := decoder.Decode(&pid); err != nil {
logrus.Warnf("Error receiving first-child pid: %s", err)
if !e.Async {
e.reaper.nsenterReapReq()
}
return errors.New("Error receiving first-child pid")
}

Expand All @@ -673,9 +660,12 @@ func (e *NSenterEvent) SendRequest() error {
return err
}

// Wait for sysbox-fs' second child process to finish. Ignore the error in
// case the child has already been reaped for any reason.
_, _ = firstChildProcess.Wait()
// Wait for the PARENT and CHILD nsenter processes asynchronously. Their
// exits can be delayed by the kernel's fuse_flush on inherited FUSE file
// descriptors; waiting synchronously here would hold the reaper RLock and
// block both the reaper and new FUSE handlers.
go cmd.Process.Wait()
go firstChildProcess.Wait()

// Sysbox-fs' third child (grand-child) process remains and will enter the
// go runtime. This is the nsenter agent process.
Expand Down Expand Up @@ -745,7 +735,7 @@ func (e *NSenterEvent) SendRequest() error {
return ierr
}

e.Process.Wait()
e.reaper.reapProcessAsync(e.Process)

return nil
}
Expand Down
38 changes: 23 additions & 15 deletions nsenter/reaper.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package nsenter

import (
"os"
"sync"
"syscall"
"time"
Expand Down Expand Up @@ -56,37 +57,44 @@ func (zr *zombieReaper) nsenterReapReq() {
}
}

// reapProcessAsync waits for the given process in a background goroutine
// and triggers a reaper sweep when it exits. Use this at the end of an
// nsenter operation so the caller can release the RLock without blocking
// on a child whose exit may be delayed by fuse_flush.
func (zr *zombieReaper) reapProcessAsync(p *os.Process) {
go func() {
p.Wait()
zr.nsenterReapReq()
}()
}

// Go-routine that performs reaping
func reaper(signal chan bool, mu *sync.RWMutex) {
var wstatus syscall.WaitStatus

for {
<-signal

// Without this delay, sysbox-fs sometimes hangs the FUSE request that generates an
// nsenter event that requires reaping. It's not clear why, but the tell-tale sign
// of the hang is that the reaper is signaled but finds nothing to reap. This delay
// mitigates this condition and the reaper finds something to reap.
//
// The delay chosen is one that allows nsenter agents to complete their tasks before
// reaping occurs. Since the reaper runs in its own goroutine, this delay only
// affects it (there is no undesired side-effect on nsenters).

time.Sleep(time.Second)

// Use TryLock instead of Lock to avoid blocking subsequent RLock
// callers. Go's sync.RWMutex implements writer-preference: a pending
// Lock() blocks all new RLock() calls. If an nsenter child triggers
// a FUSE request back to sysbox-fs, the FUSE handler needs RLock to
// start a new nsenter. A blocking Lock() here would prevent that
// RLock, causing a deadlock. TryLock avoids this by simply skipping
// the reap cycle when nsenters are active
for !mu.TryLock() {
time.Sleep(time.Millisecond * 100)
}
for {
mu.Lock()

// WNOHANG: if there is no child to reap, don't block
wpid, err := syscall.Wait4(-1, &wstatus, syscall.WNOHANG, nil)
if err != nil || wpid == 0 {
logrus.Infof("reaper: nothing to reap")
mu.Unlock()
break
}

logrus.Infof("reaper: reaped pid %d", wpid)
mu.Unlock()
}
mu.Unlock()
}
}
101 changes: 101 additions & 0 deletions nsenter/reaper_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
package nsenter

import (
"os/exec"
"syscall"
"testing"
"time"
)

// TestReaperDoesNotBlockNewNsenter verifies that signaling the zombie reaper
// while an nsenter is active does not prevent a new nsenter from starting.
//
// This is a regression test for a deadlock caused by Go's sync.RWMutex
// writer-preference: when the reaper's Lock() was pending (waiting for an
// active nsenter's RLock to be released), all new RLock() callers — including
// FUSE handlers that needed to start their own nsenter — were blocked. If the
// active nsenter depended on one of those FUSE handlers to complete, the
// system deadlocked.
//
// The fix replaced Lock() with TryLock() in the reaper goroutine so it never
// registers a pending writer.
func TestReaperDoesNotBlockNewNsenter(t *testing.T) {
zr := newZombieReaper()

// Simulate an active nsenter holding the read lock.
zr.nsenterStarted()

// Signal the reaper multiple times over a few seconds to ensure at least
// one signal arrives after the reaper's internal delay and the goroutine
// reaches its lock attempt.
for i := 0; i < 5; i++ {
zr.nsenterReapReq()
time.Sleep(500 * time.Millisecond)
}

// Try to start a second nsenter from a new goroutine. This calls RLock().
// If the reaper registered a pending writer via Lock(), this blocks
// indefinitely due to writer-preference.
done := make(chan struct{})
go func() {
zr.nsenterStarted()
zr.nsenterEnded()
close(done)
}()

select {
case <-done:
// New nsenter started successfully — no writer-preference deadlock.
case <-time.After(3 * time.Second):
t.Fatal("nsenterStarted() blocked: reaper's pending write lock is starving new readers")
}

zr.nsenterEnded()
}

// TestReapProcessAsyncReleasesCallerPromptly verifies that reapProcessAsync
// does not block the caller. This is a regression test for a deadlock where
// SendRequest() called Process.Wait() synchronously while holding the reaper
// RLock. If the child's exit was delayed (e.g., by the kernel's fuse_flush on
// inherited FUSE file descriptors), the RLock was held indefinitely, preventing
// both the reaper and new FUSE handlers from making progress.
//
// The fix moved Process.Wait() into reapProcessAsync, which waits in a
// background goroutine so the caller can release the RLock immediately.
func TestReapProcessAsyncReleasesCallerPromptly(t *testing.T) {
cmd := exec.Command("sleep", "3")
if err := cmd.Start(); err != nil {
t.Fatalf("failed to start child: %v", err)
}
defer cmd.Process.Kill()

zr := newZombieReaper()

// Simulate SendRequest's locking pattern: hold RLock, schedule async
// reap, release RLock.
zr.nsenterStarted()
zr.reapProcessAsync(cmd.Process)
zr.nsenterEnded()

// The reaper must be able to acquire Lock immediately. If
// reapProcessAsync blocked the caller (old behavior), the RLock would
// still be held and this would time out.
locked := make(chan struct{})
go func() {
zr.mu.Lock()
zr.mu.Unlock()
close(locked)
}()

select {
case <-locked:
// RLock was released before the child exited.
case <-time.After(2 * time.Second):
t.Fatal("reaper Lock blocked: reapProcessAsync is not releasing the caller promptly")
}

// The child must still be alive, proving we did not wait synchronously.
if err := cmd.Process.Signal(syscall.Signal(0)); err != nil {
t.Fatalf("child exited prematurely: reapProcessAsync should not wait synchronously: %v", err)
}
}