Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions BedrockServer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <libstuff/libstuff.h>
#include <libstuff/SRandom.h>
#include <libstuff/AutoTimer.h>
#include <libstuff/SThread.h>
#include <PageLockGuard.h>
#include <sqlitecluster/SQLitePeer.h>

Expand Down Expand Up @@ -2207,12 +2208,14 @@ void BedrockServer::_acceptSockets()
}

// And start up this socket's thread.
// We use SThread to enable signal recovery - if a SIGSEGV occurs in the handler,
// it will be converted to an exception and the thread will exit gracefully.
_outstandingSocketThreads++;
thread t;
pair<thread, future<void>> threadPair;
bool threadStarted = false;
while (!threadStarted) {
try {
t = thread(&BedrockServer::handleSocket, this, move(socket), port == _controlPort, port == _commandPortPublic, port == _commandPortPrivate);
threadPair = SThread(&BedrockServer::handleSocket, this, move(socket), port == _controlPort, port == _commandPortPublic, port == _commandPortPrivate);
threadStarted = true;
} catch (const system_error& e) {
// We don't care about this lock here from a performance perspective, it only happens when we
Expand Down Expand Up @@ -2246,7 +2249,11 @@ void BedrockServer::_acceptSockets()
}
}
try {
t.detach();
// Detach the thread - it will run independently.
// The future (threadPair.second) is discarded. If handleSocket catches the
// SSignalException (it does), the future sees normal completion. If it doesn't
// catch, the exception is stored in the future but never retrieved.
threadPair.first.detach();
} catch (const system_error& e) {
SALERT("Caught system_error in thread detach: " << e.code() << ", message: " << e.what());
throw;
Expand Down Expand Up @@ -2513,6 +2520,10 @@ void BedrockServer::handleSocket(Socket&& socket, bool fromControlPort, bool fro
SWARN("Socket in unhandled state: " << socket.state);
}
}
} catch (const SSignalException& e) {
// Signal-generated exception (SIGSEGV, SIGFPE, etc.) - log with full stack trace.
SALERT("handleSocket caught signal " << e.signalName() << " at " << e.faultAddress());
e.logStackTrace();
} catch (const exception& e) {
SALERT("handleSocket got exception: " << e.what());
} catch (...) {
Expand Down
134 changes: 134 additions & 0 deletions libstuff/SSignal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include <unistd.h>
#include <format>

// setjmp.h is included via libstuff.h for signal recovery

thread_local function<string()> SSignalHandlerDieFunc;
void SSetSignalHandlerDieFunc(function<string()>&& func)
{
Expand Down Expand Up @@ -45,6 +47,26 @@ thread _SSignal_signalThread;
// `abort()`, this records the original signal number until the signal handler for abort has a chance to log it.
thread_local int _SSignal_threadCaughtSignalNumber = 0;

// Thread-local flag indicating this thread should convert signals to exceptions instead of aborting.
// Used by SThread to enable graceful recovery from SIGSEGV/SIGFPE.
thread_local bool _SSignal_threadIsRecoverable = false;

// Thread-local jump buffer for signal recovery via sigsetjmp/siglongjmp.
thread_local sigjmp_buf _SSignal_recoveryPoint;

// Thread-local flag indicating whether the recovery point has been set (sigsetjmp called).
thread_local bool _SSignal_recoveryPointSet = false;

// Thread-local storage for crash info to be passed back after siglongjmp.
struct SSignalCrashInfo {
int signum;
void* faultAddress;
void* callstack[32];
int callstackDepth;
bool hasCrashInfo;
};
thread_local SSignalCrashInfo _SSignal_crashInfo = {0, nullptr, {}, 0, false};

// The number of termination signals received so far.
atomic<uint64_t> _SSignal_terminationCount(0);
uint64_t STerminationSignalCount()
Expand Down Expand Up @@ -89,6 +111,39 @@ void SClearSignals()
_SSignal_pendingSignalBitMask.store(0);
}

void SSetThreadRecoverable(bool recoverable)
{
_SSignal_threadIsRecoverable = recoverable;
}

bool SIsThreadRecoverable()
{
return _SSignal_threadIsRecoverable;
}

sigjmp_buf* SGetRecoveryPoint()
{
return &_SSignal_recoveryPoint;
}

void SSetRecoveryPointActive(bool active)
{
_SSignal_recoveryPointSet = active;
}

SSignalException SBuildSignalException()
{
SSignalException ex(
_SSignal_crashInfo.signum,
_SSignal_crashInfo.faultAddress,
nullptr, // No instruction pointer available with sigsetjmp approach
_SSignal_crashInfo.callstack,
_SSignal_crashInfo.callstackDepth
);
_SSignal_crashInfo.hasCrashInfo = false;
return ex;
}

void SInitializeSignals()
{
// Our default die function does nothing.
Expand Down Expand Up @@ -206,6 +261,32 @@ void SStopSignalThread()

void _SSignal_StackTrace(int signum, siginfo_t* info, void* ucontext)
{
// Check if this is a recoverable signal in a recoverable thread with a recovery point set.
// SIGABRT is not recoverable - it's usually called intentionally or as a result of another crash.
bool isRecoverableSignal = (signum == SIGSEGV || signum == SIGFPE || signum == SIGBUS || signum == SIGILL);

if (isRecoverableSignal && _SSignal_threadIsRecoverable && _SSignal_recoveryPointSet) {
// Store crash information in thread-local storage before jumping.
_SSignal_crashInfo.signum = signum;
_SSignal_crashInfo.faultAddress = info ? info->si_addr : nullptr;

// Capture backtrace while we're in the signal handler context.
// Note: backtrace() is not strictly signal-safe but usually works.
_SSignal_crashInfo.callstackDepth = backtrace(
_SSignal_crashInfo.callstack,
sizeof(_SSignal_crashInfo.callstack) / sizeof(void*)
);
_SSignal_crashInfo.hasCrashInfo = true;

SWARN("Signal " << strsignal(signum) << "(" << signum << ") in recoverable thread, "
<< "jumping to recovery point. Fault address: " << (info ? info->si_addr : nullptr));

// Jump back to the recovery point set in SThread.
// The second argument becomes the return value of sigsetjmp.
siglongjmp(_SSignal_recoveryPoint, signum);
Comment on lines +291 to +293
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Avoid longjmp from signal handler skipping RAII cleanup

The recoverable path uses siglongjmp to escape the signal handler back to SThread. In C++, this bypasses stack unwinding for the faulting frame, so any RAII cleanup (mutex/lock_guard releases, SQLite transactions, socket state) held at the time of SIGSEGV/SIGFPE won’t run. If a recoverable signal hits while a lock or transaction is held in a socket handler, the thread exits but the lock can remain held, causing deadlocks or corrupted shared state while the server keeps running. This is a regression introduced by the new recovery path; consider restricting recovery to code that can tolerate skipping destructors or falling back to process termination instead.

Useful? React with 👍 / 👎.

// Never reaches here
}

if (signum == SIGSEGV || signum == SIGABRT || signum == SIGFPE || signum == SIGILL || signum == SIGBUS) {
// If we haven't already saved a signal number, we'll do it now. Any signal we catch here will generate a
// second ABORT signal, and we don't want that to overwrite this value, so we only set it if unset.
Expand Down Expand Up @@ -306,3 +387,56 @@ void _SSignal_StackTrace(int signum, siginfo_t* info, void* ucontext)
SALERT("Non-signal thread got signal " << strsignal(signum) << "(" << signum << "), which wasn't expected");
}
}

// SSignalException implementation
SSignalException::SSignalException(int signum,
void* faultAddress,
void* instructionPointer,
void* const* callstack,
int depth)
: _signum(signum),
_faultAddress(faultAddress),
_instructionPointer(instructionPointer),
_depth(min(depth, CALLSTACK_LIMIT))
{
if (callstack && _depth > 0) {
memcpy(_callstack, callstack, _depth * sizeof(void*));
}
}

const char* SSignalException::what() const noexcept {
if (_whatMessage.empty()) {
// Build message lazily to avoid issues in constructor context.
try {
_whatMessage = "Signal " + string(signalName()) + " at address " + SToHex((uint64_t)_faultAddress);
} catch (...) {
_whatMessage = "Signal exception";
}
}
return _whatMessage.c_str();
}

const char* SSignalException::signalName() const noexcept {
switch (_signum) {
case SIGSEGV: return "SIGSEGV";
case SIGFPE: return "SIGFPE";
case SIGBUS: return "SIGBUS";
case SIGILL: return "SIGILL";
default: return "UNKNOWN";
}
}

vector<string> SSignalException::stackTrace() const noexcept {
return SGetCallstack(_depth, _callstack);
}

void SSignalException::logStackTrace() const noexcept {
try {
SWARN("Signal " << signalName() << " at fault address " << SToHex((uint64_t)_faultAddress));
for (const auto& frame : stackTrace()) {
SWARN(" " << frame);
}
} catch (...) {
// Logging failed, but we're noexcept so just ignore.
}
}
39 changes: 38 additions & 1 deletion libstuff/SThread.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
using namespace std;

// SThread is a thread wrapper intended to be used in the same way as thread,
// except that it will trap exceptions and pass them back to the caller as part of a promise.
// except that it will trap exceptions (including signal-generated exceptions like SIGSEGV)
// and pass them back to the caller as part of a promise.
template<class F, class ... Args>
auto SThread(F&& f, Args&&... args)
{
Expand Down Expand Up @@ -39,6 +40,33 @@ auto SThread(F&& f, Args&&... args)
// Finally we can create our new thread and pass it our function and arguments.
thread t(
[p = move(prom), fn = move(fn), argTuple = move(argTuple)]() mutable {
// Initialize signal handling for this thread and mark it as recoverable.
// This allows signals like SIGSEGV/SIGFPE to be converted to SSignalException
// instead of aborting the process.
SInitializeSignals();
SSetThreadRecoverable(true);

// Set up the recovery point for signal handling using sigsetjmp.
// If a signal occurs, the handler will call siglongjmp and sigsetjmp will
// "return" with the signal number instead of 0.
int signalCaught = sigsetjmp(*SGetRecoveryPoint(), 1);
SSetRecoveryPointActive(true);

if (signalCaught != 0) {
// We got here via siglongjmp from the signal handler.
// signalCaught contains the signal number.
SSetRecoveryPointActive(false);
SSetThreadRecoverable(false);

// Build the exception from the crash info stored by the signal handler.
SSignalException ex = SBuildSignalException();

SWARN("Signal exception in SThread: " << ex.what());
ex.logStackTrace();
p.set_exception(make_exception_ptr(ex));
return;
}

try {
// We call `apply` to use our argments from a tuple as if they were a list of discrete arguments.
// This is effectively like calling `invoke` and passing the arguments separately.
Expand All @@ -50,13 +78,22 @@ auto SThread(F&& f, Args&&... args)
} else {
p.set_value(apply(move(fn), move(argTuple)));
}
} catch (const SSignalException& e) {
// Signal-generated exception - log stack trace before propagating.
SWARN("Signal exception in SThread: " << e.what());
e.logStackTrace();
p.set_exception(current_exception());
} catch (const exception& e) {
SWARN("Uncaught exception in SThread: " << e.what());
p.set_exception(current_exception());
} catch (...) {
SWARN("Uncaught exception in SThread: unknown type");
p.set_exception(current_exception());
}

// Restore non-recoverable state (belt-and-suspenders, thread is ending anyway).
SSetRecoveryPointActive(false);
SSetThreadRecoverable(false);
}
);

Expand Down
53 changes: 53 additions & 0 deletions libstuff/libstuff.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <poll.h>
#include <libgen.h>
#include <setjmp.h>
#include <syslog.h>

#include <algorithm>
Expand Down Expand Up @@ -169,6 +170,41 @@ class SException : public exception {
// Utility function for generating pretty callstacks.
vector<string> SGetCallstack(int depth = 0, void* const* callstack = nullptr) noexcept;

// An SSignalException is thrown when a recoverable thread (started via SThread) receives
// a signal like SIGSEGV or SIGFPE. Instead of crashing the process, the signal is converted
// into this exception which can be caught and handled gracefully.
class SSignalException : public exception {
private:
static const int CALLSTACK_LIMIT = 32;
int _signum;
void* _faultAddress;
void* _instructionPointer;
void* _callstack[CALLSTACK_LIMIT];
int _depth;
mutable string _whatMessage;

public:
SSignalException(int signum,
void* faultAddress,
void* instructionPointer,
void* const* callstack,
int depth);

const char* what() const noexcept override;

// Accessors
int signal() const noexcept { return _signum; }
void* faultAddress() const noexcept { return _faultAddress; }
void* instructionPointer() const noexcept { return _instructionPointer; }

// Get demangled stack trace
vector<string> stackTrace() const noexcept;
void logStackTrace() const noexcept;

// Signal name for logging
const char* signalName() const noexcept;
};

// --------------------------------------------------------------------------
// Time stuff TODO: Replace with chrono
// --------------------------------------------------------------------------
Expand Down Expand Up @@ -235,6 +271,23 @@ void SClearSignals();

void SStopSignalThread();

// Mark this thread as recoverable - signals like SIGSEGV/SIGFPE will be converted
// to SSignalException instead of aborting. Used by SThread.
void SSetThreadRecoverable(bool recoverable);

// Check if this thread is marked as recoverable.
bool SIsThreadRecoverable();

// Get the thread-local recovery point for sigsetjmp/siglongjmp signal recovery.
sigjmp_buf* SGetRecoveryPoint();

// Set whether the recovery point is active (sigsetjmp has been called).
void SSetRecoveryPointActive(bool active);

// Build an SSignalException from the current thread-local crash info.
// Called after siglongjmp returns to the recovery point.
SSignalException SBuildSignalException();

// And also exception stuff.
string SGetCurrentExceptionName();
void STerminateHandler(void);
Expand Down
Loading
Loading