diff --git a/docs/proposals/profiling-ffi-catch-unwind-bit-flag.md b/docs/proposals/profiling-ffi-catch-unwind-bit-flag.md new file mode 100644 index 0000000000..23e49f529c --- /dev/null +++ b/docs/proposals/profiling-ffi-catch-unwind-bit-flag.md @@ -0,0 +1,107 @@ +# Proposal: catch_unwind at the profiling FFI boundary — bit-flag variant + +Status: draft, exploration. + +## Problem + +Panics that unwind through the profiling FFI boundary are undefined behavior in +C callers. The hot-path APIs that return `ProfileStatus` +(`ddog_prof_Profile_add2`, `ddog_prof_ProfilesDictionary_insert_*`, ...) have no +panic guard today — a debug-assert, arithmetic overflow, or allocator-OOM in +inner code unwinds straight into C. + +Symbol interning is a frequent panic site in practice: hash-table growth, +string allocation, and id arithmetic all sit on hot paths and can panic. + +`libdd-common-ffi`'s `wrap_with_ffi_result!` +([utils.rs:10](../../libdd-common-ffi/src/utils.rs)) already solves this for +functions returning the old tagged-union `Result` / `VoidResult`. The new +`ProfileStatus` type has no equivalent. + +## Proposal + +Encode "this status is the result of a caught panic" in a new bit on +`ProfileStatus.flags`. The OK representation (`{0, null}`) is unchanged, and +existing C consumers continue to work — they just see another error. + +```c +#define DDOG_PROF_STATUS_FLAG_ALLOCATED 0x1 +#define DDOG_PROF_STATUS_FLAG_PANIC 0x2 + +bool ddog_prof_Status_is_panic(const ddog_prof_ProfileStatus *s); +``` + +Internally, every `ProfileStatus`-returning FFI function wraps its body in +`wrap_with_profile_status!`, which: + +1. Runs the body inside `catch_unwind(AssertUnwindSafe(...))`. +2. On `Ok(Ok(()))` → `ProfileStatus::OK`. +3. On `Ok(Err(e))` → existing `ProfileStatus::from(e)` path. +4. On `Err(payload)` → `ProfileStatus::from_panic(payload, function_name!())`, + which builds a heap message (or falls back to a static `'libdatadog + panicked'`) and ORs in `IS_PANIC_MASK`. + +This mirrors the nested-`catch_unwind` OOM-fallback strategy used by +`libdd-common-ffi`'s `handle_panic_error` so the panic path is allocation-safe. + +## Caller pattern (C) + +```c +ddog_prof_FunctionId2 id; +ddog_prof_ProfileStatus s = ddog_prof_ProfilesDictionary_insert_function(&id, dict, &fn); +if (s.err) { + if (s.flags & DDOG_PROF_STATUS_FLAG_PANIC) { + log_fatal("libdatadog panic: %s", s.err); + // policy: scrap the profile, it may be corrupt + ddog_prof_ProfilesDictionary_drop(dict); + } else { + log_warn("intern failed: %s", s.err); // recoverable + } + ddog_prof_Status_drop(&s); +} +``` + +## Pros + +- One return type, one inspection point. Caller picks the recovery policy + per-call. +- Zero-cost on the happy path: OK stays `{0, null}`. +- Panic info is per-call, naturally scoped to the failed operation. +- ABI-stable: just a new bit; old C clients that don't know about it still see + `err != null` and degrade as a generic error. +- No global state, no init ordering, fork-safe. + +## Cons + +- Caller has to remember to mask the panic bit if they want to differentiate. + Default-behavior is "treat panic as any other error" — acceptable but loses + information unless the caller opts in. +- Adds a knob (`is_panic`) to the public surface every caller now sees. +- Does not, by itself, mark the underlying `Profile` / `ProfilesDictionary` as + poisoned — caller is on the hook for dropping the handle. See "Open question: + poisoning" below. + +## Open question: handle poisoning + +`AssertUnwindSafe` makes the compiler happy; it does not make the invariant +true. After a caught panic, the `Profile` or `ProfilesDictionary` may be in a +half-mutated state. + +This proposal does **not** auto-poison the handle. The recommended caller +contract is "on panic, drop and recreate". A follow-up could add an atomic +`poisoned` flag on the handle and have subsequent FFI calls short-circuit to +`ProfileStatus::from(c"profile poisoned by prior panic")` — analogous to +`std::sync::Mutex` poisoning. + +## Example migration + +This PR migrates exactly one function — +`ddog_prof_ProfilesDictionary_insert_function` — to show the shape. If +accepted, every `ProfileStatus`-returning FFI function would follow. + +## Sibling proposals + +- `r1viollet/profiling-ffi-panic-callback` — global panic callback, return path + stays clean. +- `r1viollet/profiling-ffi-panic-bit-and-callback` — both, with the callback as + an optional observability overlay on top of the bit. diff --git a/libdd-profiling-ffi/src/profile_status.rs b/libdd-profiling-ffi/src/profile_status.rs index f87d8fe29f..d14ca4cb7c 100644 --- a/libdd-profiling-ffi/src/profile_status.rs +++ b/libdd-profiling-ffi/src/profile_status.rs @@ -12,7 +12,12 @@ use std::ptr::{null, NonNull}; /// ProfileStatus uses `err` being null to encode OK, so we only need /// one bit in flags to distinguish between STATIC and ALLOCATED errors. -const IS_ALLOCATED_MASK: usize = 1; +const IS_ALLOCATED_MASK: usize = 0x1; + +/// Set when the status was produced by `catch_unwind` catching a panic in the +/// FFI body. Independent of `IS_ALLOCATED_MASK` — a panic status may carry +/// either a heap-allocated formatted message or the static fallback. +const IS_PANIC_MASK: usize = 0x2; /// Represents the result of an operation that either succeeds with no value, or fails with an /// error message. This is like `Result<(), Cow<'static, CStr>` except its representation is @@ -129,7 +134,7 @@ impl From for Result<(), Cow<'static, CStr>> { let flags = status.flags; if status.err.is_null() { status.verify_flags() - } else if flags == IS_ALLOCATED_MASK { + } else if flags & IS_ALLOCATED_MASK != 0 { Err(Cow::Owned(unsafe { CString::from_raw(status.err.cast_mut()) })) @@ -267,6 +272,56 @@ impl ProfileStatus { pub fn from_error(err: E) -> Self { ProfileStatus::from(ProfileError::from_display(err)) } + + /// True if this status was produced by a caught panic in the FFI body. + /// Callers can use this to distinguish a recoverable error from a + /// catastrophic one (handle is likely corrupt and should be dropped). + pub fn is_panic(&self) -> bool { + self.flags & IS_PANIC_MASK != 0 + } + + /// Builds a `ProfileStatus` from a `catch_unwind` panic payload. + /// + /// The message-formatting path is itself wrapped in a nested `catch_unwind` + /// so that an allocation failure while formatting the panic message + /// gracefully falls back to a static C string. The returned status always + /// has the panic bit set. + pub fn from_panic( + payload: Box, + function_name: &str, + ) -> Self { + let formatted = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + let msg = if let Some(s) = payload.downcast_ref::() { + format!("{function_name} panicked: {s}") + } else if let Some(s) = payload.downcast_ref::<&'static str>() { + format!("{function_name} panicked: {s}") + } else { + format!("{function_name} panicked: ") + }; + CString::new(msg).ok() + })); + + let mut s = match formatted { + Ok(Some(cstring)) => ProfileStatus::from(cstring), + _ => ProfileStatus { + flags: 0, + err: c"libdatadog panicked".as_ptr(), + }, + }; + s.flags |= IS_PANIC_MASK; + s + } +} + +/// True if this status was produced by a caught panic in the FFI body. +/// Returns false for null or OK statuses. +/// +/// # Safety +/// +/// The pointer, if non-null, must point at a valid `ProfileStatus`. +#[no_mangle] +pub unsafe extern "C" fn ddog_prof_Status_is_panic(status: Option<&ProfileStatus>) -> bool { + status.is_some_and(ProfileStatus::is_panic) } /// Frees any error associated with the status, and replaces it with an OK. diff --git a/libdd-profiling-ffi/src/profiles/mod.rs b/libdd-profiling-ffi/src/profiles/mod.rs index 077500d41a..153d8d77ef 100644 --- a/libdd-profiling-ffi/src/profiles/mod.rs +++ b/libdd-profiling-ffi/src/profiles/mod.rs @@ -24,4 +24,22 @@ macro_rules! ensure_non_null_insert { }; } -pub(crate) use {ensure_non_null_insert, ensure_non_null_out_parameter}; +/// Wraps the body of a `ProfileStatus`-returning FFI function in +/// `catch_unwind`. The body must evaluate to a `Result<(), E>` where +/// `ProfileStatus: From`. On caught panic the returned status has its +/// `IS_PANIC_MASK` bit set; callers can check via `ddog_prof_Status_is_panic`. +/// +/// The enclosing function must carry `#[function_name::named]` so that the +/// caught-panic message is automatically prefixed with the function name. +#[macro_export] +macro_rules! wrap_with_profile_status { + ($body:block) => {{ + use std::panic::{catch_unwind, AssertUnwindSafe}; + match catch_unwind(AssertUnwindSafe(|| $body)) { + Ok(result) => $crate::ProfileStatus::from(result), + Err(payload) => $crate::ProfileStatus::from_panic(payload, function_name!()), + } + }}; +} + +pub(crate) use {ensure_non_null_insert, ensure_non_null_out_parameter, wrap_with_profile_status}; diff --git a/libdd-profiling-ffi/src/profiles/profiles_dictionary.rs b/libdd-profiling-ffi/src/profiles/profiles_dictionary.rs index e4ddf38601..1ea5f3aa80 100644 --- a/libdd-profiling-ffi/src/profiles/profiles_dictionary.rs +++ b/libdd-profiling-ffi/src/profiles/profiles_dictionary.rs @@ -4,8 +4,11 @@ use crate::arc_handle::ArcHandle; use crate::profile_status::ProfileStatus; use crate::profiles::utf8::{self, Utf8Option}; -use crate::profiles::{ensure_non_null_insert, ensure_non_null_out_parameter}; +use crate::profiles::{ + ensure_non_null_insert, ensure_non_null_out_parameter, wrap_with_profile_status, +}; use crate::ProfileError; +use function_name::named; use libdd_common_ffi::slice::{CharSlice, Slice}; use libdd_common_ffi::MutSlice; use libdd_profiling::profiles::collections::StringRef; @@ -113,6 +116,7 @@ pub unsafe extern "C" fn ddog_prof_ProfilesDictionary_try_clone( /// - `dict` must refer to a live dictionary. /// - `function` must be non-null and point to a valid `Function` for the duration of the call. #[no_mangle] +#[named] pub unsafe extern "C" fn ddog_prof_ProfilesDictionary_insert_function( function_id: *mut FunctionId2, dict: Option<&ProfilesDictionary>, @@ -120,13 +124,13 @@ pub unsafe extern "C" fn ddog_prof_ProfilesDictionary_insert_function( ) -> ProfileStatus { ensure_non_null_out_parameter!(function_id); ensure_non_null_insert!(function); - ProfileStatus::from(|| -> Result<(), ProfileError> { + wrap_with_profile_status!({ let dict = dict.ok_or(NULL_PROFILES_DICTIONARY)?; let f2: Function2 = unsafe { *function }; let id = dict.try_insert_function2(f2)?; unsafe { function_id.write(id) }; - Ok(()) - }()) + Ok::<(), ProfileError>(()) + }) } /// Inserts a `Mapping` into the dictionary and returns its id.