From fbe64f8fd2cbf88f5ca469f1a7adb4d7374446b8 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 8 May 2026 01:22:38 +0000 Subject: [PATCH] fix(io): generalize PR #54 write-path contract to non-TLS sites (P3) Phase 3 of roadmap/plan-graceful-shutdown.md (Pattern D' on the `roadmap` branch). Audits the five upstream TODO/FIXME sites flagged for the "silent fall-through on write paths" bug family and applies PR #54's contract where it matters. Of the five, two are real NULL-deref bugs that needed code fixes; three are documentation-only dispositions. Real fixes ---------- src/nxt_port_socket.c:746 nxt_port_read_handler The empty `if (b == NULL) {}` body fell through to `iov[1].iov_base = b->mem.pos`, NULL-derefing on transient OOM in the port mem_pool. Now alerts, disarms the read event via nxt_fd_event_block_read, and `goto fail` (the existing fail: label at :802 queues nxt_port_error_handler). Strict improvement: crash -> orderly teardown. Recovery via timer-driven retry was rejected as out of P3 scope. src/nxt_port_socket.c:889 nxt_port_queue_read_handler Same NULL-deref class -- `b->mem.pos` was used in both the dequeue memcpy and the iov[1] recv branch. Mirror the fix: alert, disarm, decrement queue->nitems (balances the entry +1 at :830), and queue nxt_port_error_handler. Both sites now carry an `nxt_assert(b != NULL)` immediately past the OOM-teardown block, documenting the invariant for future audits and providing a deterministic guard against a future regression that re-introduces an empty if-body. Documentation-only dispositions ------------------------------- src/nxt_port_socket.c:1369 nxt_port_error_handler The bare `/* TODO */` historically asked for richer error context (e.g. surfacing socket.error to peers). Audit found the handler already drains queued sends, releases buffers, and decrements port refcounts -- no silent-success or NULL-deref bug hidden here. Replaced bare TODO with a disposition note; richer error responses are a future enhancement and not part of PR #54's write-path contract. src/nxt_router.c:5895,5917 nxt_router_get_mmap_handler Two `// FIXME` markers on best-effort RPC_ERROR sends. The function is `void`, delivery failures surface in port->socket.error and the port-side teardown (nxt_port_error_handler) fires; peer-side RPC timeout is the ultimate backstop. Replaced FIXMEs with a one-line disposition note and added explicit `(void)` casts to match the convention at the end of the same function. Tests ----- No new tests. The NULL-deref sites cannot be reliably reproduced from Python without malloc-failure injection (LD_PRELOAD or AddressSanitizer with allocator hooks); ASan validation is recommended in CI follow-up. The nxt_assert(b != NULL) lines provide deterministic in-tree documentation of the invariant in lieu of a behavioural test. Verified -------- ./configure --tests --modules=python && ./configure python \ --config=python3-config && make -j$(nproc) # clean python3 -m pytest test/test_idle_close_wait.py # 2 pass python3 -m pytest test/test_python_application.py # 40 pass, 6 skip Refs ---- - roadmap/plan-graceful-shutdown.md P3 (Pattern D') - PR #54 (TLS write-path contract; the canonical shape mirrored by the two real fixes above) --- src/nxt_port_socket.c | 55 ++++++++++++++++++++++++++++++++++++++++--- src/nxt_router.c | 12 +++++----- 2 files changed, 58 insertions(+), 9 deletions(-) diff --git a/src/nxt_port_socket.c b/src/nxt_port_socket.c index 5752d5ab0..cccf881ca 100644 --- a/src/nxt_port_socket.c +++ b/src/nxt_port_socket.c @@ -746,9 +746,28 @@ nxt_port_read_handler(nxt_task_t *task, void *obj, void *data) b = nxt_port_buf_alloc(port); if (nxt_slow_path(b == NULL)) { - /* TODO: disable event for some time */ + /* + * Buffer pool exhausted (transient OOM on port mem_pool). + * Falling through would dereference b->mem.pos and crash; + * mirror PR #54's contract by disarming the read event and + * routing through the orderly error path. No timer + * infrastructure exists for ports, so re-arm depends on + * the port being torn down via nxt_port_error_handler. + * + * Recovery via timer-driven retry was rejected as out of + * P3 scope; terminal teardown is the strict improvement + * over the pre-P3 NULL-deref. + */ + nxt_alert(task, "port{%d,%d} %d: buf alloc failed; " + "disabling read", + (int) port->pid, (int) port->id, port->socket.fd); + nxt_fd_event_block_read(task->thread->engine, &port->socket); + goto fail; } + /* Invariant past the OOM teardown above; aids future audits. */ + nxt_assert(b != NULL); + iov[0].iov_base = &msg.port_msg; iov[0].iov_len = sizeof(nxt_port_msg_t); @@ -889,9 +908,31 @@ nxt_port_queue_read_handler(nxt_task_t *task, void *obj, void *data) b = nxt_port_buf_alloc(port); if (nxt_slow_path(b == NULL)) { - /* TODO: disable event for some time */ + /* + * Buffer pool exhausted (transient OOM on port mem_pool). + * Falling through would dereference b->mem.pos in either the + * dequeue memcpy or the iov[1] recv branch. Mirror PR #54's + * contract: disarm the read event, decrement the queue + * counter, and route through the orderly error handler. + * + * Recovery via timer-driven retry was rejected as out of + * P3 scope; terminal teardown is the strict improvement + * over the pre-P3 NULL-deref. + */ + nxt_alert(task, "port{%d,%d} %d: buf alloc failed; " + "disabling read", + (int) port->pid, (int) port->id, port->socket.fd); + nxt_fd_event_block_read(task->thread->engine, &port->socket); + nxt_atomic_fetch_add(&queue->nitems, -1); + nxt_work_queue_add(&task->thread->engine->fast_work_queue, + nxt_port_error_handler, task, &port->socket, + NULL); + return; } + /* Invariant past the OOM teardown above; aids future audits. */ + nxt_assert(b != NULL); + if (n >= (ssize_t) sizeof(nxt_port_msg_t)) { nxt_memcpy(&msg.port_msg, qmsg, sizeof(nxt_port_msg_t)); @@ -1342,7 +1383,15 @@ nxt_port_error_handler(nxt_task_t *task, void *obj, void *data) nxt_port_send_msg_t *msg; nxt_debug(task, "port error handler %p", obj); - /* TODO */ + /* + * The bare TODO here historically asked for richer error context + * (e.g. surfacing the actual socket.error to peers). Investigated + * for P3 (write-path contract): the handler already drains all + * queued send messages, releases buffers, and decrements port + * refcounts. No silent success or NULL-deref bug is hidden here; + * adding richer error responses is a future enhancement and not + * part of PR #54's write-path contract. + */ port = nxt_container_of(obj, nxt_port_t, socket); diff --git a/src/nxt_router.c b/src/nxt_router.c index c994f3707..0a2438aa0 100644 --- a/src/nxt_router.c +++ b/src/nxt_router.c @@ -5895,9 +5895,9 @@ nxt_router_get_mmap_handler(nxt_task_t *task, nxt_port_recv_msg_t *msg) nxt_alert(task, "get_mmap_handler: app == NULL for reply port %PI:%d", port->pid, port->id); - // FIXME - nxt_port_socket_write(task, port, NXT_PORT_MSG_RPC_ERROR, - -1, msg->port_msg.stream, 0, NULL); + /* Best-effort RPC_ERROR; peer-side RPC timeout is the backstop. */ + (void) nxt_port_socket_write(task, port, NXT_PORT_MSG_RPC_ERROR, + -1, msg->port_msg.stream, 0, NULL); return; } @@ -5911,9 +5911,9 @@ nxt_router_get_mmap_handler(nxt_task_t *task, nxt_port_recv_msg_t *msg) nxt_alert(task, "get_mmap_handler: mmap id is too big (%d)", (int) get_mmap_msg->id); - // FIXME - nxt_port_socket_write(task, port, NXT_PORT_MSG_RPC_ERROR, - -1, msg->port_msg.stream, 0, NULL); + /* Best-effort RPC_ERROR; peer-side RPC timeout is the backstop. */ + (void) nxt_port_socket_write(task, port, NXT_PORT_MSG_RPC_ERROR, + -1, msg->port_msg.stream, 0, NULL); return; }