From 29130d41291bf0d1ecf04b772a0ce7858da93263 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Fri, 10 Apr 2026 18:16:49 -0700 Subject: [PATCH 1/2] fix(cache): enable SO_KEEPALIVE on django-redis cache connections Without SOCKET_KEEPALIVE, redis-py does not call setsockopt(SO_KEEPALIVE, 1) on the Redis cache socket, so the kernel never sends TCP keepalive probes regardless of host-level sysctl tuning. Pooled connections can sit idle long enough for stateful cloud firewalls to silently drop them, at which point the next task to borrow one from the pool fails with ECONNRESET. This mirrors what CELERY_BROKER_TRANSPORT_OPTIONS.socket_settings does for the celery broker path, which has been protected for some time. Closes #1218. --- config/settings/base.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/config/settings/base.py b/config/settings/base.py index c3a8750dc..9c503a8a3 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -259,6 +259,20 @@ # Mimicing memcache behavior. # https://github.com/jazzband/django-redis#memcached-exceptions-behavior "IGNORE_EXCEPTIONS": True, + # TCP keepalive for pooled connections. Without SO_KEEPALIVE the + # kernel never sends probes, regardless of host-level sysctl + # tuning, so pooled Redis connections can sit idle long enough + # for cloud firewalls to silently drop them. The next task to + # borrow such a connection from the pool fails with ECONNRESET. + # Values mirror CELERY_BROKER_TRANSPORT_OPTIONS.socket_settings + # below. See RolnickLab/antenna#1218. + "SOCKET_KEEPALIVE": True, + "SOCKET_KEEPALIVE_OPTIONS": { + socket.TCP_KEEPIDLE: 60, + socket.TCP_KEEPINTVL: 10, + socket.TCP_KEEPCNT: 9, + }, + "SOCKET_CONNECT_TIMEOUT": 5, }, } } From 5f714405a067f2fe2da1ae145cb87053068298bc Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Tue, 14 Apr 2026 12:06:29 -0700 Subject: [PATCH 2/2] refactor(settings): share TCP keepalive options, make tunable via env Extracts TCP_KEEPALIVE_OPTIONS to a module-level constant with env-var overrides (TCP_KEEPALIVE_IDLE/INTVL/CNT) and reuses it in both the django-redis CACHES block and CELERY_BROKER_TRANSPORT_OPTIONS. Also exposes REDIS_CACHE_SOCKET_KEEPALIVE and REDIS_CACHE_SOCKET_CONNECT_TIMEOUT as env vars for production tuning. Defaults match the values introduced for the broker in #1073 (60/10/9), which are safe for local/staging/demo and correct for deployments behind stateful firewalls. --- config/settings/base.py | 56 +++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/config/settings/base.py b/config/settings/base.py index 9c503a8a3..d9abb1f8d 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -247,6 +247,27 @@ SENDGRID_SANDBOX_MODE_IN_DEBUG = False SENDGRID_ECHO_TO_STDOUT = True +# TCP keepalive (shared by Redis cache and Celery/RabbitMQ broker) +# ------------------------------------------------------------------------------ +# Without SO_KEEPALIVE set on the client socket, the kernel never sends +# keepalive probes regardless of host-level sysctl tuning. Long-idle pooled +# connections are then vulnerable to silent drops by stateful cloud firewalls +# (the failure mode behind #1218, and the trigger for #1073). +# +# Default probe schedule: start after 60s idle, retry every 10s, give up +# after 9 failed attempts -> first dead-connection detection at ~150s. +# +# These defaults are safe for all environments: no-ops on localhost +# (local dev, docker-compose staging/demo), protective anywhere upstream +# infrastructure can drop idle connections. Production operators can +# override via env vars if their network needs different tuning (e.g. a +# more aggressive firewall might warrant a shorter idle). +TCP_KEEPALIVE_OPTIONS = { + socket.TCP_KEEPIDLE: env.int("TCP_KEEPALIVE_IDLE", default=60), + socket.TCP_KEEPINTVL: env.int("TCP_KEEPALIVE_INTVL", default=10), + socket.TCP_KEEPCNT: env.int("TCP_KEEPALIVE_CNT", default=9), +} + # CACHES # ------------------------------------------------------------------------------ # https://docs.djangoproject.com/en/dev/ref/settings/#caches @@ -259,20 +280,14 @@ # Mimicing memcache behavior. # https://github.com/jazzband/django-redis#memcached-exceptions-behavior "IGNORE_EXCEPTIONS": True, - # TCP keepalive for pooled connections. Without SO_KEEPALIVE the - # kernel never sends probes, regardless of host-level sysctl - # tuning, so pooled Redis connections can sit idle long enough - # for cloud firewalls to silently drop them. The next task to - # borrow such a connection from the pool fails with ECONNRESET. - # Values mirror CELERY_BROKER_TRANSPORT_OPTIONS.socket_settings - # below. See RolnickLab/antenna#1218. - "SOCKET_KEEPALIVE": True, - "SOCKET_KEEPALIVE_OPTIONS": { - socket.TCP_KEEPIDLE: 60, - socket.TCP_KEEPINTVL: 10, - socket.TCP_KEEPCNT: 9, - }, - "SOCKET_CONNECT_TIMEOUT": 5, + # TCP keepalive -- see TCP_KEEPALIVE_OPTIONS above. + "SOCKET_KEEPALIVE": env.bool("REDIS_CACHE_SOCKET_KEEPALIVE", default=True), + "SOCKET_KEEPALIVE_OPTIONS": TCP_KEEPALIVE_OPTIONS, + # Cap connect-time to fail fast if the Redis host is unreachable, + # rather than hanging pooled-connection acquisition indefinitely. + # 5s is well above normal connect latency; raise in high-latency + # networks. + "SOCKET_CONNECT_TIMEOUT": env.int("REDIS_CACHE_SOCKET_CONNECT_TIMEOUT", default=5), }, } } @@ -414,17 +429,10 @@ def _celery_result_backend_url(redis_url): # RabbitMQ broker connection settings # These settings improve reliability for long-running workers with intermittent network issues CELERY_BROKER_TRANSPORT_OPTIONS = { - # Custom TCP Keepalives to ensure network stack doesn't silently drop connections + # TCP keepalive so the network stack doesn't silently drop connections. + # Shared schedule with the Redis cache above -- see TCP_KEEPALIVE_OPTIONS. "socket_keepalive": True, - "socket_settings": { - # Start sending Keepalive packets after 60 seconds of silence. - # This forces traffic on the wire, preventing the OpenStack 1-hour timeout. - socket.TCP_KEEPIDLE: 60, - # If no response, retry every 10 seconds. - socket.TCP_KEEPINTVL: 10, - # Give up and close connection after 9 failed attempts. - socket.TCP_KEEPCNT: 9, - }, + "socket_settings": TCP_KEEPALIVE_OPTIONS, # Connection Stability Settings "socket_connect_timeout": 40, # Max time to establish connection "retry_on_timeout": True, # Retry operations if they time out