diff --git a/config/settings/base.py b/config/settings/base.py index c3a8750dc..d9abb1f8d 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -247,6 +247,27 @@ SENDGRID_SANDBOX_MODE_IN_DEBUG = False SENDGRID_ECHO_TO_STDOUT = True +# TCP keepalive (shared by Redis cache and Celery/RabbitMQ broker) +# ------------------------------------------------------------------------------ +# Without SO_KEEPALIVE set on the client socket, the kernel never sends +# keepalive probes regardless of host-level sysctl tuning. Long-idle pooled +# connections are then vulnerable to silent drops by stateful cloud firewalls +# (the failure mode behind #1218, and the trigger for #1073). +# +# Default probe schedule: start after 60s idle, retry every 10s, give up +# after 9 failed attempts -> first dead-connection detection at ~150s. +# +# These defaults are safe for all environments: no-ops on localhost +# (local dev, docker-compose staging/demo), protective anywhere upstream +# infrastructure can drop idle connections. Production operators can +# override via env vars if their network needs different tuning (e.g. a +# more aggressive firewall might warrant a shorter idle). +TCP_KEEPALIVE_OPTIONS = { + socket.TCP_KEEPIDLE: env.int("TCP_KEEPALIVE_IDLE", default=60), + socket.TCP_KEEPINTVL: env.int("TCP_KEEPALIVE_INTVL", default=10), + socket.TCP_KEEPCNT: env.int("TCP_KEEPALIVE_CNT", default=9), +} + # CACHES # ------------------------------------------------------------------------------ # https://docs.djangoproject.com/en/dev/ref/settings/#caches @@ -259,6 +280,14 @@ # Mimicing memcache behavior. # https://github.com/jazzband/django-redis#memcached-exceptions-behavior "IGNORE_EXCEPTIONS": True, + # TCP keepalive -- see TCP_KEEPALIVE_OPTIONS above. + "SOCKET_KEEPALIVE": env.bool("REDIS_CACHE_SOCKET_KEEPALIVE", default=True), + "SOCKET_KEEPALIVE_OPTIONS": TCP_KEEPALIVE_OPTIONS, + # Cap connect-time to fail fast if the Redis host is unreachable, + # rather than hanging pooled-connection acquisition indefinitely. + # 5s is well above normal connect latency; raise in high-latency + # networks. + "SOCKET_CONNECT_TIMEOUT": env.int("REDIS_CACHE_SOCKET_CONNECT_TIMEOUT", default=5), }, } } @@ -400,17 +429,10 @@ def _celery_result_backend_url(redis_url): # RabbitMQ broker connection settings # These settings improve reliability for long-running workers with intermittent network issues CELERY_BROKER_TRANSPORT_OPTIONS = { - # Custom TCP Keepalives to ensure network stack doesn't silently drop connections + # TCP keepalive so the network stack doesn't silently drop connections. + # Shared schedule with the Redis cache above -- see TCP_KEEPALIVE_OPTIONS. "socket_keepalive": True, - "socket_settings": { - # Start sending Keepalive packets after 60 seconds of silence. - # This forces traffic on the wire, preventing the OpenStack 1-hour timeout. - socket.TCP_KEEPIDLE: 60, - # If no response, retry every 10 seconds. - socket.TCP_KEEPINTVL: 10, - # Give up and close connection after 9 failed attempts. - socket.TCP_KEEPCNT: 9, - }, + "socket_settings": TCP_KEEPALIVE_OPTIONS, # Connection Stability Settings "socket_connect_timeout": 40, # Max time to establish connection "retry_on_timeout": True, # Retry operations if they time out