diff --git a/.envs/.local/.django b/.envs/.local/.django
index 8eb5610f7..36568b3e1 100644
--- a/.envs/.local/.django
+++ b/.envs/.local/.django
@@ -19,6 +19,11 @@ NATS_URL=nats://nats:4222
 CELERY_FLOWER_USER=QSocnxapfMvzLqJXSsXtnEZqRkBtsmKT
 CELERY_FLOWER_PASSWORD=BEQgmCtgyrFieKNoGTsux9YIye0I7P5Q7vEgfJD2C4jxmtHDetFaE2jhS7K7rxaf
 
+# Celery worker concurrency (prefork pool size).
+# Default in settings is 8. Lower to 2-4 on memory-constrained dev laptops —
+# each prefork worker is a separate Python process with its own DB connection.
+# CELERY_WORKER_CONCURRENCY=4
+
 # RabbitMQ
 CELERY_BROKER_URL=amqp://rabbituser:rabbitpass@rabbitmq:5672/
 CELERY_RESULT_BACKEND=rpc:// # Use RabbitMQ for results backend
diff --git a/.envs/.production/.django-example b/.envs/.production/.django-example
index 93737d527..c30d88dc0 100644
--- a/.envs/.production/.django-example
+++ b/.envs/.production/.django-example
@@ -20,6 +20,13 @@ REDIS_URL=redis://redis:6379/0
 # Celery
 # ------------------------------------------------------------------------------
 
+# Worker concurrency (prefork pool size). Settings default is 8.
+# Recommended for production: 16 on an 8-core host — process_nats_pipeline_result
+# and create_detection_images are DB/Redis-bound, so ~2x cpu_count roughly matches
+# observed drain vs ingress on the antenna queue without saturating pgbouncer.
+# Tune higher only after confirming postgres connection-pool headroom.
+CELERY_WORKER_CONCURRENCY=16
+
 # Flower
 CELERY_FLOWER_USER=
 CELERY_FLOWER_PASSWORD=
diff --git a/config/settings/base.py b/config/settings/base.py
index c3a8750dc..5877d3406 100644
--- a/config/settings/base.py
+++ b/config/settings/base.py
@@ -393,6 +393,16 @@ def _celery_result_backend_url(redis_url):
 CELERY_WORKER_PREFETCH_MULTIPLIER = 1
 CELERY_WORKER_ENABLE_PREFETCH_COUNT_REDUCTION = True
 
+# Worker concurrency (prefork pool size)
+# https://docs.celeryq.dev/en/stable/userguide/configuration.html#worker-concurrency
+# Celery's own default when unset is os.cpu_count(), which on the production
+# 8-core host produced an 8-process pool that could not keep up with the antenna
+# queue's DB/Redis-bound tasks (process_nats_pipeline_result, create_detection_images).
+# 8 is a conservative default that keeps local/staging/demo memory footprints
+# reasonable (each prefork worker is a separate Python process with imports +
+# DB connection). Production should override to 16 (see .envs/.production/.django-example).
+CELERY_WORKER_CONCURRENCY = env.int("CELERY_WORKER_CONCURRENCY", default=8)
+
 # Cancel & return to queue if connection is lost
 # https://docs.celeryq.dev/en/latest/userguide/configuration.html#worker-cancel-long-running-tasks-on-connection-loss
 CELERY_WORKER_CANCEL_LONG_RUNNING_TASKS_ON_CONNECTION_LOSS = True
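
For context on how the CELERY_*-prefixed Django setting above actually reaches the
worker's prefork pool: a minimal sketch of the standard Celery/Django wiring is below.
The module path config/celery_app.py and the settings-module name are assumptions
about this repo's layout (the diff only touches config/settings/base.py), so treat
this as an illustration of the mechanism, not as code from the change itself.

    # config/celery_app.py (hypothetical path) -- minimal sketch, assuming the
    # conventional Celery/Django integration.
    import os

    from celery import Celery

    # Django settings must be importable before the Celery app reads them;
    # the settings-module name here is an assumption.
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings.base")

    app = Celery("config")

    # With namespace="CELERY", Celery strips the prefix and lowercases the rest,
    # so CELERY_WORKER_CONCURRENCY in base.py becomes worker_concurrency and
    # sizes the prefork pool (same effect as `celery worker --concurrency=N`).
    app.config_from_object("django.conf:settings", namespace="CELERY")

    # Pick up task modules (e.g. the tasks named in the comments above) from
    # the installed Django apps.
    app.autodiscover_tasks()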