diff --git a/ami/celery_schedulers.py b/ami/celery_schedulers.py
new file mode 100644
index 000000000..f2adf43de
--- /dev/null
+++ b/ami/celery_schedulers.py
@@ -0,0 +1,74 @@
+"""Celery beat scheduler with a Docker-healthcheck-friendly heartbeat file.
+
+Why this module exists
+----------------------
+Docker's default ``restart: unless-stopped`` only catches process death, not a
+frozen scheduler thread. On 2026-04-16 the celerybeat container on ami-live
+showed "Up 10 hours" in ``docker ps`` with four live PIDs and
+``RestartCount=0``, yet its last log line was "Sending due task
+celery.check_processing_services_online" twelve hours earlier — a Redis
+connection blip had deadlocked the connection-pool lock and the scheduler
+thread never recovered. The 15-minute ``jobs_health_check`` beat task stopped
+firing, and stuck job 2421 was never reaped.
+
+To let Docker flip the container to ``unhealthy`` on that failure mode, we
+need a heartbeat signal that proves the scheduler's main loop is progressing.
+Constraints:
+
+- Beat does not answer ``celery inspect ping`` (that's a worker control
+  message), so we can't reuse the worker healthcheck.
+- We use ``DatabaseScheduler`` from ``django_celery_beat``, which keeps the
+  schedule in Postgres, so there is no on-disk schedule file whose mtime
+  would update naturally.
+- A plain Celery task written from a worker would touch a file in the
+  **worker's** filesystem, not beat's — Docker healthchecks read files
+  inside the checked container.
+
+So: override ``DatabaseScheduler.tick()`` to touch ``/tmp/beat-heartbeat``
+on every iteration. ``tick()`` runs inside the beat process itself, so the
+file lives in the beat container. If the scheduler loop hangs anywhere
+(Redis pool lock, DB query, sync deadlock), ``tick()`` stops returning and
+the file goes stale within ~60 s. The healthcheck
+(``compose/*/django/celery/healthcheck-beat.sh``) fails, Docker marks the
+container ``unhealthy``, and autoheal restarts it.
+
+Activation
+----------
+Wired in via ``CELERY_BEAT_SCHEDULER`` in ``config/settings/base.py``.
+"""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+
+from django_celery_beat.schedulers import DatabaseScheduler
+
+logger = logging.getLogger(__name__)
+
+HEARTBEAT_PATH = Path("/tmp/beat-heartbeat")
+
+
+class HeartbeatDatabaseScheduler(DatabaseScheduler):
+    """DatabaseScheduler that touches a heartbeat file on every tick.
+
+    Each call to ``tick()`` represents one cycle of the scheduler's main loop:
+    evaluate due tasks, enqueue them, return the seconds until the next tick.
+    If any step in that cycle hangs (e.g. a Redis or DB call blocks forever),
+    ``tick()`` stops returning, the file mtime stops advancing, and the Docker
+    healthcheck flips the container to ``unhealthy`` within ~2 minutes.
+
+    We touch the file *before* delegating to ``super().tick()``, so each new
+    touch proves the previous iteration completed and the loop has come back
+    around. If the heartbeat write ever fails (disk full, permission error),
+    we log at warning level but don't re-raise — an I/O problem writing
+    ``/tmp`` shouldn't take down the scheduler. Docker will eventually mark
+    the container unhealthy on the stale file, which is the right outcome.
+    """
+
+    def tick(self, *args, **kwargs):
+        try:
+            HEARTBEAT_PATH.touch()
+        except OSError as exc:
+            logger.warning("beat heartbeat touch failed: %s", exc)
+        return super().tick(*args, **kwargs)
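A quick way to confirm the override is active in a running stack is to watch the heartbeat file's mtime advance. A minimal sketch, assuming the beat service is named `celerybeat` as in the compose files below; the path is the `HEARTBEAT_PATH` defined above:

```bash
# Read the heartbeat mtime twice, a few seconds apart.
docker compose exec celerybeat stat -c '%y' /tmp/beat-heartbeat
sleep 15
docker compose exec celerybeat stat -c '%y' /tmp/beat-heartbeat
# The two timestamps should differ; an unchanged mtime means the scheduler loop is not ticking.
```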
+ """ + + def tick(self, *args, **kwargs): + try: + HEARTBEAT_PATH.touch() + except OSError as exc: + logger.warning("beat heartbeat touch failed: %s", exc) + return super().tick(*args, **kwargs) diff --git a/compose/local/django/Dockerfile b/compose/local/django/Dockerfile index 0e778f82b..d3592363a 100644 --- a/compose/local/django/Dockerfile +++ b/compose/local/django/Dockerfile @@ -41,6 +41,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \ libpq-dev \ # Translations dependencies gettext \ + # healthcheck dependencies (pgrep, findmnt, etc.) + procps \ # cleaning up unused files && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \ && rm -rf /var/lib/apt/lists/* @@ -74,6 +76,11 @@ COPY ./compose/local/django/celery/flower/start /start-flower RUN sed -i 's/\r$//g' /start-flower RUN chmod +x /start-flower +# Healthcheck scripts for celery worker (inspect ping) and beat (heartbeat mtime) +COPY ./compose/local/django/celery/healthcheck.sh /celery/healthcheck.sh +COPY ./compose/local/django/celery/healthcheck-beat.sh /celery/healthcheck-beat.sh +RUN chmod +x /celery/healthcheck.sh /celery/healthcheck-beat.sh + # copy application code to WORKDIR COPY . ${APP_HOME} diff --git a/compose/local/django/celery/healthcheck-beat.sh b/compose/local/django/celery/healthcheck-beat.sh new file mode 100755 index 000000000..6a84bd50e --- /dev/null +++ b/compose/local/django/celery/healthcheck-beat.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# Celerybeat healthcheck: verify the scheduler is alive by checking heartbeat file age. +# +# Beat doesn't respond to `celery inspect ping` (that's a worker control message), +# and with DatabaseScheduler (django_celery_beat) there's no schedule file whose +# mtime we can watch. So we rely on a dedicated `ami.tasks.beat_heartbeat` task +# that runs every 60s via CELERY_BEAT_SCHEDULE and touches /tmp/beat-heartbeat. +# +# If beat hangs (e.g. scheduler thread deadlocked on a Redis connection blip — +# the 2026-04-16 incident), the task stops firing and the file goes stale. +# Docker flips the container to `unhealthy`, autoheal restarts it. +# +# Window: task runs every 60s, we tolerate up to 2 min of staleness before +# marking unhealthy (one missed tick is fine; two in a row is a hang). +set -e +find /tmp/beat-heartbeat -mmin -2 2>/dev/null | grep -q . || exit 1 diff --git a/compose/local/django/celery/healthcheck.sh b/compose/local/django/celery/healthcheck.sh new file mode 100755 index 000000000..52572888c --- /dev/null +++ b/compose/local/django/celery/healthcheck.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# Celery worker healthcheck: verify the worker is responsive via the broker. +# Catches stuck, deadlocked, and crashed workers — not just process existence. 
diff --git a/compose/local/django/celery/worker/start b/compose/local/django/celery/worker/start
index a3482f44e..2633668e3 100755
--- a/compose/local/django/celery/worker/start
+++ b/compose/local/django/celery/worker/start
@@ -6,7 +6,8 @@ set -o nounset
 
 # Local development with auto-reload and optional debugging
 #
-# DEBUGGER=1 - Enable debugpy for remote debugging on port 5679
+# CELERY_DEBUG=1 - Enable debugpy for remote debugging on port 5679
+# CELERY_NO_RELOAD=1 - Disable watchfiles auto-reload
 #
 # Worker protections (prevent memory leaks from long-running workers):
 # --max-tasks-per-child=100 - Restart worker process after 100 tasks
@@ -18,10 +19,15 @@
 MAX_TASKS_PER_CHILD=100
 MAX_MEMORY_PER_CHILD=1048576  # 1 GiB in KB
 
-# Launch VS Code debug server if DEBUGGER environment variable is set to 1
-# Note that auto reloading is disabled when debugging, manual restart required for code changes.
-if [ "${DEBUGGER:-0}" = "1" ]; then
+if [ "${CELERY_DEBUG:-0}" = "1" ]; then
+    echo "Starting Celery worker with debugpy on port 5679..."
     exec python -Xfrozen_modules=off -m debugpy --listen 0.0.0.0:5679 -m celery -A config.celery_app worker --queues=antenna -l INFO --max-tasks-per-child=$MAX_TASKS_PER_CHILD --max-memory-per-child=$MAX_MEMORY_PER_CHILD
+fi
+
+if [ "${CELERY_NO_RELOAD:-0}" = "1" ]; then
+    echo "Starting Celery worker without auto-reload..."
+    exec celery -A config.celery_app worker --queues=antenna -l INFO --max-tasks-per-child=$MAX_TASKS_PER_CHILD --max-memory-per-child=$MAX_MEMORY_PER_CHILD
 else
-    exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker --queues=antenna -l INFO --max-tasks-per-child='$MAX_TASKS_PER_CHILD' --max-memory-per-child='$MAX_MEMORY_PER_CHILD
+    echo "Starting Celery worker with watchfiles auto-reload..."
+    exec watchfiles --filter python celery.__main__.main --args "-A config.celery_app worker --queues=antenna -l INFO --max-tasks-per-child=${MAX_TASKS_PER_CHILD} --max-memory-per-child=${MAX_MEMORY_PER_CHILD}"
 fi
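Both flags can be exercised without editing compose files by running one-off containers; a sketch, assuming the local service is named `celeryworker` (the override example later in this diff shows the equivalent permanent setting for CELERY_DEBUG):

```bash
# One-off worker with auto-reload disabled (plain `celery worker`, no watchfiles).
docker compose run --rm -e CELERY_NO_RELOAD=1 celeryworker

# One-off worker under debugpy; --service-ports publishes the service's declared
# port (5679) so a debugger on the host can attach.
docker compose run --rm --service-ports -e CELERY_DEBUG=1 celeryworker
```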
diff --git a/compose/production/django/Dockerfile b/compose/production/django/Dockerfile
index fd6b80ec1..5e1d9ac63 100644
--- a/compose/production/django/Dockerfile
+++ b/compose/production/django/Dockerfile
@@ -45,6 +45,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
   libpq-dev \
   # Translations dependencies
   gettext \
+  # healthcheck and debugging utilities (ps, pgrep)
+  procps \
   # cleaning up unused files
   && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
   && rm -rf /var/lib/apt/lists/*
@@ -80,6 +82,11 @@
 COPY ./compose/production/django/celery/flower/start /start-flower
 RUN sed -i 's/\r$//g' /start-flower
 RUN chmod +x /start-flower
 
+# Healthcheck scripts for celery worker (inspect ping) and beat (heartbeat mtime)
+COPY --chown=django:django ./compose/production/django/celery/healthcheck.sh /celery/healthcheck.sh
+COPY --chown=django:django ./compose/production/django/celery/healthcheck-beat.sh /celery/healthcheck-beat.sh
+RUN chmod +x /celery/healthcheck.sh /celery/healthcheck-beat.sh
+
 # copy application code to WORKDIR
 COPY --chown=django:django . ${APP_HOME}
diff --git a/compose/production/django/celery/healthcheck-beat.sh b/compose/production/django/celery/healthcheck-beat.sh
new file mode 100755
index 000000000..6a84bd50e
--- /dev/null
+++ b/compose/production/django/celery/healthcheck-beat.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Celerybeat healthcheck: verify the scheduler is alive by checking heartbeat file age.
+#
+# Beat doesn't respond to `celery inspect ping` (that's a worker control message),
+# and with DatabaseScheduler (django_celery_beat) there's no schedule file whose
+# mtime we can watch. So we rely on HeartbeatDatabaseScheduler (ami/celery_schedulers.py),
+# which touches /tmp/beat-heartbeat on every pass through the scheduler's main loop.
+#
+# If beat hangs (e.g. scheduler thread deadlocked on a Redis connection blip —
+# the 2026-04-16 incident), tick() stops returning and the file goes stale.
+# Docker flips the container to `unhealthy`, autoheal restarts it.
+#
+# Window: the scheduler normally touches the file at least once a minute; we
+# tolerate up to 2 min of staleness before marking the container unhealthy.
+set -e
+find /tmp/beat-heartbeat -mmin -2 2>/dev/null | grep -q . || exit 1
diff --git a/compose/production/django/celery/healthcheck.sh b/compose/production/django/celery/healthcheck.sh
new file mode 100755
index 000000000..52572888c
--- /dev/null
+++ b/compose/production/django/celery/healthcheck.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Celery worker healthcheck: verify the worker is responsive via the broker.
+# Catches stuck, deadlocked, and crashed workers — not just process existence.
+set -e
+exec celery -A config.celery_app inspect ping \
+    --destination "celery@$(hostname)" \
+    --timeout 10 > /dev/null 2>&1
diff --git a/compose/production/django/celery/worker/start b/compose/production/django/celery/worker/start
index 4ba716507..08cbdd8e8 100644
--- a/compose/production/django/celery/worker/start
+++ b/compose/production/django/celery/worker/start
@@ -12,8 +12,18 @@
 #
 # With prefork pool (default), each CPU gets a worker process.
 # Example: 8 CPUs × 2 GiB = 16 GiB max total worker memory
+#
+# These options work in conjunction with the Docker healthcheck:
+# - Healthcheck (celery inspect ping) detects STUCK workers (not responding)
+# - These options prevent RESOURCE LEAKS (memory/task buildup over time)
+# - Autoheal restarts UNHEALTHY containers
+# - restart: always brings containers back after any exit
 
 MAX_TASKS_PER_CHILD=100
 MAX_MEMORY_PER_CHILD=2097152  # 2 GiB in KB
 
-exec newrelic-admin run-program celery -A config.celery_app worker --queues=antenna -l INFO --max-tasks-per-child=$MAX_TASKS_PER_CHILD --max-memory-per-child=$MAX_MEMORY_PER_CHILD
+exec newrelic-admin run-program celery -A config.celery_app worker \
+    --queues=antenna \
+    -l INFO \
+    --max-tasks-per-child=$MAX_TASKS_PER_CHILD \
+    --max-memory-per-child=$MAX_MEMORY_PER_CHILD
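The worker healthcheck is just `celery inspect ping` addressed to the container's own node; running it by hand with output visible shows exactly what Docker is probing. A sketch, assuming the service is named `celeryworker`:

```bash
docker compose exec celeryworker bash -c \
  'celery -A config.celery_app inspect ping --destination "celery@$(hostname)" --timeout 10'
# A responsive worker replies with a line like "-> celery@<hostname>: OK" followed by "pong";
# no reply within the timeout is what makes /celery/healthcheck.sh exit non-zero.
```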
diff --git a/config/settings/base.py b/config/settings/base.py
index be6f3fada..8c8ba9c9b 100644
--- a/config/settings/base.py
+++ b/config/settings/base.py
@@ -403,7 +403,10 @@ def _celery_result_backend_url(redis_url):
 # TODO: set to whatever value is adequate in your circumstances
 CELERY_TASK_SOFT_TIME_LIMIT = 3 * 60 * 60 * 24  # 3 days
 # https://docs.celeryq.dev/en/stable/userguide/configuration.html#beat-scheduler
-CELERY_BEAT_SCHEDULER = "django_celery_beat.schedulers:DatabaseScheduler"
+# HeartbeatDatabaseScheduler extends DatabaseScheduler to touch /tmp/beat-heartbeat
+# on every tick so the celerybeat Docker healthcheck can detect a frozen
+# scheduler (ami/celery_schedulers.py).
+CELERY_BEAT_SCHEDULER = "ami.celery_schedulers:HeartbeatDatabaseScheduler"
 # https://docs.celeryq.dev/en/stable/userguide/configuration.html#worker-send-task-events
 CELERY_WORKER_SEND_TASK_EVENTS = True
 # https://docs.celeryq.dev/en/stable/userguide/configuration.html#std-setting-task_send_sent_event
diff --git a/docker-compose.override-example.yml b/docker-compose.override-example.yml
index 62b836b08..d7323061b 100644
--- a/docker-compose.override-example.yml
+++ b/docker-compose.override-example.yml
@@ -22,7 +22,7 @@ services:
 
   celeryworker:
     environment:
-      - DEBUGGER=1
+      - CELERY_DEBUG=1
     ports:
       - "5679:5679"
     volumes:
diff --git a/docker-compose.production.yml b/docker-compose.production.yml
index 4e04d0a08..8ffaca2d6 100644
--- a/docker-compose.production.yml
+++ b/docker-compose.production.yml
@@ -25,18 +25,21 @@ services:
     scale: 1  # Can't scale until the load balancer is within the compose config
     restart: always
 
-  celeryworker:
-    <<: *django
-    scale: 1
-    ports: []
-    command: /start-celeryworker
-    restart: always
+  # Workers run on dedicated machines via docker-compose.worker.yml (not here).
 
   celerybeat:
     <<: *django
     ports: []
     command: /start-celerybeat
     restart: always
+    healthcheck:
+      test: ["CMD-SHELL", "/celery/healthcheck-beat.sh"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 90s
+    labels:
+      - "autoheal=true"
 
   flower:
     <<: *django
@@ -47,6 +50,20 @@
     volumes:
       - ./data/flower/:/data/
 
+  autoheal:
+    # Docker Compose has no native restart-on-unhealthy (swarm-only feature).
+    # willfarrell/autoheal watches the docker socket and restarts any container
+    # labeled `autoheal=true` that Docker has marked unhealthy.
+    image: willfarrell/autoheal:1.2.0
+    container_name: ami_production_autoheal
+    restart: always
+    environment:
+      - AUTOHEAL_CONTAINER_LABEL=autoheal
+      - AUTOHEAL_INTERVAL=10  # poll docker for health every 10s
+      - AUTOHEAL_START_PERIOD=60  # ignore containers in their start_period
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+
   awscli:
     build:
       context: .
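On a deployed host, the health state and autoheal's actions can be observed directly from the Docker CLI; a sketch (the celerybeat container name is not pinned in compose, so look it up with `docker ps`; `ami_production_autoheal` is the container_name set above):

```bash
docker ps --format 'table {{.Names}}\t{{.Status}}'        # Status column shows (healthy) / (unhealthy)
docker inspect --format '{{.State.Health.Status}}' <celerybeat-container-name>   # name from docker ps
docker events --filter event=health_status                # stream health transitions as they happen
docker logs ami_production_autoheal                       # autoheal logs each restart it performs
```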
diff --git a/docker-compose.staging.yml b/docker-compose.staging.yml
index 66ae6c14b..b7db2f13b 100644
--- a/docker-compose.staging.yml
+++ b/docker-compose.staging.yml
@@ -54,12 +54,28 @@ services:
     ports: []
     command: /start-celeryworker
     restart: always
+    healthcheck:
+      test: ["CMD-SHELL", "/celery/healthcheck.sh"]
+      interval: 30s
+      timeout: 15s
+      retries: 3
+      start_period: 90s
+    labels:
+      - "autoheal=true"
 
   celerybeat:
     <<: *django
     ports: []
     command: /start-celerybeat
     restart: always
+    healthcheck:
+      test: ["CMD-SHELL", "/celery/healthcheck-beat.sh"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 90s
+    labels:
+      - "autoheal=true"
 
   flower:
     <<: *django
@@ -70,6 +86,19 @@
     volumes:
       - ./data/flower/:/data/
 
+  autoheal:
+    # Docker Compose has no native restart-on-unhealthy (swarm-only feature).
+    # willfarrell/autoheal watches the docker socket and restarts any container
+    # labeled `autoheal=true` that Docker has marked unhealthy.
+    image: willfarrell/autoheal:1.2.0
+    restart: always
+    environment:
+      - AUTOHEAL_CONTAINER_LABEL=autoheal
+      - AUTOHEAL_INTERVAL=10
+      - AUTOHEAL_START_PERIOD=60
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+
   redis:
     image: redis:6
     command: redis-server /usr/local/etc/redis/redis.conf
diff --git a/docker-compose.worker.yml b/docker-compose.worker.yml
index e66584721..db8b98ddc 100644
--- a/docker-compose.worker.yml
+++ b/docker-compose.worker.yml
@@ -27,3 +27,22 @@ services:
     ports: []
     command: /start-celeryworker
     restart: always
+    healthcheck:
+      test: ["CMD-SHELL", "/celery/healthcheck.sh"]
+      interval: 30s
+      timeout: 15s
+      retries: 3
+      start_period: 90s
+    labels:
+      - "autoheal=true"
+
+  autoheal:
+    image: willfarrell/autoheal:1.2.0
+    container_name: ami_worker_autoheal
+    restart: always
+    environment:
+      - AUTOHEAL_CONTAINER_LABEL=autoheal
+      - AUTOHEAL_INTERVAL=10
+      - AUTOHEAL_START_PERIOD=60
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
diff --git a/docker-compose.yml b/docker-compose.yml
index 703ecea0d..28c1f0fb1 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -108,16 +108,31 @@ services:
     <<: *django
     image: ami_local_celeryworker
     scale: 1
-    ports: []
+    # For remote debugging with debugpy, set CELERY_DEBUG=1 in environment
+    # To disable watchfiles auto-reload, set CELERY_NO_RELOAD=1
+    ports:
+      - "5679:5679"
     command: /start-celeryworker
     depends_on:
       - rabbitmq
+    healthcheck:
+      test: ["CMD-SHELL", "/celery/healthcheck.sh"]
+      interval: 30s
+      timeout: 15s
+      retries: 3
+      start_period: 90s
 
   celerybeat:
     <<: *django
     image: ami_local_celerybeat
     ports: []
     command: /start-celerybeat
+    healthcheck:
+      test: ["CMD-SHELL", "/celery/healthcheck-beat.sh"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 90s
 
   flower:
     <<: *django
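To exercise the whole chain end to end, one option (a sketch, not part of this change) is to reproduce the "alive but frozen" state by stopping the beat process from the host: the container keeps running, so `restart: always` never fires, but the heartbeat goes stale and the healthcheck flips. The SIGSTOP has to come from the host because the kernel ignores SIGSTOP aimed at a container's PID 1 from inside its own PID namespace; this also assumes the start script execs celery so the scheduler is the container's main process.

```bash
cid="$(docker compose ps -q celerybeat)"                      # assumed service name
pid="$(docker inspect --format '{{.State.Pid}}' "$cid")"      # beat's PID as seen from the host
sudo kill -STOP "$pid"                                        # freeze the scheduler without killing it
sleep 200                                                     # > 2 min staleness + a few 30s probes
docker inspect --format '{{.State.Health.Status}}' "$cid"     # expect: unhealthy
sudo kill -CONT "$pid"                                        # resume; on staging/production autoheal
                                                              # would already have restarted the container
```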