Docker healthcheck and auto-restart for Celery workers #1024
base: main
@@ -0,0 +1,74 @@
"""Celery beat scheduler with a Docker-healthcheck-friendly heartbeat file.

Why this module exists
----------------------
Docker's default ``restart: unless-stopped`` only catches process death, not a
frozen scheduler thread. On 2026-04-16 the celerybeat container on ami-live
showed "Up 10 hours" in ``docker ps`` with four live PIDs and
``RestartCount=0``, yet its last log line was "Sending due task
celery.check_processing_services_online" twelve hours earlier — a Redis
connection blip had deadlocked the connection-pool lock and the scheduler
thread never recovered. The 15-minute ``jobs_health_check`` beat task stopped
firing, and stuck job 2421 was never reaped.

To let Docker flip the container to ``unhealthy`` on that failure mode, we
need a heartbeat signal that proves the scheduler's main loop is progressing.
Constraints:

- Beat does not answer ``celery inspect ping`` (that's a worker control
  message), so we can't reuse the worker healthcheck.
- We use ``DatabaseScheduler`` from ``django_celery_beat``, which keeps the
  schedule in Postgres, so there is no on-disk schedule file whose mtime
  would update naturally.
- A plain Celery task written from a worker would touch a file in the
  **worker's** filesystem, not beat's — Docker healthchecks read files
  inside the checked container.

So: override ``DatabaseScheduler.tick()`` to touch ``/tmp/beat-heartbeat``
on every iteration. ``tick()`` runs inside the beat process itself, so the
file lives in the beat container. If the scheduler loop hangs anywhere
(Redis pool lock, DB query, sync deadlock), ``tick()`` stops returning and
the file goes stale within ~60 s. The healthcheck
(``compose/*/django/celery/healthcheck-beat.sh``) fails, Docker marks the
container ``unhealthy``, and autoheal restarts it.

Activation
----------
Wired in via ``CELERY_BEAT_SCHEDULER`` in ``config/settings/base.py``.
"""

from __future__ import annotations

import logging
from pathlib import Path

from django_celery_beat.schedulers import DatabaseScheduler

logger = logging.getLogger(__name__)

HEARTBEAT_PATH = Path("/tmp/beat-heartbeat")


class HeartbeatDatabaseScheduler(DatabaseScheduler):
    """DatabaseScheduler that touches a heartbeat file on every tick.

    Each call to ``tick()`` represents one cycle of the scheduler's main loop:
    evaluate due tasks, enqueue them, return the seconds until the next tick.
    If any step in that cycle hangs (e.g. a Redis or DB call blocks forever),
    ``tick()`` stops returning, the file mtime stops advancing, and the Docker
    healthcheck flips the container to ``unhealthy`` within ~2 minutes.

    We touch the file *before* delegating to ``super().tick()``, so each touch
    proves the previous iteration returned and the loop is still advancing. If
    the heartbeat write ever fails (disk full, permission error), we log at
    warning level but don't re-raise — an I/O problem writing ``/tmp``
    shouldn't take down the scheduler. Docker will eventually mark the
    container unhealthy on the stale file, which is the right outcome.
    """

    def tick(self, *args, **kwargs):
        try:
            HEARTBEAT_PATH.touch()
        except OSError as exc:
            logger.warning("beat heartbeat touch failed: %s", exc)
        return super().tick(*args, **kwargs)
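For reference, activation is a single setting. A minimal sketch of the wiring, assuming a hypothetical dotted path for the class above (the real module path isn't shown in this diff):

```python
# config/settings/base.py — sketch only. "ami.schedulers" is a hypothetical
# location for HeartbeatDatabaseScheduler; substitute the real module path.
# With the usual CELERY_ settings namespace, this maps to Celery's
# beat_scheduler setting, which django-celery-beat honours.
CELERY_BEAT_SCHEDULER = "ami.schedulers.HeartbeatDatabaseScheduler"
```

Beat can also be pointed at a scheduler class via `celery beat --scheduler=...`; keeping it in settings makes every environment use the heartbeat variant by default.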
@@ -0,0 +1,16 @@
#!/bin/bash
# Celerybeat healthcheck: verify the scheduler is alive by checking heartbeat file age.
#
# Beat doesn't respond to `celery inspect ping` (that's a worker control message),
# and with DatabaseScheduler (django_celery_beat) there's no schedule file whose
# mtime we can watch. So we rely on HeartbeatDatabaseScheduler (the module
# above), which touches /tmp/beat-heartbeat on every scheduler tick().
#
# If beat hangs (e.g. scheduler thread deadlocked on a Redis connection blip —
# the 2026-04-16 incident), tick() stops returning and the file goes stale.
# Docker flips the container to `unhealthy`, autoheal restarts it.
#
# Window: the heartbeat is touched about every minute; we tolerate up to 2 min
# of staleness before marking unhealthy (one missed tick is fine; two in a row is a hang).
set -e
find /tmp/beat-heartbeat -mmin -2 2>/dev/null | grep -q . || exit 1
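The `find -mmin -2 | grep -q .` idiom makes `find` print the path only when the file's mtime is under two minutes old, and `grep -q .` turns "any output at all" into the exit status Docker consumes. To spot-check a live deployment (a sketch; the container name below is an assumption, use whatever name your compose project assigns the beat container):

```bash
# Hypothetical container name; adjust to your compose project.
docker exec ami_production_celerybeat stat -c '%y' /tmp/beat-heartbeat    # heartbeat mtime
docker inspect --format '{{.State.Health.Status}}' ami_production_celerybeat  # healthy/unhealthy
```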
@@ -0,0 +1,7 @@
#!/bin/bash
# Celery worker healthcheck: verify the worker is responsive via the broker.
# Catches stuck, deadlocked, and crashed workers — not just process existence.
set -e
exec celery -A config.celery_app inspect ping \
    --destination "celery@$(hostname)" \
    --timeout 10 > /dev/null 2>&1
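`inspect ping` publishes a control message to the broker addressed to this specific node (`celery@<hostname>`) and exits non-zero when no reply arrives within the timeout, which is the signal Docker keys off. Run by hand, a healthy worker replies roughly like this (shape typical of Celery 5; exact wording may vary by version):

```bash
$ celery -A config.celery_app inspect ping --destination "celery@$(hostname)" --timeout 10
->  celery@worker-host: OK
        pong

1 node online.
```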
@@ -22,7 +22,7 @@ services:
  celeryworker:
    environment:
      - DEBUGGER=1
      - CELERY_DEBUG=1
    ports:
      - "5679:5679"
    volumes:
@@ -25,18 +25,21 @@ services:
    scale: 1 # Can't scale until the load balancer is within the compose config
    restart: always

  celeryworker:
    <<: *django
    scale: 1
    ports: []
    command: /start-celeryworker
    restart: always
    # Workers run on dedicated machines via docker-compose.worker.yml (not here).

  celerybeat:
    <<: *django
    ports: []
    command: /start-celerybeat
    restart: always
    healthcheck:
      test: ["CMD-SHELL", "/celery/healthcheck-beat.sh"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 90s
    labels:
      - "autoheal=true"

  flower:
    <<: *django

@@ -47,6 +50,20 @@ services:
    volumes:
      - ./data/flower/:/data/

  autoheal:
    # Docker Compose has no native restart-on-unhealthy (swarm-only feature).
    # willfarrell/autoheal watches the docker socket and restarts any container
    # labeled `autoheal=true` that Docker has marked unhealthy.
    image: willfarrell/autoheal:1.2.0
    container_name: ami_production_autoheal
    restart: always
    environment:
      - AUTOHEAL_CONTAINER_LABEL=autoheal
      - AUTOHEAL_INTERVAL=10 # poll docker for health every 10s
      - AUTOHEAL_START_PERIOD=60 # ignore containers in their start_period
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
  awscli:
    build:
      context: .
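A rough worst-case timeline, using the values configured above: the beat heartbeat tolerates up to 2 minutes of staleness, the probe then has to fail retries=3 times at interval=30s before Docker flips the container to unhealthy, and autoheal polls every 10 seconds, so a hung scheduler should be restarted within roughly 3.5 to 4 minutes of the hang. One operational note, stated as my understanding of Docker rather than anything in this diff: API-triggered restarts (which is how autoheal works) do not increment RestartCount, the counter that stayed at 0 during the incident, so confirm autoheal's actions from its logs:

```bash
# Recent autoheal activity (container_name taken from the compose file above).
docker logs --since 24h ami_production_autoheal

# Current status (including health) of everything labeled for autoheal.
docker ps --filter "label=autoheal=true" --format '{{.Names}}\t{{.Status}}'
```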
@@ -27,3 +27,22 @@ services:
    ports: []
    command: /start-celeryworker
    restart: always
    healthcheck:
      test: ["CMD-SHELL", "/celery/healthcheck.sh"]
      interval: 30s
      timeout: 15s
      retries: 3
      start_period: 90s
    labels:
      - "autoheal=true"

  autoheal:
    image: willfarrell/autoheal:1.2.0
    container_name: ami_worker_autoheal
    restart: always
    environment:
      - AUTOHEAL_CONTAINER_LABEL=autoheal
      - AUTOHEAL_INTERVAL=10
      - AUTOHEAL_START_PERIOD=60
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
Comment on lines +47 to +48:
The worker protections described in the PR summary (--max-tasks-per-child=50 / --max-memory-per-child=4000000) don’t match the values configured here (100 / 2097152). Please reconcile the PR description and the actual configured limits so operators don’t deploy with unintended settings.
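For reference when reconciling: these limits normally live on the worker command line (in this repo presumably the /start-celeryworker script, whose contents are not shown here), and Celery's --max-memory-per-child is measured in kilobytes, so 2097152 is 2 GiB while the summary's 4000000 is about 3.8 GiB. A hypothetical sketch of the configured values as a celery invocation; only the two limit flags come from this review, everything else is assumed:

```bash
# Hypothetical excerpt of /start-celeryworker.
# --max-tasks-per-child: recycle a pool process after N tasks (guards leaks).
# --max-memory-per-child: recycle once resident memory exceeds N kilobytes
#   (2097152 KB = 2 GiB; the PR summary's 4000000 KB ≈ 3.8 GiB).
exec celery -A config.celery_app worker \
    --loglevel=INFO \
    --max-tasks-per-child=100 \
    --max-memory-per-child=2097152
```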