Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions ami/jobs/migrations/0021_joblog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    # Creates the append-only JobLog child table that replaces the per-line
    # UPDATEs of the jobs_job.logs JSON field (row-lock contention, issue #1256
    # per the companion model docstring).
    dependencies = [
        ("jobs", "0020_schedule_job_monitoring_beat_tasks"),
    ]

    operations = [
        migrations.CreateModel(
            name="JobLog",
            fields=[
                ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
                # Timestamps set automatically on insert/update; presumably
                # declared on the shared BaseModel — TODO confirm.
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
                # Log level name, e.g. "INFO" / "ERROR" (written from
                # record.levelname by the job log handler).
                ("level", models.CharField(max_length=20)),
                ("message", models.TextField()),
                # Freeform per-line metadata bag; empty dict by default.
                ("context", models.JSONField(blank=True, default=dict)),
                (
                    "job",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE, related_name="log_entries", to="jobs.job"
                    ),
                ),
            ],
            options={
                # Newest-first; -pk breaks ties for rows sharing a timestamp.
                "ordering": ["-created_at", "-pk"],
                # Composite index backing the per-job newest-first log query.
                "indexes": [models.Index(fields=["job", "-created_at"], name="jobs_joblog_job_id_e4aa59_idx")],
            },
        ),
    ]
68 changes: 38 additions & 30 deletions ami/jobs/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,31 @@ class JobLogs(pydantic.BaseModel):
stderr: list[str] = pydantic.Field(default_factory=list, alias="stderr", title="Error messages")


class JobLog(BaseModel):
    """Append-only per-job log row.

    Replaces the ``jobs_job.logs`` JSON-field UPDATE path that caused row-lock
    contention under concurrent async_api load (issue #1256). Each log emit
    becomes a cheap INSERT on this child table instead of a refresh+UPDATE of
    the shared parent row. Legacy JSON-field logs are still served by the
    serializer for jobs created before this table existed.
    """

    # NOTE(review): presumably the project's permission-scoping hook, telling
    # queryset filters how to reach the owning project — confirm against
    # BaseModel / the permissions layer.
    project_accessor = "job__project"

    # CASCADE: log rows are meaningless without their job and are deleted with it.
    job = models.ForeignKey("Job", on_delete=models.CASCADE, related_name="log_entries")
    # Log level name, e.g. "INFO" / "ERROR" (the handler writes record.levelname).
    level = models.CharField(max_length=20)
    message = models.TextField()
    # Freeform bag for future per-line metadata (stage, worker id, counters, ...)
    # without requiring a schema migration. Kept nullable/empty-default so it
    # costs nothing on existing rows.
    context = models.JSONField(blank=True, default=dict)

    class Meta:
        # Newest-first; -pk is the tiebreaker for rows inserted within the same
        # timestamp tick. created_at is presumably inherited from BaseModel —
        # TODO confirm.
        ordering = ["-created_at", "-pk"]
        # Backs the per-job newest-first query used when serializing logs.
        indexes = [models.Index(fields=["job", "-created_at"])]


class JobLogHandler(logging.Handler):
"""
Class for handling logs from a job and writing them to the job instance.
Expand All @@ -337,41 +362,24 @@ def emit(self, record: logging.LogRecord):
# Log to the current app logger (container stdout).
logger.log(record.levelno, self.format(record))

# Gated by ``JOB_LOG_PERSIST_ENABLED`` (default True). Persisting every
# log line to ``jobs_job.logs`` becomes a row-lock contention point
# under concurrent async_api load — each call triggers
# ``UPDATE jobs_job SET logs = ...`` on the shared job row, and inside
# ``ATOMIC_REQUESTS`` a single batched ``/result`` POST stacks N such
# UPDATEs in one tx, blocking every ML worker on the same row for the
# duration of the request. Deployments hitting that pattern can set the
# flag to False to short-circuit here until PR #1259 lands an
# append-only ``JobLog`` child table. See issue #1256.
# Escape hatch: when False, skip the per-job DB write entirely. Container
# stdout still captures every line above, so ops observability is
# unchanged; only the per-job UI log view loses new entries for the
# duration the flag is off. Default is True. See issue #1256.
if not getattr(settings, "JOB_LOG_PERSIST_ENABLED", True):
return

# Write to the logs field on the job instance.
# Refresh from DB first to reduce the window for concurrent overwrites — each
# worker holds its own stale in-memory copy of `logs`, so without a refresh the
# last writer always wins and earlier entries are silently dropped.
# @TODO consider saving logs to the database periodically rather than on every log
# Append-only insert on the JobLog child table. Unlike the legacy
# jobs_job.logs JSONB update path, this does not contend with
# _update_job_progress on the parent row.
try:
self.job.refresh_from_db(fields=["logs"])
timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
msg = f"[{timestamp}] {record.levelname} {self.format(record)}"
if msg not in self.job.logs.stdout:
self.job.logs.stdout.insert(0, msg)

# Write a simpler copy of any errors to the errors field
if record.levelno >= logging.ERROR:
if record.message not in self.job.logs.stderr:
self.job.logs.stderr.insert(0, record.message)

if len(self.job.logs.stdout) > self.max_log_length:
self.job.logs.stdout = self.job.logs.stdout[: self.max_log_length]

self.job.save(update_fields=["logs"], update_progress=False)
JobLog.objects.create(
job_id=self.job.pk,
level=record.levelname,
message=self.format(record),
)
except Exception as e:
logger.error(f"Failed to save logs for job #{self.job.pk}: {e}")
logger.error(f"Failed to save log for job #{self.job.pk}: {e}")


@dataclass
Expand Down
51 changes: 49 additions & 2 deletions ami/jobs/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,46 @@
from ami.ml.schemas import PipelineProcessingTask, PipelineTaskResult, ProcessingServiceClientInfo
from ami.ml.serializers import PipelineNestedSerializer

from .models import Job, JobLogs, JobProgress, MLJob
from .models import Job, JobLog, JobProgress, MLJob
from .schemas import QueuedTaskAcknowledgment

# Levels mirrored into the legacy-shaped "stderr" list when serializing logs.
JOB_LOG_LEVELS_STDERR = {"ERROR", "CRITICAL"}
# Timestamp prefix for rendered log lines; matches the format the legacy
# JSON-field log handler used.
JOB_LOG_TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"
# Cap on JobLog rows fetched per job when serializing logs for the UI.
JOB_LOGS_DEFAULT_LIMIT = 1000


def _legacy_logs_shape(job: Job) -> dict[str, list[str]]:
    """Project the legacy ``job.logs`` JSON object into ``{stdout, stderr}`` lists.

    A missing ``logs`` attribute, a missing stream attribute, or a ``None``
    value all collapse to an empty list, so this is safe on any job instance.
    """
    legacy_obj = getattr(job, "logs", None)
    shaped: dict[str, list[str]] = {}
    for stream_name in ("stdout", "stderr"):
        shaped[stream_name] = list(getattr(legacy_obj, stream_name, []) or [])
    return shaped


def serialize_job_logs(job: Job, *, limit: int = JOB_LOGS_DEFAULT_LIMIT) -> dict[str, list[str]]:
    """Return ``{stdout, stderr}`` in the shape the UI already parses.

    Reads joined ``JobLog`` rows first (newest-first, capped at ``limit``). Jobs
    created before the table existed and jobs written while
    ``JOB_LOG_PERSIST_ENABLED=False`` have no rows and fall back to the legacy
    ``jobs_job.logs`` JSON column so their UI log panel stays populated.

    :param job: Job whose logs are being serialized.
    :param limit: Maximum number of ``JobLog`` rows to fetch; non-positive
        values are treated as 0 (a negative slice bound would silently drop
        the newest rows instead of limiting the count).
    """
    # Clamp so that a negative limit cannot produce a wrong slice like [:-5].
    row_cap = max(limit, 0)
    entries = list(
        JobLog.objects.filter(job_id=job.pk)
        .only("created_at", "level", "message")
        .order_by("-created_at", "-pk")[:row_cap]
    )
    if entries:
        return {
            "stdout": [
                f"[{entry.created_at.strftime(JOB_LOG_TIMESTAMP_FORMAT)}] {entry.level} {entry.message}"
                for entry in entries
            ],
            # stderr mirrors only error-level lines, without the timestamp
            # prefix, matching the legacy JSON-field shape.
            "stderr": [entry.message for entry in entries if entry.level in JOB_LOG_LEVELS_STDERR],
        }

    # No rows: pre-migration job or persistence was disabled — serve legacy JSON.
    return _legacy_logs_shape(job)


class JobProjectNestedSerializer(DefaultSerializer):
class Meta:
Expand Down Expand Up @@ -49,7 +86,7 @@ class JobListSerializer(DefaultSerializer):
source_image_single = SourceImageNestedSerializer(read_only=True)
data_export = DataExportNestedSerializer(read_only=True)
progress = SchemaField(schema=JobProgress, read_only=True)
logs = SchemaField(schema=JobLogs, read_only=True)
logs = serializers.SerializerMethodField()
job_type = JobTypeSerializer(read_only=True)
# All jobs created from the Jobs UI are ML jobs (datasync, etc. are created for the user)
# @TODO Remove this when the UI is updated pass a job type. This should be a required field.
Expand Down Expand Up @@ -147,6 +184,16 @@ class Meta:
"dispatch_mode",
]

def get_logs(self, obj: Job) -> dict[str, list[str]]:
# List responses skip the JobLog query to avoid N+1 — the UI only renders
# logs on the detail page, so returning the (typically empty for new jobs)
# legacy JSON shape is acceptable. Detail responses go to the joined table
# and fall back to the legacy shape for pre-migration jobs.
view = self.context.get("view")
if getattr(view, "action", None) == "list":
return _legacy_logs_shape(obj)
return serialize_job_logs(obj)


class JobSerializer(JobListSerializer):
# progress = serializers.JSONField(initial=Job.default_progress(), allow_null=False, required=False)
Expand Down
Loading
Loading