Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions ami/jobs/migrations/0021_joblog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    # Creates the append-only JobLog child table that replaces the per-line
    # UPDATEs of the jobs_job.logs JSON field (row-lock contention, issue #1256
    # per the companion model docstring).
    dependencies = [
        ("jobs", "0020_schedule_job_monitoring_beat_tasks"),
    ]

    operations = [
        migrations.CreateModel(
            name="JobLog",
            fields=[
                ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
                # Timestamps set automatically on insert/update; presumably
                # declared on the shared BaseModel — TODO confirm.
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
                # Log level name, e.g. "INFO" / "ERROR" (written from
                # record.levelname by the job log handler).
                ("level", models.CharField(max_length=20)),
                ("message", models.TextField()),
                # Freeform per-line metadata bag; empty dict by default.
                ("context", models.JSONField(blank=True, default=dict)),
                (
                    "job",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE, related_name="log_entries", to="jobs.job"
                    ),
                ),
            ],
            options={
                # Newest-first; -pk breaks ties for rows sharing a timestamp.
                "ordering": ["-created_at", "-pk"],
                # Composite index backing the per-job newest-first log query.
                "indexes": [models.Index(fields=["job", "-created_at"], name="jobs_joblog_job_id_e4aa59_idx")],
            },
        ),
    ]
68 changes: 38 additions & 30 deletions ami/jobs/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,31 @@ class JobLogs(pydantic.BaseModel):
stderr: list[str] = pydantic.Field(default_factory=list, alias="stderr", title="Error messages")


class JobLog(BaseModel):
    """Append-only per-job log row.

    Replaces the ``jobs_job.logs`` JSON-field UPDATE path that caused row-lock
    contention under concurrent async_api load (issue #1256). Each log emit
    becomes a cheap INSERT on this child table instead of a refresh+UPDATE of
    the shared parent row. Legacy JSON-field logs are still served by the
    serializer for jobs created before this table existed.
    """

    # NOTE(review): presumably the project's permission-scoping hook, telling
    # queryset filters how to reach the owning project — confirm against
    # BaseModel / the permissions layer.
    project_accessor = "job__project"

    # CASCADE: log rows are meaningless without their job and are deleted with it.
    job = models.ForeignKey("Job", on_delete=models.CASCADE, related_name="log_entries")
    # Log level name, e.g. "INFO" / "ERROR" (the handler writes record.levelname).
    level = models.CharField(max_length=20)
    message = models.TextField()
    # Freeform bag for future per-line metadata (stage, worker id, counters, ...)
    # without requiring a schema migration. Kept nullable/empty-default so it
    # costs nothing on existing rows.
    context = models.JSONField(blank=True, default=dict)

    class Meta:
        # Newest-first; -pk is the tiebreaker for rows inserted within the same
        # timestamp tick. created_at is presumably inherited from BaseModel —
        # TODO confirm.
        ordering = ["-created_at", "-pk"]
        # Backs the per-job newest-first query used when serializing logs.
        indexes = [models.Index(fields=["job", "-created_at"])]


class JobLogHandler(logging.Handler):
"""
Class for handling logs from a job and writing them to the job instance.
Expand All @@ -337,41 +362,24 @@ def emit(self, record: logging.LogRecord):
# Log to the current app logger (container stdout).
logger.log(record.levelno, self.format(record))

# Gated by ``JOB_LOG_PERSIST_ENABLED`` (default True). Persisting every
# log line to ``jobs_job.logs`` becomes a row-lock contention point
# under concurrent async_api load — each call triggers
# ``UPDATE jobs_job SET logs = ...`` on the shared job row, and inside
# ``ATOMIC_REQUESTS`` a single batched ``/result`` POST stacks N such
# UPDATEs in one tx, blocking every ML worker on the same row for the
# duration of the request. Deployments hitting that pattern can set the
# flag to False to short-circuit here until PR #1259 lands an
# append-only ``JobLog`` child table. See issue #1256.
# Escape hatch: when False, skip the per-job DB write entirely. Container
# stdout still captures every line above, so ops observability is
# unchanged; only the per-job UI log view loses new entries for the
# duration the flag is off. Default is True. See issue #1256.
if not getattr(settings, "JOB_LOG_PERSIST_ENABLED", True):
return

# Write to the logs field on the job instance.
# Refresh from DB first to reduce the window for concurrent overwrites — each
# worker holds its own stale in-memory copy of `logs`, so without a refresh the
# last writer always wins and earlier entries are silently dropped.
# @TODO consider saving logs to the database periodically rather than on every log
# Append-only insert on the JobLog child table. Unlike the legacy
# jobs_job.logs JSONB update path, this does not contend with
# _update_job_progress on the parent row.
try:
self.job.refresh_from_db(fields=["logs"])
timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
msg = f"[{timestamp}] {record.levelname} {self.format(record)}"
if msg not in self.job.logs.stdout:
self.job.logs.stdout.insert(0, msg)

# Write a simpler copy of any errors to the errors field
if record.levelno >= logging.ERROR:
if record.message not in self.job.logs.stderr:
self.job.logs.stderr.insert(0, record.message)

if len(self.job.logs.stdout) > self.max_log_length:
self.job.logs.stdout = self.job.logs.stdout[: self.max_log_length]

self.job.save(update_fields=["logs"], update_progress=False)
JobLog.objects.create(
job_id=self.job.pk,
level=record.levelname,
message=self.format(record),
)
except Exception as e:
logger.error(f"Failed to save logs for job #{self.job.pk}: {e}")
logger.error(f"Failed to save log for job #{self.job.pk}: {e}")


@dataclass
Expand Down
51 changes: 49 additions & 2 deletions ami/jobs/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,46 @@
from ami.ml.schemas import PipelineProcessingTask, PipelineTaskResult, ProcessingServiceClientInfo
from ami.ml.serializers import PipelineNestedSerializer

from .models import Job, JobLogs, JobProgress, MLJob
from .models import Job, JobLog, JobProgress, MLJob
from .schemas import QueuedTaskAcknowledgment

# Levels mirrored into the legacy-shaped "stderr" list when serializing logs.
JOB_LOG_LEVELS_STDERR = {"ERROR", "CRITICAL"}
# Timestamp prefix for rendered log lines; matches the format the legacy
# JSON-field log handler used.
JOB_LOG_TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"
# Cap on JobLog rows fetched per job when serializing logs for the UI.
JOB_LOGS_DEFAULT_LIMIT = 1000


def _legacy_logs_shape(job: Job) -> dict[str, list[str]]:
    """Project the legacy ``job.logs`` JSON object into ``{stdout, stderr}`` lists.

    A missing ``logs`` attribute, a missing stream attribute, or a ``None``
    value all collapse to an empty list, so this is safe on any job instance.
    """
    legacy_obj = getattr(job, "logs", None)
    shaped: dict[str, list[str]] = {}
    for stream_name in ("stdout", "stderr"):
        shaped[stream_name] = list(getattr(legacy_obj, stream_name, []) or [])
    return shaped


def serialize_job_logs(job: Job, *, limit: int = JOB_LOGS_DEFAULT_LIMIT) -> dict[str, list[str]]:
    """Return ``{stdout, stderr}`` in the shape the UI already parses.

    Reads joined ``JobLog`` rows first (newest-first, capped at ``limit``). Jobs
    created before the table existed and jobs written while
    ``JOB_LOG_PERSIST_ENABLED=False`` have no rows and fall back to the legacy
    ``jobs_job.logs`` JSON column so their UI log panel stays populated.

    :param job: Job whose logs are being serialized.
    :param limit: Maximum number of ``JobLog`` rows to fetch; non-positive
        values are treated as 0 (a negative slice bound would silently drop
        the newest rows instead of limiting the count).
    """
    # Clamp so that a negative limit cannot produce a wrong slice like [:-5].
    row_cap = max(limit, 0)
    entries = list(
        JobLog.objects.filter(job_id=job.pk)
        .only("created_at", "level", "message")
        .order_by("-created_at", "-pk")[:row_cap]
    )
    if entries:
        return {
            "stdout": [
                f"[{entry.created_at.strftime(JOB_LOG_TIMESTAMP_FORMAT)}] {entry.level} {entry.message}"
                for entry in entries
            ],
            # stderr mirrors only error-level lines, without the timestamp
            # prefix, matching the legacy JSON-field shape.
            "stderr": [entry.message for entry in entries if entry.level in JOB_LOG_LEVELS_STDERR],
        }

    # No rows: pre-migration job or persistence was disabled — serve legacy JSON.
    return _legacy_logs_shape(job)


class JobProjectNestedSerializer(DefaultSerializer):
class Meta:
Expand Down Expand Up @@ -49,7 +86,7 @@ class JobListSerializer(DefaultSerializer):
source_image_single = SourceImageNestedSerializer(read_only=True)
data_export = DataExportNestedSerializer(read_only=True)
progress = SchemaField(schema=JobProgress, read_only=True)
logs = SchemaField(schema=JobLogs, read_only=True)
logs = serializers.SerializerMethodField()
job_type = JobTypeSerializer(read_only=True)
# All jobs created from the Jobs UI are ML jobs (datasync, etc. are created for the user)
# @TODO Remove this when the UI is updated pass a job type. This should be a required field.
Expand Down Expand Up @@ -147,6 +184,16 @@ class Meta:
"dispatch_mode",
]

def get_logs(self, obj: Job) -> dict[str, list[str]]:
# List responses skip the JobLog query to avoid N+1 — the UI only renders
# logs on the detail page, so returning the (typically empty for new jobs)
# legacy JSON shape is acceptable. Detail responses go to the joined table
# and fall back to the legacy shape for pre-migration jobs.
view = self.context.get("view")
if getattr(view, "action", None) == "list":
return _legacy_logs_shape(obj)
return serialize_job_logs(obj)


class JobSerializer(JobListSerializer):
# progress = serializers.JSONField(initial=Job.default_progress(), allow_null=False, required=False)
Expand Down
Loading
Loading