From f21ec4d4d26797d86b5f1d8ebee33c3b58f53a61 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Tue, 7 Apr 2026 19:06:47 -0700 Subject: [PATCH 01/12] feat(exports): add machine prediction, verification, and detection fields Separate machine predictions from human identifications in exports and API, so researchers see both side-by-side. Previously the determination was overwritten when a human verified, losing the original ML prediction. Model layer: - Extract find_best_prediction() and find_best_identification() from update_occurrence_determination() for reuse by exports and API - Set determination_score to None for human-determined occurrences (ML confidence preserved in best_machine_prediction_score) New CSV export fields: - best_machine_prediction_name, _algorithm, _score - verified_by, verified_by_count, agreed_with_algorithm - determination_matches_machine_prediction - best_detection_bbox, best_detection_source_image_url, best_detection_occurrence_url API changes: - Add best_machine_prediction nested object to OccurrenceListSerializer (always populated regardless of verification status) Closes #1213 Co-Authored-By: Claude Opus 4.6 (1M context) --- ami/exports/format_types.py | 75 +++++++++++++-- ami/exports/tests.py | 185 +++++++++++++++++++++++++++++++++++- ami/main/api/serializers.py | 29 ++++++ ami/main/models.py | 104 +++++++++++++++++--- 4 files changed, 374 insertions(+), 19 deletions(-) diff --git a/ami/exports/format_types.py b/ami/exports/format_types.py index bc628a8ef..52c715efb 100644 --- a/ami/exports/format_types.py +++ b/ami/exports/format_types.py @@ -93,9 +93,24 @@ class OccurrenceTabularSerializer(serializers.ModelSerializer): determination_score = serializers.FloatField(allow_null=True) verification_status = serializers.SerializerMethodField() + # Machine prediction fields + best_machine_prediction_name = serializers.CharField(allow_null=True, default=None) + best_machine_prediction_algorithm = serializers.CharField(allow_null=True, default=None) + best_machine_prediction_score = serializers.FloatField(allow_null=True, default=None) + + # Verification fields + verified_by = serializers.SerializerMethodField() + verified_by_count = serializers.IntegerField(default=0) + agreed_with_algorithm = serializers.SerializerMethodField() + determination_matches_machine_prediction = serializers.SerializerMethodField() + + # Detection fields best_detection_url = serializers.SerializerMethodField() + best_detection_bbox = serializers.SerializerMethodField() best_detection_width = serializers.SerializerMethodField() best_detection_height = serializers.SerializerMethodField() + best_detection_source_image_url = serializers.SerializerMethodField() + best_detection_occurrence_url = serializers.SerializerMethodField() class Meta: model = Occurrence @@ -111,29 +126,59 @@ class Meta: "determination_name", "determination_score", "verification_status", + "best_machine_prediction_name", + "best_machine_prediction_algorithm", + "best_machine_prediction_score", + "verified_by", + "verified_by_count", + "agreed_with_algorithm", + "determination_matches_machine_prediction", "detections_count", "first_appearance_timestamp", "last_appearance_timestamp", "duration", "best_detection_url", + "best_detection_bbox", "best_detection_width", "best_detection_height", + "best_detection_source_image_url", + "best_detection_occurrence_url", ] def get_verification_status(self, obj): - """ - Returns 'Verified' if the occurrence has identifications, otherwise 'Not verified'. 
- """ + """Returns 'Verified' if the occurrence has non-withdrawn identifications.""" + count = getattr(obj, "verified_by_count", None) + if count is not None: + return "Verified" if count > 0 else "Not verified" return "Verified" if obj.identifications.exists() else "Not verified" + def get_verified_by(self, obj): + """Returns the name or email of the user who made the best identification.""" + name = getattr(obj, "verified_by_name", None) + if name: + return name + return getattr(obj, "verified_by_email", None) + + def get_agreed_with_algorithm(self, obj): + """Returns the algorithm name if the identifier explicitly agreed with an ML prediction.""" + return getattr(obj, "agreed_with_algorithm_name", None) + + def get_determination_matches_machine_prediction(self, obj): + """Returns whether the determination taxon matches the best machine prediction taxon.""" + prediction_taxon_id = getattr(obj, "best_machine_prediction_taxon_id", None) + if prediction_taxon_id is None or obj.determination_id is None: + return None + return obj.determination_id == prediction_taxon_id + def get_best_detection_url(self, obj): - """ - Returns the full URL to the cropped detection image. - Uses the annotated best_detection_path from the queryset. - """ + """Returns the full URL to the cropped detection image.""" path = getattr(obj, "best_detection_path", None) return get_media_url(path) if path else None + def get_best_detection_bbox(self, obj): + """Returns the raw bounding box coordinates [x1, y1, x2, y2].""" + return getattr(obj, "best_detection_bbox", None) + def get_best_detection_width(self, obj): """Returns the width of the detection bounding box.""" bbox = BoundingBox.from_coords(getattr(obj, "best_detection_bbox", None), raise_on_error=False) @@ -144,6 +189,20 @@ def get_best_detection_height(self, obj): bbox = BoundingBox.from_coords(getattr(obj, "best_detection_bbox", None), raise_on_error=False) return bbox.height if bbox else None + def get_best_detection_source_image_url(self, obj): + """Returns the public URL to the original source image.""" + path = getattr(obj, "best_detection_source_image_path", None) + base_url = getattr(obj, "best_detection_source_image_public_base_url", None) + if path and base_url: + import urllib.parse + + return urllib.parse.urljoin(base_url, path.lstrip("/")) + return None + + def get_best_detection_occurrence_url(self, obj): + """Returns the platform UI link to the occurrence in context.""" + return obj.context_url() + class CSVExporter(BaseExporter): """Handles CSV export of occurrences.""" @@ -165,6 +224,8 @@ def get_queryset(self): .with_detections_count() .with_identifications() .with_best_detection() # type: ignore[union-attr] Custom queryset method + .with_best_machine_prediction() + .with_verification_info() ) def export(self): diff --git a/ami/exports/tests.py b/ami/exports/tests.py index 6dcd73915..edb1bf605 100644 --- a/ami/exports/tests.py +++ b/ami/exports/tests.py @@ -1,4 +1,5 @@ import csv +import datetime import json import logging @@ -8,7 +9,8 @@ from rest_framework.test import APIClient from ami.exports.models import DataExport -from ami.main.models import Occurrence, SourceImageCollection +from ami.main.models import Detection, Identification, Occurrence, SourceImageCollection, Taxon +from ami.ml.models import Algorithm from ami.tests.fixtures.main import ( create_captures, create_occurrences, @@ -302,3 +304,184 @@ def test_non_member_cannot_create_export(self): self.non_member.has_perm(Project.Permissions.CREATE_DATA_EXPORT, self.project), 
"Non-member should not have create_dataexport permission", ) + + +class ExportNewFieldsTest(TestCase): + """Test the new machine prediction, verification, and detection fields in CSV exports.""" + + def setUp(self): + self.project, self.deployment = setup_test_project(reuse=False) + self.user = self.project.owner + self.client = APIClient() + self.client.force_authenticate(user=self.user) + + create_captures(deployment=self.deployment, num_nights=1, images_per_night=4, interval_minutes=1) + group_images_into_events(self.deployment) + create_taxa(self.project) + + # Create an algorithm for classifications + self.algorithm, _ = Algorithm.objects.get_or_create( + name="test-classifier", + defaults={"key": "test-classifier"}, + ) + + # Create a second taxon for disagreement tests + self.taxa = list(Taxon.objects.filter(projects=self.project)[:2]) + self.taxon_a = self.taxa[0] + if len(self.taxa) > 1: + self.taxon_b = self.taxa[1] + else: + self.taxon_b = Taxon.objects.create(name="Test Taxon B") + self.taxon_b.projects.add(self.project) + + def _create_occurrence_with_prediction(self, taxon=None, score=0.85): + """Create an occurrence with a single detection and ML classification.""" + taxon = taxon or self.taxon_a + source_image = self.project.captures.first() + detection = Detection.objects.create( + source_image=source_image, + timestamp=source_image.timestamp, + bbox=[0.1, 0.1, 0.5, 0.5], + path="detections/test.jpg", + ) + classification = detection.classifications.create( + taxon=taxon, + score=score, + timestamp=datetime.datetime.now(), + algorithm=self.algorithm, + terminal=True, + ) + occurrence = detection.associate_new_occurrence() + return occurrence, classification + + def _run_csv_export(self): + """Run a CSV export and return the rows as a list of dicts.""" + data_export = DataExport.objects.create( + user=self.user, + project=self.project, + format="occurrences_simple_csv", + job=None, + ) + file_url = data_export.run_export() + self.assertIsNotNone(file_url) + file_path = file_url.replace("/media/", "") + with default_storage.open(file_path, "r") as f: + rows = list(csv.DictReader(f)) + default_storage.delete(file_path) + return rows + + def test_ml_prediction_only(self): + """Occurrence with only ML prediction: machine prediction fields populated, verified_by null.""" + occurrence, classification = self._create_occurrence_with_prediction() + rows = self._run_csv_export() + + row = next(r for r in rows if int(r["id"]) == occurrence.pk) + self.assertEqual(row["best_machine_prediction_name"], self.taxon_a.name) + self.assertEqual(row["best_machine_prediction_algorithm"], "test-classifier") + self.assertAlmostEqual(float(row["best_machine_prediction_score"]), 0.85, places=2) + self.assertEqual(row["verified_by"], "") + self.assertEqual(row["verified_by_count"], "0") + + def test_ml_prediction_with_agreeing_human(self): + """Human agrees with ML: verified_by set, determination_matches = True, determination_score = None.""" + occurrence, classification = self._create_occurrence_with_prediction() + + # Human agrees with the same taxon + Identification.objects.create( + user=self.user, + taxon=self.taxon_a, + occurrence=occurrence, + agreed_with_prediction=classification, + ) + + rows = self._run_csv_export() + row = next(r for r in rows if int(r["id"]) == occurrence.pk) + + # Machine prediction fields still populated + self.assertEqual(row["best_machine_prediction_name"], self.taxon_a.name) + self.assertAlmostEqual(float(row["best_machine_prediction_score"]), 0.85, places=2) + + # 
Verification fields + verified_by = row["verified_by"] + self.assertTrue(verified_by, "verified_by should not be empty") + self.assertEqual(row["verified_by_count"], "1") + self.assertEqual(row["agreed_with_algorithm"], "test-classifier") + self.assertEqual(row["determination_matches_machine_prediction"], "True") + + # determination_score should be empty/None for human-determined occurrences + self.assertIn(row["determination_score"], ["", "None", None]) + + def test_ml_prediction_with_disagreeing_human(self): + """Human disagrees with ML: different determination, determination_matches = False.""" + occurrence, classification = self._create_occurrence_with_prediction(taxon=self.taxon_a) + + # Human identifies as a different taxon + Identification.objects.create( + user=self.user, + taxon=self.taxon_b, + occurrence=occurrence, + ) + + rows = self._run_csv_export() + row = next(r for r in rows if int(r["id"]) == occurrence.pk) + + # Machine prediction still shows original + self.assertEqual(row["best_machine_prediction_name"], self.taxon_a.name) + # Determination is now the human's choice + self.assertEqual(row["determination_name"], self.taxon_b.name) + self.assertEqual(row["determination_matches_machine_prediction"], "False") + self.assertEqual(row["agreed_with_algorithm"], "") + + def test_multiple_identifications_count(self): + """Multiple identifications: verified_by_count reflects all non-withdrawn IDs.""" + occurrence, _ = self._create_occurrence_with_prediction() + + from ami.users.models import User + + user2 = User.objects.create_user(email="verifier2@test.org") + + Identification.objects.create(user=self.user, taxon=self.taxon_a, occurrence=occurrence) + Identification.objects.create(user=user2, taxon=self.taxon_a, occurrence=occurrence) + + rows = self._run_csv_export() + row = next(r for r in rows if int(r["id"]) == occurrence.pk) + self.assertEqual(row["verified_by_count"], "2") + + def test_detection_bbox_field(self): + """Best detection bbox is included in export.""" + occurrence, _ = self._create_occurrence_with_prediction() + rows = self._run_csv_export() + row = next(r for r in rows if int(r["id"]) == occurrence.pk) + self.assertIn("best_detection_bbox", row) + # bbox should be a string representation of the list + self.assertIn("0.1", row["best_detection_bbox"]) + + def test_csv_has_all_new_fields(self): + """All new fields are present as CSV column headers.""" + self._create_occurrence_with_prediction() + rows = self._run_csv_export() + self.assertGreater(len(rows), 0) + headers = rows[0].keys() + expected_fields = [ + "best_machine_prediction_name", + "best_machine_prediction_algorithm", + "best_machine_prediction_score", + "verified_by", + "verified_by_count", + "agreed_with_algorithm", + "determination_matches_machine_prediction", + "best_detection_bbox", + "best_detection_source_image_url", + "best_detection_occurrence_url", + ] + for field in expected_fields: + self.assertIn(field, headers, f"Missing CSV field: {field}") + + def test_occurrence_url_field(self): + """best_detection_occurrence_url contains a valid platform link.""" + occurrence, _ = self._create_occurrence_with_prediction() + rows = self._run_csv_export() + row = next(r for r in rows if int(r["id"]) == occurrence.pk) + url = row.get("best_detection_occurrence_url", "") + if url: + self.assertIn(str(occurrence.pk), url) diff --git a/ami/main/api/serializers.py b/ami/main/api/serializers.py index 0caa7b3e6..e3c65eeea 100644 --- a/ami/main/api/serializers.py +++ b/ami/main/api/serializers.py @@ -1319,6 
+1319,7 @@ class OccurrenceListSerializer(DefaultSerializer): event = EventNestedSerializer(read_only=True) # first_appearance = TaxonSourceImageNestedSerializer(read_only=True) determination_details = serializers.SerializerMethodField() + best_machine_prediction = serializers.SerializerMethodField() identifications = OccurrenceIdentificationSerializer(many=True, read_only=True) def get_permissions(self, instance, instance_data): @@ -1357,6 +1358,7 @@ class Meta: "detection_images", "determination_score", "determination_details", + "best_machine_prediction", "identifications", "created_at", "updated_at", @@ -1391,6 +1393,33 @@ def get_determination_details(self, obj: Occurrence): score=obj.determination_score, ) + def get_best_machine_prediction(self, obj: Occurrence): + """Always return the best machine prediction, regardless of human verification status.""" + context = self.context + context["occurrence"] = obj + + prediction = obj.find_best_prediction() + if not prediction: + return None + + taxon_data = TaxonNestedSerializer(prediction.taxon, context=context).data if prediction.taxon else None + algorithm_data = None + if prediction.algorithm: + from ami.ml.serializers import AlgorithmNestedSerializer + + algorithm_data = AlgorithmNestedSerializer(prediction.algorithm, context=context).data + + determination_matches = None + if obj.determination_id and prediction.taxon_id: + determination_matches = obj.determination_id == prediction.taxon_id + + return dict( + taxon=taxon_data, + algorithm=algorithm_data, + score=prediction.score, + determination_matches_machine_prediction=determination_matches, + ) + class OccurrenceSerializer(OccurrenceListSerializer): determination = CaptureTaxonSerializer(read_only=True) diff --git a/ami/main/models.py b/ami/main/models.py index 2dc5e13bd..51e417204 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -2780,6 +2780,8 @@ def with_best_detection(self): Adds the following annotations: - best_detection_path: The path to the detection image - best_detection_bbox: The bounding box of the detection as a list [x1, y1, x2, y2] + - best_detection_source_image_path: The path of the source image + - best_detection_source_image_public_base_url: The public base URL of the source image """ # Subquery to get the path of the best detection # Use id as secondary sort to ensure deterministic results @@ -2797,9 +2799,75 @@ def with_best_detection(self): .values("bbox")[:1] ) + # Subquery to get the source image path and public_base_url for the best detection + best_detection_source_image_path_subquery = ( + Detection.objects.filter(occurrence=OuterRef("pk")) + .order_by("-classifications__score", "id") + .values("source_image__path")[:1] + ) + best_detection_source_image_public_base_url_subquery = ( + Detection.objects.filter(occurrence=OuterRef("pk")) + .order_by("-classifications__score", "id") + .values("source_image__public_base_url")[:1] + ) + return self.annotate( best_detection_path=models.Subquery(best_detection_path_subquery), best_detection_bbox=models.Subquery(best_detection_bbox_subquery), + best_detection_source_image_path=models.Subquery(best_detection_source_image_path_subquery), + best_detection_source_image_public_base_url=models.Subquery( + best_detection_source_image_public_base_url_subquery + ), + ) + + def with_best_machine_prediction(self): + """ + Annotate the queryset with fields from the best machine prediction. + + The best prediction is the highest-scoring classification, preferring terminal ones. 
+ Same ordering as Occurrence.find_best_prediction(): -terminal, -score. + + Adds the following annotations: + - best_machine_prediction_name: The taxon name of the best prediction + - best_machine_prediction_score: The confidence score + - best_machine_prediction_algorithm: The algorithm name + - best_machine_prediction_taxon_id: The taxon ID (for determination_matches comparison) + """ + best_prediction_subquery = Classification.objects.filter(detection__occurrence=OuterRef("pk")).order_by( + "-terminal", "-score" + ) + + return self.annotate( + best_machine_prediction_name=models.Subquery(best_prediction_subquery.values("taxon__name")[:1]), + best_machine_prediction_score=models.Subquery(best_prediction_subquery.values("score")[:1]), + best_machine_prediction_algorithm=models.Subquery(best_prediction_subquery.values("algorithm__name")[:1]), + best_machine_prediction_taxon_id=models.Subquery(best_prediction_subquery.values("taxon_id")[:1]), + ) + + def with_verification_info(self): + """ + Annotate the queryset with verification/identification fields. + + Adds the following annotations: + - verified_by_name: The name of the user who made the best identification + - verified_by_email: The email of the user (fallback if name is empty) + - verified_by_count: The count of non-withdrawn identifications + - agreed_with_algorithm_name: The algorithm name the identifier agreed with + """ + best_identification_subquery = Identification.objects.filter( + occurrence=OuterRef("pk"), withdrawn=False + ).order_by("-created_at") + + return self.annotate( + verified_by_name=models.Subquery(best_identification_subquery.values("user__name")[:1]), + verified_by_email=models.Subquery(best_identification_subquery.values("user__email")[:1]), + verified_by_count=models.Count( + "identifications", + filter=Q(identifications__withdrawn=False), + ), + agreed_with_algorithm_name=models.Subquery( + best_identification_subquery.values("agreed_with_prediction__algorithm__name")[:1] + ), ) def unique_taxa(self, project: Project | None = None): @@ -2993,10 +3061,9 @@ def detection_images(self, limit=None): def best_detection(self): return Detection.objects.filter(occurrence=self).order_by("-classifications__score").first() - @functools.cached_property - def best_prediction(self): + def find_best_prediction(self) -> "Classification | None": """ - Use the best prediction as the best identification if there are no human identifications. + Find the best machine prediction for this occurrence. Uses the highest scoring classification (from any algorithm) as the best prediction. Considers terminal classifications first, then non-terminal ones. @@ -3004,20 +3071,35 @@ def best_prediction(self): """ return self.predictions().order_by("-terminal", "-score").first() - @functools.cached_property - def best_identification(self): + def find_best_identification(self) -> "Identification | None": """ - The most recent human identification is used as the best identification. + Find the best human identification for this occurrence. + + The most recent non-withdrawn identification is used as the best identification. @TODO this could use a confidence level chosen manually by the users/experts. 
""" return Identification.objects.filter(occurrence=self, withdrawn=False).order_by("-created_at").first() + @functools.cached_property + def best_prediction(self): + return self.find_best_prediction() + + @functools.cached_property + def best_identification(self): + return self.find_best_identification() + def get_determination_score(self) -> float | None: + """ + Return the determination score for this occurrence. + + Human identifications return None (score of 1.0 is meaningless). + Machine predictions return the classification confidence score. + """ if not self.determination: return None elif self.best_identification: - return self.best_identification.score + return None # Human ID score (1.0) is not meaningful elif self.best_prediction: return self.best_prediction.score else: @@ -3125,12 +3207,12 @@ def update_occurrence_determination( new_determination = None new_score = None - top_identification = occurrence.best_identification + top_identification = occurrence.find_best_identification() if top_identification and top_identification.taxon and top_identification.taxon != current_determination: new_determination = top_identification.taxon - new_score = top_identification.score + new_score = None # Human ID score is not meaningful for determination_score elif not top_identification: - top_prediction = occurrence.best_prediction + top_prediction = occurrence.find_best_prediction() if top_prediction and top_prediction.taxon and top_prediction.taxon != current_determination: new_determination = top_prediction.taxon new_score = top_prediction.score @@ -3140,7 +3222,7 @@ def update_occurrence_determination( occurrence.determination = new_determination needs_update = True - if new_score and new_score != occurrence.determination_score: + if new_determination and new_score != occurrence.determination_score: logger.debug(f"Changing det. 
score of {occurrence} from {occurrence.determination_score} to {new_score}") occurrence.determination_score = new_score needs_update = True From d6853f53bfce01781a21fe1ea835ef7a9206dbbf Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Tue, 7 Apr 2026 19:22:50 -0700 Subject: [PATCH 02/12] fix: address code review feedback - Add distinct=True to verified_by_count to prevent join multiplication - Fix update_occurrence_determination to recompute score even when taxon stays the same (handles authority flip without taxon change) - Remove email fallback in verified_by (PII concern, name-only) - Filter withdrawn identifications in verification_status fallback - Use obj.best_prediction cached property instead of find_best_prediction() in API serializer to avoid N+1 queries - Build occurrence URL from annotated fields instead of context_url() to avoid N+1 queries in export - Use source_image.timestamp instead of datetime.now() in tests Co-Authored-By: Claude Opus 4.6 (1M context) --- ami/exports/format_types.py | 19 ++++++++++++------- ami/exports/tests.py | 5 +++-- ami/main/api/serializers.py | 2 +- ami/main/models.py | 24 +++++++++++++++++++----- 4 files changed, 35 insertions(+), 15 deletions(-) diff --git a/ami/exports/format_types.py b/ami/exports/format_types.py index 52c715efb..5fb289215 100644 --- a/ami/exports/format_types.py +++ b/ami/exports/format_types.py @@ -150,14 +150,11 @@ def get_verification_status(self, obj): count = getattr(obj, "verified_by_count", None) if count is not None: return "Verified" if count > 0 else "Not verified" - return "Verified" if obj.identifications.exists() else "Not verified" + return "Verified" if obj.identifications.filter(withdrawn=False).exists() else "Not verified" def get_verified_by(self, obj): - """Returns the name or email of the user who made the best identification.""" - name = getattr(obj, "verified_by_name", None) - if name: - return name - return getattr(obj, "verified_by_email", None) + """Returns the display name of the user who made the best identification.""" + return getattr(obj, "verified_by_name", None) def get_agreed_with_algorithm(self, obj): """Returns the algorithm name if the identifier explicitly agreed with an ML prediction.""" @@ -201,7 +198,15 @@ def get_best_detection_source_image_url(self, obj): def get_best_detection_occurrence_url(self, obj): """Returns the platform UI link to the occurrence in context.""" - return obj.context_url() + event_id = getattr(obj, "best_detection_event_id", None) + source_image_id = getattr(obj, "best_detection_source_image_id", None) + if event_id and source_image_id: + # @TODO use settings for base URL instead of hardcoding + return ( + f"https://app.preview.insectai.org/sessions/{event_id}" + f"?capture={source_image_id}&occurrence={obj.pk}" + ) + return None class CSVExporter(BaseExporter): diff --git a/ami/exports/tests.py b/ami/exports/tests.py index edb1bf605..df0f33f46 100644 --- a/ami/exports/tests.py +++ b/ami/exports/tests.py @@ -1,5 +1,4 @@ import csv -import datetime import json import logging @@ -312,6 +311,8 @@ class ExportNewFieldsTest(TestCase): def setUp(self): self.project, self.deployment = setup_test_project(reuse=False) self.user = self.project.owner + self.user.name = "Test Verifier" + self.user.save() self.client = APIClient() self.client.force_authenticate(user=self.user) @@ -347,7 +348,7 @@ def _create_occurrence_with_prediction(self, taxon=None, score=0.85): classification = detection.classifications.create( taxon=taxon, score=score, - 
timestamp=datetime.datetime.now(), + timestamp=source_image.timestamp, algorithm=self.algorithm, terminal=True, ) diff --git a/ami/main/api/serializers.py b/ami/main/api/serializers.py index e3c65eeea..a189fad38 100644 --- a/ami/main/api/serializers.py +++ b/ami/main/api/serializers.py @@ -1398,7 +1398,7 @@ def get_best_machine_prediction(self, obj: Occurrence): context = self.context context["occurrence"] = obj - prediction = obj.find_best_prediction() + prediction = obj.best_prediction if not prediction: return None diff --git a/ami/main/models.py b/ami/main/models.py index 51e417204..8166e3579 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -2811,6 +2811,18 @@ def with_best_detection(self): .values("source_image__public_base_url")[:1] ) + # Subquery to get source_image_id and event_id for building the occurrence URL + best_detection_source_image_id_subquery = ( + Detection.objects.filter(occurrence=OuterRef("pk")) + .order_by("-classifications__score", "id") + .values("source_image_id")[:1] + ) + best_detection_event_id_subquery = ( + Detection.objects.filter(occurrence=OuterRef("pk")) + .order_by("-classifications__score", "id") + .values("source_image__event_id")[:1] + ) + return self.annotate( best_detection_path=models.Subquery(best_detection_path_subquery), best_detection_bbox=models.Subquery(best_detection_bbox_subquery), @@ -2818,6 +2830,8 @@ def with_best_detection(self): best_detection_source_image_public_base_url=models.Subquery( best_detection_source_image_public_base_url_subquery ), + best_detection_source_image_id=models.Subquery(best_detection_source_image_id_subquery), + best_detection_event_id=models.Subquery(best_detection_event_id_subquery), ) def with_best_machine_prediction(self): @@ -2860,10 +2874,10 @@ def with_verification_info(self): return self.annotate( verified_by_name=models.Subquery(best_identification_subquery.values("user__name")[:1]), - verified_by_email=models.Subquery(best_identification_subquery.values("user__email")[:1]), verified_by_count=models.Count( "identifications", filter=Q(identifications__withdrawn=False), + distinct=True, ), agreed_with_algorithm_name=models.Subquery( best_identification_subquery.values("agreed_with_prediction__algorithm__name")[:1] @@ -3208,12 +3222,12 @@ def update_occurrence_determination( new_score = None top_identification = occurrence.find_best_identification() - if top_identification and top_identification.taxon and top_identification.taxon != current_determination: + if top_identification and top_identification.taxon: new_determination = top_identification.taxon new_score = None # Human ID score is not meaningful for determination_score - elif not top_identification: + else: top_prediction = occurrence.find_best_prediction() - if top_prediction and top_prediction.taxon and top_prediction.taxon != current_determination: + if top_prediction and top_prediction.taxon: new_determination = top_prediction.taxon new_score = top_prediction.score @@ -3222,7 +3236,7 @@ def update_occurrence_determination( occurrence.determination = new_determination needs_update = True - if new_determination and new_score != occurrence.determination_score: + if new_score != occurrence.determination_score: logger.debug(f"Changing det. 
score of {occurrence} from {occurrence.determination_score} to {new_score}") occurrence.determination_score = new_score needs_update = True From 66d9390c2d7b84ba662d280a98797f51f1a09cc6 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 13 Apr 2026 15:55:53 -0700 Subject: [PATCH 03/12] =?UTF-8?q?fix:=20address=20reviewer=20feedback=20?= =?UTF-8?q?=E2=80=94=20rename=20verified=5Fby=5Fcount,=20remove=20occurren?= =?UTF-8?q?ce=20URL?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per @annavik and @mihow's review: - Rename `verified_by_count` to `participant_count` (counts distinct users, not total IDs) - Remove `best_detection_occurrence_url` (URLs not stable enough to distribute in exports yet) - Clean up unused annotations (event_id, source_image_id) from with_best_detection() Co-Authored-By: Claude Opus 4.6 (1M context) --- ami/exports/format_types.py | 20 +++----------------- ami/exports/tests.py | 18 ++++-------------- ami/main/models.py | 21 +++------------------ 3 files changed, 10 insertions(+), 49 deletions(-) diff --git a/ami/exports/format_types.py b/ami/exports/format_types.py index 5fb289215..c1d75f745 100644 --- a/ami/exports/format_types.py +++ b/ami/exports/format_types.py @@ -100,7 +100,7 @@ class OccurrenceTabularSerializer(serializers.ModelSerializer): # Verification fields verified_by = serializers.SerializerMethodField() - verified_by_count = serializers.IntegerField(default=0) + participant_count = serializers.IntegerField(default=0) agreed_with_algorithm = serializers.SerializerMethodField() determination_matches_machine_prediction = serializers.SerializerMethodField() @@ -110,7 +110,6 @@ class OccurrenceTabularSerializer(serializers.ModelSerializer): best_detection_width = serializers.SerializerMethodField() best_detection_height = serializers.SerializerMethodField() best_detection_source_image_url = serializers.SerializerMethodField() - best_detection_occurrence_url = serializers.SerializerMethodField() class Meta: model = Occurrence @@ -130,7 +129,7 @@ class Meta: "best_machine_prediction_algorithm", "best_machine_prediction_score", "verified_by", - "verified_by_count", + "participant_count", "agreed_with_algorithm", "determination_matches_machine_prediction", "detections_count", @@ -142,12 +141,11 @@ class Meta: "best_detection_width", "best_detection_height", "best_detection_source_image_url", - "best_detection_occurrence_url", ] def get_verification_status(self, obj): """Returns 'Verified' if the occurrence has non-withdrawn identifications.""" - count = getattr(obj, "verified_by_count", None) + count = getattr(obj, "participant_count", None) if count is not None: return "Verified" if count > 0 else "Not verified" return "Verified" if obj.identifications.filter(withdrawn=False).exists() else "Not verified" @@ -196,18 +194,6 @@ def get_best_detection_source_image_url(self, obj): return urllib.parse.urljoin(base_url, path.lstrip("/")) return None - def get_best_detection_occurrence_url(self, obj): - """Returns the platform UI link to the occurrence in context.""" - event_id = getattr(obj, "best_detection_event_id", None) - source_image_id = getattr(obj, "best_detection_source_image_id", None) - if event_id and source_image_id: - # @TODO use settings for base URL instead of hardcoding - return ( - f"https://app.preview.insectai.org/sessions/{event_id}" - f"?capture={source_image_id}&occurrence={obj.pk}" - ) - return None - class CSVExporter(BaseExporter): """Handles CSV export of occurrences.""" diff --git 
a/ami/exports/tests.py b/ami/exports/tests.py index df0f33f46..d85d31c89 100644 --- a/ami/exports/tests.py +++ b/ami/exports/tests.py @@ -381,7 +381,7 @@ def test_ml_prediction_only(self): self.assertEqual(row["best_machine_prediction_algorithm"], "test-classifier") self.assertAlmostEqual(float(row["best_machine_prediction_score"]), 0.85, places=2) self.assertEqual(row["verified_by"], "") - self.assertEqual(row["verified_by_count"], "0") + self.assertEqual(row["participant_count"], "0") def test_ml_prediction_with_agreeing_human(self): """Human agrees with ML: verified_by set, determination_matches = True, determination_score = None.""" @@ -405,7 +405,7 @@ def test_ml_prediction_with_agreeing_human(self): # Verification fields verified_by = row["verified_by"] self.assertTrue(verified_by, "verified_by should not be empty") - self.assertEqual(row["verified_by_count"], "1") + self.assertEqual(row["participant_count"], "1") self.assertEqual(row["agreed_with_algorithm"], "test-classifier") self.assertEqual(row["determination_matches_machine_prediction"], "True") @@ -446,7 +446,7 @@ def test_multiple_identifications_count(self): rows = self._run_csv_export() row = next(r for r in rows if int(r["id"]) == occurrence.pk) - self.assertEqual(row["verified_by_count"], "2") + self.assertEqual(row["participant_count"], "2") def test_detection_bbox_field(self): """Best detection bbox is included in export.""" @@ -468,21 +468,11 @@ def test_csv_has_all_new_fields(self): "best_machine_prediction_algorithm", "best_machine_prediction_score", "verified_by", - "verified_by_count", + "participant_count", "agreed_with_algorithm", "determination_matches_machine_prediction", "best_detection_bbox", "best_detection_source_image_url", - "best_detection_occurrence_url", ] for field in expected_fields: self.assertIn(field, headers, f"Missing CSV field: {field}") - - def test_occurrence_url_field(self): - """best_detection_occurrence_url contains a valid platform link.""" - occurrence, _ = self._create_occurrence_with_prediction() - rows = self._run_csv_export() - row = next(r for r in rows if int(r["id"]) == occurrence.pk) - url = row.get("best_detection_occurrence_url", "") - if url: - self.assertIn(str(occurrence.pk), url) diff --git a/ami/main/models.py b/ami/main/models.py index 8166e3579..dfb439e07 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -2811,18 +2811,6 @@ def with_best_detection(self): .values("source_image__public_base_url")[:1] ) - # Subquery to get source_image_id and event_id for building the occurrence URL - best_detection_source_image_id_subquery = ( - Detection.objects.filter(occurrence=OuterRef("pk")) - .order_by("-classifications__score", "id") - .values("source_image_id")[:1] - ) - best_detection_event_id_subquery = ( - Detection.objects.filter(occurrence=OuterRef("pk")) - .order_by("-classifications__score", "id") - .values("source_image__event_id")[:1] - ) - return self.annotate( best_detection_path=models.Subquery(best_detection_path_subquery), best_detection_bbox=models.Subquery(best_detection_bbox_subquery), @@ -2830,8 +2818,6 @@ def with_best_detection(self): best_detection_source_image_public_base_url=models.Subquery( best_detection_source_image_public_base_url_subquery ), - best_detection_source_image_id=models.Subquery(best_detection_source_image_id_subquery), - best_detection_event_id=models.Subquery(best_detection_event_id_subquery), ) def with_best_machine_prediction(self): @@ -2864,8 +2850,7 @@ def with_verification_info(self): Adds the following annotations: - 
verified_by_name: The name of the user who made the best identification - - verified_by_email: The email of the user (fallback if name is empty) - - verified_by_count: The count of non-withdrawn identifications + - participant_count: The count of distinct users who made non-withdrawn identifications - agreed_with_algorithm_name: The algorithm name the identifier agreed with """ best_identification_subquery = Identification.objects.filter( @@ -2874,8 +2859,8 @@ def with_verification_info(self): return self.annotate( verified_by_name=models.Subquery(best_identification_subquery.values("user__name")[:1]), - verified_by_count=models.Count( - "identifications", + participant_count=models.Count( + "identifications__user", filter=Q(identifications__withdrawn=False), distinct=True, ), From 7aa6173e09eee03db22a16c0a83adf76aa814499 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 13 Apr 2026 16:18:48 -0700 Subject: [PATCH 04/12] feat(exports): add agreed_with_user column Exposes the email of the prior human identifier when the best identification was explicitly an "Agree" with another user. Lets consumers trace agreement chains without the backend having to resolve transitivity. Co-Authored-By: Claude --- ami/exports/format_types.py | 6 ++++++ ami/exports/tests.py | 29 +++++++++++++++++++++++++++++ ami/main/models.py | 4 ++++ 3 files changed, 39 insertions(+) diff --git a/ami/exports/format_types.py b/ami/exports/format_types.py index c1d75f745..90e1e0a15 100644 --- a/ami/exports/format_types.py +++ b/ami/exports/format_types.py @@ -102,6 +102,7 @@ class OccurrenceTabularSerializer(serializers.ModelSerializer): verified_by = serializers.SerializerMethodField() participant_count = serializers.IntegerField(default=0) agreed_with_algorithm = serializers.SerializerMethodField() + agreed_with_user = serializers.SerializerMethodField() determination_matches_machine_prediction = serializers.SerializerMethodField() # Detection fields @@ -131,6 +132,7 @@ class Meta: "verified_by", "participant_count", "agreed_with_algorithm", + "agreed_with_user", "determination_matches_machine_prediction", "detections_count", "first_appearance_timestamp", @@ -158,6 +160,10 @@ def get_agreed_with_algorithm(self, obj): """Returns the algorithm name if the identifier explicitly agreed with an ML prediction.""" return getattr(obj, "agreed_with_algorithm_name", None) + def get_agreed_with_user(self, obj): + """Returns the email of the prior identifier the best identification explicitly agreed with.""" + return getattr(obj, "agreed_with_user_email", None) + def get_determination_matches_machine_prediction(self, obj): """Returns whether the determination taxon matches the best machine prediction taxon.""" prediction_taxon_id = getattr(obj, "best_machine_prediction_taxon_id", None) diff --git a/ami/exports/tests.py b/ami/exports/tests.py index d85d31c89..785db08e1 100644 --- a/ami/exports/tests.py +++ b/ami/exports/tests.py @@ -433,6 +433,34 @@ def test_ml_prediction_with_disagreeing_human(self): self.assertEqual(row["determination_matches_machine_prediction"], "False") self.assertEqual(row["agreed_with_algorithm"], "") + def test_human_agrees_with_another_human(self): + """User B agrees with user A's identification: agreed_with_user exposes A's email.""" + from ami.users.models import User + + user_a = User.objects.create_user(email="user-a@test.org") + user_b = User.objects.create_user(email="user-b@test.org") + + occurrence, _ = self._create_occurrence_with_prediction() + + id_a = Identification.objects.create( + 
user=user_a, + taxon=self.taxon_b, + occurrence=occurrence, + ) + Identification.objects.create( + user=user_b, + taxon=self.taxon_b, + occurrence=occurrence, + agreed_with_identification=id_a, + ) + + rows = self._run_csv_export() + row = next(r for r in rows if int(r["id"]) == occurrence.pk) + + self.assertEqual(row["agreed_with_user"], "user-a@test.org") + # Not agreeing with an ML prediction + self.assertEqual(row["agreed_with_algorithm"], "") + def test_multiple_identifications_count(self): """Multiple identifications: verified_by_count reflects all non-withdrawn IDs.""" occurrence, _ = self._create_occurrence_with_prediction() @@ -470,6 +498,7 @@ def test_csv_has_all_new_fields(self): "verified_by", "participant_count", "agreed_with_algorithm", + "agreed_with_user", "determination_matches_machine_prediction", "best_detection_bbox", "best_detection_source_image_url", diff --git a/ami/main/models.py b/ami/main/models.py index dfb439e07..4e249ee93 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -2852,6 +2852,7 @@ def with_verification_info(self): - verified_by_name: The name of the user who made the best identification - participant_count: The count of distinct users who made non-withdrawn identifications - agreed_with_algorithm_name: The algorithm name the identifier agreed with + - agreed_with_user_email: The email of the prior identifier the best identification agreed with """ best_identification_subquery = Identification.objects.filter( occurrence=OuterRef("pk"), withdrawn=False @@ -2867,6 +2868,9 @@ def with_verification_info(self): agreed_with_algorithm_name=models.Subquery( best_identification_subquery.values("agreed_with_prediction__algorithm__name")[:1] ), + agreed_with_user_email=models.Subquery( + best_identification_subquery.values("agreed_with_identification__user__email")[:1] + ), ) def unique_taxa(self, project: Project | None = None): From bb46a2876d07975d7ce2f03e974c8178f26c2894 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 13 Apr 2026 18:00:26 -0700 Subject: [PATCH 05/12] feat(main): backfill command + fix legacy determination_score handling - Add `backfill_determination_score` management command to null-out determination_score on legacy human-determined occurrences. Supports --dry-run and --project filtering. - Tighten the legacy score-backfill block in Occurrence.save() to skip human-determined occurrences (avoids a redundant query and suppresses the misleading "Could not determine score" warning). - Fix type mismatch in update_occurrence_determination(): when the caller didn't pass current_determination, the fallback fetched the FK id (int) and compared it against a Taxon instance, which was always unequal and caused unnecessary save() calls and misleading debug logs. Normalize current_determination to an id for the comparison and accept int|Taxon|None. Tests: BackfillDeterminationScoreTest covers the ML-only, human-determined, and withdrawn-identification cases. 
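Example invocation (a sketch: preview with --dry-run first, then apply; the
project id is illustrative, assuming the standard Django manage.py entry point):

    python manage.py backfill_determination_score --dry-run --project 42
    python manage.py backfill_determination_score --project 42
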
Co-Authored-By: Claude --- .../commands/backfill_determination_score.py | 47 ++++++++++++ ami/main/models.py | 35 ++++----- ami/main/tests.py | 73 +++++++++++++++++++ 3 files changed, 138 insertions(+), 17 deletions(-) create mode 100644 ami/main/management/commands/backfill_determination_score.py diff --git a/ami/main/management/commands/backfill_determination_score.py b/ami/main/management/commands/backfill_determination_score.py new file mode 100644 index 000000000..fb12bf6d8 --- /dev/null +++ b/ami/main/management/commands/backfill_determination_score.py @@ -0,0 +1,47 @@ +import logging + +from django.core.management.base import BaseCommand +from django.db.models import Exists, OuterRef + +from ...models import Identification, Occurrence + +logger = logging.getLogger(__name__) + + +def backfill_human_determination_scores(dry_run: bool = True, project_id: int | None = None) -> str: + """Set determination_score=None for occurrences determined by a human identification. + + A human-determined occurrence is one with at least one non-withdrawn Identification. + The ML confidence that previously lived in determination_score is preserved in + best_machine_prediction_score (available via the with_best_machine_prediction + queryset annotation). + """ + has_identification = Identification.objects.filter( + occurrence=OuterRef("pk"), + withdrawn=False, + ) + qs = Occurrence.objects.filter(determination_score__isnull=False).filter(Exists(has_identification)) + if project_id is not None: + qs = qs.filter(project_id=project_id) + + if dry_run: + count = qs.count() + return f"Would clear determination_score on {count} human-determined occurrences" + + updated = qs.update(determination_score=None) + return f"Cleared determination_score on {updated} human-determined occurrences" + + +class Command(BaseCommand): + help = "Backfill determination_score=None on occurrences determined by a human identification" + + def add_arguments(self, parser): + parser.add_argument("--dry-run", action="store_true", help="Report the count without writing") + parser.add_argument("--project", type=int, default=None, help="Limit to a single project id") + + def handle(self, *args, **options): + msg = backfill_human_determination_scores( + dry_run=options["dry_run"], + project_id=options["project"], + ) + self.stdout.write(msg) diff --git a/ami/main/models.py b/ami/main/models.py index 4e249ee93..6acf28613 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -3145,12 +3145,11 @@ def save(self, update_determination=True, *args, **kwargs): save=True, ) - if self.determination and not self.determination_score: - # This may happen for legacy occurrences that were created - # before the determination_score field was added - # @TODO remove + if self.determination and self.determination_score is None and not self.best_identification: + # Legacy occurrences created before the determination_score field existed. + # Human-determined occurrences intentionally have a null score, so skip them. 
self.determination_score = self.get_determination_score() - if not self.determination_score: + if self.determination_score is None: logger.warning(f"Could not determine score for {self}") else: self.save(update_determination=False) @@ -3176,7 +3175,9 @@ class Meta: def update_occurrence_determination( - occurrence: Occurrence, current_determination: typing.Optional["Taxon"] = None, save=True + occurrence: Occurrence, + current_determination: "Taxon | int | None" = None, + save=True, ) -> bool: """ Update the determination of the occurrence based on the identifications & predictions. @@ -3185,9 +3186,8 @@ def update_occurrence_determination( If there are no identifications, set the determination to the top prediction. The `current_determination` is the determination currently saved in the database. - The `occurrence` object may already have a different un-saved determination set - so it is necessary to retrieve the current determination from the database, but - this can also be passed in as an argument to avoid an extra database query. + It may be passed as a Taxon instance or as a taxon id to avoid a DB lookup; when + omitted, the current determination id is fetched from the DB. @TODO Add tests for this important method! """ @@ -3201,12 +3201,13 @@ def update_occurrence_determination( if hasattr(occurrence, "best_identification"): del occurrence.best_identification - current_determination = ( - current_determination - or Occurrence.objects.select_related("determination") - .values("determination") - .get(pk=occurrence.pk)["determination"] - ) + if isinstance(current_determination, Taxon): + current_determination_id = current_determination.pk + elif current_determination is not None: + current_determination_id = current_determination + else: + current_determination_id = Occurrence.objects.values_list("determination_id", flat=True).get(pk=occurrence.pk) + new_determination = None new_score = None @@ -3220,8 +3221,8 @@ def update_occurrence_determination( new_determination = top_prediction.taxon new_score = top_prediction.score - if new_determination and new_determination != current_determination: - logger.debug(f"Changing det. of {occurrence} from {current_determination} to {new_determination}") + if new_determination and new_determination.pk != current_determination_id: + logger.debug(f"Changing det. 
of {occurrence} from taxon#{current_determination_id} to {new_determination}") occurrence.determination = new_determination needs_update = True diff --git a/ami/main/tests.py b/ami/main/tests.py index 4bfbdc4de..22b1c664e 100644 --- a/ami/main/tests.py +++ b/ami/main/tests.py @@ -3782,3 +3782,76 @@ def test_list_pipelines_public_project_non_member(self): self.client.force_authenticate(user=non_member) response = self.client.get(url) self.assertEqual(response.status_code, status.HTTP_200_OK) + + +class BackfillDeterminationScoreTest(TestCase): + def setUp(self): + from io import StringIO as _StringIO # noqa: F401 (avoid top-level churn) + + self.project, self.deployment = setup_test_project(reuse=False) + create_captures(deployment=self.deployment, num_nights=1, images_per_night=2) + create_taxa(self.project) + self.taxon = self.project.taxa.first() + self.user = User.objects.create_user(email="identifier@test.org") # type: ignore + + source_image = self.project.captures.first() + assert source_image is not None + + det_ml = Detection.objects.create( + source_image=source_image, + timestamp=source_image.timestamp, + bbox=[0.1, 0.1, 0.5, 0.5], + path="detections/ml.jpg", + ) + self.ml_only_occ = det_ml.associate_new_occurrence() + Occurrence.objects.filter(pk=self.ml_only_occ.pk).update(determination=self.taxon, determination_score=0.9) + + det_human = Detection.objects.create( + source_image=source_image, + timestamp=source_image.timestamp, + bbox=[0.1, 0.1, 0.5, 0.5], + path="detections/human.jpg", + ) + self.human_occ = det_human.associate_new_occurrence() + Identification.objects.create(user=self.user, taxon=self.taxon, occurrence=self.human_occ) + # Simulate legacy bad data: determination_score was populated despite a human + # identification existing. Identification.save() clears it, so use .update() + # to bypass the save hook and recreate the legacy state. 
+ Occurrence.objects.filter(pk=self.human_occ.pk).update(determination=self.taxon, determination_score=0.7) + + def test_dry_run_reports_without_writing(self): + from io import StringIO + + from django.core.management import call_command + + out = StringIO() + call_command("backfill_determination_score", "--dry-run", stdout=out) + self.assertIn("Would clear determination_score on 1", out.getvalue()) + + self.ml_only_occ.refresh_from_db() + self.human_occ.refresh_from_db() + self.assertEqual(self.ml_only_occ.determination_score, 0.9) + self.assertEqual(self.human_occ.determination_score, 0.7) + + def test_clears_only_human_determined_scores(self): + from io import StringIO + + from django.core.management import call_command + + out = StringIO() + call_command("backfill_determination_score", stdout=out) + self.assertIn("Cleared determination_score on 1", out.getvalue()) + + self.ml_only_occ.refresh_from_db() + self.human_occ.refresh_from_db() + self.assertEqual(self.ml_only_occ.determination_score, 0.9) + self.assertIsNone(self.human_occ.determination_score) + + def test_withdrawn_identifications_are_not_human_determined(self): + from django.core.management import call_command + + Identification.objects.filter(occurrence=self.human_occ).update(withdrawn=True) + Occurrence.objects.filter(pk=self.human_occ.pk).update(determination_score=0.7) + call_command("backfill_determination_score") + self.human_occ.refresh_from_db() + self.assertEqual(self.human_occ.determination_score, 0.7) From 16372f7262d2cedff5039ff9b09967f0d506c9de Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 13 Apr 2026 18:12:01 -0700 Subject: [PATCH 06/12] fix: align find_best_prediction with with_best_machine_prediction annotation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The API's cached `best_prediction` and the CSV export's annotated `best_machine_prediction_*` fields could pick different rows for the same occurrence: - `find_best_prediction()` went through `self.predictions()`, which filters to the max-score row per algorithm **before** applying the `-terminal, -score` ordering. A high-score non-terminal could survive dedup and then lose to a lower-score terminal on ordering. - `with_best_machine_prediction()` orders directly over all classifications, so it picks the terminal winner regardless of intra-algorithm dedup. Align `find_best_prediction()` to query classifications directly with the same `-terminal, -score, -pk` ordering. Add `-pk` to both so ties break deterministically. Also: - Serializer `get_best_machine_prediction()` now returns a stable shape ({taxon: null, algorithm: null, score: null, ...}) when no prediction exists, instead of null — clients read nullable fields instead of branching on type. - `update_occurrence_determination()` now clears `occurrence.determination` when the last non-withdrawn identification is deleted and no prediction remains, instead of leaving a stale taxon. Tests: new `test_api_and_csv_pick_same_best_prediction_with_mixed_terminal` covers the divergence case. 
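For reference, both code paths now reduce to the same selection rule (a sketch
of the shared ordering; occurrence is any Occurrence instance, and the model
method additionally applies select_related("taxon", "algorithm")):

    Classification.objects.filter(detection__occurrence=occurrence).order_by(
        "-terminal", "-score", "-pk"
    ).first()
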
Co-Authored-By: Claude --- ami/exports/tests.py | 49 +++++++++++++++++++++++++++++++++++++ ami/main/api/serializers.py | 7 +++++- ami/main/models.py | 23 +++++++++++------ 3 files changed, 71 insertions(+), 8 deletions(-) diff --git a/ami/exports/tests.py b/ami/exports/tests.py index 785db08e1..a58459f2c 100644 --- a/ami/exports/tests.py +++ b/ami/exports/tests.py @@ -485,6 +485,55 @@ def test_detection_bbox_field(self): # bbox should be a string representation of the list self.assertIn("0.1", row["best_detection_bbox"]) + def test_api_and_csv_pick_same_best_prediction_with_mixed_terminal(self): + """find_best_prediction() and with_best_machine_prediction() must agree. + + With both a high-score non-terminal classification and a lower-score terminal + classification, the terminal row should win in both the API's cached + best_prediction and the CSV's annotated best_machine_prediction_* fields. + """ + alg_intermediate, _ = Algorithm.objects.get_or_create( + name="intermediate-classifier", defaults={"key": "intermediate-classifier"} + ) + alg_terminal, _ = Algorithm.objects.get_or_create( + name="terminal-classifier", defaults={"key": "terminal-classifier"} + ) + source_image = self.project.captures.first() + detection = Detection.objects.create( + source_image=source_image, + timestamp=source_image.timestamp, + bbox=[0.1, 0.1, 0.5, 0.5], + path="detections/mixed.jpg", + ) + detection.classifications.create( + taxon=self.taxon_a, + score=0.95, + timestamp=source_image.timestamp, + algorithm=alg_intermediate, + terminal=False, + ) + detection.classifications.create( + taxon=self.taxon_b, + score=0.80, + timestamp=source_image.timestamp, + algorithm=alg_terminal, + terminal=True, + ) + occurrence = detection.associate_new_occurrence() + + rows = self._run_csv_export() + row = next(r for r in rows if int(r["id"]) == occurrence.pk) + + self.assertEqual(row["best_machine_prediction_name"], self.taxon_b.name) + self.assertEqual(row["best_machine_prediction_algorithm"], "terminal-classifier") + self.assertAlmostEqual(float(row["best_machine_prediction_score"]), 0.80, places=2) + + occurrence.refresh_from_db() + api_best = occurrence.find_best_prediction() + self.assertIsNotNone(api_best) + self.assertEqual(api_best.taxon_id, self.taxon_b.pk) + self.assertEqual(api_best.algorithm.name, "terminal-classifier") + def test_csv_has_all_new_fields(self): """All new fields are present as CSV column headers.""" self._create_occurrence_with_prediction() diff --git a/ami/main/api/serializers.py b/ami/main/api/serializers.py index a189fad38..09b0f3c23 100644 --- a/ami/main/api/serializers.py +++ b/ami/main/api/serializers.py @@ -1400,7 +1400,12 @@ def get_best_machine_prediction(self, obj: Occurrence): prediction = obj.best_prediction if not prediction: - return None + return dict( + taxon=None, + algorithm=None, + score=None, + determination_matches_machine_prediction=None, + ) taxon_data = TaxonNestedSerializer(prediction.taxon, context=context).data if prediction.taxon else None algorithm_data = None diff --git a/ami/main/models.py b/ami/main/models.py index 6acf28613..772463487 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -2834,7 +2834,7 @@ def with_best_machine_prediction(self): - best_machine_prediction_taxon_id: The taxon ID (for determination_matches comparison) """ best_prediction_subquery = Classification.objects.filter(detection__occurrence=OuterRef("pk")).order_by( - "-terminal", "-score" + "-terminal", "-score", "-pk" ) return self.annotate( @@ -3068,11 +3068,17 @@ def 
find_best_prediction(self) -> "Classification | None": """ Find the best machine prediction for this occurrence. - Uses the highest scoring classification (from any algorithm) as the best prediction. - Considers terminal classifications first, then non-terminal ones. - (Terminal classifications are the final classifications of a pipeline, non-terminal are intermediate models.) + Ordering matches OccurrenceQuerySet.with_best_machine_prediction() so that the + API's cached `best_prediction` and the CSV export's annotated fields agree. + Terminal classifications win over non-terminal, then highest score, with pk as + the deterministic tiebreaker. """ - return self.predictions().order_by("-terminal", "-score").first() + return ( + Classification.objects.filter(detection__occurrence=self) + .select_related("taxon", "algorithm") + .order_by("-terminal", "-score", "-pk") + .first() + ) def find_best_identification(self) -> "Identification | None": """ @@ -3221,8 +3227,11 @@ def update_occurrence_determination( new_determination = top_prediction.taxon new_score = top_prediction.score - if new_determination and new_determination.pk != current_determination_id: - logger.debug(f"Changing det. of {occurrence} from taxon#{current_determination_id} to {new_determination}") + new_determination_id = new_determination.pk if new_determination else None + if new_determination_id != current_determination_id: + logger.debug( + f"Changing det. of {occurrence} from taxon#{current_determination_id} to taxon#{new_determination_id}" + ) occurrence.determination = new_determination needs_update = True From f356caccda2fe358f31a7aa62d1e8d37d3a1b737 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 13 Apr 2026 18:19:53 -0700 Subject: [PATCH 07/12] fix: share best-prediction ordering + revert serializer null shape - Extract BEST_MACHINE_PREDICTION_ORDER as a shared constant used by both Occurrence.find_best_prediction() (row-at-a-time) and OccurrenceQuerySet.with_best_machine_prediction() (bulk annotation). Both methods now reference the constant and each other's docstrings, so future edits to the ordering touch one place and can't silently diverge. - Revert get_best_machine_prediction() to return None when no prediction exists (it was briefly returning a dict of nulls). Annotate the method's return type as `dict | None` so drf-spectacular emits nullability. - Add UpdateOccurrenceDeterminationTest covering the stale-determination clearing fix: withdrawing the only identification on an occurrence with no predictions now clears occurrence.determination (previously left a stale taxon in place). Co-Authored-By: Claude --- ami/main/api/serializers.py | 15 +++++++------- ami/main/models.py | 25 ++++++++++++++-------- ami/main/tests.py | 41 +++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 16 deletions(-) diff --git a/ami/main/api/serializers.py b/ami/main/api/serializers.py index 09b0f3c23..b707f88b6 100644 --- a/ami/main/api/serializers.py +++ b/ami/main/api/serializers.py @@ -1393,19 +1393,18 @@ def get_determination_details(self, obj: Occurrence): score=obj.determination_score, ) - def get_best_machine_prediction(self, obj: Occurrence): - """Always return the best machine prediction, regardless of human verification status.""" + def get_best_machine_prediction(self, obj: Occurrence) -> dict | None: + """Return the best machine prediction for this occurrence, or null if none exists. 
+ + Populated regardless of human verification status, so clients can always show + the ML result alongside a human-set determination. + """ context = self.context context["occurrence"] = obj prediction = obj.best_prediction if not prediction: - return dict( - taxon=None, - algorithm=None, - score=None, - determination_matches_machine_prediction=None, - ) + return None taxon_data = TaxonNestedSerializer(prediction.taxon, context=context).data if prediction.taxon else None algorithm_data = None diff --git a/ami/main/models.py b/ami/main/models.py index 772463487..51d013dab 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -55,6 +55,13 @@ # Constants _POST_TITLE_MAX_LENGTH: Final = 80 +# Shared ordering for "best machine prediction" selection. Used by both +# Occurrence.find_best_prediction() (row-at-a-time, e.g. the API) and +# OccurrenceQuerySet.with_best_machine_prediction() (bulk-annotated, e.g. the CSV +# export). They must use the same ordering — otherwise the two code paths can pick +# different classifications for the same occurrence. +BEST_MACHINE_PREDICTION_ORDER: Final = ("-terminal", "-score", "-pk") + class TaxonRank(OrderedEnum): KINGDOM = "KINGDOM" @@ -2824,8 +2831,10 @@ def with_best_machine_prediction(self): """ Annotate the queryset with fields from the best machine prediction. - The best prediction is the highest-scoring classification, preferring terminal ones. - Same ordering as Occurrence.find_best_prediction(): -terminal, -score. + The "best prediction" rule is shared with Occurrence.find_best_prediction() via + BEST_MACHINE_PREDICTION_ORDER. If you change the ordering in one place, update it + in the other — otherwise the API's cached `best_prediction` and the annotations + exposed by this method can pick different classifications for the same occurrence. Adds the following annotations: - best_machine_prediction_name: The taxon name of the best prediction @@ -2834,7 +2843,7 @@ def with_best_machine_prediction(self): - best_machine_prediction_taxon_id: The taxon ID (for determination_matches comparison) """ best_prediction_subquery = Classification.objects.filter(detection__occurrence=OuterRef("pk")).order_by( - "-terminal", "-score", "-pk" + *BEST_MACHINE_PREDICTION_ORDER ) return self.annotate( @@ -3068,15 +3077,15 @@ def find_best_prediction(self) -> "Classification | None": """ Find the best machine prediction for this occurrence. - Ordering matches OccurrenceQuerySet.with_best_machine_prediction() so that the - API's cached `best_prediction` and the CSV export's annotated fields agree. - Terminal classifications win over non-terminal, then highest score, with pk as - the deterministic tiebreaker. + Ordering is shared with OccurrenceQuerySet.with_best_machine_prediction() via + BEST_MACHINE_PREDICTION_ORDER so that the API's cached `best_prediction` and + the CSV export's annotated fields agree. Terminal classifications win over + non-terminal, then highest score, with pk as the deterministic tiebreaker. 
""" return ( Classification.objects.filter(detection__occurrence=self) .select_related("taxon", "algorithm") - .order_by("-terminal", "-score", "-pk") + .order_by(*BEST_MACHINE_PREDICTION_ORDER) .first() ) diff --git a/ami/main/tests.py b/ami/main/tests.py index 22b1c664e..bfce41f60 100644 --- a/ami/main/tests.py +++ b/ami/main/tests.py @@ -3855,3 +3855,44 @@ def test_withdrawn_identifications_are_not_human_determined(self): call_command("backfill_determination_score") self.human_occ.refresh_from_db() self.assertEqual(self.human_occ.determination_score, 0.7) + + +class UpdateOccurrenceDeterminationTest(TestCase): + """Tests for the update_occurrence_determination() function.""" + + def setUp(self): + self.project, self.deployment = setup_test_project(reuse=False) + create_captures(deployment=self.deployment, num_nights=1, images_per_night=1) + create_taxa(self.project) + self.taxon = self.project.taxa.first() + self.user = User.objects.create_user(email="identifier@test.org") # type: ignore + + def _create_occurrence(self): + source_image = self.project.captures.first() + assert source_image is not None + detection = Detection.objects.create( + source_image=source_image, + timestamp=source_image.timestamp, + bbox=[0.1, 0.1, 0.5, 0.5], + path="detections/update_det_test.jpg", + ) + return detection.associate_new_occurrence() + + def test_determination_cleared_when_last_identification_withdrawn_and_no_prediction(self): + """When the only authority (a human identification) is withdrawn and there is no + machine prediction, update_occurrence_determination() must clear + occurrence.determination rather than leave the stale taxon in place.""" + occurrence = self._create_occurrence() + identification = Identification.objects.create(user=self.user, taxon=self.taxon, occurrence=occurrence) + occurrence.refresh_from_db() + self.assertEqual(occurrence.determination_id, self.taxon.pk) + + # Identification.save() fires update_occurrence_determination(), so withdrawing + # and saving is enough to exercise the path. No predictions exist, so the + # previous guard would leave occurrence.determination unchanged. + identification.withdrawn = True + identification.save() + + occurrence.refresh_from_db() + self.assertIsNone(occurrence.determination_id) + self.assertIsNone(occurrence.determination_score) From 6de816ad905210855d486fbff602307c23226b2e Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 16 Apr 2026 21:56:43 -0700 Subject: [PATCH 08/12] revert: keep determination_score behavior as in main MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per discussion on #1214, narrow this PR to the export format changes only. Drop the determination_score=None-for-humans behavior and the backfill command that supported it. What stays: BEST_MACHINE_PREDICTION_ORDER, with_best_machine_prediction(), with_verification_info(), the expanded with_best_detection() (source-image fields), all CSV export columns, and the best_machine_prediction nested object on OccurrenceListSerializer. None of these read or mutate determination_score, so existing sorting / filtering / default-filter behavior is unchanged. 
What goes: - Occurrence.find_best_prediction() / find_best_identification() extractions (best_prediction / best_identification cached_properties restored to main's bodies) - get_determination_score() — restored to return best_identification.score for human-determined occurrences - update_occurrence_determination() — restored to main (loses the FK type fix and the stale-determination-clearing fix; both are real bugs but out of scope for this PR) - Occurrence.save() legacy backfill — restored to main - backfill_determination_score management command + its test - UpdateOccurrenceDeterminationTest (covered the stale-det fix that's no longer in scope) Sortable best_machine_prediction_score on the API queryset is tracked separately in #1242. Co-Authored-By: Claude --- ami/exports/tests.py | 9 +- .../commands/backfill_determination_score.py | 47 -------- ami/main/models.py | 108 ++++++----------- ami/main/tests.py | 114 ------------------ 4 files changed, 43 insertions(+), 235 deletions(-) delete mode 100644 ami/main/management/commands/backfill_determination_score.py diff --git a/ami/exports/tests.py b/ami/exports/tests.py index a58459f2c..878520198 100644 --- a/ami/exports/tests.py +++ b/ami/exports/tests.py @@ -384,7 +384,7 @@ def test_ml_prediction_only(self): self.assertEqual(row["participant_count"], "0") def test_ml_prediction_with_agreeing_human(self): - """Human agrees with ML: verified_by set, determination_matches = True, determination_score = None.""" + """Human agrees with ML: verified_by set, determination_matches = True.""" occurrence, classification = self._create_occurrence_with_prediction() # Human agrees with the same taxon @@ -409,9 +409,6 @@ def test_ml_prediction_with_agreeing_human(self): self.assertEqual(row["agreed_with_algorithm"], "test-classifier") self.assertEqual(row["determination_matches_machine_prediction"], "True") - # determination_score should be empty/None for human-determined occurrences - self.assertIn(row["determination_score"], ["", "None", None]) - def test_ml_prediction_with_disagreeing_human(self): """Human disagrees with ML: different determination, determination_matches = False.""" occurrence, classification = self._create_occurrence_with_prediction(taxon=self.taxon_a) @@ -486,7 +483,7 @@ def test_detection_bbox_field(self): self.assertIn("0.1", row["best_detection_bbox"]) def test_api_and_csv_pick_same_best_prediction_with_mixed_terminal(self): - """find_best_prediction() and with_best_machine_prediction() must agree. + """Occurrence.best_prediction and with_best_machine_prediction() must agree. 
With both a high-score non-terminal classification and a lower-score terminal classification, the terminal row should win in both the API's cached @@ -529,7 +526,7 @@ def test_api_and_csv_pick_same_best_prediction_with_mixed_terminal(self): self.assertAlmostEqual(float(row["best_machine_prediction_score"]), 0.80, places=2) occurrence.refresh_from_db() - api_best = occurrence.find_best_prediction() + api_best = occurrence.best_prediction self.assertIsNotNone(api_best) self.assertEqual(api_best.taxon_id, self.taxon_b.pk) self.assertEqual(api_best.algorithm.name, "terminal-classifier") diff --git a/ami/main/management/commands/backfill_determination_score.py b/ami/main/management/commands/backfill_determination_score.py deleted file mode 100644 index fb12bf6d8..000000000 --- a/ami/main/management/commands/backfill_determination_score.py +++ /dev/null @@ -1,47 +0,0 @@ -import logging - -from django.core.management.base import BaseCommand -from django.db.models import Exists, OuterRef - -from ...models import Identification, Occurrence - -logger = logging.getLogger(__name__) - - -def backfill_human_determination_scores(dry_run: bool = True, project_id: int | None = None) -> str: - """Set determination_score=None for occurrences determined by a human identification. - - A human-determined occurrence is one with at least one non-withdrawn Identification. - The ML confidence that previously lived in determination_score is preserved in - best_machine_prediction_score (available via the with_best_machine_prediction - queryset annotation). - """ - has_identification = Identification.objects.filter( - occurrence=OuterRef("pk"), - withdrawn=False, - ) - qs = Occurrence.objects.filter(determination_score__isnull=False).filter(Exists(has_identification)) - if project_id is not None: - qs = qs.filter(project_id=project_id) - - if dry_run: - count = qs.count() - return f"Would clear determination_score on {count} human-determined occurrences" - - updated = qs.update(determination_score=None) - return f"Cleared determination_score on {updated} human-determined occurrences" - - -class Command(BaseCommand): - help = "Backfill determination_score=None on occurrences determined by a human identification" - - def add_arguments(self, parser): - parser.add_argument("--dry-run", action="store_true", help="Report the count without writing") - parser.add_argument("--project", type=int, default=None, help="Limit to a single project id") - - def handle(self, *args, **options): - msg = backfill_human_determination_scores( - dry_run=options["dry_run"], - project_id=options["project"], - ) - self.stdout.write(msg) diff --git a/ami/main/models.py b/ami/main/models.py index 8975fd51b..45b967ede 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -55,11 +55,9 @@ # Constants _POST_TITLE_MAX_LENGTH: Final = 80 -# Shared ordering for "best machine prediction" selection. Used by both -# Occurrence.find_best_prediction() (row-at-a-time, e.g. the API) and -# OccurrenceQuerySet.with_best_machine_prediction() (bulk-annotated, e.g. the CSV -# export). They must use the same ordering — otherwise the two code paths can pick -# different classifications for the same occurrence. +# Ordering for "best machine prediction" selection used by +# OccurrenceQuerySet.with_best_machine_prediction(). Terminal classifications win +# over non-terminal, then highest score, with pk as the deterministic tiebreaker. 
BEST_MACHINE_PREDICTION_ORDER: Final = ("-terminal", "-score", "-pk") @@ -2831,10 +2829,8 @@ def with_best_machine_prediction(self): """ Annotate the queryset with fields from the best machine prediction. - The "best prediction" rule is shared with Occurrence.find_best_prediction() via - BEST_MACHINE_PREDICTION_ORDER. If you change the ordering in one place, update it - in the other — otherwise the API's cached `best_prediction` and the annotations - exposed by this method can pick different classifications for the same occurrence. + Uses BEST_MACHINE_PREDICTION_ORDER to pick the winner: terminal classifications + first, then highest score, with pk as the deterministic tiebreaker. Adds the following annotations: - best_machine_prediction_name: The taxon name of the best prediction @@ -3073,51 +3069,31 @@ def detection_images(self, limit=None): def best_detection(self): return Detection.objects.filter(occurrence=self).order_by("-classifications__score").first() - def find_best_prediction(self) -> "Classification | None": + @functools.cached_property + def best_prediction(self): """ - Find the best machine prediction for this occurrence. + Use the best prediction as the best identification if there are no human identifications. - Ordering is shared with OccurrenceQuerySet.with_best_machine_prediction() via - BEST_MACHINE_PREDICTION_ORDER so that the API's cached `best_prediction` and - the CSV export's annotated fields agree. Terminal classifications win over - non-terminal, then highest score, with pk as the deterministic tiebreaker. + Uses the highest scoring classification (from any algorithm) as the best prediction. + Considers terminal classifications first, then non-terminal ones. + (Terminal classifications are the final classifications of a pipeline, non-terminal are intermediate models.) """ - return ( - Classification.objects.filter(detection__occurrence=self) - .select_related("taxon", "algorithm") - .order_by(*BEST_MACHINE_PREDICTION_ORDER) - .first() - ) + return self.predictions().order_by("-terminal", "-score").first() - def find_best_identification(self) -> "Identification | None": + @functools.cached_property + def best_identification(self): """ - Find the best human identification for this occurrence. - - The most recent non-withdrawn identification is used as the best identification. + The most recent human identification is used as the best identification. @TODO this could use a confidence level chosen manually by the users/experts. """ return Identification.objects.filter(occurrence=self, withdrawn=False).order_by("-created_at").first() - @functools.cached_property - def best_prediction(self): - return self.find_best_prediction() - - @functools.cached_property - def best_identification(self): - return self.find_best_identification() - def get_determination_score(self) -> float | None: - """ - Return the determination score for this occurrence. - - Human identifications return None (score of 1.0 is meaningless). - Machine predictions return the classification confidence score. 
- """ if not self.determination: return None elif self.best_identification: - return None # Human ID score (1.0) is not meaningful + return self.best_identification.score elif self.best_prediction: return self.best_prediction.score else: @@ -3160,11 +3136,12 @@ def save(self, update_determination=True, *args, **kwargs): save=True, ) - if self.determination and self.determination_score is None and not self.best_identification: - # Legacy occurrences created before the determination_score field existed. - # Human-determined occurrences intentionally have a null score, so skip them. + if self.determination and not self.determination_score: + # This may happen for legacy occurrences that were created + # before the determination_score field was added + # @TODO remove self.determination_score = self.get_determination_score() - if self.determination_score is None: + if not self.determination_score: logger.warning(f"Could not determine score for {self}") else: self.save(update_determination=False) @@ -3190,9 +3167,7 @@ class Meta: def update_occurrence_determination( - occurrence: Occurrence, - current_determination: "Taxon | int | None" = None, - save=True, + occurrence: Occurrence, current_determination: typing.Optional["Taxon"] = None, save=True ) -> bool: """ Update the determination of the occurrence based on the identifications & predictions. @@ -3201,8 +3176,9 @@ def update_occurrence_determination( If there are no identifications, set the determination to the top prediction. The `current_determination` is the determination currently saved in the database. - It may be passed as a Taxon instance or as a taxon id to avoid a DB lookup; when - omitted, the current determination id is fetched from the DB. + The `occurrence` object may already have a different un-saved determination set + so it is necessary to retrieve the current determination from the database, but + this can also be passed in as an argument to avoid an extra database query. @TODO Add tests for this important method! 
""" @@ -3216,35 +3192,31 @@ def update_occurrence_determination( if hasattr(occurrence, "best_identification"): del occurrence.best_identification - if isinstance(current_determination, Taxon): - current_determination_id = current_determination.pk - elif current_determination is not None: - current_determination_id = current_determination - else: - current_determination_id = Occurrence.objects.values_list("determination_id", flat=True).get(pk=occurrence.pk) - + current_determination = ( + current_determination + or Occurrence.objects.select_related("determination") + .values("determination") + .get(pk=occurrence.pk)["determination"] + ) new_determination = None new_score = None - top_identification = occurrence.find_best_identification() - if top_identification and top_identification.taxon: + top_identification = occurrence.best_identification + if top_identification and top_identification.taxon and top_identification.taxon != current_determination: new_determination = top_identification.taxon - new_score = None # Human ID score is not meaningful for determination_score - else: - top_prediction = occurrence.find_best_prediction() - if top_prediction and top_prediction.taxon: + new_score = top_identification.score + elif not top_identification: + top_prediction = occurrence.best_prediction + if top_prediction and top_prediction.taxon and top_prediction.taxon != current_determination: new_determination = top_prediction.taxon new_score = top_prediction.score - new_determination_id = new_determination.pk if new_determination else None - if new_determination_id != current_determination_id: - logger.debug( - f"Changing det. of {occurrence} from taxon#{current_determination_id} to taxon#{new_determination_id}" - ) + if new_determination and new_determination != current_determination: + logger.debug(f"Changing det. of {occurrence} from {current_determination} to {new_determination}") occurrence.determination = new_determination needs_update = True - if new_score != occurrence.determination_score: + if new_score and new_score != occurrence.determination_score: logger.debug(f"Changing det. 
score of {occurrence} from {occurrence.determination_score} to {new_score}") occurrence.determination_score = new_score needs_update = True diff --git a/ami/main/tests.py b/ami/main/tests.py index 8dec49a01..d105b6277 100644 --- a/ami/main/tests.py +++ b/ami/main/tests.py @@ -3977,117 +3977,3 @@ def test_list_pipelines_public_project_non_member(self): self.client.force_authenticate(user=non_member) response = self.client.get(url) self.assertEqual(response.status_code, status.HTTP_200_OK) - - -class BackfillDeterminationScoreTest(TestCase): - def setUp(self): - from io import StringIO as _StringIO # noqa: F401 (avoid top-level churn) - - self.project, self.deployment = setup_test_project(reuse=False) - create_captures(deployment=self.deployment, num_nights=1, images_per_night=2) - create_taxa(self.project) - self.taxon = self.project.taxa.first() - self.user = User.objects.create_user(email="identifier@test.org") # type: ignore - - source_image = self.project.captures.first() - assert source_image is not None - - det_ml = Detection.objects.create( - source_image=source_image, - timestamp=source_image.timestamp, - bbox=[0.1, 0.1, 0.5, 0.5], - path="detections/ml.jpg", - ) - self.ml_only_occ = det_ml.associate_new_occurrence() - Occurrence.objects.filter(pk=self.ml_only_occ.pk).update(determination=self.taxon, determination_score=0.9) - - det_human = Detection.objects.create( - source_image=source_image, - timestamp=source_image.timestamp, - bbox=[0.1, 0.1, 0.5, 0.5], - path="detections/human.jpg", - ) - self.human_occ = det_human.associate_new_occurrence() - Identification.objects.create(user=self.user, taxon=self.taxon, occurrence=self.human_occ) - # Simulate legacy bad data: determination_score was populated despite a human - # identification existing. Identification.save() clears it, so use .update() - # to bypass the save hook and recreate the legacy state. 
- Occurrence.objects.filter(pk=self.human_occ.pk).update(determination=self.taxon, determination_score=0.7) - - def test_dry_run_reports_without_writing(self): - from io import StringIO - - from django.core.management import call_command - - out = StringIO() - call_command("backfill_determination_score", "--dry-run", stdout=out) - self.assertIn("Would clear determination_score on 1", out.getvalue()) - - self.ml_only_occ.refresh_from_db() - self.human_occ.refresh_from_db() - self.assertEqual(self.ml_only_occ.determination_score, 0.9) - self.assertEqual(self.human_occ.determination_score, 0.7) - - def test_clears_only_human_determined_scores(self): - from io import StringIO - - from django.core.management import call_command - - out = StringIO() - call_command("backfill_determination_score", stdout=out) - self.assertIn("Cleared determination_score on 1", out.getvalue()) - - self.ml_only_occ.refresh_from_db() - self.human_occ.refresh_from_db() - self.assertEqual(self.ml_only_occ.determination_score, 0.9) - self.assertIsNone(self.human_occ.determination_score) - - def test_withdrawn_identifications_are_not_human_determined(self): - from django.core.management import call_command - - Identification.objects.filter(occurrence=self.human_occ).update(withdrawn=True) - Occurrence.objects.filter(pk=self.human_occ.pk).update(determination_score=0.7) - call_command("backfill_determination_score") - self.human_occ.refresh_from_db() - self.assertEqual(self.human_occ.determination_score, 0.7) - - -class UpdateOccurrenceDeterminationTest(TestCase): - """Tests for the update_occurrence_determination() function.""" - - def setUp(self): - self.project, self.deployment = setup_test_project(reuse=False) - create_captures(deployment=self.deployment, num_nights=1, images_per_night=1) - create_taxa(self.project) - self.taxon = self.project.taxa.first() - self.user = User.objects.create_user(email="identifier@test.org") # type: ignore - - def _create_occurrence(self): - source_image = self.project.captures.first() - assert source_image is not None - detection = Detection.objects.create( - source_image=source_image, - timestamp=source_image.timestamp, - bbox=[0.1, 0.1, 0.5, 0.5], - path="detections/update_det_test.jpg", - ) - return detection.associate_new_occurrence() - - def test_determination_cleared_when_last_identification_withdrawn_and_no_prediction(self): - """When the only authority (a human identification) is withdrawn and there is no - machine prediction, update_occurrence_determination() must clear - occurrence.determination rather than leave the stale taxon in place.""" - occurrence = self._create_occurrence() - identification = Identification.objects.create(user=self.user, taxon=self.taxon, occurrence=occurrence) - occurrence.refresh_from_db() - self.assertEqual(occurrence.determination_id, self.taxon.pk) - - # Identification.save() fires update_occurrence_determination(), so withdrawing - # and saving is enough to exercise the path. No predictions exist, so the - # previous guard would leave occurrence.determination unchanged. 
- identification.withdrawn = True - identification.save() - - occurrence.refresh_from_db() - self.assertIsNone(occurrence.determination_id) - self.assertIsNone(occurrence.determination_score) From 46b7d361f1802db97365a51e7791fcfe6a6523ee Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 16 Apr 2026 22:07:22 -0700 Subject: [PATCH 09/12] perf(occurrence): select_related taxon/algorithm in predictions() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoids per-row lazy loads when callers serialize the result. Affects both the new OccurrenceListSerializer.best_machine_prediction field and the existing get_determination_details path in the same serializer. Per CodeRabbit thread on #1214 — full annotation-driven serialization remains tracked in #1242. Co-Authored-By: Claude --- ami/main/models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ami/main/models.py b/ami/main/models.py index 45b967ede..cb21c464e 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -3100,9 +3100,12 @@ def get_determination_score(self) -> float | None: return None def predictions(self): - # Retrieve the classification with the max score for each algorithm + # Retrieve the classification with the max score for each algorithm. + # select_related avoids per-row taxon/algorithm lazy loads when callers + # serialize the result (e.g. OccurrenceListSerializer.best_prediction). classifications = ( Classification.objects.filter(detection__occurrence=self) + .select_related("taxon", "algorithm") .filter( score__in=models.Subquery( Classification.objects.filter(detection__occurrence=self) From b2823e0902f730baea3b58a2428f07408ab9f509 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 16 Apr 2026 22:40:27 -0700 Subject: [PATCH 10/12] refactor(api): hoist determination_matches_machine_prediction to top level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sits at the same level as `determination` / `determination_score` rather than nested under `best_machine_prediction`. The flag describes the determination's relationship to the prediction, so it doesn't belong as an attribute of the prediction object — particularly once `determination` becomes its own nested object. 
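
Sketch of the resulting list-endpoint item after this change (abridged; the
nested taxon/algorithm objects carry more fields, and all values here are
invented):

    {
        "id": 123,
        "determination_score": 0.87,
        "determination_matches_machine_prediction": True,  # now top-level
        "best_machine_prediction": {
            "taxon": {"id": 42, "name": "Noctua pronuba"},
            "algorithm": {"id": 7, "name": "moth-classifier"},
            "score": 0.87,
        },
    }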
Co-Authored-By: Claude --- ami/main/api/serializers.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/ami/main/api/serializers.py b/ami/main/api/serializers.py index b707f88b6..884460646 100644 --- a/ami/main/api/serializers.py +++ b/ami/main/api/serializers.py @@ -1320,6 +1320,7 @@ class OccurrenceListSerializer(DefaultSerializer): # first_appearance = TaxonSourceImageNestedSerializer(read_only=True) determination_details = serializers.SerializerMethodField() best_machine_prediction = serializers.SerializerMethodField() + determination_matches_machine_prediction = serializers.SerializerMethodField() identifications = OccurrenceIdentificationSerializer(many=True, read_only=True) def get_permissions(self, instance, instance_data): @@ -1358,6 +1359,7 @@ class Meta: "detection_images", "determination_score", "determination_details", + "determination_matches_machine_prediction", "best_machine_prediction", "identifications", "created_at", @@ -1413,17 +1415,24 @@ def get_best_machine_prediction(self, obj: Occurrence) -> dict | None: algorithm_data = AlgorithmNestedSerializer(prediction.algorithm, context=context).data - determination_matches = None - if obj.determination_id and prediction.taxon_id: - determination_matches = obj.determination_id == prediction.taxon_id - return dict( taxon=taxon_data, algorithm=algorithm_data, score=prediction.score, - determination_matches_machine_prediction=determination_matches, ) + def get_determination_matches_machine_prediction(self, obj: Occurrence) -> bool | None: + """Whether the determination taxon matches the best machine prediction taxon. + + Sits at the top level (alongside `determination` / `determination_score`) + rather than nested under `best_machine_prediction`, since it's a property + of the determination's relationship to the prediction. + """ + prediction = obj.best_prediction + if not prediction or not obj.determination_id or not prediction.taxon_id: + return None + return obj.determination_id == prediction.taxon_id + class OccurrenceSerializer(OccurrenceListSerializer): determination = CaptureTaxonSerializer(read_only=True) From a1e9b26bcfae4b67e5f9ab2ce805763f90bf5e0f Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 16 Apr 2026 23:30:17 -0700 Subject: [PATCH 11/12] refactor(serializers,exports): drop determination_matches_machine_prediction from API; share public-URL builder - Drop determination_matches_machine_prediction from OccurrenceListSerializer. API consumers can compare taxon IDs directly; CSV consumers compare species name strings, so the field stays in the tabular export only. - Add SourceImage.build_public_url(base_url, path) staticmethod and reuse it from both SourceImage.public_url() and the export's get_best_detection_source_image_url(). The export deliberately works from annotated path + public_base_url to avoid loading a SourceImage row per occurrence; presigned URLs (private buckets) aren't supported in that path for the same reason. Addresses review feedback on PR #1214. 
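
For reference, the shared helper's behavior under invented values (plain
urllib.parse.urljoin semantics; the stored path is made relative before
joining):

    from ami.main.models import SourceImage

    SourceImage.build_public_url(
        "https://static.example.org/captures/", "/deployment-1/img_0001.jpg"
    )
    # -> "https://static.example.org/captures/deployment-1/img_0001.jpg"

Note that urljoin drops the base URL's last path segment when the base lacks
a trailing slash, so stored public_base_url values are assumed to end in "/".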
Co-Authored-By: Claude --- ami/exports/format_types.py | 13 ++++++++----- ami/main/api/serializers.py | 14 -------------- ami/main/models.py | 11 ++++++++++- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/ami/exports/format_types.py b/ami/exports/format_types.py index 90e1e0a15..4675d4197 100644 --- a/ami/exports/format_types.py +++ b/ami/exports/format_types.py @@ -8,7 +8,7 @@ from ami.exports.base import BaseExporter from ami.exports.utils import get_data_in_batches -from ami.main.models import Occurrence, get_media_url +from ami.main.models import Occurrence, SourceImage, get_media_url from ami.ml.schemas import BoundingBox logger = logging.getLogger(__name__) @@ -191,13 +191,16 @@ def get_best_detection_height(self, obj): return bbox.height if bbox else None def get_best_detection_source_image_url(self, obj): - """Returns the public URL to the original source image.""" + """Returns the public URL to the original source image. + + Built from annotated `path` + `public_base_url` to avoid loading the + SourceImage row per occurrence; presigned URLs (private buckets) aren't + supported here for the same reason. + """ path = getattr(obj, "best_detection_source_image_path", None) base_url = getattr(obj, "best_detection_source_image_public_base_url", None) if path and base_url: - import urllib.parse - - return urllib.parse.urljoin(base_url, path.lstrip("/")) + return SourceImage.build_public_url(base_url, path) return None diff --git a/ami/main/api/serializers.py b/ami/main/api/serializers.py index 884460646..ec79603e7 100644 --- a/ami/main/api/serializers.py +++ b/ami/main/api/serializers.py @@ -1320,7 +1320,6 @@ class OccurrenceListSerializer(DefaultSerializer): # first_appearance = TaxonSourceImageNestedSerializer(read_only=True) determination_details = serializers.SerializerMethodField() best_machine_prediction = serializers.SerializerMethodField() - determination_matches_machine_prediction = serializers.SerializerMethodField() identifications = OccurrenceIdentificationSerializer(many=True, read_only=True) def get_permissions(self, instance, instance_data): @@ -1359,7 +1358,6 @@ class Meta: "detection_images", "determination_score", "determination_details", - "determination_matches_machine_prediction", "best_machine_prediction", "identifications", "created_at", @@ -1421,18 +1419,6 @@ def get_best_machine_prediction(self, obj: Occurrence) -> dict | None: score=prediction.score, ) - def get_determination_matches_machine_prediction(self, obj: Occurrence) -> bool | None: - """Whether the determination taxon matches the best machine prediction taxon. - - Sits at the top level (alongside `determination` / `determination_score`) - rather than nested under `best_machine_prediction`, since it's a property - of the determination's relationship to the prediction. - """ - prediction = obj.best_prediction - if not prediction or not obj.determination_id or not prediction.taxon_id: - return None - return obj.determination_id == prediction.taxon_id - class OccurrenceSerializer(OccurrenceListSerializer): determination = CaptureTaxonSerializer(read_only=True) diff --git a/ami/main/models.py b/ami/main/models.py index cb21c464e..e47b3c4c4 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -1840,6 +1840,15 @@ class SourceImage(BaseModel): def __str__(self) -> str: return f"{self.__class__.__name__} #{self.pk} {self.path}" + @staticmethod + def build_public_url(base_url: str, path: str) -> str: + """Join a public base URL with a stored object path. 
+ + Shared with callers that have annotated `public_base_url` + `path` onto a + queryset row and want to skip loading the SourceImage instance. + """ + return urllib.parse.urljoin(base_url, path.lstrip("/")) + def public_url(self, raise_errors=False) -> str | None: """ Return the public URL for this image. @@ -1862,7 +1871,7 @@ def public_url(self, raise_errors=False) -> str | None: ): url = ami.utils.s3.get_presigned_url(data_source.config, key=self.path) elif self.public_base_url: - url = urllib.parse.urljoin(self.public_base_url, self.path.lstrip("/")) + url = self.build_public_url(self.public_base_url, self.path) else: msg = f"Public URL for {self} is not available. Public base URL: '{self.public_base_url}'" if raise_errors: From d50d775634a52de6f6a357d155203860d547e8db Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 16 Apr 2026 23:43:11 -0700 Subject: [PATCH 12/12] =?UTF-8?q?refactor(exports):=20rename=20source=5Fim?= =?UTF-8?q?age=20=E2=86=92=20capture;=20verification=5Fstatus=20=E2=86=92?= =?UTF-8?q?=20bool?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename `best_detection_source_image_url` (CSV column) and the underlying `best_detection_source_image_{path,public_base_url}` queryset annotations to `best_detection_capture_*` to match the public "Capture" terminology used in the UI. Internal model is still SourceImage; only externally-visible names change. - Change `verification_status` from "Verified"/"Not verified" strings to a bool (True/False). The hardcoded strings were awkward, and the field is now consistent with the bool-shaped `determination_matches_machine_prediction`. Addresses review feedback on PR #1214. Co-Authored-By: Claude --- ami/exports/format_types.py | 24 ++++++++++++------------ ami/exports/tests.py | 2 +- ami/main/models.py | 16 +++++++--------- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/ami/exports/format_types.py b/ami/exports/format_types.py index 4675d4197..9c57b5c25 100644 --- a/ami/exports/format_types.py +++ b/ami/exports/format_types.py @@ -110,7 +110,7 @@ class OccurrenceTabularSerializer(serializers.ModelSerializer): best_detection_bbox = serializers.SerializerMethodField() best_detection_width = serializers.SerializerMethodField() best_detection_height = serializers.SerializerMethodField() - best_detection_source_image_url = serializers.SerializerMethodField() + best_detection_capture_url = serializers.SerializerMethodField() class Meta: model = Occurrence @@ -142,15 +142,15 @@ class Meta: "best_detection_bbox", "best_detection_width", "best_detection_height", - "best_detection_source_image_url", + "best_detection_capture_url", ] - def get_verification_status(self, obj): - """Returns 'Verified' if the occurrence has non-withdrawn identifications.""" + def get_verification_status(self, obj) -> bool: + """True if the occurrence has any non-withdrawn human identification.""" count = getattr(obj, "participant_count", None) if count is not None: - return "Verified" if count > 0 else "Not verified" - return "Verified" if obj.identifications.filter(withdrawn=False).exists() else "Not verified" + return count > 0 + return obj.identifications.filter(withdrawn=False).exists() def get_verified_by(self, obj): """Returns the display name of the user who made the best identification.""" @@ -190,15 +190,15 @@ def get_best_detection_height(self, obj): bbox = BoundingBox.from_coords(getattr(obj, "best_detection_bbox", None), raise_on_error=False) return bbox.height if bbox else None - def 
get_best_detection_source_image_url(self, obj): - """Returns the public URL to the original source image. + def get_best_detection_capture_url(self, obj): + """Returns the public URL to the source capture (original full-frame image). Built from annotated `path` + `public_base_url` to avoid loading the - SourceImage row per occurrence; presigned URLs (private buckets) aren't - supported here for the same reason. + capture (SourceImage) row per occurrence; presigned URLs for private + buckets aren't supported here for the same reason. """ - path = getattr(obj, "best_detection_source_image_path", None) - base_url = getattr(obj, "best_detection_source_image_public_base_url", None) + path = getattr(obj, "best_detection_capture_path", None) + base_url = getattr(obj, "best_detection_capture_public_base_url", None) if path and base_url: return SourceImage.build_public_url(base_url, path) return None diff --git a/ami/exports/tests.py b/ami/exports/tests.py index 878520198..866b1af61 100644 --- a/ami/exports/tests.py +++ b/ami/exports/tests.py @@ -547,7 +547,7 @@ def test_csv_has_all_new_fields(self): "agreed_with_user", "determination_matches_machine_prediction", "best_detection_bbox", - "best_detection_source_image_url", + "best_detection_capture_url", ] for field in expected_fields: self.assertIn(field, headers, f"Missing CSV field: {field}") diff --git a/ami/main/models.py b/ami/main/models.py index e47b3c4c4..30342de15 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -2794,8 +2794,8 @@ def with_best_detection(self): Adds the following annotations: - best_detection_path: The path to the detection image - best_detection_bbox: The bounding box of the detection as a list [x1, y1, x2, y2] - - best_detection_source_image_path: The path of the source image - - best_detection_source_image_public_base_url: The public base URL of the source image + - best_detection_capture_path: The path of the source capture image + - best_detection_capture_public_base_url: The public base URL of the source capture image """ # Subquery to get the path of the best detection # Use id as secondary sort to ensure deterministic results @@ -2813,13 +2813,13 @@ def with_best_detection(self): .values("bbox")[:1] ) - # Subquery to get the source image path and public_base_url for the best detection - best_detection_source_image_path_subquery = ( + # Subquery to get the source capture path and public_base_url for the best detection + best_detection_capture_path_subquery = ( Detection.objects.filter(occurrence=OuterRef("pk")) .order_by("-classifications__score", "id") .values("source_image__path")[:1] ) - best_detection_source_image_public_base_url_subquery = ( + best_detection_capture_public_base_url_subquery = ( Detection.objects.filter(occurrence=OuterRef("pk")) .order_by("-classifications__score", "id") .values("source_image__public_base_url")[:1] @@ -2828,10 +2828,8 @@ def with_best_detection(self): return self.annotate( best_detection_path=models.Subquery(best_detection_path_subquery), best_detection_bbox=models.Subquery(best_detection_bbox_subquery), - best_detection_source_image_path=models.Subquery(best_detection_source_image_path_subquery), - best_detection_source_image_public_base_url=models.Subquery( - best_detection_source_image_public_base_url_subquery - ), + best_detection_capture_path=models.Subquery(best_detection_capture_path_subquery), + best_detection_capture_public_base_url=models.Subquery(best_detection_capture_public_base_url_subquery), ) def with_best_machine_prediction(self):