metaspace2020 · Bisho2122 · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026
diff --git a/metaspace/engine/sm/engine/annotation/formula_validator.py b/metaspace/engine/sm/engine/annotation/formula_validator.py
@@ -123,8 +123,23 @@ def make_compute_image_metrics(
 
     sample_area_mask = imzml_reader.mask
     n_spectra = np.count_nonzero(sample_area_mask)
-    empty_matrix = np.zeros(sample_area_mask.shape, dtype=np.float32)
-    sample_area_mask_flat = sample_area_mask.flatten()
+    pixel_to_flat_idx = imzml_reader.pixel_to_flat_idx
+    img_w = imzml_reader.w
+    empty_flat = np.zeros(n_spectra, dtype=np.float32)
+
+    def _coo_to_flat(img):
+        """Scatter a coo_matrix directly into a 1D masked-flat array of size n_spectra.
+
+        Equivalent to img.toarray().flatten()[sample_area_mask_flat] but without ever
+        allocating the full h×w bounding box. np.add.at correctly sums duplicate coordinates
+        that arise from concat_coo_matrices for split formulas.
+        """
+        if img is None or img.nnz == 0:
+            return empty_flat
+        flat = np.zeros(n_spectra, dtype=np.float32)
+        flat_positions = pixel_to_flat_idx[img.row * img_w + img.col]
+        np.add.at(flat, flat_positions, img.data)
+        return flat
 
     def compute_metrics(image_set: FormulaImageSet):
         # pylint: disable=unused-variable  # benchmark is used in commented-out dev code
@@ -136,8 +151,7 @@ def benchmark(attr):
 
         # with benchmark('overall'):
 
-        iso_imgs = [img.toarray() if img is not None else empty_matrix for img in image_set.images]
-        iso_imgs_flat = np.array([img.flatten()[sample_area_mask_flat] for img in iso_imgs])
+        iso_imgs_flat = np.array([_coo_to_flat(img) for img in image_set.images])
 
         doc = Metrics(formula_i=image_set.formula_i)
 
@@ -178,7 +192,7 @@ def benchmark(attr):
                 )
                 if (doc.spatial or 0.0) > 0.0 or calc_all:
                     # with benchmark('chaos'):
-                    doc.chaos = chaos_metric(iso_imgs[0], n_levels)
+                    doc.chaos = chaos_metric(image_set.images[0], n_levels)
 
         doc.msm = (doc.chaos or 0.0) * (doc.spatial or 0.0) * (doc.spectral or 0.0)
 

diff --git a/metaspace/engine/sm/engine/annotation/imzml_reader.py b/metaspace/engine/sm/engine/annotation/imzml_reader.py
@@ -57,6 +57,13 @@ def __init__(self, imzml_parser: ImzMLParser):
         sample_area_mask[self.pixel_indexes] = True
         self.mask = sample_area_mask.reshape(self.h, self.w)
 
+        # pixel_to_flat_idx: maps flat pixel coord (Y*W+X) → position in the masked-flat array.
+        # Positions are assigned in ascending pixel_index order to match the semantics of
+        # mask.flatten()[sample_area_mask_flat] used elsewhere in the pipeline.
+        # -1 for pixels that have no spectrum.
+        self.pixel_to_flat_idx = np.full(self.h * self.w, -1, dtype=np.intp)
+        self.pixel_to_flat_idx[sample_area_mask] = np.arange(self.n_spectra)
+
         # raw_coord_bounds is the RAW min/max coordinates, purely for diagnostics
         self.raw_coord_bounds = (
             np.min(imzml_parser.coordinates, axis=0),

diff --git a/metaspace/engine/sm/engine/annotation/metrics.py b/metaspace/engine/sm/engine/annotation/metrics.py
@@ -98,14 +98,64 @@ def _chaos_dilate(arr):
         return arr
 
 
-def chaos_metric(iso_img, n_levels):
-    # Shrink image if possible, as chaos performance is highly resolution-dependent
-    iso_img = iso_img[_chaos_dilate(np.any(iso_img, axis=1)), :]
-    iso_img = iso_img[:, _chaos_dilate(np.any(iso_img, axis=0))]
+def chaos_metric(coo_img, n_levels):
+    """Compute the chaos metric directly from a sparse coo_matrix.
+
+    Builds the same compact 2D dense array that the previous implementation produced after
+    toarray() + _chaos_dilate cropping, but without ever allocating the full h×w bounding box.
+    Results are bit-for-bit identical to the old path for any given dataset.
+
+    Args:
+        coo_img: First-isotope image as a scipy coo_matrix (or None / empty).
+        n_levels: Number of intensity thresholds for measure_of_chaos.
+    """
+    if coo_img is None or coo_img.nnz == 0:
+        return 0
+
+    nrows, ncols = coo_img.shape
+
+    # Build row/col occupancy masks directly from sparse indices — O(nnz), no h×w allocation
+    row_mask = np.zeros(nrows, dtype=bool)
+    col_mask = np.zeros(ncols, dtype=bool)
+    row_mask[coo_img.row] = True
+    col_mask[coo_img.col] = True
+
+    # Apply the same dilation as before so measure_of_chaos sees the same spatial context
+    row_sel = _chaos_dilate(row_mask)
+    col_sel = _chaos_dilate(col_mask)
+
+    if isinstance(row_sel, slice) and isinstance(col_sel, slice):
+        # Image is already dense (>90% non-empty in both dimensions); toarray() is cheap here
+        iso_img = coo_img.toarray()
+    else:
+        # Build compact index remaps: original row/col index → position in compact array
+        if isinstance(row_sel, slice):
+            n_rows_compact = nrows
+            row_remap = np.arange(nrows, dtype=np.intp)
+        else:
+            selected_rows = np.where(row_sel)[0]
+            n_rows_compact = len(selected_rows)
+            row_remap = np.full(nrows, -1, dtype=np.intp)
+            row_remap[selected_rows] = np.arange(n_rows_compact)
+
+        if isinstance(col_sel, slice):
+            n_cols_compact = ncols
+            col_remap = np.arange(ncols, dtype=np.intp)
+        else:
+            selected_cols = np.where(col_sel)[0]
+            n_cols_compact = len(selected_cols)
+            col_remap = np.full(ncols, -1, dtype=np.intp)
+            col_remap[selected_cols] = np.arange(n_cols_compact)
+
+        # Scatter sparse values into compact array.
+        # np.add.at handles duplicate coordinates (e.g. from concat_coo_matrices for split
+        # formulas) the same way toarray() does — by summing them.
+        iso_img = np.zeros((n_rows_compact, n_cols_compact), dtype=coo_img.dtype)
+        compact_rows = row_remap[coo_img.row]
+        compact_cols = col_remap[coo_img.col]
+        np.add.at(iso_img, (compact_rows, compact_cols), coo_img.data)
 
     if iso_img.size == 0:
-        # measure_of_chaos segfaults if the image has no elements - in Lithops this appears as a
-        # MemoryError. Skip empty images.
         return 0
 
     # measure_of_chaos behaves weirdly on Fortran-ordered arrays, which happen sometimes due to the

diff --git a/metaspace/engine/sm/engine/annotation_lithops/pipeline.py b/metaspace/engine/sm/engine/annotation_lithops/pipeline.py
@@ -121,6 +121,15 @@ def run_pipeline(
                 f'The bounding box area exceeds the maximum allowed pixel count of ({pixel_limit}). Contact contact@metaspace2020.org.'
             )
 
+        sparsity_ratio = nz_pixels / n_pixels
+        if sparsity_ratio < 0.05:
+            logger.warning(
+                f'Low pixel density detected: {nz_pixels} spectra in a '
+                f'{self.imzml_reader.h}×{self.imzml_reader.w} bounding box '
+                f'(density={sparsity_ratio:.3f}). This dataset may have isolated blobs '
+                f'with large empty areas.'
+            )
+
         self.segment_centroids(use_cache=use_cache)
         if debug_validate:
             self.validate_segment_centroids()

diff --git a/metaspace/engine/sm/engine/tests/annotation/test_formula_validator.py b/metaspace/engine/sm/engine/tests/annotation/test_formula_validator.py
@@ -26,7 +26,7 @@ def _test_compute_metrics():
         isotope_generation=None,
         fdr=None,
     )
-    imzml_reader = make_imzml_reader_mock(list(product(range(2), range(3))))
+    imzml_reader = make_imzml_reader_mock(list(product(range(3), range(2))))
     compute_metrics = make_compute_image_metrics(imzml_reader, ds_config)
     return compute_metrics
 
@@ -37,7 +37,7 @@ def _test_compute_metrics():
 def test_formula_image_metrics(chaos_mock, spatial_mock, spectral_mock):
     spectral_mock.side_effect = lambda imgs_flat, *args: imgs_flat[0][0]
     spatial_mock.side_effect = lambda imgs_flat, *args, **kwargs: imgs_flat[0][1]
-    chaos_mock.side_effect = lambda img, *args: img[0, 2]
+    chaos_mock.side_effect = lambda img, *args: img.toarray()[0, 2]
 
     # Images 2 & 3 combine to equal image 1
     # First 3 intensities get used as spectral, spatial & chaos metrics