diff --git a/brainscore_vision/__init__.py b/brainscore_vision/__init__.py
index cd7ec541b9..817c25d998 100644
--- a/brainscore_vision/__init__.py
+++ b/brainscore_vision/__init__.py
@@ -1,4 +1,5 @@
 import logging
+import os
 from typing import Dict, Any, Union, Callable
 
 from brainscore_core.supported_data_standards.brainio.assemblies import DataAssembly
@@ -6,6 +7,7 @@
 
 from brainscore_core.benchmarks import Benchmark
 from brainscore_core.metrics import Metric, Score
+from brainscore_core.benchmarks import score_benchmark
 from brainscore_core.plugin_management.conda_score import wrap_score
 from brainscore_core.plugin_management.import_plugin import import_plugin
 from brainscore_vision.metrics import Ceiling
@@ -74,7 +76,21 @@ def _run_score(model_identifier: str, benchmark_identifier: str) -> Score:
     """
     model: BrainModel = load_model(model_identifier)
     benchmark: Benchmark = load_benchmark(benchmark_identifier)
-    score: Score = benchmark(model)
+    try:
+        score: Score = score_benchmark(benchmark, model)
+    except AssertionError as e:
+        cache_dir = os.path.expanduser(
+            '~/.result_caching/brainscore_vision.model_helpers.activations.core'
+            '.ActivationsExtractorHelper._from_paths_stored'
+        )
+        raise AssertionError(
+            f"{e}\n\n"
+            f"If this is a stale activations cache (cached stimulus paths no longer match "
+            f"current locations, e.g. temp directory changed between runs), fix with:\n"
+            f"  rm {cache_dir}/identifier={model_identifier},stimuli_identifier=*.pkl\n\n"
+            f"Or to clear the entire activations cache:\n"
+            f"  rm {cache_dir}/*.pkl"
+        ) from e
     score.attrs['model_identifier'] = model_identifier
     score.attrs['benchmark_identifier'] = benchmark_identifier
     try:  # attempt to look up the layer commitment if model uses a standard layer model
@@ -104,3 +120,7 @@ def score(model_identifier: str, benchmark_identifier: str, conda_active: bool =
     return wrap_score(__file__,
                       model_identifier=model_identifier, benchmark_identifier=benchmark_identifier,
                       score_function=_run_score, conda_active=conda_active)
+
+
+# Public re-export so callers can do: from brainscore_vision import preallocate_memory
+from brainscore_vision.benchmark_helpers.memory import preallocate_memory  # noqa: E402
diff --git a/brainscore_vision/benchmark_helpers/benchmark_costs.json b/brainscore_vision/benchmark_helpers/benchmark_costs.json
new file mode 100644
index 0000000000..a740715bd7
--- /dev/null
+++ b/brainscore_vision/benchmark_helpers/benchmark_costs.json
@@ -0,0 +1,51 @@
+{
+  "Allen2022_fmri.IT-ridge": 0.4113,
+  "Allen2022_fmri.V1-ridge": 0.5544,
+  "Allen2022_fmri.V2-ridge": 0.7258,
+  "Allen2022_fmri.V4-ridge": 0.3258,
+  "Allen2022_fmri_4subj.IT-ridge": 1.49,
+  "Allen2022_fmri_4subj.V1-ridge": 0.6966,
+  "Allen2022_fmri_4subj.V2-ridge": 1.3744,
+  "Allen2022_fmri_4subj.V4-ridge": 0.6729,
+  "Allen2022_fmri_surface.IT-ridge": 0.6236,
+  "Allen2022_fmri_surface.V1-ridge": 1.2537,
+  "Allen2022_fmri_surface.V2-ridge": 1.5052,
+  "Allen2022_fmri_surface.V4-ridge": 0.6126,
+  "Allen2022_fmri_surface_4subj.IT-ridge": 0.5683,
+  "Allen2022_fmri_surface_4subj.V1-ridge": 1.3479,
+  "Allen2022_fmri_surface_4subj.V2-ridge": 1.6253,
+  "Allen2022_fmri_surface_4subj.V4-ridge": 0.7984,
+  "Cadena2017-mask": 4.4728,
+  "Cadena2017-pls": 6.9435,
+  "FreemanZiemba2013.V1-pls": 4.0874,
+  "FreemanZiemba2013.V2-pls": 3.8205,
+  "FreemanZiemba2013public.V1-pls": 1.813,
+  "FreemanZiemba2013public.V2-pls": 0.8573,
+  "Gifford2022.IT-ridge": 3.6693,
+  "Gifford2022.IT-ridgecv": 7.3324,
+  "Hebart2023_fmri.IT-ridge": 16.8948,
+  "Hebart2023_fmri.IT-ridgecv": 3.8821,
+  "Hebart2023_fmri.V1-ridge": 4.843,
+  "Hebart2023_fmri.V1-ridgecv": 8.8359,
+  "Hebart2023_fmri.V2-ridge": 15.5814,
+  "Hebart2023_fmri.V2-ridgecv": 12.4336,
+  "Hebart2023_fmri.V4-ridge": 6.8122,
+  "Hebart2023_fmri.V4-ridgecv": 12.4122,
+  "Igustibagus2024-ridge": 2.1582,
+  "MajajHong2015.IT-pls": 2.8336,
+  "MajajHong2015.V4-pls": 4.0503,
+  "MajajHong2015public.IT-pls": 3.5527,
+  "MajajHong2015public.V4-pls": 4.9696,
+  "Papale2025.IT-ridge": 43.45,
+  "Papale2025.IT-ridgecv": 7.22,
+  "Papale2025.V1-ridge": 15.07,
+  "Papale2025.V1-ridgecv": 26.58,
+  "Papale2025.V4-ridge": 40.54,
+  "Rajalingham2020.IT-pls": 0.45,
+  "Sanghavi2020.IT-pls": 6.71,
+  "Sanghavi2020.V4-pls": 8.97,
+  "SanghaviJozwik2020.IT-pls": 5.41,
+  "SanghaviJozwik2020.V4-pls": 7.65,
+  "SanghaviMurty2020.IT-pls": 0.22,
+  "SanghaviMurty2020.V4-pls": 0.56
+}
diff --git a/brainscore_vision/benchmark_helpers/memory.py b/brainscore_vision/benchmark_helpers/memory.py
new file mode 100644
index 0000000000..1e82a8ef56
--- /dev/null
+++ b/brainscore_vision/benchmark_helpers/memory.py
@@ -0,0 +1,512 @@
+"""
+Memory estimation utilities for Brain-Score benchmarks.
+
+Call :func:`preallocate_memory` before scoring to detect OOM errors early,
+rather than discovering them 6+ hours into a benchmark run.
+
+Example usage::
+
+    from brainscore_vision import load_model, load_benchmark
+    from brainscore_vision.benchmark_helpers.memory import preallocate_memory
+
+    model = load_model('resnet50')
+    benchmark = load_benchmark('MajajHong2015public.IT-pls')
+    estimate = preallocate_memory(model, benchmark)   # raises MemoryError if OOM
+    score = benchmark(model)
+"""
+
+import json
+import logging
+import os
+from dataclasses import dataclass
+from typing import Optional
+
+import psutil
+
+from brainscore_vision.benchmark_helpers.neural_common import NeuralBenchmark, TrainTestNeuralBenchmark, RSABenchmark, timebins_from_assembly
+from brainscore_vision.benchmark_helpers.screen import place_on_screen
+from brainscore_vision.model_interface import BrainModel
+
+_logger = logging.getLogger(__name__)
+
+# Default path for the persistent calibration table, chosen once at import time:
+# the file bundled with the package if it exists, otherwise the user-local
+# ~/.brainscore/benchmark_costs.json.  NOTE(review): while the bundled file exists
+# this constant never points at the user-local table — confirm that is intended.
+_BUNDLED_CALIBRATION_PATH = os.path.join(os.path.dirname(__file__), 'benchmark_costs.json')
+_DEFAULT_CALIBRATION_PATH = (
+    _BUNDLED_CALIBRATION_PATH
+    if os.path.exists(_BUNDLED_CALIBRATION_PATH)
+    else os.path.expanduser('~/.brainscore/benchmark_costs.json')
+)
+
+# Bytes per activation element; activations are assumed to be float32.
+_BYTES_PER_ELEMENT = 4
+
+# Overhead multiplier on top of the activation assembly size (used by the
+# generic fallback and the large-feature ridge estimate).  Accounts for xarray
+# coordinate arrays, regression/CV matrices, and temporary buffers.  Calibrated
+# against MajajHong2015.IT-pls (resnet50, no PCA): 1.91 GB assembly → 9.98 GB
+# observed peak delta → 5.2× real overhead.  Using 6× to stay slightly conservative.
+_OVERHEAD_FACTOR = 6
+
+# Overhead multiplier applied to the activation array for PLS benchmarks.
+# PLS regression builds cross-covariance matrices of shape
+# (num_features × num_neuroids) whose memory scales with the model's feature
+# count.  The calibrated fixed_benchmark_cost is therefore NOT model-independent
+# for PLS — it was measured on alexnet (~9K features) and severely underestimates
+# for large-feature models (200K+ features).
+#
+# Formula for PLS:  total = activation_gb × _PLS_OVERHEAD_FACTOR + fixed_cost_gb
+#   where fixed_cost_gb covers the neural-assembly side (truly model-independent).
+#
+# Validated against a 3-model × 2-PLS-benchmark grid:
+#   worst miss after fix: resnet50 × Cadena2017-pls  →  -12.7%  (within 15%)
+_PLS_OVERHEAD_FACTOR = 7
+
+
+@dataclass
+class MemoryEstimate:
+    """Breakdown of the estimated memory footprint for a benchmark run (sizes in GB of 1024**3 bytes)."""
+    num_stimuli: int
+    num_trials: int
+    num_features: int
+    num_timebins: int
+    activation_gb: float        # activation array only
+    total_estimated_gb: float   # activations plus the overhead of the formula named by formula_type
+    available_gb: float
+    fixed_benchmark_cost_gb: Optional[float] = None  # None → no calibrated or explicit fixed cost was available
+    is_pls: bool = False        # True → PLS formula was used (activation × _PLS_OVERHEAD_FACTOR + fixed_cost)
+    # formula_type: 'pls' | 'rdm' | 'ridge_large_feature' | 'ridge_formula' | 'calibrated' | 'fallback'
+    formula_type: str = 'fallback'
+    rdm_overhead_gb: Optional[float] = None  # 2× activations ('rdm') or n_stimuli² gram matrix ('ridge_formula')
+
+    @property
+    def will_oom(self) -> bool:
+        return self.total_estimated_gb > self.available_gb  # strictly greater: an exact fit counts as OK
+
+    def __str__(self) -> str:  # two-line human-readable summary, one formula description per formula_type
+        status = "OOM LIKELY" if self.will_oom else "OK"
+        if self.formula_type == 'pls':
+            fixed_str = (f" + {self.fixed_benchmark_cost_gb:.2f} GB fixed cost"
+                         if self.fixed_benchmark_cost_gb is not None else "")  # 0.0 is a valid cost
+            formula = (f"{self.activation_gb:.2f} GB activations "
+                       f"×{_PLS_OVERHEAD_FACTOR} (PLS){fixed_str}")
+        elif self.formula_type == 'rdm':
+            formula = (f"{self.activation_gb:.2f} GB activations "
+                       f"×3 (RDM pairwise distance overhead → {self.total_estimated_gb:.1f} GB total)")
+        elif self.formula_type == 'ridge_large_feature':
+            formula = (f"{self.activation_gb:.2f} GB activations "
+                       f"×{_OVERHEAD_FACTOR} (ridge SVD path: n_features > n_stimuli → {self.total_estimated_gb:.1f} GB total)")
+        elif self.formula_type == 'ridge_formula':
+            formula = (f"{self.activation_gb:.2f} GB activations "
+                       f"+ {self.rdm_overhead_gb:.2f} GB gram matrix ({self.num_stimuli}²×4B)")
+        elif self.formula_type == 'calibrated':
+            formula = (f"{self.activation_gb:.2f} GB activations "
+                       f"+ {self.fixed_benchmark_cost_gb:.2f} GB fixed benchmark cost (calibrated)")
+        else:
+            formula = (f"{self.activation_gb:.2f} GB "
+                       f"(×{_OVERHEAD_FACTOR} overhead → {self.total_estimated_gb:.1f} GB total)")
+        return (
+            f"[{status}] Memory estimate: {self.total_estimated_gb:.1f} GB needed, "
+            f"{self.available_gb:.1f} GB available\n"
+            f"  Activations: {self.num_stimuli} stimuli × {self.num_features:,} features "
+            f"× {self.num_timebins} timebins = {formula}"
+        )
+
+
+def load_calibration(path: Optional[str] = None) -> dict:
+    """Load the benchmark fixed-cost table (``{benchmark_identifier: fixed_cost_gb}``).
+
+    With an explicit *path* only that file is read.  Without one, the table bundled
+    with the package is read first and ``~/.brainscore/benchmark_costs.json`` (where
+    :func:`save_calibration` writes by default) is overlaid on top, so a local
+    calibration run extends or overrides the shipped values.  Missing files are
+    skipped silently; unreadable or malformed files are logged and skipped.
+    """
+    user_path = os.path.expanduser('~/.brainscore/benchmark_costs.json')
+    costs: dict = {}
+    for candidate in ([path] if path else [_BUNDLED_CALIBRATION_PATH, user_path]):
+        try:
+            with open(candidate) as f:
+                costs.update(json.load(f))
+        except FileNotFoundError:
+            continue
+        except Exception as e:
+            _logger.warning(f"Could not load calibration from {candidate}: {e}")
+    return costs
+
+
+def save_calibration(costs: dict, path: Optional[str] = None) -> None:
+    """Persist ``{benchmark_identifier: fixed_cost_gb}`` costs to disk as sorted JSON.
+
+    *path* defaults to ``~/.brainscore/benchmark_costs.json`` so the table shipped
+    inside the package is never overwritten; missing parent directories are created.
+    """
+    path = path or os.path.expanduser('~/.brainscore/benchmark_costs.json')
+    if os.path.dirname(path):  # a bare filename has no directory part; os.makedirs('') would raise
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+    with open(path, 'w') as f:
+        json.dump(costs, f, indent=2, sort_keys=True)
+    _logger.info(f"Calibration saved → {path}  ({len(costs)} benchmarks)")
+
+
+def _is_pls_benchmark(benchmark) -> bool:
+    """Return True if the benchmark's identifier marks it as PLS-regression based.
+
+    PLS cross-covariance matrices scale with num_features, so the calibrated
+    fixed_benchmark_cost (measured on alexnet with ~9K features) does not
+    generalise to large-feature models and a dedicated PLS formula is applied.
+    Detection is purely name-based: the identifier ends with ``-pls`` or
+    ``-reverse_pls``, or contains ``-temporal-pls``; no identifier → False.
+    """
+    name = str(getattr(benchmark, 'identifier', ''))
+    return name.endswith(('-pls', '-reverse_pls')) or '-temporal-pls' in name
+
+
+def _is_rdm_benchmark(benchmark) -> bool:
+    """Return True for RDM/RSA benchmarks.
+
+    A benchmark qualifies when it is an ``RSABenchmark`` instance or when its
+    identifier (empty if the attribute is missing) ends with ``-rdm``.  RDM
+    overhead scales with activation size (≈ 2× activation_gb), not n_stimuli².
+    """
+    return (isinstance(benchmark, RSABenchmark)
+            or str(getattr(benchmark, 'identifier', '')).endswith('-rdm'))
+
+
+def _is_ridge_benchmark(benchmark) -> bool:
+    """Return True if the benchmark uses ridge or ridgecv regression.
+
+    Detection is name-based: the identifier ends with ``-ridge`` or ``-ridgecv``.
+    ``RSABenchmark`` instances are rejected up front, whatever their identifier:
+    they are always RDM, never ridge.  The ridge gram matrix is n_stimuli ×
+    n_stimuli (model-independent), so a formula estimate works without calibration.
+    """
+    if isinstance(benchmark, RSABenchmark):
+        return False
+    return str(getattr(benchmark, 'identifier', '')).endswith(('-ridge', '-ridgecv'))
+
+
+def _get_probe_layer(model):
+    """
+    Return one committed layer to probe, or None if none can be determined cheaply.
+
+    Only already-stored region→layer entries are read (IT, V4, V2, V1 in that order);
+    lookups go through ``dict`` methods directly to avoid a lazy ``RegionLayerMap``
+    lookup.  Any failure is logged at debug level and yields None.
+    """
+    def _first(layers):
+        # A commitment may be a single layer or a list/tuple of layers; probe the first.
+        return layers[0] if isinstance(layers, (list, tuple)) else layers
+
+    try:
+        # Navigate ModelCommitment → TemporalAligned → LayerMappedModel
+        lm = getattr(model, 'layer_model', None)
+        if lm is not None and hasattr(lm, '_layer_model'):
+            lm = lm._layer_model  # TemporalAligned → LayerMappedModel
+        if lm is None:
+            lm = model  # might itself be LayerMappedModel-like
+
+        rmap = getattr(lm, 'region_layer_map', None)
+        if rmap is None:
+            return None
+        for candidate_region in ('IT', 'V4', 'V2', 'V1'):
+            if dict.__contains__(rmap, candidate_region):
+                layers = dict.__getitem__(rmap, candidate_region)
+                if layers is not None:
+                    return _first(layers)
+        # Plain dict only (exact type, not a lazy subclass): any committed region will do.
+        if type(rmap) is dict:
+            return next((_first(v) for v in rmap.values() if v is not None), None)
+    except Exception as e:  # best-effort: the caller falls back to the look_at pipeline
+        _logger.debug(f"_get_probe_layer: could not determine probe layer: {e}")
+    return None
+
+
+def preallocate_memory(
+    model: BrainModel,
+    benchmark,
+    raise_if_oom: bool = True,
+    fixed_benchmark_cost_gb: Optional[float] = None,
+) -> Optional[MemoryEstimate]:
+    """
+    Estimate memory requirements before running a full benchmark.
+
+    Probes the model with a single stimulus to get the actual feature count.
+    When possible the probe calls the activations extractor's ``_from_paths``
+    directly (bypassing ``from_stimulus_set`` / ``attach_stimulus_set_meta``, so the
+    scoring run's result cache is untouched); otherwise it uses ``look_at``.
+
+    The activation array is ``num_stimuli × num_features × num_timebins × 4 bytes``;
+    the total adds a formula-specific overhead (PLS, RDM, ridge, calibrated fixed
+    cost, or a generic multiplier).  num_trials is intentionally excluded: a
+    deterministic model processes each unique stimulus once.
+
+    Parameters
+    ----------
+    model : BrainModel
+        The candidate model that will be scored.
+    benchmark : NeuralBenchmark, TrainTestNeuralBenchmark or RSABenchmark
+        The benchmark the model will be scored on.
+    raise_if_oom : bool, optional
+        If ``True`` (default), raises :exc:`MemoryError` when the estimate
+        exceeds available RAM.  If ``False``, logs a warning instead.
+    fixed_benchmark_cost_gb : float, optional
+        Fixed benchmark overhead in GB.  If ``None`` (default), it is looked up
+        by benchmark identifier in the table from :func:`load_calibration`.
+
+    Returns
+    -------
+    MemoryEstimate or None
+        Estimated memory breakdown with a ``.will_oom`` property.  ``None`` if
+        ``BRAINSCORE_SKIP_MEMORY_CHECK=1`` or the benchmark type is unsupported.
+
+    Raises
+    ------
+    MemoryError
+        If ``raise_if_oom=True`` and estimated memory exceeds available RAM.
+    """
+    if os.environ.get('BRAINSCORE_SKIP_MEMORY_CHECK', '0') == '1':
+        _logger.debug("BRAINSCORE_SKIP_MEMORY_CHECK is set — skipping memory pre-check.")
+        return None
+
+    # ------------------------------------------------------------------ #
+    #  1. Extract metadata from the benchmark                             #
+    # ------------------------------------------------------------------ #
+    if isinstance(benchmark, NeuralBenchmark):
+        stimulus_set = benchmark._assembly.stimulus_set
+        num_stimuli = int(stimulus_set['stimulus_id'].nunique())
+        num_trials = benchmark._number_of_trials
+        timebins = benchmark.timebins
+        region = benchmark.region
+        visual_degrees = benchmark._visual_degrees
+
+    elif isinstance(benchmark, TrainTestNeuralBenchmark):
+        train_ss = benchmark.train_assembly.stimulus_set
+        test_ss = benchmark.test_assembly.stimulus_set
+        stimulus_set = train_ss
+        num_stimuli = int(train_ss['stimulus_id'].nunique()) + int(test_ss['stimulus_id'].nunique())
+        num_trials = benchmark._number_of_trials
+        timebins = benchmark.timebins
+        region = benchmark.region
+        visual_degrees = benchmark._visual_degrees
+
+    elif isinstance(benchmark, RSABenchmark):
+        stimulus_set = benchmark._assembly.stimulus_set
+        num_stimuli = int(stimulus_set['stimulus_id'].nunique())
+        num_trials = benchmark._number_of_trials
+        timebins = timebins_from_assembly(benchmark._assembly)
+        region = benchmark.region
+        visual_degrees = benchmark._visual_degrees
+
+    else:
+        # Unsupported benchmark type (e.g. behavioral/engineering called directly).
+        # Return None rather than crashing — the no-op on brainscore_vision.benchmarks.Benchmark
+        # means score_benchmark never reaches here for non-neural benchmarks, but direct
+        # calls from scripts should not raise unexpectedly.
+        _logger.debug(
+            f"preallocate_memory: unsupported benchmark type {type(benchmark).__name__}, skipping."
+        )
+        return None
+
+    # ------------------------------------------------------------------ #
+    #  2. Prepare probe stimulus (1 image, visual-degree corrected)       #
+    # ------------------------------------------------------------------ #
+    probe_set = stimulus_set.iloc[:1].copy()
+    probe_set.identifier = None
+    probe_set = place_on_screen(
+        probe_set,
+        target_visual_degrees=model.visual_degrees(),
+        source_visual_degrees=visual_degrees,
+    )
+    probe_stimulus_id = probe_set['stimulus_id'].values[0]
+    probe_path = str(probe_set.get_stimulus(probe_stimulus_id))
+
+    # ------------------------------------------------------------------ #
+    #  3. Probe the model with 1 stimulus                                 #
+    #                                                                     #
+    #  We call _from_paths directly — bypassing from_stimulus_set and    #
+    #  attach_stimulus_set_meta — so the probe cannot corrupt the        #
+    #  activations cache used by the subsequent scoring run.             #
+    #                                                                     #
+    #  We do NOT disable LayerPCA: the probe should measure the feature  #
+    #  count exactly as the scoring run will accumulate it (i.e. after   #
+    #  PCA reduction when PCA is hooked, raw otherwise).                 #
+    # ------------------------------------------------------------------ #
+    _am = getattr(model, 'activations_model', None)
+    _extractor = getattr(_am, '_extractor', None) if _am else None
+    probe_layer = _get_probe_layer(model) if _extractor is not None else None
+
+    if _extractor is not None and probe_layer is not None:
+        # Fast path: call _from_paths directly — no attach_stimulus_set_meta
+        probe_output = _extractor._from_paths(layers=[probe_layer], stimuli_paths=[probe_path])
+        num_features = probe_output.sizes['neuroid']
+        num_timebins = len(timebins)  # _from_paths has no time expansion; timebins from benchmark
+    else:
+        # Fallback: use the standard look_at pipeline
+        model.start_recording(region, time_bins=timebins)
+        probe_output = model.look_at(probe_set, number_of_trials=1)
+        num_features = probe_output.sizes['neuroid']
+        num_timebins = probe_output.sizes.get('time_bin', 1)
+
+    _logger.info(
+        f"Memory probe: benchmark={benchmark.identifier} region={region} "
+        f"stimuli={num_stimuli} features={num_features} timebins={num_timebins}"
+    )
+
+    # ------------------------------------------------------------------ #
+    #  4. Compute estimate and check against available RAM                #
+    #                                                                     #
+    #  num_trials excluded: deterministic models process each unique      #
+    #  stimulus once; trial repetition does not scale model memory.       #
+    # ------------------------------------------------------------------ #
+    activation_bytes = num_stimuli * num_features * num_timebins * _BYTES_PER_ELEMENT
+    activation_gb = activation_bytes / (1024 ** 3)
+
+    # Auto-load from the calibration table if no explicit value was given
+    if fixed_benchmark_cost_gb is None:
+        _cal = load_calibration()
+        fixed_benchmark_cost_gb = _cal.get(benchmark.identifier)
+        if fixed_benchmark_cost_gb is not None:
+            _logger.debug(
+                f"Using calibrated fixed cost for {benchmark.identifier}: "
+                f"{fixed_benchmark_cost_gb:.3f} GB"
+            )
+
+    # ------------------------------------------------------------------ #
+    #  Choose the right formula based on the benchmark's regression type  #
+    #                                                                     #
+    #  PLS: cross-covariance matrices scale with num_features — use      #
+    #  activation × _PLS_OVERHEAD_FACTOR.  This is approximate; a       #
+    #  warning is printed.                                               #
+    #                                                                     #
+    #  RDM/RSA: pairwise distance computation passes through the full    #
+    #  activation matrix — overhead ≈ 2× activation_gb.  Use 3× total.  #
+    #                                                                     #
+    #  Ridge/RidgeCV — two regimes depending on feature count:           #
+    #                                                                     #
+    #    n_features ≤ n_stimuli (primal solver): calibrated fixed cost   #
+    #    is accurate — gram matrix is n_stimuli×n_stimuli and is model-  #
+    #    independent.                                                     #
+    #                                                                     #
+    #    n_features > n_stimuli (sklearn switches to SVD of X): overhead #
+    #    ≈ 5× activation_gb — SVD creates V^T (same shape as X) and     #
+    #    U (n_stimuli×n_stimuli), so total ≈ 6× activation_gb.  The     #
+    #    calibrated fixed cost was measured on a small model (alexnet,   #
+    #    n_features < n_stimuli for most benchmarks) and severely        #
+    #    underestimates in this regime.  Use the ×6 fallback instead so  #
+    #    the pre-flight raises MemoryError cleanly before the OS kills   #
+    #    the container with no Python traceback.                          #
+    # ------------------------------------------------------------------ #
+    is_pls = _is_pls_benchmark(benchmark)
+    is_rdm = _is_rdm_benchmark(benchmark)
+    is_ridge = _is_ridge_benchmark(benchmark)
+    ridge_large_feature = is_ridge and num_features > num_stimuli
+
+    rdm_overhead_gb = None
+    if is_pls:
+        total_estimated_gb = activation_gb * _PLS_OVERHEAD_FACTOR + (fixed_benchmark_cost_gb or 0.0)
+        formula_type = 'pls'
+    elif is_rdm:
+        # Overhead ≈ 2× activation_gb (scales with features, not n_stimuli²).
+        # Validated across alexnet/resnet50/ViT on Allen2022_fmri.IT-rdm.
+        rdm_overhead_gb = 2 * activation_gb
+        total_estimated_gb = activation_gb + rdm_overhead_gb  # = 3 × activation_gb
+        formula_type = 'rdm'
+    elif ridge_large_feature:
+        # n_features > n_stimuli: sklearn SVD path — overhead ≈ 5× activation_gb.
+        # Validated: resnet50/ViT × Gifford2022.IT-ridgecv both gave exactly 5.1×.
+        # Use ×6 total (activation + 5× overhead) to stay conservative and ensure
+        # the pre-flight MemoryError fires before the OS kills the container.
+        total_estimated_gb = activation_gb * _OVERHEAD_FACTOR
+        formula_type = 'ridge_large_feature'
+    elif is_ridge and fixed_benchmark_cost_gb is not None:
+        total_estimated_gb = activation_gb + fixed_benchmark_cost_gb
+        formula_type = 'calibrated'
+    elif is_ridge:
+        # No calibration entry, primal regime: gram matrix is n_stimuli×n_stimuli
+        rdm_overhead_gb = (num_stimuli ** 2) * _BYTES_PER_ELEMENT / (1024 ** 3)
+        total_estimated_gb = activation_gb + rdm_overhead_gb
+        formula_type = 'ridge_formula'
+    elif fixed_benchmark_cost_gb is not None:
+        total_estimated_gb = activation_gb + fixed_benchmark_cost_gb
+        formula_type = 'calibrated'
+    else:
+        total_estimated_gb = activation_gb * _OVERHEAD_FACTOR
+        formula_type = 'fallback'
+
+    available_gb = psutil.virtual_memory().available / (1024 ** 3)
+
+    estimate = MemoryEstimate(
+        num_stimuli=num_stimuli,
+        num_trials=num_trials,
+        num_features=num_features,
+        num_timebins=num_timebins,
+        activation_gb=activation_gb,
+        total_estimated_gb=total_estimated_gb,
+        available_gb=available_gb,
+        fixed_benchmark_cost_gb=fixed_benchmark_cost_gb,
+        is_pls=is_pls,
+        formula_type=formula_type,
+        rdm_overhead_gb=rdm_overhead_gb,
+    )
+
+    verdict = "OOM LIKELY" if estimate.will_oom else "OK"
+    print(
+        f"[pre-flight] [{verdict}]  "
+        f"{estimate.total_estimated_gb:.2f} GB needed  /  {estimate.available_gb:.1f} GB available  "
+        f"[{formula_type}]\n"
+        f"  {estimate.num_stimuli:,} stimuli  ×  {estimate.num_features:,} features  ×  "
+        f"{estimate.num_timebins} timebins  =  {estimate.activation_gb:.3f} GB activation",
+        end='',
+        flush=True,
+    )
+    if formula_type == 'pls':
+        fixed_str = (f"  +  {estimate.fixed_benchmark_cost_gb:.3f} GB fixed cost"
+                     if estimate.fixed_benchmark_cost_gb is not None else "")
+        print(f"  ×{_PLS_OVERHEAD_FACTOR} (PLS){fixed_str}  =  {estimate.total_estimated_gb:.3f} GB total",
+              flush=True)
+        print(
+            f"[pre-flight] WARNING: PLS overhead multiplier (×{_PLS_OVERHEAD_FACTOR}) is approximate. "
+            f"Actual usage can vary significantly depending on model feature count and convergence.",
+            flush=True,
+        )
+    elif formula_type == 'ridge_large_feature':
+        print(f"  ×{_OVERHEAD_FACTOR} (ridge SVD: n_features={num_features:,} > n_stimuli={num_stimuli:,})"
+              f"  =  {estimate.total_estimated_gb:.3f} GB total", flush=True)
+    elif formula_type == 'rdm':
+        print(f"  ×3 (RDM pairwise overhead)"
+              f"  =  {estimate.total_estimated_gb:.3f} GB total", flush=True)
+    elif formula_type == 'ridge_formula':
+        print(f"  +  {estimate.rdm_overhead_gb:.3f} GB gram matrix ({num_stimuli:,}²×4B)  "
+              f"[no calibration entry — formula estimate]"
+              f"  =  {estimate.total_estimated_gb:.3f} GB total", flush=True)
+    elif formula_type == 'calibrated':
+        print(f"  +  {estimate.fixed_benchmark_cost_gb:.3f} GB benchmark overhead (calibrated)"
+              f"  =  {estimate.total_estimated_gb:.3f} GB total", flush=True)
+    else:
+        print(f"  ×{_OVERHEAD_FACTOR}  =  {estimate.total_estimated_gb:.3f} GB total", flush=True)
+
+    # Structured sentinel for CloudWatch Insights calibration queries and reliable
+    # OOM signal parsing by the scoring orchestrator. Every pre-flight run emits
+    # this line regardless of outcome — filter on will_oom=true for OOM cases.
+    # Query example: filter @message like "BRAINSCORE_PREFLIGHT"
+    #                | stats avg(estimate_gb) by benchmark_id, formula_type
+    print(
+        f"BRAINSCORE_PREFLIGHT {json.dumps({'benchmark_id': str(benchmark.identifier), 'estimate_gb': round(total_estimated_gb, 3), 'available_gb': round(available_gb, 1), 'formula_type': formula_type, 'will_oom': estimate.will_oom, 'num_features': num_features, 'num_stimuli': num_stimuli})}",
+        flush=True,
+    )
+
+    if estimate.will_oom:
+        msg = (
+            f"preallocate_memory: {str(estimate)}. "
+            f"Consider reducing layer output dimensionality (e.g. via LayerPCA), "
+            f"running on a machine with more RAM, or selecting a different layer."
+        )
+        if raise_if_oom:
+            raise MemoryError(msg)
+        else:
+            _logger.warning(msg)
+
+    return estimate
diff --git a/brainscore_vision/benchmark_helpers/neural_common.py b/brainscore_vision/benchmark_helpers/neural_common.py
index 7d93ed2b3c..12a45348d9 100644
--- a/brainscore_vision/benchmark_helpers/neural_common.py
+++ b/brainscore_vision/benchmark_helpers/neural_common.py
@@ -24,6 +24,10 @@ def __init__(self, identifier, assembly, similarity_metric, visual_degrees, numb
         self._visual_degrees = visual_degrees
         self._number_of_trials = number_of_trials
 
+    def preallocate_memory(self, candidate: BrainModel) -> None:
+        from brainscore_vision.benchmark_helpers.memory import preallocate_memory as _preflight  # lazy: memory.py imports this module
+        _preflight(candidate, self)  # the estimate is discarded; a MemoryError propagates to the caller
+
     def __call__(self, candidate: BrainModel):
         candidate.start_recording(self.region, time_bins=self.timebins)
         stimulus_set = place_on_screen(self._assembly.stimulus_set, target_visual_degrees=candidate.visual_degrees(),
@@ -79,8 +83,12 @@ def __init__(self, identifier, ceiling_func, version,
             self.ceiling_mode = neuroid_wise_explained_var
         else:
             self.ceiling_mode = explained_variance
-        
-    def __call__(self, candidate: BrainModel):  
+
+    def preallocate_memory(self, candidate: BrainModel) -> None:
+        from brainscore_vision.benchmark_helpers.memory import preallocate_memory as _preflight  # lazy: memory.py imports this module
+        _preflight(candidate, self)  # the estimate is discarded; a MemoryError propagates to the caller
+
+    def __call__(self, candidate: BrainModel):
         """
         Score a candidate model on this benchmark.
         
@@ -247,6 +255,10 @@ def __init__(
             bibtex=bibtex,
         )
 
+    def preallocate_memory(self, candidate: BrainModel) -> None:
+        from brainscore_vision.benchmark_helpers.memory import preallocate_memory as _preflight  # lazy: memory.py imports this module
+        _preflight(candidate, self)  # the estimate is discarded; a MemoryError propagates to the caller
+
     def __call__(self, candidate: BrainModel) -> Score:
         assembly = self._assembly
         timebins = timebins_from_assembly(assembly)
diff --git a/brainscore_vision/benchmarks/__init__.py b/brainscore_vision/benchmarks/__init__.py
index 98c3ca69e1..e6762b5c1a 100644
--- a/brainscore_vision/benchmarks/__init__.py
+++ b/brainscore_vision/benchmarks/__init__.py
@@ -38,6 +38,18 @@ def __call__(self, candidate: BrainModel) -> Score:
         """
         raise NotImplementedError()
 
+    def preallocate_memory(self, candidate: BrainModel) -> None:
+        """
+        Hook for an optional pre-flight memory check that runs before scoring.
+
+        This default does nothing. Neural benchmarks override it so that a
+        :exc:`MemoryError` is raised early when the model is estimated to exceed
+        available RAM. Behavioral and engineering benchmarks keep this no-op
+        because they do not run activation extraction and rarely require
+        pre-flight checks.
+
+        Note: if a behavioral benchmark does OOM (OS kill, exit 137), the gated
+        scoring orchestrator will not detect it as an OOM and will not
+        automatically escalate the tier.
+
+        :param candidate: the model that is about to be scored; unused here.
+        """
+        return None
+
     @property
     def bibtex(self) -> str:
         """
diff --git a/scripts/mem_profile_suite.py b/scripts/mem_profile_suite.py
new file mode 100644
index 0000000000..9c8d36d620
--- /dev/null
+++ b/scripts/mem_profile_suite.py
@@ -0,0 +1,1176 @@
+"""
+Memory Profile Suite
+====================
+Two modes: a 5×5 accuracy grid and a full benchmark calibration run.
+
+BACKGROUND
+----------
+Before scoring a model on a benchmark, we want to estimate whether there is
+enough RAM to complete the run without OOM-killing the process.  The estimate
+has two components:
+
+  total_memory_needed = activation_gb + fixed_benchmark_cost_gb
+
+  • activation_gb          — the raw model output array
+                             (stimuli × features × timebins × 4 bytes)
+                             measured by a cheap 1-stimulus forward pass (the "probe")
+  • fixed_benchmark_cost   — the benchmark's model-independent overhead
+                             (regression matrices, xarray bookkeeping, CV buffers)
+                             calibrated once per benchmark via --calibrate and
+                             stored in ~/.brainscore/benchmark_costs.json
+
+The fixed cost is environment-specific (calibrate on the same machine you score on).
+
+MODE 1 — 5×5 accuracy grid (default)
+--------------------------------------
+Runs 5 models × 5 benchmarks, compares the pre-flight estimate to the actual
+peak RSS delta for each pair.  Good for validating the estimation system.
+
+    python scripts/mem_profile_suite.py [--csv out.csv] [--skip-score]
+
+    --skip-score   probe only, skip actual scoring
+    --csv PATH     write per-pair results to CSV (flushed after each pair)
+
+MODE 2 — Benchmark calibration (--calibrate)
+---------------------------------------------
+Runs alexnet on every known benchmark to measure fixed_benchmark_cost per
+benchmark.  Results are saved incrementally to ~/.brainscore/benchmark_costs.json
+so a crash mid-run does not lose completed work.  Non-neural benchmarks
+(behavioral, engineering) are skipped automatically.
+
+    python scripts/mem_profile_suite.py --calibrate [--csv out.csv]
+                                        [--calibration-json PATH]
+                                        [--resume-from BENCHMARK_ID]
+
+    --resume-from BID   skip all benchmarks up to and including BID,
+                        then continue — use this after a crash to pick up
+                        where you left off
+"""
+import os
+import sys
+import time
+import argparse
+import csv
+import logging
+import threading
+
+# ---------------------------------------------------------------------------
+# Resolve local repos so the script works without installation
+# ---------------------------------------------------------------------------
+# Directory holding this script; its parent is treated as the vision repo root.
+_script_dir = os.path.dirname(os.path.abspath(__file__))
+_vision_root = os.path.dirname(_script_dir)
+# A directory literally named 'core' next to the vision repo root
+# (presumably a brainscore_core checkout — TODO confirm the expected layout).
+_core_root = os.path.join(os.path.dirname(_vision_root), 'core')
+# Prepend each root that is not already on sys.path. Every insert goes to
+# index 0, so when both are added _core_root ends up ahead of _vision_root.
+for _p in [_vision_root, _core_root]:
+    if _p not in sys.path:
+        sys.path.insert(0, _p)
+
+logging.basicConfig(level=logging.WARNING)
+
+print("Importing brainscore_vision... ", end='', flush=True)
+import brainscore_vision  # noqa: E402
+print("done.", flush=True)
+
+import psutil  # noqa: E402
+
+# ---------------------------------------------------------------------------
+# Model / benchmark lists
+# ---------------------------------------------------------------------------
+MODELS = [
+    'resnet50_tutorial',
+    'alexnet',
+    'vit_large_patch14_clip_224:openai_ft_in1k',
+    'VOneCORnet-S',
+    'efficientnet_b0',
+]
+
+BENCHMARKS = [
+    'MajajHong2015.IT-pls',
+    'Sanghavi2020.IT-pls',
+    'Papale2025.IT-ridgecv',
+    'Hebart2023_fmri.V4-ridgecv',
+    'Allen2022_fmri.IT-ridge',
+]
+
+_BM_SHORT = {
+    'MajajHong2015.IT-pls':        'MajajHong.IT',
+    'Sanghavi2020.IT-pls':         'Sanghavi.IT',
+    'Papale2025.IT-ridgecv':       'Papale25.IT',
+    'Hebart2023_fmri.V4-ridgecv':  'Hebart23.V4',
+    'Allen2022_fmri.IT-ridge':     'Allen22.IT',
+}
+
+# All registered leaf benchmarks — used by --calibrate mode.
+ALL_BENCHMARKS = [
+    # Allen2022 fMRI (volumetric)
+    'Allen2022_fmri.V1-ridge', 'Allen2022_fmri.V2-ridge',
+    'Allen2022_fmri.V4-ridge', 'Allen2022_fmri.IT-ridge',
+    'Allen2022_fmri.V1-rdm',   'Allen2022_fmri.V2-rdm',
+    'Allen2022_fmri.V4-rdm',   'Allen2022_fmri.IT-rdm',
+    'Allen2022_fmri_4subj.V1-ridge', 'Allen2022_fmri_4subj.V2-ridge',
+    'Allen2022_fmri_4subj.V4-ridge', 'Allen2022_fmri_4subj.IT-ridge',
+    'Allen2022_fmri_4subj.V1-rdm',   'Allen2022_fmri_4subj.V2-rdm',
+    'Allen2022_fmri_4subj.V4-rdm',   'Allen2022_fmri_4subj.IT-rdm',
+    # Allen2022 fMRI (surface)
+    'Allen2022_fmri_surface.V1-ridge', 'Allen2022_fmri_surface.V2-ridge',
+    'Allen2022_fmri_surface.V4-ridge', 'Allen2022_fmri_surface.IT-ridge',
+    'Allen2022_fmri_surface.V1-rdm',   'Allen2022_fmri_surface.V2-rdm',
+    'Allen2022_fmri_surface.V4-rdm',   'Allen2022_fmri_surface.IT-rdm',
+    'Allen2022_fmri_surface_4subj.V1-ridge', 'Allen2022_fmri_surface_4subj.V2-ridge',
+    'Allen2022_fmri_surface_4subj.V4-ridge', 'Allen2022_fmri_surface_4subj.IT-ridge',
+    'Allen2022_fmri_surface_4subj.V1-rdm',   'Allen2022_fmri_surface_4subj.V2-rdm',
+    'Allen2022_fmri_surface_4subj.V4-rdm',   'Allen2022_fmri_surface_4subj.IT-rdm',
+    # Baker2022
+    'Baker2022frankenstein-accuracy_delta',
+    'Baker2022fragmented-accuracy_delta',
+    'Baker2022inverted-accuracy_delta',
+    # BMD2024
+    'BMD2024.texture_1Behavioral-accuracy_distance',
+    'BMD2024.texture_2Behavioral-accuracy_distance',
+    'BMD2024.dotted_1Behavioral-accuracy_distance',
+    'BMD2024.dotted_2Behavioral-accuracy_distance',
+    # Bracci2019
+    'Bracci2019.anteriorVTC-rdm',
+    # Cadena2017
+    'Cadena2017-pls', 'Cadena2017-mask',
+    # Coggan2024
+    'tong.Coggan2024_fMRI.V1-rdm', 'tong.Coggan2024_fMRI.V2-rdm',
+    'tong.Coggan2024_fMRI.V4-rdm', 'tong.Coggan2024_fMRI.IT-rdm',
+    'tong.Coggan2024_behavior-ConditionWiseAccuracySimilarity',
+    # Ferguson2024
+    'Ferguson2024circle_line-value_delta', 'Ferguson2024color-value_delta',
+    'Ferguson2024convergence-value_delta',  'Ferguson2024eighth-value_delta',
+    'Ferguson2024gray_easy-value_delta',    'Ferguson2024gray_hard-value_delta',
+    'Ferguson2024half-value_delta',         'Ferguson2024juncture-value_delta',
+    'Ferguson2024lle-value_delta',          'Ferguson2024llh-value_delta',
+    'Ferguson2024quarter-value_delta',      'Ferguson2024round_f-value_delta',
+    'Ferguson2024round_v-value_delta',      'Ferguson2024tilted_line-value_delta',
+    # FreemanZiemba2013
+    'FreemanZiemba2013.V1-pls',       'FreemanZiemba2013.V2-pls',
+    'FreemanZiemba2013public.V1-pls', 'FreemanZiemba2013public.V2-pls',
+    # Geirhos2021
+    'Geirhos2021colour-top1',              'Geirhos2021colour-error_consistency',
+    'Geirhos2021contrast-top1',            'Geirhos2021contrast-error_consistency',
+    'Geirhos2021cueconflict-top1',         'Geirhos2021cueconflict-error_consistency',
+    'Geirhos2021edge-top1',                'Geirhos2021edge-error_consistency',
+    'Geirhos2021eidolonI-top1',            'Geirhos2021eidolonI-error_consistency',
+    'Geirhos2021eidolonII-top1',           'Geirhos2021eidolonII-error_consistency',
+    'Geirhos2021eidolonIII-top1',          'Geirhos2021eidolonIII-error_consistency',
+    'Geirhos2021falsecolour-top1',         'Geirhos2021falsecolour-error_consistency',
+    'Geirhos2021highpass-top1',            'Geirhos2021highpass-error_consistency',
+    'Geirhos2021lowpass-top1',             'Geirhos2021lowpass-error_consistency',
+    'Geirhos2021phasescrambling-top1',     'Geirhos2021phasescrambling-error_consistency',
+    'Geirhos2021powerequalisation-top1',   'Geirhos2021powerequalisation-error_consistency',
+    'Geirhos2021rotation-top1',            'Geirhos2021rotation-error_consistency',
+    'Geirhos2021silhouette-top1',          'Geirhos2021silhouette-error_consistency',
+    'Geirhos2021sketch-top1',              'Geirhos2021sketch-error_consistency',
+    'Geirhos2021stylized-top1',            'Geirhos2021stylized-error_consistency',
+    'Geirhos2021uniformnoise-top1',        'Geirhos2021uniformnoise-error_consistency',
+    # Gifford2022
+    'Gifford2022.IT-ridge', 'Gifford2022.IT-ridgecv',
+    # Hebart2023
+    'Hebart2023-match',
+    'Hebart2023_fmri.V1-ridge',   'Hebart2023_fmri.V2-ridge',
+    'Hebart2023_fmri.V4-ridge',   'Hebart2023_fmri.IT-ridge',
+    'Hebart2023_fmri.V1-ridgecv', 'Hebart2023_fmri.V2-ridgecv',
+    'Hebart2023_fmri.V4-ridgecv', 'Hebart2023_fmri.IT-ridgecv',
+    # Hermann2020
+    'Hermann2020cueconflict-shape_bias', 'Hermann2020cueconflict-shape_match',
+    # Igustibagus2024
+    'Igustibagus2024-ridge', 'Igustibagus2024.IT_readout-accuracy',
+    # ImageNet
+    'ImageNet-top1',
+    'ImageNet-C-noise-top1', 'ImageNet-C-blur-top1',
+    'ImageNet-C-weather-top1', 'ImageNet-C-digital-top1',
+    # Islam2021
+    'Islam2021-shape_v1_dimensionality',  'Islam2021-texture_v1_dimensionality',
+    'Islam2021-shape_v2_dimensionality',  'Islam2021-texture_v2_dimensionality',
+    'Islam2021-shape_v4_dimensionality',  'Islam2021-texture_v4_dimensionality',
+    'Islam2021-shape_it_dimensionality',  'Islam2021-texture_it_dimensionality',
+    # Kar2019
+    'Kar2019-ost',
+    # Lonnqvist2024
+    'Lonnqvist2024_InlabInstructionsBehavioralAccuracyDistance',
+    'Lonnqvist2024_InlabNoInstructionsBehavioralAccuracyDistance',
+    'Lonnqvist2024_OnlineNoInstructionsBehavioralAccuracyDistance',
+    'Lonnqvist2024_EngineeringAccuracy',
+    # MajajHong2015
+    'MajajHong2015.V4-pls',              'MajajHong2015.IT-pls',
+    'MajajHong2015public.V4-pls',        'MajajHong2015public.IT-pls',
+    'MajajHong2015public.V4-temporal-pls', 'MajajHong2015public.IT-temporal-pls',
+    'MajajHong2015public.V4-reverse_pls', 'MajajHong2015public.IT-reverse_pls',
+    # Malania2007
+    'Malania2007.short2-threshold_elevation',  'Malania2007.short4-threshold_elevation',
+    'Malania2007.short6-threshold_elevation',  'Malania2007.short8-threshold_elevation',
+    'Malania2007.short16-threshold_elevation', 'Malania2007.equal2-threshold_elevation',
+    'Malania2007.long2-threshold_elevation',   'Malania2007.equal16-threshold_elevation',
+    'Malania2007.long16-threshold_elevation',  'Malania2007.vernieracuity-threshold',
+    # Maniquet2024
+    'Maniquet2024-confusion_similarity', 'Maniquet2024-tasks_consistency',
+    # Marques2020
+    'Marques2020_Cavanaugh2002-grating_summation_field',
+    'Marques2020_Cavanaugh2002-surround_diameter',
+    'Marques2020_Cavanaugh2002-surround_suppression_index',
+    'Marques2020_DeValois1982-pref_or',
+    'Marques2020_DeValois1982-peak_sf',
+    'Marques2020_FreemanZiemba2013-texture_modulation_index',
+    'Marques2020_FreemanZiemba2013-abs_texture_modulation_index',
+    'Marques2020_FreemanZiemba2013-texture_selectivity',
+    'Marques2020_FreemanZiemba2013-texture_sparseness',
+    'Marques2020_FreemanZiemba2013-texture_variance_ratio',
+    'Marques2020_FreemanZiemba2013-max_texture',
+    'Marques2020_FreemanZiemba2013-max_noise',
+    'Marques2020_Ringach2002-circular_variance',  'Marques2020_Ringach2002-or_bandwidth',
+    'Marques2020_Ringach2002-orth_pref_ratio',    'Marques2020_Ringach2002-or_selective',
+    'Marques2020_Ringach2002-cv_bandwidth_ratio', 'Marques2020_Ringach2002-opr_cv_diff',
+    'Marques2020_Ringach2002-max_dc',             'Marques2020_Ringach2002-modulation_ratio',
+    'Marques2020_Schiller1976-sf_selective',      'Marques2020_Schiller1976-sf_bandwidth',
+    # ObjectNet
+    'ObjectNet-top1',
+    # Papale2025
+    'Papale2025.V1-ridge',   'Papale2025.V4-ridge',   'Papale2025.IT-ridge',
+    'Papale2025.V1-ridgecv', 'Papale2025.V4-ridgecv', 'Papale2025.IT-ridgecv',
+    # Rajalingham2018
+    'Rajalingham2018-i2n', 'Rajalingham2018public-i2n',
+    # Rajalingham2020
+    'Rajalingham2020.IT-pls',
+    # Sanghavi2020
+    'Sanghavi2020.V4-pls',     'Sanghavi2020.IT-pls',
+    'SanghaviJozwik2020.V4-pls', 'SanghaviJozwik2020.IT-pls',
+    'SanghaviMurty2020.V4-pls',  'SanghaviMurty2020.IT-pls',
+    # Scialom2024
+    'Scialom2024_rgbBehavioralAccuracyDistance',
+    'Scialom2024_contoursBehavioralAccuracyDistance',
+    'Scialom2024_phosphenes-12BehavioralAccuracyDistance',
+    'Scialom2024_phosphenes-16BehavioralAccuracyDistance',
+    'Scialom2024_phosphenes-21BehavioralAccuracyDistance',
+    'Scialom2024_phosphenes-27BehavioralAccuracyDistance',
+    'Scialom2024_phosphenes-35BehavioralAccuracyDistance',
+    'Scialom2024_phosphenes-46BehavioralAccuracyDistance',
+    'Scialom2024_phosphenes-59BehavioralAccuracyDistance',
+    'Scialom2024_phosphenes-77BehavioralAccuracyDistance',
+    'Scialom2024_phosphenes-100BehavioralAccuracyDistance',
+    'Scialom2024_segments-12BehavioralAccuracyDistance',
+    'Scialom2024_segments-16BehavioralAccuracyDistance',
+    'Scialom2024_segments-21BehavioralAccuracyDistance',
+    'Scialom2024_segments-27BehavioralAccuracyDistance',
+    'Scialom2024_segments-35BehavioralAccuracyDistance',
+    'Scialom2024_segments-46BehavioralAccuracyDistance',
+    'Scialom2024_segments-59BehavioralAccuracyDistance',
+    'Scialom2024_segments-77BehavioralAccuracyDistance',
+    'Scialom2024_segments-100BehavioralAccuracyDistance',
+    'Scialom2024_phosphenes-allBehavioralErrorConsistency',
+    'Scialom2024_segments-allBehavioralErrorConsistency',
+    'Scialom2024_phosphenes-allBehavioralAccuracyDistance',
+    'Scialom2024_segments-allBehavioralAccuracyDistance',
+    'Scialom2024_rgbEngineeringAccuracy',
+    'Scialom2024_contoursEngineeringAccuracy',
+    'Scialom2024_phosphenes-12EngineeringAccuracy',
+    'Scialom2024_phosphenes-16EngineeringAccuracy',
+    'Scialom2024_phosphenes-21EngineeringAccuracy',
+    'Scialom2024_phosphenes-27EngineeringAccuracy',
+    'Scialom2024_phosphenes-35EngineeringAccuracy',
+    'Scialom2024_phosphenes-46EngineeringAccuracy',
+    'Scialom2024_phosphenes-59EngineeringAccuracy',
+    'Scialom2024_phosphenes-77EngineeringAccuracy',
+    'Scialom2024_phosphenes-100EngineeringAccuracy',
+    'Scialom2024_segments-12EngineeringAccuracy',
+    'Scialom2024_segments-16EngineeringAccuracy',
+    'Scialom2024_segments-21EngineeringAccuracy',
+    'Scialom2024_segments-27EngineeringAccuracy',
+    'Scialom2024_segments-35EngineeringAccuracy',
+    'Scialom2024_segments-46EngineeringAccuracy',
+    'Scialom2024_segments-59EngineeringAccuracy',
+    'Scialom2024_segments-77EngineeringAccuracy',
+    'Scialom2024_segments-100EngineeringAccuracy',
+]
+
+# ---------------------------------------------------------------------------
+# ANSI colours
+# ---------------------------------------------------------------------------
+_RESET  = '\033[0m'
+_BOLD   = '\033[1m'
+_GREEN  = '\033[32m'
+_YELLOW = '\033[33m'
+_RED    = '\033[31m'
+_CYAN   = '\033[36m'
+_DIM    = '\033[2m'
+_BLUE   = '\033[34m'
+
+
+def _c(text, colour):
+    """Return ``text`` prefixed with the escape sequence ``colour`` and followed by ``_RESET``."""
+    return "{}{}{}".format(colour, text, _RESET)
+
+
+def _step(msg, indent=4):
+    """Print a progress line: ``indent`` spaces, a blue arrow, then ``msg`` (flushed immediately)."""
+    margin = ' ' * indent
+    arrow = _c('→', _BLUE)
+    print(f"{margin}{arrow} {msg}", flush=True)
+
+
+def _substep(msg, indent=6):
+    """Print a detail line: ``indent`` spaces, a dim middle dot, then ``msg`` (flushed immediately)."""
+    margin = ' ' * indent
+    bullet = _c('·', _DIM)
+    print(f"{margin}{bullet} {msg}", flush=True)
+
+
+def _gb(n_bytes):
+    """Format a byte count as a string in units of 1024**3 bytes with two decimals, e.g. ``'1.50 GB'``."""
+    in_gb = n_bytes / (1 << 30)
+    return "{:.2f} GB".format(in_gb)
+
+
+# ---------------------------------------------------------------------------
+# Peak RSS monitor (background thread)
+# ---------------------------------------------------------------------------
+
+class _PeakMonitor:
+    """
+    Track the highest RSS of the current process from a background daemon thread.
+
+    The peak starts at the RSS read when the monitor is constructed. After
+    :meth:`start`, the thread re-reads RSS every ``interval`` seconds until
+    :meth:`stop` is called; spikes that fall between two samples are not seen.
+    """
+
+    def __init__(self, interval=0.5):
+        """
+        :param interval: seconds to wait between two RSS samples.
+        """
+        self._interval = interval
+        self._proc = psutil.Process(os.getpid())
+        self._peak = self._proc.memory_info().rss
+        self._stop = threading.Event()
+        self._thread = threading.Thread(target=self._run, daemon=True)
+
+    def start(self):
+        """Start the sampling thread and return ``self`` so the call can be chained."""
+        self._thread.start()
+        return self
+
+    def stop(self):
+        """Signal the sampling thread to finish, wait for it, and return the peak RSS in bytes."""
+        self._stop.set()
+        self._thread.join()
+        return self._peak
+
+    def _run(self):
+        """Sampling loop executed on the background thread."""
+        stop_requested = self._stop
+        while not stop_requested.is_set():
+            try:
+                self._peak = max(self._peak, self._proc.memory_info().rss)
+            except psutil.NoSuchProcess:
+                # The process can no longer be queried; nothing left to sample.
+                return
+            # Sleep for one interval, waking early if stop() is called.
+            stop_requested.wait(self._interval)
+
+
+# ---------------------------------------------------------------------------
+# Comparison helpers
+# ---------------------------------------------------------------------------
+
+def _compare_label(estimate_gb, actual_delta_gb):
+    """
+    Classify how an estimate compares with the measured RSS delta.
+
+    :param estimate_gb: estimated memory need in GB.
+    :param actual_delta_gb: measured peak-minus-baseline RSS in GB.
+    :return: ``(colour, verdict_string, ratio)`` where ``ratio`` is
+        ``estimate_gb / actual_delta_gb``, or ``None`` when the delta is at most 0.01 GB.
+    """
+    # A delta this small is treated as noise; no ratio is computed.
+    if actual_delta_gb <= 0.01:
+        return _GREEN, "no measurable RSS delta", None
+
+    ratio = estimate_gb / actual_delta_gb
+    if ratio >= 0.8:
+        return _GREEN,  f"ACCURATE  ({ratio:.2f}× of actual)", ratio
+
+    # Below 0.8 the estimate counts as an under-prediction; only the colour
+    # differs between the moderate (>= 0.4) and the severe case.
+    shortfall_gb = actual_delta_gb - estimate_gb
+    shortfall_pct = (1 - ratio) * 100
+    severity = _YELLOW if ratio >= 0.4 else _RED
+    return severity, f"UNDER by {shortfall_gb:.2f} GB  ({shortfall_pct:.0f}% under)", ratio
+
+
+# ---------------------------------------------------------------------------
+# Result helper
+# ---------------------------------------------------------------------------
+
+def _make_result(model_id, benchmark_id, **kw):
+    """Build a result dict holding ``model_id``, ``benchmark_id`` and every extra keyword given."""
+    result = {'model_id': model_id, 'benchmark_id': benchmark_id}
+    result.update(kw)
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Run one (model, benchmark) pair
+# ---------------------------------------------------------------------------
+
+def run_pair(model, model_id, benchmark, benchmark_id, skip_score=False):
+    """
+    Probe, optionally score, and report memory use for one (model, benchmark) pair.
+
+    First calls ``preallocate_memory(model, benchmark, raise_if_oom=False)`` to get an
+    estimate. Unless ``skip_score`` is set, it then runs ``benchmark(model)`` while a
+    background :class:`_PeakMonitor` samples this process's RSS, and prints a comparison
+    of the estimate against the observed peak-minus-baseline delta.
+
+    :param model: loaded model object, passed to the probe and to ``benchmark``.
+    :param model_id: identifier stored in the result under ``model_id``.
+    :param benchmark: loaded benchmark; called as ``benchmark(model)`` to score.
+    :param benchmark_id: identifier stored in the result under ``benchmark_id``.
+    :param skip_score: if true, return right after the probe without scoring.
+    :return: dict built by :func:`_make_result`. ``status`` is one of ``'skip'``,
+        ``'error'``, ``'probe_only'``, ``'ok'`` or ``'oom'``. Early-exit results carry
+        fewer keys than the full result (for example no ``ratio`` or ``peak_rss_gb``).
+    """
+    from brainscore_vision.benchmark_helpers.memory import preallocate_memory
+
+    proc = psutil.Process(os.getpid())
+
+    # ── Pre-flight probe ─────────────────────────────────────────────────
+    _step("pre-flight probe  (1-stimulus forward pass)")
+    t_probe = time.time()
+    try:
+        est = preallocate_memory(model, benchmark, raise_if_oom=False)
+    except TypeError as e:
+        # A TypeError from the probe is reported as an unsupported benchmark type
+        # and recorded as a skip rather than an error.
+        _substep(_c(f"skipped — unsupported benchmark type: {e}", _DIM))
+        return _make_result(model_id, benchmark_id, status='skip',
+                            est_gb=None, act_gb=None, actual_delta_gb=None, score=None,
+                            probe_elapsed=time.time() - t_probe,
+                            score_elapsed=None, note=str(e)[:100])
+    except Exception as e:
+        _substep(_c(f"probe ERROR: {str(e)[:100]}", _RED))
+        return _make_result(model_id, benchmark_id, status='error',
+                            est_gb=None, act_gb=None, actual_delta_gb=None, score=None,
+                            probe_elapsed=time.time() - t_probe,
+                            score_elapsed=None, note=str(e)[:120])
+
+    probe_elapsed = time.time() - t_probe
+
+    # A None estimate is reported as the check having been disabled
+    # (presumably how the probe signals BRAINSCORE_SKIP_MEMORY_CHECK — confirm in memory.py).
+    if est is None:
+        _substep(_c("skipped (BRAINSCORE_SKIP_MEMORY_CHECK set)", _DIM))
+        return _make_result(model_id, benchmark_id, status='skip',
+                            est_gb=None, act_gb=None, actual_delta_gb=None, score=None,
+                            probe_elapsed=probe_elapsed, score_elapsed=None,
+                            note='BRAINSCORE_SKIP_MEMORY_CHECK set')
+
+    _substep(f"features={est.num_features:,}  stimuli={est.num_stimuli:,}  timebins={est.num_timebins}")
+    # NOTE(review): the "×6" in this message is a literal, not read from the estimate —
+    # confirm it matches the factor the probe actually applies.
+    _substep(
+        f"activation = {est.activation_gb:.3f} GB  ×6 overhead  "
+        f"→ {_c(f'estimate: {est.total_estimated_gb:.2f} GB', _CYAN)}"
+    )
+
+    if skip_score:
+        return _make_result(model_id, benchmark_id, status='probe_only',
+                            est_gb=est.total_estimated_gb, act_gb=est.activation_gb,
+                            feat=est.num_features, stimuli=est.num_stimuli,
+                            timebins=est.num_timebins,
+                            actual_delta_gb=None, score=None,
+                            probe_elapsed=probe_elapsed, score_elapsed=None, note='--skip-score')
+
+    # ── Full benchmark run ───────────────────────────────────────────────
+    # Baseline is sampled after the probe, so the delta computed below excludes
+    # whatever was already resident at this point.
+    baseline_rss = proc.memory_info().rss
+    _step(f"scoring  (baseline RSS: {_gb(baseline_rss)})")
+
+    # Ticker thread: prints RSS + elapsed every 30s while benchmark runs
+    _ticker_stop = threading.Event()
+    def _ticker():
+        t_start = time.time()
+        interval = 30
+        # Event.wait returns False on timeout, so the loop body runs once per
+        # interval until the event is set.
+        while not _ticker_stop.wait(interval):
+            elapsed = time.time() - t_start
+            rss = proc.memory_info().rss
+            print(f"      {_c('…', _DIM)} still scoring  "
+                  f"{elapsed/60:.1f} min elapsed  "
+                  f"RSS {_gb(rss)}", flush=True)
+    ticker_thread = threading.Thread(target=_ticker, daemon=True)
+    ticker_thread.start()
+
+    monitor = _PeakMonitor().start()
+    t_score = time.time()
+    score_val = None
+    score_status = 'ok'
+    score_note = ''
+    try:
+        score_val = benchmark(model)
+    except MemoryError as e:
+        score_status = 'oom'
+        score_note = str(e)[:120]
+        _substep(_c(f"MemoryError: {score_note}", _RED))
+    except Exception as e:
+        score_status = 'error'
+        score_note = str(e)[:120]
+        _substep(_c(f"scoring ERROR: {score_note}", _RED))
+    finally:
+        _ticker_stop.set()
+        ticker_thread.join()
+
+    score_elapsed = time.time() - t_score
+    # The monitor is stopped only after scoring has finished or failed; a failed
+    # run still yields a peak and therefore a delta.
+    peak_rss = monitor.stop()
+    actual_delta_gb = (peak_rss - baseline_rss) / (1024 ** 3)
+
+    # ── Comparison ───────────────────────────────────────────────────────
+    colour, verdict, ratio = _compare_label(est.total_estimated_gb, actual_delta_gb)
+    _step("comparison")
+    _substep(
+        f"baseline RSS = {_gb(baseline_rss)}  "
+        f"{_c('← model weights, Python, etc. already in RAM before scoring', _DIM)}"
+    )
+    _substep(
+        f"peak RSS     = {_gb(peak_rss)}  "
+        f"{_c('← highest point reached during scoring', _DIM)}"
+    )
+    _substep(
+        f"Δ (peak−base)= {_c(f'+{actual_delta_gb:.2f} GB', _CYAN)}  "
+        f"{_c('← extra RAM the benchmark itself consumed  ← this is what we compare against', _DIM)}"
+    )
+    _substep(
+        f"estimated    = {_c(f'{est.total_estimated_gb:.2f} GB', _CYAN)}  "
+        f"{_c(f'← {est.activation_gb:.3f} GB activations × 6 overhead', _DIM)}"
+    )
+    _substep(f"verdict      : {_c(verdict, colour)}")
+    if score_val is not None:
+        _substep(f"score        : {float(score_val):.4f}   elapsed {score_elapsed:.0f}s")
+
+    return _make_result(model_id, benchmark_id,
+                        status=score_status,
+                        est_gb=est.total_estimated_gb,
+                        act_gb=est.activation_gb,
+                        feat=est.num_features,
+                        stimuli=est.num_stimuli,
+                        timebins=est.num_timebins,
+                        actual_delta_gb=actual_delta_gb,
+                        baseline_rss_gb=baseline_rss / (1024 ** 3),
+                        peak_rss_gb=peak_rss / (1024 ** 3),
+                        score=float(score_val) if score_val is not None else None,
+                        ratio=ratio,
+                        probe_elapsed=probe_elapsed,
+                        score_elapsed=score_elapsed,
+                        note=score_note)
+
+
+# ---------------------------------------------------------------------------
+# Load helpers
+# ---------------------------------------------------------------------------
+
+def _load_model(mid):
+    """Load the model registered under identifier ``mid`` via ``brainscore_vision.load_model``."""
+    loaded_model = brainscore_vision.load_model(mid)
+    return loaded_model
+
+
+def _load_benchmark(bid):
+    """Load the benchmark registered under identifier ``bid`` via ``brainscore_vision.load_benchmark``."""
+    loaded_benchmark = brainscore_vision.load_benchmark(bid)
+    return loaded_benchmark
+
+
+def _timed_load(fn, arg, t0, interval=15):
+    """
+    Run ``fn(arg)`` in a daemon thread, printing a heartbeat while it is still running.
+
+    :param fn: one-argument callable to execute.
+    :param arg: the argument passed to ``fn``.
+    :param t0: reference timestamp (as from ``time.time()``); the heartbeat reports
+        seconds elapsed since this value.
+    :param interval: seconds between heartbeat lines.
+    :return: whatever ``fn(arg)`` returned.
+    :raises BaseException: anything ``fn`` raised is re-raised in the calling thread.
+    """
+    outcome = {}
+
+    def _worker():
+        try:
+            outcome['value'] = fn(arg)
+        except BaseException as e:
+            # Catch BaseException, not just Exception: a SystemExit raised inside
+            # fn would otherwise end the thread quietly and the caller would get
+            # None back as if the load had succeeded.
+            outcome['error'] = e
+
+    t = threading.Thread(target=_worker, daemon=True)
+    t.start()
+    while t.is_alive():
+        # join() with a timeout doubles as the heartbeat timer.
+        t.join(timeout=interval)
+        if t.is_alive():
+            print(f"         {_c('…', _DIM)} still loading  {time.time()-t0:.0f}s elapsed",
+                  flush=True)
+
+    if 'error' in outcome:
+        raise outcome['error']
+    return outcome.get('value')
+
+
+# ---------------------------------------------------------------------------
+# Summary table  (est GB / actual Δ GB per cell)
+# ---------------------------------------------------------------------------
+
+_MODEL_W = 36
+_CELL_W  = 20
+
+
+def _trunc(s, n):
+    """Return ``s`` unchanged if it has at most ``n`` characters, else ``s[:n - 1]`` followed by '…'."""
+    if len(s) > n:
+        return s[:n - 1] + '…'
+    return s
+
+
+def _cell_text_plain(r):
+    """
+    Plain-text (no ANSI) rendering of a summary-table cell, used to measure its width.
+
+    The returned string must match, character for character, the visible text that
+    ``_cell_colour`` produces for the same result: ``print_summary_table`` pads each
+    coloured cell by ``_CELL_W - len(_cell_text_plain(r))``. No trailing padding is
+    included here, otherwise the cell would be under-padded and columns would drift.
+
+    :param r: result dict; only ``est_gb`` and ``actual_delta_gb`` are read.
+    :return: the cell text without colour codes or padding.
+    """
+    est = r.get('est_gb')
+    act = r.get('actual_delta_gb')
+    if est is None:
+        return "  —  skip/err"
+    if act is None:
+        return f"{est:5.2f} GB est"
+    return f"{est:.1f}/{act:.1f} GB"
+
+
+def _cell_colour(r):
+    """
+    ANSI-coloured rendering of a summary-table cell.
+
+    :param r: result dict; only ``est_gb`` and ``actual_delta_gb`` are read.
+    :return: a dim placeholder when there is no estimate, the estimate alone when
+        there is no measured delta, otherwise ``estimate/actual GB`` with the actual
+        value coloured by the verdict from ``_compare_label``.
+    """
+    est = r.get('est_gb')
+    act = r.get('actual_delta_gb')
+    if est is None:
+        return _c("  —  skip/err", _DIM)
+    if act is None:
+        return f"{_c(f'{est:5.2f} GB', _CYAN)} est"
+    # Use the colour _compare_label already chose instead of re-deriving it from
+    # the ratio: for "no measurable RSS delta" the ratio is None, and a re-derived
+    # colour would come out red although the verdict is green.
+    col, _, _ = _compare_label(est, act)
+    return f"{_c(f'{est:.1f}', _CYAN)}/{_c(f'{act:.1f}', col)} GB"
+
+
+def _hline(c_mid, c_left, c_right, c_sep):
+    """
+    Build one horizontal rule of the summary table.
+
+    :param c_mid: fill character repeated across each column.
+    :param c_left: character at the left edge.
+    :param c_right: character at the right edge.
+    :param c_sep: junction character placed before every benchmark column.
+    :return: the rule, spanning the model column plus one column per ``BENCHMARKS`` entry.
+    """
+    # Each column is its content width plus one space of margin on either side.
+    model_segment = c_mid * (_MODEL_W + 2)
+    benchmark_segment = c_sep + c_mid * (_CELL_W + 2)
+    return c_left + model_segment + benchmark_segment * len(BENCHMARKS) + c_right
+
+
+def print_summary_table(results_grid):
+    """
+    Print the models × benchmarks grid of estimated vs. actual memory as a boxed table.
+
+    :param results_grid: nested mapping ``results_grid[model_id][benchmark_id]`` of
+        result dicts; it is indexed for every entry of ``MODELS`` and ``BENCHMARKS``.
+    """
+    # A bold-wrapped string carries this many invisible characters, so header
+    # fields are padded to the column width plus this amount.
+    ansi_extra = len(_BOLD) + len(_RESET)
+    rule_top = _hline('─', '┌', '┐', '┬')
+    rule_mid = _hline('─', '├', '┤', '┼')
+    rule_bottom = _hline('─', '└', '┘', '┴')
+
+    print(rule_top)
+    header_cells = [f"│ {_c(_trunc('Model', _MODEL_W), _BOLD):<{_MODEL_W + ansi_extra}} "]
+    for benchmark_id in BENCHMARKS:
+        label = _BM_SHORT.get(benchmark_id, benchmark_id)
+        header_cells.append(f"│ {_c(_trunc(label, _CELL_W), _BOLD):<{_CELL_W + ansi_extra}} ")
+    print(''.join(header_cells) + "│")
+    print(rule_mid)
+
+    for model_id in MODELS:
+        row_cells = [f"│ {_trunc(model_id, _MODEL_W):<{_MODEL_W}} "]
+        for benchmark_id in BENCHMARKS:
+            result = results_grid[model_id][benchmark_id]
+            coloured = _cell_colour(result)
+            # Pad by the plain-text length, since the coloured string's own
+            # length includes escape codes.
+            pad = max(0, _CELL_W - len(_cell_text_plain(result)))
+            row_cells.append(f"│ {coloured}{' ' * pad} ")
+        print(''.join(row_cells) + "│")
+
+    print(rule_bottom)
+    print(_c("  est GB / actual Δ GB  "
+             "(cyan=estimate, green=accurate ≥0.8×, yellow=under 0.4–0.8×, red=under <0.4×)", _DIM))
+
+
+# ---------------------------------------------------------------------------
+# Full text report
+# ---------------------------------------------------------------------------
+
+def print_full_report(results_grid):
+    """
+    Print one line per (model, benchmark) pair, grouped by model.
+
+    Pairs with both an estimate and a measured delta show the verdict from
+    ``_compare_label``; pairs with only an estimate show the estimate and the note;
+    all others show their status and note.
+
+    :param results_grid: nested mapping ``results_grid[model_id][benchmark_id]`` of
+        result dicts; it is indexed for every entry of ``MODELS`` and ``BENCHMARKS``.
+    """
+    print(f"\n{_c('Per-pair results:', _BOLD)}\n")
+    for mid_id in MODELS:
+        print(f"  {_c(mid_id, _BOLD)}")
+        for bid in BENCHMARKS:
+            r = results_grid[mid_id][bid]
+            short = _BM_SHORT.get(bid, bid)
+            est = r.get('est_gb')
+            act = r.get('actual_delta_gb')
+            score = r.get('score')
+
+            if est is not None and act is not None:
+                # Use the colour _compare_label returns together with the verdict.
+                # Re-deriving it from the ratio painted "no measurable RSS delta"
+                # (ratio None) red although that verdict is green.
+                col, verdict, _ = _compare_label(est, act)
+                score_str = f"   score={score:.4f}" if score is not None else ""
+                print(f"    {short:<18}  "
+                      f"est {est:.2f} GB  actual Δ {act:.2f} GB  "
+                      f"→ {_c(verdict, col)}{score_str}")
+            elif est is not None:
+                print(f"    {short:<18}  est {est:.2f} GB  no actual  ({r.get('note','')[:50]})")
+            else:
+                print(f"    {short:<18}  {r['status']}  {r.get('note','')[:60]}")
+        print()
+
+
+# ---------------------------------------------------------------------------
+# Overhead recommendation
+# ---------------------------------------------------------------------------
+
+def print_overhead_recommendation(results_grid):
+    """
+    Analyse the observed overhead factors and print a recommended ``_OVERHEAD_FACTOR``.
+
+    For every pair that has a positive activation size (``act_gb``) and a measured
+    delta above 0.01 GB, the observed factor is ``actual_delta_gb / act_gb``. The
+    report shows how many pairs the current factor covers, the observed factor per
+    pair, the factor needed for several coverage levels, and finally the smallest
+    integer factor (never below the current one) that covers at least 90% of pairs.
+
+    :param results_grid: nested mapping ``results_grid[model_id][benchmark_id]`` of
+        result dicts; it is indexed for every entry of ``MODELS`` and ``BENCHMARKS``.
+    """
+    import math
+    from brainscore_vision.benchmark_helpers.memory import _OVERHEAD_FACTOR
+
+    # Collect pairs where we have both raw activation GB and actual delta GB
+    pairs = []
+    for mid_id in MODELS:
+        for bid in BENCHMARKS:
+            r = results_grid[mid_id][bid]
+            act_gb  = r.get('act_gb')          # raw activation array GB
+            delta_gb = r.get('actual_delta_gb') # actual peak-baseline delta
+            if act_gb and act_gb > 0 and delta_gb is not None and delta_gb > 0.01:
+                true_factor = delta_gb / act_gb
+                pairs.append({
+                    'model': r['model_id'],
+                    'benchmark': r['benchmark_id'],
+                    'act_gb': act_gb,
+                    'delta_gb': delta_gb,
+                    'true_factor': true_factor,
+                })
+
+    n_total = len(MODELS) * len(BENCHMARKS)
+    n_scored = len(pairs)
+
+    print(f"\n{'═' * 66}")
+    print(f"  {_c('OVERHEAD FACTOR RECOMMENDATION', _BOLD)}")
+    print(f"{'═' * 66}\n")
+
+    if n_scored == 0:
+        print(f"  {_c('No scored pairs to analyse.', _DIM)}")
+        return
+
+    true_factors = sorted(p['true_factor'] for p in pairs)
+    current_factor = _OVERHEAD_FACTOR
+
+    # For a given overhead factor F, count pairs whose observed factor exceeds F
+    # (i.e. the estimate would have UNDER-predicted, missing a potential OOM).
+    # Comparing factors directly avoids the float round trip act_gb * F < delta_gb,
+    # which can flag a pair as under-predicted by its own factor.
+    def n_underpredicted(factor):
+        return sum(1 for p in pairs if p['true_factor'] > factor)
+
+    # Nearest-rank percentile: the smallest observed factor that covers at least
+    # pct% of the pairs. Integer ceiling division keeps the rank exact.
+    def factor_for_coverage(pct):
+        rank = max(1, (n_scored * pct + 99) // 100)
+        return true_factors[rank - 1]
+
+    current_under = n_underpredicted(current_factor)
+    n_safe = n_scored - current_under
+
+    print(f"  Scored pairs:  {n_scored}/{n_total}  "
+          f"({n_total - n_scored} skipped/errored)\n")
+    print(f"  {_c('Current overhead factor = ×{}'.format(current_factor), _BOLD)}")
+    print(f"    estimate covered (≥ actual Δ) : "
+          f"{_c(str(n_safe), _GREEN)}/{n_scored} pairs")
+    print(f"    estimate under-predicted      : "
+          f"{_c(str(current_under), _RED)}/{n_scored} pairs  "
+          f"{_c('← estimate too low; real usage exceeded prediction', _DIM)}")
+
+    # Show the actual overhead factors observed per pair
+    print(f"\n  {_c('Actual overhead factors observed (activation GB → actual Δ GB):', _DIM)}")
+    for p in sorted(pairs, key=lambda x: x['true_factor'], reverse=True):
+        short_m = p['model'][:28]
+        short_b = _BM_SHORT.get(p['benchmark'], p['benchmark'])
+        tf = p['true_factor']
+        bar = '█' * min(int(tf), 20)
+        col = _GREEN if tf <= current_factor else _RED
+        factor_str = _c(f'{tf:.1f}×', col)
+        print(f"    {short_m:<28}  {short_b:<16}  "
+              f"{p['act_gb']:.2f} GB → {p['delta_gb']:.2f} GB  "
+              f"= {factor_str}  {_c(bar, col)}")
+
+    # Find the factor that covers each percentile threshold
+    print(f"\n  {_c('Factor needed to cover N% of pairs:', _DIM)}")
+    for pct in [50, 75, 90, 95, 100]:
+        needed = factor_for_coverage(pct)
+        # math.ceil is a true round-up; round(needed + 0.5) rounds half to even
+        # and so turned 3.0 into 4 while leaving 2.0 at 2.
+        rounded = max(current_factor, math.ceil(needed))
+        still_under = n_underpredicted(needed)
+        col = _GREEN if needed <= current_factor else _YELLOW if needed <= current_factor * 1.5 else _RED
+        print(f"    {pct:>3}% coverage  →  ×{_c(f'{needed:.1f}', col)}  "
+              f"(≈ ×{rounded} rounded)  "
+              f"→ {still_under}/{n_scored} pairs still under-predicted")
+
+    # Final recommendation: smallest integer factor covering ≥ 90% of pairs
+    factor_90 = factor_for_coverage(90)
+    recommended = max(current_factor, math.ceil(factor_90))
+    under_at_rec = n_underpredicted(recommended)
+
+    print(f"\n  {_c('Recommendation', _BOLD)}")
+    if recommended == current_factor:
+        print(f"    Current factor ×{current_factor} already covers ≥90% of pairs. {_c('No change needed.', _GREEN)}")
+    else:
+        improvement = current_under - under_at_rec
+        print(f"    Increase overhead factor from "
+              f"{_c(f'×{current_factor}', _RED)} → {_c(f'×{recommended}', _GREEN)}")
+        print(f"    This moves from {_c(str(current_under), _RED)} under-predicted pairs "
+              f"to {_c(str(under_at_rec), _GREEN)} "
+              f"({_c(f'−{improvement} pairs', _GREEN)} now safely caught)")
+        print(f"\n    To apply: set  {_c('_OVERHEAD_FACTOR = ' + str(recommended), _CYAN)}  "
+              f"in  brainscore_vision/benchmark_helpers/memory.py")
+    print()
+
+
+# ---------------------------------------------------------------------------
+# CSV helpers  (incremental — one row written immediately after each pair)
+# ---------------------------------------------------------------------------
+
+# Column names for the per-pair results CSV.  The order must stay in sync with
+# the list built by _csv_row(), which emits exactly one value per column.
+_CSV_HEADER = [
+    'model', 'benchmark', 'status',
+    'est_total_gb', 'act_activation_gb', 'actual_delta_gb',
+    'num_features', 'num_stimuli', 'num_timebins',
+    'baseline_rss_gb', 'peak_rss_gb',
+    'ratio', 'score',
+    'probe_elapsed_s', 'score_elapsed_s', 'note',
+]
+
+
+def _csv_row(r):
+    """Turn one result dict into the list of cells for a CSV row.
+
+    ``model_id``, ``benchmark_id`` and ``status`` are required keys.  Numeric
+    fields are rendered with a fixed number of decimals and become an empty
+    string when missing or ``None``; the remaining fields are passed through
+    unchanged (empty string when missing).
+    """
+    def _num(key, spec='.4f'):
+        value = r.get(key)
+        if value is None:
+            return ''
+        return format(value, spec)
+
+    row = [r['model_id'], r['benchmark_id'], r['status']]
+    row.extend(_num(key) for key in ('est_gb', 'act_gb', 'actual_delta_gb'))
+    row.extend(r.get(key, '') for key in ('feat', 'stimuli', 'timebins'))
+    row.extend(_num(key) for key in ('baseline_rss_gb', 'peak_rss_gb', 'ratio', 'score'))
+    row.extend(_num(key, '.2f') for key in ('probe_elapsed', 'score_elapsed'))
+    row.append(r.get('note', ''))
+    return row
+
+
+def init_csv(path):
+    """Create (or truncate) the CSV at *path* and write the header row.
+
+    Returns ``(file_handle, csv_writer)``; the caller owns the handle and is
+    responsible for closing it.
+
+    The file is opened as UTF-8 so that ``note`` cells, which carry arbitrary
+    exception text, cannot fail to encode on platforms whose default encoding
+    is narrower.  If writing the header fails, the handle is closed before the
+    exception propagates so it is not leaked.
+    """
+    f = open(path, 'w', newline='', encoding='utf-8')
+    try:
+        w = csv.writer(f)
+        w.writerow(_CSV_HEADER)
+        f.flush()
+    except Exception:
+        f.close()
+        raise
+    return f, w
+
+
+def append_csv_row(writer, file_handle, r):
+    """Serialise result dict *r* with _csv_row() and append it to the CSV."""
+    cells = _csv_row(r)
+    writer.writerow(cells)
+    # Flush right away so rows already written survive a crash mid-run.
+    file_handle.flush()
+
+
+# ---------------------------------------------------------------------------
+# Calibration mode  (alexnet × all benchmarks → fixed_benchmark_cost per bm)
+# ---------------------------------------------------------------------------
+
+def run_calibration_pair(model, benchmark, benchmark_id, bm_idx, n_bm):
+    """Probe and score one benchmark to measure its fixed memory cost.
+
+    The fixed cost is ``max(0, actual_delta_gb - activation_gb)``, where
+    ``actual_delta_gb`` is the peak RSS observed while scoring minus the RSS
+    sampled just before scoring started.
+
+    Args:
+        model: model handed to ``preallocate_memory`` and to ``benchmark(model)``.
+        benchmark: loaded benchmark; invoked as ``benchmark(model)``.
+        benchmark_id: identifier used in console output and in the result dict.
+        bm_idx: position of this benchmark, shown in the progress header.
+        n_bm: total number of benchmarks, shown in the progress header.
+
+    Returns:
+        dict with keys ``benchmark_id``, ``status`` (``'ok'``, ``'skip'``,
+        ``'error'`` or ``'oom'``), ``activation_gb``, ``actual_delta_gb`` and
+        ``fixed_cost_gb``.  ``score_elapsed`` is only present when scoring was
+        attempted; ``note`` is present then and when the probe raised.
+    """
+    from brainscore_vision.benchmark_helpers.memory import preallocate_memory
+
+    proc = psutil.Process(os.getpid())
+
+    print(f"\n  [{bm_idx}/{n_bm}] {benchmark_id}")
+    print(f"  {'─' * 62}")
+
+    # Probe
+    _step("probe  (1-stimulus forward pass)")
+    try:
+        est = preallocate_memory(model, benchmark, raise_if_oom=False)
+    except TypeError:
+        # A TypeError from the probe is treated as "not a NeuralBenchmark":
+        # the pair is reported as skipped rather than as an error.
+        _substep(_c("skipped — not a NeuralBenchmark (behavioral/non-neural)", _DIM))
+        print(f"  {_c(benchmark_id[:55], _DIM)}: N/A (non-neural)", flush=True)
+        return dict(benchmark_id=benchmark_id, status='skip',
+                    activation_gb=None, actual_delta_gb=None, fixed_cost_gb=None)
+    except Exception as e:
+        _substep(_c(f"probe ERROR: {str(e)[:80]}", _RED))
+        return dict(benchmark_id=benchmark_id, status='error',
+                    activation_gb=None, actual_delta_gb=None, fixed_cost_gb=None,
+                    note=str(e)[:100])
+
+    # No estimate produced: nothing to calibrate against, so skip scoring.
+    if est is None:
+        return dict(benchmark_id=benchmark_id, status='skip',
+                    activation_gb=None, actual_delta_gb=None, fixed_cost_gb=None)
+
+    _substep(
+        f"activation = {est.activation_gb:.3f} GB  "
+        f"({est.num_features:,} feat × {est.num_stimuli:,} stim × {est.num_timebins} tbin)"
+    )
+
+    # Score
+    # Baseline is sampled after the probe so the delta only covers scoring.
+    baseline_rss = proc.memory_info().rss
+    _step(f"scoring  (baseline RSS: {_gb(baseline_rss)})")
+
+    # Heartbeat thread: prints elapsed time and current RSS every 30 s until
+    # the stop event is set, so long scoring runs show signs of life.
+    _ticker_stop = threading.Event()
+    def _ticker():
+        t_start = time.time()
+        while not _ticker_stop.wait(30):
+            elapsed = time.time() - t_start
+            rss = proc.memory_info().rss
+            print(f"      {_c('…', _DIM)} still scoring  "
+                  f"{elapsed/60:.1f} min  RSS {_gb(rss)}", flush=True)
+    ticker_thread = threading.Thread(target=_ticker, daemon=True)
+    ticker_thread.start()
+
+    # NOTE(review): _PeakMonitor is defined outside this block; its stop() result
+    # is used below as a peak RSS in bytes — confirm against its definition.
+    monitor = _PeakMonitor().start()
+    t_score = time.time()
+    score_status = 'ok'
+    score_note = ''
+    try:
+        benchmark(model)
+    except MemoryError as e:
+        score_status = 'oom'
+        score_note = str(e)[:120]
+        _substep(_c(f"MemoryError: {score_note}", _RED))
+    except Exception as e:
+        score_status = 'error'
+        score_note = str(e)[:120]
+        _substep(_c(f"ERROR: {score_note}", _RED))
+    finally:
+        # Always stop the heartbeat, whether scoring succeeded or raised.
+        _ticker_stop.set()
+        ticker_thread.join()
+
+    score_elapsed = time.time() - t_score
+    peak_rss = monitor.stop()
+    actual_delta_gb = (peak_rss - baseline_rss) / (1024 ** 3)
+
+    # fixed_cost_gb stays None unless scoring completed without an exception.
+    fixed_cost_gb = None
+    if score_status == 'ok':
+        # Clamp at zero: a measured delta below the activation size would
+        # otherwise produce a negative "cost".
+        fixed_cost_gb = max(0.0, actual_delta_gb - est.activation_gb)
+        _step("result")
+        _substep(
+            f"actual Δ = {_c(f'{actual_delta_gb:.3f} GB', _CYAN)}  "
+            f"elapsed {score_elapsed:.0f}s"
+        )
+        _substep(
+            f"fixed_benchmark_cost = {actual_delta_gb:.3f} − {est.activation_gb:.3f} "
+            f"= {_c(f'{fixed_cost_gb:.3f} GB', _GREEN)}"
+        )
+        # One-liner summary line
+        print(
+            f"\n  {_c(benchmark_id[:55], _BOLD)}: "
+            f"fixed_cost = {_c(f'{fixed_cost_gb:.2f} GB', _GREEN)}",
+            flush=True,
+        )
+    else:
+        _substep(f"actual Δ = {_gb(peak_rss - baseline_rss)}  (run failed — no fixed_cost)")
+
+    return dict(
+        benchmark_id=benchmark_id,
+        status=score_status,
+        activation_gb=est.activation_gb,
+        actual_delta_gb=actual_delta_gb,
+        fixed_cost_gb=fixed_cost_gb,
+        score_elapsed=score_elapsed,
+        note=score_note,
+    )
+
+
+def print_calibration_table(results):
+    """Print the per-benchmark fixed-cost table, largest cost first.
+
+    Args:
+        results: list of result dicts as returned by run_calibration_pair();
+            each needs ``status`` and, for rows that are shown,
+            ``benchmark_id``, ``fixed_cost_gb``, ``actual_delta_gb`` and
+            ``activation_gb``.
+
+    Only results with a non-None ``fixed_cost_gb`` are listed; results with
+    status 'skip' and 'error'/'oom' are counted in the header line.
+    """
+    neural = [r for r in results if r.get('fixed_cost_gb') is not None]
+    skipped = sum(1 for r in results if r['status'] == 'skip')
+    errors  = sum(1 for r in results if r['status'] in ('error', 'oom'))
+
+    print(f"\n\n{'═' * 72}")
+    print(f"  {_c('BENCHMARK FIXED COSTS  (model-independent overhead)', _BOLD)}")
+    print(f"  Calibrated with alexnet on {len(neural)} neural benchmarks  "
+          f"|  {skipped} non-neural skipped  |  {errors} errors")
+    print(f"{'═' * 72}\n")
+
+    if not neural:
+        print(f"  {_c('No neural benchmarks scored.', _DIM)}")
+        return
+
+    neural_sorted = sorted(neural, key=lambda r: r['fixed_cost_gb'], reverse=True)
+    # Benchmark column is as wide as the longest id, capped at 56 characters.
+    col_w = min(max(len(r['benchmark_id']) for r in neural_sorted), 56)
+
+    print(f"  {'Benchmark':<{col_w}}  {'Fixed cost':>12}  {'Act. Δ':>9}  {'Activation':>10}")
+    print(f"  {'─' * col_w}  {'─' * 12}  {'─' * 9}  {'─' * 10}")
+
+    for r in neural_sorted:
+        bid  = r['benchmark_id'][:col_w]
+        fc   = r['fixed_cost_gb']
+        act  = r['actual_delta_gb']
+        actl = r['activation_gb']
+        col  = _GREEN if fc < 5 else (_YELLOW if fc < 15 else _RED)
+        # Numeric widths are chosen so that number + " GB" matches the header
+        # column widths (12 / 9 / 10); the previous widths left the first and
+        # third columns one character out of line with their headers.
+        print(
+            f"  {bid:<{col_w}}  "
+            f"{_c(f'{fc:>9.2f} GB', col)}  "
+            f"{act:>6.2f} GB  "
+            f"{actl:>6.3f} GB"
+        )
+
+    print()
+    print(f"  {_c('Formula:', _BOLD)} total_needed = activation_gb + fixed_benchmark_cost")
+    print(f"  {_c('Usage:', _DIM)}   preallocate_memory(model, bm, "
+          f"fixed_benchmark_cost_gb=<value>)")
+    print()
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def _run_calibrate(args):
+    """--calibrate mode: load alexnet, run every benchmark, output fixed_benchmark_cost.
+
+    For each entry of ALL_BENCHMARKS (optionally resuming after
+    ``args.resume_from``) the benchmark is loaded and passed to
+    run_calibration_pair().  Every measured fixed cost is merged into the
+    calibration table, which is re-saved after each successful benchmark so an
+    interrupted run keeps its progress.  Rows are also streamed to ``args.csv``
+    when that option is given.
+
+    NOTE(review): the CSV is opened in 'w' mode, so resuming with the same
+    --csv path overwrites the rows written by the earlier run.
+
+    Args:
+        args: parsed argparse namespace; reads ``csv`` and, when present,
+            ``calibration_json`` and ``resume_from``.
+    """
+    from brainscore_vision.benchmark_helpers.memory import save_calibration, load_calibration, _DEFAULT_CALIBRATION_PATH
+    cal_path = getattr(args, 'calibration_json', None) or _DEFAULT_CALIBRATION_PATH
+
+    n_bm = len(ALL_BENCHMARKS)
+    print(f"\n{'═' * 72}")
+    print(f"  {_c('CALIBRATION MODE', _BOLD)}  —  alexnet × {n_bm} benchmarks")
+    print(f"  Goal: measure fixed_benchmark_cost = actual_Δ − activation_gb per benchmark")
+    print(f"  Calibration table will be saved → {_c(cal_path, _CYAN)}")
+    print(f"{'═' * 72}\n")
+
+    # Load alexnet
+    _step("loading alexnet...", indent=2)
+    t0 = time.time()
+    try:
+        model = _timed_load(_load_model, 'alexnet', t0)
+        print(f"       {_c('OK', _GREEN)} ({time.time() - t0:.1f}s)", flush=True)
+    except Exception as e:
+        print(f"       {_c('FAILED', _RED)}: {e}")
+        return
+
+    results = []
+    csv_file, csv_writer = None, None
+    # Everything that happens once the CSV may be open runs inside try/finally,
+    # so the handle is closed even if loading the calibration table or a
+    # benchmark run raises.
+    try:
+        # Open CSV (UTF-8: the note column carries arbitrary exception text)
+        if args.csv:
+            csv_file = open(args.csv, 'w', newline='', encoding='utf-8')
+            csv_writer = csv.writer(csv_file)
+            csv_writer.writerow(['benchmark', 'status', 'activation_gb',
+                                 'actual_delta_gb', 'fixed_cost_gb', 'score_elapsed_s', 'note'])
+            csv_file.flush()
+            print(f"\n  {_c('CSV →', _CYAN)} {args.csv}\n")
+
+        # Load any costs already written in a previous run.  Fall back to an
+        # empty dict so the item assignment below works even if the loader
+        # returns a falsy value (its return contract is not visible here).
+        costs = load_calibration(cal_path) or {}
+        if costs:
+            print(f"  {_c('Resuming:', _CYAN)} loaded {len(costs)} existing costs from {cal_path}")
+
+        # Find the resume offset: start_idx is the number of leading benchmarks
+        # to skip (everything up to and including --resume-from).
+        resume_from = getattr(args, 'resume_from', None)
+        start_idx = 0
+        if resume_from:
+            if resume_from in ALL_BENCHMARKS:
+                start_idx = ALL_BENCHMARKS.index(resume_from) + 1
+                print(f"  {_c('Skipping', _DIM)} benchmarks 1–{start_idx} "
+                      f"(up to and including {resume_from})")
+            else:
+                print(f"  {_c('WARNING', _YELLOW)}: --resume-from '{resume_from}' "
+                      f"not found in ALL_BENCHMARKS — starting from the beginning")
+        print()
+
+        for i, bid in enumerate(ALL_BENCHMARKS, 1):
+            if i <= start_idx:
+                continue  # skip already-completed benchmarks
+            # Load benchmark with ticker
+            t0 = time.time()
+            try:
+                bm = _timed_load(_load_benchmark, bid, t0)
+            except Exception as e:
+                print(f"\n  [{i}/{n_bm}] {bid}")
+                _substep(_c(f"load FAILED: {str(e)[:80]}", _RED))
+                r = dict(benchmark_id=bid, status='error', activation_gb=None,
+                         actual_delta_gb=None, fixed_cost_gb=None, note=str(e)[:100])
+                results.append(r)
+                if csv_writer:
+                    csv_writer.writerow([bid, 'error', '', '', '', '', r.get('note', '')])
+                    csv_file.flush()
+                continue
+
+            r = run_calibration_pair(model, bm, bid, i, n_bm)
+            results.append(r)
+
+            if csv_writer:
+                # Compare against None (not truthiness) so a legitimate 0.0
+                # elapsed time is still written.
+                elapsed = r.get('score_elapsed')
+                csv_writer.writerow([
+                    bid, r['status'],
+                    f"{r['activation_gb']:.4f}"  if r['activation_gb']  is not None else '',
+                    f"{r['actual_delta_gb']:.4f}" if r['actual_delta_gb'] is not None else '',
+                    f"{r['fixed_cost_gb']:.4f}"  if r['fixed_cost_gb']   is not None else '',
+                    f"{elapsed:.1f}" if elapsed is not None else '',
+                    r.get('note', ''),
+                ])
+                csv_file.flush()
+
+            # Incrementally save JSON after every benchmark that yielded a cost
+            if r.get('fixed_cost_gb') is not None:
+                costs[bid] = r['fixed_cost_gb']
+                save_calibration(costs, cal_path)
+                print(f"  {_c('↳ JSON updated', _DIM)} ({len(costs)} benchmarks so far)",
+                      flush=True)
+
+    finally:
+        if csv_file:
+            csv_file.close()
+            print(f"\n{_c('CSV finalised →', _CYAN)} {args.csv}")
+
+    save_calibration(costs, cal_path)
+    print(f"\n{_c('Calibration saved →', _CYAN)} {cal_path}  ({len(costs)} benchmarks)\n")
+
+    print_calibration_table(results)
+
+
+def main():
+    """Command-line entry point for the memory profile suite.
+
+    With ``--calibrate`` the work is delegated to _run_calibrate() and the
+    function returns.  Otherwise every benchmark in BENCHMARKS is loaded once,
+    then each model in MODELS is loaded in turn and run against every benchmark
+    via run_pair() (probe only when ``--skip-score`` is given).  Results are
+    streamed to ``--csv`` when requested, and the summary table, full report and
+    overhead recommendation are printed at the end.
+    """
+    parser = argparse.ArgumentParser(
+        description="Memory profile suite — 5×5 estimate vs actual, or benchmark calibration.")
+    parser.add_argument('--csv', metavar='PATH', default=None,
+                        help='write results to CSV')
+    parser.add_argument('--skip-score', action='store_true',
+                        help='probe only — do not run actual scoring')
+    parser.add_argument('--calibrate', action='store_true',
+                        help='run alexnet on ALL benchmarks and output fixed_benchmark_cost per benchmark')
+    parser.add_argument('--calibration-json', metavar='PATH', default=None,
+                        help='path to save/load calibration JSON '
+                             '(default: ~/.brainscore/benchmark_costs.json)')
+    parser.add_argument('--resume-from', metavar='BENCHMARK_ID', default=None,
+                        help='skip all benchmarks up to and including this one, '
+                             'then continue from the next')
+    args = parser.parse_args()
+
+    # Calibration is a separate mode with its own CSV layout and output.
+    if args.calibrate:
+        _run_calibrate(args)
+        return
+
+    n_bm = len(BENCHMARKS)
+    n_m  = len(MODELS)
+    mode = "probe only" if args.skip_score else "probe + score"
+    total_pairs = n_m * n_bm
+
+    print(f"\n{'═' * 66}")
+    print(f"  {_c('MEM PROFILE SUITE', _BOLD)}  —  "
+          f"{n_m} models × {n_bm} benchmarks = {total_pairs} pairs  [{mode}]")
+    print(f"{'═' * 66}")
+
+    # ── Load all benchmarks first ────────────────────────────────────────
+    print(f"\n{_c(f'Loading {n_bm} benchmarks  (may download from S3)', _CYAN)}\n")
+    benchmarks = {}
+    for i, bid in enumerate(BENCHMARKS, 1):
+        print(f"  [{i}/{n_bm}] {bid}")
+        t0 = time.time()
+        try:
+            benchmarks[bid] = _timed_load(_load_benchmark, bid, t0)
+            print(f"         {_c('OK', _GREEN)} ({time.time() - t0:.1f}s)")
+        except Exception as e:
+            # A failed load is remembered as None so each model later records
+            # an 'error' result for this benchmark instead of aborting the run.
+            benchmarks[bid] = None
+            print(f"         {_c('FAILED', _RED)}: {str(e)[:80]}")
+        print()
+
+    # ── Open CSV for incremental writing ────────────────────────────────
+    csv_file, csv_writer = None, None
+    if args.csv:
+        csv_file, csv_writer = init_csv(args.csv)
+        print(f"  {_c('CSV opened →', _CYAN)} {args.csv}  (rows written after each pair)\n")
+
+    # ── For each model, run all benchmarks ───────────────────────────────
+    # results_grid[model_id][benchmark_id] -> result dict for that pair.
+    results_grid = {mid_id: {} for mid_id in MODELS}
+    pair_num = 0
+
+    try:
+        for m_idx, mid_id in enumerate(MODELS, 1):
+            print(f"\n{'═' * 66}")
+            print(f"  {_c(f'Model {m_idx}/{n_m}: {mid_id}', _CYAN)}")
+            print(f"{'═' * 66}\n")
+
+            _step("loading model...", indent=2)
+            t0 = time.time()
+            try:
+                model = _timed_load(_load_model, mid_id, t0)
+                print(f"       {_c('OK', _GREEN)} ({time.time() - t0:.1f}s)", flush=True)
+            except Exception as e:
+                print(f"       {_c('FAILED', _RED)}: {str(e)[:80]}")
+                # The model could not be loaded: record an error result for
+                # every benchmark so the grid and CSV stay complete and the
+                # pair counter stays in step, then move on to the next model.
+                for bid in BENCHMARKS:
+                    pair_num += 1
+                    r = _make_result(
+                        mid_id, bid, status='error', est_gb=None, act_gb=None,
+                        actual_delta_gb=None, score=None,
+                        probe_elapsed=0.0, score_elapsed=None,
+                        note=f"model load failed: {str(e)[:60]}")
+                    results_grid[mid_id][bid] = r
+                    if csv_writer:
+                        append_csv_row(csv_writer, csv_file, r)
+                continue
+
+            for bid in BENCHMARKS:
+                pair_num += 1
+                short = _BM_SHORT.get(bid, bid)
+                print(f"\n  {_c(f'pair {pair_num}/{total_pairs}', _DIM)}  "
+                      f"{_c(_trunc(mid_id, 28), _BOLD)} × {_c(short, _BOLD)}")
+                print(f"  {'─' * 54}")
+
+                bm = benchmarks.get(bid)
+                if bm is None:
+                    _step(_c("benchmark failed to load — skipping", _RED), indent=4)
+                    r = _make_result(
+                        mid_id, bid, status='error', est_gb=None, act_gb=None,
+                        actual_delta_gb=None, score=None,
+                        probe_elapsed=0.0, score_elapsed=None,
+                        note='benchmark failed to load')
+                    results_grid[mid_id][bid] = r
+                    if csv_writer:
+                        append_csv_row(csv_writer, csv_file, r)
+                    continue
+
+                r = run_pair(model, mid_id, bm, bid, skip_score=args.skip_score)
+                results_grid[mid_id][bid] = r
+
+                # Write CSV row immediately
+                if csv_writer:
+                    append_csv_row(csv_writer, csv_file, r)
+                    print(f"  {_c('↳ CSV row written', _DIM)}", flush=True)
+
+                # One-line pair summary
+                est = r.get('est_gb')
+                act = r.get('actual_delta_gb')
+                if est is not None and act is not None:
+                    _, verdict, ratio = _compare_label(est, act)
+                    # A falsy ratio (None or 0) falls through to the red colour.
+                    col = (_GREEN if (ratio and ratio >= 0.8)
+                           else _YELLOW if (ratio and ratio >= 0.4) else _RED)
+                    print(f"\n  {_c('RESULT', _BOLD)}: "
+                          f"est {est:.2f} GB  actual Δ {act:.2f} GB  → {_c(verdict, col)}")
+                elif est is not None:
+                    print(f"\n  {_c('RESULT', _BOLD)}: est {est:.2f} GB  (no actual)")
+
+    finally:
+        # Close the CSV even when a pair raises, so rows written so far are kept.
+        if csv_file:
+            csv_file.close()
+            print(f"\n{_c('CSV finalised →', _CYAN)} {args.csv}")
+
+    # ── Final summary ────────────────────────────────────────────────────
+    print(f"\n\n{'═' * 66}")
+    print(f"  {_c('FINAL SUMMARY', _BOLD)}")
+    print(f"{'═' * 66}\n")
+    print_summary_table(results_grid)
+    print_full_report(results_grid)
+    print_overhead_recommendation(results_grid)
+
+
+# Run the CLI only when executed as a script, not when imported.
+if __name__ == '__main__':
+    main()
diff --git a/scripts/memory_flight_report.py b/scripts/memory_flight_report.py
new file mode 100644
index 0000000000..2b4302924b
--- /dev/null
+++ b/scripts/memory_flight_report.py
@@ -0,0 +1,280 @@
+"""
+Memory Flight Report
+====================
+Runs a pre-flight memory estimate for one (model, benchmark) pair, then
+optionally executes the full benchmark while tracking peak RSS, so you can
+see how close the estimate was to reality.
+
+NOTE: prefer preflight_check.py for day-to-day use — it is simpler and uses
+the calibrated fixed_benchmark_cost table automatically.  This script is
+useful for one-off investigations or when you want the box-formatted output.
+
+Usage
+-----
+    python scripts/memory_flight_report.py <model_id> <benchmark_id>
+    python scripts/memory_flight_report.py <model_id> <benchmark_id> --skip-score
+
+    --skip-score   only run the pre-flight estimate, skip the actual benchmark
+
+Example
+-------
+    python scripts/memory_flight_report.py resnet50_tutorial MajajHong2015.IT-pls
+
+Output
+------
+    ┌─ PRE-FLIGHT ESTIMATE ──────────────────────────────────────────┐
+    │  Stimuli:    2560   Features: 200,704   Timebins: 1            │
+    │  Activation: 1.91 GB  (×6 overhead → 11.47 GB estimated)       │
+    │  Available RAM: 13.6 GB   →  OK                                │
+    └────────────────────────────────────────────────────────────────┘
+    [scoring runs...]
+    ┌─ ACTUAL USAGE ─────────────────────────────────────────────────┐
+    │  Baseline RSS:   1.2 GB                                        │
+    │  Peak RSS:       4.7 GB   (Δ +3.5 GB)                         │
+    │  Final RSS:      2.1 GB   (Δ +0.9 GB)                         │
+    │  Estimated:     11.5 GB   →  estimate was ACCURATE (3.3×)      │
+    └────────────────────────────────────────────────────────────────┘
+"""
+
+import os
+import sys
+import threading
+import time
+import argparse
+import logging
+
+import psutil
+
+# ---------------------------------------------------------------------------
+# Resolve local repos so the script works without installation
+# ---------------------------------------------------------------------------
+# Directory that contains this script.
+_script_dir = os.path.dirname(os.path.abspath(__file__))
+# Its parent directory, put on sys.path so `brainscore_vision` can be imported
+# from the working tree.
+_vision_root = os.path.dirname(_script_dir)
+# A sibling directory named 'core' next to the vision checkout — presumably a
+# local brainscore_core checkout; TODO confirm the expected layout.
+_core_root = os.path.join(os.path.dirname(_vision_root), 'core')
+# Prepend (rather than append) so local checkouts shadow installed packages.
+for _p in [_vision_root, _core_root]:
+    if _p not in sys.path:
+        sys.path.insert(0, _p)
+
+from brainscore_vision import load_model, load_benchmark
+from brainscore_vision.benchmark_helpers.memory import preallocate_memory
+from brainscore_core.benchmarks import score_benchmark
+
+# Keep library logging quiet: only warnings and above reach the console.
+logging.basicConfig(level=logging.WARNING)
+
+# ANSI escape sequences used to style terminal output; _RESET ends any styling.
+_RESET  = '\033[0m'
+_BOLD   = '\033[1m'
+_GREEN  = '\033[32m'
+_YELLOW = '\033[33m'
+_RED    = '\033[31m'
+_CYAN   = '\033[36m'
+
+
+# ---------------------------------------------------------------------------
+# Peak RSS monitor (background thread)
+# ---------------------------------------------------------------------------
+
+class _PeakMonitor:
+    """Background sampler that remembers the highest RSS seen for this process.
+
+    The RSS is read once at construction and then every ``interval`` seconds on
+    a daemon thread between start() and stop().  Spikes shorter than the
+    polling interval can be missed.
+    """
+
+    def __init__(self, interval: float = 0.5):
+        self._interval = interval
+        self._stop = threading.Event()
+        self._proc = psutil.Process(os.getpid())
+        # Seed the peak with the RSS at construction time.
+        self._peak = self._proc.memory_info().rss
+        self._thread = threading.Thread(target=self._run, daemon=True)
+
+    def start(self):
+        """Launch the sampling thread and return ``self`` for call chaining."""
+        self._thread.start()
+        return self
+
+    def stop(self) -> int:
+        """Signal the sampler to finish, wait for it, and return the peak RSS in bytes."""
+        self._stop.set()
+        self._thread.join()
+        return self._peak
+
+    def _run(self):
+        """Sampling loop executed on the background thread."""
+        stopped = self._stop.is_set()
+        while not stopped:
+            try:
+                self._peak = max(self._peak, self._proc.memory_info().rss)
+            except psutil.NoSuchProcess:
+                return
+            # Event.wait() doubles as an interruptible sleep; it returns True
+            # once stop() has set the event.
+            stopped = self._stop.wait(self._interval)
+
+
+# ---------------------------------------------------------------------------
+# Formatting helpers
+# ---------------------------------------------------------------------------
+
+def _gb(n_bytes: int) -> str:
+    """Render a byte count in units of 1024**3 bytes with two decimals, e.g. ``"1.50 GB"``."""
+    gigabytes = n_bytes / (1 << 30)
+    return f"{gigabytes:.2f} GB"
+
+
+def _ratio_label(estimate_gb: float, actual_delta_gb: float) -> str:
+    """Return a coloured verdict comparing the estimate with the measured delta.
+
+    ``ratio = estimate_gb / actual_delta_gb``:
+      * ratio >= 0.8 -> green,  "ACCURATE (ratio×)"
+      * ratio >= 0.4 -> yellow, "UNDER by (1/ratio)×"
+      * otherwise    -> red,    "UNDER by (1/ratio)×"
+
+    A non-positive measured delta, or an estimate that is missing (NaN), zero
+    or negative, yields an "unavailable" message instead of a ratio; this also
+    avoids "UNDER by nan×" output and a ZeroDivisionError for a zero estimate.
+    """
+    if actual_delta_gb <= 0:
+        return f"{_GREEN}estimate unavailable (no measurable delta){_RESET}"
+    # `not x > 0` is deliberately used instead of `x <= 0`: it is also True for
+    # NaN, which main() passes when the pre-flight probe produced no estimate.
+    if not estimate_gb > 0:
+        return f"{_YELLOW}comparison unavailable (no pre-flight estimate){_RESET}"
+    ratio = estimate_gb / actual_delta_gb
+    if ratio >= 0.8:
+        colour = _GREEN
+        verdict = f"estimate was ACCURATE ({ratio:.1f}×)"
+    else:
+        colour = _YELLOW if ratio >= 0.4 else _RED
+        verdict = f"estimate was UNDER by {actual_delta_gb / estimate_gb:.1f}×"
+    return f"{colour}{verdict}{_RESET}"
+
+
+def _box(title: str, lines: list[str], width: int = 66) -> str:
+    """Draw *lines* inside a Unicode box with *title* embedded in the top border.
+
+    Args:
+        title: plain text shown (in bold) in the top border.
+        lines: body lines; they may contain ANSI colour sequences.
+        width: number of characters between the two vertical borders.
+
+    Returns:
+        The box as a single newline-joined string.
+
+    Padding is computed from the visible length of each line (ANSI escape
+    sequences stripped), so coloured lines keep the right border aligned.
+    Lines longer than the box are not truncated.
+    """
+    import re  # local import: `re` is not among this script's top-level imports
+
+    ansi_sgr = re.compile(r'\x1b\[[0-9;]*m')
+    inner = width - 2
+
+    top    = f"┌─ {_BOLD}{title}{_RESET} " + "─" * (width - len(title) - 3) + "┐"
+    bottom = "└" + "─" * (width) + "┘"
+    rows = []
+    for line in lines:
+        visible = len(ansi_sgr.sub('', line))
+        rows.append(f"│  {line}{' ' * max(0, inner - visible)}│")
+    body = "\n".join(rows)
+    return f"{top}\n{body}\n{bottom}"
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main():
+    """Command-line entry point: estimate, optionally score, and report memory use.
+
+    Steps:
+      1. Load the model and benchmark named on the command line.
+      2. Run the pre-flight probe via preallocate_memory() and print the
+         estimate (skipped with a warning if the probe raises TypeError).
+      3. Unless ``--skip-score`` is given, call ``benchmark(model)`` while a
+         _PeakMonitor tracks peak RSS.
+      4. Print baseline / peak / final RSS next to the estimate.
+
+    Exits with status 1 on AssertionError (after printing stale-cache
+    diagnostics) or MemoryError during scoring.
+    """
+    parser = argparse.ArgumentParser(description="Memory flight report for a brainscore scoring run.")
+    parser.add_argument("model_identifier")
+    parser.add_argument("benchmark_identifier")
+    parser.add_argument("--skip-score", action="store_true",
+                        help="Only run the pre-flight estimate, skip the actual benchmark.")
+    args = parser.parse_args()
+
+    proc = psutil.Process(os.getpid())
+
+    # ------------------------------------------------------------------ #
+    #  1. Load model + benchmark                                          #
+    # ------------------------------------------------------------------ #
+    print(f"\n{_CYAN}Loading model '{args.model_identifier}'...{_RESET}")
+    model = load_model(args.model_identifier)
+
+    print(f"{_CYAN}Loading benchmark '{args.benchmark_identifier}'...{_RESET}\n")
+    benchmark = load_benchmark(args.benchmark_identifier)
+
+    # ------------------------------------------------------------------ #
+    #  2. Pre-flight estimate                                             #
+    # ------------------------------------------------------------------ #
+    print(f"{_BOLD}Running pre-flight probe (1 stimulus)...{_RESET}")
+    try:
+        estimate = preallocate_memory(model, benchmark, raise_if_oom=False)
+    except TypeError as e:
+        print(f"{_YELLOW}Pre-flight skipped (benchmark type not supported): {e}{_RESET}\n")
+        estimate = None
+
+    if estimate is not None:
+        ok_or_oom = f"{_RED}OOM LIKELY{_RESET}" if estimate.will_oom else f"{_GREEN}OK{_RESET}"
+        preflight_lines = [
+            f"Stimuli: {estimate.num_stimuli:>6}   Features: {estimate.num_features:>7,}   Timebins: {estimate.num_timebins}",
+            f"Activation: {estimate.activation_gb:.2f} GB  "
+            f"(×6 overhead → {estimate.total_estimated_gb:.2f} GB estimated)",
+            f"Available RAM: {estimate.available_gb:.1f} GB   →  {ok_or_oom}",
+        ]
+        print(_box("PRE-FLIGHT ESTIMATE", preflight_lines))
+        print()
+
+        if estimate.will_oom and not args.skip_score:
+            print(f"{_RED}OOM predicted — proceeding anyway to measure actual usage.{_RESET}\n")
+
+    if args.skip_score:
+        return
+
+    # ------------------------------------------------------------------ #
+    #  3. Score (with RSS monitoring)                                     #
+    # ------------------------------------------------------------------ #
+    baseline_rss = proc.memory_info().rss
+    print(f"{_CYAN}Baseline RSS: {_gb(baseline_rss)}{_RESET}")
+    print(f"{_BOLD}Scoring...{_RESET}  (this may take a while)\n")
+
+    monitor = _PeakMonitor(interval=0.5).start()
+    t0 = time.time()
+
+    try:
+        score = benchmark(model)
+        elapsed = time.time() - t0
+        peak_rss = monitor.stop()
+        final_rss = proc.memory_info().rss
+    except AssertionError as e:
+        monitor.stop()
+        elapsed = time.time() - t0
+        print(f"\n{_RED}AssertionError in attach_stimulus_set_meta after {elapsed:.1f}s{_RESET}")
+        print("This usually means the activations cache has stale paths.")
+        print()
+        # Print diagnostic: which paths are mismatching.  This is best-effort
+        # and pokes at private attributes, so any failure is reported and ignored.
+        try:
+            from brainscore_vision.model_helpers.activations.core import lstrip_local
+            stimulus_set = benchmark._assembly.stimulus_set
+            from brainscore_vision.benchmark_helpers.screen import place_on_screen
+            ss = place_on_screen(stimulus_set, target_visual_degrees=model.visual_degrees(),
+                                 source_visual_degrees=benchmark._visual_degrees)
+            expected_paths = [lstrip_local(str(ss.get_stimulus(sid))) for sid in ss['stimulus_id'].values[:3]]
+            print(f"{_CYAN}Expected paths (first 3):{_RESET}")
+            for p in expected_paths:
+                print(f"  {p}")
+            # Show what fresh _from_paths returns for comparison
+            _am = model.activations_model
+            lm = model.layer_model._layer_model
+            layer = list(dict.items(lm.region_layer_map))[0][1]
+            layer = layer if isinstance(layer, str) else layer[0]
+            dummy = _am._extractor._from_paths([str(ss.get_stimulus(ss['stimulus_id'].values[0]))], layers=[layer])
+            got_paths = [lstrip_local(p) for p in dummy['stimulus_path'].values[:3]]
+            print(f"{_CYAN}Fresh _from_paths result paths (first 3):{_RESET}")
+            for p in got_paths:
+                print(f"  {p}")
+        except Exception as diag_err:
+            print(f"(diagnostic failed: {diag_err})")
+        print()
+        print(f"Fix: delete the stale cache entry and re-run:")
+        try:
+            cache_dir = os.path.expanduser(
+                "~/.result_caching/brainscore_vision.model_helpers.activations.core"
+                ".ActivationsExtractorHelper._from_paths_stored"
+            )
+            # `ss` only exists if the diagnostic above got far enough; when it
+            # did not, the resulting NameError selects the generic hint below.
+            cache_file = (
+                f"identifier={model.identifier},"
+                f"stimuli_identifier={ss.identifier},"
+                f"number_of_trials=1,require_variance=False.pkl"
+            )
+            print(f"  rm '{os.path.join(cache_dir, cache_file)}'")
+        except Exception:
+            print(
+                "  rm ~/.result_caching/brainscore_vision.model_helpers.activations.core"
+                ".ActivationsExtractorHelper._from_paths_stored/<model>,<benchmark>*.pkl"
+            )
+        sys.exit(1)
+
+    except MemoryError as e:
+        # Use the peak recorded by the monitor: by the time the exception is
+        # handled the failed allocation has been released, so the current RSS
+        # can be far below what the run actually reached.
+        peak_rss = monitor.stop()
+        elapsed = time.time() - t0
+        print(f"\n{_RED}MemoryError after {elapsed:.1f}s:{_RESET} {e}")
+        print(f"Peak RSS before crash: {_gb(peak_rss)}  (Δ +{_gb(peak_rss - baseline_rss)})\n")
+        sys.exit(1)
+
+    # ------------------------------------------------------------------ #
+    #  4. Report                                                          #
+    # ------------------------------------------------------------------ #
+    delta_peak  = peak_rss  - baseline_rss
+    delta_final = final_rss - baseline_rss
+
+    if estimate is not None:
+        est_gb = estimate.total_estimated_gb
+        estimated_line = (
+            f"Estimated:     {est_gb:.2f} GB   →  {_ratio_label(est_gb, delta_peak / (1024**3))}"
+        )
+    else:
+        # The probe was skipped, so there is nothing to compare against; say so
+        # instead of printing "nan GB" and a meaningless ratio.
+        estimated_line = "Estimated:     n/a   (pre-flight estimate was skipped)"
+
+    actual_lines = [
+        f"Baseline RSS:  {_gb(baseline_rss)}",
+        f"Peak RSS:      {_gb(peak_rss)}   (Δ +{_gb(delta_peak)})",
+        f"Final RSS:     {_gb(final_rss)}   (Δ +{_gb(delta_final)})",
+        estimated_line,
+        f"Elapsed:       {elapsed:.1f}s",
+        f"Score:         {float(score):.4f}",
+    ]
+    print(_box("ACTUAL USAGE", actual_lines))
+    print()
+
+
+# Run the CLI only when executed as a script, not when imported.
+if __name__ == '__main__':
+    main()
diff --git a/scripts/preflight_check.py b/scripts/preflight_check.py
new file mode 100644
index 0000000000..12b89bf732
--- /dev/null
+++ b/scripts/preflight_check.py
@@ -0,0 +1,304 @@
+"""
+Pre-flight Memory Check
+=======================
+The main entry point for checking whether a model will OOM on a benchmark
+before committing to a full (potentially multi-hour) scoring run.
+
+HOW IT WORKS
+------------
+1. Loads the calibration table from ~/.brainscore/benchmark_costs.json
+   (produced by:  python scripts/mem_profile_suite.py --calibrate)
+
+2. Runs a 1-stimulus forward pass through the model (the "probe") to measure
+   the model's actual feature count for this benchmark's region/layer.
+
+3. Estimates total RAM needed:
+     total = activation_gb + fixed_benchmark_cost_gb   (if benchmark is calibrated)
+     total = activation_gb × 6                          (fallback if not calibrated)
+
+   where:
+     activation_gb          = stimuli × features × timebins × 4 bytes
+     fixed_benchmark_cost   = benchmark's model-independent overhead
+                              (regression matrices, xarray, CV buffers)
+                              — constant regardless of which model you run
+
+4. Compares the estimate against available RAM and reports OK or OOM LIKELY.
+
+Optionally (--score) runs the full benchmark and compares the estimate to
+the actual peak RSS delta, so you can validate the calibration on this machine.
+
+IMPORTANT: Calibrate on the same machine you score on.  The fixed_benchmark_cost
+is environment-specific (Linux EC2 numbers will differ from macOS).
+
+Usage
+-----
+    python scripts/preflight_check.py <model_id> <benchmark_id> [--score]
+
+Examples
+--------
+    # Fast probe — just check if it will OOM (recommended before any scoring run)
+    python scripts/preflight_check.py resnet50_tutorial MajajHong2015.IT-pls
+
+    # Full roundtrip — probe then score and compare estimate to actual peak RSS
+    python scripts/preflight_check.py resnet50_tutorial MajajHong2015.IT-pls --score
+"""
+
+import os
+import sys
+import time
+import argparse
+import threading
+
+# ---------------------------------------------------------------------------
+# Resolve local repos
+# ---------------------------------------------------------------------------
+_script_dir  = os.path.dirname(os.path.abspath(__file__))  # directory containing this script (scripts/)
+_vision_root = os.path.dirname(_script_dir)  # parent of scripts/
+_core_root   = os.path.join(os.path.dirname(_vision_root), 'core')  # a directory named 'core' next to the vision root
+for _p in [_vision_root, _core_root]:
+    if _p not in sys.path:
+        sys.path.insert(0, _p)  # prepend so these locations are searched first on import
+
+import logging
+logging.basicConfig(level=logging.WARNING)
+
+import psutil
+
+_RESET  = '\033[0m'  # ANSI SGR 0: reset all attributes
+_BOLD   = '\033[1m'  # ANSI SGR 1: bold
+_GREEN  = '\033[32m'  # ANSI SGR 32: green foreground
+_YELLOW = '\033[33m'  # ANSI SGR 33: yellow foreground
+_RED    = '\033[31m'  # ANSI SGR 31: red foreground
+_CYAN   = '\033[36m'  # ANSI SGR 36: cyan foreground
+_DIM    = '\033[2m'  # ANSI SGR 2: faint / dim
+
+
+def _c(text, colour):
+    return "{}{}{}".format(colour, text, _RESET)  # colour prefix + text + reset suffix
+
+
+def _gb(n_bytes):
+    return "{:.2f} GB".format(n_bytes / (1 << 30))  # 1 << 30 == 1024 ** 3 bytes
+
+
+def _divider(char='─', width=66):
+    print(width * char)  # horizontal rule: `char` repeated `width` times
+
+
+# ---------------------------------------------------------------------------
+# Peak RSS monitor
+# ---------------------------------------------------------------------------
+class _PeakMonitor:
+    """Background sampler that tracks this process's peak RSS, polling every *interval* seconds."""
+    def __init__(self, interval=0.5):
+        self._interval = interval  # previously accepted but ignored: _run() hard-coded a 0.5 s wait
+        self._proc = psutil.Process(os.getpid())
+        self._peak = self._proc.memory_info().rss  # seed the peak with the RSS at construction time
+        self._stop = threading.Event()
+        self._thread = threading.Thread(target=self._run, daemon=True)
+
+    def start(self):  # returns self so callers can write `_PeakMonitor().start()`
+        self._thread.start()
+        return self
+
+    def stop(self):  # stops the sampler thread and returns the highest RSS sampled, in bytes
+        self._stop.set()
+        self._thread.join()
+        return self._peak
+
+    def _run(self):
+        while not self._stop.is_set():
+            try:
+                self._peak = max(self._peak, self._proc.memory_info().rss)
+            except psutil.NoSuchProcess:
+                break
+            self._stop.wait(self._interval)  # Event.wait acts as a sleep that stop() can interrupt
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main():  # CLI entry point: probe one model/benchmark pair; with --score also run it and compare to peak RSS
+    parser = argparse.ArgumentParser(
+        description='Pre-flight memory check integration test.')
+    parser.add_argument('model_id')
+    parser.add_argument('benchmark_id')
+    parser.add_argument('--score', action='store_true',
+                        help='also run the full benchmark and compare estimate to actual RSS')
+    args = parser.parse_args()
+
+    from brainscore_vision import load_model, load_benchmark  # imported inside main(), after argument parsing
+    from brainscore_vision.benchmark_helpers.memory import (
+        preallocate_memory, load_calibration, _DEFAULT_CALIBRATION_PATH,
+    )
+
+    print()
+    _divider('═')
+    print(f"  {_c('PRE-FLIGHT CHECK', _BOLD)}")
+    print(f"  model     : {_c(args.model_id, _CYAN)}")
+    print(f"  benchmark : {_c(args.benchmark_id, _CYAN)}")
+    _divider('═')
+
+    # ── Calibration table status ─────────────────────────────────────────
+    print()
+    cal = load_calibration()
+    if cal:
+        fixed = cal.get(args.benchmark_id)  # None when this benchmark has no entry in the table
+        if fixed is not None:
+            print(f"  {_c('Calibration table', _BOLD)}: {_DEFAULT_CALIBRATION_PATH}")
+            print(f"  {_c('✓', _GREEN)} '{args.benchmark_id}' found  →  "
+                  f"fixed_benchmark_cost = {_c(f'{fixed:.4f} GB', _GREEN)}")
+            print(f"  Formula: total = activation_gb + {fixed:.4f} GB")
+        else:
+            print(f"  {_c('Calibration table', _BOLD)}: {_DEFAULT_CALIBRATION_PATH}  "
+                  f"({len(cal)} entries)")
+            print(f"  {_c('⚠', _YELLOW)} '{args.benchmark_id}' not in table  →  "
+                  f"will fall back to ×6 overhead")
+    else:
+        print(f"  {_c('⚠', _YELLOW)} No calibration table found at {_DEFAULT_CALIBRATION_PATH}")
+        print(f"  Will fall back to ×6 overhead multiplier.")
+
+    # ── Load model + benchmark ───────────────────────────────────────────
+    print()
+    _divider()
+
+    print(f"\n  Loading model '{args.model_id}'...", end='', flush=True)
+    t0 = time.time()
+    model = load_model(args.model_id)
+    print(f"  {_c('OK', _GREEN)} ({time.time()-t0:.1f}s)")
+
+    print(f"  Loading benchmark '{args.benchmark_id}'...", end='', flush=True)
+    t0 = time.time()
+    benchmark = load_benchmark(args.benchmark_id)
+    print(f"  {_c('OK', _GREEN)} ({time.time()-t0:.1f}s)")
+
+    # ── Pre-flight probe ─────────────────────────────────────────────────
+    print()
+    _divider()
+    print(f"\n  {_c('PRE-FLIGHT PROBE', _BOLD)}  (1-stimulus forward pass)\n")
+
+    t0 = time.time()
+    try:
+        est = preallocate_memory(model, benchmark, raise_if_oom=False)
+    except TypeError as e:  # NOTE(review): presumably raised for benchmarks the estimator cannot handle — confirm
+        print(f"  {_c('SKIPPED', _YELLOW)}: {e}")
+        return
+
+    probe_elapsed = time.time() - t0
+
+    if est is None:
+        print(f"  {_c('SKIPPED', _YELLOW)} (BRAINSCORE_SKIP_MEMORY_CHECK set)")
+        return
+
+    print(f"  {'Stimuli':<22}: {est.num_stimuli:,}")
+    print(f"  {'Features (neuroid)':<22}: {est.num_features:,}")
+    print(f"  {'Timebins':<22}: {est.num_timebins}")
+    print(f"  {'Activation array':<22}: {est.activation_gb:.4f} GB  "
+          f"{_c(f'({est.num_stimuli} × {est.num_features:,} × {est.num_timebins} × 4B)', _DIM)}")
+    print()
+
+    if est.fixed_benchmark_cost_gb is not None:
+        print(f"  {_c('Formula', _BOLD)}: {_c('CALIBRATED', _GREEN)}")
+        print(f"  {'Activation':<22}: {est.activation_gb:.4f} GB")
+        print(f"  {'Fixed benchmark cost':<22}: {est.fixed_benchmark_cost_gb:.4f} GB  "
+              f"{_c('← model-independent overhead from calibration table', _DIM)}")
+        print(f"  {'Total estimated':<22}: {_c(f'{est.total_estimated_gb:.4f} GB', _CYAN)}  "
+              f"{_c(f'({est.activation_gb:.4f} + {est.fixed_benchmark_cost_gb:.4f})', _DIM)}")
+    else:
+        print(f"  {_c('Formula', _BOLD)}: {_c('FALLBACK (×6)', _YELLOW)}  "
+              f"{_c('← benchmark not in calibration table', _DIM)}")
+        print(f"  {'Activation':<22}: {est.activation_gb:.4f} GB")
+        print(f"  {'Total estimated':<22}: {_c(f'{est.total_estimated_gb:.4f} GB', _CYAN)}  "
+              f"{_c(f'({est.activation_gb:.4f} × 6)', _DIM)}")
+
+    print()
+    avail_col = _RED if est.will_oom else _GREEN
+    verdict = _c('OOM LIKELY', _RED) if est.will_oom else _c('OK', _GREEN)
+    print(f"  {'Available RAM':<22}: {_c(f'{est.available_gb:.2f} GB', avail_col)}")
+    print(f"  {'Verdict':<22}: {verdict}")
+    print(f"  {'Probe elapsed':<22}: {probe_elapsed:.1f}s")
+
+    if not args.score:  # probe-only mode ends here
+        print()
+        _divider()
+        print(f"\n  {_c('Tip:', _DIM)} run with --score to also execute the full benchmark")
+        print(f"  and compare the estimate against actual peak RSS.\n")
+        return
+
+    # ── Full benchmark run ───────────────────────────────────────────────
+    print()
+    _divider()
+    print(f"\n  {_c('FULL BENCHMARK RUN', _BOLD)}\n")
+
+    proc = psutil.Process(os.getpid())
+    baseline_rss = proc.memory_info().rss  # RSS before scoring; the delta below is measured against this
+    print(f"  Baseline RSS: {_gb(baseline_rss)}  "
+          f"{_c('← everything already in RAM (model weights, Python, etc.)', _DIM)}")
+    print(f"  Scoring...  (this may take a while)\n")
+
+    # Ticker thread
+    _ticker_stop = threading.Event()
+    def _ticker():
+        t_start = time.time()
+        while not _ticker_stop.wait(30):  # wait() returns False on timeout → heartbeat every 30 s until stopped
+            elapsed = time.time() - t_start
+            rss = proc.memory_info().rss
+            print(f"    {_c('…', _DIM)} still scoring  {elapsed/60:.1f} min  RSS {_gb(rss)}",
+                  flush=True)
+    ticker_thread = threading.Thread(target=_ticker, daemon=True)
+    ticker_thread.start()
+
+    monitor = _PeakMonitor().start()
+    t_score = time.time()
+    score_val = None  # stays None if scoring raises
+    try:
+        score_val = benchmark(model)
+    except MemoryError as e:
+        print(f"\n  {_c('MemoryError', _RED)}: {e}")
+    except Exception as e:  # failure is reported, then the memory comparison below is still printed
+        print(f"\n  {_c('ERROR', _RED)}: {e}")
+    finally:
+        _ticker_stop.set()
+        ticker_thread.join()
+
+    score_elapsed = time.time() - t_score
+    peak_rss  = monitor.stop()
+    final_rss = proc.memory_info().rss  # NOTE(review): assigned but not used anywhere below
+    actual_delta_gb = (peak_rss - baseline_rss) / (1024 ** 3)
+
+    # ── Comparison ───────────────────────────────────────────────────────
+    print()
+    _divider()
+    print(f"\n  {_c('RESULT', _BOLD)}\n")
+
+    print(f"  {'Baseline RSS':<24}: {_gb(baseline_rss)}")
+    print(f"  {'Peak RSS':<24}: {_gb(peak_rss)}")
+    print(f"  {'Δ (peak − baseline)':<24}: {_c(f'+{actual_delta_gb:.4f} GB', _CYAN)}  "
+          f"{_c('← actual RAM the benchmark consumed', _DIM)}")
+    print(f"  {'Estimated total':<24}: {_c(f'{est.total_estimated_gb:.4f} GB', _CYAN)}")
+    print()
+
+    if actual_delta_gb > 0.01:  # accuracy lines are skipped when the delta is too small to divide by
+        error_gb  = est.total_estimated_gb - actual_delta_gb
+        error_pct = (error_gb / actual_delta_gb) * 100
+        if error_gb >= 0:
+            accuracy = _c(f'OVER by {error_gb:.2f} GB ({error_pct:.1f}%)  ← conservative, safe', _GREEN)
+        elif abs(error_pct) <= 15:
+            accuracy = _c(f'UNDER by {abs(error_gb):.2f} GB ({abs(error_pct):.1f}%)  ← within 15%, acceptable', _YELLOW)
+        else:
+            accuracy = _c(f'UNDER by {abs(error_gb):.2f} GB ({abs(error_pct):.1f}%)  ← significant miss', _RED)
+        print(f"  {'Accuracy':<24}: {accuracy}")
+
+        formula = 'calibrated' if est.fixed_benchmark_cost_gb is not None else '×6 fallback'
+        print(f"  {'Formula used':<24}: {formula}")
+
+    if score_val is not None:
+        print(f"  {'Score':<24}: {float(score_val):.4f}")
+    print(f"  {'Elapsed':<24}: {score_elapsed:.0f}s")
+    print()
+    _divider('═')
+    print()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/validation.py b/scripts/validation.py
new file mode 100644
index 0000000000..1a2c1fe442
--- /dev/null
+++ b/scripts/validation.py
@@ -0,0 +1,385 @@
+"""
+Pre-flight Estimator Validation
+================================
+Runs a 3-model × 4-benchmark grid to validate the pre-flight memory estimator
+across all formula types: PLS, Ridge (calibrated), RidgeCV (calibrated), and RDM.
+
+One benchmark is selected per formula class so that every code path in
+preallocate_memory is exercised:
+
+  FreemanZiemba2013.V1-pls   → PLS  (activation × 7 + fixed_cost, warning printed)
+  Papale2025.IT-ridge        → Ridge calibrated  (activation + calibrated cost)
+  Gifford2022.IT-ridgecv     → RidgeCV calibrated  (activation + calibrated cost)
+  Allen2022_fmri.IT-rdm      → RDM  (activation + 2×n_stimuli²×4B, model-independent)
+
+For each (model, benchmark) pair it:
+  1. Runs the pre-flight probe  → estimates total GB via the appropriate formula
+  2. Runs the full benchmark    → measures actual peak RSS delta
+  3. Compares estimate to actual and reports over/under by how much
+
+Results are written to `validation_results.jsonl` after every pair so a crash
+does not lose completed work.  Re-running will overwrite the file.
+
+Usage
+-----
+    python scripts/validation.py
+
+    # Skip the actual benchmark runs (probe only — fast)
+    python scripts/validation.py --probe-only
+
+    # Write results to a custom path
+    python scripts/validation.py --output /tmp/val.jsonl
+"""
+
+import argparse
+import json
+import logging
+import os
+import sys
+import threading
+import time
+
+import psutil
+
+# ---------------------------------------------------------------------------
+# Resolve local repos so the script works without installation
+# ---------------------------------------------------------------------------
+_script_dir  = os.path.dirname(os.path.abspath(__file__))  # directory containing this script (scripts/)
+_vision_root = os.path.dirname(_script_dir)  # parent of scripts/
+_core_root   = os.path.join(os.path.dirname(_vision_root), 'core')  # a directory named 'core' next to the vision root
+for _p in [_vision_root, _core_root]:
+    if _p not in sys.path:
+        sys.path.insert(0, _p)  # prepend so these locations are searched first on import
+
+logging.basicConfig(level=logging.WARNING)
+
+# ---------------------------------------------------------------------------
+# Grid definition
+# ---------------------------------------------------------------------------
+MODELS = [
+    'alexnet',
+    'resnet50_tutorial',
+    'vit_large_patch14_clip_224:openai_ft_in1k',
+]
+
+BENCHMARKS = [
+    'FreemanZiemba2013.V1-pls',    # PLS         — activation × 7 + fixed_cost (approximate, warning)
+    'Papale2025.IT-ridge',          # Ridge        — activation + calibrated cost
+    'Gifford2022.IT-ridgecv',       # RidgeCV      — activation + calibrated cost
+    'Allen2022_fmri.IT-rdm',        # RDM          — activation + 2×n_stimuli²×4B (model-independent)
+]
+
+# ---------------------------------------------------------------------------
+# ANSI colours
+# ---------------------------------------------------------------------------
+_RESET  = '\033[0m'  # ANSI SGR 0: reset all attributes
+_BOLD   = '\033[1m'  # ANSI SGR 1: bold
+_DIM    = '\033[2m'  # ANSI SGR 2: faint / dim
+_GREEN  = '\033[32m'  # ANSI SGR 32: green foreground
+_YELLOW = '\033[33m'  # ANSI SGR 33: yellow foreground
+_RED    = '\033[31m'  # ANSI SGR 31: red foreground
+_CYAN   = '\033[36m'  # ANSI SGR 36: cyan foreground
+
+def _c(text, colour): return "{}{}{}".format(colour, text, _RESET)  # colour prefix + text + reset suffix
+def _gb(n_bytes):     return "{:.3f} GB".format(n_bytes / (1 << 30))  # 1 << 30 == 1024 ** 3 bytes
+
+
+# ---------------------------------------------------------------------------
+# Peak RSS monitor
+# ---------------------------------------------------------------------------
+class _PeakMonitor:
+    """Background sampler that tracks this process's peak RSS, polling every *interval* seconds."""
+    def __init__(self, interval=0.5):
+        self._interval = interval  # previously accepted but ignored: _run() hard-coded a 0.5 s wait
+        self._proc  = psutil.Process(os.getpid())
+        self._peak  = self._proc.memory_info().rss  # seed the peak with the RSS at construction time
+        self._stop  = threading.Event()
+        self._thread = threading.Thread(target=self._run, daemon=True)
+
+    def start(self):  # returns self so callers can write `_PeakMonitor().start()`
+        self._thread.start()
+        return self
+
+    def stop(self) -> int:  # stops the sampler thread and returns the highest RSS sampled, in bytes
+        self._stop.set()
+        self._thread.join()
+        return self._peak
+
+    def _run(self):
+        while not self._stop.is_set():
+            try:
+                self._peak = max(self._peak, self._proc.memory_info().rss)
+            except psutil.NoSuchProcess:
+                break
+            self._stop.wait(self._interval)  # Event.wait acts as a sleep that stop() can interrupt
+
+
+# ---------------------------------------------------------------------------
+# Formatting helpers
+# ---------------------------------------------------------------------------
+def _divider(char='─', width=70):
+    print(width * char)  # horizontal rule: `char` repeated `width` times
+
+
+def _accuracy_label(estimated_gb: float, actual_gb: float):
+    """Describe, in colour, how far the estimate landed from the measured delta."""
+    if actual_gb <= 0.01:  # too small a denominator for a meaningful percentage
+        return _c('actual delta too small to measure', _DIM)
+    diff = estimated_gb - actual_gb
+    pct  = (diff / actual_gb) * 100
+    if diff >= 0:
+        return _c(f'OVER  by {diff:.2f} GB ({pct:.1f}%)  ← conservative, safe', _GREEN)
+    shortfall = f'UNDER by {abs(diff):.2f} GB ({abs(pct):.1f}%)'
+    if abs(pct) <= 15:
+        return _c(f'{shortfall}  ← within 15%, acceptable', _YELLOW)
+    return _c(f'{shortfall}  ← significant miss', _RED)
+
+
+def _write_result(path: str, record: dict):
+    """Serialise *record* as a single JSON line and append it to the file at *path*."""
+    with open(path, 'a') as sink:
+        sink.write(f"{json.dumps(record)}\n")
+
+
+# ---------------------------------------------------------------------------
+# Single-pair validation
+# ---------------------------------------------------------------------------
+def run_pair(model_id: str, benchmark_id: str, output_path: str, probe_only: bool) -> dict:  # probe (and optionally score) one pair; every return path first appends the record to output_path
+    from brainscore_vision import load_model, load_benchmark
+    from brainscore_vision.benchmark_helpers.memory import preallocate_memory
+
+    record = {
+        'model': model_id,
+        'benchmark': benchmark_id,
+        'status': 'pending',
+    }
+
+    proc = psutil.Process(os.getpid())
+
+    # ── Load model ──────────────────────────────────────────────────────
+    print(f"\n  Loading model     {_c(model_id, _CYAN)} ...", end='', flush=True)
+    t0 = time.time()
+    try:
+        model = load_model(model_id)
+    except Exception as e:  # a load failure is recorded as status 'error' rather than propagated
+        print(f"  {_c('FAILED', _RED)}: {e}")
+        record.update({'status': 'error', 'error': f'load_model: {e}'})
+        _write_result(output_path, record)
+        return record
+    print(f"  {_c('OK', _GREEN)} ({time.time()-t0:.1f}s)")
+
+    # ── Load benchmark ───────────────────────────────────────────────────
+    print(f"  Loading benchmark {_c(benchmark_id, _CYAN)} ...", end='', flush=True)
+    t0 = time.time()
+    try:
+        benchmark = load_benchmark(benchmark_id)
+    except Exception as e:
+        print(f"  {_c('FAILED', _RED)}: {e}")
+        record.update({'status': 'error', 'error': f'load_benchmark: {e}'})
+        _write_result(output_path, record)
+        return record
+    print(f"  {_c('OK', _GREEN)} ({time.time()-t0:.1f}s)")
+
+    # ── Pre-flight probe ─────────────────────────────────────────────────
+    print(f"\n  {_c('PRE-FLIGHT PROBE', _BOLD)}")
+    t0 = time.time()
+    try:
+        est = preallocate_memory(model, benchmark, raise_if_oom=False)
+    except TypeError as e:  # NOTE(review): only TypeError is handled here; any other probe exception propagates to the caller
+        print(f"  {_c('SKIPPED', _YELLOW)}: {e}")
+        record.update({'status': 'skipped', 'skip_reason': str(e)})
+        _write_result(output_path, record)
+        return record
+    probe_elapsed = time.time() - t0
+
+    if est is None:
+        print(f"  {_c('SKIPPED', _YELLOW)} (BRAINSCORE_SKIP_MEMORY_CHECK set)")
+        record.update({'status': 'skipped', 'skip_reason': 'BRAINSCORE_SKIP_MEMORY_CHECK'})
+        _write_result(output_path, record)
+        return record
+
+    formula = 'calibrated' if est.fixed_benchmark_cost_gb is not None else f'x{6}_fallback'  # evaluates to 'calibrated' or 'x6_fallback'
+    print(f"  {'Stimuli':<24}: {est.num_stimuli:,}")
+    print(f"  {'Features':<24}: {est.num_features:,}")
+    print(f"  {'Timebins':<24}: {est.num_timebins}")
+    print(f"  {'Activation':<24}: {est.activation_gb:.4f} GB  "
+          f"{_c(f'({est.num_stimuli:,} × {est.num_features:,} × {est.num_timebins} × 4B)', _DIM)}")
+    if est.fixed_benchmark_cost_gb is not None:
+        print(f"  {'Fixed benchmark cost':<24}: {est.fixed_benchmark_cost_gb:.4f} GB  "
+              f"{_c('← from calibration table', _DIM)}")
+        print(f"  {'Estimated total':<24}: {_c(f'{est.total_estimated_gb:.4f} GB', _CYAN)}  "
+              f"{_c(f'({est.activation_gb:.4f} + {est.fixed_benchmark_cost_gb:.4f})', _DIM)}")
+    else:
+        print(f"  {'Estimated total':<24}: {_c(f'{est.total_estimated_gb:.4f} GB', _CYAN)}  "
+              f"{_c(f'({est.activation_gb:.4f} × 6 fallback)', _DIM)}")
+    print(f"  {'Available RAM':<24}: {est.available_gb:.2f} GB")
+    print(f"  {'OOM predicted':<24}: {_c('YES', _RED) if est.will_oom else _c('NO', _GREEN)}")
+    print(f"  {'Probe elapsed':<24}: {probe_elapsed:.1f}s")
+
+    record.update({
+        'num_stimuli':           est.num_stimuli,
+        'num_features':          est.num_features,
+        'num_timebins':          est.num_timebins,
+        'activation_gb':         round(est.activation_gb, 6),
+        'fixed_benchmark_cost_gb': round(est.fixed_benchmark_cost_gb, 6) if est.fixed_benchmark_cost_gb is not None else None,
+        'estimated_total_gb':    round(est.total_estimated_gb, 6),
+        'available_gb':          round(est.available_gb, 2),
+        'oom_predicted':         est.will_oom,
+        'formula':               formula,
+        'probe_elapsed_s':       round(probe_elapsed, 1),
+    })
+
+    if probe_only:  # stop after the estimate; no scoring fields are added to the record
+        record['status'] = 'probe_only'
+        _write_result(output_path, record)
+        return record
+
+    # ── Full benchmark run ───────────────────────────────────────────────
+    print(f"\n  {_c('FULL BENCHMARK RUN', _BOLD)}")
+    baseline_rss = proc.memory_info().rss  # RSS before scoring; the delta below is measured against this
+    print(f"  Baseline RSS: {_gb(baseline_rss)}  "
+          f"{_c('← model weights + Python already in RAM', _DIM)}")
+    print(f"  Scoring...  (this may take a while)", flush=True)
+
+    # Ticker thread — prints a heartbeat every 60s so we know it's alive
+    _ticker_stop = threading.Event()
+    def _ticker():
+        t_start = time.time()
+        while not _ticker_stop.wait(60):  # wait() returns False on timeout, True once the event is set
+            elapsed = time.time() - t_start
+            rss = proc.memory_info().rss
+            print(f"    {_c('…', _DIM)} still scoring  {elapsed/60:.1f} min  RSS {_gb(rss)}", flush=True)
+    ticker = threading.Thread(target=_ticker, daemon=True)
+    ticker.start()
+
+    monitor = _PeakMonitor().start()
+    t_score = time.time()
+    score_val = None
+    score_error = None  # set to a 'Type: message' string if scoring raises
+    try:
+        score_val = benchmark(model)
+    except MemoryError as e:
+        score_error = f'MemoryError: {e}'
+        print(f"\n  {_c('MemoryError', _RED)}: {e}")
+    except Exception as e:  # the failure is recorded in the record; the comparison below still runs
+        score_error = f'{type(e).__name__}: {e}'
+        print(f"\n  {_c('ERROR', _RED)} ({type(e).__name__}): {e}")
+    finally:
+        _ticker_stop.set()
+        ticker.join()
+
+    score_elapsed  = time.time() - t_score
+    peak_rss       = monitor.stop()
+    actual_delta_gb = (peak_rss - baseline_rss) / (1024 ** 3)
+
+    # ── Comparison ───────────────────────────────────────────────────────
+    print(f"\n  {_c('COMPARISON', _BOLD)}")
+    print(f"  {'Baseline RSS':<24}: {_gb(baseline_rss)}")
+    print(f"  {'Peak RSS':<24}: {_gb(peak_rss)}")
+    print(f"  {'Actual delta':<24}: {_c(f'+{actual_delta_gb:.4f} GB', _CYAN)}  "
+          f"{_c('← RAM the benchmark consumed', _DIM)}")
+    print(f"  {'Estimated total':<24}: {_c(f'{est.total_estimated_gb:.4f} GB', _CYAN)}")
+    print(f"  {'Accuracy':<24}: {_accuracy_label(est.total_estimated_gb, actual_delta_gb)}")
+    print(f"  {'Score elapsed':<24}: {score_elapsed:.0f}s")
+    if score_val is not None:
+        print(f"  {'Score':<24}: {float(score_val):.4f}")
+
+    record.update({
+        'baseline_rss_gb':   round(baseline_rss / (1024 ** 3), 4),
+        'peak_rss_gb':       round(peak_rss / (1024 ** 3), 4),
+        'actual_delta_gb':   round(actual_delta_gb, 4),
+        'error_gb':          round(est.total_estimated_gb - actual_delta_gb, 4),
+        'error_pct':         round((est.total_estimated_gb - actual_delta_gb) / actual_delta_gb * 100, 1)
+                             if actual_delta_gb > 0.01 else None,  # None when the delta is too small to divide by
+        'score_elapsed_s':   round(score_elapsed, 0),
+        'score':             float(score_val) if score_val is not None else None,
+        'score_error':       score_error,
+        'status':            'error' if score_error else 'ok',
+    })
+    _write_result(output_path, record)
+    return record
+
+
+# ---------------------------------------------------------------------------
+# Summary table
+# ---------------------------------------------------------------------------
+def print_summary(results: list[dict]):
+    """Print one table row per result record: estimate, actual, error and a status tag."""
+
+    print()
+    _divider('═')
+    print(f"  {_c('VALIDATION SUMMARY', _BOLD)}  ({len(results)} pairs)\n")
+
+    print(f"  {'Model':<48}  {'Benchmark':<30}  {'Est GB':>8}  {'Act GB':>8}  {'Err GB':>8}  {'Err %':>7}  Status")
+    _divider()
+
+    def _cell(value, spec, suffix=''):
+        # Missing measurements are rendered as an em dash.
+        return '—' if value is None else format(value, spec) + suffix
+
+    for rec in results:
+        name  = rec['model'][-46:]   # keep only the tail of long model names
+        bench = rec['benchmark']
+        est, act = rec.get('estimated_total_gb'), rec.get('actual_delta_gb')
+        err, pct = rec.get('error_gb'), rec.get('error_pct')
+        state = rec.get('status', '?')
+
+        if state != 'ok':
+            # Skips and probe-only runs are yellow; any other non-ok status is red.
+            benign = state in ('skipped', 'skipped_oom', 'probe_only')
+            tag = _c(state.upper(), _YELLOW if benign else _RED)
+        elif err is not None and err >= 0:
+            tag = _c('OVER', _GREEN)
+        elif pct is not None and abs(pct) <= 15:
+            tag = _c('~OK', _YELLOW)
+        else:
+            tag = _c('MISS', _RED)
+
+        est_s, act_s = _cell(est, '.3f'), _cell(act, '.3f')
+        err_s, pct_s = _cell(err, '+.3f'), _cell(pct, '+.1f', '%')
+
+        print(f"  {name:<48}  {bench:<30}  {est_s:>8}  {act_s:>8}  {err_s:>8}  {pct_s:>7}  {tag}")
+
+    _divider('═')
+    print()
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main():
+    """Validate the estimator on every MODELS × BENCHMARKS pair and print a summary table."""
+    cli = argparse.ArgumentParser(description='Pre-flight estimator validation suite.')
+    cli.add_argument('--probe-only', action='store_true',
+                     help='Only run the pre-flight probe; skip full benchmark scoring.')
+    cli.add_argument('--output', default=os.path.join(_script_dir, 'validation_results.jsonl'),
+                     help='Path to write per-pair JSONL results (default: scripts/validation_results.jsonl)')
+    args = cli.parse_args()
+
+    # Empty the results file up front; run_pair() appends to it as pairs finish.
+    with open(args.output, 'w'):
+        pass
+    print(f"\n{_c('Results will be written incrementally to:', _DIM)} {args.output}\n")
+
+    grid = [(m, b) for m in MODELS for b in BENCHMARKS]  # model-major order
+    n_pairs = len(grid)
+    results = []
+
+    for pair_idx, (model_id, benchmark_id) in enumerate(grid, start=1):
+        print()
+        _divider('═')
+        banner = _c(f'PAIR {pair_idx}/{n_pairs}', _BOLD)
+        print(f"  {banner}  {_c(model_id, _CYAN)}  ×  {_c(benchmark_id, _CYAN)}")
+        _divider('═')
+
+        # Each pair is fully processed (and persisted) before the next one starts.
+        results.append(run_pair(model_id, benchmark_id, args.output, args.probe_only))
+
+    print_summary(results)
+    print(f"Full results written to: {args.output}\n")
+
+
+if __name__ == '__main__':
+    main()
+
+
diff --git a/tests/test_plugin_management/test_memory_precheck.py b/tests/test_plugin_management/test_memory_precheck.py
new file mode 100644
index 0000000000..8e1bde6978
--- /dev/null
+++ b/tests/test_plugin_management/test_memory_precheck.py
@@ -0,0 +1,561 @@
+"""
+Integration tests for the pre-flight memory check (preallocate_memory).
+
+Uses object.__new__ to bypass NeuralBenchmark.__init__ / timebins_from_assembly
+so we can construct minimal benchmark fixtures without real S3 data.
+
+Model is mocked at the BrainModel level: look_at returns a tiny xarray
+DataArray with a 'neuroid' dim so the probe can read sizes['neuroid'].
+place_on_screen short-circuits when source == target visual degrees (no I/O).
+"""
+
+import json
+import os
+import tempfile
+import unittest
+from unittest.mock import MagicMock, patch
+
+import numpy as np
+import xarray as xr
+import pytest
+
+from brainscore_core import Score
+from brainscore_core.benchmarks import score_benchmark
+from brainscore_vision.benchmark_helpers.memory import (
+    MemoryEstimate,
+    _OVERHEAD_FACTOR,
+    _PLS_OVERHEAD_FACTOR,
+    _BYTES_PER_ELEMENT,
+    _DEFAULT_CALIBRATION_PATH,
+    preallocate_memory,
+    load_calibration,
+    save_calibration,
+)
+from brainscore_vision.benchmark_helpers.neural_common import (
+    NeuralBenchmark,
+    TrainTestNeuralBenchmark,
+)
+from brainscore_vision.model_interface import BrainModel
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+_VISUAL_DEGREES = 8   # source == target so place_on_screen is a no-op
+
+
+def _make_stimulus_set(n: int = 10):
+    """Build an n-row StimulusSet (ids img000, img001, ...) whose stimulus paths point under /tmp."""
+    from brainscore_core.supported_data_standards.brainio.stimuli import StimulusSet
+    import pandas as pd
+    stimulus_ids = [f'img{i:03d}' for i in range(n)]
+    file_names = [f'{stimulus_id}.png' for stimulus_id in stimulus_ids]
+    stimulus_set = StimulusSet(pd.DataFrame({'stimulus_id': stimulus_ids, 'image_file_name': file_names}))
+    stimulus_set.identifier = 'test_stimulus_set'
+    stimulus_set.stimulus_paths = {stimulus_id: f'/tmp/{file_name}'  # no file is written by this helper
+                                   for stimulus_id, file_name in zip(stimulus_ids, file_names)}
+    return stimulus_set
+
+
+def _make_neural_benchmark(n_stimuli: int = 10, n_trials: int = 1,
+                            timebins=None, region: str = 'IT') -> NeuralBenchmark:
+    """Build a NeuralBenchmark fixture via object.__new__ (no __init__) with a mocked assembly."""
+    benchmark = object.__new__(NeuralBenchmark)
+    benchmark._identifier = 'test-neural-benchmark'
+    benchmark._number_of_trials = n_trials
+    benchmark.timebins = timebins if timebins else [(70, 170)]  # None/empty -> one default bin
+    benchmark.region = region
+    benchmark._visual_degrees = _VISUAL_DEGREES
+    benchmark._ceiling_func = lambda: Score(0.8)
+
+    # The assembly is a MagicMock; only its stimulus_set is a real object with n_stimuli rows
+    mock_assembly = MagicMock()
+    mock_assembly.stimulus_set = _make_stimulus_set(n_stimuli)
+    benchmark._assembly = mock_assembly
+    return benchmark
+
+
+def _make_train_test_benchmark(n_train: int = 8, n_test: int = 4) -> TrainTestNeuralBenchmark:
+    """Build a TrainTestNeuralBenchmark fixture via object.__new__ with mocked train/test assemblies."""
+    benchmark = object.__new__(TrainTestNeuralBenchmark)
+    benchmark._identifier = 'test-train-test-benchmark'
+    benchmark._number_of_trials = 1
+    benchmark.timebins = [(70, 170)]
+    benchmark.region = 'IT'
+    benchmark._visual_degrees = _VISUAL_DEGREES
+    benchmark._ceiling_func = lambda: Score(0.8)
+
+    # Each split gets its own MagicMock assembly; only stimulus_set is a real object,
+    # sized n_train for train_assembly and n_test for test_assembly
+    for split, size in (('train_assembly', n_train), ('test_assembly', n_test)):
+        split_assembly = MagicMock()
+        split_assembly.stimulus_set = _make_stimulus_set(size)
+        setattr(benchmark, split, split_assembly)
+    return benchmark
+
+
+def _make_model(num_features: int = 512) -> BrainModel:
+    """
+    Return a BrainModel mock whose look_at yields zeros shaped (len(stimuli), num_features).
+    """
+
+    def _look_at(stimuli, number_of_trials=1):
+        # One all-zero row per stimulus; number_of_trials is accepted but not used
+        coords = {
+            'stimulus_id': ('presentation', stimuli['stimulus_id'].values),
+            'neuroid_id': ('neuroid', np.arange(num_features)),
+        }
+        zeros = np.zeros((len(stimuli), num_features))
+        return xr.DataArray(zeros, dims=['presentation', 'neuroid'], coords=coords)
+
+    # spec=BrainModel: reading an attribute BrainModel does not define raises AttributeError
+    brain_model = MagicMock(spec=BrainModel)
+    brain_model.visual_degrees.return_value = _VISUAL_DEGREES
+    brain_model.look_at.side_effect = _look_at
+    brain_model.activations_model = None   # no LayerPCA
+    return brain_model
+
+
+# ---------------------------------------------------------------------------
+# TestMemoryEstimateShape
+# ---------------------------------------------------------------------------
+
+class TestMemoryEstimateShape(unittest.TestCase):
+
+    def setUp(self):
+        self.model = _make_model(num_features=512)
+        self.bm = _make_neural_benchmark(n_stimuli=10)
+
+    def _estimate(self, available_bytes=32 * (1024 ** 3)):
+        """Run preallocate_memory without raising while psutil reports available_bytes as available."""
+        with patch('psutil.virtual_memory') as virtual_memory:
+            virtual_memory.return_value.available = available_bytes
+            return preallocate_memory(self.model, self.bm, raise_if_oom=False)
+
+    def test_estimate_fields(self):
+        """Stimulus, feature and timebin counts mirror the fixtures (10, 512, 1)."""
+        estimate = self._estimate()
+
+        self.assertEqual(estimate.num_stimuli, 10)
+        self.assertEqual(estimate.num_features, 512)
+        self.assertEqual(estimate.num_timebins, 1)
+
+    def test_activation_gb_formula(self):
+        """activation_gb equals stimuli × features × timebins × _BYTES_PER_ELEMENT over 1024 ** 3."""
+        estimate = self._estimate()
+
+        expected_bytes = 10 * 512 * 1 * _BYTES_PER_ELEMENT
+        self.assertAlmostEqual(estimate.activation_gb, expected_bytes / (1024 ** 3), places=6)
+
+    def test_total_estimated_gb(self):
+        """total_estimated_gb is activation_gb scaled by _OVERHEAD_FACTOR."""
+        estimate = self._estimate()
+
+        self.assertAlmostEqual(estimate.total_estimated_gb,
+                               estimate.activation_gb * _OVERHEAD_FACTOR, places=6)
+
+    def test_available_gb_from_psutil(self):
+        """16 * 1024 ** 3 bytes reported by psutil surface as available_gb == 16.0."""
+        estimate = self._estimate(available_bytes=16 * (1024 ** 3))
+
+        self.assertAlmostEqual(estimate.available_gb, 16.0, places=3)
+
+
+# ---------------------------------------------------------------------------
+# TestOOMDetection
+# ---------------------------------------------------------------------------
+
+class TestOOMDetection(unittest.TestCase):
+
+    def _estimate(self, available_gb, num_features=1_000_000, n_stimuli=100):
+        """Return a non-raising estimate while psutil reports available_gb * 1024 ** 3 bytes available."""
+        benchmark = _make_neural_benchmark(n_stimuli=n_stimuli)
+        candidate = _make_model(num_features=num_features)
+        with patch('psutil.virtual_memory') as virtual_memory:
+            virtual_memory.return_value.available = int(available_gb * (1024 ** 3))
+            return preallocate_memory(candidate, benchmark, raise_if_oom=False)
+
+    def test_will_oom_true_when_over(self):
+        self.assertTrue(self._estimate(available_gb=0.001).will_oom)
+
+    def test_will_oom_false_when_under(self):
+        self.assertFalse(self._estimate(available_gb=1000).will_oom)
+
+    def test_raises_memory_error_when_raise_if_oom(self):
+        """With a single byte reported available, raise_if_oom=True must raise MemoryError."""
+        benchmark = _make_neural_benchmark(n_stimuli=100)
+        candidate = _make_model(num_features=1_000_000)
+        with patch('psutil.virtual_memory') as virtual_memory:
+            virtual_memory.return_value.available = 1
+            self.assertRaises(MemoryError, preallocate_memory, candidate, benchmark, raise_if_oom=True)
+
+    def test_no_raise_when_raise_if_oom_false(self):
+        """With a single byte reported available, raise_if_oom=False returns an estimate with will_oom set."""
+        benchmark = _make_neural_benchmark(n_stimuli=100)
+        candidate = _make_model(num_features=1_000_000)
+        with patch('psutil.virtual_memory') as virtual_memory:
+            virtual_memory.return_value.available = 1
+            estimate = preallocate_memory(candidate, benchmark, raise_if_oom=False)
+        self.assertTrue(estimate.will_oom)
+
+
+# ---------------------------------------------------------------------------
+# TestProbeUsesOneStimulusOnly
+# ---------------------------------------------------------------------------
+
+class TestProbeUsesOneStimulusOnly(unittest.TestCase):
+
+    def test_look_at_called_with_one_stimulus(self):
+        """The stimuli passed to the most recent look_at call must contain exactly one row."""
+        benchmark = _make_neural_benchmark(n_stimuli=100)
+        candidate = _make_model()
+        with patch('psutil.virtual_memory') as virtual_memory:
+            virtual_memory.return_value.available = 32 * (1024 ** 3)
+            preallocate_memory(candidate, benchmark, raise_if_oom=False)
+
+        positional_args, _ = candidate.look_at.call_args
+        self.assertEqual(len(positional_args[0]), 1)
+
+    def test_num_stimuli_reflects_full_benchmark_not_probe(self):
+        """num_stimuli reports the benchmark's 42 stimuli rather than the size of the probe."""
+        benchmark = _make_neural_benchmark(n_stimuli=42)
+        candidate = _make_model()
+        with patch('psutil.virtual_memory') as virtual_memory:
+            virtual_memory.return_value.available = 32 * (1024 ** 3)
+            estimate = preallocate_memory(candidate, benchmark, raise_if_oom=False)
+        self.assertEqual(estimate.num_stimuli, 42)
+
+
+# ---------------------------------------------------------------------------
+# TestScoreBenchmarkAbortOnOOM
+# ---------------------------------------------------------------------------
+
+class TestScoreBenchmarkAbortOnOOM(unittest.TestCase):
+
+    def test_score_benchmark_aborts_before_calling_benchmark(self):
+        """score_benchmark should raise MemoryError before __call__ is invoked."""
+        bm = _make_neural_benchmark(n_stimuli=100)
+        model = _make_model(num_features=1_000_000)
+        # Patch __call__ on the class: `bm(model)` resolves special methods on type(bm), so a mock
+        # assigned to the instance would never be invoked and assert_not_called() would always pass.
+        with patch.object(NeuralBenchmark, '__call__', return_value=Score(0.5)) as mock_call, \
+                patch('psutil.virtual_memory') as mock_vm:
+            mock_vm.return_value.available = 1
+            with self.assertRaises(MemoryError):
+                score_benchmark(bm, model)
+        mock_call.assert_not_called()
+
+    def test_score_benchmark_calls_benchmark_when_ok(self):
+        """With 32 GiB reported available, the benchmark is called once and its score is returned."""
+        bm = _make_neural_benchmark(n_stimuli=5)
+        score_val = Score(0.42)
+        score_val.attrs['ceiling'] = Score(1.0)
+        model = _make_model(num_features=10)
+        with patch('psutil.virtual_memory') as mock_vm:
+            mock_vm.return_value.available = 32 * (1024 ** 3)
+            with patch.object(NeuralBenchmark, '__call__', return_value=score_val) as mock_call:
+                result = score_benchmark(bm, model)
+
+        mock_call.assert_called_once_with(model)
+        self.assertEqual(float(result), 0.42)
+
+
+# ---------------------------------------------------------------------------
+# TestSkipEnvVar
+# ---------------------------------------------------------------------------
+
+class TestSkipEnvVar(unittest.TestCase):
+
+    def test_returns_none_when_env_var_set(self):
+        """BRAINSCORE_SKIP_MEMORY_CHECK='1' makes preallocate_memory return None."""
+        benchmark, candidate = _make_neural_benchmark(), _make_model()
+        with patch.dict(os.environ, {'BRAINSCORE_SKIP_MEMORY_CHECK': '1'}):
+            outcome = preallocate_memory(candidate, benchmark, raise_if_oom=True)
+        self.assertIsNone(outcome)
+
+    def test_runs_normally_when_env_var_unset(self):
+        """Note: the variable is set to '0' here (not removed); an estimate must still be returned."""
+        benchmark, candidate = _make_neural_benchmark(), _make_model()
+        with patch.dict(os.environ, {'BRAINSCORE_SKIP_MEMORY_CHECK': '0'}), \
+                patch('psutil.virtual_memory') as virtual_memory:
+            virtual_memory.return_value.available = 32 * (1024 ** 3)
+            outcome = preallocate_memory(candidate, benchmark, raise_if_oom=False)
+        self.assertIsNotNone(outcome)
+
+
+# ---------------------------------------------------------------------------
+# TestUnsupportedBenchmarkType
+# ---------------------------------------------------------------------------
+
+class TestUnsupportedBenchmarkType(unittest.TestCase):
+
+    def test_raises_type_error_for_unknown_benchmark(self):
+        """Passing an instance of an unrelated, empty class as the benchmark must raise TypeError."""
+        class WeirdBenchmark:
+            pass
+
+        candidate = _make_model()
+        self.assertRaises(TypeError, preallocate_memory, candidate, WeirdBenchmark())
+
+
+# ---------------------------------------------------------------------------
+# TestTrainTestNeuralBenchmark
+# ---------------------------------------------------------------------------
+
+class TestTrainTestNeuralBenchmark(unittest.TestCase):
+
+    @staticmethod
+    def _estimate(candidate):
+        """Estimate for an 8-train / 4-test benchmark while psutil reports 32 * 1024 ** 3 bytes available."""
+        benchmark = _make_train_test_benchmark(n_train=8, n_test=4)
+        with patch('psutil.virtual_memory') as virtual_memory:
+            virtual_memory.return_value.available = 32 * (1024 ** 3)
+            return preallocate_memory(candidate, benchmark, raise_if_oom=False)
+
+    def test_num_stimuli_is_train_plus_test(self):
+        """num_stimuli is the sum of the train (8) and test (4) stimulus counts."""
+        self.assertEqual(self._estimate(_make_model()).num_stimuli, 12)
+
+    def test_estimate_formula_train_test(self):
+        """activation_gb is computed over all 12 stimuli for a 256-feature model."""
+        estimate = self._estimate(_make_model(num_features=256))
+
+        expected_gb = (12 * 256 * 1 * _BYTES_PER_ELEMENT) / (1024 ** 3)
+        self.assertAlmostEqual(estimate.activation_gb, expected_gb, places=6)
+
+
+# ---------------------------------------------------------------------------
+# TestCalibrationIO  —  load_calibration / save_calibration
+# ---------------------------------------------------------------------------
+
+class TestCalibrationIO(unittest.TestCase):
+
+    def setUp(self):
+        # TemporaryDirectory + addCleanup deletes the directory after every test; mkdtemp left it behind
+        tmp = tempfile.TemporaryDirectory()
+        self.addCleanup(tmp.cleanup)
+        self._tmpdir = tmp.name
+        self._cal_path = os.path.join(self._tmpdir, 'benchmark_costs.json')
+
+    def test_load_returns_empty_dict_when_file_missing(self):
+        self.assertEqual(load_calibration('/nonexistent/path/benchmark_costs.json'), {})
+
+    def test_save_and_load_roundtrip(self):
+        costs = {'MajajHong2015.IT-pls': 2.8336, 'Allen2022_fmri.V1-ridge': 0.5544}
+        save_calibration(costs, self._cal_path)
+        loaded = load_calibration(self._cal_path)
+        self.assertEqual(loaded, costs)
+
+    def test_save_creates_intermediate_directories(self):
+        deep_path = os.path.join(self._tmpdir, 'a', 'b', 'c', 'costs.json')
+        save_calibration({'bm': 1.0}, deep_path)
+        self.assertTrue(os.path.exists(deep_path))
+
+    def test_load_handles_corrupt_file_gracefully(self):
+        with open(self._cal_path, 'w') as f:
+            f.write('not valid json {{{')
+        self.assertEqual(load_calibration(self._cal_path), {})
+
+    def test_save_writes_valid_json(self):
+        save_calibration({'foo-bar': 3.14}, self._cal_path)
+        with open(self._cal_path) as f:
+            data = json.load(f)
+        self.assertAlmostEqual(data['foo-bar'], 3.14)
+
+    def test_save_overwrites_existing_file(self):
+        save_calibration({'old': 1.0}, self._cal_path)
+        save_calibration({'new': 2.0}, self._cal_path)
+        loaded = load_calibration(self._cal_path)
+        self.assertNotIn('old', loaded)
+        self.assertAlmostEqual(loaded['new'], 2.0)
+
+
+# ---------------------------------------------------------------------------
+# TestCalibratedFormula  —  two-component formula vs ×6 fallback
+# ---------------------------------------------------------------------------
+
+class TestCalibratedFormula(unittest.TestCase):
+
+    def setUp(self):
+        tmp = tempfile.TemporaryDirectory()
+        self.addCleanup(tmp.cleanup)  # delete the directory after each test; mkdtemp left it behind
+        self._tmpdir = tmp.name
+        self._cal_path = os.path.join(self._tmpdir, 'costs.json')
+
+    def _estimate(self, bm, model, fixed_cost=None, cal_path=None):
+        with patch('psutil.virtual_memory') as mock_vm:
+            mock_vm.return_value.available = 64 * (1024 ** 3)
+            with patch('brainscore_vision.benchmark_helpers.memory._DEFAULT_CALIBRATION_PATH',
+                       cal_path or '/nonexistent'):
+                return preallocate_memory(model, bm, raise_if_oom=False,
+                                          fixed_benchmark_cost_gb=fixed_cost)
+
+    def test_explicit_fixed_cost_overrides_fallback(self):
+        bm = _make_neural_benchmark(n_stimuli=10)
+        model = _make_model(num_features=512)
+        est = self._estimate(bm, model, fixed_cost=5.0)
+        self.assertAlmostEqual(est.total_estimated_gb, est.activation_gb + 5.0, places=5)
+
+    def test_fixed_cost_stored_in_estimate(self):
+        bm = _make_neural_benchmark(n_stimuli=10)
+        model = _make_model(num_features=512)
+        est = self._estimate(bm, model, fixed_cost=3.5)
+        self.assertAlmostEqual(est.fixed_benchmark_cost_gb, 3.5)
+
+    def test_falls_back_to_overhead_when_no_calibration(self):
+        bm = _make_neural_benchmark(n_stimuli=10)
+        model = _make_model(num_features=512)
+        est = self._estimate(bm, model, fixed_cost=None, cal_path='/nonexistent')
+        self.assertIsNone(est.fixed_benchmark_cost_gb)
+        self.assertAlmostEqual(est.total_estimated_gb,
+                               est.activation_gb * _OVERHEAD_FACTOR, places=5)
+
+    def test_auto_loads_fixed_cost_from_calibration_json(self):
+        """A cost saved under the benchmark identifier is picked up without passing it explicitly."""
+        bm = _make_neural_benchmark(n_stimuli=10)
+        bm._identifier = 'MajajHong2015.IT-pls'
+        model = _make_model(num_features=512)
+        save_calibration({'MajajHong2015.IT-pls': 2.8336}, self._cal_path)
+        with patch('psutil.virtual_memory') as mock_vm, \
+                patch('brainscore_vision.benchmark_helpers.memory._DEFAULT_CALIBRATION_PATH', self._cal_path):
+            mock_vm.return_value.available = 64 * (1024 ** 3)
+            est = preallocate_memory(model, bm, raise_if_oom=False)
+
+        self.assertAlmostEqual(est.fixed_benchmark_cost_gb, 2.8336, places=4)
+        self.assertAlmostEqual(est.total_estimated_gb,
+                               est.activation_gb * _PLS_OVERHEAD_FACTOR + 2.8336, places=4)
+
+    def test_benchmark_not_in_table_uses_fallback(self):
+        """An identifier absent from the calibration file leaves the fixed cost None (overhead fallback)."""
+        bm = _make_neural_benchmark(n_stimuli=10)
+        bm._identifier = 'unknown-benchmark'
+        model = _make_model(num_features=512)
+        save_calibration({'MajajHong2015.IT-pls': 2.8336}, self._cal_path)
+        with patch('psutil.virtual_memory') as mock_vm, \
+                patch('brainscore_vision.benchmark_helpers.memory._DEFAULT_CALIBRATION_PATH', self._cal_path):
+            mock_vm.return_value.available = 64 * (1024 ** 3)
+            est = preallocate_memory(model, bm, raise_if_oom=False)
+
+        self.assertIsNone(est.fixed_benchmark_cost_gb)
+        self.assertAlmostEqual(est.total_estimated_gb,
+                               est.activation_gb * _OVERHEAD_FACTOR, places=5)
+
+    def test_oom_detected_with_calibrated_formula(self):
+        bm = _make_neural_benchmark(n_stimuli=10)
+        model = _make_model(num_features=512)
+        # fixed cost alone exceeds available RAM
+        with patch('psutil.virtual_memory') as mock_vm:
+            mock_vm.return_value.available = int(0.001 * (1024 ** 3))
+            with self.assertRaises(MemoryError):
+                preallocate_memory(model, bm, raise_if_oom=True, fixed_benchmark_cost_gb=100.0)
+
+
+# ---------------------------------------------------------------------------
+# TestMemoryEstimateStr  —  __str__ output
+# ---------------------------------------------------------------------------
+
+class TestMemoryEstimateStr(unittest.TestCase):
+
+    def _make_estimate(self, fixed_cost=None, will_oom=False):
+        """Build a MemoryEstimate directly; will_oom=True sets total 200.0 against available 1.0."""
+        if will_oom:
+            total, available = 200.0, 1.0
+        else:
+            total, available = 1.5, 100.0
+        fields = dict(
+            num_stimuli=100, num_trials=1, num_features=512, num_timebins=1, activation_gb=0.5,
+            total_estimated_gb=total, available_gb=available, fixed_benchmark_cost_gb=fixed_cost,
+        )
+        return MemoryEstimate(**fields)
+
+    def test_str_shows_ok_when_not_oom(self):
+        """The rendered estimate contains '[OK]' when the total is below what is available."""
+        rendered = str(self._make_estimate())
+        self.assertIn('[OK]', rendered)
+
+    def test_str_shows_oom_likely_when_oom(self):
+        """The rendered estimate contains '[OOM LIKELY]' when the total exceeds what is available."""
+        rendered = str(self._make_estimate(will_oom=True))
+        self.assertIn('[OOM LIKELY]', rendered)
+
+    def test_str_shows_calibrated_formula_when_fixed_cost_set(self):
+        """With a fixed cost, the text mentions it and omits the ×_OVERHEAD_FACTOR term."""
+        rendered = str(self._make_estimate(fixed_cost=3.5))
+        self.assertIn('fixed benchmark cost', rendered)
+        self.assertNotIn(f'×{_OVERHEAD_FACTOR}', rendered)
+
+    def test_str_shows_overhead_formula_when_no_fixed_cost(self):
+        """Without a fixed cost, the text shows the ×_OVERHEAD_FACTOR term and no fixed-cost wording."""
+        rendered = str(self._make_estimate(fixed_cost=None))
+        self.assertIn(f'×{_OVERHEAD_FACTOR}', rendered)
+        self.assertNotIn('fixed benchmark cost', rendered)
+
+    def test_str_contains_stimuli_and_features(self):
+        """The stimulus and feature counts appear somewhere in the rendered text."""
+        rendered = str(self._make_estimate())
+        self.assertIn('100', rendered)   # num_stimuli
+        self.assertIn('512', rendered)   # num_features
+
+
+# ---------------------------------------------------------------------------
+# TestCalibratedIntegration  —  full pipeline with a real JSON file
+# ---------------------------------------------------------------------------
+
+class TestCalibratedIntegration(unittest.TestCase):
+    """
+    End-to-end test: save a calibration JSON, then verify preallocate_memory
+    picks it up automatically and produces the correct two-component estimate.
+    """
+
+    def setUp(self):
+        tmp = tempfile.TemporaryDirectory()
+        self.addCleanup(tmp.cleanup)  # delete the directory after each test; mkdtemp left it behind
+        self._tmpdir = tmp.name
+        self._cal_path = os.path.join(self._tmpdir, 'costs.json')
+
+    def test_full_roundtrip_calibrated_estimate(self):
+        n_stimuli = 20
+        n_features = 256
+        fixed_cost = 4.35
+
+        bm = _make_neural_benchmark(n_stimuli=n_stimuli)
+        bm._identifier = 'integration-test-benchmark'
+        model = _make_model(num_features=n_features)
+
+        save_calibration({'integration-test-benchmark': fixed_cost}, self._cal_path)
+
+        with patch('psutil.virtual_memory') as mock_vm, \
+                patch('brainscore_vision.benchmark_helpers.memory._DEFAULT_CALIBRATION_PATH', self._cal_path):
+            mock_vm.return_value.available = 64 * (1024 ** 3)
+            est = preallocate_memory(model, bm, raise_if_oom=False)
+
+        expected_activation = n_stimuli * n_features * 1 * _BYTES_PER_ELEMENT / (1024 ** 3)
+        self.assertAlmostEqual(est.activation_gb, expected_activation, places=6)
+        self.assertAlmostEqual(est.fixed_benchmark_cost_gb, fixed_cost, places=4)
+        self.assertAlmostEqual(est.total_estimated_gb, expected_activation + fixed_cost, places=4)
+        self.assertFalse(est.will_oom)
+
+    def test_score_benchmark_uses_preallocate_memory(self):
+        """score_benchmark must call preallocate_memory before __call__."""
+        bm = _make_neural_benchmark(n_stimuli=5)
+        model = _make_model(num_features=10)
+        score_val = MagicMock()
+        call_order = []
+
+        def _fake_preallocate(self, candidate):
+            call_order.append('preallocate')
+
+        def _fake_call(self, candidate):
+            call_order.append('score')
+            return score_val
+
+        with patch.object(NeuralBenchmark, 'preallocate_memory', _fake_preallocate):
+            with patch.object(NeuralBenchmark, '__call__', _fake_call):
+                score_benchmark(bm, model)
+
+        self.assertEqual(call_order, ['preallocate', 'score'])
+
+
+if __name__ == '__main__':
+    unittest.main()  # run this module's tests via unittest when executed as a script