diff --git a/brainscore_vision/__init__.py b/brainscore_vision/__init__.py index cd7ec541b9..817c25d998 100644 --- a/brainscore_vision/__init__.py +++ b/brainscore_vision/__init__.py @@ -1,4 +1,5 @@ import logging +import os from typing import Dict, Any, Union, Callable from brainscore_core.supported_data_standards.brainio.assemblies import DataAssembly @@ -6,6 +7,7 @@ from brainscore_core.benchmarks import Benchmark from brainscore_core.metrics import Metric, Score +from brainscore_core.benchmarks import score_benchmark from brainscore_core.plugin_management.conda_score import wrap_score from brainscore_core.plugin_management.import_plugin import import_plugin from brainscore_vision.metrics import Ceiling @@ -74,7 +76,21 @@ def _run_score(model_identifier: str, benchmark_identifier: str) -> Score: """ model: BrainModel = load_model(model_identifier) benchmark: Benchmark = load_benchmark(benchmark_identifier) - score: Score = benchmark(model) + try: + score: Score = score_benchmark(benchmark, model) + except AssertionError as e: + cache_dir = os.path.expanduser( + '~/.result_caching/brainscore_vision.model_helpers.activations.core' + '.ActivationsExtractorHelper._from_paths_stored' + ) + raise AssertionError( + f"{e}\n\n" + f"If this is a stale activations cache (cached stimulus paths no longer match " + f"current locations, e.g. 
temp directory changed between runs), fix with:\n" + f" rm {cache_dir}/identifier={model_identifier},stimuli_identifier=*.pkl\n\n" + f"Or to clear the entire activations cache:\n" + f" rm {cache_dir}/*.pkl" + ) from e score.attrs['model_identifier'] = model_identifier score.attrs['benchmark_identifier'] = benchmark_identifier try: # attempt to look up the layer commitment if model uses a standard layer model @@ -104,3 +120,7 @@ def score(model_identifier: str, benchmark_identifier: str, conda_active: bool = return wrap_score(__file__, model_identifier=model_identifier, benchmark_identifier=benchmark_identifier, score_function=_run_score, conda_active=conda_active) + + +# Public re-export so callers can do: from brainscore_vision import preallocate_memory +from brainscore_vision.benchmark_helpers.memory import preallocate_memory # noqa: E402 diff --git a/brainscore_vision/benchmark_helpers/benchmark_costs.json b/brainscore_vision/benchmark_helpers/benchmark_costs.json new file mode 100644 index 0000000000..a740715bd7 --- /dev/null +++ b/brainscore_vision/benchmark_helpers/benchmark_costs.json @@ -0,0 +1,51 @@ +{ + "Allen2022_fmri.IT-ridge": 0.4113, + "Allen2022_fmri.V1-ridge": 0.5544, + "Allen2022_fmri.V2-ridge": 0.7258, + "Allen2022_fmri.V4-ridge": 0.3258, + "Allen2022_fmri_4subj.IT-ridge": 1.49, + "Allen2022_fmri_4subj.V1-ridge": 0.6966, + "Allen2022_fmri_4subj.V2-ridge": 1.3744, + "Allen2022_fmri_4subj.V4-ridge": 0.6729, + "Allen2022_fmri_surface.IT-ridge": 0.6236, + "Allen2022_fmri_surface.V1-ridge": 1.2537, + "Allen2022_fmri_surface.V2-ridge": 1.5052, + "Allen2022_fmri_surface.V4-ridge": 0.6126, + "Allen2022_fmri_surface_4subj.IT-ridge": 0.5683, + "Allen2022_fmri_surface_4subj.V1-ridge": 1.3479, + "Allen2022_fmri_surface_4subj.V2-ridge": 1.6253, + "Allen2022_fmri_surface_4subj.V4-ridge": 0.7984, + "Cadena2017-mask": 4.4728, + "Cadena2017-pls": 6.9435, + "FreemanZiemba2013.V1-pls": 4.0874, + "FreemanZiemba2013.V2-pls": 3.8205, + "FreemanZiemba2013public.V1-pls": 
1.813, + "FreemanZiemba2013public.V2-pls": 0.8573, + "Gifford2022.IT-ridge": 3.6693, + "Gifford2022.IT-ridgecv": 7.3324, + "Hebart2023_fmri.IT-ridge": 16.8948, + "Hebart2023_fmri.IT-ridgecv": 3.8821, + "Hebart2023_fmri.V1-ridge": 4.843, + "Hebart2023_fmri.V1-ridgecv": 8.8359, + "Hebart2023_fmri.V2-ridge": 15.5814, + "Hebart2023_fmri.V2-ridgecv": 12.4336, + "Hebart2023_fmri.V4-ridge": 6.8122, + "Hebart2023_fmri.V4-ridgecv": 12.4122, + "Igustibagus2024-ridge": 2.1582, + "MajajHong2015.IT-pls": 2.8336, + "MajajHong2015.V4-pls": 4.0503, + "MajajHong2015public.IT-pls": 3.5527, + "MajajHong2015public.V4-pls": 4.9696, + "Papale2025.IT-ridge": 43.45, + "Papale2025.IT-ridgecv": 7.22, + "Papale2025.V1-ridge": 15.07, + "Papale2025.V1-ridgecv": 26.58, + "Papale2025.V4-ridge": 40.54, + "Rajalingham2020.IT-pls": 0.45, + "Sanghavi2020.IT-pls": 6.71, + "Sanghavi2020.V4-pls": 8.97, + "SanghaviJozwik2020.IT-pls": 5.41, + "SanghaviJozwik2020.V4-pls": 7.65, + "SanghaviMurty2020.IT-pls": 0.22, + "SanghaviMurty2020.V4-pls": 0.56 +} diff --git a/brainscore_vision/benchmark_helpers/memory.py b/brainscore_vision/benchmark_helpers/memory.py new file mode 100644 index 0000000000..1e82a8ef56 --- /dev/null +++ b/brainscore_vision/benchmark_helpers/memory.py @@ -0,0 +1,512 @@ +""" +Memory estimation utilities for Brain-Score benchmarks. + +Call :func:`preallocate_memory` before scoring to detect OOM errors early, +rather than discovering them 6+ hours into a benchmark run. 
+ +Example usage:: + + from brainscore_vision import load_model, load_benchmark + from brainscore_vision.benchmark_helpers.memory import preallocate_memory + + model = load_model('resnet50') + benchmark = load_benchmark('MajajHong2015public.IT-pls') + estimate = preallocate_memory(model, benchmark) # raises MemoryError if OOM + score = benchmark(model) +""" + +import json +import logging +import os +from dataclasses import dataclass +from typing import Optional + +import psutil + +from brainscore_vision.benchmark_helpers.neural_common import NeuralBenchmark, TrainTestNeuralBenchmark, RSABenchmark, timebins_from_assembly +from brainscore_vision.benchmark_helpers.screen import place_on_screen +from brainscore_vision.model_interface import BrainModel + +_logger = logging.getLogger(__name__) + +# Default path for the persistent calibration table. +# Prefer the file bundled with the package; fall back to the user-local path +# so that a local calibration run (mem_profile_suite.py --calibrate) can +# extend or override the shipped table without touching the source tree. +_BUNDLED_CALIBRATION_PATH = os.path.join(os.path.dirname(__file__), 'benchmark_costs.json') +_DEFAULT_CALIBRATION_PATH = ( + _BUNDLED_CALIBRATION_PATH + if os.path.exists(_BUNDLED_CALIBRATION_PATH) + else os.path.expanduser('~/.brainscore/benchmark_costs.json') +) + +# float32 = 4 bytes per element +_BYTES_PER_ELEMENT = 4 + +# Overhead multiplier on top of the activation assembly size. +# Accounts for xarray coordinate arrays, regression/CV matrices, and +# temporary buffers. Calibrated against MajajHong2015.IT-pls (resnet50, +# no PCA): 1.91 GB assembly → 9.98 GB observed peak delta → 5.2× real +# overhead. Using 6× to stay slightly conservative. +_OVERHEAD_FACTOR = 6 + +# Overhead multiplier applied to the activation array for PLS benchmarks. +# PLS regression builds cross-covariance matrices of shape +# (num_features × num_neuroids) whose memory scales with the model's feature +# count. 
# PLS overhead multiplier (applied to the activation array).
# The calibrated fixed_benchmark_cost is NOT model-independent for PLS — it was
# measured on alexnet (~9K features) and severely underestimates for
# large-feature models (200K+ features).
#
# Formula for PLS: total = activation_gb × _PLS_OVERHEAD_FACTOR + fixed_cost_gb
# where fixed_cost_gb covers the neural-assembly side (truly model-independent).
#
# Validated against a 3-model × 2-PLS-benchmark grid:
#   worst miss after fix: resnet50 × Cadena2017-pls → -12.7% (within 15%)
_PLS_OVERHEAD_FACTOR = 7

# User-local calibration table. Calibration runs write here so that the table
# shipped inside the package is never modified; entries in this file override
# the shipped ones when the default table is loaded.
_USER_CALIBRATION_PATH = os.path.expanduser('~/.brainscore/benchmark_costs.json')


@dataclass
class MemoryEstimate:
    """Breakdown of the estimated memory footprint for a benchmark run."""
    num_stimuli: int
    num_trials: int
    num_features: int
    num_timebins: int
    activation_gb: float  # activation array only
    total_estimated_gb: float  # result of the formula named by ``formula_type``
    available_gb: float
    fixed_benchmark_cost_gb: Optional[float] = None  # None → no calibrated/explicit cost was available
    is_pls: bool = False  # True → PLS formula was used (activation × _PLS_OVERHEAD_FACTOR + fixed_cost)
    # formula_type: 'pls' | 'rdm' | 'ridge_large_feature' | 'ridge_formula' | 'calibrated' | 'fallback'
    formula_type: str = 'fallback'
    rdm_overhead_gb: Optional[float] = None  # overhead term used in the RDM and ridge-formula paths

    @property
    def will_oom(self) -> bool:
        """True when the estimate exceeds the RAM that was available at probe time."""
        return self.total_estimated_gb > self.available_gb

    def _formula_description(self) -> str:
        """Render the formula behind ``total_estimated_gb`` as human-readable text."""
        activations = f"{self.activation_gb:.2f} GB activations"
        kind = self.formula_type
        if kind == 'pls':
            # ``is not None`` (not truthiness) so an explicit 0.0 cost is still shown,
            # matching the pre-flight report printed by preallocate_memory.
            fixed_str = (f" + {self.fixed_benchmark_cost_gb:.2f} GB fixed cost"
                         if self.fixed_benchmark_cost_gb is not None else "")
            return f"{activations} ×{_PLS_OVERHEAD_FACTOR} (PLS){fixed_str}"
        if kind == 'rdm':
            return (f"{activations} "
                    f"×3 (RDM pairwise distance overhead → {self.total_estimated_gb:.1f} GB total)")
        if kind == 'ridge_large_feature':
            return (f"{activations} "
                    f"×{_OVERHEAD_FACTOR} (ridge SVD path: n_features > n_stimuli → "
                    f"{self.total_estimated_gb:.1f} GB total)")
        if kind == 'ridge_formula' and self.rdm_overhead_gb is not None:
            return (f"{activations} "
                    f"+ {self.rdm_overhead_gb:.2f} GB gram matrix ({self.num_stimuli}²×4B)")
        if kind == 'calibrated' and self.fixed_benchmark_cost_gb is not None:
            return (f"{activations} "
                    f"+ {self.fixed_benchmark_cost_gb:.2f} GB fixed benchmark cost (calibrated)")
        if kind in ('ridge_formula', 'calibrated'):
            # The term this formula needs is missing (possible when the dataclass is
            # built by hand). Describe what is known instead of failing to format None.
            return f"{activations} (→ {self.total_estimated_gb:.1f} GB total)"
        return (f"{self.activation_gb:.2f} GB "
                f"(×{_OVERHEAD_FACTOR} overhead → {self.total_estimated_gb:.1f} GB total)")

    def __str__(self) -> str:
        status = "OOM LIKELY" if self.will_oom else "OK"
        return (
            f"[{status}] Memory estimate: {self.total_estimated_gb:.1f} GB needed, "
            f"{self.available_gb:.1f} GB available\n"
            f" Activations: {self.num_stimuli} stimuli × {self.num_features:,} features "
            f"× {self.num_timebins} timebins = {self._formula_description()}"
        )


def _read_calibration_file(path: str) -> dict:
    """Read one calibration JSON file; return ``{}`` if it is missing or unusable."""
    try:
        with open(path, encoding='utf-8') as f:
            table = json.load(f)
    except FileNotFoundError:
        return {}
    except Exception as e:
        # Best effort: a corrupt table must not prevent scoring.
        _logger.warning(f"Could not load calibration from {path}: {e}")
        return {}
    if not isinstance(table, dict):
        _logger.warning(f"Could not load calibration from {path}: expected a JSON object")
        return {}
    return table


def load_calibration(path: Optional[str] = None) -> dict:
    """Load the benchmark fixed-cost table from disk.

    Parameters
    ----------
    path : str, optional
        File to read. When omitted, the default table
        (``_DEFAULT_CALIBRATION_PATH``) is read and then overlaid with the
        user-local table ``~/.brainscore/benchmark_costs.json``, so a local
        calibration run extends or overrides the shipped values.

    Returns
    -------
    dict
        ``{benchmark_identifier: fixed_cost_gb}``. Empty if no table exists
        yet or the file cannot be parsed (a warning is logged in that case).
        The file is written by :func:`save_calibration` (or by the
        ``--calibrate`` mode of ``mem_profile_suite.py``).
    """
    if path:
        return _read_calibration_file(path)
    costs = _read_calibration_file(_DEFAULT_CALIBRATION_PATH)
    if _USER_CALIBRATION_PATH != _DEFAULT_CALIBRATION_PATH:
        costs.update(_read_calibration_file(_USER_CALIBRATION_PATH))
    return costs


def save_calibration(costs: dict, path: Optional[str] = None) -> None:
    """Persist benchmark fixed costs to disk.

    Parameters
    ----------
    costs : dict
        ``{benchmark_identifier: fixed_cost_gb}`` mapping produced by a
        calibration run.
    path : str, optional
        Destination file. Defaults to ``~/.brainscore/benchmark_costs.json``.
    """
    path = path or _USER_CALIBRATION_PATH
    parent = os.path.dirname(path)
    if parent:
        # dirname is '' for a bare filename, and os.makedirs('') raises.
        os.makedirs(parent, exist_ok=True)
    # Write to a sibling temp file and swap it in, so a crash mid-write cannot
    # leave a truncated table behind (calibration saves incrementally).
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(costs, f, indent=2, sort_keys=True)
    os.replace(tmp_path, path)
    _logger.info(f"Calibration saved → {path} ({len(costs)} benchmarks)")
def _is_pls_benchmark(benchmark) -> bool:
    """Return True if the benchmark uses PLS regression.

    PLS cross-covariance matrices scale with num_features, so the calibrated
    fixed_benchmark_cost (measured on alexnet with ~9K features) does not
    generalise to large-feature models. A dedicated PLS overhead formula is
    applied instead. Detection is based on the identifier: it ends with
    ``-pls`` or ``-reverse_pls``, or contains ``-temporal-pls``. An object
    without an ``identifier`` attribute is treated as non-PLS.
    """
    ident = str(getattr(benchmark, 'identifier', ''))
    return ident.endswith(('-pls', '-reverse_pls')) or '-temporal-pls' in ident


def _is_rdm_benchmark(benchmark) -> bool:
    """Return True if the benchmark uses RDM/RSA.

    RDM overhead scales with activation size (≈ 2× activation_gb), not purely
    with n_stimuli². Detected via RSABenchmark instance type or the ``-rdm``
    identifier suffix.
    """
    if isinstance(benchmark, RSABenchmark):
        return True
    return str(getattr(benchmark, 'identifier', '')).endswith('-rdm')


def _is_ridge_benchmark(benchmark) -> bool:
    """Return True if the benchmark uses ridge or ridgecv regression.

    The gram matrix for ridge is n_stimuli × n_stimuli — model-independent —
    so a formula-based estimate is possible when no calibration entry exists.
    RSABenchmark instances are explicitly excluded: they are always treated
    as RDM, never ridge.
    """
    if isinstance(benchmark, RSABenchmark):
        return False
    ident = str(getattr(benchmark, 'identifier', ''))
    return ident.endswith(('-ridge', '-ridgecv'))


def _first_layer(layers):
    """Reduce a region's committed-layer entry to one layer, or None if it has none."""
    if isinstance(layers, (list, tuple)):
        return layers[0] if layers else None
    return layers


def _get_probe_layer(model):
    """
    Return the committed layer string for the model's primary recording region,
    or None if it cannot be determined without triggering expensive layer selection.

    Regions are tried in the order IT, V4, V2, V1; a region whose entry is None
    or an empty sequence is skipped. If none of them is committed and the map is
    a plain ``dict``, the first usable value of any region is returned. Any
    error during the lookup yields None so the caller can take its slower path.
    """
    try:
        # Navigate ModelCommitment → TemporalAligned → LayerMappedModel
        layer_model = getattr(model, 'layer_model', None)
        if layer_model is not None and hasattr(layer_model, '_layer_model'):
            layer_model = layer_model._layer_model  # TemporalAligned → LayerMappedModel
        if layer_model is None:
            layer_model = model  # might itself be LayerMappedModel-like

        region_map = getattr(layer_model, 'region_layer_map', None)
        if region_map is None:
            return None

        # Prefer IT, then any committed region.
        # Use dict.__contains__/__getitem__ to avoid triggering the lazy
        # RegionLayerMap.__getitem__ (per the original author's note).
        for candidate_region in ('IT', 'V4', 'V2', 'V1'):
            if dict.__contains__(region_map, candidate_region):
                layer = _first_layer(dict.__getitem__(region_map, candidate_region))
                if layer is not None:
                    return layer

        # If it's a plain dict (not lazy RegionLayerMap), just grab any usable value
        if type(region_map) is dict:
            for layers in region_map.values():
                layer = _first_layer(layers)
                if layer is not None:
                    return layer

    except Exception:
        # Deliberate best effort: an unexpected model structure (e.g. a
        # region map that is not a dict subclass) means "unknown", not failure.
        pass
    return None
def _probe_metadata(benchmark):
    """Collect what the memory probe needs from a supported neural benchmark.

    Returns ``(stimulus_set, num_stimuli, num_trials, timebins, region,
    visual_degrees)``, or None when *benchmark* is none of the supported types.
    """
    if isinstance(benchmark, NeuralBenchmark):
        stimulus_set = benchmark._assembly.stimulus_set
        num_stimuli = int(stimulus_set['stimulus_id'].nunique())
        timebins = benchmark.timebins
    elif isinstance(benchmark, TrainTestNeuralBenchmark):
        train_ss = benchmark.train_assembly.stimulus_set
        test_ss = benchmark.test_assembly.stimulus_set
        stimulus_set = train_ss  # the probe image is taken from the train set
        num_stimuli = int(train_ss['stimulus_id'].nunique()) + int(test_ss['stimulus_id'].nunique())
        timebins = benchmark.timebins
    elif isinstance(benchmark, RSABenchmark):
        stimulus_set = benchmark._assembly.stimulus_set
        num_stimuli = int(stimulus_set['stimulus_id'].nunique())
        timebins = timebins_from_assembly(benchmark._assembly)
    else:
        return None
    return (stimulus_set, num_stimuli, benchmark._number_of_trials, timebins,
            benchmark.region, benchmark._visual_degrees)


def _choose_memory_formula(benchmark, activation_gb, num_stimuli, num_features,
                           fixed_benchmark_cost_gb):
    """Pick the estimation formula for the benchmark's regression type.

    Returns ``(total_estimated_gb, formula_type, rdm_overhead_gb)``.

    PLS: cross-covariance matrices scale with num_features — use
    activation × _PLS_OVERHEAD_FACTOR (+ fixed cost if known). Approximate.

    RDM/RSA: pairwise distance computation passes through the full activation
    matrix — overhead ≈ 2× activation_gb, i.e. 3× total.

    Ridge/RidgeCV, n_features > n_stimuli: per the original notes sklearn
    switches to an SVD of X, and the calibrated fixed cost (measured on a small
    model) severely underestimates, so the ×_OVERHEAD_FACTOR estimate is used
    even when a calibration entry exists.

    Ridge/RidgeCV otherwise: calibrated fixed cost if available, else the
    n_stimuli × n_stimuli gram-matrix formula.

    Anything else: calibrated fixed cost if available, else ×_OVERHEAD_FACTOR.
    """
    if _is_pls_benchmark(benchmark):
        total = activation_gb * _PLS_OVERHEAD_FACTOR + (fixed_benchmark_cost_gb or 0.0)
        return total, 'pls', None
    if _is_rdm_benchmark(benchmark):
        rdm_overhead_gb = 2 * activation_gb
        return activation_gb + rdm_overhead_gb, 'rdm', rdm_overhead_gb  # = 3 × activation_gb
    is_ridge = _is_ridge_benchmark(benchmark)
    if is_ridge and num_features > num_stimuli:
        return activation_gb * _OVERHEAD_FACTOR, 'ridge_large_feature', None
    if fixed_benchmark_cost_gb is not None:
        return activation_gb + fixed_benchmark_cost_gb, 'calibrated', None
    if is_ridge:
        # No calibration entry, primal regime: gram matrix is n_stimuli×n_stimuli
        gram_gb = (num_stimuli ** 2) * _BYTES_PER_ELEMENT / (1024 ** 3)
        return activation_gb + gram_gb, 'ridge_formula', gram_gb
    return activation_gb * _OVERHEAD_FACTOR, 'fallback', None


def _preflight_report(estimate) -> str:
    """Render the human-readable ``[pre-flight]`` report for *estimate*."""
    verdict = "OOM LIKELY" if estimate.will_oom else "OK"
    formula_type = estimate.formula_type
    total_str = f" = {estimate.total_estimated_gb:.3f} GB total"
    report = (
        f"[pre-flight] [{verdict}] "
        f"{estimate.total_estimated_gb:.2f} GB needed / {estimate.available_gb:.1f} GB available "
        f"[{formula_type}]\n"
        f" {estimate.num_stimuli:,} stimuli × {estimate.num_features:,} features × "
        f"{estimate.num_timebins} timebins = {estimate.activation_gb:.3f} GB activation"
    )
    if formula_type == 'pls':
        fixed_str = (f" + {estimate.fixed_benchmark_cost_gb:.3f} GB fixed cost"
                     if estimate.fixed_benchmark_cost_gb is not None else "")
        report += (
            f" ×{_PLS_OVERHEAD_FACTOR} (PLS){fixed_str}{total_str}\n"
            f"[pre-flight] WARNING: PLS overhead multiplier (×{_PLS_OVERHEAD_FACTOR}) is approximate. "
            f"Actual usage can vary significantly depending on model feature count and convergence."
        )
    elif formula_type == 'ridge_large_feature':
        report += (f" ×{_OVERHEAD_FACTOR} (ridge SVD: n_features={estimate.num_features:,} "
                   f"> n_stimuli={estimate.num_stimuli:,}){total_str}")
    elif formula_type == 'rdm':
        report += f" ×3 (RDM pairwise overhead){total_str}"
    elif formula_type == 'ridge_formula':
        report += (f" + {estimate.rdm_overhead_gb:.3f} GB gram matrix ({estimate.num_stimuli:,}²×4B) "
                   f"[no calibration entry — formula estimate]{total_str}")
    elif formula_type == 'calibrated':
        report += (f" + {estimate.fixed_benchmark_cost_gb:.3f} GB benchmark overhead (calibrated)"
                   f"{total_str}")
    else:
        report += f" ×{_OVERHEAD_FACTOR}{total_str}"
    return report


def preallocate_memory(
    model: BrainModel,
    benchmark,
    raise_if_oom: bool = True,
    fixed_benchmark_cost_gb: Optional[float] = None,
) -> Optional[MemoryEstimate]:
    """
    Estimate memory requirements before running a full benchmark.

    Probes the model with a single stimulus to get the actual feature count.
    When possible the probe calls the activations extractor's ``_from_paths``
    directly, bypassing ``from_stimulus_set`` / ``attach_stimulus_set_meta`` so
    that the probe cannot interfere with the subsequent scoring run's result
    cache. Otherwise it falls back to ``start_recording`` + ``look_at``.

    The activation size is ``num_stimuli × num_features × num_timebins × 4 bytes``;
    the total adds a formula-specific overhead chosen from the benchmark's
    regression type (PLS, RDM, ridge, calibrated fixed cost, or a generic
    multiplier).

    num_trials is intentionally excluded: deterministic models process each
    unique stimulus once; the trial dimension in the neural assembly does not
    scale model memory.

    The report is printed to stdout together with a single-line
    ``BRAINSCORE_PREFLIGHT {json}`` sentinel for log-based tooling.

    Parameters
    ----------
    model : BrainModel
        The candidate model that will be scored.
    benchmark : NeuralBenchmark, TrainTestNeuralBenchmark or RSABenchmark
        The benchmark the model will be scored on.
    raise_if_oom : bool, optional
        If ``True`` (default), raises :exc:`MemoryError` when the estimate
        exceeds available RAM. If ``False``, logs a warning instead.
    fixed_benchmark_cost_gb : float, optional
        Benchmark overhead in GB. When omitted, the value is looked up by
        ``benchmark.identifier`` in the table from :func:`load_calibration`
        and stays ``None`` if there is no entry.

    Returns
    -------
    MemoryEstimate or None
        Estimated memory breakdown with a ``.will_oom`` property. ``None`` when
        the environment variable ``BRAINSCORE_SKIP_MEMORY_CHECK`` is ``'1'`` or
        when *benchmark* is not one of the supported neural benchmark types
        (no exception is raised in that case).

    Raises
    ------
    MemoryError
        If ``raise_if_oom=True`` and estimated memory exceeds available RAM.
    """
    if os.environ.get('BRAINSCORE_SKIP_MEMORY_CHECK', '0') == '1':
        _logger.debug("BRAINSCORE_SKIP_MEMORY_CHECK is set — skipping memory pre-check.")
        return None

    # ------------------------------------------------------------------ #
    # 1. Extract metadata from the benchmark                              #
    # ------------------------------------------------------------------ #
    metadata = _probe_metadata(benchmark)
    if metadata is None:
        # Unsupported benchmark type (e.g. behavioral/engineering called directly).
        # Return None rather than crashing so direct calls from scripts do not
        # raise unexpectedly.
        _logger.debug(
            f"preallocate_memory: unsupported benchmark type {type(benchmark).__name__}, skipping."
        )
        return None
    stimulus_set, num_stimuli, num_trials, timebins, region, visual_degrees = metadata

    # ------------------------------------------------------------------ #
    # 2. Prepare probe stimulus (1 image, visual-degree corrected)        #
    # ------------------------------------------------------------------ #
    probe_set = stimulus_set.iloc[:1].copy()
    probe_set.identifier = None
    probe_set = place_on_screen(
        probe_set,
        target_visual_degrees=model.visual_degrees(),
        source_visual_degrees=visual_degrees,
    )
    probe_stimulus_id = probe_set['stimulus_id'].values[0]
    probe_path = str(probe_set.get_stimulus(probe_stimulus_id))

    # ------------------------------------------------------------------ #
    # 3. Probe the model with 1 stimulus                                  #
    #                                                                    #
    # _from_paths is called directly — bypassing from_stimulus_set and   #
    # attach_stimulus_set_meta — so the probe cannot corrupt the         #
    # activations cache used by the subsequent scoring run.              #
    #                                                                    #
    # LayerPCA is NOT disabled: the probe should measure the feature     #
    # count exactly as the scoring run will accumulate it (i.e. after    #
    # PCA reduction when PCA is hooked, raw otherwise).                  #
    # ------------------------------------------------------------------ #
    activations_model = getattr(model, 'activations_model', None)
    extractor = getattr(activations_model, '_extractor', None) if activations_model else None
    # NOTE(review): _get_probe_layer picks a layer by a fixed region preference
    # (IT first) rather than by this benchmark's `region` — confirm the probed
    # feature count matches the layer that scoring will actually record.
    probe_layer = _get_probe_layer(model) if extractor is not None else None

    if extractor is not None and probe_layer is not None:
        # Fast path: call _from_paths directly — no attach_stimulus_set_meta
        probe_output = extractor._from_paths(layers=[probe_layer], stimuli_paths=[probe_path])
        num_features = probe_output.sizes['neuroid']
        num_timebins = len(timebins)  # _from_paths has no time expansion; timebins from benchmark
    else:
        # Fallback: use the standard look_at pipeline
        model.start_recording(region, time_bins=timebins)
        probe_output = model.look_at(probe_set, number_of_trials=1)
        num_features = probe_output.sizes['neuroid']
        num_timebins = probe_output.sizes.get('time_bin', 1)

    _logger.info(
        f"Memory probe: benchmark={benchmark.identifier} region={region} "
        f"stimuli={num_stimuli} features={num_features} timebins={num_timebins}"
    )

    # ------------------------------------------------------------------ #
    # 4. Compute estimate and check against available RAM                 #
    # ------------------------------------------------------------------ #
    activation_bytes = num_stimuli * num_features * num_timebins * _BYTES_PER_ELEMENT
    activation_gb = activation_bytes / (1024 ** 3)

    # Auto-load from the calibration table if no explicit value was given
    if fixed_benchmark_cost_gb is None:
        fixed_benchmark_cost_gb = load_calibration().get(benchmark.identifier)
        if fixed_benchmark_cost_gb is not None:
            _logger.debug(
                f"Using calibrated fixed cost for {benchmark.identifier}: "
                f"{fixed_benchmark_cost_gb:.3f} GB"
            )

    total_estimated_gb, formula_type, rdm_overhead_gb = _choose_memory_formula(
        benchmark, activation_gb, num_stimuli, num_features, fixed_benchmark_cost_gb)

    available_gb = psutil.virtual_memory().available / (1024 ** 3)

    estimate = MemoryEstimate(
        num_stimuli=num_stimuli,
        num_trials=num_trials,
        num_features=num_features,
        num_timebins=num_timebins,
        activation_gb=activation_gb,
        total_estimated_gb=total_estimated_gb,
        available_gb=available_gb,
        fixed_benchmark_cost_gb=fixed_benchmark_cost_gb,
        is_pls=formula_type == 'pls',  # the PLS check has first priority in formula choice
        formula_type=formula_type,
        rdm_overhead_gb=rdm_overhead_gb,
    )

    print(_preflight_report(estimate), flush=True)

    # Structured sentinel for CloudWatch Insights calibration queries and reliable
    # OOM signal parsing by the scoring orchestrator. Every pre-flight run emits
    # this line regardless of outcome — filter on will_oom=true for OOM cases.
    # Query example: filter @message like "BRAINSCORE_PREFLIGHT"
    #                | stats avg(estimate_gb) by benchmark_id, formula_type
    sentinel = {
        'estimate_gb': round(total_estimated_gb, 3),
        'available_gb': round(available_gb, 1),
        'formula_type': formula_type,
        'will_oom': estimate.will_oom,
        'num_features': num_features,
        'num_stimuli': num_stimuli,
    }
    print(f"BRAINSCORE_PREFLIGHT {json.dumps(sentinel)}", flush=True)

    if estimate.will_oom:
        msg = (
            f"preallocate_memory: {str(estimate)}. "
            f"Consider reducing layer output dimensionality (e.g. via LayerPCA), "
            f"running on a machine with more RAM, or selecting a different layer."
        )
        if raise_if_oom:
            raise MemoryError(msg)
        _logger.warning(msg)

    return estimate
@@ -247,6 +255,10 @@ def __init__( bibtex=bibtex, ) + def preallocate_memory(self, candidate: BrainModel) -> None: + from brainscore_vision.benchmark_helpers.memory import preallocate_memory as _probe + _probe(candidate, self) + def __call__(self, candidate: BrainModel) -> Score: assembly = self._assembly timebins = timebins_from_assembly(assembly) diff --git a/brainscore_vision/benchmarks/__init__.py b/brainscore_vision/benchmarks/__init__.py index 98c3ca69e1..e6762b5c1a 100644 --- a/brainscore_vision/benchmarks/__init__.py +++ b/brainscore_vision/benchmarks/__init__.py @@ -38,6 +38,18 @@ def __call__(self, candidate: BrainModel) -> Score: """ raise NotImplementedError() + def preallocate_memory(self, candidate: BrainModel) -> None: + """ + Optional pre-flight memory check before scoring. Neural benchmarks override + this to raise :exc:`MemoryError` early if the model is estimated to exceed + available RAM. Behavioral and engineering benchmarks use this no-op default + since they do not run activation extraction and rarely require pre-flight + checks. Note: if a behavioral benchmark does OOM (OS kill, exit 137), the + gated scoring orchestrator will not detect it as an OOM and will not + automatically escalate the tier. + """ + pass + @property def bibtex(self) -> str: """ diff --git a/scripts/mem_profile_suite.py b/scripts/mem_profile_suite.py new file mode 100644 index 0000000000..9c8d36d620 --- /dev/null +++ b/scripts/mem_profile_suite.py @@ -0,0 +1,1176 @@ +""" +Memory Profile Suite +==================== +Two modes: a 5×5 accuracy grid and a full benchmark calibration run. + +BACKGROUND +---------- +Before scoring a model on a benchmark, we want to estimate whether there is +enough RAM to complete the run without OOM-killing the process. 
The estimate +has two components: + + total_memory_needed = activation_gb + fixed_benchmark_cost_gb + + • activation_gb — the raw model output array + (stimuli × features × timebins × 4 bytes) + measured by a cheap 1-stimulus forward pass (the "probe") + • fixed_benchmark_cost — the benchmark's model-independent overhead + (regression matrices, xarray bookkeeping, CV buffers) + calibrated once per benchmark via --calibrate and + stored in ~/.brainscore/benchmark_costs.json + +The fixed cost is environment-specific (calibrate on the same machine you score on). + +MODE 1 — 5×5 accuracy grid (default) +-------------------------------------- +Runs 5 models × 5 benchmarks, compares the pre-flight estimate to the actual +peak RSS delta for each pair. Good for validating the estimation system. + + python scripts/mem_profile_suite.py [--csv out.csv] [--skip-score] + + --skip-score probe only, skip actual scoring + --csv PATH write per-pair results to CSV (flushed after each pair) + +MODE 2 — Benchmark calibration (--calibrate) +--------------------------------------------- +Runs alexnet on every known benchmark to measure fixed_benchmark_cost per +benchmark. Results are saved incrementally to ~/.brainscore/benchmark_costs.json +so a crash mid-run does not lose completed work. Non-neural benchmarks +(behavioral, engineering) are skipped automatically. 
+ + python scripts/mem_profile_suite.py --calibrate [--csv out.csv] + [--calibration-json PATH] + [--resume-from BENCHMARK_ID] + + --resume-from BID skip all benchmarks up to and including BID, + then continue — use this after a crash to pick up + where you left off +""" +import os +import sys +import time +import argparse +import csv +import logging +import threading + +# --------------------------------------------------------------------------- +# Resolve local repos so the script works without installation +# --------------------------------------------------------------------------- +_script_dir = os.path.dirname(os.path.abspath(__file__)) +_vision_root = os.path.dirname(_script_dir) +_core_root = os.path.join(os.path.dirname(_vision_root), 'core') +for _p in [_vision_root, _core_root]: + if _p not in sys.path: + sys.path.insert(0, _p) + +logging.basicConfig(level=logging.WARNING) + +print("Importing brainscore_vision... ", end='', flush=True) +import brainscore_vision # noqa: E402 +print("done.", flush=True) + +import psutil # noqa: E402 + +# --------------------------------------------------------------------------- +# Model / benchmark lists +# --------------------------------------------------------------------------- +MODELS = [ + 'resnet50_tutorial', + 'alexnet', + 'vit_large_patch14_clip_224:openai_ft_in1k', + 'VOneCORnet-S', + 'efficientnet_b0', +] + +BENCHMARKS = [ + 'MajajHong2015.IT-pls', + 'Sanghavi2020.IT-pls', + 'Papale2025.IT-ridgecv', + 'Hebart2023_fmri.V4-ridgecv', + 'Allen2022_fmri.IT-ridge', +] + +_BM_SHORT = { + 'MajajHong2015.IT-pls': 'MajajHong.IT', + 'Sanghavi2020.IT-pls': 'Sanghavi.IT', + 'Papale2025.IT-ridgecv': 'Papale25.IT', + 'Hebart2023_fmri.V4-ridgecv': 'Hebart23.V4', + 'Allen2022_fmri.IT-ridge': 'Allen22.IT', +} + +# All registered leaf benchmarks — used by --calibrate mode. 
ALL_BENCHMARKS = [
    # Allen2022 fMRI (volumetric)
    'Allen2022_fmri.V1-ridge', 'Allen2022_fmri.V2-ridge',
    'Allen2022_fmri.V4-ridge', 'Allen2022_fmri.IT-ridge',
    'Allen2022_fmri.V1-rdm', 'Allen2022_fmri.V2-rdm',
    'Allen2022_fmri.V4-rdm', 'Allen2022_fmri.IT-rdm',
    'Allen2022_fmri_4subj.V1-ridge', 'Allen2022_fmri_4subj.V2-ridge',
    'Allen2022_fmri_4subj.V4-ridge', 'Allen2022_fmri_4subj.IT-ridge',
    'Allen2022_fmri_4subj.V1-rdm', 'Allen2022_fmri_4subj.V2-rdm',
    'Allen2022_fmri_4subj.V4-rdm', 'Allen2022_fmri_4subj.IT-rdm',
    # Allen2022 fMRI (surface)
    'Allen2022_fmri_surface.V1-ridge', 'Allen2022_fmri_surface.V2-ridge',
    'Allen2022_fmri_surface.V4-ridge', 'Allen2022_fmri_surface.IT-ridge',
    'Allen2022_fmri_surface.V1-rdm', 'Allen2022_fmri_surface.V2-rdm',
    'Allen2022_fmri_surface.V4-rdm', 'Allen2022_fmri_surface.IT-rdm',
    'Allen2022_fmri_surface_4subj.V1-ridge', 'Allen2022_fmri_surface_4subj.V2-ridge',
    'Allen2022_fmri_surface_4subj.V4-ridge', 'Allen2022_fmri_surface_4subj.IT-ridge',
    'Allen2022_fmri_surface_4subj.V1-rdm', 'Allen2022_fmri_surface_4subj.V2-rdm',
    'Allen2022_fmri_surface_4subj.V4-rdm', 'Allen2022_fmri_surface_4subj.IT-rdm',
    # Baker2022
    'Baker2022frankenstein-accuracy_delta',
    'Baker2022fragmented-accuracy_delta',
    'Baker2022inverted-accuracy_delta',
    # BMD2024
    'BMD2024.texture_1Behavioral-accuracy_distance',
    'BMD2024.texture_2Behavioral-accuracy_distance',
    'BMD2024.dotted_1Behavioral-accuracy_distance',
    'BMD2024.dotted_2Behavioral-accuracy_distance',
    # Bracci2019
    'Bracci2019.anteriorVTC-rdm',
    # Cadena2017
    'Cadena2017-pls', 'Cadena2017-mask',
    # Coggan2024
    'tong.Coggan2024_fMRI.V1-rdm', 'tong.Coggan2024_fMRI.V2-rdm',
    'tong.Coggan2024_fMRI.V4-rdm', 'tong.Coggan2024_fMRI.IT-rdm',
    'tong.Coggan2024_behavior-ConditionWiseAccuracySimilarity',
    # Ferguson2024
    'Ferguson2024circle_line-value_delta', 'Ferguson2024color-value_delta',
    'Ferguson2024convergence-value_delta', 'Ferguson2024eighth-value_delta',
    'Ferguson2024gray_easy-value_delta', 'Ferguson2024gray_hard-value_delta',
    'Ferguson2024half-value_delta', 'Ferguson2024juncture-value_delta',
    'Ferguson2024lle-value_delta', 'Ferguson2024llh-value_delta',
    'Ferguson2024quarter-value_delta', 'Ferguson2024round_f-value_delta',
    'Ferguson2024round_v-value_delta', 'Ferguson2024tilted_line-value_delta',
    # FreemanZiemba2013
    'FreemanZiemba2013.V1-pls', 'FreemanZiemba2013.V2-pls',
    'FreemanZiemba2013public.V1-pls', 'FreemanZiemba2013public.V2-pls',
    # Geirhos2021
    'Geirhos2021colour-top1', 'Geirhos2021colour-error_consistency',
    'Geirhos2021contrast-top1', 'Geirhos2021contrast-error_consistency',
    'Geirhos2021cueconflict-top1', 'Geirhos2021cueconflict-error_consistency',
    'Geirhos2021edge-top1', 'Geirhos2021edge-error_consistency',
    'Geirhos2021eidolonI-top1', 'Geirhos2021eidolonI-error_consistency',
    'Geirhos2021eidolonII-top1', 'Geirhos2021eidolonII-error_consistency',
    'Geirhos2021eidolonIII-top1', 'Geirhos2021eidolonIII-error_consistency',
    'Geirhos2021falsecolour-top1', 'Geirhos2021falsecolour-error_consistency',
    'Geirhos2021highpass-top1', 'Geirhos2021highpass-error_consistency',
    'Geirhos2021lowpass-top1', 'Geirhos2021lowpass-error_consistency',
    'Geirhos2021phasescrambling-top1', 'Geirhos2021phasescrambling-error_consistency',
    'Geirhos2021powerequalisation-top1', 'Geirhos2021powerequalisation-error_consistency',
    'Geirhos2021rotation-top1', 'Geirhos2021rotation-error_consistency',
    'Geirhos2021silhouette-top1', 'Geirhos2021silhouette-error_consistency',
    'Geirhos2021sketch-top1', 'Geirhos2021sketch-error_consistency',
    'Geirhos2021stylized-top1', 'Geirhos2021stylized-error_consistency',
    'Geirhos2021uniformnoise-top1', 'Geirhos2021uniformnoise-error_consistency',
    # Gifford2022
    'Gifford2022.IT-ridge', 'Gifford2022.IT-ridgecv',
    # Hebart2023
    'Hebart2023-match',
    'Hebart2023_fmri.V1-ridge', 'Hebart2023_fmri.V2-ridge',
    'Hebart2023_fmri.V4-ridge', 'Hebart2023_fmri.IT-ridge',
    'Hebart2023_fmri.V1-ridgecv', 'Hebart2023_fmri.V2-ridgecv',
    'Hebart2023_fmri.V4-ridgecv', 'Hebart2023_fmri.IT-ridgecv',
    # Hermann2020
    'Hermann2020cueconflict-shape_bias', 'Hermann2020cueconflict-shape_match',
    # Igustibagus2024
    'Igustibagus2024-ridge', 'Igustibagus2024.IT_readout-accuracy',
    # ImageNet
    'ImageNet-top1',
    'ImageNet-C-noise-top1', 'ImageNet-C-blur-top1',
    'ImageNet-C-weather-top1', 'ImageNet-C-digital-top1',
    # Islam2021
    'Islam2021-shape_v1_dimensionality', 'Islam2021-texture_v1_dimensionality',
    'Islam2021-shape_v2_dimensionality', 'Islam2021-texture_v2_dimensionality',
    'Islam2021-shape_v4_dimensionality', 'Islam2021-texture_v4_dimensionality',
    'Islam2021-shape_it_dimensionality', 'Islam2021-texture_it_dimensionality',
    # Kar2019
    'Kar2019-ost',
    # Lonnqvist2024
    'Lonnqvist2024_InlabInstructionsBehavioralAccuracyDistance',
    'Lonnqvist2024_InlabNoInstructionsBehavioralAccuracyDistance',
    'Lonnqvist2024_OnlineNoInstructionsBehavioralAccuracyDistance',
    'Lonnqvist2024_EngineeringAccuracy',
    # MajajHong2015
    'MajajHong2015.V4-pls', 'MajajHong2015.IT-pls',
    'MajajHong2015public.V4-pls', 'MajajHong2015public.IT-pls',
    'MajajHong2015public.V4-temporal-pls', 'MajajHong2015public.IT-temporal-pls',
    'MajajHong2015public.V4-reverse_pls', 'MajajHong2015public.IT-reverse_pls',
    # Malania2007
    'Malania2007.short2-threshold_elevation', 'Malania2007.short4-threshold_elevation',
    'Malania2007.short6-threshold_elevation', 'Malania2007.short8-threshold_elevation',
    'Malania2007.short16-threshold_elevation', 'Malania2007.equal2-threshold_elevation',
    'Malania2007.long2-threshold_elevation', 'Malania2007.equal16-threshold_elevation',
    'Malania2007.long16-threshold_elevation', 'Malania2007.vernieracuity-threshold',
    # Maniquet2024
    'Maniquet2024-confusion_similarity', 'Maniquet2024-tasks_consistency',
    # Marques2020
    'Marques2020_Cavanaugh2002-grating_summation_field',
    'Marques2020_Cavanaugh2002-surround_diameter',
    'Marques2020_Cavanaugh2002-surround_suppression_index',
    'Marques2020_DeValois1982-pref_or',
    'Marques2020_DeValois1982-peak_sf',
    'Marques2020_FreemanZiemba2013-texture_modulation_index',
    'Marques2020_FreemanZiemba2013-abs_texture_modulation_index',
    'Marques2020_FreemanZiemba2013-texture_selectivity',
    'Marques2020_FreemanZiemba2013-texture_sparseness',
    'Marques2020_FreemanZiemba2013-texture_variance_ratio',
    'Marques2020_FreemanZiemba2013-max_texture',
    'Marques2020_FreemanZiemba2013-max_noise',
    'Marques2020_Ringach2002-circular_variance', 'Marques2020_Ringach2002-or_bandwidth',
    'Marques2020_Ringach2002-orth_pref_ratio', 'Marques2020_Ringach2002-or_selective',
    'Marques2020_Ringach2002-cv_bandwidth_ratio', 'Marques2020_Ringach2002-opr_cv_diff',
    'Marques2020_Ringach2002-max_dc', 'Marques2020_Ringach2002-modulation_ratio',
    'Marques2020_Schiller1976-sf_selective', 'Marques2020_Schiller1976-sf_bandwidth',
    # ObjectNet
    'ObjectNet-top1',
    # Papale2025
    'Papale2025.V1-ridge', 'Papale2025.V4-ridge', 'Papale2025.IT-ridge',
    'Papale2025.V1-ridgecv', 'Papale2025.V4-ridgecv', 'Papale2025.IT-ridgecv',
    # Rajalingham2018
    'Rajalingham2018-i2n', 'Rajalingham2018public-i2n',
    # Rajalingham2020
    'Rajalingham2020.IT-pls',
    # Sanghavi2020
    'Sanghavi2020.V4-pls', 'Sanghavi2020.IT-pls',
    'SanghaviJozwik2020.V4-pls', 'SanghaviJozwik2020.IT-pls',
    'SanghaviMurty2020.V4-pls', 'SanghaviMurty2020.IT-pls',
    # Scialom2024
    'Scialom2024_rgbBehavioralAccuracyDistance',
    'Scialom2024_contoursBehavioralAccuracyDistance',
    'Scialom2024_phosphenes-12BehavioralAccuracyDistance',
    'Scialom2024_phosphenes-16BehavioralAccuracyDistance',
    'Scialom2024_phosphenes-21BehavioralAccuracyDistance',
    'Scialom2024_phosphenes-27BehavioralAccuracyDistance',
    'Scialom2024_phosphenes-35BehavioralAccuracyDistance',
    'Scialom2024_phosphenes-46BehavioralAccuracyDistance',
    'Scialom2024_phosphenes-59BehavioralAccuracyDistance',
    'Scialom2024_phosphenes-77BehavioralAccuracyDistance',
    'Scialom2024_phosphenes-100BehavioralAccuracyDistance',
    'Scialom2024_segments-12BehavioralAccuracyDistance',
    'Scialom2024_segments-16BehavioralAccuracyDistance',
    'Scialom2024_segments-21BehavioralAccuracyDistance',
    'Scialom2024_segments-27BehavioralAccuracyDistance',
    'Scialom2024_segments-35BehavioralAccuracyDistance',
    'Scialom2024_segments-46BehavioralAccuracyDistance',
    'Scialom2024_segments-59BehavioralAccuracyDistance',
    'Scialom2024_segments-77BehavioralAccuracyDistance',
    'Scialom2024_segments-100BehavioralAccuracyDistance',
    'Scialom2024_phosphenes-allBehavioralErrorConsistency',
    'Scialom2024_segments-allBehavioralErrorConsistency',
    'Scialom2024_phosphenes-allBehavioralAccuracyDistance',
    'Scialom2024_segments-allBehavioralAccuracyDistance',
    'Scialom2024_rgbEngineeringAccuracy',
    'Scialom2024_contoursEngineeringAccuracy',
    'Scialom2024_phosphenes-12EngineeringAccuracy',
    'Scialom2024_phosphenes-16EngineeringAccuracy',
    'Scialom2024_phosphenes-21EngineeringAccuracy',
    'Scialom2024_phosphenes-27EngineeringAccuracy',
    'Scialom2024_phosphenes-35EngineeringAccuracy',
    'Scialom2024_phosphenes-46EngineeringAccuracy',
    'Scialom2024_phosphenes-59EngineeringAccuracy',
    'Scialom2024_phosphenes-77EngineeringAccuracy',
    'Scialom2024_phosphenes-100EngineeringAccuracy',
    'Scialom2024_segments-12EngineeringAccuracy',
    'Scialom2024_segments-16EngineeringAccuracy',
    'Scialom2024_segments-21EngineeringAccuracy',
    'Scialom2024_segments-27EngineeringAccuracy',
    'Scialom2024_segments-35EngineeringAccuracy',
    'Scialom2024_segments-46EngineeringAccuracy',
    'Scialom2024_segments-59EngineeringAccuracy',
    'Scialom2024_segments-77EngineeringAccuracy',
    'Scialom2024_segments-100EngineeringAccuracy',
]

# ---------------------------------------------------------------------------
# ANSI colours
# ---------------------------------------------------------------------------
_RESET = '\033[0m'
_BOLD = '\033[1m'
_GREEN = '\033[32m'
_YELLOW = '\033[33m'
_RED = '\033[31m'
_CYAN = '\033[36m'
_DIM = '\033[2m'
_BLUE = '\033[34m'


def _c(text, colour):
    """Return *text* wrapped in the ANSI escape *colour* and a trailing reset."""
    return f"{colour}{text}{_RESET}"


def _step(msg, indent=4):
    """Print a top-level progress line (blue arrow) indented by *indent* spaces."""
    print(f"{' ' * indent}{_c('→', _BLUE)} {msg}", flush=True)


def _substep(msg, indent=6):
    """Print a detail line (dim bullet) indented by *indent* spaces."""
    print(f"{' ' * indent}{_c('·', _DIM)} {msg}", flush=True)


def _gb(n_bytes):
    """Format a byte count as gibibytes with two decimals, e.g. ``'1.00 GB'``."""
    return f"{n_bytes / (1024 ** 3):.2f} GB"


# ---------------------------------------------------------------------------
# Peak RSS monitor (background thread)
# ---------------------------------------------------------------------------

class _PeakMonitor:
    """Track the highest RSS of the current process by polling from a daemon thread.

    Usage: ``monitor = _PeakMonitor().start()`` ... ``peak = monitor.stop()``.
    The peak is seeded with the RSS at construction time, so the value returned
    by :meth:`stop` is never lower than the RSS observed in ``__init__``.
    """

    def __init__(self, interval=0.5):
        """Prepare (but do not start) the poller; *interval* is the poll period in seconds."""
        self._proc = psutil.Process(os.getpid())
        self._interval = interval
        self._peak = self._proc.memory_info().rss
        self._stop = threading.Event()
        self._thread = threading.Thread(target=self._run, daemon=True)

    def start(self):
        """Start polling and return ``self`` so construction and start can be chained."""
        self._thread.start()
        return self

    def stop(self):
        """Stop polling and return the peak RSS in bytes.

        One last sample is taken after the polling thread has exited: the
        thread only samples once per interval, so without it a run shorter
        than the interval (or growth after the last poll) would go unrecorded.
        """
        self._stop.set()
        self._thread.join()
        self._sample()
        return self._peak

    def _sample(self):
        """Fold the current RSS into the peak; return False if the process is gone."""
        try:
            rss = self._proc.memory_info().rss
        except psutil.NoSuchProcess:
            return False
        if rss > self._peak:
            self._peak = rss
        return True

    def _run(self):
        """Thread body: sample until asked to stop or the process disappears."""
        while not self._stop.is_set():
            if not self._sample():
                break
            # Event.wait doubles as an interruptible sleep so stop() returns promptly.
            self._stop.wait(self._interval)


# ---------------------------------------------------------------------------
# Comparison helpers
# ---------------------------------------------------------------------------

def _compare_label(estimate_gb, actual_delta_gb):
    """Classify an estimate against the measured RSS delta.

    Returns ``(colour, verdict_string, ratio)`` where ``ratio`` is
    ``estimate_gb / actual_delta_gb``. When the measured delta is at most
    0.01 GB the ratio is meaningless and ``None`` is returned in its place.
    Ratios of 0.8 or more are green, 0.4 up to 0.8 yellow, anything lower red.
    """
    if actual_delta_gb <= 0.01:
        return _GREEN, "no measurable RSS delta", None

    ratio = estimate_gb / actual_delta_gb
    if ratio >= 0.8:
        return _GREEN, f"ACCURATE ({ratio:.2f}× of actual)", ratio

    # Both under-prediction bands share the same wording; only the colour differs.
    shortfall = actual_delta_gb - estimate_gb
    pct = (1 - ratio) * 100
    colour = _YELLOW if ratio >= 0.4 else _RED
    return colour, f"UNDER by {shortfall:.2f} GB ({pct:.0f}% under)", ratio


# ---------------------------------------------------------------------------
# Result helper
# ---------------------------------------------------------------------------

def _make_result(model_id, benchmark_id, **kw):
    """Build the per-pair result dict: the two identifiers plus any extra fields."""
    return dict(model_id=model_id, benchmark_id=benchmark_id, **kw)


def _overhead_note(est):
    """Describe how the total estimate relates to the raw activation size.

    The estimator does not always multiply by one fixed factor (some formulas
    add a calibrated or gram-matrix term instead), so the ratio is derived
    from the estimate itself rather than printed as a constant.
    """
    if est.activation_gb > 0:
        factor = est.total_estimated_gb / est.activation_gb
        return f"×{factor:.1f} effective overhead"
    return "overhead n/a"


# ---------------------------------------------------------------------------
# Run one (model, benchmark) pair
# ---------------------------------------------------------------------------

def run_pair(model, model_id, benchmark, benchmark_id, skip_score=False):
    """Probe one (model, benchmark) pair and, unless skipped, score it while tracking RSS.

    The pre-flight estimate comes from ``preallocate_memory(..., raise_if_oom=False)``.
    If *skip_score* is false the benchmark is then run with a :class:`_PeakMonitor`
    active, and the estimate is compared with ``peak RSS - baseline RSS``.

    Returns a dict built by :func:`_make_result` whose ``status`` is one of:

    * ``'skip'``       - probe raised ``TypeError`` or returned ``None``
    * ``'error'``      - probe or scoring raised another exception
    * ``'probe_only'`` - *skip_score* was set
    * ``'oom'``        - scoring raised ``MemoryError``
    * ``'ok'``         - scoring completed
    """
    from brainscore_vision.benchmark_helpers.memory import preallocate_memory

    proc = psutil.Process(os.getpid())

    # ── Pre-flight probe ─────────────────────────────────────────────────
    _step("pre-flight probe (1-stimulus forward pass)")
    t_probe = time.time()
    try:
        est = preallocate_memory(model, benchmark, raise_if_oom=False)
    except TypeError as e:
        _substep(_c(f"skipped — unsupported benchmark type: {e}", _DIM))
        return _make_result(model_id, benchmark_id, status='skip',
                            est_gb=None, act_gb=None, actual_delta_gb=None, score=None,
                            probe_elapsed=time.time() - t_probe,
                            score_elapsed=None, note=str(e)[:100])
    except Exception as e:
        _substep(_c(f"probe ERROR: {str(e)[:100]}", _RED))
        return _make_result(model_id, benchmark_id, status='error',
                            est_gb=None, act_gb=None, actual_delta_gb=None, score=None,
                            probe_elapsed=time.time() - t_probe,
                            score_elapsed=None, note=str(e)[:120])

    probe_elapsed = time.time() - t_probe

    if est is None:
        _substep(_c("skipped (BRAINSCORE_SKIP_MEMORY_CHECK set)", _DIM))
        return _make_result(model_id, benchmark_id, status='skip',
                            est_gb=None, act_gb=None, actual_delta_gb=None, score=None,
                            probe_elapsed=probe_elapsed, score_elapsed=None,
                            note='BRAINSCORE_SKIP_MEMORY_CHECK set')

    overhead = _overhead_note(est)
    _substep(f"features={est.num_features:,} stimuli={est.num_stimuli:,} timebins={est.num_timebins}")
    _substep(
        f"activation = {est.activation_gb:.3f} GB {overhead} "
        f"→ {_c(f'estimate: {est.total_estimated_gb:.2f} GB', _CYAN)}"
    )

    if skip_score:
        return _make_result(model_id, benchmark_id, status='probe_only',
                            est_gb=est.total_estimated_gb, act_gb=est.activation_gb,
                            feat=est.num_features, stimuli=est.num_stimuli,
                            timebins=est.num_timebins,
                            actual_delta_gb=None, score=None,
                            probe_elapsed=probe_elapsed, score_elapsed=None, note='--skip-score')

    # ── Full benchmark run ───────────────────────────────────────────────
    baseline_rss = proc.memory_info().rss
    _step(f"scoring (baseline RSS: {_gb(baseline_rss)})")

    # Ticker thread: prints RSS + elapsed every 30s while benchmark runs
    _ticker_stop = threading.Event()

    def _ticker():
        t_start = time.time()
        interval = 30
        # Event.wait returns False on timeout, i.e. "not stopped yet".
        while not _ticker_stop.wait(interval):
            elapsed = time.time() - t_start
            rss = proc.memory_info().rss
            print(f" {_c('…', _DIM)} still scoring "
                  f"{elapsed/60:.1f} min elapsed "
                  f"RSS {_gb(rss)}", flush=True)

    ticker_thread = threading.Thread(target=_ticker, daemon=True)
    ticker_thread.start()

    monitor = _PeakMonitor().start()
    t_score = time.time()
    score_val = None
    score_status = 'ok'
    score_note = ''
    try:
        score_val = benchmark(model)
    except MemoryError as e:
        score_status = 'oom'
        score_note = str(e)[:120]
        _substep(_c(f"MemoryError: {score_note}", _RED))
    except Exception as e:
        score_status = 'error'
        score_note = str(e)[:120]
        _substep(_c(f"scoring ERROR: {score_note}", _RED))
    finally:
        _ticker_stop.set()
        ticker_thread.join()

    score_elapsed = time.time() - t_score
    peak_rss = monitor.stop()
    actual_delta_gb = (peak_rss - baseline_rss) / (1024 ** 3)

    # ── Comparison ───────────────────────────────────────────────────────
    colour, verdict, ratio = _compare_label(est.total_estimated_gb, actual_delta_gb)
    est_note = f"← {est.activation_gb:.3f} GB activations, {overhead}"
    _step("comparison")
    _substep(
        f"baseline RSS = {_gb(baseline_rss)} "
        f"{_c('← model weights, Python, etc. already in RAM before scoring', _DIM)}"
    )
    _substep(
        f"peak RSS = {_gb(peak_rss)} "
        f"{_c('← highest point reached during scoring', _DIM)}"
    )
    _substep(
        f"Δ (peak−base)= {_c(f'+{actual_delta_gb:.2f} GB', _CYAN)} "
        f"{_c('← extra RAM the benchmark itself consumed ← this is what we compare against', _DIM)}"
    )
    _substep(
        f"estimated = {_c(f'{est.total_estimated_gb:.2f} GB', _CYAN)} "
        f"{_c(est_note, _DIM)}"
    )
    _substep(f"verdict : {_c(verdict, colour)}")
    if score_val is not None:
        _substep(f"score : {float(score_val):.4f} elapsed {score_elapsed:.0f}s")

    return _make_result(model_id, benchmark_id,
                        status=score_status,
                        est_gb=est.total_estimated_gb,
                        act_gb=est.activation_gb,
                        feat=est.num_features,
                        stimuli=est.num_stimuli,
                        timebins=est.num_timebins,
                        actual_delta_gb=actual_delta_gb,
                        baseline_rss_gb=baseline_rss / (1024 ** 3),
                        peak_rss_gb=peak_rss / (1024 ** 3),
                        score=float(score_val) if score_val is not None else None,
                        ratio=ratio,
                        probe_elapsed=probe_elapsed,
                        score_elapsed=score_elapsed,
                        note=score_note)


# ---------------------------------------------------------------------------
# Load helpers
# ---------------------------------------------------------------------------

def _load_model(mid):
    """Load the model registered under identifier *mid*."""
    return brainscore_vision.load_model(mid)


def _load_benchmark(bid):
    """Load the benchmark registered under identifier *bid*."""
    return brainscore_vision.load_benchmark(bid)


def _timed_load(fn, arg, t0, interval=15):
    """Run fn(arg) in a thread, printing elapsed time every `interval` seconds.

    *t0* is the reference timestamp used for the "elapsed" figure. Returns
    whatever ``fn(arg)`` returns. Anything the worker raises is re-raised in
    the calling thread, including ``BaseException`` subclasses such as
    ``SystemExit``; otherwise a loader that exits would die silently in the
    thread and the caller would receive ``None`` as if the load had succeeded.
    """
    result = [None]
    exc = [None]

    def _worker():
        try:
            result[0] = fn(arg)
        except BaseException as e:  # deliberately broad: handed to the caller below
            exc[0] = e

    t = threading.Thread(target=_worker, daemon=True)
    t.start()
    while t.is_alive():
        t.join(timeout=interval)
        if t.is_alive():
            print(f" {_c('…', _DIM)} still loading {time.time()-t0:.0f}s elapsed",
                  flush=True)

    if exc[0] is not None:
        raise exc[0]
    return result[0]

+# --------------------------------------------------------------------------- +# Summary table (est GB / actual Δ GB per cell) +# --------------------------------------------------------------------------- + +_MODEL_W = 36 +_CELL_W = 20 + + +def _trunc(s, n): + return s if len(s) <= n else s[:n - 1] + '…' + + +def _cell_text_plain(r): + """Fixed-width plain text (no ANSI) for padding calculation.""" + est = r.get('est_gb') + act = r.get('actual_delta_gb') + if est is None: + return " — skip/err " + if act is None: + return f"{est:5.2f} GB est n/a " + return f"{est:4.1f}/{act:4.1f} GB " + + +def _cell_colour(r): + est = r.get('est_gb') + act = r.get('actual_delta_gb') + if est is None: + return _c(" — skip/err", _DIM) + if act is None: + return f"{_c(f'{est:5.2f} GB', _CYAN)} est" + _, _, ratio = _compare_label(est, act) + col = _GREEN if (ratio and ratio >= 0.8) else (_YELLOW if (ratio and ratio >= 0.4) else _RED) + return f"{_c(f'{est:.1f}', _CYAN)}/{_c(f'{act:.1f}', col)} GB" + + +def _hline(c_mid, c_left, c_right, c_sep): + parts = [c_left, c_mid * (_MODEL_W + 2)] + for _ in BENCHMARKS: + parts += [c_sep, c_mid * (_CELL_W + 2)] + parts.append(c_right) + return ''.join(parts) + + +def print_summary_table(results_grid): + top = _hline('─', '┌', '┐', '┬') + mid = _hline('─', '├', '┤', '┼') + bottom = _hline('─', '└', '┘', '┴') + + print(top) + hdr = f"│ {_c(_trunc('Model', _MODEL_W), _BOLD):<{_MODEL_W + len(_BOLD) + len(_RESET)}} " + for bid in BENCHMARKS: + short = _BM_SHORT.get(bid, bid) + hdr += f"│ {_c(_trunc(short, _CELL_W), _BOLD):<{_CELL_W + len(_BOLD) + len(_RESET)}} " + print(hdr + "│") + print(mid) + + for mid_id in MODELS: + row = f"│ {_trunc(mid_id, _MODEL_W):<{_MODEL_W}} " + for bid in BENCHMARKS: + r = results_grid[mid_id][bid] + cell = _cell_colour(r) + padding = _CELL_W - len(_cell_text_plain(r)) + row += f"│ {cell}{' ' * max(0, padding)} " + print(row + "│") + + print(bottom) + print(_c(" est GB / actual Δ GB " + "(cyan=estimate, green=accurate 
≥0.8×, yellow=under 0.4–0.8×, red=under <0.4×)", _DIM)) + + +# --------------------------------------------------------------------------- +# Full text report +# --------------------------------------------------------------------------- + +def print_full_report(results_grid): + print(f"\n{_c('Per-pair results:', _BOLD)}\n") + for mid_id in MODELS: + print(f" {_c(mid_id, _BOLD)}") + for bid in BENCHMARKS: + r = results_grid[mid_id][bid] + short = _BM_SHORT.get(bid, bid) + est = r.get('est_gb') + act = r.get('actual_delta_gb') + score = r.get('score') + + if est is not None and act is not None: + _, verdict, ratio = _compare_label(est, act) + col = _GREEN if (ratio and ratio >= 0.8) else (_YELLOW if (ratio and ratio >= 0.4) else _RED) + score_str = f" score={score:.4f}" if score is not None else "" + print(f" {short:<18} " + f"est {est:.2f} GB actual Δ {act:.2f} GB " + f"→ {_c(verdict, col)}{score_str}") + elif est is not None: + print(f" {short:<18} est {est:.2f} GB no actual ({r.get('note','')[:50]})") + else: + print(f" {short:<18} {r['status']} {r.get('note','')[:60]}") + print() + + +# --------------------------------------------------------------------------- +# Overhead recommendation +# --------------------------------------------------------------------------- + +def print_overhead_recommendation(results_grid): + from brainscore_vision.benchmark_helpers.memory import _OVERHEAD_FACTOR + + # Collect pairs where we have both raw activation GB and actual delta GB + pairs = [] + for mid_id in MODELS: + for bid in BENCHMARKS: + r = results_grid[mid_id][bid] + act_gb = r.get('act_gb') # raw activation array GB + delta_gb = r.get('actual_delta_gb') # actual peak-baseline delta + if act_gb and act_gb > 0 and delta_gb is not None and delta_gb > 0.01: + true_factor = delta_gb / act_gb + pairs.append({ + 'model': r['model_id'], + 'benchmark': r['benchmark_id'], + 'act_gb': act_gb, + 'delta_gb': delta_gb, + 'true_factor': true_factor, + }) + + n_total = len(MODELS) * 
len(BENCHMARKS) + n_scored = len(pairs) + + print(f"\n{'═' * 66}") + print(f" {_c('OVERHEAD FACTOR RECOMMENDATION', _BOLD)}") + print(f"{'═' * 66}\n") + + if n_scored == 0: + print(f" {_c('No scored pairs to analyse.', _DIM)}") + return + + true_factors = sorted(p['true_factor'] for p in pairs) + current_factor = _OVERHEAD_FACTOR + + # For a given overhead factor F, count pairs where estimate < actual delta + # (i.e. estimate would have UNDER-predicted, missing a potential OOM) + def n_underpredicted(factor): + return sum(1 for p in pairs if p['act_gb'] * factor < p['delta_gb']) + + current_under = n_underpredicted(current_factor) + n_safe = n_scored - current_under + + print(f" Scored pairs: {n_scored}/{n_total} " + f"({n_total - n_scored} skipped/errored)\n") + print(f" {_c('Current overhead factor = ×{}'.format(current_factor), _BOLD)}") + print(f" estimate covered (≥ actual Δ) : " + f"{_c(str(n_safe), _GREEN)}/{n_scored} pairs") + print(f" estimate under-predicted : " + f"{_c(str(current_under), _RED)}/{n_scored} pairs " + f"{_c('← estimate too low; real usage exceeded prediction', _DIM)}") + + # Show the actual overhead factors observed per pair + print(f"\n {_c('Actual overhead factors observed (activation GB → actual Δ GB):', _DIM)}") + for p in sorted(pairs, key=lambda x: x['true_factor'], reverse=True): + short_m = p['model'][:28] + short_b = _BM_SHORT.get(p['benchmark'], p['benchmark']) + tf = p['true_factor'] + bar = '█' * min(int(tf), 20) + col = _GREEN if tf <= current_factor else _RED + factor_str = _c(f'{tf:.1f}×', col) + print(f" {short_m:<28} {short_b:<16} " + f"{p['act_gb']:.2f} GB → {p['delta_gb']:.2f} GB " + f"= {factor_str} {_c(bar, col)}") + + # Find the factor that covers each percentile threshold + print(f"\n {_c('Factor needed to cover N% of pairs:', _DIM)}") + for pct in [50, 75, 90, 95, 100]: + idx = min(int(len(true_factors) * pct / 100), len(true_factors) - 1) + needed = true_factors[idx] + rounded = max(current_factor, round(needed + 
0.5)) # round up to nearest int + still_under = n_underpredicted(needed) + col = _GREEN if needed <= current_factor else _YELLOW if needed <= current_factor * 1.5 else _RED + print(f" {pct:>3}% coverage → ×{_c(f'{needed:.1f}', col)} " + f"(≈ ×{rounded} rounded) " + f"→ {still_under}/{n_scored} pairs still under-predicted") + + # Final recommendation: smallest integer factor covering ≥ 90% of pairs + idx_90 = min(int(len(true_factors) * 0.90), len(true_factors) - 1) + factor_90 = true_factors[idx_90] + recommended = max(current_factor, int(factor_90) + (1 if factor_90 % 1 > 0 else 0)) + under_at_rec = n_underpredicted(recommended) + + print(f"\n {_c('Recommendation', _BOLD)}") + if recommended == current_factor: + print(f" Current factor ×{current_factor} already covers ≥90% of pairs. {_c('No change needed.', _GREEN)}") + else: + improvement = current_under - under_at_rec + print(f" Increase overhead factor from " + f"{_c(f'×{current_factor}', _RED)} → {_c(f'×{recommended}', _GREEN)}") + print(f" This moves from {_c(str(current_under), _RED)} under-predicted pairs " + f"to {_c(str(under_at_rec), _GREEN)} " + f"({_c(f'−{improvement} pairs', _GREEN)} now safely caught)") + print(f"\n To apply: set {_c('_OVERHEAD_FACTOR = ' + str(recommended), _CYAN)} " + f"in brainscore_vision/benchmark_helpers/memory.py") + print() + + +# --------------------------------------------------------------------------- +# CSV helpers (incremental — one row written immediately after each pair) +# --------------------------------------------------------------------------- + +_CSV_HEADER = [ + 'model', 'benchmark', 'status', + 'est_total_gb', 'act_activation_gb', 'actual_delta_gb', + 'num_features', 'num_stimuli', 'num_timebins', + 'baseline_rss_gb', 'peak_rss_gb', + 'ratio', 'score', + 'probe_elapsed_s', 'score_elapsed_s', 'note', +] + + +def _csv_row(r): + def _f(k, fmt='.4f'): + v = r.get(k) + return format(v, fmt) if v is not None else '' + return [ + r['model_id'], r['benchmark_id'], 
r['status'], + _f('est_gb'), _f('act_gb'), _f('actual_delta_gb'), + r.get('feat', ''), r.get('stimuli', ''), r.get('timebins', ''), + _f('baseline_rss_gb'), _f('peak_rss_gb'), + _f('ratio'), _f('score'), + _f('probe_elapsed', '.2f'), _f('score_elapsed', '.2f'), + r.get('note', ''), + ] + + +def init_csv(path): + """Write header row, return open file handle + csv.writer.""" + f = open(path, 'w', newline='') + w = csv.writer(f) + w.writerow(_CSV_HEADER) + f.flush() + return f, w + + +def append_csv_row(writer, file_handle, r): + writer.writerow(_csv_row(r)) + file_handle.flush() # write immediately so partial results survive a crash + + +# --------------------------------------------------------------------------- +# Calibration mode (alexnet × all benchmarks → fixed_benchmark_cost per bm) +# --------------------------------------------------------------------------- + +def run_calibration_pair(model, benchmark, benchmark_id, bm_idx, n_bm): + """Run one benchmark and return fixed_benchmark_cost = actual_delta - activation_gb.""" + from brainscore_vision.benchmark_helpers.memory import preallocate_memory + + proc = psutil.Process(os.getpid()) + + print(f"\n [{bm_idx}/{n_bm}] {benchmark_id}") + print(f" {'─' * 62}") + + # Probe + _step("probe (1-stimulus forward pass)") + try: + est = preallocate_memory(model, benchmark, raise_if_oom=False) + except TypeError: + _substep(_c("skipped — not a NeuralBenchmark (behavioral/non-neural)", _DIM)) + print(f" {_c(benchmark_id[:55], _DIM)}: N/A (non-neural)", flush=True) + return dict(benchmark_id=benchmark_id, status='skip', + activation_gb=None, actual_delta_gb=None, fixed_cost_gb=None) + except Exception as e: + _substep(_c(f"probe ERROR: {str(e)[:80]}", _RED)) + return dict(benchmark_id=benchmark_id, status='error', + activation_gb=None, actual_delta_gb=None, fixed_cost_gb=None, + note=str(e)[:100]) + + if est is None: + return dict(benchmark_id=benchmark_id, status='skip', + activation_gb=None, actual_delta_gb=None, 
fixed_cost_gb=None) + + _substep( + f"activation = {est.activation_gb:.3f} GB " + f"({est.num_features:,} feat × {est.num_stimuli:,} stim × {est.num_timebins} tbin)" + ) + + # Score + baseline_rss = proc.memory_info().rss + _step(f"scoring (baseline RSS: {_gb(baseline_rss)})") + + _ticker_stop = threading.Event() + def _ticker(): + t_start = time.time() + while not _ticker_stop.wait(30): + elapsed = time.time() - t_start + rss = proc.memory_info().rss + print(f" {_c('…', _DIM)} still scoring " + f"{elapsed/60:.1f} min RSS {_gb(rss)}", flush=True) + ticker_thread = threading.Thread(target=_ticker, daemon=True) + ticker_thread.start() + + monitor = _PeakMonitor().start() + t_score = time.time() + score_status = 'ok' + score_note = '' + try: + benchmark(model) + except MemoryError as e: + score_status = 'oom' + score_note = str(e)[:120] + _substep(_c(f"MemoryError: {score_note}", _RED)) + except Exception as e: + score_status = 'error' + score_note = str(e)[:120] + _substep(_c(f"ERROR: {score_note}", _RED)) + finally: + _ticker_stop.set() + ticker_thread.join() + + score_elapsed = time.time() - t_score + peak_rss = monitor.stop() + actual_delta_gb = (peak_rss - baseline_rss) / (1024 ** 3) + + fixed_cost_gb = None + if score_status == 'ok': + fixed_cost_gb = max(0.0, actual_delta_gb - est.activation_gb) + _step("result") + _substep( + f"actual Δ = {_c(f'{actual_delta_gb:.3f} GB', _CYAN)} " + f"elapsed {score_elapsed:.0f}s" + ) + _substep( + f"fixed_benchmark_cost = {actual_delta_gb:.3f} − {est.activation_gb:.3f} " + f"= {_c(f'{fixed_cost_gb:.3f} GB', _GREEN)}" + ) + # One-liner summary line + print( + f"\n {_c(benchmark_id[:55], _BOLD)}: " + f"fixed_cost = {_c(f'{fixed_cost_gb:.2f} GB', _GREEN)}", + flush=True, + ) + else: + _substep(f"actual Δ = {_gb(peak_rss - baseline_rss)} (run failed — no fixed_cost)") + + return dict( + benchmark_id=benchmark_id, + status=score_status, + activation_gb=est.activation_gb, + actual_delta_gb=actual_delta_gb, + 
def print_calibration_table(results):
    """Print the per-benchmark fixed-cost table produced by a calibration run.

    *results* is a list of dicts, each with at least ``status``. Entries whose
    ``fixed_cost_gb`` is not ``None`` are listed (largest cost first) and must
    also provide ``benchmark_id``, ``actual_delta_gb`` and ``activation_gb``.
    Entries with status ``skip`` / ``error`` / ``oom`` are only counted.
    """
    neural = [r for r in results if r.get('fixed_cost_gb') is not None]
    skipped = sum(1 for r in results if r['status'] == 'skip')
    errors = sum(1 for r in results if r['status'] in ('error', 'oom'))

    print(f"\n\n{'═' * 72}")
    print(f" {_c('BENCHMARK FIXED COSTS (model-independent overhead)', _BOLD)}")
    print(f" Calibrated with alexnet on {len(neural)} neural benchmarks "
          f"| {skipped} non-neural skipped | {errors} errors")
    print(f"{'═' * 72}\n")

    if not neural:
        print(f" {_c('No neural benchmarks scored.', _DIM)}")
        return

    neural_sorted = sorted(neural, key=lambda r: r['fixed_cost_gb'], reverse=True)
    # Never narrower than the header text, otherwise short ids misalign the
    # columns; never wider than 56 so long ids are truncated instead.
    longest_id = max(len(r['benchmark_id']) for r in neural_sorted)
    col_w = min(max(longest_id, len('Benchmark')), 56)

    print(f" {'Benchmark':<{col_w}} {'Fixed cost':>12} {'Act. Δ':>9} {'Activation':>10}")
    print(f" {'─' * col_w} {'─' * 12} {'─' * 9} {'─' * 10}")

    for r in neural_sorted:
        bid = r['benchmark_id'][:col_w]
        fixed_cost = r['fixed_cost_gb']
        actual_delta = r['actual_delta_gb']
        activation = r['activation_gb']
        # Colour thresholds: < 5 GB green, < 15 GB yellow, otherwise red.
        col = _GREEN if fixed_cost < 5 else (_YELLOW if fixed_cost < 15 else _RED)
        print(
            f" {bid:<{col_w}} "
            f"{_c(f'{fixed_cost:>8.2f} GB', col)} "
            f"{actual_delta:>7.2f} GB "
            f"{activation:>8.3f} GB"
        )

    print()
    print(f" {_c('Formula:', _BOLD)} total_needed = activation_gb + fixed_benchmark_cost")
    # The hint previously ended in "fixed_benchmark_cost_gb=)", i.e. with no
    # value placeholder at all.
    print(f" {_c('Usage:', _DIM)} preallocate_memory(model, bm, "
          f"fixed_benchmark_cost_gb=<cost>)")
    print()
len(ALL_BENCHMARKS) + print(f"\n{'═' * 72}") + print(f" {_c('CALIBRATION MODE', _BOLD)} — alexnet × {n_bm} benchmarks") + print(f" Goal: measure fixed_benchmark_cost = actual_Δ − activation_gb per benchmark") + print(f" Calibration table will be saved → {_c(cal_path, _CYAN)}") + print(f"{'═' * 72}\n") + + # Load alexnet + _step("loading alexnet...", indent=2) + t0 = time.time() + try: + model = _timed_load(_load_model, 'alexnet', t0) + print(f" {_c('OK', _GREEN)} ({time.time() - t0:.1f}s)", flush=True) + except Exception as e: + print(f" {_c('FAILED', _RED)}: {e}") + return + + # Open CSV + csv_file, csv_writer = None, None + if args.csv: + csv_file = open(args.csv, 'w', newline='') + csv_writer = csv.writer(csv_file) + csv_writer.writerow(['benchmark', 'status', 'activation_gb', + 'actual_delta_gb', 'fixed_cost_gb', 'score_elapsed_s', 'note']) + csv_file.flush() + print(f"\n {_c('CSV →', _CYAN)} {args.csv}\n") + + # Load any costs already written in a previous run + costs = load_calibration(cal_path) + if costs: + print(f" {_c('Resuming:', _CYAN)} loaded {len(costs)} existing costs from {cal_path}") + + # Find the resume offset + resume_from = getattr(args, 'resume_from', None) + start_idx = 0 + if resume_from: + if resume_from in ALL_BENCHMARKS: + start_idx = ALL_BENCHMARKS.index(resume_from) + 1 + print(f" {_c('Skipping', _DIM)} benchmarks 1–{start_idx} " + f"(up to and including {resume_from})") + else: + print(f" {_c('WARNING', _YELLOW)}: --resume-from '{resume_from}' " + f"not found in ALL_BENCHMARKS — starting from the beginning") + print() + + results = [] + costs = costs # carry forward existing costs + try: + for i, bid in enumerate(ALL_BENCHMARKS, 1): + if i <= start_idx: + continue # skip already-completed benchmarks + # Load benchmark with ticker + t0 = time.time() + try: + bm = _timed_load(_load_benchmark, bid, t0) + except Exception as e: + print(f"\n [{i}/{n_bm}] {bid}") + _substep(_c(f"load FAILED: {str(e)[:80]}", _RED)) + r = 
dict(benchmark_id=bid, status='error', activation_gb=None, + actual_delta_gb=None, fixed_cost_gb=None, note=str(e)[:100]) + results.append(r) + if csv_writer: + csv_writer.writerow([bid, 'error', '', '', '', '', r.get('note', '')]) + csv_file.flush() + continue + + r = run_calibration_pair(model, bm, bid, i, n_bm) + results.append(r) + + if csv_writer: + csv_writer.writerow([ + bid, r['status'], + f"{r['activation_gb']:.4f}" if r['activation_gb'] is not None else '', + f"{r['actual_delta_gb']:.4f}" if r['actual_delta_gb'] is not None else '', + f"{r['fixed_cost_gb']:.4f}" if r['fixed_cost_gb'] is not None else '', + f"{r.get('score_elapsed', ''):.1f}" if r.get('score_elapsed') else '', + r.get('note', ''), + ]) + csv_file.flush() + + # Incrementally save JSON after every benchmark that yielded a cost + if r.get('fixed_cost_gb') is not None: + costs[bid] = r['fixed_cost_gb'] + save_calibration(costs, cal_path) + print(f" {_c('↳ JSON updated', _DIM)} ({len(costs)} benchmarks so far)", + flush=True) + + finally: + if csv_file: + csv_file.close() + print(f"\n{_c('CSV finalised →', _CYAN)} {args.csv}") + + save_calibration(costs, cal_path) + print(f"\n{_c('Calibration saved →', _CYAN)} {cal_path} ({len(costs)} benchmarks)\n") + + print_calibration_table(results) + + +def main(): + parser = argparse.ArgumentParser( + description="Memory profile suite — 5×5 estimate vs actual, or benchmark calibration.") + parser.add_argument('--csv', metavar='PATH', default=None, + help='write results to CSV') + parser.add_argument('--skip-score', action='store_true', + help='probe only — do not run actual scoring') + parser.add_argument('--calibrate', action='store_true', + help='run alexnet on ALL benchmarks and output fixed_benchmark_cost per benchmark') + parser.add_argument('--calibration-json', metavar='PATH', default=None, + help='path to save/load calibration JSON ' + '(default: ~/.brainscore/benchmark_costs.json)') + parser.add_argument('--resume-from', metavar='BENCHMARK_ID', 
default=None, + help='skip all benchmarks up to and including this one, ' + 'then continue from the next') + args = parser.parse_args() + + if args.calibrate: + _run_calibrate(args) + return + + n_bm = len(BENCHMARKS) + n_m = len(MODELS) + mode = "probe only" if args.skip_score else "probe + score" + total_pairs = n_m * n_bm + + print(f"\n{'═' * 66}") + print(f" {_c('MEM PROFILE SUITE', _BOLD)} — " + f"{n_m} models × {n_bm} benchmarks = {total_pairs} pairs [{mode}]") + print(f"{'═' * 66}") + + # ── Load all benchmarks first ──────────────────────────────────────── + print(f"\n{_c(f'Loading {n_bm} benchmarks (may download from S3)', _CYAN)}\n") + benchmarks = {} + for i, bid in enumerate(BENCHMARKS, 1): + print(f" [{i}/{n_bm}] {bid}") + t0 = time.time() + try: + benchmarks[bid] = _timed_load(_load_benchmark, bid, t0) + print(f" {_c('OK', _GREEN)} ({time.time() - t0:.1f}s)") + except Exception as e: + benchmarks[bid] = None + print(f" {_c('FAILED', _RED)}: {str(e)[:80]}") + print() + + # ── Open CSV for incremental writing ──────────────────────────────── + csv_file, csv_writer = None, None + if args.csv: + csv_file, csv_writer = init_csv(args.csv) + print(f" {_c('CSV opened →', _CYAN)} {args.csv} (rows written after each pair)\n") + + # ── For each model, run all benchmarks ─────────────────────────────── + results_grid = {mid_id: {} for mid_id in MODELS} + pair_num = 0 + + try: + for m_idx, mid_id in enumerate(MODELS, 1): + print(f"\n{'═' * 66}") + print(f" {_c(f'Model {m_idx}/{n_m}: {mid_id}', _CYAN)}") + print(f"{'═' * 66}\n") + + _step("loading model...", indent=2) + t0 = time.time() + try: + model = _timed_load(_load_model, mid_id, t0) + print(f" {_c('OK', _GREEN)} ({time.time() - t0:.1f}s)", flush=True) + except Exception as e: + print(f" {_c('FAILED', _RED)}: {str(e)[:80]}") + for bid in BENCHMARKS: + pair_num += 1 + r = _make_result( + mid_id, bid, status='error', est_gb=None, act_gb=None, + actual_delta_gb=None, score=None, + probe_elapsed=0.0, 
score_elapsed=None, + note=f"model load failed: {str(e)[:60]}") + results_grid[mid_id][bid] = r + if csv_writer: + append_csv_row(csv_writer, csv_file, r) + continue + + for bid in BENCHMARKS: + pair_num += 1 + short = _BM_SHORT.get(bid, bid) + print(f"\n {_c(f'pair {pair_num}/{total_pairs}', _DIM)} " + f"{_c(_trunc(mid_id, 28), _BOLD)} × {_c(short, _BOLD)}") + print(f" {'─' * 54}") + + bm = benchmarks.get(bid) + if bm is None: + _step(_c("benchmark failed to load — skipping", _RED), indent=4) + r = _make_result( + mid_id, bid, status='error', est_gb=None, act_gb=None, + actual_delta_gb=None, score=None, + probe_elapsed=0.0, score_elapsed=None, + note='benchmark failed to load') + results_grid[mid_id][bid] = r + if csv_writer: + append_csv_row(csv_writer, csv_file, r) + continue + + r = run_pair(model, mid_id, bm, bid, skip_score=args.skip_score) + results_grid[mid_id][bid] = r + + # Write CSV row immediately + if csv_writer: + append_csv_row(csv_writer, csv_file, r) + print(f" {_c('↳ CSV row written', _DIM)}", flush=True) + + # One-line pair summary + est = r.get('est_gb') + act = r.get('actual_delta_gb') + if est is not None and act is not None: + _, verdict, ratio = _compare_label(est, act) + col = (_GREEN if (ratio and ratio >= 0.8) + else _YELLOW if (ratio and ratio >= 0.4) else _RED) + print(f"\n {_c('RESULT', _BOLD)}: " + f"est {est:.2f} GB actual Δ {act:.2f} GB → {_c(verdict, col)}") + elif est is not None: + print(f"\n {_c('RESULT', _BOLD)}: est {est:.2f} GB (no actual)") + + finally: + if csv_file: + csv_file.close() + print(f"\n{_c('CSV finalised →', _CYAN)} {args.csv}") + + # ── Final summary ──────────────────────────────────────────────────── + print(f"\n\n{'═' * 66}") + print(f" {_c('FINAL SUMMARY', _BOLD)}") + print(f"{'═' * 66}\n") + print_summary_table(results_grid) + print_full_report(results_grid) + print_overhead_recommendation(results_grid) + + +if __name__ == '__main__': + main() diff --git a/scripts/memory_flight_report.py 
b/scripts/memory_flight_report.py new file mode 100644 index 0000000000..2b4302924b --- /dev/null +++ b/scripts/memory_flight_report.py @@ -0,0 +1,280 @@ +""" +Memory Flight Report +==================== +Runs a pre-flight memory estimate for one (model, benchmark) pair, then +optionally executes the full benchmark while tracking peak RSS, so you can +see how close the estimate was to reality. + +NOTE: prefer preflight_check.py for day-to-day use — it is simpler and uses +the calibrated fixed_benchmark_cost table automatically. This script is +useful for one-off investigations or when you want the box-formatted output. + +Usage +----- + python scripts/memory_flight_report.py <model_identifier> <benchmark_identifier> + python scripts/memory_flight_report.py <model_identifier> <benchmark_identifier> --skip-score + + --skip-score only run the pre-flight estimate, skip the actual benchmark + +Example +------- + python scripts/memory_flight_report.py resnet50_tutorial MajajHong2015.IT-pls + +Output +------ + ┌─ PRE-FLIGHT ESTIMATE ──────────────────────────────────────────┐ + │ Stimuli: 2560 Features: 200,704 Timebins: 1 │ + │ Activation: 1.91 GB (×6 overhead → 11.47 GB estimated) │ + │ Available RAM: 13.6 GB → OK │ + └────────────────────────────────────────────────────────────────┘ + [scoring runs...] 
+ ┌─ ACTUAL USAGE ─────────────────────────────────────────────────┐ + │ Baseline RSS: 1.2 GB │ + │ Peak RSS: 4.7 GB (Δ +3.5 GB) │ + │ Final RSS: 2.1 GB (Δ +0.9 GB) │ + │ Estimated: 11.5 GB → estimate was ACCURATE (1.2×) │ + └────────────────────────────────────────────────────────────────┘ +""" + +import os +import sys +import threading +import time +import argparse +import logging + +import psutil + +# --------------------------------------------------------------------------- +# Resolve local repos so the script works without installation +# --------------------------------------------------------------------------- +_script_dir = os.path.dirname(os.path.abspath(__file__)) +_vision_root = os.path.dirname(_script_dir) +_core_root = os.path.join(os.path.dirname(_vision_root), 'core') +for _p in [_vision_root, _core_root]: + if _p not in sys.path: + sys.path.insert(0, _p) + +from brainscore_vision import load_model, load_benchmark +from brainscore_vision.benchmark_helpers.memory import preallocate_memory +from brainscore_core.benchmarks import score_benchmark + +logging.basicConfig(level=logging.WARNING) + +_RESET = '\033[0m' +_BOLD = '\033[1m' +_GREEN = '\033[32m' +_YELLOW = '\033[33m' +_RED = '\033[31m' +_CYAN = '\033[36m' + + +# --------------------------------------------------------------------------- +# Peak RSS monitor (background thread) +# --------------------------------------------------------------------------- + +class _PeakMonitor: + """Polls process RSS every `interval` seconds and records the peak.""" + + def __init__(self, interval: float = 0.5): + self._proc = psutil.Process(os.getpid()) + self._interval = interval + self._peak = self._proc.memory_info().rss + self._stop = threading.Event() + self._thread = threading.Thread(target=self._run, daemon=True) + + def start(self): + self._thread.start() + return self + + def stop(self) -> int: + self._stop.set() + self._thread.join() + return self._peak + + def _run(self): + while not 
# ---------------------------------------------------------------------------
# Formatting helpers
# ---------------------------------------------------------------------------

def _gb(n_bytes: int) -> str:
    """Format a byte count as gibibytes with two decimals, e.g. ``'1.50 GB'``."""
    return f"{n_bytes / (1024 ** 3):.2f} GB"


def _ratio_label(estimate_gb: float, actual_delta_gb: float) -> str:
    """Return a coloured verdict comparing the estimate with the measured delta.

    ``estimate / actual >= 0.8`` is reported as accurate (green), ``>= 0.4`` as
    a moderate under-estimate (yellow), anything lower as a large
    under-estimate (red). When either value is unusable the label says the
    estimate is unavailable instead of dividing.
    """
    if actual_delta_gb <= 0:
        return f"{_GREEN}estimate unavailable (no measurable delta){_RESET}"
    # `not (x > 0)` is also true for NaN, which the caller passes when the
    # pre-flight probe was skipped; a zero estimate would divide by zero below.
    if not estimate_gb > 0:
        return f"{_YELLOW}estimate unavailable (no pre-flight estimate){_RESET}"

    ratio = estimate_gb / actual_delta_gb
    if ratio >= 0.8:
        colour = _GREEN
        verdict = f"estimate was ACCURATE ({ratio:.1f}×)"
    else:
        colour = _YELLOW if ratio >= 0.4 else _RED
        verdict = f"estimate was UNDER by {actual_delta_gb / estimate_gb:.1f}×"
    return f"{colour}{verdict}{_RESET}"


def _box(title: str, lines: list[str], width: int = 66) -> str:
    """Render *lines* inside a box-drawing frame ``width + 2`` columns wide.

    Padding is computed from the visible text, so lines containing ANSI colour
    codes keep the right-hand border aligned with the top and bottom edges.
    """
    import re  # local import: the visible part of this script does not import `re`

    ansi = re.compile(r'\x1b\[[0-9;]*m')
    top = f"┌─ {_BOLD}{title}{_RESET} " + "─" * (width - len(title) - 3) + "┐"
    bottom = "└" + "─" * width + "┘"
    inner = width - 1  # columns between the leading "│ " and the closing "│"
    rows = []
    for line in lines:
        pad = max(0, inner - len(ansi.sub('', line)))
        rows.append(f"│ {line}{' ' * pad}│")
    return "\n".join([top, *rows, bottom])
def main():
    """Print a pre-flight memory estimate for one (model, benchmark) pair and,
    unless ``--skip-score`` is given, score it while recording peak RSS.

    Exits with status 1 when scoring raises ``AssertionError`` (after printing
    stale-cache diagnostics) or ``MemoryError``.
    """
    parser = argparse.ArgumentParser(description="Memory flight report for a brainscore scoring run.")
    parser.add_argument("model_identifier")
    parser.add_argument("benchmark_identifier")
    parser.add_argument("--skip-score", action="store_true",
                        help="Only run the pre-flight estimate, skip the actual benchmark.")
    args = parser.parse_args()

    proc = psutil.Process(os.getpid())

    # ------------------------------------------------------------------ #
    # 1. Load model + benchmark                                           #
    # ------------------------------------------------------------------ #
    print(f"\n{_CYAN}Loading model '{args.model_identifier}'...{_RESET}")
    model = load_model(args.model_identifier)

    print(f"{_CYAN}Loading benchmark '{args.benchmark_identifier}'...{_RESET}\n")
    benchmark = load_benchmark(args.benchmark_identifier)

    # ------------------------------------------------------------------ #
    # 2. Pre-flight estimate                                              #
    # ------------------------------------------------------------------ #
    print(f"{_BOLD}Running pre-flight probe (1 stimulus)...{_RESET}")
    try:
        estimate = preallocate_memory(model, benchmark, raise_if_oom=False)
    except TypeError as e:
        print(f"{_YELLOW}Pre-flight skipped (benchmark type not supported): {e}{_RESET}\n")
        estimate = None

    if estimate is not None:
        ok_or_oom = f"{_RED}OOM LIKELY{_RESET}" if estimate.will_oom else f"{_GREEN}OK{_RESET}"
        # Describe the formula that produced the total: preflight_check.py
        # treats a non-None fixed_benchmark_cost_gb as "calibrated" and only
        # falls back to the ×6 multiplier otherwise.
        fixed_cost = getattr(estimate, 'fixed_benchmark_cost_gb', None)
        if fixed_cost is not None:
            formula = f"+ {fixed_cost:.2f} GB fixed cost"
        else:
            formula = "×6 overhead"
        preflight_lines = [
            f"Stimuli: {estimate.num_stimuli:>6} Features: {estimate.num_features:>7,} Timebins: {estimate.num_timebins}",
            f"Activation: {estimate.activation_gb:.2f} GB "
            f"({formula} → {estimate.total_estimated_gb:.2f} GB estimated)",
            f"Available RAM: {estimate.available_gb:.1f} GB → {ok_or_oom}",
        ]
        print(_box("PRE-FLIGHT ESTIMATE", preflight_lines))
        print()

        if estimate.will_oom and not args.skip_score:
            print(f"{_RED}OOM predicted — proceeding anyway to measure actual usage.{_RESET}\n")

    if args.skip_score:
        return

    # ------------------------------------------------------------------ #
    # 3. Score (with RSS monitoring)                                      #
    # ------------------------------------------------------------------ #
    baseline_rss = proc.memory_info().rss
    print(f"{_CYAN}Baseline RSS: {_gb(baseline_rss)}{_RESET}")
    print(f"{_BOLD}Scoring...{_RESET} (this may take a while)\n")

    monitor = _PeakMonitor(interval=0.5).start()
    t0 = time.time()

    try:
        score = benchmark(model)
        elapsed = time.time() - t0
        peak_rss = monitor.stop()
        final_rss = proc.memory_info().rss
    except AssertionError as e:
        monitor.stop()
        elapsed = time.time() - t0
        print(f"\n{_RED}AssertionError in attach_stimulus_set_meta after {elapsed:.1f}s{_RESET}")
        if str(e):
            print(f" {e}")
        print("This usually means the activations cache has stale paths.")
        print()
        # Print diagnostic: which paths are mismatching. Best-effort only —
        # it pokes at private attributes, so any failure is reported and skipped.
        ss = None
        try:
            from brainscore_vision.model_helpers.activations.core import lstrip_local
            from brainscore_vision.benchmark_helpers.screen import place_on_screen
            stimulus_set = benchmark._assembly.stimulus_set
            ss = place_on_screen(stimulus_set, target_visual_degrees=model.visual_degrees(),
                                 source_visual_degrees=benchmark._visual_degrees)
            expected_paths = [lstrip_local(str(ss.get_stimulus(sid))) for sid in ss['stimulus_id'].values[:3]]
            print(f"{_CYAN}Expected paths (first 3):{_RESET}")
            for p in expected_paths:
                print(f" {p}")
            # Show what fresh _from_paths returns for comparison
            _am = model.activations_model
            lm = model.layer_model._layer_model
            layer = list(dict.items(lm.region_layer_map))[0][1]
            layer = layer if isinstance(layer, str) else layer[0]
            dummy = _am._extractor._from_paths([str(ss.get_stimulus(ss['stimulus_id'].values[0]))], layers=[layer])
            got_paths = [lstrip_local(p) for p in dummy['stimulus_path'].values[:3]]
            print(f"{_CYAN}Fresh _from_paths result paths (first 3):{_RESET}")
            for p in got_paths:
                print(f" {p}")
        except Exception as diag_err:
            print(f"(diagnostic failed: {diag_err})")
        print()
        print(f"Fix: delete the stale cache entry and re-run:")
        cache_dir = os.path.expanduser(
            "~/.result_caching/brainscore_vision.model_helpers.activations.core"
            ".ActivationsExtractorHelper._from_paths_stored"
        )
        model_id = getattr(model, 'identifier', args.model_identifier)
        stimuli_id = getattr(ss, 'identifier', None) if ss is not None else None
        if stimuli_id is not None:
            cache_file = (
                f"identifier={model_id},"
                f"stimuli_identifier={stimuli_id},"
                f"number_of_trials=1,require_variance=False.pkl"
            )
            print(f" rm '{os.path.join(cache_dir, cache_file)}'")
        else:
            # Stimulus set unknown: fall back to a glob over this model's
            # entries (left unquoted so the shell expands it).
            print(f" rm {cache_dir}/identifier={model_id},stimuli_identifier=*.pkl")
        sys.exit(1)

    except MemoryError as e:
        elapsed = time.time() - t0
        # Use the monitor's recorded peak; RSS sampled after the failed
        # allocation has usually dropped again and would under-report it.
        peak_rss = max(monitor.stop(), proc.memory_info().rss)
        print(f"\n{_RED}MemoryError after {elapsed:.1f}s:{_RESET} {e}")
        print(f"Peak RSS before crash: {_gb(peak_rss)} (Δ +{_gb(peak_rss - baseline_rss)})\n")
        sys.exit(1)

    # ------------------------------------------------------------------ #
    # 4. Report                                                           #
    # ------------------------------------------------------------------ #
    delta_peak = peak_rss - baseline_rss
    delta_final = final_rss - baseline_rss

    if estimate is not None:
        est_gb = estimate.total_estimated_gb
        estimated_line = f"Estimated: {est_gb:.2f} GB → {_ratio_label(est_gb, delta_peak / (1024**3))}"
    else:
        # No probe result: avoid printing "nan GB" and a meaningless ratio.
        estimated_line = "Estimated: n/a (pre-flight skipped)"

    actual_lines = [
        f"Baseline RSS: {_gb(baseline_rss)}",
        f"Peak RSS: {_gb(peak_rss)} (Δ +{_gb(delta_peak)})",
        f"Final RSS: {_gb(final_rss)} (Δ +{_gb(delta_final)})",
        estimated_line,
        f"Elapsed: {elapsed:.1f}s",
        f"Score: {float(score):.4f}",
    ]
    print(_box("ACTUAL USAGE", actual_lines))
    print()


if __name__ == '__main__':
    main()
Loads the calibration table from ~/.brainscore/benchmark_costs.json + (produced by: python scripts/mem_profile_suite.py --calibrate) + +2. Runs a 1-stimulus forward pass through the model (the "probe") to measure + the model's actual feature count for this benchmark's region/layer. + +3. Estimates total RAM needed: + total = activation_gb + fixed_benchmark_cost_gb (if benchmark is calibrated) + total = activation_gb × 6 (fallback if not calibrated) + + where: + activation_gb = stimuli × features × timebins × 4 bytes + fixed_benchmark_cost = benchmark's model-independent overhead + (regression matrices, xarray, CV buffers) + — constant regardless of which model you run + +4. Compares the estimate against available RAM and reports OK or OOM LIKELY. + +Optionally (--score) runs the full benchmark and compares the estimate to +the actual peak RSS delta, so you can validate the calibration on this machine. + +IMPORTANT: Calibrate on the same machine you score on. The fixed_benchmark_cost +is environment-specific (Linux EC2 numbers will differ from macOS). 
_RESET = '\033[0m'
_BOLD = '\033[1m'
_GREEN = '\033[32m'
_YELLOW = '\033[33m'
_RED = '\033[31m'
_CYAN = '\033[36m'
_DIM = '\033[2m'


def _c(text, colour):
    """Wrap *text* in the ANSI *colour* code followed by a reset."""
    return f"{colour}{text}{_RESET}"


def _gb(n_bytes):
    """Format a byte count as gibibytes with two decimals."""
    return f"{n_bytes / (1024 ** 3):.2f} GB"


def _divider(char='─', width=66):
    """Print a horizontal rule made of *char* repeated *width* times."""
    print(char * width)


# ---------------------------------------------------------------------------
# Peak RSS monitor
# ---------------------------------------------------------------------------
class _PeakMonitor:
    """Poll this process's RSS every `interval` seconds on a daemon thread and
    remember the highest value seen."""

    def __init__(self, interval=0.5):
        self._proc = psutil.Process(os.getpid())
        self._interval = interval  # previously accepted but ignored
        self._peak = self._proc.memory_info().rss
        self._stop = threading.Event()
        self._thread = threading.Thread(target=self._run, daemon=True)

    def start(self):
        """Start polling; returns ``self`` so construction and start can be chained."""
        self._thread.start()
        return self

    def stop(self):
        """Stop polling and return the peak RSS in bytes."""
        self._stop.set()
        # join() raises RuntimeError on a thread that was never started.
        if self._thread.is_alive():
            self._thread.join()
        return self._peak

    def _run(self):
        while not self._stop.is_set():
            try:
                rss = self._proc.memory_info().rss
                if rss > self._peak:
                    self._peak = rss
            except psutil.NoSuchProcess:
                break
            # Event.wait doubles as an interruptible sleep: stop() wakes it early.
            self._stop.wait(self._interval)
load_model(args.model_id) + print(f" {_c('OK', _GREEN)} ({time.time()-t0:.1f}s)") + + print(f" Loading benchmark '{args.benchmark_id}'...", end='', flush=True) + t0 = time.time() + benchmark = load_benchmark(args.benchmark_id) + print(f" {_c('OK', _GREEN)} ({time.time()-t0:.1f}s)") + + # ── Pre-flight probe ───────────────────────────────────────────────── + print() + _divider() + print(f"\n {_c('PRE-FLIGHT PROBE', _BOLD)} (1-stimulus forward pass)\n") + + t0 = time.time() + try: + est = preallocate_memory(model, benchmark, raise_if_oom=False) + except TypeError as e: + print(f" {_c('SKIPPED', _YELLOW)}: {e}") + return + + probe_elapsed = time.time() - t0 + + if est is None: + print(f" {_c('SKIPPED', _YELLOW)} (BRAINSCORE_SKIP_MEMORY_CHECK set)") + return + + print(f" {'Stimuli':<22}: {est.num_stimuli:,}") + print(f" {'Features (neuroid)':<22}: {est.num_features:,}") + print(f" {'Timebins':<22}: {est.num_timebins}") + print(f" {'Activation array':<22}: {est.activation_gb:.4f} GB " + f"{_c(f'({est.num_stimuli} × {est.num_features:,} × {est.num_timebins} × 4B)', _DIM)}") + print() + + if est.fixed_benchmark_cost_gb is not None: + print(f" {_c('Formula', _BOLD)}: {_c('CALIBRATED', _GREEN)}") + print(f" {'Activation':<22}: {est.activation_gb:.4f} GB") + print(f" {'Fixed benchmark cost':<22}: {est.fixed_benchmark_cost_gb:.4f} GB " + f"{_c('← model-independent overhead from calibration table', _DIM)}") + print(f" {'Total estimated':<22}: {_c(f'{est.total_estimated_gb:.4f} GB', _CYAN)} " + f"{_c(f'({est.activation_gb:.4f} + {est.fixed_benchmark_cost_gb:.4f})', _DIM)}") + else: + print(f" {_c('Formula', _BOLD)}: {_c('FALLBACK (×6)', _YELLOW)} " + f"{_c('← benchmark not in calibration table', _DIM)}") + print(f" {'Activation':<22}: {est.activation_gb:.4f} GB") + print(f" {'Total estimated':<22}: {_c(f'{est.total_estimated_gb:.4f} GB', _CYAN)} " + f"{_c(f'({est.activation_gb:.4f} × 6)', _DIM)}") + + print() + avail_col = _RED if est.will_oom else _GREEN + verdict = _c('OOM 
LIKELY', _RED) if est.will_oom else _c('OK', _GREEN) + print(f" {'Available RAM':<22}: {_c(f'{est.available_gb:.2f} GB', avail_col)}") + print(f" {'Verdict':<22}: {verdict}") + print(f" {'Probe elapsed':<22}: {probe_elapsed:.1f}s") + + if not args.score: + print() + _divider() + print(f"\n {_c('Tip:', _DIM)} run with --score to also execute the full benchmark") + print(f" and compare the estimate against actual peak RSS.\n") + return + + # ── Full benchmark run ─────────────────────────────────────────────── + print() + _divider() + print(f"\n {_c('FULL BENCHMARK RUN', _BOLD)}\n") + + proc = psutil.Process(os.getpid()) + baseline_rss = proc.memory_info().rss + print(f" Baseline RSS: {_gb(baseline_rss)} " + f"{_c('← everything already in RAM (model weights, Python, etc.)', _DIM)}") + print(f" Scoring... (this may take a while)\n") + + # Ticker thread + _ticker_stop = threading.Event() + def _ticker(): + t_start = time.time() + while not _ticker_stop.wait(30): + elapsed = time.time() - t_start + rss = proc.memory_info().rss + print(f" {_c('…', _DIM)} still scoring {elapsed/60:.1f} min RSS {_gb(rss)}", + flush=True) + ticker_thread = threading.Thread(target=_ticker, daemon=True) + ticker_thread.start() + + monitor = _PeakMonitor().start() + t_score = time.time() + score_val = None + try: + score_val = benchmark(model) + except MemoryError as e: + print(f"\n {_c('MemoryError', _RED)}: {e}") + except Exception as e: + print(f"\n {_c('ERROR', _RED)}: {e}") + finally: + _ticker_stop.set() + ticker_thread.join() + + score_elapsed = time.time() - t_score + peak_rss = monitor.stop() + final_rss = proc.memory_info().rss + actual_delta_gb = (peak_rss - baseline_rss) / (1024 ** 3) + + # ── Comparison ─────────────────────────────────────────────────────── + print() + _divider() + print(f"\n {_c('RESULT', _BOLD)}\n") + + print(f" {'Baseline RSS':<24}: {_gb(baseline_rss)}") + print(f" {'Peak RSS':<24}: {_gb(peak_rss)}") + print(f" {'Δ (peak − baseline)':<24}: 
{_c(f'+{actual_delta_gb:.4f} GB', _CYAN)} " + f"{_c('← actual RAM the benchmark consumed', _DIM)}") + print(f" {'Estimated total':<24}: {_c(f'{est.total_estimated_gb:.4f} GB', _CYAN)}") + print() + + if actual_delta_gb > 0.01: + error_gb = est.total_estimated_gb - actual_delta_gb + error_pct = (error_gb / actual_delta_gb) * 100 + if error_gb >= 0: + accuracy = _c(f'OVER by {error_gb:.2f} GB ({error_pct:.1f}%) ← conservative, safe', _GREEN) + elif abs(error_pct) <= 15: + accuracy = _c(f'UNDER by {abs(error_gb):.2f} GB ({abs(error_pct):.1f}%) ← within 15%, acceptable', _YELLOW) + else: + accuracy = _c(f'UNDER by {abs(error_gb):.2f} GB ({abs(error_pct):.1f}%) ← significant miss', _RED) + print(f" {'Accuracy':<24}: {accuracy}") + + formula = 'calibrated' if est.fixed_benchmark_cost_gb is not None else '×6 fallback' + print(f" {'Formula used':<24}: {formula}") + + if score_val is not None: + print(f" {'Score':<24}: {float(score_val):.4f}") + print(f" {'Elapsed':<24}: {score_elapsed:.0f}s") + print() + _divider('═') + print() + + +if __name__ == '__main__': + main() diff --git a/scripts/validation.py b/scripts/validation.py new file mode 100644 index 0000000000..1a2c1fe442 --- /dev/null +++ b/scripts/validation.py @@ -0,0 +1,385 @@ +""" +Pre-flight Estimator Validation +================================ +Runs a 3-model × 4-benchmark grid to validate the pre-flight memory estimator +across all formula types: PLS, Ridge (calibrated), RidgeCV (calibrated), and RDM. + +One benchmark is selected per formula class so that every code path in +preallocate_memory is exercised: + + FreemanZiemba2013.V1-pls → PLS (activation × 7 + fixed_cost, warning printed) + Papale2025.IT-ridge → Ridge calibrated (activation + calibrated cost) + Gifford2022.IT-ridgecv → RidgeCV calibrated (activation + calibrated cost) + Allen2022_fmri.IT-rdm → RDM (activation + 2×n_stimuli²×4B, model-independent) + +For each (model, benchmark) pair it: + 1. 
Runs the pre-flight probe → estimates total GB via the appropriate formula + 2. Runs the full benchmark → measures actual peak RSS delta + 3. Compares estimate to actual and reports over/under by how much + +Results are written to `validation_results.jsonl` after every pair so a crash +does not lose completed work. Re-running will overwrite the file. + +Usage +----- + python scripts/validation.py + + # Skip the actual benchmark runs (probe only — fast) + python scripts/validation.py --probe-only + + # Write results to a custom path + python scripts/validation.py --output /tmp/val.jsonl +""" + +import argparse +import json +import logging +import os +import sys +import threading +import time + +import psutil + +# --------------------------------------------------------------------------- +# Resolve local repos so the script works without installation +# --------------------------------------------------------------------------- +_script_dir = os.path.dirname(os.path.abspath(__file__)) +_vision_root = os.path.dirname(_script_dir) +_core_root = os.path.join(os.path.dirname(_vision_root), 'core') +for _p in [_vision_root, _core_root]: + if _p not in sys.path: + sys.path.insert(0, _p) + +logging.basicConfig(level=logging.WARNING) + +# --------------------------------------------------------------------------- +# Grid definition +# --------------------------------------------------------------------------- +MODELS = [ + 'alexnet', + 'resnet50_tutorial', + 'vit_large_patch14_clip_224:openai_ft_in1k', +] + +BENCHMARKS = [ + 'FreemanZiemba2013.V1-pls', # PLS — activation × 7 + fixed_cost (approximate, warning) + 'Papale2025.IT-ridge', # Ridge — activation + calibrated cost + 'Gifford2022.IT-ridgecv', # RidgeCV — activation + calibrated cost + 'Allen2022_fmri.IT-rdm', # RDM — activation + 2×n_stimuli²×4B (model-independent) +] + +# --------------------------------------------------------------------------- +# ANSI colours +# 
# ---------------------------------------------------------------------------
_RESET = '\033[0m'
_BOLD = '\033[1m'
_DIM = '\033[2m'
_GREEN = '\033[32m'
_YELLOW = '\033[33m'
_RED = '\033[31m'
_CYAN = '\033[36m'


def _c(text, colour):
    """Wrap *text* in the given ANSI colour code followed by a reset code."""
    return f"{colour}{text}{_RESET}"


def _gb(n_bytes):
    """Format a byte count as gibibytes with three decimals, e.g. ``'1.000 GB'``."""
    return f"{n_bytes / (1024 ** 3):.3f} GB"


# ---------------------------------------------------------------------------
# Peak RSS monitor
# ---------------------------------------------------------------------------
class _PeakMonitor:
    """Background sampler that records the highest RSS seen for this process.

    Usage: ``monitor = _PeakMonitor().start()`` ... ``peak = monitor.stop()``.
    """

    def __init__(self, interval=0.5):
        """Prepare the sampler thread.

        :param interval: seconds to wait between two RSS samples.
        """
        self._proc = psutil.Process(os.getpid())
        # Seed the peak with the current RSS so stop() never reports less
        # than what was resident when the monitor was created.
        self._peak = self._proc.memory_info().rss
        # Previously the argument was accepted but ignored (the loop always
        # waited 0.5s); keep it so callers can actually tune the sampling rate.
        self._interval = interval
        self._stop = threading.Event()
        self._thread = threading.Thread(target=self._run, daemon=True)

    def start(self):
        """Start sampling and return ``self`` so the call can be chained."""
        self._thread.start()
        return self

    def stop(self) -> int:
        """Stop sampling, wait for the thread to finish, return peak RSS in bytes."""
        self._stop.set()
        self._thread.join()
        return self._peak

    def _run(self):
        """Sampling loop: poll RSS until the stop event is set."""
        while not self._stop.is_set():
            try:
                rss = self._proc.memory_info().rss
                if rss > self._peak:
                    self._peak = rss
            except psutil.NoSuchProcess:
                break
            # Event.wait doubles as an interruptible sleep: stop() wakes it
            # immediately instead of waiting out the full interval.
            self._stop.wait(self._interval)


# ---------------------------------------------------------------------------
# Formatting helpers
# ---------------------------------------------------------------------------
def _divider(char='─', width=70):
    """Print a horizontal rule made of *width* repetitions of *char*."""
    print(char * width)


def _accuracy_label(estimated_gb: float, actual_gb: float):
    """Return a coloured accuracy string.

    Over-estimates are green, under-estimates within 15% are yellow, larger
    under-estimates are red. An actual delta of at most 0.01 GB is reported
    as too small to measure (this also avoids dividing by ~zero).
    """
    if actual_gb <= 0.01:
        return _c('actual delta too small to measure', _DIM)
    error_gb = estimated_gb - actual_gb
    error_pct = (error_gb / actual_gb) * 100
    if error_gb >= 0:
        return _c(f'OVER by {error_gb:.2f} GB ({error_pct:.1f}%) ← conservative, safe', _GREEN)
    elif abs(error_pct) <= 15:
        return _c(f'UNDER by {abs(error_gb):.2f} GB ({abs(error_pct):.1f}%) ← within 15%, acceptable', _YELLOW)
    else:
        return _c(f'UNDER by {abs(error_gb):.2f} GB ({abs(error_pct):.1f}%) ← significant miss', _RED)


def _write_result(path: str, record: dict):
    """Append one JSON record to the output file (crash-safe)."""
    with open(path, 'a') as f:
        f.write(json.dumps(record) + '\n')


# ---------------------------------------------------------------------------
# Single-pair validation
# ---------------------------------------------------------------------------
def run_pair(model_id: str, benchmark_id: str, output_path: str, probe_only: bool) -> dict:
    """Validate the memory estimate for one (model, benchmark) pair.

    Loads the model and benchmark, runs the pre-flight probe and, unless
    *probe_only* is set, runs the full benchmark while sampling peak RSS.
    The resulting record is appended to *output_path* on every exit path.

    :param model_id: identifier passed to ``load_model``.
    :param benchmark_id: identifier passed to ``load_benchmark``.
    :param output_path: JSONL file the record is appended to.
    :param probe_only: if true, stop after the probe and skip scoring.
    :return: the record dict; its ``'status'`` is one of ``'ok'``, ``'error'``,
        ``'skipped'`` or ``'probe_only'``.
    """
    # Imported lazily so the module can be imported without the package
    # being loaded up front.
    from brainscore_vision import load_model, load_benchmark
    from brainscore_vision.benchmark_helpers.memory import preallocate_memory

    record = {
        'model': model_id,
        'benchmark': benchmark_id,
        'status': 'pending',
    }

    proc = psutil.Process(os.getpid())

    # ── Load model ──────────────────────────────────────────────────────
    print(f"\n Loading model {_c(model_id, _CYAN)} ...", end='', flush=True)
    t0 = time.time()
    try:
        model = load_model(model_id)
    except Exception as e:
        # Any load failure is recorded and the pair is abandoned so the
        # rest of the grid can still run.
        print(f" {_c('FAILED', _RED)}: {e}")
        record.update({'status': 'error', 'error': f'load_model: {e}'})
        _write_result(output_path, record)
        return record
    print(f" {_c('OK', _GREEN)} ({time.time()-t0:.1f}s)")

    # ── Load benchmark ───────────────────────────────────────────────────
    print(f" Loading benchmark {_c(benchmark_id, _CYAN)} ...", end='', flush=True)
    t0 = time.time()
    try:
        benchmark = load_benchmark(benchmark_id)
    except Exception as e:
        print(f" {_c('FAILED', _RED)}: {e}")
        record.update({'status': 'error', 'error': f'load_benchmark: {e}'})
        _write_result(output_path, record)
        return record
    print(f" {_c('OK', _GREEN)} ({time.time()-t0:.1f}s)")

    # ── Pre-flight probe ─────────────────────────────────────────────────
    print(f"\n {_c('PRE-FLIGHT PROBE', _BOLD)}")
    t0 = time.time()
    try:
        est = preallocate_memory(model, benchmark, raise_if_oom=False)
    except TypeError as e:
        # A TypeError from the probe is recorded as a skip, not a failure.
        print(f" {_c('SKIPPED', _YELLOW)}: {e}")
        record.update({'status': 'skipped', 'skip_reason': str(e)})
        _write_result(output_path, record)
        return record
    probe_elapsed = time.time() - t0

    if est is None:
        print(f" {_c('SKIPPED', _YELLOW)} (BRAINSCORE_SKIP_MEMORY_CHECK set)")
        record.update({'status': 'skipped', 'skip_reason': 'BRAINSCORE_SKIP_MEMORY_CHECK'})
        _write_result(output_path, record)
        return record

    formula = 'calibrated' if est.fixed_benchmark_cost_gb is not None else f'x{6}_fallback'
    print(f" {'Stimuli':<24}: {est.num_stimuli:,}")
    print(f" {'Features':<24}: {est.num_features:,}")
    print(f" {'Timebins':<24}: {est.num_timebins}")
    print(f" {'Activation':<24}: {est.activation_gb:.4f} GB "
          f"{_c(f'({est.num_stimuli:,} × {est.num_features:,} × {est.num_timebins} × 4B)', _DIM)}")
    if est.fixed_benchmark_cost_gb is not None:
        print(f" {'Fixed benchmark cost':<24}: {est.fixed_benchmark_cost_gb:.4f} GB "
              f"{_c('← from calibration table', _DIM)}")
        print(f" {'Estimated total':<24}: {_c(f'{est.total_estimated_gb:.4f} GB', _CYAN)} "
              f"{_c(f'({est.activation_gb:.4f} + {est.fixed_benchmark_cost_gb:.4f})', _DIM)}")
    else:
        print(f" {'Estimated total':<24}: {_c(f'{est.total_estimated_gb:.4f} GB', _CYAN)} "
              f"{_c(f'({est.activation_gb:.4f} × 6 fallback)', _DIM)}")
    print(f" {'Available RAM':<24}: {est.available_gb:.2f} GB")
    print(f" {'OOM predicted':<24}: {_c('YES', _RED) if est.will_oom else _c('NO', _GREEN)}")
    print(f" {'Probe elapsed':<24}: {probe_elapsed:.1f}s")

    record.update({
        'num_stimuli': est.num_stimuli,
        'num_features': est.num_features,
        'num_timebins': est.num_timebins,
        'activation_gb': round(est.activation_gb, 6),
        'fixed_benchmark_cost_gb': round(est.fixed_benchmark_cost_gb, 6) if est.fixed_benchmark_cost_gb is not None else None,
        'estimated_total_gb': round(est.total_estimated_gb, 6),
        'available_gb': round(est.available_gb, 2),
        'oom_predicted': est.will_oom,
        'formula': formula,
        'probe_elapsed_s': round(probe_elapsed, 1),
    })

    if probe_only:
        record['status'] = 'probe_only'
        _write_result(output_path, record)
        return record

    # ── Full benchmark run ───────────────────────────────────────────────
    print(f"\n {_c('FULL BENCHMARK RUN', _BOLD)}")
    baseline_rss = proc.memory_info().rss
    print(f" Baseline RSS: {_gb(baseline_rss)} "
          f"{_c('← model weights + Python already in RAM', _DIM)}")
    print(f" Scoring... (this may take a while)", flush=True)

    # Ticker thread — prints a heartbeat every 60s so we know it's alive
    _ticker_stop = threading.Event()

    def _ticker():
        t_start = time.time()
        # wait() returns False on timeout, True once the event is set.
        while not _ticker_stop.wait(60):
            elapsed = time.time() - t_start
            rss = proc.memory_info().rss
            print(f" {_c('…', _DIM)} still scoring {elapsed/60:.1f} min RSS {_gb(rss)}", flush=True)

    ticker = threading.Thread(target=_ticker, daemon=True)
    ticker.start()

    monitor = _PeakMonitor().start()
    t_score = time.time()
    score_val = None
    score_error = None
    try:
        score_val = benchmark(model)
    except MemoryError as e:
        score_error = f'MemoryError: {e}'
        print(f"\n {_c('MemoryError', _RED)}: {e}")
    except Exception as e:
        # Scoring failures are recorded rather than raised so the remaining
        # pairs of the grid still run.
        score_error = f'{type(e).__name__}: {e}'
        print(f"\n {_c('ERROR', _RED)} ({type(e).__name__}): {e}")
    finally:
        _ticker_stop.set()
        ticker.join()

    score_elapsed = time.time() - t_score
    peak_rss = monitor.stop()
    actual_delta_gb = (peak_rss - baseline_rss) / (1024 ** 3)

    # ── Comparison ───────────────────────────────────────────────────────
    print(f"\n {_c('COMPARISON', _BOLD)}")
    print(f" {'Baseline RSS':<24}: {_gb(baseline_rss)}")
    print(f" {'Peak RSS':<24}: {_gb(peak_rss)}")
    print(f" {'Actual delta':<24}: {_c(f'+{actual_delta_gb:.4f} GB', _CYAN)} "
          f"{_c('← RAM the benchmark consumed', _DIM)}")
    print(f" {'Estimated total':<24}: {_c(f'{est.total_estimated_gb:.4f} GB', _CYAN)}")
    print(f" {'Accuracy':<24}: {_accuracy_label(est.total_estimated_gb, actual_delta_gb)}")
    print(f" {'Score elapsed':<24}: {score_elapsed:.0f}s")
    if score_val is not None:
        print(f" {'Score':<24}: {float(score_val):.4f}")

    record.update({
        'baseline_rss_gb': round(baseline_rss / (1024 ** 3), 4),
        'peak_rss_gb': round(peak_rss / (1024 ** 3), 4),
        'actual_delta_gb': round(actual_delta_gb, 4),
        'error_gb': round(est.total_estimated_gb - actual_delta_gb, 4),
        # Same 0.01 GB floor as _accuracy_label: below it the percentage is
        # meaningless, so None is stored instead.
        'error_pct': round((est.total_estimated_gb - actual_delta_gb) / actual_delta_gb * 100, 1)
        if actual_delta_gb > 0.01 else None,
        'score_elapsed_s': round(score_elapsed, 0),
        'score': float(score_val) if score_val is not None else None,
        'score_error': score_error,
        'status': 'error' if score_error else 'ok',
    })
    _write_result(output_path, record)
    return record


# ---------------------------------------------------------------------------
# Summary table
# ---------------------------------------------------------------------------
def print_summary(results: list[dict]):
    """Print one table row per record produced by ``run_pair``.

    Records with status ``'ok'`` are labelled OVER / ~OK / MISS from their
    error fields; any other status is printed upper-cased. Missing numeric
    fields are rendered as ``'—'``.
    """
    print()
    _divider('═')
    print(f" {_c('VALIDATION SUMMARY', _BOLD)} ({len(results)} pairs)\n")

    header = f" {'Model':<48} {'Benchmark':<30} {'Est GB':>8} {'Act GB':>8} {'Err GB':>8} {'Err %':>7} Status"
    print(header)
    _divider()

    for r in results:
        model = r['model'][-46:]  # truncate long model names
        bm = r['benchmark']
        est = r.get('estimated_total_gb')
        act = r.get('actual_delta_gb')
        err = r.get('error_gb')
        pct = r.get('error_pct')
        status = r.get('status', '?')

        if status == 'ok':
            if err is not None and err >= 0:
                status_str = _c('OVER', _GREEN)
            elif pct is not None and abs(pct) <= 15:
                status_str = _c('~OK', _YELLOW)
            else:
                status_str = _c('MISS', _RED)
        elif status in ('skipped', 'skipped_oom', 'probe_only'):
            status_str = _c(status.upper(), _YELLOW)
        else:
            status_str = _c(status.upper(), _RED)

        est_s = f"{est:.3f}" if est is not None else '—'
        act_s = f"{act:.3f}" if act is not None else '—'
        err_s = f"{err:+.3f}" if err is not None else '—'
        pct_s = f"{pct:+.1f}%" if pct is not None else '—'

        print(f" {model:<48} {bm:<30} {est_s:>8} {act_s:>8} {err_s:>8} {pct_s:>7} {status_str}")

    _divider('═')
    print()


#
--------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- +def main(): + parser = argparse.ArgumentParser(description='Pre-flight estimator validation suite.') + parser.add_argument('--probe-only', action='store_true', + help='Only run the pre-flight probe; skip full benchmark scoring.') + parser.add_argument('--output', default=os.path.join(_script_dir, 'validation_results.jsonl'), + help='Path to write per-pair JSONL results (default: scripts/validation_results.jsonl)') + args = parser.parse_args() + + # Truncate output file at the start of a fresh run + open(args.output, 'w').close() + print(f"\n{_c('Results will be written incrementally to:', _DIM)} {args.output}\n") + + n_pairs = len(MODELS) * len(BENCHMARKS) + pair_idx = 0 + results = [] + + for model_id in MODELS: + for benchmark_id in BENCHMARKS: + pair_idx += 1 + print() + _divider('═') + print(f" {_c(f'PAIR {pair_idx}/{n_pairs}', _BOLD)} " + f"{_c(model_id, _CYAN)} × {_c(benchmark_id, _CYAN)}") + _divider('═') + + record = run_pair(model_id, benchmark_id, args.output, args.probe_only) + results.append(record) + + print_summary(results) + print(f"Full results written to: {args.output}\n") + + +if __name__ == '__main__': + main() + + diff --git a/tests/test_plugin_management/test_memory_precheck.py b/tests/test_plugin_management/test_memory_precheck.py new file mode 100644 index 0000000000..8e1bde6978 --- /dev/null +++ b/tests/test_plugin_management/test_memory_precheck.py @@ -0,0 +1,561 @@ +""" +Integration tests for the pre-flight memory check (preallocate_memory). + +Uses object.__new__ to bypass NeuralBenchmark.__init__ / timebins_from_assembly +so we can construct minimal benchmark fixtures without real S3 data. + +Model is mocked at the BrainModel level: look_at returns a tiny xarray +DataArray with a 'neuroid' dim so the probe can read sizes['neuroid']. 
+place_on_screen short-circuits when source == target visual degrees (no I/O). +""" + +import json +import os +import tempfile +import unittest +from unittest.mock import MagicMock, patch + +import numpy as np +import xarray as xr +import pytest + +from brainscore_core import Score +from brainscore_core.benchmarks import score_benchmark +from brainscore_vision.benchmark_helpers.memory import ( + MemoryEstimate, + _OVERHEAD_FACTOR, + _PLS_OVERHEAD_FACTOR, + _BYTES_PER_ELEMENT, + _DEFAULT_CALIBRATION_PATH, + preallocate_memory, + load_calibration, + save_calibration, +) +from brainscore_vision.benchmark_helpers.neural_common import ( + NeuralBenchmark, + TrainTestNeuralBenchmark, +) +from brainscore_vision.model_interface import BrainModel + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +_VISUAL_DEGREES = 8 # source == target so place_on_screen is a no-op + + +def _make_stimulus_set(n: int = 10): + """Minimal StimulusSet-like DataFrame with stimulus_id coordinate.""" + from brainscore_core.supported_data_standards.brainio.stimuli import StimulusSet + import pandas as pd + df = pd.DataFrame({'stimulus_id': [f'img{i:03d}' for i in range(n)], + 'image_file_name': [f'img{i:03d}.png' for i in range(n)]}) + ss = StimulusSet(df) + ss.identifier = 'test_stimulus_set' + ss.stimulus_paths = {row.stimulus_id: f'/tmp/{row.image_file_name}' + for _, row in df.iterrows()} + return ss + + +def _make_neural_benchmark(n_stimuli: int = 10, n_trials: int = 1, + timebins=None, region: str = 'IT') -> NeuralBenchmark: + """Construct a NeuralBenchmark without calling __init__.""" + bm = object.__new__(NeuralBenchmark) + bm._identifier = 'test-neural-benchmark' + bm._number_of_trials = n_trials + bm.timebins = timebins or [(70, 170)] + bm.region = region + bm._visual_degrees = _VISUAL_DEGREES + bm._ceiling_func = lambda: Score(0.8) + + ss = 
_make_stimulus_set(n_stimuli) + assembly = MagicMock() + assembly.stimulus_set = ss + bm._assembly = assembly + return bm + + +def _make_train_test_benchmark(n_train: int = 8, n_test: int = 4) -> TrainTestNeuralBenchmark: + """Construct a TrainTestNeuralBenchmark without calling __init__.""" + bm = object.__new__(TrainTestNeuralBenchmark) + bm._identifier = 'test-train-test-benchmark' + bm._number_of_trials = 1 + bm.timebins = [(70, 170)] + bm.region = 'IT' + bm._visual_degrees = _VISUAL_DEGREES + bm._ceiling_func = lambda: Score(0.8) + + train_assembly = MagicMock() + train_assembly.stimulus_set = _make_stimulus_set(n_train) + test_assembly = MagicMock() + test_assembly.stimulus_set = _make_stimulus_set(n_test) + bm.train_assembly = train_assembly + bm.test_assembly = test_assembly + return bm + + +def _make_model(num_features: int = 512) -> BrainModel: + """Mock BrainModel whose look_at returns a DataArray with neuroid dim.""" + model = MagicMock(spec=BrainModel) + model.visual_degrees.return_value = _VISUAL_DEGREES + + def _look_at(stimuli, number_of_trials=1): + n = len(stimuli) + data = np.zeros((n, num_features)) + return xr.DataArray( + data, + dims=['presentation', 'neuroid'], + coords={ + 'stimulus_id': ('presentation', stimuli['stimulus_id'].values), + 'neuroid_id': ('neuroid', np.arange(num_features)), + }, + ) + + model.look_at.side_effect = _look_at + model.activations_model = None # no LayerPCA + return model + + +# --------------------------------------------------------------------------- +# TestMemoryEstimateShape +# --------------------------------------------------------------------------- + +class TestMemoryEstimateShape(unittest.TestCase): + + def setUp(self): + self.bm = _make_neural_benchmark(n_stimuli=10) + self.model = _make_model(num_features=512) + + def test_estimate_fields(self): + with patch('psutil.virtual_memory') as mock_vm: + mock_vm.return_value.available = 32 * (1024 ** 3) + est = preallocate_memory(self.model, self.bm, 
raise_if_oom=False) + + self.assertEqual(est.num_stimuli, 10) + self.assertEqual(est.num_features, 512) + self.assertEqual(est.num_timebins, 1) + + def test_activation_gb_formula(self): + with patch('psutil.virtual_memory') as mock_vm: + mock_vm.return_value.available = 32 * (1024 ** 3) + est = preallocate_memory(self.model, self.bm, raise_if_oom=False) + + expected_bytes = 10 * 512 * 1 * _BYTES_PER_ELEMENT + expected_gb = expected_bytes / (1024 ** 3) + self.assertAlmostEqual(est.activation_gb, expected_gb, places=6) + + def test_total_estimated_gb(self): + with patch('psutil.virtual_memory') as mock_vm: + mock_vm.return_value.available = 32 * (1024 ** 3) + est = preallocate_memory(self.model, self.bm, raise_if_oom=False) + + self.assertAlmostEqual(est.total_estimated_gb, + est.activation_gb * _OVERHEAD_FACTOR, places=6) + + def test_available_gb_from_psutil(self): + available_bytes = 16 * (1024 ** 3) + with patch('psutil.virtual_memory') as mock_vm: + mock_vm.return_value.available = available_bytes + est = preallocate_memory(self.model, self.bm, raise_if_oom=False) + + self.assertAlmostEqual(est.available_gb, 16.0, places=3) + + +# --------------------------------------------------------------------------- +# TestOOMDetection +# --------------------------------------------------------------------------- + +class TestOOMDetection(unittest.TestCase): + + def _estimate(self, available_gb, num_features=1_000_000, n_stimuli=100): + bm = _make_neural_benchmark(n_stimuli=n_stimuli) + model = _make_model(num_features=num_features) + with patch('psutil.virtual_memory') as mock_vm: + mock_vm.return_value.available = int(available_gb * (1024 ** 3)) + return preallocate_memory(model, bm, raise_if_oom=False) + + def test_will_oom_true_when_over(self): + est = self._estimate(available_gb=0.001) + self.assertTrue(est.will_oom) + + def test_will_oom_false_when_under(self): + est = self._estimate(available_gb=1000) + self.assertFalse(est.will_oom) + + def 
test_raises_memory_error_when_raise_if_oom(self): + bm = _make_neural_benchmark(n_stimuli=100) + model = _make_model(num_features=1_000_000) + with patch('psutil.virtual_memory') as mock_vm: + mock_vm.return_value.available = 1 + with self.assertRaises(MemoryError): + preallocate_memory(model, bm, raise_if_oom=True) + + def test_no_raise_when_raise_if_oom_false(self): + bm = _make_neural_benchmark(n_stimuli=100) + model = _make_model(num_features=1_000_000) + with patch('psutil.virtual_memory') as mock_vm: + mock_vm.return_value.available = 1 + est = preallocate_memory(model, bm, raise_if_oom=False) + self.assertTrue(est.will_oom) + + +# --------------------------------------------------------------------------- +# TestProbeUsesOneStimulusOnly +# --------------------------------------------------------------------------- + +class TestProbeUsesOneStimulusOnly(unittest.TestCase): + + def test_look_at_called_with_one_stimulus(self): + bm = _make_neural_benchmark(n_stimuli=100) + model = _make_model() + with patch('psutil.virtual_memory') as mock_vm: + mock_vm.return_value.available = 32 * (1024 ** 3) + preallocate_memory(model, bm, raise_if_oom=False) + + call_args = model.look_at.call_args + stimuli_arg = call_args[0][0] + self.assertEqual(len(stimuli_arg), 1) + + def test_num_stimuli_reflects_full_benchmark_not_probe(self): + bm = _make_neural_benchmark(n_stimuli=42) + model = _make_model() + with patch('psutil.virtual_memory') as mock_vm: + mock_vm.return_value.available = 32 * (1024 ** 3) + est = preallocate_memory(model, bm, raise_if_oom=False) + + self.assertEqual(est.num_stimuli, 42) + + +# --------------------------------------------------------------------------- +# TestScoreBenchmarkAbortOnOOM +# --------------------------------------------------------------------------- + +class TestScoreBenchmarkAbortOnOOM(unittest.TestCase): + + def test_score_benchmark_aborts_before_calling_benchmark(self): + """score_benchmark should raise MemoryError before __call__ is 
invoked.""" + bm = _make_neural_benchmark(n_stimuli=100) + bm.__call__ = MagicMock(return_value=Score(0.5)) + model = _make_model(num_features=1_000_000) + + with patch('psutil.virtual_memory') as mock_vm: + mock_vm.return_value.available = 1 + with self.assertRaises(MemoryError): + score_benchmark(bm, model) + + bm.__call__.assert_not_called() + + def test_score_benchmark_calls_benchmark_when_ok(self): + bm = _make_neural_benchmark(n_stimuli=5) + score_val = Score(0.42) + score_val.attrs['ceiling'] = Score(1.0) + model = _make_model(num_features=10) + + with patch('psutil.virtual_memory') as mock_vm: + mock_vm.return_value.available = 32 * (1024 ** 3) + with patch.object(NeuralBenchmark, '__call__', return_value=score_val) as mock_call: + result = score_benchmark(bm, model) + + mock_call.assert_called_once_with(model) + self.assertEqual(float(result), 0.42) + + +# --------------------------------------------------------------------------- +# TestSkipEnvVar +# --------------------------------------------------------------------------- + +class TestSkipEnvVar(unittest.TestCase): + + def test_returns_none_when_env_var_set(self): + bm = _make_neural_benchmark() + model = _make_model() + with patch.dict(os.environ, {'BRAINSCORE_SKIP_MEMORY_CHECK': '1'}): + result = preallocate_memory(model, bm, raise_if_oom=True) + self.assertIsNone(result) + + def test_runs_normally_when_env_var_unset(self): + bm = _make_neural_benchmark() + model = _make_model() + with patch.dict(os.environ, {'BRAINSCORE_SKIP_MEMORY_CHECK': '0'}): + with patch('psutil.virtual_memory') as mock_vm: + mock_vm.return_value.available = 32 * (1024 ** 3) + result = preallocate_memory(model, bm, raise_if_oom=False) + self.assertIsNotNone(result) + + +# --------------------------------------------------------------------------- +# TestUnsupportedBenchmarkType +# --------------------------------------------------------------------------- + +class TestUnsupportedBenchmarkType(unittest.TestCase): + + def 
# ---------------------------------------------------------------------------
# TestTrainTestNeuralBenchmark
# ---------------------------------------------------------------------------

class TestTrainTestNeuralBenchmark(unittest.TestCase):
    """Estimates for benchmarks built by ``_make_train_test_benchmark``."""

    def test_num_stimuli_is_train_plus_test(self):
        """8 train + 4 test stimuli must be reported as 12 stimuli."""
        bm = _make_train_test_benchmark(n_train=8, n_test=4)
        model = _make_model()
        with patch('psutil.virtual_memory') as mock_vm:
            mock_vm.return_value.available = 32 * (1024 ** 3)
            est = preallocate_memory(model, bm, raise_if_oom=False)

        self.assertEqual(est.num_stimuli, 12)

    def test_estimate_formula_train_test(self):
        """activation_gb must equal 12 * 256 * 1 * _BYTES_PER_ELEMENT bytes, in GiB."""
        bm = _make_train_test_benchmark(n_train=8, n_test=4)
        model = _make_model(num_features=256)
        with patch('psutil.virtual_memory') as mock_vm:
            mock_vm.return_value.available = 32 * (1024 ** 3)
            est = preallocate_memory(model, bm, raise_if_oom=False)

        expected_bytes = 12 * 256 * 1 * _BYTES_PER_ELEMENT
        self.assertAlmostEqual(est.activation_gb, expected_bytes / (1024 ** 3), places=6)


# ---------------------------------------------------------------------------
# TestCalibrationIO — load_calibration / save_calibration
# ---------------------------------------------------------------------------

class TestCalibrationIO(unittest.TestCase):
    """Round-trip and failure-mode tests for the calibration JSON helpers."""

    def setUp(self):
        """Create a per-test scratch directory that is removed after the test."""
        # TemporaryDirectory + addCleanup deletes the directory even when the
        # test fails; a bare mkdtemp() would leave one directory per test run.
        tmp = tempfile.TemporaryDirectory()
        self.addCleanup(tmp.cleanup)
        self._tmpdir = tmp.name
        self._cal_path = os.path.join(self._tmpdir, 'benchmark_costs.json')

    def test_load_returns_empty_dict_when_file_missing(self):
        """A missing calibration file loads as an empty dict."""
        result = load_calibration('/nonexistent/path/benchmark_costs.json')
        self.assertEqual(result, {})

    def test_save_and_load_roundtrip(self):
        """What save_calibration writes, load_calibration reads back unchanged."""
        costs = {'MajajHong2015.IT-pls': 2.8336, 'Allen2022_fmri.V1-ridge': 0.5544}
        save_calibration(costs, self._cal_path)
        loaded = load_calibration(self._cal_path)
        self.assertEqual(loaded, costs)

    def test_save_creates_intermediate_directories(self):
        """Saving to a path with missing parent directories creates them."""
        deep_path = os.path.join(self._tmpdir, 'a', 'b', 'c', 'costs.json')
        save_calibration({'bm': 1.0}, deep_path)
        self.assertTrue(os.path.exists(deep_path))

    def test_load_handles_corrupt_file_gracefully(self):
        """A file that is not valid JSON loads as an empty dict instead of raising."""
        with open(self._cal_path, 'w') as f:
            f.write('not valid json {{{')
        result = load_calibration(self._cal_path)
        self.assertEqual(result, {})

    def test_save_writes_valid_json(self):
        """The saved file is parseable by the standard json module."""
        costs = {'foo-bar': 3.14}
        save_calibration(costs, self._cal_path)
        with open(self._cal_path) as f:
            data = json.load(f)
        self.assertAlmostEqual(data['foo-bar'], 3.14)

    def test_save_overwrites_existing_file(self):
        """A second save replaces the first; old keys do not survive."""
        save_calibration({'old': 1.0}, self._cal_path)
        save_calibration({'new': 2.0}, self._cal_path)
        loaded = load_calibration(self._cal_path)
        self.assertNotIn('old', loaded)
        self.assertAlmostEqual(loaded['new'], 2.0)


# ---------------------------------------------------------------------------
# TestCalibratedFormula — two-component formula vs ×6 fallback
# ---------------------------------------------------------------------------

class TestCalibratedFormula(unittest.TestCase):
    """Checks which total-memory formula preallocate_memory applies."""

    def setUp(self):
        """Create a per-test scratch directory that is removed after the test."""
        # See TemporaryDirectory/addCleanup: guarantees removal on failure too,
        # which a bare mkdtemp() does not.
        tmp = tempfile.TemporaryDirectory()
        self.addCleanup(tmp.cleanup)
        self._tmpdir = tmp.name
        self._cal_path = os.path.join(self._tmpdir, 'costs.json')

    def _estimate(self, bm, model, fixed_cost=None, cal_path=None):
        """Run preallocate_memory with 64 GiB reported as available.

        The default calibration path is patched to ``cal_path`` (or to
        '/nonexistent' when ``cal_path`` is falsy) so the packaged calibration
        table cannot influence the result.
        """
        with patch('psutil.virtual_memory') as mock_vm:
            mock_vm.return_value.available = 64 * (1024 ** 3)
            with patch('brainscore_vision.benchmark_helpers.memory._DEFAULT_CALIBRATION_PATH',
                       cal_path or '/nonexistent'):
                return preallocate_memory(model, bm, raise_if_oom=False,
                                          fixed_benchmark_cost_gb=fixed_cost)

    def test_explicit_fixed_cost_overrides_fallback(self):
        """An explicit fixed cost yields total = activation + fixed cost."""
        bm = _make_neural_benchmark(n_stimuli=10)
        model = _make_model(num_features=512)
        est = self._estimate(bm, model, fixed_cost=5.0)
        self.assertAlmostEqual(est.total_estimated_gb, est.activation_gb + 5.0, places=5)

    def test_fixed_cost_stored_in_estimate(self):
        """The fixed cost passed in is exposed on the returned estimate."""
        bm = _make_neural_benchmark(n_stimuli=10)
        model = _make_model(num_features=512)
        est = self._estimate(bm, model, fixed_cost=3.5)
        self.assertAlmostEqual(est.fixed_benchmark_cost_gb, 3.5)

    def test_falls_back_to_overhead_when_no_calibration(self):
        """Without a fixed cost or calibration file, total = activation * _OVERHEAD_FACTOR."""
        bm = _make_neural_benchmark(n_stimuli=10)
        model = _make_model(num_features=512)
        est = self._estimate(bm, model, fixed_cost=None, cal_path='/nonexistent')
        self.assertIsNone(est.fixed_benchmark_cost_gb)
        self.assertAlmostEqual(est.total_estimated_gb,
                               est.activation_gb * _OVERHEAD_FACTOR, places=5)

    def test_auto_loads_fixed_cost_from_calibration_json(self):
        """A benchmark listed in the calibration file gets its fixed cost from it."""
        bm = _make_neural_benchmark(n_stimuli=10)
        bm._identifier = 'MajajHong2015.IT-pls'
        model = _make_model(num_features=512)
        save_calibration({'MajajHong2015.IT-pls': 2.8336}, self._cal_path)

        with patch('psutil.virtual_memory') as mock_vm:
            mock_vm.return_value.available = 64 * (1024 ** 3)
            with patch('brainscore_vision.benchmark_helpers.memory._DEFAULT_CALIBRATION_PATH',
                       self._cal_path):
                est = preallocate_memory(model, bm, raise_if_oom=False)

        self.assertAlmostEqual(est.fixed_benchmark_cost_gb, 2.8336, places=4)
        self.assertAlmostEqual(est.total_estimated_gb,
                               est.activation_gb * _PLS_OVERHEAD_FACTOR + 2.8336, places=4)

    def test_benchmark_not_in_table_uses_fallback(self):
        """A benchmark absent from the calibration file uses the overhead fallback."""
        bm = _make_neural_benchmark(n_stimuli=10)
        bm._identifier = 'unknown-benchmark'
        model = _make_model(num_features=512)
        save_calibration({'MajajHong2015.IT-pls': 2.8336}, self._cal_path)

        with patch('psutil.virtual_memory') as mock_vm:
            mock_vm.return_value.available = 64 * (1024 ** 3)
            with patch('brainscore_vision.benchmark_helpers.memory._DEFAULT_CALIBRATION_PATH',
                       self._cal_path):
                est = preallocate_memory(model, bm, raise_if_oom=False)

        self.assertIsNone(est.fixed_benchmark_cost_gb)
        self.assertAlmostEqual(est.total_estimated_gb,
                               est.activation_gb * _OVERHEAD_FACTOR, places=5)

    def test_oom_detected_with_calibrated_formula(self):
        """raise_if_oom=True raises MemoryError when the fixed cost exceeds available RAM."""
        bm = _make_neural_benchmark(n_stimuli=10)
        model = _make_model(num_features=512)
        # fixed cost alone exceeds available RAM
        with patch('psutil.virtual_memory') as mock_vm:
            mock_vm.return_value.available = int(0.001 * (1024 ** 3))
            with self.assertRaises(MemoryError):
                preallocate_memory(model, bm, raise_if_oom=True, fixed_benchmark_cost_gb=100.0)


# ---------------------------------------------------------------------------
# TestMemoryEstimateStr — __str__ output
# ---------------------------------------------------------------------------

class TestMemoryEstimateStr(unittest.TestCase):
    """Checks the text rendered by ``str(MemoryEstimate)``."""

    def _make_estimate(self, fixed_cost=None, will_oom=False):
        """Build a MemoryEstimate with fixed field values.

        With ``will_oom`` the total (200.0) exceeds the available amount (1.0);
        otherwise the total (1.5) is well below it (100.0).
        """
        available = 1.0 if will_oom else 100.0
        total = 200.0 if will_oom else 1.5
        return MemoryEstimate(
            num_stimuli=100,
            num_trials=1,
            num_features=512,
            num_timebins=1,
            activation_gb=0.5,
            total_estimated_gb=total,
            available_gb=available,
            fixed_benchmark_cost_gb=fixed_cost,
        )

    def test_str_shows_ok_when_not_oom(self):
        """An estimate that fits in memory is tagged '[OK]'."""
        est = self._make_estimate()
        self.assertIn('[OK]', str(est))

    def test_str_shows_oom_likely_when_oom(self):
        """An estimate that exceeds available memory is tagged '[OOM LIKELY]'."""
        est = self._make_estimate(will_oom=True)
        self.assertIn('[OOM LIKELY]', str(est))

    def test_str_shows_calibrated_formula_when_fixed_cost_set(self):
        """With a fixed cost, the text names it and omits the overhead multiplier."""
        est = self._make_estimate(fixed_cost=3.5)
        s = str(est)
        self.assertIn('fixed benchmark cost', s)
        self.assertNotIn(f'×{_OVERHEAD_FACTOR}', s)

    def test_str_shows_overhead_formula_when_no_fixed_cost(self):
        """Without a fixed cost, the text shows the overhead multiplier instead."""
        est = self._make_estimate(fixed_cost=None)
        s = str(est)
        self.assertIn(f'×{_OVERHEAD_FACTOR}', s)
        self.assertNotIn('fixed benchmark cost', s)

    def test_str_contains_stimuli_and_features(self):
        """The stimulus and feature counts appear in the rendered text."""
        est = self._make_estimate()
        s = str(est)
        # NOTE(review): available_gb is 100.0 here, so '100' may match that
        # value rather than num_stimuli — confirm against __str__'s format.
        self.assertIn('100', s)  # num_stimuli
        self.assertIn('512', s)  # num_features


# ---------------------------------------------------------------------------
# TestCalibratedIntegration — full pipeline with a real JSON file
# ---------------------------------------------------------------------------

class TestCalibratedIntegration(unittest.TestCase):
    """
    End-to-end test: save a calibration JSON, then verify preallocate_memory
    picks it up automatically and produces the correct two-component estimate.
    """

    def setUp(self):
        """Create a per-test scratch directory that is removed after the test."""
        # addCleanup runs even if the test body raises, so the directory
        # never outlives the test (unlike a bare mkdtemp()).
        tmp = tempfile.TemporaryDirectory()
        self.addCleanup(tmp.cleanup)
        self._tmpdir = tmp.name
        self._cal_path = os.path.join(self._tmpdir, 'costs.json')

    def test_full_roundtrip_calibrated_estimate(self):
        """A saved fixed cost is picked up and added to the activation estimate."""
        n_stimuli = 20
        n_features = 256
        fixed_cost = 4.35

        bm = _make_neural_benchmark(n_stimuli=n_stimuli)
        bm._identifier = 'integration-test-benchmark'
        model = _make_model(num_features=n_features)

        save_calibration({'integration-test-benchmark': fixed_cost}, self._cal_path)

        with patch('psutil.virtual_memory') as mock_vm:
            mock_vm.return_value.available = 64 * (1024 ** 3)
            with patch('brainscore_vision.benchmark_helpers.memory._DEFAULT_CALIBRATION_PATH',
                       self._cal_path):
                est = preallocate_memory(model, bm, raise_if_oom=False)

        expected_activation = n_stimuli * n_features * 1 * _BYTES_PER_ELEMENT / (1024 ** 3)
        self.assertAlmostEqual(est.activation_gb, expected_activation, places=6)
        self.assertAlmostEqual(est.fixed_benchmark_cost_gb, fixed_cost, places=4)
        self.assertAlmostEqual(est.total_estimated_gb, expected_activation + fixed_cost, places=4)
        self.assertFalse(est.will_oom)

    def test_score_benchmark_uses_preallocate_memory(self):
        """score_benchmark must call preallocate_memory before __call__."""
        bm = _make_neural_benchmark(n_stimuli=5)
        model = _make_model(num_features=10)
        score_val = MagicMock()

        # Records the order in which the two patched methods are invoked.
        call_order = []

        def _fake_preallocate(self, candidate):
            call_order.append('preallocate')

        def _fake_call(self, candidate):
            call_order.append('score')
            return score_val

        with patch.object(NeuralBenchmark, 'preallocate_memory', _fake_preallocate):
            with patch.object(NeuralBenchmark, '__call__', _fake_call):
                score_benchmark(bm, model)

        self.assertEqual(call_order, ['preallocate', 'score'])


if __name__ == '__main__':
    unittest.main()