From 89c9ba929d6abd727091abda2a265716642edde6 Mon Sep 17 00:00:00 2001 From: Seyed Yahya Shirazi Date: Wed, 13 May 2026 06:49:22 -0700 Subject: [PATCH] phase2: multi-task AMICA concat (issue #33) ThePresent alone gives k = samples/n_chans^2 = 1.1 for 128-channel ICA; standard ICA wants k >= 20. Under-determined AMICA produces components that collapse to single electrodes (visual inspection on PR #31's 3 subjects confirms). Reference pipeline mitigates by concatenating multiple HBN tasks for ICA training while keeping ThePresent as the analysis target. Adds IcaTasks opt to phase2_amica (default: the four passive-viewing movie tasks DespicableMe, DiaryOfAWimpyKid, FunwithFractals, ThePresent). When |IcaTasks| > 1: ensure Phase 1 .set exists for each task (regenerate via phase1_preprocess if missing), load all, restrict to common channel intersection, pop_mergeset, train AMICA on merged data, transplant weights onto the Task=ThePresent EEG via hbn.apply_ica_weights, then dipfit + figures + checkpoint as before. src/matlab/phase2_amica.m - IcaTasks (1,:) string default 4 movie tasks. Set to [Task] for the legacy single-task path (smoke test uses this). - BidsRoot opt added so prepare_multitask_amica_input can regenerate missing Phase 1 .set files. - qaRow now carries ica_tasks, ica_samples, k_factor, common_channels. src/matlab/+hbn/prepare_multitask_amica_input.m (new) - For each task in IcaTasks: load Phase 1 .set or regenerate via phase1_preprocess. Intersect channel labels across tasks (>= 32 channels required). pop_select to intersection. pop_mergeset. Returns merged EEG + info struct. src/matlab/+hbn/apply_ica_weights.m (new) - Copies icaweights/icasphere/icachansind/icawinv from sourceEEG to targetEEG, asserting channel-count match. After multi-task AMICA the target ThePresent EEG must already be on the same channel intersection (pop_select before the call). src/matlab/+hbn/write_qa_amica_csv.m - Schema gains ica_tasks, ica_samples, k_factor, common_channels. Insertion-order placement keeps participant_id, status, ica_method as the leftmost columns for readability. tests/matlab/test_phase2_smoke.m - Pass IcaTasks="ThePresent" so the fixture-based smoke test stays on the single-task path (the fixture only has ThePresent). - requiredFields includes IcaTasks. scripts/run_phase2_three_subjects_multitask.m (new) - Production runner. Regenerates Phase 1 ThePresent .set if missing. Per-task .sets for the other 3 movie tasks are produced lazily by prepare_multitask_amica_input. Refs #1, refs #33. --- scripts/run_phase2_three_subjects_multitask.m | 58 ++++++++++ src/matlab/+hbn/apply_ica_weights.m | 37 ++++++ .../+hbn/prepare_multitask_amica_input.m | 106 ++++++++++++++++++ src/matlab/+hbn/write_qa_amica_csv.m | 7 +- src/matlab/phase2_amica.m | 55 ++++++++- tests/matlab/test_phase2_smoke.m | 5 +- 6 files changed, 262 insertions(+), 6 deletions(-) create mode 100644 scripts/run_phase2_three_subjects_multitask.m create mode 100644 src/matlab/+hbn/apply_ica_weights.m create mode 100644 src/matlab/+hbn/prepare_multitask_amica_input.m diff --git a/scripts/run_phase2_three_subjects_multitask.m b/scripts/run_phase2_three_subjects_multitask.m new file mode 100644 index 0000000..378d6d1 --- /dev/null +++ b/scripts/run_phase2_three_subjects_multitask.m @@ -0,0 +1,58 @@ +function run_phase2_three_subjects_multitask +%RUN_PHASE2_THREE_SUBJECTS_MULTITASK Phase 2 with multi-task AMICA pool. +% Phase 2 production run on 3 subjects using the 4-movie task pool for +% ICA training (DespicableMe, DiaryOfAWimpyKid, FunwithFractals, +% ThePresent). Each task is regenerated through Phase 1 cleaning if +% the cleaned .set is missing. AMICA fits the merged cleaned data; the +% resulting weights are transplanted onto the ThePresent-only Phase 1 +% .set before dipfit + figures + checkpoint. +% +% Invoked from the worktree root: +% matlab -batch "addpath('scripts'); run_phase2_three_subjects_multitask()" + + eeglab_path = '/Users/yahya/Documents/git/eeg/eeglab'; + bids_root = '/Volumes/S1/Datasets/HBN/L100/R3_L100_bdf'; + + script_dir = fileparts(mfilename('fullpath')); + repo_root = fileparts(script_dir); + cd(repo_root); + + addpath(eeglab_path); + evalc('eeglab nogui'); + addpath(genpath(fullfile(repo_root, 'src', 'matlab'))); + + fprintf('[phase2-mt] cwd: %s\n', pwd); + fprintf('[phase2-mt] starting at %s\n', datestr(now, 'yyyy-mm-dd HH:MM:SS')); %#ok + + preprocDir = fullfile(repo_root, 'derivatives', 'preproc'); + amicaDir = fullfile(repo_root, 'derivatives', 'amica'); + + icaTasks = ["DespicableMe", "DiaryOfAWimpyKid", "FunwithFractals", "ThePresent"]; + fprintf('[phase2-mt] IcaTasks: %s\n', strjoin(icaTasks, ', ')); + + % Ensure phase 1 .set for the target task (ThePresent) on all 3 subjects. + % Per-task .sets for the other 3 movies are produced lazily by + % hbn.prepare_multitask_amica_input as needed. + targetSets = dir(fullfile(preprocDir, 'sub-*', 'eeg', '*_task-ThePresent_desc-clean_eeg.set')); + if numel(targetSets) < 3 + fprintf('[phase2-mt] regenerating Phase 1 ThePresent on 3 subjects ...\n'); + phase1_preprocess( ... + 'BidsRoot', bids_root, ... + 'OutDir', preprocDir, ... + 'Task', 'ThePresent', ... + 'SmokeSubjectCount', 3); + else + fprintf('[phase2-mt] found %d ThePresent .set; using existing\n', numel(targetSets)); + end + + fprintf('[phase2-mt] phase2_amica multi-task starting at %s\n', ... + datestr(now, 'yyyy-mm-dd HH:MM:SS')); %#ok + phase2_amica( ... + 'PreprocDir', preprocDir, ... + 'OutDir', amicaDir, ... + 'BidsRoot', bids_root, ... + 'IcaTasks', icaTasks); + + fprintf('[phase2-mt] finished at %s\n', ... + datestr(now, 'yyyy-mm-dd HH:MM:SS')); %#ok +end diff --git a/src/matlab/+hbn/apply_ica_weights.m b/src/matlab/+hbn/apply_ica_weights.m new file mode 100644 index 0000000..9acf187 --- /dev/null +++ b/src/matlab/+hbn/apply_ica_weights.m @@ -0,0 +1,37 @@ +function targetEEG = apply_ica_weights(sourceEEG, targetEEG) +%APPLY_ICA_WEIGHTS Copy ICA weights from sourceEEG onto targetEEG. +% +% targetEEG = hbn.apply_ica_weights(sourceEEG, targetEEG) installs the +% AMICA weights/sphere from sourceEEG into targetEEG (which is typically +% the ThePresent-only Phase 1 .set restricted to the same channel set). +% Used after multi-task AMICA training to attach the multi-task-learned +% weights to the single-task analysis EEG. +% +% Assumes channel set match: targetEEG must already be on the same +% channel intersection as sourceEEG (use pop_select beforehand). + + arguments + sourceEEG (1, 1) struct + targetEEG (1, 1) struct + end + + nChansTarget = size(targetEEG.data, 1); + nChansSource = size(sourceEEG.icaweights, 2); + if nChansTarget ~= nChansSource + error("hbn:phase2:apply_weights:chan_mismatch", ... + "target has %d channels, source AMICA was trained on %d; " + ... + "pop_select target to the same intersection first", ... + nChansTarget, nChansSource); + end + if isempty(sourceEEG.icaweights) || isempty(sourceEEG.icasphere) + error("hbn:phase2:apply_weights:no_weights", ... + "sourceEEG has no AMICA weights to copy"); + end + + targetEEG.icaweights = sourceEEG.icaweights; + targetEEG.icasphere = sourceEEG.icasphere; + targetEEG.icachansind = sourceEEG.icachansind; + targetEEG.icawinv = sourceEEG.icawinv; + targetEEG.icaact = []; + targetEEG = eeg_checkset(targetEEG, 'ica'); +end diff --git a/src/matlab/+hbn/prepare_multitask_amica_input.m b/src/matlab/+hbn/prepare_multitask_amica_input.m new file mode 100644 index 0000000..a85f549 --- /dev/null +++ b/src/matlab/+hbn/prepare_multitask_amica_input.m @@ -0,0 +1,106 @@ +function [mergedEEG, info] = prepare_multitask_amica_input(opts, subjId) +%PREPARE_MULTITASK_AMICA_INPUT Concatenate cleaned per-task .set files for a subject. +% +% [mergedEEG, info] = hbn.prepare_multitask_amica_input(opts, subjId) +% ensures Phase 1 `.set` checkpoints exist for each task in +% opts.IcaTasks (re-runs phase1_preprocess for missing ones), loads +% them, restricts every dataset to the channel intersection, and +% concatenates via pop_mergeset. +% +% This is the data-pool expansion described in the project brief: +% ICA training uses multi-task data so the sample-per-weight ratio +% (k = nSamples / nChans^2) lifts above the under-determined regime. +% Phase 4 onwards still analyzes ThePresent only; only the AMICA +% weights are estimated on the merged pool. +% +% info struct (per-subject diagnostics for qa_amica.csv): +% info.ica_tasks string array, tasks actually used +% info.ica_samples total samples in merged dataset +% info.k_factor ica_samples / nChans^2 +% info.common_channels number of channels surviving intersection + + arguments + opts (1, 1) struct + subjId (1, 1) string + end + + if numel(opts.IcaTasks) == 0 + error("hbn:phase2:multitask:no_tasks", ... + "IcaTasks is empty; need at least one task to load"); + end + + preprocDir = opts.PreprocDir; + + taskEEGs = cell(0, 1); + usedTasks = string.empty; + for t = opts.IcaTasks(:)' + setName = sprintf("%s_task-%s_desc-clean_eeg.set", subjId, t); + setPath = fullfile(preprocDir, subjId, "eeg", setName); + + if ~isfile(setPath) + if isfield(opts, 'BidsRoot') && opts.BidsRoot ~= "" + fprintf("[phase2-multitask] regenerating phase 1 for %s task=%s\n", subjId, t); + try + phase1_preprocess( ... + 'BidsRoot', opts.BidsRoot, ... + 'OutDir', preprocDir, ... + 'Task', t, ... + 'Subjects', subjId); + catch ME + warning("hbn:phase2:multitask:phase1_failed", ... + "phase1 regen failed for %s task=%s: %s; skipping task", ... + subjId, t, ME.message); + continue; + end + end + end + + if ~isfile(setPath) + warning("hbn:phase2:multitask:missing_task", ... + "no Phase 1 .set for %s task=%s; skipping task", subjId, t); + continue; + end + + EEG = pop_loadset('filename', char(setName), ... + 'filepath', char(fullfile(preprocDir, subjId, "eeg"))); + taskEEGs{end + 1, 1} = EEG; %#ok + usedTasks(end + 1) = t; %#ok + end + + if isempty(taskEEGs) + error("hbn:phase2:multitask:no_inputs", ... + "No Phase 1 .set files loaded for %s across IcaTasks=[%s]", ... + subjId, strjoin(opts.IcaTasks, ", ")); + end + + common = string({taskEEGs{1}.chanlocs.labels}); + for k = 2:numel(taskEEGs) + common = intersect(common, string({taskEEGs{k}.chanlocs.labels}), 'stable'); + end + if numel(common) < 32 + error("hbn:phase2:multitask:few_common_channels", ... + "Only %d channels common across %d tasks for %s; cannot fit AMICA", ... + numel(common), numel(taskEEGs), subjId); + end + + for k = 1:numel(taskEEGs) + taskEEGs{k} = pop_select(taskEEGs{k}, 'channel', cellstr(common)); + end + + if numel(taskEEGs) == 1 + mergedEEG = taskEEGs{1}; + else + mergedEEG = pop_mergeset([taskEEGs{:}], 1:numel(taskEEGs)); + end + + nSamples = size(mergedEEG.data, 2); + nChans = size(mergedEEG.data, 1); + info = struct( ... + 'ica_tasks', strjoin(usedTasks, "+"), ... + 'ica_samples', nSamples, ... + 'k_factor', nSamples / (nChans^2), ... + 'common_channels', nChans); + + fprintf("[phase2-multitask] %s: merged %d tasks (%s), %d samples, %d common chans, k=%.2f\n", ... + subjId, numel(taskEEGs), strjoin(usedTasks, "+"), nSamples, nChans, info.k_factor); +end diff --git a/src/matlab/+hbn/write_qa_amica_csv.m b/src/matlab/+hbn/write_qa_amica_csv.m index ce04188..51baa55 100644 --- a/src/matlab/+hbn/write_qa_amica_csv.m +++ b/src/matlab/+hbn/write_qa_amica_csv.m @@ -17,7 +17,8 @@ if ~isfolder(outDir); mkdir(outDir); end csvPath = fullfile(outDir, "qa_amica.csv"); - header = ["participant_id", "status", "ica_method", "n_components", "final_ll", ... + header = ["participant_id", "status", "ica_method", "ica_tasks", ... + "ica_samples", "k_factor", "common_channels", "n_components", "final_ll", ... "iterations", "reached_max_iter", "median_rv", "num_models", ... "max_threads", "duration_s", "error_message"]; needsHeader = ~isfile(csvPath); @@ -36,6 +37,10 @@ csv_escape(string(field_or_default(row, "participant_id", ""))), ... csv_escape(string(field_or_default(row, "status", "unknown"))), ... csv_escape(string(field_or_default(row, "ica_method", "amica"))), ... + csv_escape(string(field_or_default(row, "ica_tasks", ""))), ... + numeric_cell(field_or_default(row, "ica_samples", NaN)), ... + numeric_cell(field_or_default(row, "k_factor", NaN)), ... + numeric_cell(field_or_default(row, "common_channels", NaN)), ... numeric_cell(field_or_default(row, "n_components", NaN)), ... numeric_cell(field_or_default(row, "final_ll", NaN)), ... numeric_cell(field_or_default(row, "iterations", NaN)), ... diff --git a/src/matlab/phase2_amica.m b/src/matlab/phase2_amica.m index 4f4840a..8bb47c4 100644 --- a/src/matlab/phase2_amica.m +++ b/src/matlab/phase2_amica.m @@ -42,6 +42,13 @@ % RejSig (1,1) double default 4 % NumTopoICs (1,1) double default 30 % IcaMethod (1,1) string default "amica" ("amica" or "runica") +% IcaTasks (1,:) string default ["DespicableMe","DiaryOfAWimpyKid", +% "FunwithFractals","ThePresent"] +% BidsRoot (1,1) string default "" (only needed if +% IcaTasks contain +% tasks whose Phase 1 +% checkpoint is missing +% and must be regenerated) % EeglabRoot (1,1) string default "" (probed if empty) % % IcaMethod="amica" is the production path (matches the brief). The @@ -49,6 +56,14 @@ % binary needs MPI runtime libs that Ubuntu runners do not have. Both % paths populate EEG.icaweights / icasphere / icawinv, so downstream % dipfit + figure code does not branch on the method. +% +% IcaTasks controls the data pool used to train AMICA. ThePresent alone +% is ~210 s at 100 Hz, giving k=samples/n_chans^2 of ~1.1, well below +% the standard k>=20 threshold for stable ICA. The default concatenates +% the four passive-viewing movie tasks (k ~ 3.5), matching the data-pool +% expansion pattern in the reference pipeline. The downstream analysis +% stays on Task=ThePresent only; the multi-task pool is just the ICA +% training set. Set IcaTasks=[Task] to keep the legacy single-task path. arguments opts.PreprocDir (1, 1) string = "derivatives/preproc" @@ -63,6 +78,8 @@ opts.RejSig (1, 1) double {mustBePositive} = 4 opts.NumTopoICs (1, 1) double {mustBePositive, mustBeInteger} = 30 opts.IcaMethod (1, 1) string {mustBeMember(opts.IcaMethod, ["amica", "runica"])} = "amica" + opts.IcaTasks (1, :) string = ["DespicableMe", "DiaryOfAWimpyKid", "FunwithFractals", "ThePresent"] + opts.BidsRoot (1, 1) string = "" opts.EeglabRoot (1, 1) string = "" end @@ -126,6 +143,10 @@ 'participant_id', subjId, ... 'status', "failed_" + stage, ... 'ica_method', opts.IcaMethod, ... + 'ica_tasks', strjoin(opts.IcaTasks, "+"), ... + 'ica_samples', NaN, ... + 'k_factor', NaN, ... + 'common_channels', NaN, ... 'n_components', NaN, ... 'final_ll', NaN, ... 'iterations', NaN, ... @@ -146,7 +167,7 @@ 'eeglab_root', char(eeglab_root))); end -function qaRow = process_one_subject(EEG, subjId, head_model, opts) +function qaRow = process_one_subject(targetEEG, subjId, head_model, opts) out_eeg_dir = fullfile(opts.OutDir, subjId, "eeg"); out_amica_dir = fullfile(opts.OutDir, subjId, "_amica"); out_fig_dir = fullfile(opts.OutDir, subjId, "figures"); @@ -154,16 +175,40 @@ if ~isfolder(d); mkdir(d); end end + % Build the AMICA training set. Single-task mode (IcaTasks == [Task]) + % keeps the legacy single-source decomposition. Multi-task mode + % (default) concatenates the cleaned per-task .set files and trains + % AMICA on the merged data, then transplants weights back onto the + % Task-only EEG for downstream analysis. + if isscalar(opts.IcaTasks) && opts.IcaTasks(1) == opts.Task + trainEEG = targetEEG; + mtInfo = struct( ... + 'ica_tasks', opts.Task, ... + 'ica_samples', size(targetEEG.data, 2), ... + 'k_factor', size(targetEEG.data, 2) / (size(targetEEG.data, 1)^2), ... + 'common_channels', size(targetEEG.data, 1)); + else + [trainEEG, mtInfo] = hbn.prepare_multitask_amica_input(opts, subjId); + common = string({trainEEG.chanlocs.labels}); + targetEEG = pop_select(targetEEG, 'channel', cellstr(common)); + end + switch lower(opts.IcaMethod) case "amica" - [EEG, amicaStats] = hbn.run_amica(EEG, out_amica_dir, opts); + [trainEEG, amicaStats] = hbn.run_amica(trainEEG, out_amica_dir, opts); case "runica" - [EEG, amicaStats] = hbn.run_infomax(EEG, out_amica_dir, opts); + [trainEEG, amicaStats] = hbn.run_infomax(trainEEG, out_amica_dir, opts); otherwise error("hbn:phase2:unknown_ica_method", ... "IcaMethod=%s not supported", opts.IcaMethod); end + if isscalar(opts.IcaTasks) && opts.IcaTasks(1) == opts.Task + EEG = trainEEG; + else + EEG = hbn.apply_ica_weights(trainEEG, targetEEG); + end + [EEG, dipoleStats] = hbn.fit_dipoles(EEG, head_model); hbn.save_ic_topo_figure(EEG, out_fig_dir, subjId, opts.NumTopoICs); @@ -174,6 +219,10 @@ qaRow = struct( ... 'ica_method', opts.IcaMethod, ... + 'ica_tasks', mtInfo.ica_tasks, ... + 'ica_samples', mtInfo.ica_samples, ... + 'k_factor', mtInfo.k_factor, ... + 'common_channels', mtInfo.common_channels, ... 'n_components', EEG.nbchan, ... 'final_ll', amicaStats.final_ll, ... 'iterations', amicaStats.iterations, ... diff --git a/tests/matlab/test_phase2_smoke.m b/tests/matlab/test_phase2_smoke.m index b5f1907..b2d3848 100644 --- a/tests/matlab/test_phase2_smoke.m +++ b/tests/matlab/test_phase2_smoke.m @@ -39,7 +39,8 @@ OutDir = amicaDir, ... MaxIter = 100, ... DoReject = false, ... - IcaMethod = "runica"); + IcaMethod = "runica", ... + IcaTasks = "ThePresent"); assert(isfile(paramsPath), "params.json missing at %s", paramsPath); qaPath = fullfile(amicaDir, "qa_amica.csv"); @@ -70,7 +71,7 @@ params = jsondecode(fileread(paramsPath)); requiredFields = ["NumModels", "MaxIter", "MaxThreads", "DoReject", ... - "NumRej", "RejSig", "PreprocDir", "OutDir", "Task", "IcaMethod", ... + "NumRej", "RejSig", "PreprocDir", "OutDir", "Task", "IcaMethod", "IcaTasks", ... "matlab_version", "eeglab_version", "git_sha", "timestamp_iso", ... "n_subjects_input", "n_subjects_ok", "n_subjects_failed", "eeglab_root"]; missingFields = setdiff(requiredFields, string(fieldnames(params)));