From 89c9ba929d6abd727091abda2a265716642edde6 Mon Sep 17 00:00:00 2001
From: Seyed Yahya Shirazi <shirazi@ieee.org>
Date: Wed, 13 May 2026 06:49:22 -0700
Subject: [PATCH] phase2: multi-task AMICA concat (issue #33)

ThePresent alone gives k = samples/n_chans^2 = 1.1 for 128-channel ICA;
standard ICA wants k >= 20. Under-determined AMICA produces components
that collapse to single electrodes (visual inspection on PR #31's 3
subjects confirms). Reference pipeline mitigates by concatenating
multiple HBN tasks for ICA training while keeping ThePresent as the
analysis target.

Adds IcaTasks opt to phase2_amica (default: the four passive-viewing
movie tasks DespicableMe, DiaryOfAWimpyKid, FunwithFractals,
ThePresent). When |IcaTasks| > 1: ensure Phase 1 .set exists for each
task (regenerate via phase1_preprocess if missing), load all, restrict
to common channel intersection, pop_mergeset, train AMICA on merged
data, transplant weights onto the Task=ThePresent EEG via
hbn.apply_ica_weights, then dipfit + figures + checkpoint as before.

src/matlab/phase2_amica.m
- IcaTasks (1,:) string default 4 movie tasks. Set to [Task] for the
  legacy single-task path (smoke test uses this).
- BidsRoot opt added so prepare_multitask_amica_input can regenerate
  missing Phase 1 .set files.
- qaRow now carries ica_tasks, ica_samples, k_factor, common_channels.

src/matlab/+hbn/prepare_multitask_amica_input.m (new)
- For each task in IcaTasks: load Phase 1 .set or regenerate via
  phase1_preprocess. Intersect channel labels across tasks
  (>= 32 channels required). pop_select to intersection. pop_mergeset.
  Returns merged EEG + info struct.

src/matlab/+hbn/apply_ica_weights.m (new)
- Copies icaweights/icasphere/icachansind/icawinv from sourceEEG to
  targetEEG, asserting channel-count match. After multi-task AMICA
  the target ThePresent EEG must already be on the same channel
  intersection (pop_select before the call).

src/matlab/+hbn/write_qa_amica_csv.m
- Schema gains ica_tasks, ica_samples, k_factor, common_channels.
  Insertion-order placement keeps participant_id, status, ica_method
  as the leftmost columns for readability.

tests/matlab/test_phase2_smoke.m
- Pass IcaTasks="ThePresent" so the fixture-based smoke test stays
  on the single-task path (the fixture only has ThePresent).
- requiredFields includes IcaTasks.

scripts/run_phase2_three_subjects_multitask.m (new)
- Production runner. Regenerates Phase 1 ThePresent .set if missing.
  Per-task .sets for the other 3 movie tasks are produced lazily by
  prepare_multitask_amica_input.

Refs #1, refs #33.
---
 scripts/run_phase2_three_subjects_multitask.m |  58 ++++++++++
 src/matlab/+hbn/apply_ica_weights.m           |  37 ++++++
 .../+hbn/prepare_multitask_amica_input.m      | 106 ++++++++++++++++++
 src/matlab/+hbn/write_qa_amica_csv.m          |   7 +-
 src/matlab/phase2_amica.m                     |  55 ++++++++-
 tests/matlab/test_phase2_smoke.m              |   5 +-
 6 files changed, 262 insertions(+), 6 deletions(-)
 create mode 100644 scripts/run_phase2_three_subjects_multitask.m
 create mode 100644 src/matlab/+hbn/apply_ica_weights.m
 create mode 100644 src/matlab/+hbn/prepare_multitask_amica_input.m

diff --git a/scripts/run_phase2_three_subjects_multitask.m b/scripts/run_phase2_three_subjects_multitask.m
new file mode 100644
index 0000000..378d6d1
--- /dev/null
+++ b/scripts/run_phase2_three_subjects_multitask.m
@@ -0,0 +1,58 @@
+function run_phase2_three_subjects_multitask
+%RUN_PHASE2_THREE_SUBJECTS_MULTITASK Phase 2 with multi-task AMICA pool.
+%   Phase 2 production run on 3 subjects using the 4-movie task pool for
+%   ICA training (DespicableMe, DiaryOfAWimpyKid, FunwithFractals,
+%   ThePresent). Each task is regenerated through Phase 1 cleaning if
+%   the cleaned .set is missing. AMICA fits the merged cleaned data; the
+%   resulting weights are transplanted onto the ThePresent-only Phase 1
+%   .set before dipfit + figures + checkpoint.
+%
+%   Invoked from the worktree root:
+%     matlab -batch "addpath('scripts'); run_phase2_three_subjects_multitask()"
+
+    eeglab_path = '/Users/yahya/Documents/git/eeg/eeglab';
+    bids_root   = '/Volumes/S1/Datasets/HBN/L100/R3_L100_bdf';
+
+    script_dir = fileparts(mfilename('fullpath'));
+    repo_root  = fileparts(script_dir);
+    cd(repo_root);
+
+    addpath(eeglab_path);
+    evalc('eeglab nogui');
+    addpath(genpath(fullfile(repo_root, 'src', 'matlab')));
+
+    fprintf('[phase2-mt] cwd: %s\n', pwd);
+    fprintf('[phase2-mt] starting at %s\n', datestr(now, 'yyyy-mm-dd HH:MM:SS')); %#ok<DATST,TNOW1>
+
+    preprocDir = fullfile(repo_root, 'derivatives', 'preproc');
+    amicaDir   = fullfile(repo_root, 'derivatives', 'amica');
+
+    icaTasks = ["DespicableMe", "DiaryOfAWimpyKid", "FunwithFractals", "ThePresent"];
+    fprintf('[phase2-mt] IcaTasks: %s\n', strjoin(icaTasks, ', '));
+
+    % Ensure phase 1 .set for the target task (ThePresent) on all 3 subjects.
+    % Per-task .sets for the other 3 movies are produced lazily by
+    % hbn.prepare_multitask_amica_input as needed.
+    targetSets = dir(fullfile(preprocDir, 'sub-*', 'eeg', '*_task-ThePresent_desc-clean_eeg.set'));
+    if numel(targetSets) < 3
+        fprintf('[phase2-mt] regenerating Phase 1 ThePresent on 3 subjects ...\n');
+        phase1_preprocess( ...
+            'BidsRoot', bids_root, ...
+            'OutDir', preprocDir, ...
+            'Task', 'ThePresent', ...
+            'SmokeSubjectCount', 3);
+    else
+        fprintf('[phase2-mt] found %d ThePresent .set; using existing\n', numel(targetSets));
+    end
+
+    fprintf('[phase2-mt] phase2_amica multi-task starting at %s\n', ...
+        datestr(now, 'yyyy-mm-dd HH:MM:SS')); %#ok<DATST,TNOW1>
+    phase2_amica( ...
+        'PreprocDir', preprocDir, ...
+        'OutDir', amicaDir, ...
+        'BidsRoot', bids_root, ...
+        'IcaTasks', icaTasks);
+
+    fprintf('[phase2-mt] finished at %s\n', ...
+        datestr(now, 'yyyy-mm-dd HH:MM:SS')); %#ok<DATST,TNOW1>
+end
diff --git a/src/matlab/+hbn/apply_ica_weights.m b/src/matlab/+hbn/apply_ica_weights.m
new file mode 100644
index 0000000..9acf187
--- /dev/null
+++ b/src/matlab/+hbn/apply_ica_weights.m
@@ -0,0 +1,37 @@
+function targetEEG = apply_ica_weights(sourceEEG, targetEEG)
+%APPLY_ICA_WEIGHTS Copy ICA weights from sourceEEG onto targetEEG.
+%
+%   targetEEG = hbn.apply_ica_weights(sourceEEG, targetEEG) installs the
+%   AMICA weights/sphere from sourceEEG into targetEEG (which is typically
+%   the ThePresent-only Phase 1 .set restricted to the same channel set).
+%   Used after multi-task AMICA training to attach the multi-task-learned
+%   weights to the single-task analysis EEG.
+%
+%   Assumes channel set match: targetEEG must already be on the same
+%   channel intersection as sourceEEG (use pop_select beforehand).
+
+    arguments
+        sourceEEG (1, 1) struct
+        targetEEG (1, 1) struct
+    end
+
+    nChansTarget = size(targetEEG.data, 1);
+    nChansSource = size(sourceEEG.icaweights, 2);
+    if nChansTarget ~= nChansSource
+        error("hbn:phase2:apply_weights:chan_mismatch", ...
+            "target has %d channels, source AMICA was trained on %d; " + ...
+            "pop_select target to the same intersection first", ...
+            nChansTarget, nChansSource);
+    end
+    if isempty(sourceEEG.icaweights) || isempty(sourceEEG.icasphere)
+        error("hbn:phase2:apply_weights:no_weights", ...
+            "sourceEEG has no AMICA weights to copy");
+    end
+
+    targetEEG.icaweights = sourceEEG.icaweights;
+    targetEEG.icasphere = sourceEEG.icasphere;
+    targetEEG.icachansind = sourceEEG.icachansind;
+    targetEEG.icawinv = sourceEEG.icawinv;
+    targetEEG.icaact = [];
+    targetEEG = eeg_checkset(targetEEG, 'ica');
+end
diff --git a/src/matlab/+hbn/prepare_multitask_amica_input.m b/src/matlab/+hbn/prepare_multitask_amica_input.m
new file mode 100644
index 0000000..a85f549
--- /dev/null
+++ b/src/matlab/+hbn/prepare_multitask_amica_input.m
@@ -0,0 +1,106 @@
+function [mergedEEG, info] = prepare_multitask_amica_input(opts, subjId)
+%PREPARE_MULTITASK_AMICA_INPUT Concatenate cleaned per-task .set files for a subject.
+%
+%   [mergedEEG, info] = hbn.prepare_multitask_amica_input(opts, subjId)
+%   ensures Phase 1 `.set` checkpoints exist for each task in
+%   opts.IcaTasks (re-runs phase1_preprocess for missing ones), loads
+%   them, restricts every dataset to the channel intersection, and
+%   concatenates via pop_mergeset.
+%
+%   This is the data-pool expansion described in the project brief:
+%   ICA training uses multi-task data so the sample-per-weight ratio
+%   (k = nSamples / nChans^2) lifts above the under-determined regime.
+%   Phase 4 onwards still analyzes ThePresent only; only the AMICA
+%   weights are estimated on the merged pool.
+%
+%   info struct (per-subject diagnostics for qa_amica.csv):
+%     info.ica_tasks       string array, tasks actually used
+%     info.ica_samples     total samples in merged dataset
+%     info.k_factor        ica_samples / nChans^2
+%     info.common_channels number of channels surviving intersection
+
+    arguments
+        opts (1, 1) struct
+        subjId (1, 1) string
+    end
+
+    if numel(opts.IcaTasks) == 0
+        error("hbn:phase2:multitask:no_tasks", ...
+            "IcaTasks is empty; need at least one task to load");
+    end
+
+    preprocDir = opts.PreprocDir;
+
+    taskEEGs = cell(0, 1);
+    usedTasks = string.empty;
+    for t = opts.IcaTasks(:)'
+        setName = sprintf("%s_task-%s_desc-clean_eeg.set", subjId, t);
+        setPath = fullfile(preprocDir, subjId, "eeg", setName);
+
+        if ~isfile(setPath)
+            if isfield(opts, 'BidsRoot') && opts.BidsRoot ~= ""
+                fprintf("[phase2-multitask] regenerating phase 1 for %s task=%s\n", subjId, t);
+                try
+                    phase1_preprocess( ...
+                        'BidsRoot', opts.BidsRoot, ...
+                        'OutDir', preprocDir, ...
+                        'Task', t, ...
+                        'Subjects', subjId);
+                catch ME
+                    warning("hbn:phase2:multitask:phase1_failed", ...
+                        "phase1 regen failed for %s task=%s: %s; skipping task", ...
+                        subjId, t, ME.message);
+                    continue;
+                end
+            end
+        end
+
+        if ~isfile(setPath)
+            warning("hbn:phase2:multitask:missing_task", ...
+                "no Phase 1 .set for %s task=%s; skipping task", subjId, t);
+            continue;
+        end
+
+        EEG = pop_loadset('filename', char(setName), ...
+            'filepath', char(fullfile(preprocDir, subjId, "eeg")));
+        taskEEGs{end + 1, 1} = EEG; %#ok<AGROW>
+        usedTasks(end + 1) = t; %#ok<AGROW>
+    end
+
+    if isempty(taskEEGs)
+        error("hbn:phase2:multitask:no_inputs", ...
+            "No Phase 1 .set files loaded for %s across IcaTasks=[%s]", ...
+            subjId, strjoin(opts.IcaTasks, ", "));
+    end
+
+    common = string({taskEEGs{1}.chanlocs.labels});
+    for k = 2:numel(taskEEGs)
+        common = intersect(common, string({taskEEGs{k}.chanlocs.labels}), 'stable');
+    end
+    if numel(common) < 32
+        error("hbn:phase2:multitask:few_common_channels", ...
+            "Only %d channels common across %d tasks for %s; cannot fit AMICA", ...
+            numel(common), numel(taskEEGs), subjId);
+    end
+
+    for k = 1:numel(taskEEGs)
+        taskEEGs{k} = pop_select(taskEEGs{k}, 'channel', cellstr(common));
+    end
+
+    if numel(taskEEGs) == 1
+        mergedEEG = taskEEGs{1};
+    else
+        mergedEEG = pop_mergeset([taskEEGs{:}], 1:numel(taskEEGs));
+    end
+
+    nSamples = size(mergedEEG.data, 2);
+    nChans = size(mergedEEG.data, 1);
+    info = struct( ...
+        'ica_tasks', strjoin(usedTasks, "+"), ...
+        'ica_samples', nSamples, ...
+        'k_factor', nSamples / (nChans^2), ...
+        'common_channels', nChans);
+
+    fprintf("[phase2-multitask] %s: merged %d tasks (%s), %d samples, %d common chans, k=%.2f\n", ...
+        subjId, numel(taskEEGs), strjoin(usedTasks, "+"), nSamples, nChans, info.k_factor);
+end
diff --git a/src/matlab/+hbn/write_qa_amica_csv.m b/src/matlab/+hbn/write_qa_amica_csv.m
index ce04188..51baa55 100644
--- a/src/matlab/+hbn/write_qa_amica_csv.m
+++ b/src/matlab/+hbn/write_qa_amica_csv.m
@@ -17,7 +17,8 @@
     if ~isfolder(outDir); mkdir(outDir); end
     csvPath = fullfile(outDir, "qa_amica.csv");
 
-    header = ["participant_id", "status", "ica_method", "n_components", "final_ll", ...
+    header = ["participant_id", "status", "ica_method", "ica_tasks", ...
+        "ica_samples", "k_factor", "common_channels", "n_components", "final_ll", ...
         "iterations", "reached_max_iter", "median_rv", "num_models", ...
         "max_threads", "duration_s", "error_message"];
     needsHeader = ~isfile(csvPath);
@@ -36,6 +37,10 @@
         csv_escape(string(field_or_default(row, "participant_id", ""))), ...
         csv_escape(string(field_or_default(row, "status", "unknown"))), ...
         csv_escape(string(field_or_default(row, "ica_method", "amica"))), ...
+        csv_escape(string(field_or_default(row, "ica_tasks", ""))), ...
+        numeric_cell(field_or_default(row, "ica_samples", NaN)), ...
+        numeric_cell(field_or_default(row, "k_factor", NaN)), ...
+        numeric_cell(field_or_default(row, "common_channels", NaN)), ...
         numeric_cell(field_or_default(row, "n_components", NaN)), ...
         numeric_cell(field_or_default(row, "final_ll", NaN)), ...
         numeric_cell(field_or_default(row, "iterations", NaN)), ...
diff --git a/src/matlab/phase2_amica.m b/src/matlab/phase2_amica.m
index 4f4840a..8bb47c4 100644
--- a/src/matlab/phase2_amica.m
+++ b/src/matlab/phase2_amica.m
@@ -42,6 +42,13 @@
 %     RejSig       (1,1) double   default 4
 %     NumTopoICs   (1,1) double   default 30
 %     IcaMethod    (1,1) string   default "amica"        ("amica" or "runica")
+%     IcaTasks     (1,:) string   default ["DespicableMe","DiaryOfAWimpyKid",
+%                                          "FunwithFractals","ThePresent"]
+%     BidsRoot     (1,1) string   default ""             (only needed if
+%                                                          IcaTasks contain
+%                                                          tasks whose Phase 1
+%                                                          checkpoint is missing
+%                                                          and must be regenerated)
 %     EeglabRoot   (1,1) string   default ""             (probed if empty)
 %
 %   IcaMethod="amica" is the production path (matches the brief). The
@@ -49,6 +56,14 @@
 %   binary needs MPI runtime libs that Ubuntu runners do not have. Both
 %   paths populate EEG.icaweights / icasphere / icawinv, so downstream
 %   dipfit + figure code does not branch on the method.
+%
+%   IcaTasks controls the data pool used to train AMICA. ThePresent alone
+%   is ~210 s at 100 Hz, giving k=samples/n_chans^2 of ~1.1, well below
+%   the standard k>=20 threshold for stable ICA. The default concatenates
+%   the four passive-viewing movie tasks (k ~ 3.5), matching the data-pool
+%   expansion pattern in the reference pipeline. The downstream analysis
+%   stays on Task=ThePresent only; the multi-task pool is just the ICA
+%   training set. Set IcaTasks=[Task] to keep the legacy single-task path.
 
     arguments
         opts.PreprocDir (1, 1) string = "derivatives/preproc"
@@ -63,6 +78,8 @@
         opts.RejSig (1, 1) double {mustBePositive} = 4
         opts.NumTopoICs (1, 1) double {mustBePositive, mustBeInteger} = 30
         opts.IcaMethod (1, 1) string {mustBeMember(opts.IcaMethod, ["amica", "runica"])} = "amica"
+        opts.IcaTasks (1, :) string = ["DespicableMe", "DiaryOfAWimpyKid", "FunwithFractals", "ThePresent"]
+        opts.BidsRoot (1, 1) string = ""
         opts.EeglabRoot (1, 1) string = ""
     end
 
@@ -126,6 +143,10 @@
                 'participant_id', subjId, ...
                 'status', "failed_" + stage, ...
                 'ica_method', opts.IcaMethod, ...
+                'ica_tasks', strjoin(opts.IcaTasks, "+"), ...
+                'ica_samples', NaN, ...
+                'k_factor', NaN, ...
+                'common_channels', NaN, ...
                 'n_components', NaN, ...
                 'final_ll', NaN, ...
                 'iterations', NaN, ...
@@ -146,7 +167,7 @@
         'eeglab_root', char(eeglab_root)));
 end
 
-function qaRow = process_one_subject(EEG, subjId, head_model, opts)
+function qaRow = process_one_subject(targetEEG, subjId, head_model, opts)
     out_eeg_dir = fullfile(opts.OutDir, subjId, "eeg");
     out_amica_dir = fullfile(opts.OutDir, subjId, "_amica");
     out_fig_dir = fullfile(opts.OutDir, subjId, "figures");
@@ -154,16 +175,40 @@
         if ~isfolder(d); mkdir(d); end
     end
 
+    % Build the AMICA training set. Single-task mode (IcaTasks == [Task])
+    % keeps the legacy single-source decomposition. Multi-task mode
+    % (default) concatenates the cleaned per-task .set files and trains
+    % AMICA on the merged data, then transplants weights back onto the
+    % Task-only EEG for downstream analysis.
+    if isscalar(opts.IcaTasks) && opts.IcaTasks(1) == opts.Task
+        trainEEG = targetEEG;
+        mtInfo = struct( ...
+            'ica_tasks', opts.Task, ...
+            'ica_samples', size(targetEEG.data, 2), ...
+            'k_factor', size(targetEEG.data, 2) / (size(targetEEG.data, 1)^2), ...
+            'common_channels', size(targetEEG.data, 1));
+    else
+        [trainEEG, mtInfo] = hbn.prepare_multitask_amica_input(opts, subjId);
+        common = string({trainEEG.chanlocs.labels});
+        targetEEG = pop_select(targetEEG, 'channel', cellstr(common));
+    end
+
     switch lower(opts.IcaMethod)
         case "amica"
-            [EEG, amicaStats] = hbn.run_amica(EEG, out_amica_dir, opts);
+            [trainEEG, amicaStats] = hbn.run_amica(trainEEG, out_amica_dir, opts);
         case "runica"
-            [EEG, amicaStats] = hbn.run_infomax(EEG, out_amica_dir, opts);
+            [trainEEG, amicaStats] = hbn.run_infomax(trainEEG, out_amica_dir, opts);
         otherwise
             error("hbn:phase2:unknown_ica_method", ...
                 "IcaMethod=%s not supported", opts.IcaMethod);
     end
 
+    if isscalar(opts.IcaTasks) && opts.IcaTasks(1) == opts.Task
+        EEG = trainEEG;
+    else
+        EEG = hbn.apply_ica_weights(trainEEG, targetEEG);
+    end
+
     [EEG, dipoleStats] = hbn.fit_dipoles(EEG, head_model);
 
     hbn.save_ic_topo_figure(EEG, out_fig_dir, subjId, opts.NumTopoICs);
@@ -174,6 +219,10 @@
 
     qaRow = struct( ...
         'ica_method', opts.IcaMethod, ...
+        'ica_tasks', mtInfo.ica_tasks, ...
+        'ica_samples', mtInfo.ica_samples, ...
+        'k_factor', mtInfo.k_factor, ...
+        'common_channels', mtInfo.common_channels, ...
         'n_components', EEG.nbchan, ...
         'final_ll', amicaStats.final_ll, ...
         'iterations', amicaStats.iterations, ...
diff --git a/tests/matlab/test_phase2_smoke.m b/tests/matlab/test_phase2_smoke.m
index b5f1907..b2d3848 100644
--- a/tests/matlab/test_phase2_smoke.m
+++ b/tests/matlab/test_phase2_smoke.m
@@ -39,7 +39,8 @@
         OutDir = amicaDir, ...
         MaxIter = 100, ...
         DoReject = false, ...
-        IcaMethod = "runica");
+        IcaMethod = "runica", ...
+        IcaTasks = "ThePresent");
 
     assert(isfile(paramsPath), "params.json missing at %s", paramsPath);
     qaPath = fullfile(amicaDir, "qa_amica.csv");
@@ -70,7 +71,7 @@
 
     params = jsondecode(fileread(paramsPath));
     requiredFields = ["NumModels", "MaxIter", "MaxThreads", "DoReject", ...
-        "NumRej", "RejSig", "PreprocDir", "OutDir", "Task", "IcaMethod", ...
+        "NumRej", "RejSig", "PreprocDir", "OutDir", "Task", "IcaMethod", "IcaTasks", ...
         "matlab_version", "eeglab_version", "git_sha", "timestamp_iso", ...
         "n_subjects_input", "n_subjects_ok", "n_subjects_failed", "eeglab_root"];
     missingFields = setdiff(requiredFields, string(fieldnames(params)));