Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions scripts/run_phase2_three_subjects_multitask.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
function run_phase2_three_subjects_multitask
%RUN_PHASE2_THREE_SUBJECTS_MULTITASK Phase 2 with multi-task AMICA pool.
% Phase 2 production run on 3 subjects using the 4-movie task pool for
% ICA training (DespicableMe, DiaryOfAWimpyKid, FunwithFractals,
% ThePresent). Each task is regenerated through Phase 1 cleaning if
% the cleaned .set is missing. AMICA fits the merged cleaned data; the
% resulting weights are transplanted onto the ThePresent-only Phase 1
% .set before dipfit + figures + checkpoint.
%
% Invoked from the worktree root:
% matlab -batch "addpath('scripts'); run_phase2_three_subjects_multitask()"

eeglab_path = '/Users/yahya/Documents/git/eeg/eeglab';
bids_root = '/Volumes/S1/Datasets/HBN/L100/R3_L100_bdf';

script_dir = fileparts(mfilename('fullpath'));
repo_root = fileparts(script_dir);
cd(repo_root);

addpath(eeglab_path);
evalc('eeglab nogui');
addpath(genpath(fullfile(repo_root, 'src', 'matlab')));

fprintf('[phase2-mt] cwd: %s\n', pwd);
fprintf('[phase2-mt] starting at %s\n', datestr(now, 'yyyy-mm-dd HH:MM:SS')); %#ok<DATST,TNOW1>

preprocDir = fullfile(repo_root, 'derivatives', 'preproc');
amicaDir = fullfile(repo_root, 'derivatives', 'amica');

icaTasks = ["DespicableMe", "DiaryOfAWimpyKid", "FunwithFractals", "ThePresent"];
fprintf('[phase2-mt] IcaTasks: %s\n', strjoin(icaTasks, ', '));

% Ensure phase 1 .set for the target task (ThePresent) on all 3 subjects.
% Per-task .sets for the other 3 movies are produced lazily by
% hbn.prepare_multitask_amica_input as needed.
targetSets = dir(fullfile(preprocDir, 'sub-*', 'eeg', '*_task-ThePresent_desc-clean_eeg.set'));
if numel(targetSets) < 3
fprintf('[phase2-mt] regenerating Phase 1 ThePresent on 3 subjects ...\n');
phase1_preprocess( ...
'BidsRoot', bids_root, ...
'OutDir', preprocDir, ...
'Task', 'ThePresent', ...
'SmokeSubjectCount', 3);
else
fprintf('[phase2-mt] found %d ThePresent .set; using existing\n', numel(targetSets));
end

fprintf('[phase2-mt] phase2_amica multi-task starting at %s\n', ...
datestr(now, 'yyyy-mm-dd HH:MM:SS')); %#ok<DATST,TNOW1>
phase2_amica( ...
'PreprocDir', preprocDir, ...
'OutDir', amicaDir, ...
'BidsRoot', bids_root, ...
'IcaTasks', icaTasks);

fprintf('[phase2-mt] finished at %s\n', ...
datestr(now, 'yyyy-mm-dd HH:MM:SS')); %#ok<DATST,TNOW1>
end
37 changes: 37 additions & 0 deletions src/matlab/+hbn/apply_ica_weights.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
function targetEEG = apply_ica_weights(sourceEEG, targetEEG)
%APPLY_ICA_WEIGHTS Copy ICA weights from sourceEEG onto targetEEG.
%
% targetEEG = hbn.apply_ica_weights(sourceEEG, targetEEG) installs the
% AMICA weights/sphere from sourceEEG into targetEEG (which is typically
% the ThePresent-only Phase 1 .set restricted to the same channel set).
% Used after multi-task AMICA training to attach the multi-task-learned
% weights to the single-task analysis EEG.
%
% Assumes channel set match: targetEEG must already be on the same
% channel intersection as sourceEEG (use pop_select beforehand).

arguments
sourceEEG (1, 1) struct
targetEEG (1, 1) struct
end

nChansTarget = size(targetEEG.data, 1);
nChansSource = size(sourceEEG.icaweights, 2);
if nChansTarget ~= nChansSource
error("hbn:phase2:apply_weights:chan_mismatch", ...
"target has %d channels, source AMICA was trained on %d; " + ...
"pop_select target to the same intersection first", ...
nChansTarget, nChansSource);
end
if isempty(sourceEEG.icaweights) || isempty(sourceEEG.icasphere)
error("hbn:phase2:apply_weights:no_weights", ...
"sourceEEG has no AMICA weights to copy");
end

targetEEG.icaweights = sourceEEG.icaweights;
targetEEG.icasphere = sourceEEG.icasphere;
targetEEG.icachansind = sourceEEG.icachansind;
targetEEG.icawinv = sourceEEG.icawinv;
targetEEG.icaact = [];
targetEEG = eeg_checkset(targetEEG, 'ica');
end
106 changes: 106 additions & 0 deletions src/matlab/+hbn/prepare_multitask_amica_input.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
function [mergedEEG, info] = prepare_multitask_amica_input(opts, subjId)
%PREPARE_MULTITASK_AMICA_INPUT Concatenate cleaned per-task .set files for a subject.
%
% [mergedEEG, info] = hbn.prepare_multitask_amica_input(opts, subjId)
% ensures Phase 1 `.set` checkpoints exist for each task in
% opts.IcaTasks (re-runs phase1_preprocess for missing ones), loads
% them, restricts every dataset to the channel intersection, and
% concatenates via pop_mergeset.
%
% This is the data-pool expansion described in the project brief:
% ICA training uses multi-task data so the sample-per-weight ratio
% (k = nSamples / nChans^2) lifts above the under-determined regime.
% Phase 4 onwards still analyzes ThePresent only; only the AMICA
% weights are estimated on the merged pool.
%
% info struct (per-subject diagnostics for qa_amica.csv):
% info.ica_tasks string array, tasks actually used
% info.ica_samples total samples in merged dataset
% info.k_factor ica_samples / nChans^2
% info.common_channels number of channels surviving intersection

arguments
opts (1, 1) struct
subjId (1, 1) string
end

if numel(opts.IcaTasks) == 0
error("hbn:phase2:multitask:no_tasks", ...
"IcaTasks is empty; need at least one task to load");
end

preprocDir = opts.PreprocDir;

taskEEGs = cell(0, 1);
usedTasks = string.empty;
for t = opts.IcaTasks(:)'
setName = sprintf("%s_task-%s_desc-clean_eeg.set", subjId, t);
setPath = fullfile(preprocDir, subjId, "eeg", setName);

if ~isfile(setPath)
if isfield(opts, 'BidsRoot') && opts.BidsRoot ~= ""
fprintf("[phase2-multitask] regenerating phase 1 for %s task=%s\n", subjId, t);
try
phase1_preprocess( ...
'BidsRoot', opts.BidsRoot, ...
'OutDir', preprocDir, ...
'Task', t, ...
'Subjects', subjId);
catch ME
warning("hbn:phase2:multitask:phase1_failed", ...
"phase1 regen failed for %s task=%s: %s; skipping task", ...
subjId, t, ME.message);
continue;
end
end
end

if ~isfile(setPath)
warning("hbn:phase2:multitask:missing_task", ...
"no Phase 1 .set for %s task=%s; skipping task", subjId, t);
continue;
end

EEG = pop_loadset('filename', char(setName), ...
'filepath', char(fullfile(preprocDir, subjId, "eeg")));
taskEEGs{end + 1, 1} = EEG; %#ok<AGROW>
usedTasks(end + 1) = t; %#ok<AGROW>
end

if isempty(taskEEGs)
error("hbn:phase2:multitask:no_inputs", ...
"No Phase 1 .set files loaded for %s across IcaTasks=[%s]", ...
subjId, strjoin(opts.IcaTasks, ", "));
end

common = string({taskEEGs{1}.chanlocs.labels});
for k = 2:numel(taskEEGs)
common = intersect(common, string({taskEEGs{k}.chanlocs.labels}), 'stable');
end
if numel(common) < 32
error("hbn:phase2:multitask:few_common_channels", ...
"Only %d channels common across %d tasks for %s; cannot fit AMICA", ...
numel(common), numel(taskEEGs), subjId);
end

for k = 1:numel(taskEEGs)
taskEEGs{k} = pop_select(taskEEGs{k}, 'channel', cellstr(common));
end

if numel(taskEEGs) == 1
mergedEEG = taskEEGs{1};
else
mergedEEG = pop_mergeset([taskEEGs{:}], 1:numel(taskEEGs));
end

nSamples = size(mergedEEG.data, 2);
nChans = size(mergedEEG.data, 1);
info = struct( ...
'ica_tasks', strjoin(usedTasks, "+"), ...
'ica_samples', nSamples, ...
'k_factor', nSamples / (nChans^2), ...
'common_channels', nChans);

fprintf("[phase2-multitask] %s: merged %d tasks (%s), %d samples, %d common chans, k=%.2f\n", ...
subjId, numel(taskEEGs), strjoin(usedTasks, "+"), nSamples, nChans, info.k_factor);
end
7 changes: 6 additions & 1 deletion src/matlab/+hbn/write_qa_amica_csv.m
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
if ~isfolder(outDir); mkdir(outDir); end
csvPath = fullfile(outDir, "qa_amica.csv");

header = ["participant_id", "status", "ica_method", "n_components", "final_ll", ...
header = ["participant_id", "status", "ica_method", "ica_tasks", ...
"ica_samples", "k_factor", "common_channels", "n_components", "final_ll", ...
"iterations", "reached_max_iter", "median_rv", "num_models", ...
"max_threads", "duration_s", "error_message"];
needsHeader = ~isfile(csvPath);
Expand All @@ -36,6 +37,10 @@
csv_escape(string(field_or_default(row, "participant_id", ""))), ...
csv_escape(string(field_or_default(row, "status", "unknown"))), ...
csv_escape(string(field_or_default(row, "ica_method", "amica"))), ...
csv_escape(string(field_or_default(row, "ica_tasks", ""))), ...
numeric_cell(field_or_default(row, "ica_samples", NaN)), ...
numeric_cell(field_or_default(row, "k_factor", NaN)), ...
numeric_cell(field_or_default(row, "common_channels", NaN)), ...
numeric_cell(field_or_default(row, "n_components", NaN)), ...
numeric_cell(field_or_default(row, "final_ll", NaN)), ...
numeric_cell(field_or_default(row, "iterations", NaN)), ...
Expand Down
55 changes: 52 additions & 3 deletions src/matlab/phase2_amica.m
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,28 @@
% RejSig (1,1) double default 4
% NumTopoICs (1,1) double default 30
% IcaMethod (1,1) string default "amica" ("amica" or "runica")
% IcaTasks (1,:) string default ["DespicableMe","DiaryOfAWimpyKid",
% "FunwithFractals","ThePresent"]
% BidsRoot (1,1) string default "" (only needed if
% IcaTasks contain
% tasks whose Phase 1
% checkpoint is missing
% and must be regenerated)
% EeglabRoot (1,1) string default "" (probed if empty)
%
% IcaMethod="amica" is the production path (matches the brief). The
% CI smoke test sets IcaMethod="runica" because AMICA's native Fortran
% binary needs MPI runtime libs that Ubuntu runners do not have. Both
% paths populate EEG.icaweights / icasphere / icawinv, so downstream
% dipfit + figure code does not branch on the method.
%
% IcaTasks controls the data pool used to train AMICA. ThePresent alone
% is ~210 s at 100 Hz, giving k=samples/n_chans^2 of ~1.1, well below
% the standard k>=20 threshold for stable ICA. The default concatenates
% the four passive-viewing movie tasks (k ~ 3.5), matching the data-pool
% expansion pattern in the reference pipeline. The downstream analysis
% stays on Task=ThePresent only; the multi-task pool is just the ICA
% training set. Set IcaTasks=[Task] to keep the legacy single-task path.

arguments
opts.PreprocDir (1, 1) string = "derivatives/preproc"
Expand All @@ -63,6 +78,8 @@
opts.RejSig (1, 1) double {mustBePositive} = 4
opts.NumTopoICs (1, 1) double {mustBePositive, mustBeInteger} = 30
opts.IcaMethod (1, 1) string {mustBeMember(opts.IcaMethod, ["amica", "runica"])} = "amica"
opts.IcaTasks (1, :) string = ["DespicableMe", "DiaryOfAWimpyKid", "FunwithFractals", "ThePresent"]
opts.BidsRoot (1, 1) string = ""
opts.EeglabRoot (1, 1) string = ""
end

Expand Down Expand Up @@ -126,6 +143,10 @@
'participant_id', subjId, ...
'status', "failed_" + stage, ...
'ica_method', opts.IcaMethod, ...
'ica_tasks', strjoin(opts.IcaTasks, "+"), ...
'ica_samples', NaN, ...
'k_factor', NaN, ...
'common_channels', NaN, ...
'n_components', NaN, ...
'final_ll', NaN, ...
'iterations', NaN, ...
Expand All @@ -146,24 +167,48 @@
'eeglab_root', char(eeglab_root)));
end

function qaRow = process_one_subject(EEG, subjId, head_model, opts)
function qaRow = process_one_subject(targetEEG, subjId, head_model, opts)
out_eeg_dir = fullfile(opts.OutDir, subjId, "eeg");
out_amica_dir = fullfile(opts.OutDir, subjId, "_amica");
out_fig_dir = fullfile(opts.OutDir, subjId, "figures");
for d = [out_eeg_dir, out_amica_dir, out_fig_dir]
if ~isfolder(d); mkdir(d); end
end

% Build the AMICA training set. Single-task mode (IcaTasks == [Task])
% keeps the legacy single-source decomposition. Multi-task mode
% (default) concatenates the cleaned per-task .set files and trains
% AMICA on the merged data, then transplants weights back onto the
% Task-only EEG for downstream analysis.
if isscalar(opts.IcaTasks) && opts.IcaTasks(1) == opts.Task
trainEEG = targetEEG;
mtInfo = struct( ...
'ica_tasks', opts.Task, ...
'ica_samples', size(targetEEG.data, 2), ...
'k_factor', size(targetEEG.data, 2) / (size(targetEEG.data, 1)^2), ...
'common_channels', size(targetEEG.data, 1));
else
[trainEEG, mtInfo] = hbn.prepare_multitask_amica_input(opts, subjId);
common = string({trainEEG.chanlocs.labels});
targetEEG = pop_select(targetEEG, 'channel', cellstr(common));
end

switch lower(opts.IcaMethod)
case "amica"
[EEG, amicaStats] = hbn.run_amica(EEG, out_amica_dir, opts);
[trainEEG, amicaStats] = hbn.run_amica(trainEEG, out_amica_dir, opts);
case "runica"
[EEG, amicaStats] = hbn.run_infomax(EEG, out_amica_dir, opts);
[trainEEG, amicaStats] = hbn.run_infomax(trainEEG, out_amica_dir, opts);
otherwise
error("hbn:phase2:unknown_ica_method", ...
"IcaMethod=%s not supported", opts.IcaMethod);
end

if isscalar(opts.IcaTasks) && opts.IcaTasks(1) == opts.Task
EEG = trainEEG;
else
EEG = hbn.apply_ica_weights(trainEEG, targetEEG);
end

[EEG, dipoleStats] = hbn.fit_dipoles(EEG, head_model);

hbn.save_ic_topo_figure(EEG, out_fig_dir, subjId, opts.NumTopoICs);
Expand All @@ -174,6 +219,10 @@

qaRow = struct( ...
'ica_method', opts.IcaMethod, ...
'ica_tasks', mtInfo.ica_tasks, ...
'ica_samples', mtInfo.ica_samples, ...
'k_factor', mtInfo.k_factor, ...
'common_channels', mtInfo.common_channels, ...
'n_components', EEG.nbchan, ...
'final_ll', amicaStats.final_ll, ...
'iterations', amicaStats.iterations, ...
Expand Down
5 changes: 3 additions & 2 deletions tests/matlab/test_phase2_smoke.m
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@
OutDir = amicaDir, ...
MaxIter = 100, ...
DoReject = false, ...
IcaMethod = "runica");
IcaMethod = "runica", ...
IcaTasks = "ThePresent");

assert(isfile(paramsPath), "params.json missing at %s", paramsPath);
qaPath = fullfile(amicaDir, "qa_amica.csv");
Expand Down Expand Up @@ -70,7 +71,7 @@

params = jsondecode(fileread(paramsPath));
requiredFields = ["NumModels", "MaxIter", "MaxThreads", "DoReject", ...
"NumRej", "RejSig", "PreprocDir", "OutDir", "Task", "IcaMethod", ...
"NumRej", "RejSig", "PreprocDir", "OutDir", "Task", "IcaMethod", "IcaTasks", ...
"matlab_version", "eeglab_version", "git_sha", "timestamp_iso", ...
"n_subjects_input", "n_subjects_ok", "n_subjects_failed", "eeglab_root"];
missingFields = setdiff(requiredFields, string(fieldnames(params)));
Expand Down
Loading