From ab81d8379046e2313e3ac37a3c23ae9e77c78ed4 Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Wed, 1 Oct 2025 20:22:29 +0200 Subject: [PATCH 01/19] Bug fix --- src/metrics/sem/helper.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/metrics/sem/helper.py b/src/metrics/sem/helper.py index 4fd90259b..8685cfaa5 100644 --- a/src/metrics/sem/helper.py +++ b/src/metrics/sem/helper.py @@ -347,14 +347,15 @@ def main(par): # Create a symmetric (causally-wrong) baseline GRN print(f"Creating baseline GRN") A_baseline = np.copy(A) - for j in range(A.shape[1]): - np.random.shuffle(A[:j, j]) - np.random.shuffle(A[j+1:, j]) + for j in range(A_baseline.shape[1]): + np.random.shuffle(A_baseline[:j, j]) + np.random.shuffle(A_baseline[j+1:, j]) assert np.any(A_baseline != A) # Evaluate inferred GRN print("\n======== Evaluate inferred GRN ========") scores = evaluate_grn(X_controls, delta_X, is_train, is_reporter, A, signed=use_signs) + # Evaluate baseline GRN print("\n======== Evaluate shuffled GRN ========") n_repeats = 1 From 80195221166a937d01db53e2babb21e4af6305a5 Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Sat, 4 Oct 2025 17:23:32 +0200 Subject: [PATCH 02/19] Fix VC metric + replace anchor_regression by new metric 'regression3' --- common | 2 +- src/metrics/anchor_regression/helper.py | 262 --------- src/metrics/regression_3/helper.py | 196 +++++++ .../run_local.sh | 10 +- .../script.py | 8 +- src/metrics/sem/helper.py | 10 +- src/metrics/vc/helper.py | 517 ++++++++---------- src/metrics/vc/improved_helper.py | 0 src/metrics/vc/new_helper.py | 299 ++++++++++ src/metrics/vc/ultra_fast_helper.py | 0 10 files changed, 751 insertions(+), 553 deletions(-) delete mode 100644 src/metrics/anchor_regression/helper.py create mode 100644 src/metrics/regression_3/helper.py rename src/metrics/{anchor_regression => regression_3}/run_local.sh (84%) rename src/metrics/{anchor_regression => regression_3}/script.py (90%) delete mode 100644 src/metrics/vc/improved_helper.py create mode 100644 src/metrics/vc/new_helper.py delete mode 100644 src/metrics/vc/ultra_fast_helper.py diff --git a/common b/common index 876036f71..f01ff2170 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit 876036f71713cbd79285b108ab0a9a8238f2b5e1 +Subproject commit f01ff2170161295e89014ee5453c61b29b4e4e77 diff --git a/src/metrics/anchor_regression/helper.py b/src/metrics/anchor_regression/helper.py deleted file mode 100644 index 9c29eb042..000000000 --- a/src/metrics/anchor_regression/helper.py +++ /dev/null @@ -1,262 +0,0 @@ -from typing import Tuple -import os -import sys -import traceback -import random -import h5py -import numpy as np -import pandas as pd -import tqdm -from scipy.sparse.linalg import LinearOperator -from scipy.stats import pearsonr, wilcoxon -from scipy.sparse import csr_matrix -from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder -import anndata as ad -import warnings -warnings.filterwarnings("ignore", category=UserWarning) - -# For reproducibility purposes -seed = 0xCAFE -os.environ['PYTHONHASHSEED'] = str(seed) -random.seed(seed) -np.random.seed(seed) - -from util import read_prediction, manage_layer -from dataset_config import DATASET_GROUPS - - -def encode_obs_cols(adata, cols): - """Encode observation columns to integer codes.""" - encoded = [] - for col in cols: - if col in adata.obs: - codes = LabelEncoder().fit_transform(adata.obs[col].values) - encoded.append(codes) - return encoded - - -def combine_multi_index(*arrays) -> np.array: - """Combine parallel label arrays into a single integer label per position.""" - A = np.stack(arrays) - n_classes = tuple(int(A[i].max()) + 1 for i in range(A.shape[0])) - return np.ravel_multi_index(A, dims=n_classes, order='C') - - -def get_whitening_transform(Z: np.ndarray, gamma: float = 1.0) -> LinearOperator: - """Get whitening transformation. - - Args: - Z: Anchor variables. - gamma: Anchor strength. - - Returns: - Sparse linear operator corresponding to a whitening transform. - """ - n, k = Z.shape - - # Precompute Gram inverse with jitter for stability - ZTZ = Z.T @ Z + 1e-10 * np.eye(k) - ZTZ_inv = np.linalg.inv(ZTZ) - sqrt_gamma = np.sqrt(gamma) - - def matvec(v: np.ndarray) -> np.ndarray: - """Matrix-vector multiplication.""" - v = np.atleast_2d(v) - if v.shape[0] != n: - v = v.T - Pv = Z @ (ZTZ_inv @ (Z.T @ v)) - out = v + (sqrt_gamma - 1.0) * Pv - return out if out.shape[1] > 1 else out.ravel() - - return LinearOperator((n, n), matvec=matvec, rmatvec=matvec) - - -def anchor_regression( - X: np.ndarray, - Z: np.ndarray, - Y: np.ndarray, - l2_reg: float = 1e-2, - anchor_strength: float = 1.0 -) -> np.ndarray: - """Anchor regression for causal inference under confounding. - - Args: - X: input features, of shape (n, d). - Z: anchor variables. The model is required to be invariant to - these environmental variables. Shape (n, z). - Y: predicted variables, of shape (n, u). - l2_reg: L2 regularization strength. - anchor_strength: Strength of anchor regularization. - 0 = standard regression, higher = more anchor regularization. - - Returns: - Inferred parameters, of shape (d, u). - """ - - # Whitening transformation - W = get_whitening_transform(Z, gamma=anchor_strength) - X_t = W @ X - Y_t = W @ Y - - # Ridge regression on the whitened data - sigma_xx = X_t.T @ X_t / X_t.shape[0] - sigma_xy = X_t.T @ Y_t / Y_t.shape[0] - theta = np.linalg.solve(sigma_xx + l2_reg * np.eye(X_t.shape[1]), sigma_xy) - - return theta - - -def compute_stabilities( - X: np.ndarray, - y: np.ndarray, - Z: np.ndarray, - A: np.ndarray, - is_selected: np.ndarray, - eps: float = 1e-50 -) -> float: - theta0_signed = anchor_regression(X, Z, y, anchor_strength=1) - theta0_signed = theta0_signed[is_selected] - theta0 = np.abs(theta0_signed) - theta0 /= np.sum(theta0) - - theta_signed = anchor_regression(X, Z, y, anchor_strength=20) - theta_signed = theta_signed[is_selected] - theta = np.abs(theta_signed) - theta /= np.sum(theta) - - stabilities = np.clip((theta0 - theta) / (theta0 + eps), 0, 1) - stabilities[np.sign(theta0_signed) != np.sign(theta_signed)] = 0 - - return stabilities - - - -def evaluate_gene_stability( - X: np.ndarray, - Z: np.ndarray, - A: np.ndarray, - j: int, - eps: float = 1e-50 -) -> float: - """Evaluate stability of regulatory relationships for a single target gene. - - Args: - X: Gene expression matrix (n_samples, n_genes). - Z: Anchor variables matrix (n_samples, n_anchors). - A: GRN adjacency matrix (n_genes, n_genes). - j: Target gene index. - eps: Small epsilon for numerical stability. - - Returns: - Stability score for gene j - """ - is_selected = np.array(A[:, j] != 0) - if (not np.any(is_selected)) or np.all(is_selected): - return 0.0 - assert not is_selected[j] - - # Exclude target gene from features - mask = np.ones(X.shape[1], dtype=bool) - mask[j] = False - - stabilities_selected = np.mean(compute_stabilities(X[:, mask], X[:, j], Z, A, is_selected[mask], eps=eps)) - #stabilities_non_selected = np.mean(compute_stabilities(X[:, mask], X[:, j], Z, A, ~is_selected[mask], eps=eps)) - - #score = (stabilities_selected - stabilities_non_selected) / (stabilities_selected + stabilities_non_selected + eps) - score = np.mean(stabilities_selected) - - return score - - -def main(par): - """Main anchor regression evaluation function.""" - # Load evaluation data - adata = ad.read_h5ad(par['evaluation_data']) - dataset_id = adata.uns['dataset_id'] - method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id'] - - # Get dataset-specific anchor variables - if dataset_id not in DATASET_GROUPS: - raise ValueError(f"Dataset {dataset_id} not found in DATASET_GROUPS") - - anchor_cols = DATASET_GROUPS[dataset_id].get('anchors', ['donor_id', 'plate_name']) - print(f"Using anchor variables: {anchor_cols}") - - # Manage layer - layer = manage_layer(adata, par) - X = adata.layers[layer] - if isinstance(X, csr_matrix): - X = X.toarray() - X = X.astype(np.float32) - - gene_names = adata.var_names - gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)} - - # Encode anchor variables - anchor_variables = encode_obs_cols(adata, anchor_cols) - anchor_encoded = combine_multi_index(*anchor_variables) - - if len(anchor_variables) == 0: - raise ValueError(f"No anchor variables found in dataset for columns: {anchor_cols}") - - # One-hot encode anchor variables - Z = OneHotEncoder(sparse_output=False, dtype=np.float32).fit_transform(anchor_encoded.reshape(-1, 1)) - print(f"Anchor matrix Z shape: {Z.shape}") - - # Load inferred GRN - df = read_prediction(par) - sources = df["source"].to_numpy() - targets = df["target"].to_numpy() - weights = df["weight"].to_numpy() - - A = np.zeros((len(gene_names), len(gene_names)), dtype=X.dtype) - for source, target, weight in zip(sources, targets, weights): - if (source in gene_dict) and (target in gene_dict): - i = gene_dict[source] - j = gene_dict[target] - A[i, j] = float(weight) - - # Only consider the genes that are actually present in the inferred GRN, - # and keep only the most-connected genes (for speed). - gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0)) - in_degrees = np.sum(A != 0, axis=0) - out_degrees = np.sum(A != 0, axis=1) - idx = np.argsort(np.maximum(out_degrees, in_degrees))[:-1000] - gene_mask[idx] = False - X = X[:, gene_mask] - X = X.toarray() if isinstance(X, csr_matrix) else X - A = A[gene_mask, :][:, gene_mask] - gene_names = gene_names[gene_mask] - - # Remove self-regulations from GRN - np.fill_diagonal(A, 0) - print(f"Evaluating {X.shape[1]} genes with {np.sum(A != 0)} regulatory links") - - # Center and scale dataset - scaler = StandardScaler() - X = scaler.fit_transform(X) - - # Create baseline GRN - A_baseline = np.copy(A) - for j in range(A_baseline.shape[1]): - np.random.shuffle(A_baseline[:j, j]) - np.random.shuffle(A_baseline[j+1:, j]) - assert np.any(A != A_baseline) - - scores, scores_baseline = [], [] - for j in tqdm.tqdm(range(X.shape[1]), desc="Evaluating gene stability"): - scores.append(evaluate_gene_stability(X, Z, A, j)) - scores = np.array(scores) - - # Calculate final score - final_score = np.mean(scores) - print(f"Method: {method_id}") - print(f"Anchor Regression Score: {final_score:.6f}") - - # Return results as DataFrame - results = { - 'anchor_regression': [final_score] - } - - df_results = pd.DataFrame(results) - return df_results diff --git a/src/metrics/regression_3/helper.py b/src/metrics/regression_3/helper.py new file mode 100644 index 000000000..9f2e427dc --- /dev/null +++ b/src/metrics/regression_3/helper.py @@ -0,0 +1,196 @@ +from typing import Tuple +import os +import sys +import traceback +import random +import h5py +import numpy as np +import pandas as pd +import tqdm +import xgboost +from scipy.sparse.linalg import LinearOperator +from scipy.stats import pearsonr, spearmanr, wilcoxon, ConstantInputWarning +from scipy.sparse import csr_matrix +from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder +from sklearn.linear_model import Ridge +import anndata as ad +import warnings +warnings.filterwarnings("ignore", category=UserWarning) +warnings.filterwarnings("ignore", category=ConstantInputWarning) + +# For reproducibility purposes +seed = 0xCAFE +os.environ['PYTHONHASHSEED'] = str(seed) +random.seed(seed) +np.random.seed(seed) + +from util import read_prediction, manage_layer +from dataset_config import DATASET_GROUPS + + +def encode_obs_cols(adata, cols): + """Encode observation columns to integer codes.""" + encoded = [] + for col in cols: + if col in adata.obs: + codes = LabelEncoder().fit_transform(adata.obs[col].values) + encoded.append(codes) + return encoded + + +def combine_multi_index(*arrays) -> np.array: + """Combine parallel label arrays into a single integer label per position.""" + A = np.stack(arrays) + n_classes = tuple(int(A[i].max()) + 1 for i in range(A.shape[0])) + return np.ravel_multi_index(A, dims=n_classes, order='C') + + +def compute_residual_correlations( + X_train: np.ndarray, + y_train: np.ndarray, + X_test: np.ndarray, + y_test: np.ndarray, + Z_test: np.ndarray +) -> np.ndarray: + model = xgboost.XGBRegressor(n_estimators=10) + #model = Ridge(alpha=10) + model.fit(X_train, y_train) + y_hat = model.predict(X_test) + residuals = y_test - y_hat + coefs = pearsonr(residuals[:, np.newaxis], Z_test, axis=0)[0] + coefs = np.nan_to_num(coefs, nan=0) + assert coefs.shape[0] == Z_test.shape[1] + return np.abs(coefs) + + +def main(par): + # Load evaluation data + adata = ad.read_h5ad(par['evaluation_data']) + dataset_id = adata.uns['dataset_id'] + method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id'] + + # Get dataset-specific anchor variables + if dataset_id not in DATASET_GROUPS: + raise ValueError(f"Dataset {dataset_id} not found in DATASET_GROUPS") + + anchor_cols = DATASET_GROUPS[dataset_id].get('anchors', ['donor_id', 'plate_name']) + print(f"Using anchor variables: {anchor_cols}") + + # Manage layer + layer = manage_layer(adata, par) + X = adata.layers[layer] + if isinstance(X, csr_matrix): + X = X.toarray() + X = X.astype(np.float32) + + gene_names = adata.var_names + gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)} + + # Encode anchor variables + anchor_variables = encode_obs_cols(adata, anchor_cols) + anchor_encoded = combine_multi_index(*anchor_variables) + + if len(anchor_variables) == 0: + raise ValueError(f"No anchor variables found in dataset for columns: {anchor_cols}") + + # One-hot encode anchor variables + Z = OneHotEncoder(sparse_output=False, dtype=np.float32).fit_transform(anchor_encoded.reshape(-1, 1)) + print(f"Anchor matrix Z shape: {Z.shape}") + + # Load inferred GRN + df = read_prediction(par) + sources = df["source"].to_numpy() + targets = df["target"].to_numpy() + weights = df["weight"].to_numpy() + + A = np.zeros((len(gene_names), len(gene_names)), dtype=X.dtype) + for source, target, weight in zip(sources, targets, weights): + if (source in gene_dict) and (target in gene_dict): + i = gene_dict[source] + j = gene_dict[target] + A[i, j] = float(weight) + + # Only consider the genes that are actually present in the inferred GRN, + # and keep only the most-connected genes (for speed). + gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0)) + in_degrees = np.sum(A != 0, axis=0) + out_degrees = np.sum(A != 0, axis=1) + idx = np.argsort(np.maximum(out_degrees, in_degrees))[:-1000] + gene_mask[idx] = False + X = X[:, gene_mask] + X = X.toarray() if isinstance(X, csr_matrix) else X + A = A[gene_mask, :][:, gene_mask] + gene_names = gene_names[gene_mask] + + # Remove self-regulations + np.fill_diagonal(A, 0) + print(f"Evaluating {X.shape[1]} genes with {np.sum(A != 0)} regulatory links") + + # Create baseline model + A_baseline = np.copy(A) + for j in range(A.shape[1]): + np.random.shuffle(A_baseline[:j, j]) + np.random.shuffle(A_baseline[j+1:, j]) + assert np.any(A_baseline != A) + + scores, baseline_scores = [], [] + for group in np.unique(anchor_encoded): + + # Train/test split + mask = (anchor_encoded != group) + X_train = X[mask, :] + X_test = X[~mask, :] + + # Standardize features + #scaler = StandardScaler() + #X_train = scaler.fit_transform(X_train) + #X_test = scaler.transform(X_test) + + for j in tqdm.tqdm(range(X_train.shape[1])): + + # Evaluate inferred GRN + selected = (A[:, j] != 0) + unselected = ~np.copy(selected) + unselected[j] = False + if (not np.any(selected)) or (not np.any(unselected)): + continue + else: + coefs = compute_residual_correlations( + X_train[:, selected], + X_train[:, j], + X_test[:, selected], + X_test[:, j], + X_test[:, ~selected] + ) + scores.append(np.mean(coefs)) + + # Evaluate baseline GRN + selected = (A_baseline[:, j] != 0) + unselected = ~np.copy(selected) + unselected[j] = False + coefs = compute_residual_correlations( + X_train[:, selected], + X_train[:, j], + X_test[:, selected], + X_test[:, j], + X_test[:, ~selected] + ) + baseline_scores.append(np.mean(coefs)) + scores = np.array(scores) + baseline_scores = np.array(baseline_scores) + + p_value = wilcoxon(baseline_scores, scores, alternative="greater").pvalue + p_value = max(p_value, 1e-300) + + # Calculate final score + final_score = -np.log10(p_value) + print(f"Anchor Regression Score: {final_score:.6f}") + print(f"Method: {method_id}") + + # Return results as DataFrame + results = { + 'regression_3': [final_score] + } + + df_results = pd.DataFrame(results) + return df_results diff --git a/src/metrics/anchor_regression/run_local.sh b/src/metrics/regression_3/run_local.sh similarity index 84% rename from src/metrics/anchor_regression/run_local.sh rename to src/metrics/regression_3/run_local.sh index fc872cc59..78e66b9ac 100644 --- a/src/metrics/anchor_regression/run_local.sh +++ b/src/metrics/regression_3/run_local.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --job-name=anchor_regression +#SBATCH --job-name=regression_3 #SBATCH --output=logs/%j.out #SBATCH --error=logs/%j.err #SBATCH --ntasks=1 @@ -12,7 +12,7 @@ set -euo pipefail -save_dir="output/anchor_regression" +save_dir="output/regression_3" mkdir -p "$save_dir" # datasets to process @@ -21,7 +21,7 @@ datasets=( 'op' "300BCG" 'parsebioscience') #"300BCG" "ibd" 'parsebioscience' methods=("pearson_corr" "positive_control" "negative_control" "ppcor" "portia" "scenic" "grnboost" "scprint" "scenicplus" "celloracle" "scglue" "figr" "granie") # temporary file to collect CSV rows -combined_csv="${save_dir}/anchor_regression_scores.csv" +combined_csv="${save_dir}/regression_3_scores.csv" echo "dataset,method,metric,value" > "$combined_csv" for dataset in "${datasets[@]}"; do @@ -31,7 +31,7 @@ for dataset in "${datasets[@]}"; do for method in "${methods[@]}"; do prediction="resources/results/${dataset}/${dataset}.${method}.${method}.prediction.h5ad" - score="${save_dir}/anchor_regression_${dataset}_${method}.h5ad" + score="${save_dir}/regression_3_${dataset}_${method}.h5ad" if [[ ! -f "$prediction" ]]; then echo "File not found: $prediction, skipping..." @@ -39,7 +39,7 @@ for dataset in "${datasets[@]}"; do fi echo -e "\nProcessing method: $method\n" - python src/metrics/anchor_regression/script.py \ + python src/metrics/regression_3/script.py \ --prediction "$prediction" \ --evaluation_data "$evaluation_data" \ --score "$score" diff --git a/src/metrics/anchor_regression/script.py b/src/metrics/regression_3/script.py similarity index 90% rename from src/metrics/anchor_regression/script.py rename to src/metrics/regression_3/script.py index a4687b358..b140d64c5 100644 --- a/src/metrics/anchor_regression/script.py +++ b/src/metrics/regression_3/script.py @@ -30,14 +30,14 @@ sys.path.append(meta["resources_dir"]) except: meta = { - "resources_dir":'src/metrics/anchor_regression/', + "resources_dir":'src/metrics/regression_3/', "util_dir": 'src/utils', - 'helper_dir': 'src/metrics/anchor_regression/' + 'helper_dir': 'src/metrics/regression_3/' } sys.path.append(meta["resources_dir"]) sys.path.append(meta["util_dir"]) sys.path.append(meta["helper_dir"]) -from helper import main as main_anchor +from helper import main as main_reg3 from util import format_save_score @@ -48,7 +48,7 @@ par[key] = value if __name__ == "__main__": - output = main_anchor(par) + output = main_reg3(par) dataset_id = ad.read_h5ad(par['evaluation_data'], backed='r').uns['dataset_id'] method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id'] diff --git a/src/metrics/sem/helper.py b/src/metrics/sem/helper.py index 8685cfaa5..13747164a 100644 --- a/src/metrics/sem/helper.py +++ b/src/metrics/sem/helper.py @@ -112,9 +112,12 @@ def neumann_series(A: torch.Tensor, k: int = 2) -> torch.Tensor: Returns: Approximated inverse of I - A. """ - B = torch.eye(A.shape[0], device=A.device, dtype=A.dtype) - for k in range(k): - B = B + torch.mm(B, A) + I = torch.eye(A.shape[0], device=A.device, dtype=A.dtype) + term = I.clone() + B = I.clone() + for _ in range(k): + term = term @ A + B = B + term return B @@ -344,7 +347,6 @@ def main(par): print(f"Proportion of reporter genes: {np.mean(is_reporter)}") print(f"Use regulatory modes/signs: {use_signs}") - # Create a symmetric (causally-wrong) baseline GRN print(f"Creating baseline GRN") A_baseline = np.copy(A) for j in range(A_baseline.shape[1]): diff --git a/src/metrics/vc/helper.py b/src/metrics/vc/helper.py index ed715ee8a..3735f420d 100644 --- a/src/metrics/vc/helper.py +++ b/src/metrics/vc/helper.py @@ -1,4 +1,5 @@ import os +import traceback from typing import Tuple, Dict import sys import tqdm @@ -8,7 +9,8 @@ import pandas as pd import torch from scipy.sparse import csr_matrix -from sklearn.model_selection import GroupKFold +from scipy.stats import wilcoxon +from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold from sklearn.preprocessing import StandardScaler, LabelEncoder from torch.utils.data import Dataset import anndata as ad @@ -22,14 +24,15 @@ NUMPY_DTYPE = np.float32 # For reproducibility purposes -seed = 0xCAFE -os.environ['PYTHONHASHSEED'] = str(seed) -random.seed(seed) -np.random.seed(seed) -torch.manual_seed(seed) -torch.cuda.manual_seed(seed) -torch.cuda.manual_seed_all(seed) -torch.use_deterministic_algorithms(True) +def set_seed(): + seed = 0xCAFE + os.environ['PYTHONHASHSEED'] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.use_deterministic_algorithms(True) from util import read_prediction, manage_layer from dataset_config import DATASET_GROUPS @@ -67,154 +70,6 @@ def create_control_matching(are_controls: np.ndarray, match_groups: np.ndarray) return control_map, match_groups -def compute_pds_cell_level(X_true, X_pred, perturbations, gene_names, max_cells_per_pert=10): - """ - Compute PDS at individual cell level (more challenging than mean-based PDS). - - For each individual predicted cell, find its rank when compared to - true mean profiles of all perturbations. - """ - unique_perts = np.unique(perturbations) - n_perts = len(unique_perts) - - print(f"Computing cell-level PDS for {n_perts} unique perturbations") - - if n_perts < 2: - return 0.0 - - # Compute mean true expression profiles per perturbation - true_means = {} - for pert in unique_perts: - mask = (perturbations == pert) - if np.sum(mask) == 0: - continue - true_means[pert] = np.mean(X_true[mask, :], axis=0) - - all_scores = [] - - # For each perturbation, sample some cells and compute their PDS - for pert in unique_perts: - if pert not in true_means: - continue - - # Get cells for this perturbation - mask = (perturbations == pert) - pert_indices = np.where(mask)[0] - - if len(pert_indices) == 0: - continue - - # Sample max_cells_per_pert cells to avoid bias from perturbations with many cells - # Use seeded random generator for reproducibility - cell_rng = np.random.RandomState(seed + int(pert)) # Different seed per perturbation - sampled_indices = cell_rng.choice( - pert_indices, - size=min(max_cells_per_pert, len(pert_indices)), - replace=False - ) - - for cell_idx in sampled_indices: - # Get predicted profile for this individual cell - pred_vec = X_pred[cell_idx, :].copy() - - # Calculate distances to all true mean profiles - dists = [] - for t in unique_perts: - if t not in true_means: - continue - true_vec = true_means[t].copy() - - # Remove target gene if it exists - if str(pert) in gene_names: - gene_idx = np.where(gene_names == str(pert))[0] - if len(gene_idx) > 0: - true_vec = np.delete(true_vec, gene_idx) - pred_vec_temp = np.delete(pred_vec, gene_idx) - else: - pred_vec_temp = pred_vec - else: - pred_vec_temp = pred_vec - - dist = cityblock(pred_vec_temp, true_vec) - dists.append((t, dist)) - - # Sort by distance and find rank of correct perturbation - dists_sorted = sorted(dists, key=lambda x: x[1]) - true_rank = next((i for i, (t, _) in enumerate(dists_sorted) if t == pert), n_perts-1) - - # Cell-level PDS - pds = 1 - (true_rank / (n_perts - 1)) if n_perts > 1 else 1.0 - all_scores.append(pds) - - mean_pds = np.mean(all_scores) if all_scores else 0.0 - print(f"Cell-level PDS scores: min={min(all_scores):.3f}, max={max(all_scores):.3f}, mean={mean_pds:.3f} (n_cells={len(all_scores)})") - return mean_pds - - -def compute_pds(X_true, X_pred, perturbations, gene_names): - """ - Compute both mean-level and cell-level PDS for comparison. - """ - # Mean-level PDS (original approach) - unique_perts = np.unique(perturbations) - n_perts = len(unique_perts) - - print(f"Computing mean-level PDS for {n_perts} unique perturbations") - - if n_perts < 2: - return 0.0, 0.0 - - # Compute mean expression profiles per perturbation - true_means = {} - pred_means = {} - - for pert in unique_perts: - mask = (perturbations == pert) - if np.sum(mask) == 0: - continue - true_means[pert] = np.mean(X_true[mask, :], axis=0) - pred_means[pert] = np.mean(X_pred[mask, :], axis=0) - - scores = {} - for pert in unique_perts: - if pert not in pred_means or pert not in true_means: - continue - - pred_vec = pred_means[pert].copy() - dists = [] - - for t in unique_perts: - if t not in true_means: - continue - true_vec = true_means[t].copy() - - if str(pert) in gene_names: - gene_idx = np.where(gene_names == str(pert))[0] - if len(gene_idx) > 0: - true_vec = np.delete(true_vec, gene_idx) - pred_vec_temp = np.delete(pred_vec, gene_idx) - else: - pred_vec_temp = pred_vec - else: - pred_vec_temp = pred_vec - - dist = cityblock(pred_vec_temp, true_vec) - dists.append((t, dist)) - - dists_sorted = sorted(dists, key=lambda x: x[1]) - true_rank = next((i for i, (t, _) in enumerate(dists_sorted) if t == pert), n_perts-1) - pds = 1 - (true_rank / (n_perts - 1)) if n_perts > 1 else 1.0 - scores[pert] = pds - - mean_level_pds = np.mean(list(scores.values())) if scores else 0.0 - print(f"Mean-level PDS: min={min(scores.values()):.3f}, max={max(scores.values()):.3f}, mean={mean_level_pds:.3f}") - - # Cell-level PDS (more challenging) - cell_level_pds = compute_pds_cell_level(X_true, X_pred, perturbations, gene_names) - - return mean_level_pds, cell_level_pds - - class GRNLayer(torch.nn.Module): def __init__( @@ -223,93 +78,164 @@ def __init__( A_signs: torch.Tensor, signed: bool = True, inverse: bool = True, - alpha: float = 1.0 + alpha: float = 0.1, + stable: bool = False, + bias: bool = True ): torch.nn.Module.__init__(self) self.n_genes: int = A_weights.size(1) self.A_weights: torch.nn.Parameter = A_weights + dtype = A_weights.dtype + device = A_weights.device + if bias: + self.b: torch.nn.Parameter = torch.nn.Parameter(torch.zeros((1, self.n_genes), dtype=dtype, device=device)) + else: + self.b = None self.register_buffer('A_signs', A_signs.to(A_weights.device)) - self.register_buffer('A_mask', (A_signs > 0).to(self.A_weights.dtype).to(A_weights.device)) - self.register_buffer('I', torch.eye(self.n_genes, dtype=A_weights.dtype, device=A_weights.device)) + self.register_buffer('A_mask', (A_signs != 0).to(self.A_weights.dtype).to(A_weights.device)) + self.register_buffer('I', torch.eye(self.n_genes, dtype=dtype, device=device)) self.signed: bool = signed self.inverse: bool = inverse self.alpha: float = alpha + self.stable: bool = stable def forward(self, x: torch.Tensor) -> torch.Tensor: if self.signed: A = torch.abs(self.A_weights) * self.A_signs + assert torch.any(A < 0) else: A = self.A_weights * self.A_mask if self.inverse: - # For inverse transformation, use iterative solve to avoid memory issues - # Solve (I - alpha * A.t()) * y = x for y - ia = self.I - self.alpha * A.t() - - # Add small regularization to diagonal for numerical stability - ia = ia + 1e-6 * self.I - - # Use solve instead of inversion to save memory - try: - # Solve ia * y.t() = x.t() for y.t(), then transpose - result = torch.linalg.solve(ia, x.t()).t() - return result - except torch.linalg.LinAlgError: - # Fallback: simple linear transformation without inversion - print("Warning: Matrix solve failed, using simplified GRN transformation") - return torch.mm(x, A) + if self.stable: + # Approximation using Neumann series + B = GRNLayer.neumann_series(A.t(), self.alpha) + y = torch.mm(x, B) + else: + # For inverse transformation, use iterative solve to avoid memory issues + # Solve (I - alpha * A.t()) * y = x for y + ia = self.I - self.alpha * A.t() + + try: + # Use solve instead of inversion to save memory + y = torch.linalg.solve(ia, x.t()).t() + except torch.linalg.LinAlgError: + # Fallback: approximation using Neumann series + B = GRNLayer.neumann_series(A.t(), self.alpha) + y = torch.mm(x, B) else: # Forward transformation: apply GRN directly - return torch.mm(x, A.t()) + y = torch.mm(x, self.I - self.alpha * A.t()) + + # Add bias term + if self.b is not None: + y = y + self.b + + return y + + @staticmethod + def neumann_series(A: torch.Tensor, alpha: float, k: int = 2) -> torch.Tensor: + """Approximate the inverse of I - A using Neumann series. + + Args: + A: the matrix for which to invert I - A. + k: the number of terms in the series. The higher, the more accurate. + + Returns: + Approximated inverse of I - A. + """ + I = torch.eye(A.shape[0], device=A.device, dtype=A.dtype) + M = alpha * A + term = I.clone() + B = I.clone() + for _ in range(k): + term = term @ M + B = B + term + return B class Model(torch.nn.Module): - def __init__(self, A: np.ndarray, n_perturbations: int, n_hidden: int = 64, signed: bool = True): + + def __init__(self, A: np.ndarray, n_perturbations: int, n_hidden: int = 20, signed: bool = True): + + # n_hidden needs to be small enough to prevent the NN from arbitrarily shifting the learning task + # from the GRN to the MLPs. + torch.nn.Module.__init__(self) self.n_genes: int = A.shape[1] self.n_perturbations: int = n_perturbations self.n_hidden: int = n_hidden - self.perturbation_embedding = torch.nn.Embedding(n_perturbations, n_hidden) + self.signed: bool = signed + + # Perturbation transformations defined in the latent space + #self.perturbation_embedding = torch.nn.Embedding(n_perturbations, n_hidden) + self.perturbation_embedding = torch.nn.Embedding(n_perturbations, n_hidden * n_hidden) + # First layer: GRN-informed transformation of control expression A_signs = torch.from_numpy(np.sign(A).astype(NUMPY_DTYPE)) A_weights = np.copy(A).astype(NUMPY_DTYPE) A_weights /= (np.sqrt(self.n_genes) * float(np.std(A_weights))) A_weights = torch.nn.Parameter(torch.from_numpy(A_weights)) # Ensure A_signs is on the same device as A_weights A_signs = A_signs.to(A_weights.device) - - # First layer: GRN-informed transformation of control expression - self.grn_input_layer = GRNLayer(A_weights, A_signs, inverse=False, signed=signed, alpha=0.1) + self.grn_input_layer = GRNLayer(A_weights, A_signs, inverse=False, signed=signed) - # Middle layers: perturbation processing + # Middle layers: encode/decode between expression profiles and latent space self.encoder = torch.nn.Sequential( + torch.nn.LayerNorm(self.n_genes), + torch.nn.Dropout(p=0.4, inplace=True), torch.nn.PReLU(1), torch.nn.Linear(self.n_genes, self.n_hidden), - torch.nn.PReLU(1), + torch.nn.PReLU(self.n_hidden), torch.nn.Linear(self.n_hidden, self.n_hidden), - torch.nn.PReLU(1), + torch.nn.PReLU(self.n_hidden), + torch.nn.Linear(self.n_hidden, self.n_hidden), + torch.nn.PReLU(self.n_hidden), ) self.decoder = torch.nn.Sequential( + torch.nn.LayerNorm(self.n_hidden), torch.nn.Linear(self.n_hidden, self.n_hidden), - torch.nn.PReLU(1), + torch.nn.PReLU(self.n_hidden), + torch.nn.Linear(self.n_hidden, self.n_hidden), + torch.nn.PReLU(self.n_hidden), torch.nn.Linear(self.n_hidden, self.n_genes), torch.nn.PReLU(1), + torch.nn.Dropout(p=0.4, inplace=True), ) # Last layer: GRN-informed transformation to final expression - self.grn_output_layer = GRNLayer(A_weights, A_signs, inverse=True, signed=signed, alpha=0.1) + self.grn_output_layer = GRNLayer(A_weights, A_signs, inverse=True, signed=signed) def forward(self, x: torch.Tensor, pert: torch.LongTensor) -> torch.Tensor: - # Apply GRN transformation to control expression + + # Encode each expression profile x = self.grn_input_layer(x) y = self.encoder(x) + + # Each perturbation is a linear transformation in the latent space. + # Apply perturbation transform to the encoded profile. z = self.perturbation_embedding(pert) - y = y + z + #y = y + z + z = self.perturbation_embedding(pert).view(len(x), self.n_hidden, self.n_hidden) + y = torch.einsum('ij,ijk->ik', y, z) + + # Decode each expression profile y = self.decoder(y) y = self.grn_output_layer(y) return y + def set_grn(self, A: np.ndarray) -> None: + signed = np.any(A < 0) + A_signs = torch.from_numpy(np.sign(A).astype(NUMPY_DTYPE)) + A_weights = np.copy(A).astype(NUMPY_DTYPE) + A_weights /= (np.sqrt(self.n_genes) * float(np.std(A_weights))) + A_weights = torch.nn.Parameter(torch.from_numpy(A_weights)) + # Ensure A_signs is on the same device as A_weights + A_signs = A_signs.to(A_weights.device) + self.grn_input_layer = GRNLayer(A_weights, A_signs, inverse=False, signed=self.signed) + self.grn_output_layer = GRNLayer(A_weights, A_signs, inverse=True, signed=self.signed) + class PerturbationDataset(Dataset): @@ -337,8 +263,9 @@ def __len__(self) -> int: def __getitem__(self, i: int) -> Tuple[torch.Tensor, int, torch.Tensor]: i = self.idx[i] y = torch.from_numpy(self.X[i, :]) - group = int(self.match_groups[i]) + # Find matched control + group = int(self.match_groups[i]) if group in self.control_map: j = int(self.control_map[group]) else: @@ -347,75 +274,84 @@ def __getitem__(self, i: int) -> Tuple[torch.Tensor, int, torch.Tensor]: x = torch.from_numpy(self.X[j, :]) p = int(self.perturbations[i]) - return x, p, y + d_x = y - x + return x, p, d_x +def coefficients_of_determination(y_target: np.ndarray, y_pred: np.ndarray, eps: float = 1e-20) -> np.ndarray: + residuals = np.square(y_target - y_pred) + ss_res = np.sum(residuals, axis=0) + eps + mean = np.mean(y_target, axis=0)[np.newaxis, :] + residuals = np.square(y_target - mean) + ss_tot = np.sum(residuals, axis=0) + eps + return 1 - ss_res / ss_tot -def evaluate(A, train_data_loader, test_data_loader, n_perturbations: int) -> Tuple[float, float]: - # Training - signed = np.any(A < 0) - model = Model(A, n_perturbations, n_hidden=16, signed=signed) +def evaluate(A, train_data_loader, test_data_loader, state_dict, n_perturbations: int, signed: bool = True) -> np.ndarray: + set_seed() + A = np.copy(A) + model = Model(A, n_perturbations, signed=signed) model = model.to(DEVICE) - optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5) + model.load_state_dict(state_dict, strict=False) + model.set_grn(A) + optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-6) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, mode='min', patience=5, - min_lr=1e-5, cooldown=3, factor=0.8 + min_lr=1e-6, cooldown=3, factor=0.8 ) - pbar = tqdm.tqdm(range(100)) # Reduced epochs for faster testing - best_val_loss = float('inf') - best_ss_res = None - best_epoch = 0 + pbar = tqdm.tqdm(range(300)) + best_avg_r2, best_r2 = -np.inf, None model.train() for epoch in pbar: total_loss = 0 - for x, pert, y in train_data_loader: - x, pert, y = x.to(DEVICE), pert.to(DEVICE), y.to(DEVICE) + y_target, y_pred = [], [] + for x, pert, d_x in train_data_loader: + x, pert, d_x = x.to(DEVICE), pert.to(DEVICE), d_x.to(DEVICE) + + # Reset gradients optimizer.zero_grad() + # Model now predicts full perturbed expression directly - y_hat = model(x, pert) - loss = torch.mean(torch.square(y - y_hat)) + d_x_hat = model(x, pert) + y_target.append(d_x.cpu().data.numpy()) + y_pred.append(d_x_hat.cpu().data.numpy()) + + # Compute mean squared error + loss = torch.mean(torch.square(d_x - d_x_hat)) + total_loss += loss.item() * len(x) + + # Compute gradients (clip them to prevent divergence) loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) + + # Update parameters optimizer.step() - total_loss += loss.item() * len(x) - pbar.set_description(str(total_loss)) + scheduler.step(total_loss) + r2_train = coefficients_of_determination(np.concatenate(y_target, axis=0), np.concatenate(y_pred, axis=0)) model.eval() - ss_res = 0 + y_target, y_pred = [], [] with torch.no_grad(): - for x, pert, y in test_data_loader: - x, pert, y = x.to(DEVICE), pert.to(DEVICE), y.to(DEVICE) - # Model predicts full perturbed expression - y_hat = model(x, pert) - residuals = torch.square(y - y_hat).cpu().data.numpy() - ss_res += np.sum(residuals, axis=0) - if np.sum(ss_res) < best_val_loss: - best_val_loss = np.sum(ss_res) - best_epoch = epoch - best_ss_res = ss_res + for x, pert, d_x in test_data_loader: + x, pert, d_x = x.to(DEVICE), pert.to(DEVICE), d_x.to(DEVICE) + d_x_hat = model(x, pert) + y_target.append(d_x.cpu().data.numpy()) + y_pred.append(d_x_hat.cpu().data.numpy()) + r2_test = coefficients_of_determination(np.concatenate(y_target, axis=0), np.concatenate(y_pred, axis=0)) + avg_r2 = np.mean(r2_test) + if avg_r2 > best_avg_r2: + best_avg_r2 = avg_r2 + best_r2 = r2_test + pbar.set_description(str(np.mean(r2_train)) + " " + str(np.mean(r2_test))) model.train() - ss_res = best_ss_res - - # Final evaluation with PDS model.eval() - ss_tot = 0 - - with torch.no_grad(): - for x, pert, y in test_data_loader: - x, pert, y = x.to(DEVICE), pert.to(DEVICE), y.to(DEVICE) - y_hat = model(x, pert) - - residuals = torch.square(y - torch.mean(y, dim=0).unsqueeze(0)).cpu().data.numpy() - ss_tot += np.sum(residuals, axis=0) - - print(f"Best epoch: {best_epoch} ({best_val_loss})") - return best_ss_res, ss_tot - + return best_r2 def main(par): + # Load evaluation data adata = ad.read_h5ad(par['evaluation_data']) dataset_id = adata.uns['dataset_id'] @@ -443,6 +379,14 @@ def main(par): cv_groups = encode_obs_cols(adata, par['cv_groups']) match_groups = encode_obs_cols(adata, par['match']) loose_match_groups = encode_obs_cols(adata, par['loose_match']) + + # Get cell types + N_FOLDS = 5 + try: + cell_types = np.squeeze(encode_obs_cols(adata, ["cell_type"])) + except Exception: + print(traceback.format_exc()) + cell_types = np.random.randint(0, 5, size=len(X)) # Set perturbations to first column (perturbation) perturbations = cv_groups[0] # perturbation codes @@ -513,25 +457,26 @@ def main(par): # Remove self-regulations np.fill_diagonal(A, 0) - # Create baseline model - A_baseline = np.copy(A) - for j in range(A.shape[1]): - np.random.shuffle(A[:j, j]) - np.random.shuffle(A[j+1:, j]) - assert np.any(A_baseline != A) - # Mapping between gene expression profiles and their matched negative controls - control_map, _ = create_control_matching(are_controls, match_groups) loose_control_map, _ = create_control_matching(are_controls, loose_match_groups) - ss_res = 0 - ss_tot = 0 - cv = GroupKFold(n_splits=5) + r2 = [] + r2_baseline = [] + cv = StratifiedGroupKFold(n_splits=N_FOLDS, shuffle=True, random_state=0xCAFE) - results = [] - - for i, (train_index, test_index) in enumerate(cv.split(X, X, cv_groups)): + for i, (train_index, test_index) in enumerate(cv.split(X, perturbations, cell_types)): + + if (len(train_index) == 0) or (len(test_index) == 0): + continue + + # Create baseline model + A_baseline = np.copy(A) + for j in range(A_baseline.shape[1]): + np.random.shuffle(A_baseline[:j, j]) + np.random.shuffle(A_baseline[j+1:, j]) + assert np.any(A_baseline != A) + # Center and scale dataset scaler = StandardScaler() scaler.fit(X[train_index, :]) @@ -539,45 +484,63 @@ def main(par): # Create data loaders n_perturbations = int(np.max(perturbations) + 1) - train_dataset = PerturbationDataset( - X_standardized, - train_index, - match_groups, - control_map, - loose_match_groups, - loose_control_map, - perturbations - ) - train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=512) - test_dataset = PerturbationDataset( - X_standardized, - test_index, - match_groups, - control_map, - loose_match_groups, - loose_control_map, - perturbations - ) - test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=512) + def create_data_loaders(): + train_dataset = PerturbationDataset( + X_standardized, + train_index, + match_groups, + control_map, + loose_match_groups, + loose_control_map, + perturbations + ) + train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=512, shuffle=True) + test_dataset = PerturbationDataset( + X_standardized, + test_index, + match_groups, + control_map, + loose_match_groups, + loose_control_map, + perturbations + ) + test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=512) + return train_data_loader, test_data_loader + + # For fair comparison, we first randomly initialize NN parameters, and use these same + # parameters to build both models (only the GRN weights will differ). + signed = np.any(A < 0) + model_template = Model(A, n_perturbations, signed=signed).to(DEVICE) + state_dict = model_template.state_dict() # Evaluate inferred GRN - res = evaluate(A, train_data_loader, test_data_loader, n_perturbations) - ss_res = ss_res + res[0] - ss_tot = ss_tot + res[1] + train_data_loader, test_data_loader = create_data_loaders() + r2.append(evaluate(A, train_data_loader, test_data_loader, state_dict, n_perturbations, signed=signed)) # Evaluate baseline GRN (shuffled target genes) - #ss_tot = ss_tot + evaluate(A_baseline, train_data_loader, test_data_loader, n_perturbations) + train_data_loader, test_data_loader = create_data_loaders() + r2_baseline.append(evaluate(A_baseline, train_data_loader, test_data_loader, state_dict, n_perturbations, signed=signed)) + + print(f"Mean R2 (fold {i + 1})", np.mean(r2), np.mean(r2_baseline)) + + r2 = np.asarray(r2).flatten() + r2_baseline = np.asarray(r2_baseline).flatten() - r2 = 1 - ss_res / ss_tot + print("Mean R2", np.mean(r2), np.mean(r2_baseline)) + + if np.all(r2 == r2_baseline): + final_score = 0 + else: + print(np.mean(r2), np.mean(r2_baseline)) + p_value = wilcoxon(r2, r2_baseline, alternative="greater").pvalue + final_score = -np.log10(p_value) - final_score = np.mean(np.clip(r2, 0, 1)) print(f"Method: {method_id}") - print(f"R2: {final_score}") + print(f"Final score: {final_score}") results = { 'vc': [float(final_score)] - } df_results = pd.DataFrame(results) - return df_results \ No newline at end of file + return df_results diff --git a/src/metrics/vc/improved_helper.py b/src/metrics/vc/improved_helper.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/metrics/vc/new_helper.py b/src/metrics/vc/new_helper.py new file mode 100644 index 000000000..c3eccbaa9 --- /dev/null +++ b/src/metrics/vc/new_helper.py @@ -0,0 +1,299 @@ +import os +import traceback +from typing import Tuple, Dict +import sys +import tqdm +import random +import h5py +import numpy as np +import pandas as pd +import torch +import xgboost +from scipy.sparse import csr_matrix +from scipy.stats import wilcoxon +from sklearn.decomposition import PCA +from sklearn.linear_model import Ridge +from sklearn.ensemble import RandomForestRegressor +from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold +from sklearn.preprocessing import StandardScaler, LabelEncoder, TargetEncoder +from torch.utils.data import Dataset +import anndata as ad +import warnings +warnings.filterwarnings("ignore", category=UserWarning) + +os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' # For reproducibility purposes (on GPU) + +# Hyper-parameters +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' +NUMPY_DTYPE = np.float32 + +# For reproducibility purposes +def set_seed(): + seed = 0xCAFE + os.environ['PYTHONHASHSEED'] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.use_deterministic_algorithms(True) + +from util import read_prediction, manage_layer +from dataset_config import DATASET_GROUPS +from scipy.spatial.distance import cityblock + + +def combine_multi_index(*arrays) -> np.array: + """Combine parallel label arrays into a single integer label per position.""" + A = np.stack(arrays) + n_classes = tuple(int(A[i].max()) + 1 for i in range(A.shape[0])) + return np.ravel_multi_index(A, dims=n_classes, order='C') + + +def encode_obs_cols(adata, cols): + encoded = [] + for col in cols: + if col in adata.obs: + codes = LabelEncoder().fit_transform(adata.obs[col].values) + encoded.append(codes) + return encoded + + +def create_control_matching(are_controls: np.ndarray, match_groups: np.ndarray) -> Tuple[Dict[int, int], np.ndarray]: + control_indices = np.where(are_controls)[0] + + if len(control_indices) == 0: + raise ValueError("No control samples found in dataset!") + + # First, try to create exact matching (original approach) + control_map = {} + for i, (is_control, group_id) in enumerate(zip(are_controls, match_groups)): + if is_control: + control_map[int(group_id)] = i + + return control_map, match_groups + + +class Model(object): + + def __init__(self, A: np.ndarray): + self.n_genes: int = A.shape[1] + self.A: np.ndarray = A + self._te = [TargetEncoder() for _ in range(self.n_genes)] + self._models = [xgboost.XGBRegressor(n_estimators=100) for _ in range(self.n_genes)] + #self._models = [RandomForestRegressor() for _ in range(self.n_genes)] + + def fit(self, X_train: np.ndarray, Y_train: np.ndarray, p_train: np.ndarray) -> None: + Z_train = [] + for j in range(self.n_genes): + self._te[j].fit(p_train[:, np.newaxis], Y_train[:, j]) + Z_train.append(np.squeeze(self._te[j].transform(p_train[:, np.newaxis]))) + Z_train = np.asarray(Z_train).T + for j in tqdm.tqdm(range(self.n_genes), desc="Fit"): + mask = (self.A[:, j] != 0) + mask[j] = True + if np.any(mask): + self._models[j].fit(np.concatenate((X_train[:, mask], Z_train[:, mask]), axis=1), Y_train[:, j]) + + def predict(self, X_test: np.ndarray, p_test: np.ndarray) -> np.ndarray: + Z_test = [] + for j in range(self.n_genes): + Z_test.append(np.squeeze(self._te[j].transform(p_test[:, np.newaxis]))) + Z_test = np.asarray(Z_test).T + Y_hat = [] + for j in tqdm.tqdm(range(self.n_genes), desc="Predict"): + mask = (self.A[:, j] != 0) + mask[j] = True + if np.any(mask): + Y_hat.append(self._models[j].predict(np.concatenate((X_test[:, mask], Z_test[:, mask]), axis=1))) + else: + Y_hat.append(np.zeros(len(X_test))) + return np.asarray(Y_hat).T + + +def main(par): + + # Load evaluation data + adata = ad.read_h5ad(par['evaluation_data']) + dataset_id = adata.uns['dataset_id'] + method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id'] + + # Configure dataset-specific groups + par['cv_groups'] = DATASET_GROUPS[dataset_id]['cv'] + par['match'] = DATASET_GROUPS[dataset_id]['match'] + par['loose_match'] = DATASET_GROUPS[dataset_id]['loose_match'] + + layer = manage_layer(adata, par) + X = adata.layers[layer] + if isinstance(X, csr_matrix): + X = X.toarray() + X = X.astype(np.float32) + + gene_names = adata.var_names + gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)} + + # Get sample info + perturbations = None + cv_groups, match_groups, loose_match_groups = [], [], [] + + # Encode observation columns + cv_groups = encode_obs_cols(adata, par['cv_groups']) + match_groups = encode_obs_cols(adata, par['match']) + loose_match_groups = encode_obs_cols(adata, par['loose_match']) + + # Get cell types + N_FOLDS = 5 + try: + cell_types = np.squeeze(encode_obs_cols(adata, ["cell_type"])) + except Exception: + print(traceback.format_exc()) + cell_types = np.random.randint(0, 5, size=len(X)) + + # Set perturbations to first column (perturbation) + perturbations = cv_groups[0] # perturbation codes + + # Groups used for cross-validation + cv_groups = combine_multi_index(*cv_groups) + + # Groups used for matching with negative controls + match_groups = combine_multi_index(*match_groups) + + # Groups used for loose matching with negative controls + loose_match_groups = combine_multi_index(*loose_match_groups) + + # Check for control samples - this is required for perturbation evaluation + if "is_control" not in adata.obs.columns: + raise ValueError("Dataset must contain 'is_control' column for perturbation evaluation") + + are_controls = adata.obs["is_control"].values.astype(bool) + + # Check if we have any control samples + n_controls = np.sum(are_controls) + n_perturbed = np.sum(~are_controls) + + if n_controls == 0: + raise ValueError(f"No control samples found in dataset! Found {n_perturbed} perturbed samples but 0 controls. " + "Perturbation evaluation requires control samples for comparison.") + + print(f"Found {n_controls} control samples and {n_perturbed} perturbed samples") + + # Load inferred GRN + net = read_prediction(par) + sources = net["source"].to_numpy() + targets = net["target"].to_numpy() + weights = net["weight"].to_numpy() + + A = np.zeros((len(gene_names), len(gene_names)), dtype=X.dtype) + for source, target, weight in zip(sources, targets, weights): + if (source in gene_dict) and (target in gene_dict): + i = gene_dict[source] + j = gene_dict[target] + A[i, j] = float(weight) + + # Only consider the genes that are actually present in the inferred GRN + gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0)) + X = X[:, gene_mask] + A = A[gene_mask, :][:, gene_mask] + gene_names = gene_names[gene_mask] + + # Filter genes based on GRN instead of HVGs + # Keep all genes that are present in the GRN (already filtered above) + print(f"Using {len(gene_names)} genes present in the GRN") + + # Additional memory-aware gene filtering for very large GRNs + MAX_GENES_FOR_MEMORY = 3000 # Reduced further to avoid memory issues + if len(gene_names) > MAX_GENES_FOR_MEMORY: + print(f"Too many genes ({len(gene_names)}) for memory. Selecting top {MAX_GENES_FOR_MEMORY} by GRN connectivity.") + + # Select genes with highest connectivity in the GRN + gene_connectivity = np.sum(np.abs(A), axis=0) + np.sum(np.abs(A), axis=1) + top_gene_indices = np.argsort(gene_connectivity)[-MAX_GENES_FOR_MEMORY:] + + X = X[:, top_gene_indices] + A = A[top_gene_indices, :][:, top_gene_indices] + gene_names = gene_names[top_gene_indices] + + print(f"Final: Using {len(gene_names)} most connected genes for evaluation") + + # Remove self-regulations + np.fill_diagonal(A, 0) + + # Mapping between gene expression profiles and their matched negative controls + control_map, _ = create_control_matching(are_controls, match_groups) + loose_control_map, _ = create_control_matching(are_controls, loose_match_groups) + + # Build dataset + X_, Y_ = [], [] + for i, (match_group, loose_match_group) in enumerate(zip(match_groups, loose_match_groups)): + if match_group in control_map: + i_prime = control_map[match_group] + else: + i_prime = loose_control_map[loose_match_group] + X_.append(X[i_prime, :]) + Y_.append(X[i, :]) + X, Y = np.asarray(X_), np.asarray(Y_) + Y = Y - X + del X_, Y_ + + eps = 1e-20 + r2, r2_baseline = [], [] + cv = StratifiedGroupKFold(n_splits=N_FOLDS, shuffle=True, random_state=0xCAFE) + for i, (train_index, test_index) in enumerate(cv.split(X, perturbations, cell_types)): + + if (len(train_index) == 0) or (len(test_index) == 0): + continue + + # Create baseline model + A_baseline = np.copy(A) + for j in range(A_baseline.shape[1]): + np.random.shuffle(A_baseline[:j, j]) + np.random.shuffle(A_baseline[j+1:, j]) + assert np.any(A_baseline != A) + + # Center and scale dataset + scaler = StandardScaler() + X_train = scaler.fit_transform(X[train_index, :]) + X_test = scaler.transform(X[test_index, :]) + scaler = StandardScaler() + Y_train = scaler.fit_transform(Y[train_index, :]) + Y_test = scaler.transform(Y[test_index, :]) + p_train = perturbations[train_index] + p_test = perturbations[test_index] + + model = Model(A) + model.fit(X_train, Y_train, p_train) + Y_hat = model.predict(X_test, p_test) + ss_res = np.sum(np.square(Y_test - Y_hat)) + eps + ss_tot = np.sum(np.square(Y_test - np.mean(Y_test, axis=0)[np.newaxis, :])) + eps + r2.append(1 - ss_res / ss_tot) + + model = Model(A_baseline) + model.fit(X_train, Y_train, p_train) + Y_hat = model.predict(X_test, p_test) + ss_res = np.sum(np.square(Y_test - Y_hat)) + eps + ss_tot = np.sum(np.square(Y_test - np.mean(Y_test, axis=0)[np.newaxis, :])) + eps + r2_baseline.append(1 - ss_res / ss_tot) + + print("r2", np.mean(r2), np.mean(r2_baseline)) + + r2 = np.asarray(r2).flatten() + r2_baseline = np.asarray(r2_baseline).flatten() + + print("R2", np.mean(r2), np.mean(r2_baseline)) + + if np.all(r2 == r2_baseline): + final_score = 0 + else: + print(np.mean(r2), np.mean(r2_baseline)) + p_value = wilcoxon(r2, r2_baseline, alternative="greater").pvalue + final_score = -np.log10(p_value) + + print(f"Method: {method_id}") + print(f"R2: {final_score}") + + results = { + 'vc': [float(final_score)] + } + + df_results = pd.DataFrame(results) + return df_results diff --git a/src/metrics/vc/ultra_fast_helper.py b/src/metrics/vc/ultra_fast_helper.py deleted file mode 100644 index e69de29bb..000000000 From 347a4e13e05182d9be3325a4216ceff28e80f778 Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Sun, 5 Oct 2025 14:45:01 +0200 Subject: [PATCH 03/19] Fix recovery_2 metric --- src/metrics/recovery_2/helper.py | 369 ++++++++++++++++++------------- src/metrics/vc/helper.py | 28 +-- 2 files changed, 226 insertions(+), 171 deletions(-) diff --git a/src/metrics/recovery_2/helper.py b/src/metrics/recovery_2/helper.py index d28e53fe9..141c5081c 100644 --- a/src/metrics/recovery_2/helper.py +++ b/src/metrics/recovery_2/helper.py @@ -1,182 +1,237 @@ +from typing import Tuple, List, Optional + import scipy import numpy as np +from scipy.sparse import csr_matrix +from scipy.stats import spearmanr, pearsonr, wilcoxon +from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder import pandas as pd import anndata as ad import warnings -import scipy warnings.filterwarnings("ignore") from util import format_save_score, read_prediction -from scipy.stats import spearmanr np.random.seed(0) from util import manage_layer -def actual_calculation(adata, target_genes, group): - adata = adata[adata.obs['is_control'] == False] # only use non-control cells - target_genes = [g for g in target_genes if g in adata.var_names] - target_adata = adata[:, target_genes] - target_expr = pd.DataFrame( - target_adata.X.toarray() if scipy.sparse.issparse(target_adata.X) else target_adata.X, - index=target_adata.obs.index, - columns=target_adata.var_names - ) - for g in group: - target_expr[g] = target_adata.obs[g].values - group_consistencies = {} - for g_vals, df_group in target_expr.groupby(group): - expr_data = df_group.drop(columns=group) - if len(expr_data) < 2: - continue # skip if only one sample - - rows = expr_data.values - corr_matrix = np.corrcoef(rows) - correlation = corr_matrix[np.triu_indices_from(corr_matrix, k=1)] - correlation = correlation[~np.isnan(correlation)] - correlation = np.mean(correlation) - assert isinstance(correlation, float) - group_consistencies[g_vals] = correlation - return group_consistencies - -def calculate_target_gene_consistency(adata, net, group, min_targets_t=10, max_targets_t=20): - """ - Calculate consistency score for each TF based on target gene expression across groups. - Returns a dictionary with TF as key and consistency score as value. - """ - from scipy.stats import ttest_rel - - adata = adata[adata.obs['is_control'] == False] - all_genes = adata.var_names.tolist() - tf_size = net.groupby('source').size() - tfs = tf_size[(tf_size > min_targets_t)].index - net = net[net['source'].isin(tfs)] - - all_values = [] - all_random_values = [] - - for tf in tfs: - target_genes = net[net['source'] == tf] - target_genes = target_genes.sort_values(by='weight', ascending=False, key=abs).head(max_targets_t) - target_genes = target_genes['target'].tolist() - - actual_consistencies = actual_calculation(adata, target_genes, group) - - n_targets = len(target_genes) - random_targets = np.random.choice(all_genes, size=n_targets, replace=False) - random_consistencies = actual_calculation(adata, random_targets, group) - - # Get consistency values for groups that exist in both actual and random - common_groups = actual_consistencies.keys() - if len(common_groups) < 2: - continue # Need at least 2 groups for paired t-test - - actual_values = [actual_consistencies[g] for g in common_groups] - random_values = [random_consistencies[g] for g in common_groups] - assert len(actual_values) == len(random_values) - all_values.append(np.mean(actual_values)) - all_random_values.append(np.mean(random_values)) - - return all_values, all_random_values - -def evaluate_setting(evaluation_adata, net, group, setting_name, n_tfs=None, max_targets_per_tf=None): + + +def encode_obs_cols(adata, cols): + """Encode observation columns to integer codes.""" + encoded = [] + for col in cols: + if col in adata.obs: + codes = LabelEncoder().fit_transform(adata.obs[col].values) + encoded.append(codes) + return encoded + + +def combine_multi_index(*arrays) -> np.array: + """Combine parallel label arrays into a single integer label per position.""" + A = np.stack(arrays) + n_classes = tuple(int(A[i].max()) + 1 for i in range(A.shape[0])) + return np.ravel_multi_index(A, dims=n_classes, order='C') + + +def evaluate_grn( + C: np.ndarray, + A: np.ndarray, + n_tfs: Optional[int] = None, + max_targets_per_tf: Optional[int] = None +) -> float: """ Evaluate a specific setting (recall, balanced, or precision) """ - from scipy.stats import wilcoxon + + # Compute the min and max correlation coefficients across groups + C_min = np.min(C, axis=0) + C_max = np.max(C, axis=0) - # Filter TFs if specified - if n_tfs is not None: - tf_counts = net.groupby('source').size().sort_values(ascending=False) - top_tfs = tf_counts.head(n_tfs).index - filtered_net = net[net['source'].isin(top_tfs)] + # Filter TFs if specified (keep the TFs with the most target genes) + if n_tfs is None: + tf_idx = np.arange(A.shape[1]) else: - filtered_net = net.copy() - - # Set max_targets parameter - if max_targets_per_tf is None: - max_targets_per_tf = 50 # default value - - actual_values, random_values = calculate_target_gene_consistency( - evaluation_adata, filtered_net, group, max_targets_t=max_targets_per_tf - ) - - # Convert to numpy arrays for arithmetic operations - actual_values = np.array(actual_values) - random_values = np.array(random_values) - - # Statistical test - res = wilcoxon(actual_values - random_values, zero_method='wilcox', alternative='greater') - p_value = res.pvalue - if p_value == 0: - p_value = 1e-300 - final_score = -np.log10(p_value) - - result = { - 'setting': setting_name, - 'value_min': np.min(actual_values), - 'value_random_min': np.min(random_values), - 'value_median': np.median(actual_values), - 'value_random_median': np.median(random_values), - 'value_max': np.max(actual_values), - 'value_random_max': np.max(random_values), - 'value_mean': np.mean(actual_values), - 'value_random_mean': np.mean(random_values), - 'value_std': np.std(actual_values), - 'value_random_std': np.std(random_values), - 'n_values': len(actual_values), - 'p_value': p_value, - 'final_score': final_score, - 'mean_difference': np.mean(actual_values - random_values) - } - - print(f"{setting_name}: {result}") - return result + tf_idx = np.argsort(np.sum(A != 0, axis=1))[-n_tfs:] + tf_mask = np.zeros(A.shape[1], dtype=bool) + tf_mask[tf_idx] = True + + scores = [] + for j in range(A.shape[1]): + + if tf_mask[j]: + + # Filter TGs if specified (keep the TGs with highest regulatory links) + if max_targets_per_tf is None: + tg_idx = np.where(A[j, :] != 0)[0] + else: + tg_idx = np.argsort(np.abs(A[j, :]))[-max_targets_per_tf:] + tg_idx = tg_idx[A[j, tg_idx] != 0] + + # Compute scores + if len(tg_idx) == 0: + scores.append(np.nan) + else: + # Intuition behind the scoring scheme: + # our score is a weighted average of consistency scores, + # where the weights are proportional to regulatory weight magnitudes, + # and consistency scores are penalized by the max-min correlation spread. + # The more consistent correlation coefficients across groups, the higher + # the consistency scores. + weights = np.abs(A[j, tg_idx]) + weights /= np.sum(weights) + consistency_scores = 2 - (C_max[j, tg_idx] - C_min[j, tg_idx]) + tf_score = float(np.sum(weights * consistency_scores)) + scores.append(tf_score) + + else: + scores.append(np.nan) + + return np.array(scores) + + +def evaluate_setting(C: np.ndarray, A: np.ndarray, setting_name: str, **kwargs) -> float: + + # Evaluate inferred GRN + scores = evaluate_grn(C, A, **kwargs) + + # Create baseline GRN + A_baseline = np.copy(A) + for j in range(A.shape[1]): + np.random.shuffle(A_baseline[:j, j]) + np.random.shuffle(A_baseline[j+1:, j]) + assert np.any(A_baseline != A) + + # Evaluate baseline GRN + scores_baseline = evaluate_grn(C, A_baseline, **kwargs) + + # Skip NaNs + mask = ~np.logical_or(np.isnan(scores), np.isnan(scores_baseline)) + scores = scores[mask] + scores_baseline = scores_baseline[mask] + + # Compare results + if len(scores) == 0: + print(f"Mean corr. coef. for setting '{setting_name}': grn=nan, baseline=nan") + return 0.0 + else: + print(f"Mean corr. coef. for setting '{setting_name}': grn={np.mean(scores):.3f}, baseline={np.mean(scores_baseline):.3f}") + try: + p_value = wilcoxon(scores, scores_baseline, alternative="greater", zero_method="pratt").pvalue + p_value = np.clip(p_value, 1e-300, 1) + print(f"p-value={p_value}") + #return -np.log10(p_value) + return np.mean(np.clip(scores - scores_baseline, 0, None)) + except ValueError: + return 0.0 + def main(par): - evaluation_adata = ad.read_h5ad(par['evaluation_data']) - layer = manage_layer(evaluation_adata, par) - evaluation_adata.X = evaluation_adata.layers[layer] - net = read_prediction(par) # [source target weight] + # Load evaluation data + adata = ad.read_h5ad(par['evaluation_data']) + dataset_id = adata.uns['dataset_id'] + method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id'] - group = par['group'] - for g in group: - assert g in evaluation_adata.obs.columns + # Manage layer + layer = manage_layer(adata, par) + X = adata.layers[layer] + if isinstance(X, csr_matrix): + X = X.toarray() + X = X.astype(np.float32) - # Evaluate three settings as specified in TODO - settings_results = [] - - # 1. Recall setting: all TFs, all target genes (no filtering) - recall_result = evaluate_setting( - evaluation_adata, net, group, - setting_name="recall", - n_tfs=None, # Use all TFs - max_targets_per_tf=None # Use all target genes (up to default limit) - ) - settings_results.append(recall_result) - - # 2. Balanced setting: top 100 TFs, top 100 target genes - balanced_result = evaluate_setting( - evaluation_adata, net, group, - setting_name="balanced", - n_tfs=100, - max_targets_per_tf=100 - ) - settings_results.append(balanced_result) - - # 3. Precision setting: top 20 TFs, top 20 target genes - precision_result = evaluate_setting( - evaluation_adata, net, group, - setting_name="precision", - n_tfs=20, - max_targets_per_tf=20 - ) - settings_results.append(precision_result) + gene_names = adata.var_names + gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)} + + # Get groups + group_variables, group_names = [], [] + if ("donor_id" in adata.obs) or ("plate_name" in adata.obs): + variables = encode_obs_cols(adata, ["donor_id", "plate_name"]) + group_variables.append(combine_multi_index(*variables)) + group_names.append("donor_id/plate_name") + if (len(group_variables) == 0) and ("cell_type" in adata.obs): + variables = encode_obs_cols(adata, ["cell_type"]) + group_variables.append(combine_multi_index(*variables)) + group_names.append("cell_type") + if len(group_variables) == 0: + variables = encode_obs_cols(adata, ["perturbation"]) + group_variables.append(combine_multi_index(*variables)) + group_names.append("perturbation") + + # Load inferred GRN + df = read_prediction(par) + sources = df["source"].to_numpy() + targets = df["target"].to_numpy() + weights = df["weight"].to_numpy() + A = np.zeros((len(gene_names), len(gene_names)), dtype=X.dtype) + for source, target, weight in zip(sources, targets, weights): + if (source in gene_dict) and (target in gene_dict): + i = gene_dict[source] + j = gene_dict[target] + A[i, j] = float(weight) + + # Only consider the genes that are actually present in the inferred GRN, + # and keep only the most-connected genes (for speed). + gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0)) + X = X[:, gene_mask] + X = X.toarray() if isinstance(X, csr_matrix) else X + A = A[gene_mask, :][:, gene_mask] + gene_names = gene_names[gene_mask] + + # Remove self-regulations + np.fill_diagonal(A, 0) + print(f"Evaluating {X.shape[1]} genes with {np.sum(A != 0)} regulatory links") + + recall_results = [] + balanced_results = [] + precision_results = [] + for group_name, groups in zip(group_names, group_variables): + print("\n" + "=" * 30) + print(f"Grouping samples by {group_name}") + + # Compute correlation matrix for each group + S, C = [], [] + for group in np.unique(groups): + mask = (groups == group) + if np.sum(mask) >= 3: + assert not np.any(np.isnan(X[mask, :])) + res = spearmanr(X[mask, :]) + S.append(res.statistic) + C.append(res.pvalue) + S = np.nan_to_num(S, nan=0) + C = np.nan_to_num(C, nan=1) + #C = -np.sign(S) * np.log10(C) + #C = np.clip(C, -4, 4) + C = S + + # 1. Recall setting: all TFs, all target genes (no filtering) + recall_results.append(evaluate_setting( + C, A, "recall", + n_tfs=None, # Use all TFs + max_targets_per_tf=None # Use all target genes (up to default limit) + )) + + # 2. Balanced setting: top 100 TFs, top 100 target genes + balanced_results.append(evaluate_setting( + C, A, "balanced", + n_tfs=100, + max_targets_per_tf=100 + )) + + # 3. Precision setting: top 40 TFs, top 40 target genes + precision_results.append(evaluate_setting( + C, A, "precision", + n_tfs=40, + max_targets_per_tf=40 + )) + + recovery_recall = np.mean(recall_results) + recovery_balanced = np.mean(balanced_results) + recovery_precision = np.mean(precision_results) # Create results DataFrame with all three scores results = pd.DataFrame({ - 'recovery_recall': [recall_result['final_score']], - 'recovery_balanced': [balanced_result['final_score']], - 'recovery_precision': [precision_result['final_score']], + 'recovery_recall': [recovery_recall], + 'recovery_balanced': [recovery_balanced], + 'recovery_precision': [recovery_precision] }) + print(results) return results - - diff --git a/src/metrics/vc/helper.py b/src/metrics/vc/helper.py index 3735f420d..bf224c46e 100644 --- a/src/metrics/vc/helper.py +++ b/src/metrics/vc/helper.py @@ -78,8 +78,8 @@ def __init__( A_signs: torch.Tensor, signed: bool = True, inverse: bool = True, - alpha: float = 0.1, - stable: bool = False, + alpha: float = 0.2, + stable: bool = True, bias: bool = True ): torch.nn.Module.__init__(self) @@ -156,7 +156,7 @@ def neumann_series(A: torch.Tensor, alpha: float, k: int = 2) -> torch.Tensor: class Model(torch.nn.Module): - def __init__(self, A: np.ndarray, n_perturbations: int, n_hidden: int = 20, signed: bool = True): + def __init__(self, A: np.ndarray, n_perturbations: int, n_hidden: int = 16, signed: bool = True): # n_hidden needs to be small enough to prevent the NN from arbitrarily shifting the learning task # from the GRN to the MLPs. @@ -215,10 +215,10 @@ def forward(self, x: torch.Tensor, pert: torch.LongTensor) -> torch.Tensor: # Each perturbation is a linear transformation in the latent space. # Apply perturbation transform to the encoded profile. - z = self.perturbation_embedding(pert) - #y = y + z z = self.perturbation_embedding(pert).view(len(x), self.n_hidden, self.n_hidden) y = torch.einsum('ij,ijk->ik', y, z) + #z = self.perturbation_embedding(pert) + #y = y + z # Decode each expression profile y = self.decoder(y) @@ -391,8 +391,8 @@ def main(par): # Set perturbations to first column (perturbation) perturbations = cv_groups[0] # perturbation codes - # Groups used for cross-validation - cv_groups = combine_multi_index(*cv_groups) + # Validation strategy: evaluate on unseen (perturbation, cell type) pairs. + cv_groups = combine_multi_index(cell_types, perturbations) # Groups used for matching with negative controls match_groups = combine_multi_index(*match_groups) @@ -454,8 +454,8 @@ def main(par): print(f"Final: Using {len(gene_names)} most connected genes for evaluation") - # Remove self-regulations - np.fill_diagonal(A, 0) + # Add self-regulations + np.fill_diagonal(A, 1) # Mapping between gene expression profiles and their matched negative controls control_map, _ = create_control_matching(are_controls, match_groups) @@ -465,7 +465,7 @@ def main(par): r2_baseline = [] cv = StratifiedGroupKFold(n_splits=N_FOLDS, shuffle=True, random_state=0xCAFE) - for i, (train_index, test_index) in enumerate(cv.split(X, perturbations, cell_types)): + for i, (train_index, test_index) in enumerate(cv.split(X, perturbations, cv_groups)): if (len(train_index) == 0) or (len(test_index) == 0): continue @@ -521,17 +521,14 @@ def create_data_loaders(): train_data_loader, test_data_loader = create_data_loaders() r2_baseline.append(evaluate(A_baseline, train_data_loader, test_data_loader, state_dict, n_perturbations, signed=signed)) - print(f"Mean R2 (fold {i + 1})", np.mean(r2), np.mean(r2_baseline)) + break r2 = np.asarray(r2).flatten() r2_baseline = np.asarray(r2_baseline).flatten() - print("Mean R2", np.mean(r2), np.mean(r2_baseline)) - if np.all(r2 == r2_baseline): final_score = 0 else: - print(np.mean(r2), np.mean(r2_baseline)) p_value = wilcoxon(r2, r2_baseline, alternative="greater").pvalue final_score = -np.log10(p_value) @@ -539,6 +536,9 @@ def create_data_loaders(): print(f"Final score: {final_score}") results = { + 'r2': [float(np.mean(r2))], + 'r2_baseline': [float(np.mean(r2_baseline))], + 'r2_diff': [float(np.mean(r2)) - float(np.mean(r2_baseline))], 'vc': [float(final_score)] } From b63293977633ead2db40cb6a300092452d487075 Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Sun, 5 Oct 2025 19:43:46 +0200 Subject: [PATCH 04/19] recovery_2 metric: Add TG-TG correlations back --- src/metrics/recovery_2/helper.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/metrics/recovery_2/helper.py b/src/metrics/recovery_2/helper.py index 141c5081c..2f42c2e0a 100644 --- a/src/metrics/recovery_2/helper.py +++ b/src/metrics/recovery_2/helper.py @@ -36,7 +36,9 @@ def evaluate_grn( C: np.ndarray, A: np.ndarray, n_tfs: Optional[int] = None, - max_targets_per_tf: Optional[int] = None + max_targets_per_tf: Optional[int] = None, + tf_tg: bool = True, + tg_tg: bool = True ) -> float: """ Evaluate a specific setting (recall, balanced, or precision) @@ -45,6 +47,7 @@ def evaluate_grn( # Compute the min and max correlation coefficients across groups C_min = np.min(C, axis=0) C_max = np.max(C, axis=0) + consistency_scores = 2 - (C_max - C_min) # Filter TFs if specified (keep the TFs with the most target genes) if n_tfs is None: @@ -78,9 +81,12 @@ def evaluate_grn( # the consistency scores. weights = np.abs(A[j, tg_idx]) weights /= np.sum(weights) - consistency_scores = 2 - (C_max[j, tg_idx] - C_min[j, tg_idx]) - tf_score = float(np.sum(weights * consistency_scores)) - scores.append(tf_score) + overall_score = 0.0 + if tf_tg: + overall_score += float(np.sum(weights * consistency_scores[j, tg_idx])) + if tg_tg: + overall_score += float(np.sum(weights * np.mean(consistency_scores[tg_idx, :][:, tg_idx], axis=0))) + scores.append(overall_score) else: scores.append(np.nan) @@ -204,6 +210,7 @@ def main(par): # 1. Recall setting: all TFs, all target genes (no filtering) recall_results.append(evaluate_setting( C, A, "recall", + tf_tg=True, tg_tg=True, n_tfs=None, # Use all TFs max_targets_per_tf=None # Use all target genes (up to default limit) )) @@ -211,6 +218,7 @@ def main(par): # 2. Balanced setting: top 100 TFs, top 100 target genes balanced_results.append(evaluate_setting( C, A, "balanced", + tf_tg=True, tg_tg=True, n_tfs=100, max_targets_per_tf=100 )) @@ -218,6 +226,7 @@ def main(par): # 3. Precision setting: top 40 TFs, top 40 target genes precision_results.append(evaluate_setting( C, A, "precision", + tf_tg=True, tg_tg=True, n_tfs=40, max_targets_per_tf=40 )) From 27c6fc390169e48721fe26f317aa84a4bd7bdf39 Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Thu, 9 Oct 2025 09:19:46 +0200 Subject: [PATCH 05/19] Fix recovery_2, regression_3 and sem metrics --- .../experimental/anchor_regression/helper.py | 102 ++++-- src/metrics/experimental/recovery_2/helper.py | 153 +++++---- .../experimental/recovery_2/old_helper.py | 292 ++++++++++++++++++ .../experimental/regression_3/helper.py | 76 ++--- .../experimental/regression_3/script.py | 14 +- src/metrics/experimental/sem/helper.py | 36 ++- src/metrics/experimental/vc/helper.py | 27 +- 7 files changed, 548 insertions(+), 152 deletions(-) create mode 100644 src/metrics/experimental/recovery_2/old_helper.py diff --git a/src/metrics/experimental/anchor_regression/helper.py b/src/metrics/experimental/anchor_regression/helper.py index 9c29eb042..1c60dc904 100644 --- a/src/metrics/experimental/anchor_regression/helper.py +++ b/src/metrics/experimental/anchor_regression/helper.py @@ -75,8 +75,8 @@ def anchor_regression( X: np.ndarray, Z: np.ndarray, Y: np.ndarray, - l2_reg: float = 1e-2, - anchor_strength: float = 1.0 + l2_reg: float = 1e-4, + gamma: float = 1.0 ) -> np.ndarray: """Anchor regression for causal inference under confounding. @@ -86,15 +86,15 @@ def anchor_regression( these environmental variables. Shape (n, z). Y: predicted variables, of shape (n, u). l2_reg: L2 regularization strength. - anchor_strength: Strength of anchor regularization. - 0 = standard regression, higher = more anchor regularization. + gamma: Strength of anchor regularization. + 1 = standard regression, higher = more anchor regularization. Returns: Inferred parameters, of shape (d, u). """ # Whitening transformation - W = get_whitening_transform(Z, gamma=anchor_strength) + W = get_whitening_transform(Z, gamma=gamma) X_t = W @ X Y_t = W @ Y @@ -110,25 +110,50 @@ def compute_stabilities( X: np.ndarray, y: np.ndarray, Z: np.ndarray, - A: np.ndarray, + weights: np.ndarray, is_selected: np.ndarray, eps: float = 1e-50 ) -> float: - theta0_signed = anchor_regression(X, Z, y, anchor_strength=1) + theta0_signed = anchor_regression(X, Z, y, gamma=1) theta0_signed = theta0_signed[is_selected] - theta0 = np.abs(theta0_signed) - theta0 /= np.sum(theta0) + #theta0 = np.abs(theta0_signed) + theta0 = theta0_signed / np.sum(np.abs(theta0_signed)) - theta_signed = anchor_regression(X, Z, y, anchor_strength=20) + theta_signed = anchor_regression(X, Z, y, gamma=1.65) theta_signed = theta_signed[is_selected] - theta = np.abs(theta_signed) - theta /= np.sum(theta) + #theta = np.abs(theta_signed) + theta = theta_signed / np.sum(np.abs(theta_signed)) + + stabilities = np.clip(np.abs(theta0 - theta) / np.abs(theta0 + eps), 0, 1) + #stabilities[np.sign(theta0_signed) != np.sign(theta_signed)] = 0 + + weights = np.abs(weights[is_selected]) + weights /= np.sum(weights) + + return float(np.sum(weights * stabilities)) + + +def compute_stabilities_v2( + X: np.ndarray, + y: np.ndarray, + Z: np.ndarray, + A: np.ndarray, + is_selected: np.ndarray, + eps: float = 1e-50 +) -> Tuple[float, float]: - stabilities = np.clip((theta0 - theta) / (theta0 + eps), 0, 1) - stabilities[np.sign(theta0_signed) != np.sign(theta_signed)] = 0 + theta0 = anchor_regression(X[:, is_selected], Z, y, gamma=1) + theta = anchor_regression(X[:, is_selected], Z, y, gamma=1.5) + s1 = np.clip(np.abs(theta0 - theta) / np.abs(theta0 + eps), 0, 1) - return stabilities + theta0 = anchor_regression(X[:, ~is_selected], Z, y, gamma=1) + theta = anchor_regression(X[:, ~is_selected], Z, y, gamma=1.5) + s2 = np.clip(np.abs(theta0 - theta) / np.abs(theta0 + eps), 0, 1) + s1 = np.mean(s1) + s2 = np.mean(s2) + + return s1, s2 def evaluate_gene_stability( @@ -148,22 +173,21 @@ def evaluate_gene_stability( eps: Small epsilon for numerical stability. Returns: - Stability score for gene j + Stability score for gene j. """ is_selected = np.array(A[:, j] != 0) if (not np.any(is_selected)) or np.all(is_selected): - return 0.0 + return np.nan assert not is_selected[j] # Exclude target gene from features mask = np.ones(X.shape[1], dtype=bool) mask[j] = False - stabilities_selected = np.mean(compute_stabilities(X[:, mask], X[:, j], Z, A, is_selected[mask], eps=eps)) + score = compute_stabilities(X[:, mask], X[:, j], Z, A[mask, j], is_selected[mask], eps=eps) #stabilities_non_selected = np.mean(compute_stabilities(X[:, mask], X[:, j], Z, A, ~is_selected[mask], eps=eps)) - #score = (stabilities_selected - stabilities_non_selected) / (stabilities_selected + stabilities_non_selected + eps) - score = np.mean(stabilities_selected) + #score = np.mean(stabilities_selected) return score @@ -200,9 +224,12 @@ def main(par): raise ValueError(f"No anchor variables found in dataset for columns: {anchor_cols}") # One-hot encode anchor variables - Z = OneHotEncoder(sparse_output=False, dtype=np.float32).fit_transform(anchor_encoded.reshape(-1, 1)) + Z = OneHotEncoder(drop="first", sparse_output=False, dtype=np.float32).fit_transform(anchor_encoded.reshape(-1, 1)) print(f"Anchor matrix Z shape: {Z.shape}") + # Add intercept + Z = np.concatenate((Z, np.ones((len(Z), 1), dtype=np.float32)), axis=1) + # Load inferred GRN df = read_prediction(par) sources = df["source"].to_numpy() @@ -232,24 +259,43 @@ def main(par): np.fill_diagonal(A, 0) print(f"Evaluating {X.shape[1]} genes with {np.sum(A != 0)} regulatory links") + # Whether or not to take into account the regulatory modes (enhancer/inhibitor) + signed = np.any(A < 0) + # Center and scale dataset scaler = StandardScaler() X = scaler.fit_transform(X) - # Create baseline GRN + # Create baseline model: for each TG, shuffle the TFs A_baseline = np.copy(A) - for j in range(A_baseline.shape[1]): - np.random.shuffle(A_baseline[:j, j]) - np.random.shuffle(A_baseline[j+1:, j]) - assert np.any(A != A_baseline) - + if signed: + A_baseline *= (2 * (np.random.randint(0, 2) - 0.5)) + tf_mask = np.any(A_baseline != 0, axis=1) + for j in range(A.shape[1]): + mask = np.copy(tf_mask) + mask[j] = False + if np.any(mask): + values = np.copy(A_baseline[mask, j]) + np.random.shuffle(values) + A_baseline[mask, j] = values + + # Compute gene stabilities scores, scores_baseline = [], [] for j in tqdm.tqdm(range(X.shape[1]), desc="Evaluating gene stability"): scores.append(evaluate_gene_stability(X, Z, A, j)) + scores_baseline.append(evaluate_gene_stability(X, Z, A_baseline, j)) scores = np.array(scores) + scores_baseline = np.array(scores_baseline) + + # Skip NaNs + mask = ~np.logical_or(np.isnan(scores), np.isnan(scores_baseline)) + scores = scores[mask] + scores_baseline = scores_baseline[mask] # Calculate final score - final_score = np.mean(scores) + p_value = wilcoxon(scores, scores_baseline).pvalue + p_value = np.clip(p_value, 1e-300, 1) + final_score = -np.log10(p_value) print(f"Method: {method_id}") print(f"Anchor Regression Score: {final_score:.6f}") diff --git a/src/metrics/experimental/recovery_2/helper.py b/src/metrics/experimental/recovery_2/helper.py index 2f42c2e0a..c94a7c8d9 100644 --- a/src/metrics/experimental/recovery_2/helper.py +++ b/src/metrics/experimental/recovery_2/helper.py @@ -1,9 +1,10 @@ from typing import Tuple, List, Optional +import tqdm import scipy import numpy as np from scipy.sparse import csr_matrix -from scipy.stats import spearmanr, pearsonr, wilcoxon +from scipy.stats import spearmanr, pearsonr, wilcoxon, rankdata from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder import pandas as pd import anndata as ad @@ -38,16 +39,30 @@ def evaluate_grn( n_tfs: Optional[int] = None, max_targets_per_tf: Optional[int] = None, tf_tg: bool = True, - tg_tg: bool = True + tg_tg: bool = True, + signed: bool = True ) -> float: """ Evaluate a specific setting (recall, balanced, or precision) """ - # Compute the min and max correlation coefficients across groups + # Compute the min and max correlation coefficients across groups. + # If evaluation is signed, then the min-max spread occuring in the negative (-1, 0) range + # counts 3 times more if the inferred regulatory edge is positive, and vice versa. C_min = np.min(C, axis=0) C_max = np.max(C, axis=0) - consistency_scores = 2 - (C_max - C_min) + if signed: + is_positive = (A > 0) + is_negative = (A < 0) + pos_spread = np.clip(C_max, 0, 1) - np.clip(C_min, 0, 1) + neg_spread = np.clip(C_max, -1, 0) - np.clip(C_min, -1, 0) + spread = is_positive * (pos_spread + 3 * neg_spread) + spread += is_negative * (3 * pos_spread + neg_spread) + consistency_scores = 1 - 0.25 * spread + #consistency_scores = is_positive * C_min + is_negative * (-C_max) + #C_mean = np.mean(C, axis=0) + else: + consistency_scores = 1 - 0.5 * (C_max - C_min) # Filter TFs if specified (keep the TFs with the most target genes) if n_tfs is None: @@ -58,7 +73,7 @@ def evaluate_grn( tf_mask[tf_idx] = True scores = [] - for j in range(A.shape[1]): + for j in tqdm.tqdm(range(A.shape[1])): if tf_mask[j]: @@ -82,10 +97,17 @@ def evaluate_grn( weights = np.abs(A[j, tg_idx]) weights /= np.sum(weights) overall_score = 0.0 + n_terms = 0 if tf_tg: overall_score += float(np.sum(weights * consistency_scores[j, tg_idx])) + n_terms += 1 if tg_tg: - overall_score += float(np.sum(weights * np.mean(consistency_scores[tg_idx, :][:, tg_idx], axis=0))) + S = consistency_scores[tg_idx, :][:, tg_idx] + np.fill_diagonal(S, 0) + overall_score += float(np.sum(weights * np.sum(S, axis=0) / (len(S) - 1))) + n_terms += 1 + if n_terms > 0: + overall_score /= n_terms scores.append(overall_score) else: @@ -96,15 +118,25 @@ def evaluate_grn( def evaluate_setting(C: np.ndarray, A: np.ndarray, setting_name: str, **kwargs) -> float: + # Whether or not to take into account the regulatory modes (enhancer/inhibitor) + signed = kwargs.get("signed", True) + # Evaluate inferred GRN scores = evaluate_grn(C, A, **kwargs) - # Create baseline GRN + # Create baseline model: for each TG, shuffle the TFs A_baseline = np.copy(A) + if signed: + A_baseline *= (2 * (np.random.randint(0, 2) - 0.5)) + tf_mask = np.any(A_baseline != 0, axis=1) for j in range(A.shape[1]): - np.random.shuffle(A_baseline[:j, j]) - np.random.shuffle(A_baseline[j+1:, j]) - assert np.any(A_baseline != A) + mask = np.copy(tf_mask) + mask[j] = False + if np.any(mask): + values = np.copy(A_baseline[mask, j]) + np.random.shuffle(values) + A_baseline[mask, j] = values + # assert np.any(A_baseline != A) # Evaluate baseline GRN scores_baseline = evaluate_grn(C, A_baseline, **kwargs) @@ -124,12 +156,20 @@ def evaluate_setting(C: np.ndarray, A: np.ndarray, setting_name: str, **kwargs) p_value = wilcoxon(scores, scores_baseline, alternative="greater", zero_method="pratt").pvalue p_value = np.clip(p_value, 1e-300, 1) print(f"p-value={p_value}") - #return -np.log10(p_value) - return np.mean(np.clip(scores - scores_baseline, 0, None)) + return -np.log10(p_value) except ValueError: return 0.0 +def spearman_corrcoef(X: np.ndarray) -> np.ndarray: + R = np.apply_along_axis(rankdata, 0, X, method="average") + R -= R.mean(axis=0, keepdims=True) + R /= (R.std(axis=0, ddof=1, keepdims=True) + 1e-9) + C = (R.T @ R) / (R.shape[0] - 1) + np.fill_diagonal(C, 0.0) + return C + + def main(par): # Load evaluation data adata = ad.read_h5ad(par['evaluation_data']) @@ -152,14 +192,9 @@ def main(par): variables = encode_obs_cols(adata, ["donor_id", "plate_name"]) group_variables.append(combine_multi_index(*variables)) group_names.append("donor_id/plate_name") - if (len(group_variables) == 0) and ("cell_type" in adata.obs): - variables = encode_obs_cols(adata, ["cell_type"]) - group_variables.append(combine_multi_index(*variables)) - group_names.append("cell_type") if len(group_variables) == 0: - variables = encode_obs_cols(adata, ["perturbation"]) - group_variables.append(combine_multi_index(*variables)) - group_names.append("perturbation") + group_variables.append(np.random.randint(0, 6, size=len(X))) + group_names.append("random_split") # Load inferred GRN df = read_prediction(par) @@ -181,65 +216,69 @@ def main(par): A = A[gene_mask, :][:, gene_mask] gene_names = gene_names[gene_mask] + # Whether or not to take into account the regulatory modes (enhancer/inhibitor) + signed = np.any(A < 0) + # Remove self-regulations np.fill_diagonal(A, 0) print(f"Evaluating {X.shape[1]} genes with {np.sum(A != 0)} regulatory links") - recall_results = [] - balanced_results = [] - precision_results = [] + tftg_results = [] + tgtg_results = [] + tftg_tgtg_results = [] for group_name, groups in zip(group_names, group_variables): print("\n" + "=" * 30) print(f"Grouping samples by {group_name}") # Compute correlation matrix for each group - S, C = [], [] + C = [] for group in np.unique(groups): + print(f"Computing correlations for group {group}") mask = (groups == group) if np.sum(mask) >= 3: assert not np.any(np.isnan(X[mask, :])) - res = spearmanr(X[mask, :]) - S.append(res.statistic) - C.append(res.pvalue) - S = np.nan_to_num(S, nan=0) - C = np.nan_to_num(C, nan=1) - #C = -np.sign(S) * np.log10(C) - #C = np.clip(C, -4, 4) - C = S - - # 1. Recall setting: all TFs, all target genes (no filtering) - recall_results.append(evaluate_setting( - C, A, "recall", - tf_tg=True, tg_tg=True, - n_tfs=None, # Use all TFs - max_targets_per_tf=None # Use all target genes (up to default limit) + #C.append(np.corrcoef(X[mask, :].T)) + C.append(spearman_corrcoef(X[mask, :])) + C = np.asarray(C) + + tftg_results.append(evaluate_setting( + C, A, "tftg", + tf_tg=True, + tg_tg=False, + n_tfs=100, + max_targets_per_tf=100, + signed=signed )) - - # 2. Balanced setting: top 100 TFs, top 100 target genes - balanced_results.append(evaluate_setting( - C, A, "balanced", - tf_tg=True, tg_tg=True, - n_tfs=100, - max_targets_per_tf=100 + + tgtg_results.append(evaluate_setting( + C, A, "tgtg", + tf_tg=False, + tg_tg=True, + n_tfs=40, + max_targets_per_tf=300, + signed=signed )) - # 3. Precision setting: top 40 TFs, top 40 target genes - precision_results.append(evaluate_setting( - C, A, "precision", - tf_tg=True, tg_tg=True, - n_tfs=40, - max_targets_per_tf=40 + tftg_tgtg_results.append(evaluate_setting( + C, A, "tftg+tgtg", + tf_tg=True, + tg_tg=True, + n_tfs=100, + max_targets_per_tf=300, + signed=signed )) - recovery_recall = np.mean(recall_results) - recovery_balanced = np.mean(balanced_results) - recovery_precision = np.mean(precision_results) + tftg_score = np.mean(tftg_results) + tgtg_score = np.mean(tgtg_results) + tftg_tgtg_score = np.mean(tftg_tgtg_results) + final_score = (tftg_score + tgtg_score + tftg_tgtg_score) / 3.0 # Create results DataFrame with all three scores results = pd.DataFrame({ - 'recovery_recall': [recovery_recall], - 'recovery_balanced': [recovery_balanced], - 'recovery_precision': [recovery_precision] + 'tftg': [tftg_score], + 'tgtg': [tgtg_score], + 'tftg_tgtg': [tftg_tgtg_score], + 'final_score': [final_score] }) print(results) diff --git a/src/metrics/experimental/recovery_2/old_helper.py b/src/metrics/experimental/recovery_2/old_helper.py new file mode 100644 index 000000000..e885a24c4 --- /dev/null +++ b/src/metrics/experimental/recovery_2/old_helper.py @@ -0,0 +1,292 @@ +from typing import Tuple, List, Optional + +import tqdm +import scipy +import numpy as np +from scipy.sparse import csr_matrix +from scipy.stats import spearmanr, pearsonr, wilcoxon +from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder +import pandas as pd +import anndata as ad +import warnings +warnings.filterwarnings("ignore") +from util import format_save_score, read_prediction + +np.random.seed(0) +from util import manage_layer + + +def encode_obs_cols(adata, cols): + """Encode observation columns to integer codes.""" + encoded = [] + for col in cols: + if col in adata.obs: + codes = LabelEncoder().fit_transform(adata.obs[col].values) + encoded.append(codes) + return encoded + + +def combine_multi_index(*arrays) -> np.array: + """Combine parallel label arrays into a single integer label per position.""" + A = np.stack(arrays) + n_classes = tuple(int(A[i].max()) + 1 for i in range(A.shape[0])) + return np.ravel_multi_index(A, dims=n_classes, order='C') + + +def evaluate_grn( + C: np.ndarray, + A: np.ndarray, + n_tfs: Optional[int] = None, + max_targets_per_tf: Optional[int] = None, + tf_tg: bool = True, + tg_tg: bool = True, + signed: bool = True +) -> float: + """ + Evaluate a specific setting (recall, balanced, or precision) + """ + + # Compute the min and max correlation coefficients across groups. + # If evaluation is signed, then the min-max spread occuring in the negative (-1, 0) range + # counts 3 times more if the inferred regulatory edge is positive, and vice versa. + C_min = np.min(C, axis=0) + C_max = np.max(C, axis=0) + + #C_min = np.quantile(C, 0.2, axis=0) + #C_max = np.quantile(C, 0.8, axis=0) + if signed: + is_positive = (A > 0) + is_negative = (A < 0) + #pos_spread = np.clip(C_max, 0, 1) - np.clip(C_min, 0, 1) + #neg_spread = np.clip(C_max, -1, 0) - np.clip(C_min, -1, 0) + #spread = is_positive * (pos_spread + 3 * neg_spread) + #spread += is_negative * (3 * pos_spread + neg_spread) + #consistency_scores = 1 - 0.25 * spread + consistency_scores = is_positive * C_min + is_negative * (-C_max) + #C_mean = np.mean(C, axis=0) + #consistency_scores = is_positive * C_mean + is_negative * (-C_mean) + #consistency_scores = 1 - 0.5 * (C_max - C_min) + else: + consistency_scores = 1 - 0.5 * (C_max - C_min) + #assert np.all(consistency_scores >= 0) + #assert np.all(consistency_scores <= 1) + + # Filter TFs if specified (keep the TFs with the most target genes) + if n_tfs is None: + tf_idx = np.arange(A.shape[1]) + else: + tf_idx = np.argsort(np.sum(A != 0, axis=1))[-n_tfs:] + tf_mask = np.zeros(A.shape[1], dtype=bool) + tf_mask[tf_idx] = True + + scores = [] + for j in tqdm.tqdm(range(A.shape[1])): + + if tf_mask[j]: + + # Filter TGs if specified (keep the TGs with highest regulatory links) + if max_targets_per_tf is None: + tg_idx = np.where(A[j, :] != 0)[0] + else: + tg_idx = np.argsort(np.abs(A[j, :]))[-max_targets_per_tf:] + tg_idx = tg_idx[A[j, tg_idx] != 0] + + # Compute scores + if len(tg_idx) == 0: + scores.append(np.nan) + else: + # Intuition behind the scoring scheme: + # our score is a weighted average of consistency scores, + # where the weights are proportional to regulatory weight magnitudes, + # and consistency scores are penalized by the max-min correlation spread. + # The more consistent correlation coefficients across groups, the higher + # the consistency scores. + weights = np.abs(A[j, tg_idx]) + weights /= np.sum(weights) + overall_score = 0.0 + n_terms = 0 + if tf_tg: + overall_score += float(np.sum(weights * consistency_scores[j, tg_idx])) + n_terms += 1 + if tg_tg: + S = consistency_scores[tg_idx, :][:, tg_idx] + np.fill_diagonal(S, 0) + overall_score += float(np.sum(weights * np.sum(S, axis=0) / (len(S) - 1))) + n_terms += 1 + if n_terms > 0: + overall_score /= n_terms + scores.append(overall_score) + + else: + scores.append(np.nan) + + return np.array(scores) + + +def evaluate_setting(C: np.ndarray, A: np.ndarray, setting_name: str, **kwargs) -> float: + + # Whether or not to take into account the regulatory modes (enhancer/inhibitor) + signed = kwargs.get("signed", True) + + # Evaluate inferred GRN + scores = evaluate_grn(C, A, **kwargs) + + # Create baseline model: for each TG, shuffle the TFs + A_baseline = np.copy(A) + if signed: + A_baseline *= (2 * (np.random.randint(0, 2) - 0.5)) + tf_mask = np.any(A_baseline != 0, axis=1) + for j in range(A.shape[1]): + mask = np.copy(tf_mask) + mask[j] = False + if np.any(mask): + values = np.copy(A_baseline[mask, j]) + np.random.shuffle(values) + A_baseline[mask, j] = values + # assert np.any(A_baseline != A) + + # Evaluate baseline GRN + scores_baseline = evaluate_grn(C, A_baseline, **kwargs) + + # Skip NaNs + mask = ~np.logical_or(np.isnan(scores), np.isnan(scores_baseline)) + scores = scores[mask] + scores_baseline = scores_baseline[mask] + + # Compare results + if len(scores) == 0: + print(f"Mean corr. coef. for setting '{setting_name}': grn=nan, baseline=nan") + return 0.0 + else: + print(f"Mean corr. coef. for setting '{setting_name}': grn={np.mean(scores):.3f}, baseline={np.mean(scores_baseline):.3f}") + try: + p_value = wilcoxon(scores, scores_baseline, alternative="greater", zero_method="pratt").pvalue + p_value = np.clip(p_value, 1e-300, 1) + print(f"p-value={p_value}") + return -np.log10(p_value) + #return np.mean(np.clip(scores - scores_baseline, 0, None)) + except ValueError: + return 0.0 + + +def spearman_corrcoef(X: np.ndarray) -> np.ndarray: + eps = np.random.uniform(0, 1e-30, size=X.shape) # To break ties + #X = np.argsort(X + eps, axis=0) + C = np.corrcoef(X.T) + np.fill_diagonal(C, 0) + return C + + +def main(par): + # Load evaluation data + adata = ad.read_h5ad(par['evaluation_data']) + dataset_id = adata.uns['dataset_id'] + method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id'] + + # Manage layer + layer = manage_layer(adata, par) + X = adata.layers[layer] + if isinstance(X, csr_matrix): + X = X.toarray() + X = X.astype(np.float32) + + gene_names = adata.var_names + gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)} + + # Get groups + group_variables, group_names = [], [] + if ("donor_id" in adata.obs) or ("plate_name" in adata.obs): + variables = encode_obs_cols(adata, ["donor_id", "plate_name"]) + group_variables.append(combine_multi_index(*variables)) + group_names.append("donor_id/plate_name") + if len(group_variables) == 0: + group_variables.append(np.random.randint(0, 6, size=len(X))) + group_names.append("random_split") + + # Load inferred GRN + df = read_prediction(par) + sources = df["source"].to_numpy() + targets = df["target"].to_numpy() + weights = df["weight"].to_numpy() + A = np.zeros((len(gene_names), len(gene_names)), dtype=X.dtype) + for source, target, weight in zip(sources, targets, weights): + if (source in gene_dict) and (target in gene_dict): + i = gene_dict[source] + j = gene_dict[target] + A[i, j] = float(weight) + + # Only consider the genes that are actually present in the inferred GRN, + # and keep only the most-connected genes (for speed). + gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0)) + X = X[:, gene_mask] + X = X.toarray() if isinstance(X, csr_matrix) else X + A = A[gene_mask, :][:, gene_mask] + gene_names = gene_names[gene_mask] + + # Whether or not to take into account the regulatory modes (enhancer/inhibitor) + signed = np.any(A < 0) + + # Remove self-regulations + np.fill_diagonal(A, 0) + print(f"Evaluating {X.shape[1]} genes with {np.sum(A != 0)} regulatory links") + + tftg_results = [] + tgtg_results = [] + tftg_tgtg_results = [] + for group_name, groups in zip(group_names, group_variables): + print("\n" + "=" * 30) + print(f"Grouping samples by {group_name}") + + # Compute correlation matrix for each group + C = [] + for group in np.unique(groups): + print(f"Computing correlations for group {group}") + mask = (groups == group) + if np.sum(mask) >= 3: + assert not np.any(np.isnan(X[mask, :])) + #C.append(np.corrcoef(X[mask, :].T)) + C.append(spearman_corrcoef(X[mask, :])) + C = np.asarray(C) + + tftg_results.append(evaluate_setting( + C, A, "tftg", + tf_tg=True, + tg_tg=False, + n_tfs=100, + max_targets_per_tf=100, + signed=signed + )) + + tgtg_results.append(evaluate_setting( + C, A, "tgtg", + tf_tg=False, + tg_tg=True, + n_tfs=40, + max_targets_per_tf=300, + signed=signed + )) + + tftg_tgtg_results.append(evaluate_setting( + C, A, "tftg+tgtg", + tf_tg=True, + tg_tg=True, + n_tfs=100, + max_targets_per_tf=300, + signed=signed + )) + + tftg_score = np.mean(tftg_results) + tgtg_score = np.mean(tgtg_results) + tftg_tgtg_score = np.mean(tftg_tgtg_results) + final_score = (tftg_score + tgtg_score + tftg_tgtg_score) / 3.0 + + # Create results DataFrame with all three scores + results = pd.DataFrame({ + 'tftg': [tftg_score], + 'tgtg': [tgtg_score], + 'tftg_tgtg': [tftg_tgtg_score], + 'final_score': [final_score] + }) + print(results) + + return results diff --git a/src/metrics/experimental/regression_3/helper.py b/src/metrics/experimental/regression_3/helper.py index 1858b9703..0a10ae334 100644 --- a/src/metrics/experimental/regression_3/helper.py +++ b/src/metrics/experimental/regression_3/helper.py @@ -7,7 +7,6 @@ import numpy as np import pandas as pd import tqdm -import xgboost from scipy.sparse.linalg import LinearOperator from scipy.stats import pearsonr, spearmanr, wilcoxon, ConstantInputWarning from scipy.sparse import csr_matrix @@ -52,8 +51,7 @@ def compute_residual_correlations( y_test: np.ndarray, Z_test: np.ndarray ) -> np.ndarray: - model = xgboost.XGBRegressor(n_estimators=10) - #model = Ridge(alpha=10) + model = Ridge(alpha=10) model.fit(X_train, y_train) y_hat = model.predict(X_test) residuals = y_test - y_hat @@ -122,19 +120,29 @@ def main(par): A = A[gene_mask, :][:, gene_mask] gene_names = gene_names[gene_mask] + # Whether or not to take into account the regulatory modes (enhancer/inhibitor) + signed = np.any(A < 0) + # Remove self-regulations np.fill_diagonal(A, 0) print(f"Evaluating {X.shape[1]} genes with {np.sum(A != 0)} regulatory links") - # Create baseline model - A_baseline = np.copy(A) - for j in range(A.shape[1]): - np.random.shuffle(A_baseline[:j, j]) - np.random.shuffle(A_baseline[j+1:, j]) - assert np.any(A_baseline != A) - - scores, baseline_scores = [], [] + overall_scores = [] for group in np.unique(anchor_encoded): + scores, baseline_scores = [], [] + + # Create baseline model: for each TG, shuffle the TFs + A_baseline = np.copy(A) + if signed: + A_baseline *= (2 * (np.random.randint(0, 2) - 0.5)) + tf_mask = np.any(A_baseline != 0, axis=1) + for j in range(A.shape[1]): + mask = np.copy(tf_mask) + mask[j] = False + if np.any(mask): + values = np.copy(A_baseline[mask, j]) + np.random.shuffle(values) + A_baseline[mask, j] = values # Train/test split mask = (anchor_encoded != group) @@ -142,9 +150,9 @@ def main(par): X_test = X[~mask, :] # Standardize features - #scaler = StandardScaler() - #X_train = scaler.fit_transform(X_train) - #X_test = scaler.transform(X_test) + scaler = StandardScaler() + X_train = scaler.fit_transform(X_train) + X_test = scaler.transform(X_test) for j in tqdm.tqdm(range(X_train.shape[1])): @@ -164,33 +172,31 @@ def main(par): ) scores.append(np.mean(coefs)) - # Evaluate baseline GRN - selected = (A_baseline[:, j] != 0) - unselected = ~np.copy(selected) - unselected[j] = False - coefs = compute_residual_correlations( - X_train[:, selected], - X_train[:, j], - X_test[:, selected], - X_test[:, j], - X_test[:, ~selected] - ) - baseline_scores.append(np.mean(coefs)) - scores = np.array(scores) - baseline_scores = np.array(baseline_scores) - - p_value = wilcoxon(baseline_scores, scores, alternative="greater").pvalue - p_value = max(p_value, 1e-300) + # Evaluate baseline GRN + selected = (A_baseline[:, j] != 0) + coefs = compute_residual_correlations( + X_train[:, selected], + X_train[:, j], + X_test[:, selected], + X_test[:, j], + X_test[:, ~selected] + ) + baseline_scores.append(np.mean(coefs)) + + # Compute fold score + p_value = wilcoxon(baseline_scores, scores, alternative="greater").pvalue + p_value = max(p_value, 1e-300) + overall_scores.append(-np.log10(p_value)) # Calculate final score - final_score = -np.log10(p_value) - print(f"Anchor Regression Score: {final_score:.6f}") + final_score = np.mean(overall_scores) + print(f"Regression Score: {final_score:.6f}") print(f"Method: {method_id}") # Return results as DataFrame results = { - 'regression_3': [final_score] + 'regression_3': [float(final_score)] } df_results = pd.DataFrame(results) - return df_results \ No newline at end of file + return df_results diff --git a/src/metrics/experimental/regression_3/script.py b/src/metrics/experimental/regression_3/script.py index 7c660f603..6a6b03d0a 100644 --- a/src/metrics/experimental/regression_3/script.py +++ b/src/metrics/experimental/regression_3/script.py @@ -25,7 +25,7 @@ } ## VIASH END try: - sys.path.append(meta["resources_dir"]) + sys.path.append(meta["resources_dir"]) except: meta = { "resources_dir":'src/metrics/regression_2/', @@ -41,9 +41,9 @@ if __name__ == '__main__': - print(par) - output = main(par) - print(output) - method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id'] - dataset_id = ad.read_h5ad(par['evaluation_data'], backed='r').uns['dataset_id'] - format_save_score(output, method_id, dataset_id, par['score']) + print(par) + output = main(par) + print(output) + method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id'] + dataset_id = ad.read_h5ad(par['evaluation_data'], backed='r').uns['dataset_id'] + format_save_score(output, method_id, dataset_id, par['score']) diff --git a/src/metrics/experimental/sem/helper.py b/src/metrics/experimental/sem/helper.py index 13747164a..bb2ec9e5b 100644 --- a/src/metrics/experimental/sem/helper.py +++ b/src/metrics/experimental/sem/helper.py @@ -23,7 +23,7 @@ NUMPY_DTYPE = np.float32 # Hyper-parameters -MAX_N_ITER = 2000 +MAX_N_ITER = 500 # For reproducibility purposes seed = 0xCAFE @@ -181,7 +181,7 @@ def evaluate_grn( F = np.linalg.inv(np.eye(A.shape[0]) - A) F_CR = F[np.ix_(regulator_idx, reporter_idx)] Y_R = delta_X[:, reporter_idx] - lam = 0.1 + lam = 0.001 I = np.eye(len(regulator_idx), dtype=delta_X.dtype) M = F_CR @ F_CR.T + lam * I delta = np.zeros_like(delta_X) @@ -210,8 +210,8 @@ def evaluate_grn( A_eff = torch.abs(A) * signs if signed else A * mask delta_X_hat = solve_sem(A_eff, delta_train) loss = torch.mean(torch.sum(torch.square(X_non_reporter - delta_X_hat[:, ~is_reporter]), dim=1)) - loss = loss + 0.1 * torch.sum(torch.abs(A)) - loss = loss + 0.1 * torch.sum(torch.square(A)) + loss = loss + 0.001 * torch.sum(torch.abs(A)) + loss = loss + 0.001 * torch.sum(torch.square(A)) pbar.set_description(str(loss.item())) # Keep track of best solution @@ -292,7 +292,7 @@ def main(par): gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0)) in_degrees = np.sum(A != 0, axis=0) out_degrees = np.sum(A != 0, axis=1) - idx = np.argsort(np.maximum(out_degrees, in_degrees))[:-5000] + idx = np.argsort(np.maximum(out_degrees, in_degrees))[:-2000] gene_mask[idx] = False X = X[:, gene_mask] X = X.toarray() if isinstance(X, csr_matrix) else X @@ -347,12 +347,19 @@ def main(par): print(f"Proportion of reporter genes: {np.mean(is_reporter)}") print(f"Use regulatory modes/signs: {use_signs}") + # Create baseline model: for each TG, shuffle the TFs print(f"Creating baseline GRN") A_baseline = np.copy(A) - for j in range(A_baseline.shape[1]): - np.random.shuffle(A_baseline[:j, j]) - np.random.shuffle(A_baseline[j+1:, j]) - assert np.any(A_baseline != A) + if use_signs: + A_baseline *= (2 * (np.random.randint(0, 2) - 0.5)) + tf_mask = np.any(A_baseline != 0, axis=1) + for j in range(A.shape[1]): + mask = np.copy(tf_mask) + mask[j] = False + if np.any(mask): + values = np.copy(A_baseline[mask, j]) + np.random.shuffle(values) + A_baseline[mask, j] = values # Evaluate inferred GRN print("\n======== Evaluate inferred GRN ========") @@ -360,11 +367,7 @@ def main(par): # Evaluate baseline GRN print("\n======== Evaluate shuffled GRN ========") - n_repeats = 1 - scores_baseline = np.zeros_like(scores) - for _ in range(n_repeats): # Repeat for more robust estimation - scores_baseline += evaluate_grn(X_controls, delta_X, is_train, is_reporter, A_baseline, signed=use_signs) - scores_baseline /= n_repeats + scores_baseline = evaluate_grn(X_controls, delta_X, is_train, is_reporter, A_baseline, signed=use_signs) # Keep only the genes for which both GRNs got a score mask = ~np.logical_or(np.isnan(scores), np.isnan(scores_baseline)) @@ -375,7 +378,7 @@ def main(par): # Perform rank test between actual scores and baseline rr_all['spearman'] = float(np.mean(scores)) rr_all['spearman_shuffled'] = float(np.mean(scores_baseline)) - if np.std(scores - scores_baseline) == 0: + if np.all(scores - scores_baseline == 0): df_results = pd.DataFrame({'sem': [0.0]}) else: res = wilcoxon(scores - scores_baseline, zero_method='wilcox', alternative='greater') @@ -390,6 +393,9 @@ def main(par): print(f"Final score: {score}") results = { + 'r2': [float(np.mean(scores))], + 'r2_baseline': [float(np.mean(scores_baseline))], + 'r2_diff': [float(np.mean(scores) - np.mean(scores_baseline))], 'sem': [float(score)] } diff --git a/src/metrics/experimental/vc/helper.py b/src/metrics/experimental/vc/helper.py index bf224c46e..4d422da37 100644 --- a/src/metrics/experimental/vc/helper.py +++ b/src/metrics/experimental/vc/helper.py @@ -299,7 +299,7 @@ def evaluate(A, train_data_loader, test_data_loader, state_dict, n_perturbations optimizer, mode='min', patience=5, min_lr=1e-6, cooldown=3, factor=0.8 ) - pbar = tqdm.tqdm(range(300)) + pbar = tqdm.tqdm(range(1000)) best_avg_r2, best_r2 = -np.inf, None model.train() for epoch in pbar: @@ -440,7 +440,7 @@ def main(par): print(f"Using {len(gene_names)} genes present in the GRN") # Additional memory-aware gene filtering for very large GRNs - MAX_GENES_FOR_MEMORY = 3000 # Reduced further to avoid memory issues + MAX_GENES_FOR_MEMORY = 2000 # Reduced further to avoid memory issues if len(gene_names) > MAX_GENES_FOR_MEMORY: print(f"Too many genes ({len(gene_names)}) for memory. Selecting top {MAX_GENES_FOR_MEMORY} by GRN connectivity.") @@ -457,6 +457,9 @@ def main(par): # Add self-regulations np.fill_diagonal(A, 1) + # Check whether the inferred GRN contains signed predictions + signed = np.any(A < 0) + # Mapping between gene expression profiles and their matched negative controls control_map, _ = create_control_matching(are_controls, match_groups) loose_control_map, _ = create_control_matching(are_controls, loose_match_groups) @@ -470,12 +473,19 @@ def main(par): if (len(train_index) == 0) or (len(test_index) == 0): continue - # Create baseline model + # Create baseline model: for each TG, shuffle the TFs + print(f"Creating baseline GRN") A_baseline = np.copy(A) - for j in range(A_baseline.shape[1]): - np.random.shuffle(A_baseline[:j, j]) - np.random.shuffle(A_baseline[j+1:, j]) - assert np.any(A_baseline != A) + if signed: + A_baseline *= (2 * (np.random.randint(0, 2) - 0.5)) + tf_mask = np.any(A_baseline != 0, axis=1) + for j in range(A.shape[1]): + mask = np.copy(tf_mask) + mask[j] = False + if np.any(mask): + values = np.copy(A_baseline[mask, j]) + np.random.shuffle(values) + A_baseline[mask, j] = values # Center and scale dataset scaler = StandardScaler() @@ -509,7 +519,6 @@ def create_data_loaders(): # For fair comparison, we first randomly initialize NN parameters, and use these same # parameters to build both models (only the GRN weights will differ). - signed = np.any(A < 0) model_template = Model(A, n_perturbations, signed=signed).to(DEVICE) state_dict = model_template.state_dict() @@ -521,8 +530,6 @@ def create_data_loaders(): train_data_loader, test_data_loader = create_data_loaders() r2_baseline.append(evaluate(A_baseline, train_data_loader, test_data_loader, state_dict, n_perturbations, signed=signed)) - break - r2 = np.asarray(r2).flatten() r2_baseline = np.asarray(r2_baseline).flatten() print("Mean R2", np.mean(r2), np.mean(r2_baseline)) From fa5f99c08b0b7b8e3e5b773272da39a71b6b93d6 Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Thu, 9 Oct 2025 22:09:20 +0200 Subject: [PATCH 06/19] Minor changes --- .../experimental/anchor_regression/helper.py | 48 ++++++++++++------- .../experimental/regression_3/helper.py | 10 ++-- src/metrics/experimental/sem/helper.py | 2 +- 3 files changed, 37 insertions(+), 23 deletions(-) diff --git a/src/metrics/experimental/anchor_regression/helper.py b/src/metrics/experimental/anchor_regression/helper.py index 1c60dc904..9703825df 100644 --- a/src/metrics/experimental/anchor_regression/helper.py +++ b/src/metrics/experimental/anchor_regression/helper.py @@ -10,6 +10,7 @@ from scipy.sparse.linalg import LinearOperator from scipy.stats import pearsonr, wilcoxon from scipy.sparse import csr_matrix +from sklearn.model_selection import KFold from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder import anndata as ad import warnings @@ -106,6 +107,32 @@ def anchor_regression( return theta +def cross_val( + cv, + X: np.ndarray, + y: np.ndarray, + Z: np.ndarray, + eps: float = 1e-50 +) -> float: + cv = KFold(5) + ss_res, ss_tot = 0, 0 + y_mean = np.mean(y) + for idx_train, idx_test in cv.split(X, y): + X_train, X_test = X[idx_train, :], X[idx_test, :] + Z_train, Z_test = Z[idx_train, :], Z[idx_test, :] + y_train, y_test = y[idx_train], y[idx_test] + theta = anchor_regression(X_train, Z_train, y_train, gamma=1) + y_hat = np.dot(X_test, theta) + ss_res += np.sum(np.square(y_test - y_hat)) + ss_tot += np.sum(np.square(y_test - y_mean)) + break # TODO + if ss_tot == 0: + r2 = 0 + else: + r2 = 1 - ss_res / ss_tot + return float(r2) + + def compute_stabilities( X: np.ndarray, y: np.ndarray, @@ -114,23 +141,10 @@ def compute_stabilities( is_selected: np.ndarray, eps: float = 1e-50 ) -> float: - theta0_signed = anchor_regression(X, Z, y, gamma=1) - theta0_signed = theta0_signed[is_selected] - #theta0 = np.abs(theta0_signed) - theta0 = theta0_signed / np.sum(np.abs(theta0_signed)) - - theta_signed = anchor_regression(X, Z, y, gamma=1.65) - theta_signed = theta_signed[is_selected] - #theta = np.abs(theta_signed) - theta = theta_signed / np.sum(np.abs(theta_signed)) - - stabilities = np.clip(np.abs(theta0 - theta) / np.abs(theta0 + eps), 0, 1) - #stabilities[np.sign(theta0_signed) != np.sign(theta_signed)] = 0 - - weights = np.abs(weights[is_selected]) - weights /= np.sum(weights) - - return float(np.sum(weights * stabilities)) + cv = KFold(5, random_state=0xCAFE, shuffle=True) + r2_selected = cross_val(cv, X[:, is_selected], y, Z) + r2_non_selected = cross_val(cv, X[:, ~is_selected], y, Z) + return r2_selected - r2_non_selected def compute_stabilities_v2( diff --git a/src/metrics/experimental/regression_3/helper.py b/src/metrics/experimental/regression_3/helper.py index 0a10ae334..9d997cd67 100644 --- a/src/metrics/experimental/regression_3/helper.py +++ b/src/metrics/experimental/regression_3/helper.py @@ -18,10 +18,10 @@ warnings.filterwarnings("ignore", category=ConstantInputWarning) # For reproducibility purposes -seed = 0xCAFE -os.environ['PYTHONHASHSEED'] = str(seed) -random.seed(seed) -np.random.seed(seed) +#seed = 0xCAFE +#os.environ['PYTHONHASHSEED'] = str(seed) +#random.seed(seed) +#np.random.seed(seed) from util import read_prediction, manage_layer from dataset_config import DATASET_GROUPS @@ -51,7 +51,7 @@ def compute_residual_correlations( y_test: np.ndarray, Z_test: np.ndarray ) -> np.ndarray: - model = Ridge(alpha=10) + model = Ridge(alpha=0.1) model.fit(X_train, y_train) y_hat = model.predict(X_test) residuals = y_test - y_hat diff --git a/src/metrics/experimental/sem/helper.py b/src/metrics/experimental/sem/helper.py index bb2ec9e5b..a3fccda2e 100644 --- a/src/metrics/experimental/sem/helper.py +++ b/src/metrics/experimental/sem/helper.py @@ -26,7 +26,7 @@ MAX_N_ITER = 500 # For reproducibility purposes -seed = 0xCAFE +seed = 0xCAFF os.environ['PYTHONHASHSEED'] = str(seed) random.seed(seed) np.random.seed(seed) From ecc78286d70b9bd46077631c9ca9f1651befac22 Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Fri, 10 Oct 2025 23:42:33 +0200 Subject: [PATCH 07/19] Fix issues with tf_binding metric --- src/metrics/tf_binding/helper.py | 110 +++++++++++++++++----------- src/metrics/tf_binding/run_local.py | 28 +++++++ 2 files changed, 95 insertions(+), 43 deletions(-) create mode 100644 src/metrics/tf_binding/run_local.py diff --git a/src/metrics/tf_binding/helper.py b/src/metrics/tf_binding/helper.py index b88199499..893f8f4c2 100644 --- a/src/metrics/tf_binding/helper.py +++ b/src/metrics/tf_binding/helper.py @@ -1,69 +1,93 @@ - +import math import pandas as pd -import anndata as ad -import sys import numpy as np from sklearn.metrics import average_precision_score from tqdm import tqdm from util import read_prediction + # For reproducibility seed = 42 np.random.seed(seed) def main(par): + + # Load predictions prediction = read_prediction(par) - test_data = ad.read_h5ad(par['evaluation_data'], backed='r') - evaluation_genes = test_data.var_names.tolist() - n_targets_total = len(evaluation_genes) + assert prediction.shape[0] > 0, 'No links found in the network' + if not {'source', 'target', 'weight'}.issubset(set(prediction.columns)): + raise ValueError("prediction must have columns: 'source', 'target', 'weight'") + # Load TF list and ground truth edges tf_all = np.loadtxt(par['tf_all'], dtype=str, delimiter=',', skiprows=1) true_graph = pd.read_csv(par['ground_truth']) - true_graph = true_graph[(true_graph['source'].isin(tf_all)) & (true_graph['target'].isin(evaluation_genes))] - assert prediction.shape[0] > 0, 'No links found in the network' - assert true_graph.shape[0] > 0, 'No links found in the ground truth' - + if not {'source', 'target'}.issubset(set(true_graph.columns)): + raise ValueError("ground_truth must have columns: 'source', 'target'") + + # Keep only TFs that are in the provided TF list + true_graph = true_graph[true_graph['source'].isin(tf_all)] + assert true_graph.shape[0] > 0, 'No links found in the ground truth after filtering to TF list' + + # Use the union of all targets that appear in either predictions or ground-truth + pred_targets = set(prediction['target'].astype(str).unique()) + true_targets = set(true_graph['target'].astype(str).unique()) + evaluation_genes = sorted(pred_targets.union(true_targets)) + if len(evaluation_genes) == 0: + raise ValueError("Empty evaluation target set (no targets in predictions or ground truth).") + + # Precompute lookup to avoid O(n^2) + gene_to_idx = {g: i for i, g in enumerate(evaluation_genes)} + n_targets_total = len(evaluation_genes) + scores_model = [] + # Iterate over TFs that actually have at least one positive in ground-truth for tf in tqdm(true_graph['source'].unique()): + # Positives for this TF true_edges = true_graph[true_graph['source'] == tf] + y_true = np.zeros(n_targets_total, dtype=int) + # Mark positives + for t in true_edges['target'].astype(str): + y_true[gene_to_idx[t]] = 1 + + # Scores over all candidate targets + y_score = np.zeros(n_targets_total, dtype=float) if tf in prediction['source'].unique(): pred_edges = prediction[prediction['source'] == tf] - true_labels = true_edges['target'].isin(pred_edges['target']).astype(int) - pred_scores = pred_edges.set_index('target').reindex(true_edges['target'])['weight'].fillna(0) - if true_labels.sum() == 0: # no positives - ap = 0.0 - else: - ap = average_precision_score(true_labels, pred_scores) - else: - ap = float('nan') - n_targets = len(true_edges) - - # ----- Analytic random baseline ----- - # Extend true edges to all evaluation genes - true_labels_random = np.zeros(n_targets_total) - idx = [evaluation_genes.index(t) for t in true_edges['target']] - true_labels_random[idx] = 1 - ap_random = true_labels_random.sum() / len(true_labels_random) - - scores_model.append({'source': tf, 'ap': ap, 'n_targets': n_targets, 'ap_random': ap_random}) - - scores_df = pd.DataFrame(scores_model) - print('Number of TFs in GRN:', len(scores_df[scores_df['ap'].notna()])) - # Compute weighted mean (ignoring NaNs) - valid = scores_df.dropna(subset=['ap']) - weighted_precision = np.average(valid['ap'], weights=valid['n_targets']) + # Fill scores where predicted + for tgt, w in zip(pred_edges['target'].astype(str), pred_edges['weight'].astype(float)): + idx = gene_to_idx.get(tgt, None) + if idx is not None: + y_score[idx] = w + + # Ensure numerical safety + y_score = np.nan_to_num(y_score, nan=0.0, posinf=0.0, neginf=0.0) - # Compute unweighted means (for reference) - precision = scores_df['ap'].mean(skipna=True) - precision_random = scores_df['ap_random'].mean(skipna=True) - recall = scores_df['ap'].fillna(0).mean() + # AP requires at least one positive + ap = average_precision_score(y_true, y_score) if (y_true.sum()) > 0 else 0.0 + + # Random baseline AUPRC is just the prevalence of positives + prevalence = y_true.mean() + + scores_model.append({ + 'source': tf, + 'ap': ap, + 'n_targets_pos': int(y_true.sum()), + 'prevalence': prevalence + }) + + scores_df = pd.DataFrame(scores_model) + valid = scores_df[scores_df['n_targets_pos'] > 0] + macro_ap = valid['ap'].mean() if len(valid) > 0 else np.nan + macro_ap_w = np.average(valid['ap'], weights=valid['n_targets_pos']) if len(valid) > 0 else np.nan + mean_prev = valid['prevalence'].mean() if len(valid) > 0 else np.nan + lift = macro_ap / mean_prev if (mean_prev is not None and mean_prev and not np.isnan(mean_prev)) else np.nan summary_df = pd.DataFrame([{ - 'precision': precision, - 'precision_random': precision_random, - 'recall': recall, - 'weighted_precision': weighted_precision + # 'macro_ap': macro_ap, + #'macro_ap_weighted': macro_ap_w, + # 'mean_prevalence': mean_prev, + 'lift_over_random': lift }]) - return summary_df \ No newline at end of file + return summary_df diff --git a/src/metrics/tf_binding/run_local.py b/src/metrics/tf_binding/run_local.py new file mode 100644 index 000000000..e509c1496 --- /dev/null +++ b/src/metrics/tf_binding/run_local.py @@ -0,0 +1,28 @@ +import subprocess + +import anndata as ad +import pandas as pd + + +for dataset in ["replogle", "norman", "adamson"]: + for method in ["negative_control", "pearson_corr", "positive_control", "ppcor", "portia", "scenic", "grnboost", "scprint", "scenicplus", "celloracle", "scglue", "figr", "granie"]: + + score_filepath = f"output/tf_binding/tf_binding.h5ad" + subprocess.call([ + "python", + "src/metrics/tf_binding/script.py", + "--prediction", f"resources/results/{dataset}/{dataset}.{method}.{method}.prediction.h5ad", + "--evaluation_data", f"resources/grn_benchmark/evaluation_data/{dataset}_bulk.h5ad", + "--ground_truth", "resources/grn_benchmark/ground_truth/K562.csv", + "--score", score_filepath + ]) + + adata = ad.read_h5ad(score_filepath) + if "metric_values" in adata.uns: + metric_names = adata.uns["metric_ids"] + metric_values = adata.uns["metric_values"] + df = pd.DataFrame({"metric": metric_names, "value": metric_values}) + df["dataset"] = dataset + df["method"] = method + df = df[["dataset", "method", "metric", "value"]] # Reorder columns to match header + df.to_csv(f"output/tf_binding/tf_binding_scores_{dataset}.csv", mode="a", header=False, index=False) From 33c324a77922d13b9a0043f7307838279de5508f Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Sat, 11 Oct 2025 11:54:46 +0200 Subject: [PATCH 08/19] Deterministic algorithm for GRN baseline creation --- .../experimental/anchor_regression/helper.py | 15 +- src/metrics/experimental/recovery_2/helper.py | 16 +- .../experimental/regression_3/helper.py | 15 +- src/metrics/experimental/sem/helper.py | 16 +- src/metrics/experimental/vc/helper.py | 18 +- src/utils/baseline.py | 171 ++++++++++++++++++ 6 files changed, 187 insertions(+), 64 deletions(-) create mode 100644 src/utils/baseline.py diff --git a/src/metrics/experimental/anchor_regression/helper.py b/src/metrics/experimental/anchor_regression/helper.py index 9703825df..f823406a6 100644 --- a/src/metrics/experimental/anchor_regression/helper.py +++ b/src/metrics/experimental/anchor_regression/helper.py @@ -24,6 +24,7 @@ from util import read_prediction, manage_layer from dataset_config import DATASET_GROUPS +from baseline import create_grn_baseline def encode_obs_cols(adata, cols): @@ -280,18 +281,8 @@ def main(par): scaler = StandardScaler() X = scaler.fit_transform(X) - # Create baseline model: for each TG, shuffle the TFs - A_baseline = np.copy(A) - if signed: - A_baseline *= (2 * (np.random.randint(0, 2) - 0.5)) - tf_mask = np.any(A_baseline != 0, axis=1) - for j in range(A.shape[1]): - mask = np.copy(tf_mask) - mask[j] = False - if np.any(mask): - values = np.copy(A_baseline[mask, j]) - np.random.shuffle(values) - A_baseline[mask, j] = values + # Create baseline model + A_baseline = create_grn_baseline(A) # Compute gene stabilities scores, scores_baseline = [], [] diff --git a/src/metrics/experimental/recovery_2/helper.py b/src/metrics/experimental/recovery_2/helper.py index c94a7c8d9..b896ef773 100644 --- a/src/metrics/experimental/recovery_2/helper.py +++ b/src/metrics/experimental/recovery_2/helper.py @@ -14,6 +14,7 @@ np.random.seed(0) from util import manage_layer +from baseline import create_grn_baseline def encode_obs_cols(adata, cols): @@ -124,19 +125,8 @@ def evaluate_setting(C: np.ndarray, A: np.ndarray, setting_name: str, **kwargs) # Evaluate inferred GRN scores = evaluate_grn(C, A, **kwargs) - # Create baseline model: for each TG, shuffle the TFs - A_baseline = np.copy(A) - if signed: - A_baseline *= (2 * (np.random.randint(0, 2) - 0.5)) - tf_mask = np.any(A_baseline != 0, axis=1) - for j in range(A.shape[1]): - mask = np.copy(tf_mask) - mask[j] = False - if np.any(mask): - values = np.copy(A_baseline[mask, j]) - np.random.shuffle(values) - A_baseline[mask, j] = values - # assert np.any(A_baseline != A) + # Create baseline model + A_baseline = create_grn_baseline(A) # Evaluate baseline GRN scores_baseline = evaluate_grn(C, A_baseline, **kwargs) diff --git a/src/metrics/experimental/regression_3/helper.py b/src/metrics/experimental/regression_3/helper.py index 9d997cd67..caaa8bfbd 100644 --- a/src/metrics/experimental/regression_3/helper.py +++ b/src/metrics/experimental/regression_3/helper.py @@ -25,6 +25,7 @@ from util import read_prediction, manage_layer from dataset_config import DATASET_GROUPS +from baseline import create_grn_baseline def encode_obs_cols(adata, cols): @@ -131,18 +132,8 @@ def main(par): for group in np.unique(anchor_encoded): scores, baseline_scores = [], [] - # Create baseline model: for each TG, shuffle the TFs - A_baseline = np.copy(A) - if signed: - A_baseline *= (2 * (np.random.randint(0, 2) - 0.5)) - tf_mask = np.any(A_baseline != 0, axis=1) - for j in range(A.shape[1]): - mask = np.copy(tf_mask) - mask[j] = False - if np.any(mask): - values = np.copy(A_baseline[mask, j]) - np.random.shuffle(values) - A_baseline[mask, j] = values + # Create baseline model + A_baseline = create_grn_baseline(A) # Train/test split mask = (anchor_encoded != group) diff --git a/src/metrics/experimental/sem/helper.py b/src/metrics/experimental/sem/helper.py index a3fccda2e..87ebf42fb 100644 --- a/src/metrics/experimental/sem/helper.py +++ b/src/metrics/experimental/sem/helper.py @@ -37,6 +37,7 @@ from util import read_prediction, manage_layer +from baseline import create_grn_baseline from dataset_config import DATASET_GROUPS @@ -347,19 +348,8 @@ def main(par): print(f"Proportion of reporter genes: {np.mean(is_reporter)}") print(f"Use regulatory modes/signs: {use_signs}") - # Create baseline model: for each TG, shuffle the TFs - print(f"Creating baseline GRN") - A_baseline = np.copy(A) - if use_signs: - A_baseline *= (2 * (np.random.randint(0, 2) - 0.5)) - tf_mask = np.any(A_baseline != 0, axis=1) - for j in range(A.shape[1]): - mask = np.copy(tf_mask) - mask[j] = False - if np.any(mask): - values = np.copy(A_baseline[mask, j]) - np.random.shuffle(values) - A_baseline[mask, j] = values + # Create baseline model + A_baseline = create_grn_baseline(A) # Evaluate inferred GRN print("\n======== Evaluate inferred GRN ========") diff --git a/src/metrics/experimental/vc/helper.py b/src/metrics/experimental/vc/helper.py index 4d422da37..61378bb66 100644 --- a/src/metrics/experimental/vc/helper.py +++ b/src/metrics/experimental/vc/helper.py @@ -9,6 +9,7 @@ import pandas as pd import torch from scipy.sparse import csr_matrix +from scipy.spatial.distance import cityblock from scipy.stats import wilcoxon from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold from sklearn.preprocessing import StandardScaler, LabelEncoder @@ -36,7 +37,7 @@ def set_seed(): from util import read_prediction, manage_layer from dataset_config import DATASET_GROUPS -from scipy.spatial.distance import cityblock +from baseline import create_grn_baseline def combine_multi_index(*arrays) -> np.array: @@ -473,19 +474,8 @@ def main(par): if (len(train_index) == 0) or (len(test_index) == 0): continue - # Create baseline model: for each TG, shuffle the TFs - print(f"Creating baseline GRN") - A_baseline = np.copy(A) - if signed: - A_baseline *= (2 * (np.random.randint(0, 2) - 0.5)) - tf_mask = np.any(A_baseline != 0, axis=1) - for j in range(A.shape[1]): - mask = np.copy(tf_mask) - mask[j] = False - if np.any(mask): - values = np.copy(A_baseline[mask, j]) - np.random.shuffle(values) - A_baseline[mask, j] = values + # Create baseline model + A_baseline = create_grn_baseline(A) # Center and scale dataset scaler = StandardScaler() diff --git a/src/utils/baseline.py b/src/utils/baseline.py new file mode 100644 index 000000000..5d6232a0e --- /dev/null +++ b/src/utils/baseline.py @@ -0,0 +1,171 @@ +from typing import List + +import numpy as np + + +class WeightedDegree(object): + + def __init__(self, node_idx: int, degree: int, weight: float): + """Class used to sort nodes by in-degree or out-degree. + + Ties are broken by using the weights from the GRN. + + Args: + node_idx + degree: in-degree or out-degree of a node. + weight: sum of weights of incoming or outcoming edges. + """ + self.node_idx: int = int(node_idx) + self.degree: int = int(degree) + self.weight: float = float(weight) + + def __lt__(self, other: "WeightedDegree") -> bool: + if self.degree < other.degree: + return True + elif self.degree == other.degree: + return self.weight < other.weight + else: + return False + + def __eq__(self, other: "WeightedDegree") -> bool: + return (self.degree == other.degree) and (self.weight == other.weight) + + @staticmethod + def from_grn(A: np.ndarray, incoming: bool = True) -> List["WeightedDegree"]: + if not incoming: + A = A.T + D = (A != 0) + degrees = np.sum(D, axis=0) + weights = np.sum(A, axis=0) + return [WeightedDegree(i, degrees[i], weights[i]) for i in range(len(degrees))] + + +def create_grn_baseline(A): + """ + Deterministic baseline for a directed simple graph. + Preserves in/out degree sequences of A (A may be weighted/signed; nonzeros indicate edges). + Returns a weighted B: same topology as the deterministic baseline, with the exact weights from A + reassigned deterministically to different edges (no randomness). + """ + + n = A.shape[0] + + # Order nodes by degrees, with explicit tie-breaking by weight + in_degrees = WeightedDegree.from_grn(A, incoming=True) + out_degrees = WeightedDegree.from_grn(A, incoming=False) + in_order = np.argsort(in_degrees)[::-1] + out_order = np.argsort(out_degrees)[::-1] + + # Baseline GRN + B = np.zeros_like(A) + + for u in out_order: + k = out_degrees[u].degree + if k == 0: + continue + # Greedily fill from current in_order + picks = [] + for v in in_order: + if v == u or B[u, v] == 1 or in_degrees[v].degree == 0: + continue + picks.append(v) + if len(picks) == k: + break + + # Deterministic repair if not enough picks + if len(picks) < k: + # Try swaps: reassign some of u's future edges by freeing capacity deterministically + # Here, we do a deterministic second pass allowing one-time edge relocation + for v in in_order: + if v == u or in_degrees[v].degree == 0 or v in picks: + continue + + # Find w already chosen where we can swap capacity + swapped = False + for w in picks: + # Check if there exists x != u with B[x, w]==1 and B[x, v]==0 to swap (x,w)->(x,v) + # Deterministic scan by x id + for x in range(n): + if x == u: + continue + if B[x, w] == 1 and B[x, v] == 0 and x != v and x != w: + B[x, w] = 0 + B[x, v] = 1 + in_degrees[w].degree += 1 + in_degrees[v].degree -= 1 + picks.append(v) + swapped = True + break + if swapped or len(picks) == k: + break + if len(picks) == k: + break + + if len(picks) < k: + raise ValueError("Directed degree sequence not realizable with simple digraph under constraints.") + + # Place edges for u + for v in picks: + B[u, v] = 1 + in_degrees[v].degree -= 1 + #out_degrees[v].degree = 0 + out_degrees[u].degree = 0 + + # Stable re-sort by residual in-degree, with explicit tie-breaking by weight + in_order = np.argsort(in_degrees)[::-1] + + # Zero diagonal guarantee + np.fill_diagonal(B, 0) + + # Recompute initial incoming stats from A + init_in_degrees = WeightedDegree.from_grn(A, incoming=True) + + # Convenience arrays for deterministic target ranking + # (higher degree first, then higher total incoming weight, then smaller id) + init_in_deg_arr = np.array([wd.degree for wd in init_in_degrees]) + init_in_wgt_arr = np.array([wd.weight for wd in init_in_degrees]) + + for u in range(n): + + # Outgoing weights in A + mask_A_u = (A[u, :] != 0) + if not np.any(mask_A_u): + continue + orig_targets = np.where(mask_A_u)[0] + W = A[u, orig_targets].astype(float) + + # Sort weights deterministically: by |w| desc, then w asc, then orig target id asc + # (lexsort uses last key as primary) + w_keys_3 = np.abs(W) * -1.0 # primary: |w| descending + w_keys_2 = W # secondary: actual value ascending (keeps sign order stable) + w_keys_1 = orig_targets # tertiary: original target id ascending + order_w = np.lexsort((w_keys_1, w_keys_2, w_keys_3)) + W_sorted = W[order_w] + + # Targets in B for this source, ranked by original A's incoming difficulty/salience + # Rank by: in-degree desc, then incoming-weight desc, then id asc + targets_B = np.where(B[u, :] == 1)[0] + if targets_B.size == 0: + continue + t_deg = init_in_deg_arr[targets_B] + t_wgt = init_in_wgt_arr[targets_B] + t_id = targets_B + order_t = np.lexsort((t_id, -t_wgt, -t_deg)) # primary: -t_deg, then -t_wgt, then t_id + targets_ranked = targets_B[order_t] + + # Sanity: degrees should match. If not, deterministically trim/pad. + kA = W_sorted.size + kB = targets_ranked.size + if kA > kB: + # Drop evenly from both ends to avoid bias + excess = kA - kB + left = excess // 2 + right = excess - left + W_sorted = W_sorted[left: kA - right] + elif kA < kB: + W_sorted = np.concatenate([W_sorted, np.zeros(kB - kA, dtype=float)], axis=0) + + # Assign exact weights to new edges deterministically + B[u, targets_ranked] = W_sorted + + return B From 1c29769f88cfb3d133216b50719606e86f7973f6 Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Sun, 12 Oct 2025 17:05:39 +0200 Subject: [PATCH 09/19] Use immediate early genes as reporter genes --- src/metrics/experimental/sem/helper.py | 50 ++++++++++++++++++++------ 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/src/metrics/experimental/sem/helper.py b/src/metrics/experimental/sem/helper.py index 87ebf42fb..ac27a32ad 100644 --- a/src/metrics/experimental/sem/helper.py +++ b/src/metrics/experimental/sem/helper.py @@ -26,7 +26,7 @@ MAX_N_ITER = 500 # For reproducibility purposes -seed = 0xCAFF +seed = 0xCADD os.environ['PYTHONHASHSEED'] = str(seed) random.seed(seed) np.random.seed(seed) @@ -41,6 +41,35 @@ from dataset_config import DATASET_GROUPS +# List of immediate early genes (IEGs) - scCustomize v3.2.0 +IEG = [ + "CCL7" , "CCL2" , "TNFSF9" , "SIK1B" , "RASD1" , "NUP98", + "CEBPD" , "NFKBID" , "NOCT" , "FOS" , "IER2" , "HBEGF", + "RHEB" , "PLAU" , "IFNB1" , "PCDH8" , "WEE1" , "FBXO33", + "PMAIP1", "DUSP1" , "PLK2" , "TSC22D1" , "MAP3K8" , "PIAS1", + "KLF10" , "BDNF" , "CCN2" , "TRIB1" , "SOD2" , "IER3", + "PLAT" , "RCAN1" , "ZFP36" , "CCL5" , "NFKBIA" , "NRN1", + "KLF2" , "SERPINE1", "MAFF" , "NCOA7" , "GDF15" , "LDLR", + "TNF" , "CCRL2" , "CCL18" , "CCL3" , "FOSL1" , "DUSP2", + "INHBA" , "JUND" , "NR4A1" , "EGR3" , "IRF1" , "IFIT3", + "IFIT1B", "NR4A3" , "PER2" , "GADD45G" , "SOCS3" , "TLR2", + "PELI1" , "IL1A" , "RGS2" , "CSF2" , "F3" , "APOLD1", + "ACKR4" , "BHLHE40" , "IL23A" , "GBP2" , "IL1B" , "ARIH1", + "GBP2" , "JUN" , "NPTX2" , "FOSB" , "EGR1" , "MBNL2", + "VCAM1" , "ZFP36L2" , "ARF4" , "MMP13" , "JUNB" , "ZFP36L1", + "CCN1" , "NPAS4" , "PPP1R15A", "EGR4" , "NFKBIZ" , "ARC", + "MCL1" , "RGS1" , "KLF6" , "CLEC4E" , "SGK1" , "ARHGEF3", + "EGR2" , "IFIT2" , "ID2" , "DUSP6" , "SIK1" , "DUSP5", + "NFIB" , "SAA2" , "SAA1" , "THBS1" , "FOSL2" , "ICAM1", + "CXCL10", "CSRNP1" , "BCL3" , "MARCKSL1", "HOMER1" , "TRAF1", + "ATF3" , "FLG" , "SRF" , "PIM1" , "GADD45B", "HES1", + "GEM" , "TNFAIP3" , "PER1" , "CREM" , "CD69" , "IL6", + "BTG2" , "ACOD1" , "CEBPB" , "CXCL11" , "IL12B" , "NR4A2", + "PTGS2" , "IKBKE" , "TXNIP" , "CD83" , "IER5" , "IL10", + "MYC" , "CXCL12" , "SLC2A3" , "CXCL1" +] + + def encode_obs_cols(adata, cols): encoded = [] for col in cols: @@ -334,17 +363,16 @@ def main(par): is_train[train_idx] = True # Create a split between genes: reporter genes and evaluation genes. - # All TFs should be included in the reporter gene set. - # If the reporter gene set does not represent at least 50% of all genes, - # then we randomly add target genes until the 50% threshold is reached. + # All TFs and IEGs should be included in the reporter gene set. n_genes = A.shape[1] - reg_mask = np.asarray(A != 0).any(axis=1) # TF mask - is_reporter = np.copy(reg_mask) - if np.mean(is_reporter) < 0.5 * len(is_reporter): - idx = np.where(~is_reporter)[0] - np.random.shuffle(idx) - idx = idx[:int(0.5 * len(is_reporter) - np.sum(is_reporter))] - is_reporter[idx] = True + reg_mask = np.asarray(A != 0).any(axis=1) + ieg_mask = np.asarray([gene_name in IEG for gene_name in gene_names], dtype=bool) + is_reporter = np.logical_or(reg_mask, ieg_mask) + #if np.mean(is_reporter) < 0.5 * len(is_reporter): + # idx = np.where(~is_reporter)[0] + # np.random.shuffle(idx) + # idx = idx[:int(0.5 * len(is_reporter) - np.sum(is_reporter))] + # is_reporter[idx] = True print(f"Proportion of reporter genes: {np.mean(is_reporter)}") print(f"Use regulatory modes/signs: {use_signs}") From 8bce32ab642bc658415a12ff74239662f39cbee0 Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Fri, 17 Oct 2025 09:03:37 +0200 Subject: [PATCH 10/19] Minor change --- src/metrics/experimental/regression_3/helper.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/metrics/experimental/regression_3/helper.py b/src/metrics/experimental/regression_3/helper.py index 25c0acea2..4a7c5b246 100644 --- a/src/metrics/experimental/regression_3/helper.py +++ b/src/metrics/experimental/regression_3/helper.py @@ -26,6 +26,7 @@ from util import read_prediction, manage_layer from dataset_config import DATASET_GROUPS +from baseline import create_grn_baseline def encode_obs_cols(adata, cols): @@ -52,8 +53,8 @@ def compute_residual_correlations( y_test: np.ndarray, Z_test: np.ndarray ) -> np.ndarray: - model = xgboost.XGBRegressor(n_estimators=10) - #model = Ridge(alpha=10) + #model = xgboost.XGBRegressor(n_estimators=10) + model = Ridge(alpha=0.01) model.fit(X_train, y_train) y_hat = model.predict(X_test) residuals = y_test - y_hat @@ -115,7 +116,7 @@ def main(par): gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0)) in_degrees = np.sum(A != 0, axis=0) out_degrees = np.sum(A != 0, axis=1) - idx = np.argsort(np.maximum(out_degrees, in_degrees))[:-2000] + idx = np.argsort(np.maximum(out_degrees, in_degrees))[:-1000] gene_mask[idx] = False X = X[:, gene_mask] X = X.toarray() if isinstance(X, csr_matrix) else X @@ -127,11 +128,7 @@ def main(par): print(f"Evaluating {X.shape[1]} genes with {np.sum(A != 0)} regulatory links") # Create baseline model - A_baseline = np.copy(A) - for j in range(A.shape[1]): - np.random.shuffle(A_baseline[:j, j]) - np.random.shuffle(A_baseline[j+1:, j]) - assert np.any(A_baseline != A) + A_baseline = create_grn_baseline(A) scores, baseline_scores = [], [] for group in np.unique(anchor_encoded): From 5651d342d3e3693a8d76c93fe09f4e3a5e3bc99f Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Mon, 20 Oct 2025 12:30:49 +0200 Subject: [PATCH 11/19] Minor changes --- .../experimental/anchor_regression/helper.py | 51 +++--- src/metrics/experimental/vc/helper.py | 2 +- src/metrics/tf_binding/helper.py | 170 +++++------------- src/metrics/tf_binding/run_local.py | 8 +- 4 files changed, 82 insertions(+), 149 deletions(-) diff --git a/src/metrics/experimental/anchor_regression/helper.py b/src/metrics/experimental/anchor_regression/helper.py index f823406a6..93145bbaf 100644 --- a/src/metrics/experimental/anchor_regression/helper.py +++ b/src/metrics/experimental/anchor_regression/helper.py @@ -77,7 +77,7 @@ def anchor_regression( X: np.ndarray, Z: np.ndarray, Y: np.ndarray, - l2_reg: float = 1e-4, + l2_reg: float = 1e-5, gamma: float = 1.0 ) -> np.ndarray: """Anchor regression for causal inference under confounding. @@ -115,7 +115,6 @@ def cross_val( Z: np.ndarray, eps: float = 1e-50 ) -> float: - cv = KFold(5) ss_res, ss_tot = 0, 0 y_mean = np.mean(y) for idx_train, idx_test in cv.split(X, y): @@ -152,23 +151,23 @@ def compute_stabilities_v2( X: np.ndarray, y: np.ndarray, Z: np.ndarray, - A: np.ndarray, is_selected: np.ndarray, eps: float = 1e-50 -) -> Tuple[float, float]: - - theta0 = anchor_regression(X[:, is_selected], Z, y, gamma=1) - theta = anchor_regression(X[:, is_selected], Z, y, gamma=1.5) - s1 = np.clip(np.abs(theta0 - theta) / np.abs(theta0 + eps), 0, 1) +) -> float: - theta0 = anchor_regression(X[:, ~is_selected], Z, y, gamma=1) - theta = anchor_regression(X[:, ~is_selected], Z, y, gamma=1.5) - s2 = np.clip(np.abs(theta0 - theta) / np.abs(theta0 + eps), 0, 1) + theta0 = np.abs(anchor_regression(X, Z, y, gamma=1)) + theta0 /= np.sum(theta0) + theta = np.abs(anchor_regression(X, Z, y, gamma=2)) + theta /= np.sum(theta) + s1 = np.clip(np.abs(theta0[is_selected] - theta[is_selected]) / np.abs(theta0[is_selected] + eps), 0, 1) + s2 = np.clip(np.abs(theta0[~is_selected] - theta[~is_selected]) / np.abs(theta0[~is_selected] + eps), 0, 1) s1 = np.mean(s1) s2 = np.mean(s2) - return s1, s2 + stability = (s1 - s2) / (s1 + s2 + eps) + + return stability def evaluate_gene_stability( @@ -199,27 +198,16 @@ def evaluate_gene_stability( mask = np.ones(X.shape[1], dtype=bool) mask[j] = False - score = compute_stabilities(X[:, mask], X[:, j], Z, A[mask, j], is_selected[mask], eps=eps) - #stabilities_non_selected = np.mean(compute_stabilities(X[:, mask], X[:, j], Z, A, ~is_selected[mask], eps=eps)) - #score = (stabilities_selected - stabilities_non_selected) / (stabilities_selected + stabilities_non_selected + eps) - #score = np.mean(stabilities_selected) - - return score + return compute_stabilities_v2(X[:, mask], X[:, j], Z, is_selected[mask]) def main(par): """Main anchor regression evaluation function.""" + # Load evaluation data adata = ad.read_h5ad(par['evaluation_data']) dataset_id = adata.uns['dataset_id'] method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id'] - - # Get dataset-specific anchor variables - if dataset_id not in DATASET_GROUPS: - raise ValueError(f"Dataset {dataset_id} not found in DATASET_GROUPS") - - anchor_cols = DATASET_GROUPS[dataset_id].get('anchors', ['donor_id', 'plate_name']) - print(f"Using anchor variables: {anchor_cols}") # Manage layer layer = manage_layer(adata, par) @@ -231,9 +219,22 @@ def main(par): gene_names = adata.var_names gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)} + # Get dataset-specific anchor variables + if dataset_id not in DATASET_GROUPS: + raise ValueError(f"Dataset {dataset_id} not found in DATASET_GROUPS") + anchor_cols = DATASET_GROUPS[dataset_id].get('anchors', ['donor_id', 'plate_name']) + print(f"Using anchor variables: {anchor_cols}") + # Encode anchor variables anchor_variables = encode_obs_cols(adata, anchor_cols) anchor_encoded = combine_multi_index(*anchor_variables) + + # Get CV groups + if "cell_type" in adata.obs: + cv_groups = LabelEncoder().fit_transform(adata.obs["cell_type"].values) + else: + np.random.randint(0, 5) + cv_groups = np.random.shuffle() if len(anchor_variables) == 0: raise ValueError(f"No anchor variables found in dataset for columns: {anchor_cols}") diff --git a/src/metrics/experimental/vc/helper.py b/src/metrics/experimental/vc/helper.py index 61378bb66..8e8c678a2 100644 --- a/src/metrics/experimental/vc/helper.py +++ b/src/metrics/experimental/vc/helper.py @@ -441,7 +441,7 @@ def main(par): print(f"Using {len(gene_names)} genes present in the GRN") # Additional memory-aware gene filtering for very large GRNs - MAX_GENES_FOR_MEMORY = 2000 # Reduced further to avoid memory issues + MAX_GENES_FOR_MEMORY = 500 # Reduced further to avoid memory issues if len(gene_names) > MAX_GENES_FOR_MEMORY: print(f"Too many genes ({len(gene_names)}) for memory. Selecting top {MAX_GENES_FOR_MEMORY} by GRN connectivity.") diff --git a/src/metrics/tf_binding/helper.py b/src/metrics/tf_binding/helper.py index 4b8978791..9d4574470 100644 --- a/src/metrics/tf_binding/helper.py +++ b/src/metrics/tf_binding/helper.py @@ -1,144 +1,72 @@ +from typing import Optional, Dict -import math -import pandas as pd import numpy as np -from sklearn.metrics import average_precision_score -from tqdm import tqdm +import pandas as pd +from sklearn.metrics import recall_score, precision_score, matthews_corrcoef, roc_auc_score, average_precision_score from util import read_prediction -from scipy.stats import wilcoxon - -# Wilcoxon signed-rank test: AP vs prevalence -def wilcoxon_logp(x, y): - try: - stat, p = wilcoxon(x, y, zero_method='wilcox', alternative='greater') - return -np.log10(p) if p > 0 else np.inf - except Exception: - return np.nan # For reproducibility seed = 42 np.random.seed(seed) -def main(par): - # Load predictions - prediction = read_prediction(par) - assert prediction.shape[0] > 0, 'No links found in the network' - if not {'source', 'target', 'weight'}.issubset(set(prediction.columns)): - raise ValueError("prediction must have columns: 'source', 'target', 'weight'") +def load_grn_from_dataframe( + df: pd.DataFrame, + tf_dict: Dict[str, int], + tg_dict: Dict[str, int] +) -> np.ndarray: + G = np.zeros((len(tf_dict), len(tg_dict)), dtype=float) + for tf_name, tg_name, weight in zip(df['source'], df['target'], df['weight']): + if tf_name not in tf_dict: + continue + if tg_name not in tg_dict: + continue + i = tf_dict[tf_name] + j = tg_dict[tg_name] + G[i, j] = weight + return G + + +def main(par): - # Load TF list and ground truth edges - tf_all = np.loadtxt(par['tf_all'], dtype=str, delimiter=',', skiprows=1) + # Load ground-truth edges (consider only TFs listed in the file loaded hereabove) true_graph = pd.read_csv(par['ground_truth']) if not {'source', 'target'}.issubset(set(true_graph.columns)): raise ValueError("ground_truth must have columns: 'source', 'target'") + if 'weight' not in true_graph: + true_graph['weight'] = np.ones(len(true_graph)) - # Keep only TFs that are in the provided TF list - true_graph = true_graph[true_graph['source'].isin(tf_all)] - assert true_graph.shape[0] > 0, 'No links found in the ground truth after filtering to TF list' - - # Use the union of all targets that appear in either predictions or ground-truth - pred_targets = set(prediction['target'].astype(str).unique()) - true_targets = set(true_graph['target'].astype(str).unique()) - evaluation_genes = sorted(pred_targets.union(true_targets)) - if len(evaluation_genes) == 0: - raise ValueError("Empty evaluation target set (no targets in predictions or ground truth).") - - # Precompute lookup to avoid O(n^2) - gene_to_idx = {g: i for i, g in enumerate(evaluation_genes)} - n_targets_total = len(evaluation_genes) - - scores_model = [] - # Iterate over TFs that actually have at least one positive in ground-truth - for tf in tqdm(true_graph['source'].unique()): - # Positives for this TF - true_edges = true_graph[true_graph['source'] == tf] - y_true = np.zeros(n_targets_total, dtype=int) - - # Mark positives - for t in true_edges['target'].astype(str): - y_true[gene_to_idx[t]] = 1 - assert y_true.sum() > 0, f'TF {tf} has no positive targets in ground truth' - - # Scores over all candidate targets - y_score = np.zeros(n_targets_total, dtype=float) - if tf in prediction['source'].unique(): - pred_edges = prediction[prediction['source'] == tf] - - # Fill scores where predicted - for tgt, w in zip(pred_edges['target'].astype(str), pred_edges['weight'].abs().astype(float)): - idx = gene_to_idx.get(tgt, None) - if idx is not None: - y_score[idx] = w - - # Ensure numerical safety - y_score = np.nan_to_num(y_score, nan=0.0, posinf=0.0, neginf=0.0) - # AP requires at least one positive - ap = average_precision_score(y_true, y_score) - else: - ap = np.nan - - # Random baseline AUPRC is just the prevalence of positives - prevalence = y_true.mean() - - # Calculate recall@k for this TF - - if tf in prediction['source'].unique() and not np.isnan(ap): - gt_targets = set(true_edges['target'].astype(str)) - k = len(gt_targets) # Top k predictions to consider - # Get top k predicted targets for this TF - genes_in_grn = set(pred_edges.sort_values(by='weight', key=abs, ascending=False).head(k)['target'].astype(str).unique() ) - - # Calculate recall@k - true_positives_in_top_k = len(gt_targets & genes_in_grn) - recall_at_k = true_positives_in_top_k / len(gt_targets) - - # Calculate random recall baseline - random_recall = len(genes_in_grn) / n_targets_total - lift_recall = recall_at_k / (random_recall + 1e-10) - else: - recall_at_k = np.nan - lift_recall = np.nan - - scores_model.append({ - 'source': tf, - 'ap': ap, - 'n_targets_pos': int(y_true.sum()), - 'prevalence': prevalence, - 'recall_at_k': recall_at_k, - 'lift_recall': lift_recall - }) - - from scipy.stats import wilcoxon - - scores_df = pd.DataFrame(scores_model) - ap_values_pred = scores_df['ap'].dropna().values - preval_all = scores_df['prevalence'].values - preval_pred = scores_df[~scores_df['ap'].isna()]['prevalence'].values - ap_pred_mean = ap_values_pred.mean() if len(ap_values_pred) > 0 else 0.0 - preval_pred_mean = preval_pred.mean() if len(preval_pred) > 0 else 0.0 - lift_pred = ap_pred_mean / (preval_pred_mean + 1e-10) + # Load inferred GRN + prediction = read_prediction(par) + assert prediction.shape[0] > 0, 'No links found in the network' + if not {'source', 'target', 'weight'}.issubset(set(prediction.columns)): + raise ValueError("prediction must have columns: 'source', 'target', 'weight'") - # Calculate mean recall metrics - recall_values_pred = scores_df['recall_at_k'].dropna().values - lift_recall_values_pred = scores_df['lift_recall'].dropna().values - lift_recall_values_all = scores_df['lift_recall'].fillna(1).values + # Intersect TF lists, intersect TG lists + tf_names = set(true_graph['source'].unique()) #.intersection(set(tf_all)) + tg_names = set(true_graph['target'].unique()) + #tf_names = set(true_graph['source'].unique()).intersection(set(prediction['source'].unique())) #.intersection(set(tf_all)) + #tg_names = set(true_graph['target'].unique()).intersection(set(prediction['target'].unique())) + tf_dict = {tf_name: i for i, tf_name in enumerate(tf_names)} + tg_dict = {tg_name: i for i, tg_name in enumerate(tg_names)} - mean_recall_at_k = recall_values_pred.mean() if len(recall_values_pred) > 0 else 0.0 - mean_lift_recall = lift_recall_values_pred.mean() if len(lift_recall_values_pred) > 0 else 1 - mean_lift_recall_all = lift_recall_values_all.mean() if len(lift_recall_values_all) > 0 else 1 + # Reformat GRNs as NumPy arrays + A = load_grn_from_dataframe(true_graph, tf_dict, tg_dict) + G = load_grn_from_dataframe(prediction, tf_dict, tg_dict) - # Calculate TF coverage - n_tfs_in_gt = true_graph['source'].nunique() - n_tfs_in_grn = set(prediction['source'].unique()) & set(true_graph['source'].unique()) - n_tfs_in_grn = len(n_tfs_in_grn) + G = np.abs(G) + G += np.random.uniform(0, 1e-20, size=G.shape) - recall = mean_lift_recall_all * mean_lift_recall + # Evaluate inferred GRN + tf_binding_precision = precision_score(A.flatten(), (G != 0).flatten()) + tf_binding_recall = recall_score(A.flatten(), (G != 0).flatten()) + final_score = roc_auc_score(A.flatten(), G.flatten()) summary_df = pd.DataFrame([{ - 'tf_binding_precision': lift_pred, - 'tf_binding_recall': recall + 'tf_binding_precision': tf_binding_precision, + 'tf_binding_recall': tf_binding_recall, + 'final_score': final_score }]) - return summary_df \ No newline at end of file + return summary_df diff --git a/src/metrics/tf_binding/run_local.py b/src/metrics/tf_binding/run_local.py index e509c1496..41470757f 100644 --- a/src/metrics/tf_binding/run_local.py +++ b/src/metrics/tf_binding/run_local.py @@ -4,9 +4,13 @@ import pandas as pd -for dataset in ["replogle", "norman", "adamson"]: - for method in ["negative_control", "pearson_corr", "positive_control", "ppcor", "portia", "scenic", "grnboost", "scprint", "scenicplus", "celloracle", "scglue", "figr", "granie"]: +for dataset in ["op"]: + #for method in ['granie']: + for method in ["negative_control", "granie", "ppcor", "portia", "pearson_corr", "positive_control", "scenic", "grnboost", "scprint", "scenicplus", "celloracle", "scglue", "figr"]: + print() + print(method) + score_filepath = f"output/tf_binding/tf_binding.h5ad" subprocess.call([ "python", From 0f6cf0bb5eb97f3f978aa4fc4c47719858781738 Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Mon, 20 Oct 2025 12:33:11 +0200 Subject: [PATCH 12/19] Minor change --- src/metrics/tf_binding/run_local.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metrics/tf_binding/run_local.py b/src/metrics/tf_binding/run_local.py index 41470757f..0599d0d8f 100644 --- a/src/metrics/tf_binding/run_local.py +++ b/src/metrics/tf_binding/run_local.py @@ -4,7 +4,7 @@ import pandas as pd -for dataset in ["op"]: +for dataset in ["replogle"]: #for method in ['granie']: for method in ["negative_control", "granie", "ppcor", "portia", "pearson_corr", "positive_control", "scenic", "grnboost", "scprint", "scenicplus", "celloracle", "scglue", "figr"]: From 7db1968b856d38a0d94a8bb86ea6929508d06439 Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Thu, 23 Oct 2025 00:59:00 +0200 Subject: [PATCH 13/19] tf_binding GT: strand-aware promoter region --- src/metrics/tf_binding/acquire/script.py | 30 +++++++++++++----------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/metrics/tf_binding/acquire/script.py b/src/metrics/tf_binding/acquire/script.py index fb2efa5c5..b00445258 100644 --- a/src/metrics/tf_binding/acquire/script.py +++ b/src/metrics/tf_binding/acquire/script.py @@ -1,5 +1,6 @@ import pandas as pd import os +import numpy as np import urllib.request from tqdm import tqdm @@ -77,11 +78,13 @@ def download_tss_file(tss_local_path): TSS_FILE = 'refGene.txt.gz' TSS_URL = f'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/{TSS_FILE}' TSS_PATH = os.path.join(local_path, TSS_FILE) + if not os.path.exists(tss_local_path): print(f'Downloading TSS (refGene) from {TSS_URL} to {tss_local_path}') + os.makedirs(os.path.dirname(tss_local_path), exist_ok=True) urllib.request.urlretrieve(TSS_URL, tss_local_path) -def genes_with_peaks_near_tss(peaks_df, tss_file, window_bp=1000): +def genes_with_peaks_near_tss(peaks_df, tss_file, window_up=1000, window_down=100): if peaks_df is None or len(peaks_df) == 0: raise ValueError("Peaks DataFrame is empty or None.") @@ -99,11 +102,11 @@ def genes_with_peaks_near_tss(peaks_df, tss_file, window_bp=1000): tx_end = pd.to_numeric(ref.iloc[:, 5], errors='coerce').fillna(0).astype(int) gene = ref.iloc[:, 12].astype(str) tss = tx_start.where(strand == '+', tx_end) - win_start = (tss - int(window_bp)).clip(lower=0) - win_end = tss + int(window_bp) + win_start = np.where(strand == '+', tss - window_up, tss - window_down)[0] + win_end = np.where(strand == '+', tss + window_down, tss + window_up)[0] genes_df = pd.DataFrame({ 'chrom': chrom, - 'start': win_start.astype(int), + 'start': np.clip(win_start, 0, None).astype(int), 'end': win_end.astype(int), 'gene': gene.str.strip(), }) @@ -129,22 +132,22 @@ def genes_with_peaks_near_tss(peaks_df, tss_file, window_bp=1000): from tqdm import tqdm from concurrent.futures import ProcessPoolExecutor, as_completed -def _process_tf(tf, peaks_df, tss_file, window_bp): +def _process_tf(tf, peaks_df, tss_file): peaks = peaks_df[peaks_df['tf'] == tf] if peaks is None or len(peaks) == 0: return [] - genes = genes_with_peaks_near_tss(peaks, tss_file, window_bp=window_bp) + genes = genes_with_peaks_near_tss(peaks, tss_file) if not genes: return [] return [{'source': tf, 'target': g} for g in genes] -def build_grn(peaks_df, tss_file, genome='hg38', window_bp=1000, max_workers=None): +def build_grn(peaks_df, tss_file, genome='hg38', max_workers=None): tfs = peaks_df['tf'].unique() rows = [] # Use ProcessPoolExecutor for parallel processing with ProcessPoolExecutor(max_workers=max_workers) as executor: - futures = {executor.submit(_process_tf, tf, peaks_df, tss_file, window_bp): tf for tf in tfs} + futures = {executor.submit(_process_tf, tf, peaks_df, tss_file): tf for tf in tfs} for future in tqdm(as_completed(futures), total=len(futures)): result = future.result() if result: @@ -154,7 +157,6 @@ def build_grn(peaks_df, tss_file, genome='hg38', window_bp=1000, max_workers=Non return grn if __name__ == '__main__': genome = 'hg38' - window_bp = 1000 qval = "50" local_path = 'resources/datasets_raw/chipseq/' tss_local_path = os.path.join(local_path, 'TSS.txt.gz') @@ -172,16 +174,16 @@ def build_grn(peaks_df, tss_file, genome='hg38', window_bp=1000, max_workers=Non peaks_df = read_peak_file(peaks_file, source='chip_atlas') if True: # Filter for score == 1000 - print('Filtering peaks with score > 0', flush=True) + print('Filtering peaks with score > 50', flush=True) print(f'Original number of peaks: {len(peaks_df)}', flush=True) - peaks_df = peaks_df[peaks_df['score'] > 0] + peaks_df = peaks_df[peaks_df['score'] > 50] print(f'Number of peaks after filtering: {len(peaks_df)}', flush=True) - grn = build_grn(peaks_df, tss_local_path, genome=genome, window_bp=window_bp, max_workers=10) + grn = build_grn(peaks_df, tss_local_path, genome=genome, max_workers=10) grn.to_csv(output_csv_path, index=False) if True: # remap print('Processing remap2', flush=True) - if False: + if True: print('Processing remap2022_all_macs2_hg38_v1_0', flush=True) peaks_file = f"{local_path}/remap/remap2022_all_macs2_hg38_v1_0.bed.gz" print('Reading peaks file', flush=True) @@ -218,7 +220,7 @@ def build_grn(peaks_df, tss_file, genome='hg38', window_bp=1000, max_workers=Non df = df[df['score'] > 0] print("After filtering, ", df.shape[0], " peaks") - grn = build_grn(df, tss_local_path, genome=genome, window_bp=window_bp) + grn = build_grn(df, tss_local_path, genome=genome) output_csv_path = f'resources/grn_benchmark/ground_truth/{cell_type}_remap.csv' print(f'Writing GRN to {output_csv_path}', flush=True) grn.to_csv(output_csv_path, index=False) From 71cc64f48a48a8400b8d749b2e7e06b122487756 Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Sat, 25 Oct 2025 17:29:45 +0200 Subject: [PATCH 14/19] Add BEELINE datasets --- src/methods/portia/script.py | 4 +- src/metrics/gt/acquire/run_local.sh | 13 ++++ src/metrics/gt/acquire/script.py | 19 ++++++ src/metrics/gt/helper.py | 71 ++++++++++++++++++++ src/metrics/gt/run_local.py | 37 ++++++++++ src/metrics/gt/run_local.sh | 78 ++++++++++++++++++++++ src/metrics/gt/script.py | 45 +++++++++++++ src/metrics/tf_binding/acquire/script.py | 4 +- src/process_data/beeline/config.novsh.yaml | 53 +++++++++++++++ src/process_data/beeline/script.py | 73 ++++++++++++++++++++ src/utils/util.py | 48 +++++++++++++ 11 files changed, 440 insertions(+), 5 deletions(-) create mode 100644 src/metrics/gt/acquire/run_local.sh create mode 100644 src/metrics/gt/acquire/script.py create mode 100644 src/metrics/gt/helper.py create mode 100644 src/metrics/gt/run_local.py create mode 100644 src/metrics/gt/run_local.sh create mode 100644 src/metrics/gt/script.py create mode 100644 src/process_data/beeline/config.novsh.yaml create mode 100644 src/process_data/beeline/script.py diff --git a/src/methods/portia/script.py b/src/methods/portia/script.py index d2f0cfab5..2ab94a09b 100644 --- a/src/methods/portia/script.py +++ b/src/methods/portia/script.py @@ -44,19 +44,17 @@ def main(par): tf_names = [gene_name for gene_name in gene_names if (gene_name in tfs)] tf_idx = np.asarray([i for i, gene_name in enumerate(gene_names) if gene_name in tf_names], dtype=int) - print('Inferring grn') dataset = pt.GeneExpressionDataset() for exp_id, data in enumerate(X): dataset.add(pt.Experiment(exp_id, data)) - + M_bar = pt.run(dataset, tf_idx=tf_idx, method='no-transform') ranked_scores = pt.rank_scores(M_bar, gene_names, limit=par['max_n_links']) sources, targets, weights = zip(*[(gene_a, gene_b, score) for gene_a, gene_b, score in ranked_scores]) grn = pd.DataFrame({'source':sources, 'target':targets, 'weight':weights}) - print(grn.shape) grn = grn[grn.source.isin(tf_names)] grn = process_links(grn, par) diff --git a/src/metrics/gt/acquire/run_local.sh b/src/metrics/gt/acquire/run_local.sh new file mode 100644 index 000000000..c735e7857 --- /dev/null +++ b/src/metrics/gt/acquire/run_local.sh @@ -0,0 +1,13 @@ +#!/bin/bash +#SBATCH --job-name=beeline_data +#SBATCH --output=logs/%j.out +#SBATCH --error=logs/%j.err +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=2 +#SBATCH --time=10:00:00 +#SBATCH --mem=250GB +#SBATCH --partition=cpu +#SBATCH --mail-type=END,FAIL +#SBATCH --mail-user=jalil.nourisa@gmail.com + +python src/metrics/tf_binding/acquire/script.py \ No newline at end of file diff --git a/src/metrics/gt/acquire/script.py b/src/metrics/gt/acquire/script.py new file mode 100644 index 000000000..5c4eaad6f --- /dev/null +++ b/src/metrics/gt/acquire/script.py @@ -0,0 +1,19 @@ +import os +import sys +import io +import zipfile +import tempfile +import requests +from pathlib import Path + + +sys.path.append("src/utils") +from util import download_and_uncompress_zip + + +if __name__ == "__main__": + + download_and_uncompress_zip( + "https://zenodo.org/records/3701939/files/BEELINE-Networks.zip?download=1", + "resources/grn_benchmark/ground_truth/beeline" + ) diff --git a/src/metrics/gt/helper.py b/src/metrics/gt/helper.py new file mode 100644 index 000000000..2e2adc853 --- /dev/null +++ b/src/metrics/gt/helper.py @@ -0,0 +1,71 @@ +from typing import Optional, Dict + +import numpy as np +import pandas as pd +from sklearn.metrics import recall_score, precision_score, matthews_corrcoef, roc_auc_score, average_precision_score +from util import read_prediction + + +# For reproducibility +seed = 42 +np.random.seed(seed) + + +def load_grn_from_dataframe( + df: pd.DataFrame, + tf_dict: Dict[str, int], + tg_dict: Dict[str, int] +) -> np.ndarray: + G = np.zeros((len(tf_dict), len(tg_dict)), dtype=float) + for tf_name, tg_name, weight in zip(df['source'], df['target'], df['weight']): + if tf_name not in tf_dict: + continue + if tg_name not in tg_dict: + continue + i = tf_dict[tf_name] + j = tg_dict[tg_name] + G[i, j] = weight + return G + + +def main(par): + + # Load ground-truth edges (consider only TFs listed in the file loaded hereabove) + true_graph = pd.read_csv(par['ground_truth']) + if not {'Gene1', 'Gene2'}.issubset(set(true_graph.columns)): + raise ValueError("ground_truth must have columns: 'Gene1', 'Gene2'") + if 'weight' not in true_graph: + true_graph['weight'] = np.ones(len(true_graph)) + true_graph.rename(columns={"Gene1": "source", "Gene2": "target"}, copy=False, inplace=True) + + # Load inferred GRN + prediction = read_prediction(par) + assert prediction.shape[0] > 0, 'No links found in the network' + if not {'source', 'target', 'weight'}.issubset(set(prediction.columns)): + raise ValueError("prediction must have columns: 'source', 'target', 'weight'") + + # Intersect TF lists, intersect TG lists + tf_names = set(true_graph['source'].unique()) #.intersection(set(tf_all)) + tg_names = set(true_graph['target'].unique()) + #tf_names = set(true_graph['source'].unique()).intersection(set(prediction['source'].unique())) #.intersection(set(tf_all)) + #tg_names = set(true_graph['target'].unique()).intersection(set(prediction['target'].unique())) + tf_dict = {tf_name: i for i, tf_name in enumerate(tf_names)} + tg_dict = {tg_name: i for i, tg_name in enumerate(tg_names)} + + # Reformat GRNs as NumPy arrays + A = load_grn_from_dataframe(true_graph, tf_dict, tg_dict) + G = load_grn_from_dataframe(prediction, tf_dict, tg_dict) + G = np.abs(G) + + # Evaluate inferred GRN + tf_binding_precision = precision_score((A != 0).flatten(), (G != 0).flatten()) + tf_binding_recall = recall_score((A != 0).flatten(), (G != 0).flatten()) + final_score = roc_auc_score(A.flatten(), G.flatten()) + + summary_df = pd.DataFrame([{ + 'tf_binding_precision': tf_binding_precision, + 'tf_binding_recall': tf_binding_recall, + 'final_score': final_score + }]) + + return summary_df diff --git a/src/metrics/gt/run_local.py b/src/metrics/gt/run_local.py new file mode 100644 index 000000000..3d8e1898e --- /dev/null +++ b/src/metrics/gt/run_local.py @@ -0,0 +1,37 @@ +import subprocess + +import anndata as ad +import pandas as pd + + +results = {"dataset": [], "method": [], "score": []} +for dataset in ["beeline_hESC", "beeline_hHep", "beeline_mDC", "beeline_mESC", "beeline_mESC-E", "beeline_mHSC-E", "beeline_mHSC-L"]: + #for method in ["negative_control", "granie", "ppcor", "portia", "pearson_corr", "positive_control", "scenic", "grnboost", "scprint", "scenicplus", "celloracle", "scglue", "figr"]: + #for method in ['granie', 'ppcor', 'portia', 'negative_control']: + for method in ['portia']: + + print() + print(method) + + score_filepath = f"output/gt/gt.h5ad" + if "_h" in dataset: + gt = f"human/{dataset.split('_')[1]}-ChIP-seq-network" + else: + gt = f"mouse/{dataset.split('_')[1]}-ChIP-seq-network" + subprocess.call([ + "python", + "src/metrics/gt/script.py", + "--prediction", f"resources/results/{dataset}/{method}.h5ad", + "--evaluation_data", f'resources/grn_benchmark/inference_data/{dataset}.h5ad', + "--ground_truth", f"resources/grn_benchmark/ground_truth/beeline/Networks/{gt}.csv", + "--score", score_filepath + ]) + + adata = ad.read_h5ad(score_filepath) + if "metric_values" in adata.uns: + metric_names = adata.uns["metric_ids"] + metric_values = adata.uns["metric_values"] + print(metric_values) + +df = pd.DataFrame(results) +df.to_csv(f"output/gt/gt_scores_beeline.csv", header=True, index=False) diff --git a/src/metrics/gt/run_local.sh b/src/metrics/gt/run_local.sh new file mode 100644 index 000000000..0b2ef233a --- /dev/null +++ b/src/metrics/gt/run_local.sh @@ -0,0 +1,78 @@ +#!/bin/bash +#SBATCH --job-name=tf_binding +#SBATCH --output=logs/%j.out +#SBATCH --error=logs/%j.err +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=20 +#SBATCH --time=30:00:00 +#SBATCH --mem=250GB +#SBATCH --partition=cpu +#SBATCH --mail-type=END,FAIL +#SBATCH --mail-user=jalil.nourisa@gmail.com + +set -euo pipefail + +save_dir="output/tf_binding" +mkdir -p "$save_dir" + +# datasets to process +datasets=( 'replogle' 'norman' 'adamson' ) # 'xaira_HCT116' 'replogle' 'norman' 'adamson') #"300BCG" "ibd" 'parsebioscience''op' "300BCG" 'parsebioscience' 'replogle' 'norman' 'adamson' +# methods to process +methods=( "pearson_corr" "negative_control" "positive_control" "ppcor" "portia" "scenic" "grnboost" "scprint" "scenicplus" "celloracle" "scglue" "figr" "granie") + +for dataset in "${datasets[@]}"; do + echo -e "\n\nProcessing dataset: $dataset\n" + + # Create separate CSV file for each dataset + dataset_csv="${save_dir}/tf_binding_scores_${dataset}.csv" + echo "dataset,method,metric,value" > "$dataset_csv" + + evaluation_data="resources/grn_benchmark/evaluation_data/${dataset}_bulk.h5ad" + + for method in "${methods[@]}"; do + prediction="resources/results/${dataset}/${dataset}.${method}.${method}.prediction.h5ad" + score="${save_dir}/tf_binding_${dataset}_${method}.h5ad" + + if [[ ! -f "$prediction" ]]; then + echo "File not found: $prediction, skipping..." + continue + fi + if [[ "$dataset" == "replogle" || "$dataset" == "norman" || "$dataset" == "adamson" ]]; then + ground_truth="resources/grn_benchmark/ground_truth/K562_remap.csv" + elif [[ "$dataset" == "xaira_HEK293T" ]]; then + ground_truth="resources/grn_benchmark/ground_truth/HEK293T_remap.csv" + elif [[ "$dataset" == "xaira_HCT116" ]]; then + ground_truth="resources/grn_benchmark/ground_truth/HCT116_chipatlas.csv" + else + echo "No ground truth available for dataset: $dataset, skipping..." + continue + fi + + echo -e "\nProcessing method: $method\n" + python src/metrics/tf_binding/script.py \ + --prediction "$prediction" \ + --evaluation_data "$evaluation_data" \ + --ground_truth "$ground_truth" \ + --score "$score" + + # Extract metrics from the .h5ad and append to CSV + python -u - < 50', flush=True) + print('Filtering peaks with score > 0', flush=True) print(f'Original number of peaks: {len(peaks_df)}', flush=True) - peaks_df = peaks_df[peaks_df['score'] > 50] + peaks_df = peaks_df[peaks_df['score'] > 0] print(f'Number of peaks after filtering: {len(peaks_df)}', flush=True) grn = build_grn(peaks_df, tss_local_path, genome=genome, max_workers=10) grn.to_csv(output_csv_path, index=False) diff --git a/src/process_data/beeline/config.novsh.yaml b/src/process_data/beeline/config.novsh.yaml new file mode 100644 index 000000000..922e7a7de --- /dev/null +++ b/src/process_data/beeline/config.novsh.yaml @@ -0,0 +1,53 @@ + +name: beeline +namespace: "process_data" +info: + label: Process BEELINE dataset + summary: "Process sourced BEELINE data to generate inference and evaluation datasets" + +arguments: + - name: --replogle_gwps + type: file + required: true + direction: input + - name: --tf_all + type: file + required: true + direction: input + - name: --replogle_test_perturbs + type: file + required: true + direction: input + - name: --replogle_gwps_test_sc + type: file + required: true + direction: output + - name: --replogle_gwps_train_sc + type: file + required: true + direction: output + - name: --replogle_gwps_train_sc_subset + type: file + required: true + direction: output + + +resources: + - type: python_script + path: script.py + - path: /src/utils/util.py + dest: util.py + +engines: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: [ numpy==1.26.4 ] + - type: native + +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/process_data/beeline/script.py b/src/process_data/beeline/script.py new file mode 100644 index 000000000..72c03dfd7 --- /dev/null +++ b/src/process_data/beeline/script.py @@ -0,0 +1,73 @@ + +import os +import anndata as ad +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import sys +import scanpy as sc +from sklearn.model_selection import train_test_split + +from scipy.sparse import csr_matrix + +## VIASH START +par = { +} +## VIASH END + +try: + sys.path.append(meta["resources_dir"]) +except: + meta = { + 'resources_dir': 'src/process_data/', + 'util_dir': 'src/utils', + } + sys.path.append(meta["resources_dir"]) + sys.path.append(meta["util_dir"]) + +from helper_data import sum_by +from util import download_and_uncompress_zip + + +def add_metadata(adata, name: str): + adata.uns['dataset_summary'] = 'Processed experimental single-cell gene expression datasets used in BEELINE.' + adata.uns['dataset_description'] = 'Processed experimental single-cell gene expression datasets used in BEELINE.' + adata.uns['data_reference'] = "@dataset{aditya_pratapa_2020_3701939,\nauthor = {Aditya Pratapa and Amogh Jalihal and Jeffrey Law and Aditya Bharadwaj and T M Murali},\n title = {Benchmarking algorithms for gene regulatory network inference from single-cell transcriptomic data },\nmonth = mar,\nyear = 2020,\npublisher = {Zenodo},\ndoi = {10.5281/zenodo.3701939},\nurl = {https://doi.org/10.5281/zenodo.3701939},\n}" + adata.uns['data_url'] = 'https://zenodo.org/records/3701939' + adata.uns['dataset_id'] = f'beeline_{name}' + adata.uns['dataset_name'] = f'BEELINE_{name}' + adata.uns['dataset_organism'] = 'human' if name.startswith('h') else 'mouse' + adata.uns['normalization_id'] = 'X_norm' + return adata + + +if __name__ == '__main__': + + # - get the data + download_and_uncompress_zip( + "https://zenodo.org/records/3701939/files/BEELINE-data.zip?download=1", + "resources/datasets_raw/beeline" + ) + + # Convert CSV files to H5AD + for name in ["hESC", "hHep", "mDC", "mESC", "mHSC-E", "mHSC-GM", "mHSC-L"]: + filepath = f"resources/datasets_raw/beeline/BEELINE-data/inputs/scRNA-Seq/{name}/ExpressionData.csv" + + df = pd.read_csv(filepath, index_col=0).transpose() + adata = ad.AnnData(X=df.values, obs=df.index.to_frame(index=False), var=pd.DataFrame(index=df.columns)) + adata.var.index.name = "gene_name" + adata.layers['X_norm'] = adata.X.copy() + + # Make all obs/var column names strings and clean up a bit + adata.obs.columns = pd.Index(adata.obs.columns).map(str).str.replace("/", "_") + adata.var.columns = pd.Index(adata.var.columns).map(str).str.replace("/", "_") + + # - filter genes and cells + sc.pp.filter_cells(adata, min_genes=100) + sc.pp.filter_genes(adata, min_cells=10) + + # - add metadata + adata = add_metadata(adata, name) + + # - save + adata.write(f"resources/grn_benchmark/inference_data/beeline_{name}.h5ad") diff --git a/src/utils/util.py b/src/utils/util.py index e54ef497d..14566bf2f 100644 --- a/src/utils/util.py +++ b/src/utils/util.py @@ -1,9 +1,16 @@ +import io +import os +import zipfile +import tempfile +import requests +from pathlib import Path import pandas as pd import anndata as ad import numpy as np from tqdm import tqdm import scipy.sparse as sp + def naming_convention(dataset, method): if (dataset in ['replogle', 'parsescience', 'xaira_HEK293T']) & (method in ['scprint']): dataset = f'{dataset}_sc' @@ -400,6 +407,47 @@ def download_annotation(par): print(f"Failed to download the gencode.v45.annotation.gtf.gz. Status code: {response.status_code}") print("Downloading prior ended") +def download_and_uncompress_zip(url: str, folder: str) -> None: + + chunk_size = 1 << 20 + dest = Path(folder) + dest.mkdir(parents=True, exist_ok=True) + + # Stream download -> seekable temp file (ZipFile needs seekable) + with requests.get(url, stream=True, timeout=60) as r: + r.raise_for_status() + with tempfile.TemporaryFile() as tf: + for chunk in r.iter_content(chunk_size=chunk_size): + if chunk: # skip keep-alives + tf.write(chunk) + tf.seek(0) + + with zipfile.ZipFile(tf) as zf: + for member in zf.infolist(): + print(member) + # Normalize path and prevent traversal outside `dest` + extracted_path = dest / Path(member.filename).as_posix() + normalized = extracted_path.resolve() + + if not str(normalized).startswith(str(dest.resolve()) + os.sep) and normalized != dest.resolve(): + raise RuntimeError(f"Unsafe path in ZIP: {member.filename}") + + if member.is_dir(): + normalized.mkdir(parents=True, exist_ok=True) + continue + + # Ensure parent dirs exist + normalized.parent.mkdir(parents=True, exist_ok=True) + + # Extract the file + with zf.open(member, "r") as source, open(normalized, "wb") as target: + while True: + chunk = source.read(1 << 20) + if not chunk: + break + target.write(chunk) + + def read_gtf_as_df(gtf_path: str) -> pd.DataFrame: """ Read a GTF/GFF3 file (plain or gzipped) and return gene-level annotation From b79593c15e79f5fafdce47743365e9005eab28af Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Mon, 27 Oct 2025 13:45:33 +0100 Subject: [PATCH 15/19] Improve anchor_regression metric --- .../experimental/anchor_regression/helper.py | 49 ++----------------- 1 file changed, 5 insertions(+), 44 deletions(-) diff --git a/src/metrics/experimental/anchor_regression/helper.py b/src/metrics/experimental/anchor_regression/helper.py index 93145bbaf..5bdfa9a65 100644 --- a/src/metrics/experimental/anchor_regression/helper.py +++ b/src/metrics/experimental/anchor_regression/helper.py @@ -77,7 +77,7 @@ def anchor_regression( X: np.ndarray, Z: np.ndarray, Y: np.ndarray, - l2_reg: float = 1e-5, + l2_reg: float = 1e-6, gamma: float = 1.0 ) -> np.ndarray: """Anchor regression for causal inference under confounding. @@ -108,46 +108,7 @@ def anchor_regression( return theta -def cross_val( - cv, - X: np.ndarray, - y: np.ndarray, - Z: np.ndarray, - eps: float = 1e-50 -) -> float: - ss_res, ss_tot = 0, 0 - y_mean = np.mean(y) - for idx_train, idx_test in cv.split(X, y): - X_train, X_test = X[idx_train, :], X[idx_test, :] - Z_train, Z_test = Z[idx_train, :], Z[idx_test, :] - y_train, y_test = y[idx_train], y[idx_test] - theta = anchor_regression(X_train, Z_train, y_train, gamma=1) - y_hat = np.dot(X_test, theta) - ss_res += np.sum(np.square(y_test - y_hat)) - ss_tot += np.sum(np.square(y_test - y_mean)) - break # TODO - if ss_tot == 0: - r2 = 0 - else: - r2 = 1 - ss_res / ss_tot - return float(r2) - - def compute_stabilities( - X: np.ndarray, - y: np.ndarray, - Z: np.ndarray, - weights: np.ndarray, - is_selected: np.ndarray, - eps: float = 1e-50 -) -> float: - cv = KFold(5, random_state=0xCAFE, shuffle=True) - r2_selected = cross_val(cv, X[:, is_selected], y, Z) - r2_non_selected = cross_val(cv, X[:, ~is_selected], y, Z) - return r2_selected - r2_non_selected - - -def compute_stabilities_v2( X: np.ndarray, y: np.ndarray, Z: np.ndarray, @@ -157,10 +118,10 @@ def compute_stabilities_v2( theta0 = np.abs(anchor_regression(X, Z, y, gamma=1)) theta0 /= np.sum(theta0) - theta = np.abs(anchor_regression(X, Z, y, gamma=2)) + theta = np.abs(anchor_regression(X, Z, y, gamma=1.2)) theta /= np.sum(theta) - s1 = np.clip(np.abs(theta0[is_selected] - theta[is_selected]) / np.abs(theta0[is_selected] + eps), 0, 1) - s2 = np.clip(np.abs(theta0[~is_selected] - theta[~is_selected]) / np.abs(theta0[~is_selected] + eps), 0, 1) + s1 = theta[is_selected] * theta0[is_selected] + s2 = theta[~is_selected] * theta0[~is_selected] s1 = np.mean(s1) s2 = np.mean(s2) @@ -198,7 +159,7 @@ def evaluate_gene_stability( mask = np.ones(X.shape[1], dtype=bool) mask[j] = False - return compute_stabilities_v2(X[:, mask], X[:, j], Z, is_selected[mask]) + return compute_stabilities(X[:, mask], X[:, j], Z, is_selected[mask]) def main(par): From c6f14f8c6ea0f4156c1e9acfc7448117dd9da743 Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Fri, 31 Oct 2025 17:43:41 +0100 Subject: [PATCH 16/19] Minor change --- src/metrics/experimental/regression_3/helper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/metrics/experimental/regression_3/helper.py b/src/metrics/experimental/regression_3/helper.py index 4a7c5b246..97e47d5bd 100644 --- a/src/metrics/experimental/regression_3/helper.py +++ b/src/metrics/experimental/regression_3/helper.py @@ -53,8 +53,9 @@ def compute_residual_correlations( y_test: np.ndarray, Z_test: np.ndarray ) -> np.ndarray: + model = xgboost.XGBRegressor(n_estimators=10) #model = xgboost.XGBRegressor(n_estimators=10) - model = Ridge(alpha=0.01) + model = Ridge(alpha=1) model.fit(X_train, y_train) y_hat = model.predict(X_test) residuals = y_test - y_hat From 0def7efbc3c6a06af7d5336f94a1826adaf332b8 Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Mon, 3 Nov 2025 23:20:47 +0100 Subject: [PATCH 17/19] Fix GRN baseline generation algorithm --- src/utils/util.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/utils/util.py b/src/utils/util.py index 2837c07d2..94c907c8f 100644 --- a/src/utils/util.py +++ b/src/utils/util.py @@ -568,6 +568,10 @@ def create_grn_baseline(A): reassigned deterministically to different edges (no randomness). """ + # Ensure no self-regulation + A = np.copy(A) + np.fill_diagonal(A, 0) + n = A.shape[0] # Order nodes by degrees, with explicit tie-breaking by weight @@ -612,7 +616,7 @@ def create_grn_baseline(A): B[x, w] = 0 B[x, v] = 1 in_degrees[w].degree += 1 - in_degrees[v].degree -= 1 + #in_degrees[v].degree -= 1 picks.append(v) swapped = True break @@ -621,14 +625,13 @@ def create_grn_baseline(A): if len(picks) == k: break - if len(picks) < k: - raise ValueError("Directed degree sequence not realizable with simple digraph under constraints.") + #if len(picks) < k: + # raise ValueError("Directed degree sequence not realizable with simple digraph under constraints.") # Place edges for u for v in picks: B[u, v] = 1 in_degrees[v].degree -= 1 - #out_degrees[v].degree = 0 out_degrees[u].degree = 0 # Stable re-sort by residual in-degree, with explicit tie-breaking by weight @@ -688,4 +691,12 @@ def create_grn_baseline(A): # Assign exact weights to new edges deterministically B[u, targets_ranked] = W_sorted - return B \ No newline at end of file + # Quality check + #target_in_degrees = np.sum((A != 0), axis=0) + #target_out_degrees = np.sum((A != 0), axis=1) + #in_degrees = np.sum((B != 0), axis=0) + #out_degrees = np.sum((B != 0), axis=1) + #print(target_in_degrees - in_degrees) + #print(target_out_degrees - out_degrees) + + return B From 83b2cda18e150d987d95da12e87ade53199ceafe Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Tue, 4 Nov 2025 09:38:52 +0100 Subject: [PATCH 18/19] Dictys: fix version conflicts + improve use of subprocess --- .gitignore | 1 + dockers/dictys_v4/Dockerfile | 122 +++++++++++++++++++++++++++ dockers/dictys_v4/constraints.txt | 3 + src/methods/dictys/helper.py | 134 ++++++++++++++++++++---------- 4 files changed, 218 insertions(+), 42 deletions(-) create mode 100644 dockers/dictys_v4/Dockerfile create mode 100644 dockers/dictys_v4/constraints.txt diff --git a/.gitignore b/.gitignore index d3f14a9ca..bfed5cdc6 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ # related to files .pybiomart.sqlite +.venv/ logs/ params* resources* diff --git a/dockers/dictys_v4/Dockerfile b/dockers/dictys_v4/Dockerfile new file mode 100644 index 000000000..bf3cfa0e4 --- /dev/null +++ b/dockers/dictys_v4/Dockerfile @@ -0,0 +1,122 @@ +FROM ubuntu:22.04 + +ARG DEBIAN_FRONTEND=noninteractive +ENV TZ="America/New_York" + +# Base OS deps (build tools + common libs) + libpng for matplotlib +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl ca-certificates git unzip zip \ + build-essential pkg-config \ + zlib1g-dev libbz2-dev liblzma-dev \ + libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libsqlite3-dev \ + libfreetype6-dev libpng-dev \ + python3-venv python3-distutils python3-dev \ + # bio tools via apt instead of building + samtools tabix \ + perl \ + && rm -rf /var/lib/apt/lists/* + +# Install CPython 3.9.17 from source +ARG PYTHON_VERSION=3.9.17 +RUN set -eux; \ + cd /tmp; \ + curl -fsSLO https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz; \ + tar -xzf Python-${PYTHON_VERSION}.tgz; \ + cd Python-${PYTHON_VERSION}; \ + ./configure --enable-optimizations; \ + make -j"$(nproc)"; \ + make install; \ + cd /; rm -rf /tmp/Python-${PYTHON_VERSION}*; \ + ln -s /usr/local/bin/python3 /usr/local/bin/python; \ + ln -s /usr/local/bin/pip3 /usr/local/bin/pip + +# Make constraints global for all pip installs +COPY constraints.txt /tmp/constraints.txt +ENV PIP_CONSTRAINT=/tmp/constraints.txt \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PIP_DEFAULT_TIMEOUT=180 + +# Clean any existing numpy/matplotlib remnants aggressively +RUN python - <<'PY' +import sys, site, pkgutil, shutil, pathlib +paths = set(site.getsitepackages() + [site.getusersitepackages()]) +for p in list(paths): + if not p: + continue + for name in ("numpy", "matplotlib"): + for m in pathlib.Path(p).glob(name): + shutil.rmtree(m, ignore_errors=True) + for m in pathlib.Path(p).glob(f"{name}-*.dist-info"): + shutil.rmtree(m, ignore_errors=True) + for m in pathlib.Path(p).glob(f"{name}-*.egg-info"): + shutil.rmtree(m, ignore_errors=True) +print("Cleaned numpy/matplotlib from:", *paths, sep="\n - ") +PY + +# Install bedtools +RUN apt-get update && apt-get install -y --no-install-recommends bedtools \ + && rm -rf /var/lib/apt/lists/* + +# Install tools + exact pins +RUN python -m pip install --no-cache-dir -U pip setuptools wheel \ + && pip install --no-cache-dir --upgrade --force-reinstall \ + "numpy==1.26.4" "matplotlib==3.8.4" "cython<3" + +# Install MACS2. Build-from-source packages must reuse our pinned toolchain +RUN pip install --no-cache-dir --no-build-isolation MACS2==2.2.9.1 + +# Install Dictys without dependencies (we'll install them manually right after) +RUN pip install --no-cache-dir --no-build-isolation --no-deps \ + git+https://github.com/pinellolab/dictys.git@a82930fe8030af2785f9069ef5e909e49acc866f + +# Install Dictys dependencies and more +RUN pip install --no-cache-dir --prefer-binary \ + pandas scipy networkx h5py threadpoolctl joblib \ + jupyter jupyterlab adjustText pyro-ppl docutils requests + +# Install pyDNase and anndata without dependencies so it can't pin matplotlib<2 +RUN pip install --no-cache-dir --no-build-isolation --no-deps pyDNase clint pysam packaging array_api_compat legacy-api-wrap zarr natsort anndata + +# Install pybedtools version that works with cython<3 +RUN pip install --no-cache-dir --no-build-isolation "pybedtools==0.9.1" + +# Install pytorch +# RUN pip install --no-cache-dir --prefer-binary --index-url https://download.pytorch.org/whl/cpu torch + +# HOMER prerequisites +RUN apt-get update && apt-get install -y --no-install-recommends \ + wget perl unzip build-essential zlib1g-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install HOMER core + hg38 genome +RUN set -eux; \ + mkdir -p /opt/homer && cd /opt/homer; \ + curl -fsSLO http://homer.ucsd.edu/homer/configureHomer.pl; \ + chmod +x configureHomer.pl; \ + perl configureHomer.pl -install homer; \ + perl configureHomer.pl -install homerTools; \ + perl configureHomer.pl -install hg38 +ENV PATH="/opt/homer/bin:${PATH}" + +# hg38 annotations +RUN set -eux; \ + cd /opt/homer; \ + grep "hg38" update.txt > tmp.txt && mv tmp.txt update.txt; \ + cd update && ./updateUCSCGenomeAnnotations.pl ../update.txt + +# Install CUDA +# RUN curl -fsSLo /etc/apt/preferences.d/cuda-repository-pin-600 \ +# https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \ +# curl -fsSLo /usr/share/keyrings/nvidia-cuda.gpg \ +# https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ +# echo "deb [signed-by=/usr/share/keyrings/nvidia-cuda.gpg] http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" \ +# > /etc/apt/sources.list.d/cuda.list && \ +# apt-get update && apt-get install -y --no-install-recommends cuda && \ +# rm -rf /var/lib/apt/lists/* + +# Install AWS CLI +RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \ + unzip awscliv2.zip && \ + ./aws/install + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/dockers/dictys_v4/constraints.txt b/dockers/dictys_v4/constraints.txt new file mode 100644 index 000000000..c5b8e303b --- /dev/null +++ b/dockers/dictys_v4/constraints.txt @@ -0,0 +1,3 @@ +numpy==1.26.4 +matplotlib<3.9 +cython<3 \ No newline at end of file diff --git a/src/methods/dictys/helper.py b/src/methods/dictys/helper.py index 90cde9b03..776929351 100644 --- a/src/methods/dictys/helper.py +++ b/src/methods/dictys/helper.py @@ -1,6 +1,8 @@ import os os.environ["MKL_SERVICE_FORCE_INTEL"] = "1" os.environ["MKL_THREADING_LAYER"] = "GNU" +import shutil +from typing import Optional, List import numpy as np import pandas as pd @@ -11,6 +13,27 @@ warnings.filterwarnings("ignore") +OVERRIDE_DOWNLOAD = False + + +def run_cmd(cmd: List[str], cwd: Optional[str] = None) -> None: + kwargs = {} + if cwd is not None: + kwargs['cwd'] = cwd + with subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + **kwargs + ) as proc: + for line in proc.stdout: + print(line, end="") + rc = proc.wait() + if rc != 0: + raise RuntimeError(f"Command {cmd} failed with exit code {rc}") + def define_vars(par): os.makedirs(par['temp_dir'], exist_ok=True) @@ -28,6 +51,7 @@ def define_vars(par): par['bams_dir'] = f"{par['data_dir']}/bams/" par['gene_bed'] = f"{par['data_dir']}/gene.bed" + par['make_dir'] = f"{par['temp_dir']}/makefiles" def extract_exp(par): @@ -88,6 +112,7 @@ def extract_atac(par): print(f'Sort and compress tsv file {frags_path}') os.system(f"sort -k1,1 -k2,2n {temp_path} | bgzip -c > {frags_path}") + def create_bam(par): print('Creating BAM file from fragments', flush=True) cmd = f"python {par['frag_to_bam']} --fnames {par['frags_path']} --barcodes {par['barcodes']}" @@ -107,9 +132,24 @@ def bam_to_bams(par): - 'bams_dir': path to output folder for per-cell BAMs - 'exp_path': path to reference expression file """ + + print('Delete temp BAM directories', flush=True) + folders = [ + par['bams_dir'], + os.path.join(par['bams_dir'], '..', 'bams_text'), + os.path.join(par['bams_dir'], '..', 'bams_header') + ] + for folder in folders: + if os.path.exists(folder): + shutil.rmtree(folder) + print('Splitting BAM into per-cell BAMs', flush=True) - cmd = f"bash dictys_helper split_bam.sh {par['bam_name']} {par['bams_dir']} --section CB:Z: --ref_expression {par['exp_path']}" - run_cmd(cmd) + run_cmd([ + "bash", "dictys_helper", "split_bam.sh", par['bam_name'], par['bams_dir'], + "--section", "CB:Z:", "--ref_expression", par['exp_path'] + ]) + + def extrac_clusters(par): print('Extracting clusters', flush=True) subsets = f"{par['data_dir']}/subsets.txt" @@ -127,15 +167,6 @@ def extrac_clusters(par): subprocess.run(cp, shell=True, check=True) print('Extracting clusters successful', flush=True) -def run_cmd(cmd): - try: - result = subprocess.run(cmd, check=True, text=True, capture_output=True, shell=True) - print("STDOUT:", result.stdout) - print("STDERR:", result.stderr) - except subprocess.CalledProcessError as e: - print("Command failed with exit code", e.returncode) - print("STDOUT:", e.stdout) - print("STDERR:", e.stderr) def download_file(url, dest): import requests @@ -145,21 +176,27 @@ def download_file(url, dest): with open(dest, "wb") as f: for chunk in r.iter_content(chunk_size=8192): f.write(chunk) + + def get_priors(par): import gzip import shutil # - get the genome print('Getting genome ...', flush=True) - os.makedirs(f"{par['data_dir']}/genome/", exist_ok=True) - cmd = f"aws s3 cp s3://openproblems-data/resources/grn/supp_data/genome/genome.fa {par['data_dir']}/genome/ --no-sign-request" - try: - run_cmd(cmd) - except: + if OVERRIDE_DOWNLOAD or (not os.path.exists(f"{par['data_dir']}/genome/genome.fa")): + os.makedirs(f"{par['data_dir']}/genome/", exist_ok=True) try: - cmd = f"cp resources/supp_data/genome/genome.fa {par['data_dir']}/genome/" - run_cmd(cmd) + run_cmd([ + "aws", "s3", "cp", "s3://openproblems-data/resources/grn/supp_data/genome/genome.fa", + f"{par['data_dir']}/genome/", "--no-sign-request" + ]) except: - raise ValueError("Could not get the genome") + try: + run_cmd([ + "cp", "resources/supp_data/genome/genome.fa", f"{par['data_dir']}/genome/" + ]) + except: + raise ValueError("Could not get the reference genome") # - get gene annotation print('Getting gene annotation ...', flush=True) @@ -167,29 +204,33 @@ def get_priors(par): gtf_gz = data_dir / "gene.gtf.gz" gtf = data_dir / "gene.gtf" url = "http://ftp.ensembl.org/pub/release-107/gtf/homo_sapiens/Homo_sapiens.GRCh38.107.gtf.gz" - download_file(url, gtf_gz) + if OVERRIDE_DOWNLOAD or (not os.path.exists(gtf_gz)): + download_file(url, gtf_gz) with gzip.open(gtf_gz, "rb") as f_in, open(gtf, "wb") as f_out: shutil.copyfileobj(f_in, f_out) gtf_gz.unlink() - cmd = f"bash dictys_helper gene_gtf.sh {gtf} {par['gene_bed']}" print('Making bed files for gene annotation ...', flush=True) - run_cmd(cmd) + run_cmd([ + "bash", "dictys_helper", "gene_gtf.sh", gtf, par['gene_bed'] + ]) print('Downloading motif file...', flush=True) url='https://hocomoco11.autosome.org/final_bundle/hocomoco11/full/HUMAN/mono/HOCOMOCOv11_full_HUMAN_mono_homer_format_0.0001.motif' motif_file = data_dir / 'motifs.motif' - download_file(url, motif_file) - + if OVERRIDE_DOWNLOAD or (not os.path.exists(motif_file)): + download_file(url, motif_file) + + def configure(par): import json device='cuda:0' #cuda:0 , cpu - par['make_dir'] = f"{par['temp_dir']}/makefiles" os.makedirs(par['make_dir'], exist_ok=True) - cmd = f"cd {par['make_dir']} && bash dictys_helper makefile_template.sh common.mk config.mk env_none.mk static.mk" - run_cmd(cmd) + run_cmd([ + "bash", "dictys_helper", "makefile_template.sh", "common.mk", "config.mk", "env_none.mk", "static.mk" + ], cwd=par['make_dir']) json_arg = json.dumps({ "DEVICE": device, @@ -197,14 +238,23 @@ def configure(par): "JOINT": "1" }) - cmd = f"cd {par['make_dir']} && bash dictys_helper makefile_update.py config.mk '{json_arg}'" - run_cmd(cmd) - cmd = f"cd {par['temp_dir']} && bash dictys_helper makefile_check.py" - run_cmd(cmd) + run_cmd([ + "bash", "dictys_helper", "makefile_update.py", "config.mk", json_arg + ], cwd=par['make_dir']) + + run_cmd([ + "bash", "dictys_helper", "makefile_check.py", "--dir_makefiles", par['make_dir'], + "--dir_data", par['data_dir'] + ]) + + def infer_grn(par): print('Inferring GRNs', flush=True) - cmd = f"cd {par['temp_dir']} && bash dictys_helper network_inference.sh -j {par['num_workers']} -J 1 static" - run_cmd(cmd) + run_cmd([ + "bash", "dictys_helper", "network_inference.sh", "-j", str(par['num_workers']), "-J", "1", "static" + ], cwd=par['temp_dir']) + + def export_net(par): from util import process_links from dictys.net import network @@ -224,14 +274,14 @@ def export_net(par): output.write(par['prediction']) def main(par): - define_vars(par) - extract_exp(par) - extract_atac(par) - create_bam(par) - bam_to_bams(par) - extrac_clusters(par) - get_priors(par) - configure(par) + define_vars(par) + #extract_exp(par) + #extract_atac(par) + #create_bam(par) + #bam_to_bams(par) + #extrac_clusters(par) + #get_priors(par) + #configure(par) infer_grn(par) - export_net(par) + #export_net(par) From 7122ef445e3eb7356f437e1179c78d6fd13f39a4 Mon Sep 17 00:00:00 2001 From: AntoinePassemiers Date: Tue, 4 Nov 2025 10:29:54 +0100 Subject: [PATCH 19/19] Dictys: minor change --- src/methods/dictys/helper.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/methods/dictys/helper.py b/src/methods/dictys/helper.py index 776929351..46c28901e 100644 --- a/src/methods/dictys/helper.py +++ b/src/methods/dictys/helper.py @@ -276,12 +276,12 @@ def export_net(par): def main(par): define_vars(par) - #extract_exp(par) - #extract_atac(par) - #create_bam(par) - #bam_to_bams(par) - #extrac_clusters(par) - #get_priors(par) - #configure(par) + extract_exp(par) + extract_atac(par) + create_bam(par) + bam_to_bams(par) + extrac_clusters(par) + get_priors(par) + configure(par) infer_grn(par) - #export_net(par) + export_net(par)