From ab81d8379046e2313e3ac37a3c23ae9e77c78ed4 Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Wed, 1 Oct 2025 20:22:29 +0200
Subject: [PATCH 01/19] Bug fix

---
 src/metrics/sem/helper.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/metrics/sem/helper.py b/src/metrics/sem/helper.py
index 4fd90259b..8685cfaa5 100644
--- a/src/metrics/sem/helper.py
+++ b/src/metrics/sem/helper.py
@@ -347,14 +347,15 @@ def main(par):
     # Create a symmetric (causally-wrong) baseline GRN
     print(f"Creating baseline GRN")
     A_baseline = np.copy(A)
-    for j in range(A.shape[1]):
-        np.random.shuffle(A[:j, j])
-        np.random.shuffle(A[j+1:, j])
+    for j in range(A_baseline.shape[1]):
+        np.random.shuffle(A_baseline[:j, j])
+        np.random.shuffle(A_baseline[j+1:, j])
     assert np.any(A_baseline != A)
 
     # Evaluate inferred GRN
     print("\n======== Evaluate inferred GRN ========")
     scores = evaluate_grn(X_controls, delta_X, is_train, is_reporter, A, signed=use_signs)
+
     # Evaluate baseline GRN
     print("\n======== Evaluate shuffled GRN ========")
     n_repeats = 1

From 80195221166a937d01db53e2babb21e4af6305a5 Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Sat, 4 Oct 2025 17:23:32 +0200
Subject: [PATCH 02/19] Fix VC metric + replace anchor_regression by new metric
 'regression3'

---
 common                                        |   2 +-
 src/metrics/anchor_regression/helper.py       | 262 ---------
 src/metrics/regression_3/helper.py            | 196 +++++++
 .../run_local.sh                              |  10 +-
 .../script.py                                 |   8 +-
 src/metrics/sem/helper.py                     |  10 +-
 src/metrics/vc/helper.py                      | 517 ++++++++----------
 src/metrics/vc/improved_helper.py             |   0
 src/metrics/vc/new_helper.py                  | 299 ++++++++++
 src/metrics/vc/ultra_fast_helper.py           |   0
 10 files changed, 751 insertions(+), 553 deletions(-)
 delete mode 100644 src/metrics/anchor_regression/helper.py
 create mode 100644 src/metrics/regression_3/helper.py
 rename src/metrics/{anchor_regression => regression_3}/run_local.sh (84%)
 rename src/metrics/{anchor_regression => regression_3}/script.py (90%)
 delete mode 100644 src/metrics/vc/improved_helper.py
 create mode 100644 src/metrics/vc/new_helper.py
 delete mode 100644 src/metrics/vc/ultra_fast_helper.py

diff --git a/common b/common
index 876036f71..f01ff2170 160000
--- a/common
+++ b/common
@@ -1 +1 @@
-Subproject commit 876036f71713cbd79285b108ab0a9a8238f2b5e1
+Subproject commit f01ff2170161295e89014ee5453c61b29b4e4e77
diff --git a/src/metrics/anchor_regression/helper.py b/src/metrics/anchor_regression/helper.py
deleted file mode 100644
index 9c29eb042..000000000
--- a/src/metrics/anchor_regression/helper.py
+++ /dev/null
@@ -1,262 +0,0 @@
-from typing import Tuple
-import os
-import sys
-import traceback
-import random
-import h5py
-import numpy as np
-import pandas as pd
-import tqdm
-from scipy.sparse.linalg import LinearOperator
-from scipy.stats import pearsonr, wilcoxon
-from scipy.sparse import csr_matrix
-from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
-import anndata as ad
-import warnings
-warnings.filterwarnings("ignore", category=UserWarning)
-
-# For reproducibility purposes
-seed = 0xCAFE
-os.environ['PYTHONHASHSEED'] = str(seed)
-random.seed(seed)
-np.random.seed(seed)
-
-from util import read_prediction, manage_layer
-from dataset_config import DATASET_GROUPS
-
-
-def encode_obs_cols(adata, cols):
-    """Encode observation columns to integer codes."""
-    encoded = []
-    for col in cols:
-        if col in adata.obs:
-            codes = LabelEncoder().fit_transform(adata.obs[col].values)
-            encoded.append(codes)
-    return encoded
-
-
-def combine_multi_index(*arrays) -> np.array:
-    """Combine parallel label arrays into a single integer label per position."""
-    A = np.stack(arrays)
-    n_classes = tuple(int(A[i].max()) + 1 for i in range(A.shape[0]))
-    return np.ravel_multi_index(A, dims=n_classes, order='C')
-
-
-def get_whitening_transform(Z: np.ndarray, gamma: float = 1.0) -> LinearOperator:
-    """Get whitening transformation.
-
-    Args:
-        Z: Anchor variables.
-        gamma: Anchor strength.
-
-    Returns:
-        Sparse linear operator corresponding to a whitening transform.
-    """
-    n, k = Z.shape
-
-    # Precompute Gram inverse with jitter for stability
-    ZTZ = Z.T @ Z + 1e-10 * np.eye(k)
-    ZTZ_inv = np.linalg.inv(ZTZ)
-    sqrt_gamma = np.sqrt(gamma)
-
-    def matvec(v: np.ndarray) -> np.ndarray:
-        """Matrix-vector multiplication."""
-        v = np.atleast_2d(v)
-        if v.shape[0] != n:
-            v = v.T
-        Pv = Z @ (ZTZ_inv @ (Z.T @ v))
-        out = v + (sqrt_gamma - 1.0) * Pv
-        return out if out.shape[1] > 1 else out.ravel()
-
-    return LinearOperator((n, n), matvec=matvec, rmatvec=matvec)
-
-
-def anchor_regression(
-        X: np.ndarray,
-        Z: np.ndarray,
-        Y: np.ndarray,
-        l2_reg: float = 1e-2,
-        anchor_strength: float = 1.0
-) -> np.ndarray:
-    """Anchor regression for causal inference under confounding.
-
-    Args:
-        X: input features, of shape (n, d).
-        Z: anchor variables. The model is required to be invariant to
-            these environmental variables. Shape (n, z).
-        Y: predicted variables, of shape (n, u).
-        l2_reg: L2 regularization strength.
-        anchor_strength: Strength of anchor regularization. 
-                        0 = standard regression, higher = more anchor regularization.
-
-    Returns:
-        Inferred parameters, of shape (d, u).
-    """
-
-    # Whitening transformation
-    W = get_whitening_transform(Z, gamma=anchor_strength)
-    X_t = W @ X
-    Y_t = W @ Y
-
-    # Ridge regression on the whitened data
-    sigma_xx = X_t.T @ X_t / X_t.shape[0]
-    sigma_xy = X_t.T @ Y_t / Y_t.shape[0]
-    theta = np.linalg.solve(sigma_xx + l2_reg * np.eye(X_t.shape[1]), sigma_xy)
-
-    return theta
-
-
-def compute_stabilities(
-        X: np.ndarray,
-        y: np.ndarray,
-        Z: np.ndarray,
-        A: np.ndarray,
-        is_selected: np.ndarray,
-        eps: float = 1e-50
-) -> float:
-    theta0_signed = anchor_regression(X, Z, y, anchor_strength=1)
-    theta0_signed = theta0_signed[is_selected]
-    theta0 = np.abs(theta0_signed)
-    theta0 /= np.sum(theta0)
-
-    theta_signed = anchor_regression(X, Z, y, anchor_strength=20)
-    theta_signed = theta_signed[is_selected]
-    theta = np.abs(theta_signed)
-    theta /= np.sum(theta)
-
-    stabilities = np.clip((theta0 - theta) / (theta0 + eps), 0, 1)
-    stabilities[np.sign(theta0_signed) != np.sign(theta_signed)] = 0
-
-    return stabilities
-
-
-
-def evaluate_gene_stability(
-        X: np.ndarray,
-        Z: np.ndarray,
-        A: np.ndarray,
-        j: int,
-        eps: float = 1e-50
-) -> float:
-    """Evaluate stability of regulatory relationships for a single target gene.
-    
-    Args:
-        X: Gene expression matrix (n_samples, n_genes).
-        Z: Anchor variables matrix (n_samples, n_anchors).
-        A: GRN adjacency matrix (n_genes, n_genes).
-        j: Target gene index.
-        eps: Small epsilon for numerical stability.
-        
-    Returns:
-        Stability score for gene j
-    """
-    is_selected = np.array(A[:, j] != 0)
-    if (not np.any(is_selected)) or np.all(is_selected):
-        return 0.0
-    assert not is_selected[j]
-
-    # Exclude target gene from features
-    mask = np.ones(X.shape[1], dtype=bool)
-    mask[j] = False
-
-    stabilities_selected = np.mean(compute_stabilities(X[:, mask], X[:, j], Z, A, is_selected[mask], eps=eps))
-    #stabilities_non_selected = np.mean(compute_stabilities(X[:, mask], X[:, j], Z, A, ~is_selected[mask], eps=eps))
-
-    #score = (stabilities_selected - stabilities_non_selected) / (stabilities_selected + stabilities_non_selected + eps)
-    score = np.mean(stabilities_selected)
-
-    return score
-
-
-def main(par):
-    """Main anchor regression evaluation function."""
-    # Load evaluation data
-    adata = ad.read_h5ad(par['evaluation_data'])
-    dataset_id = adata.uns['dataset_id']
-    method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id']
-    
-    # Get dataset-specific anchor variables
-    if dataset_id not in DATASET_GROUPS:
-        raise ValueError(f"Dataset {dataset_id} not found in DATASET_GROUPS")
-    
-    anchor_cols = DATASET_GROUPS[dataset_id].get('anchors', ['donor_id', 'plate_name'])
-    print(f"Using anchor variables: {anchor_cols}")
-
-    # Manage layer
-    layer = manage_layer(adata, par)
-    X = adata.layers[layer]
-    if isinstance(X, csr_matrix):
-        X = X.toarray()
-    X = X.astype(np.float32)
-
-    gene_names = adata.var_names
-    gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
-
-    # Encode anchor variables
-    anchor_variables = encode_obs_cols(adata, anchor_cols)
-    anchor_encoded = combine_multi_index(*anchor_variables)
-    
-    if len(anchor_variables) == 0:
-        raise ValueError(f"No anchor variables found in dataset for columns: {anchor_cols}")
-    
-    # One-hot encode anchor variables
-    Z = OneHotEncoder(sparse_output=False, dtype=np.float32).fit_transform(anchor_encoded.reshape(-1, 1))
-    print(f"Anchor matrix Z shape: {Z.shape}")
-
-    # Load inferred GRN
-    df = read_prediction(par)
-    sources = df["source"].to_numpy()
-    targets = df["target"].to_numpy()
-    weights = df["weight"].to_numpy()
-
-    A = np.zeros((len(gene_names), len(gene_names)), dtype=X.dtype)
-    for source, target, weight in zip(sources, targets, weights):
-        if (source in gene_dict) and (target in gene_dict):
-            i = gene_dict[source]
-            j = gene_dict[target]
-            A[i, j] = float(weight)
-
-    # Only consider the genes that are actually present in the inferred GRN,
-    # and keep only the most-connected genes (for speed).
-    gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0))
-    in_degrees = np.sum(A != 0, axis=0)
-    out_degrees = np.sum(A != 0, axis=1)
-    idx = np.argsort(np.maximum(out_degrees, in_degrees))[:-1000]
-    gene_mask[idx] = False
-    X = X[:, gene_mask]
-    X = X.toarray() if isinstance(X, csr_matrix) else X
-    A = A[gene_mask, :][:, gene_mask]
-    gene_names = gene_names[gene_mask]
-
-    # Remove self-regulations from GRN
-    np.fill_diagonal(A, 0)
-    print(f"Evaluating {X.shape[1]} genes with {np.sum(A != 0)} regulatory links")
-
-    # Center and scale dataset
-    scaler = StandardScaler()
-    X = scaler.fit_transform(X)
-
-    # Create baseline GRN
-    A_baseline = np.copy(A)
-    for j in range(A_baseline.shape[1]):
-        np.random.shuffle(A_baseline[:j, j])
-        np.random.shuffle(A_baseline[j+1:, j])
-    assert np.any(A != A_baseline)
-
-    scores, scores_baseline = [], []
-    for j in tqdm.tqdm(range(X.shape[1]), desc="Evaluating gene stability"):
-        scores.append(evaluate_gene_stability(X, Z, A, j))
-    scores = np.array(scores)
-
-    # Calculate final score
-    final_score = np.mean(scores)
-    print(f"Method: {method_id}")
-    print(f"Anchor Regression Score: {final_score:.6f}")
-
-    # Return results as DataFrame
-    results = {
-        'anchor_regression': [final_score]
-    }
-
-    df_results = pd.DataFrame(results)
-    return df_results
diff --git a/src/metrics/regression_3/helper.py b/src/metrics/regression_3/helper.py
new file mode 100644
index 000000000..9f2e427dc
--- /dev/null
+++ b/src/metrics/regression_3/helper.py
@@ -0,0 +1,196 @@
+from typing import Tuple
+import os
+import sys
+import traceback
+import random
+import h5py
+import numpy as np
+import pandas as pd
+import tqdm
+import xgboost
+from scipy.sparse.linalg import LinearOperator
+from scipy.stats import pearsonr, spearmanr, wilcoxon, ConstantInputWarning
+from scipy.sparse import csr_matrix
+from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
+from sklearn.linear_model import Ridge
+import anndata as ad
+import warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=ConstantInputWarning)
+
+# For reproducibility purposes
+seed = 0xCAFE
+os.environ['PYTHONHASHSEED'] = str(seed)
+random.seed(seed)
+np.random.seed(seed)
+
+from util import read_prediction, manage_layer
+from dataset_config import DATASET_GROUPS
+
+
+def encode_obs_cols(adata, cols):
+    """Encode observation columns to integer codes."""
+    encoded = []
+    for col in cols:
+        if col in adata.obs:
+            codes = LabelEncoder().fit_transform(adata.obs[col].values)
+            encoded.append(codes)
+    return encoded
+
+
+def combine_multi_index(*arrays) -> np.array:
+    """Combine parallel label arrays into a single integer label per position."""
+    A = np.stack(arrays)
+    n_classes = tuple(int(A[i].max()) + 1 for i in range(A.shape[0]))
+    return np.ravel_multi_index(A, dims=n_classes, order='C')
+
+
+def compute_residual_correlations(
+        X_train: np.ndarray,
+        y_train: np.ndarray,
+        X_test: np.ndarray,
+        y_test: np.ndarray,
+        Z_test: np.ndarray
+) -> np.ndarray:
+    model = xgboost.XGBRegressor(n_estimators=10)
+    #model = Ridge(alpha=10)
+    model.fit(X_train, y_train)
+    y_hat = model.predict(X_test)
+    residuals = y_test - y_hat
+    coefs = pearsonr(residuals[:, np.newaxis], Z_test, axis=0)[0]
+    coefs = np.nan_to_num(coefs, nan=0)
+    assert coefs.shape[0] == Z_test.shape[1]
+    return np.abs(coefs)
+
+
+def main(par):
+    # Load evaluation data
+    adata = ad.read_h5ad(par['evaluation_data'])
+    dataset_id = adata.uns['dataset_id']
+    method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id']
+    
+    # Get dataset-specific anchor variables
+    if dataset_id not in DATASET_GROUPS:
+        raise ValueError(f"Dataset {dataset_id} not found in DATASET_GROUPS")
+    
+    anchor_cols = DATASET_GROUPS[dataset_id].get('anchors', ['donor_id', 'plate_name'])
+    print(f"Using anchor variables: {anchor_cols}")
+
+    # Manage layer
+    layer = manage_layer(adata, par)
+    X = adata.layers[layer]
+    if isinstance(X, csr_matrix):
+        X = X.toarray()
+    X = X.astype(np.float32)
+
+    gene_names = adata.var_names
+    gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
+
+    # Encode anchor variables
+    anchor_variables = encode_obs_cols(adata, anchor_cols)
+    anchor_encoded = combine_multi_index(*anchor_variables)
+    
+    if len(anchor_variables) == 0:
+        raise ValueError(f"No anchor variables found in dataset for columns: {anchor_cols}")
+    
+    # One-hot encode anchor variables
+    Z = OneHotEncoder(sparse_output=False, dtype=np.float32).fit_transform(anchor_encoded.reshape(-1, 1))
+    print(f"Anchor matrix Z shape: {Z.shape}")
+
+    # Load inferred GRN
+    df = read_prediction(par)
+    sources = df["source"].to_numpy()
+    targets = df["target"].to_numpy()
+    weights = df["weight"].to_numpy()
+
+    A = np.zeros((len(gene_names), len(gene_names)), dtype=X.dtype)
+    for source, target, weight in zip(sources, targets, weights):
+        if (source in gene_dict) and (target in gene_dict):
+            i = gene_dict[source]
+            j = gene_dict[target]
+            A[i, j] = float(weight)
+
+    # Only consider the genes that are actually present in the inferred GRN,
+    # and keep only the most-connected genes (for speed).
+    gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0))
+    in_degrees = np.sum(A != 0, axis=0)
+    out_degrees = np.sum(A != 0, axis=1)
+    idx = np.argsort(np.maximum(out_degrees, in_degrees))[:-1000]
+    gene_mask[idx] = False
+    X = X[:, gene_mask]
+    X = X.toarray() if isinstance(X, csr_matrix) else X
+    A = A[gene_mask, :][:, gene_mask]
+    gene_names = gene_names[gene_mask]
+
+    # Remove self-regulations
+    np.fill_diagonal(A, 0)
+    print(f"Evaluating {X.shape[1]} genes with {np.sum(A != 0)} regulatory links")
+
+    # Create baseline model
+    A_baseline = np.copy(A)
+    for j in range(A.shape[1]):
+        np.random.shuffle(A_baseline[:j, j])
+        np.random.shuffle(A_baseline[j+1:, j])
+    assert np.any(A_baseline != A)
+
+    scores, baseline_scores = [], []
+    for group in np.unique(anchor_encoded):
+
+        # Train/test split
+        mask = (anchor_encoded != group)
+        X_train = X[mask, :]
+        X_test = X[~mask, :]
+
+        # Standardize features
+        #scaler = StandardScaler()
+        #X_train = scaler.fit_transform(X_train)
+        #X_test = scaler.transform(X_test)
+
+        for j in tqdm.tqdm(range(X_train.shape[1])):
+
+            # Evaluate inferred GRN
+            selected = (A[:, j] != 0)
+            unselected = ~np.copy(selected)
+            unselected[j] = False
+            if (not np.any(selected)) or (not np.any(unselected)):
+                continue
+            else:
+                coefs = compute_residual_correlations(
+                    X_train[:, selected],
+                    X_train[:, j],
+                    X_test[:, selected],
+                    X_test[:, j],
+                    X_test[:, ~selected]
+                )
+                scores.append(np.mean(coefs))
+
+            # Evaluate baseline GRN
+            selected = (A_baseline[:, j] != 0)
+            unselected = ~np.copy(selected)
+            unselected[j] = False
+            coefs = compute_residual_correlations(
+                X_train[:, selected],
+                X_train[:, j],
+                X_test[:, selected],
+                X_test[:, j],
+                X_test[:, ~selected]
+            )
+            baseline_scores.append(np.mean(coefs))
+    scores = np.array(scores)
+    baseline_scores = np.array(baseline_scores)
+
+    p_value = wilcoxon(baseline_scores, scores, alternative="greater").pvalue
+    p_value = max(p_value, 1e-300)
+
+    # Calculate final score
+    final_score = -np.log10(p_value)
+    print(f"Anchor Regression Score: {final_score:.6f}")
+    print(f"Method: {method_id}")
+
+    # Return results as DataFrame
+    results = {
+        'regression_3': [final_score]
+    }
+
+    df_results = pd.DataFrame(results)
+    return df_results
diff --git a/src/metrics/anchor_regression/run_local.sh b/src/metrics/regression_3/run_local.sh
similarity index 84%
rename from src/metrics/anchor_regression/run_local.sh
rename to src/metrics/regression_3/run_local.sh
index fc872cc59..78e66b9ac 100644
--- a/src/metrics/anchor_regression/run_local.sh
+++ b/src/metrics/regression_3/run_local.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-#SBATCH --job-name=anchor_regression
+#SBATCH --job-name=regression_3
 #SBATCH --output=logs/%j.out
 #SBATCH --error=logs/%j.err
 #SBATCH --ntasks=1
@@ -12,7 +12,7 @@
 
 set -euo pipefail
 
-save_dir="output/anchor_regression"
+save_dir="output/regression_3"
 mkdir -p "$save_dir"
 
 # datasets to process
@@ -21,7 +21,7 @@ datasets=( 'op' "300BCG" 'parsebioscience') #"300BCG" "ibd" 'parsebioscience'
 methods=("pearson_corr" "positive_control" "negative_control" "ppcor" "portia" "scenic" "grnboost" "scprint" "scenicplus" "celloracle" "scglue" "figr" "granie")
 
 # temporary file to collect CSV rows
-combined_csv="${save_dir}/anchor_regression_scores.csv"
+combined_csv="${save_dir}/regression_3_scores.csv"
 echo "dataset,method,metric,value" > "$combined_csv"
 
 for dataset in "${datasets[@]}"; do
@@ -31,7 +31,7 @@ for dataset in "${datasets[@]}"; do
 
     for method in "${methods[@]}"; do
         prediction="resources/results/${dataset}/${dataset}.${method}.${method}.prediction.h5ad"
-        score="${save_dir}/anchor_regression_${dataset}_${method}.h5ad"
+        score="${save_dir}/regression_3_${dataset}_${method}.h5ad"
 
         if [[ ! -f "$prediction" ]]; then
             echo "File not found: $prediction, skipping..."
@@ -39,7 +39,7 @@ for dataset in "${datasets[@]}"; do
         fi
 
         echo -e "\nProcessing method: $method\n"
-        python src/metrics/anchor_regression/script.py \
+        python src/metrics/regression_3/script.py \
             --prediction "$prediction" \
             --evaluation_data "$evaluation_data" \
             --score "$score"
diff --git a/src/metrics/anchor_regression/script.py b/src/metrics/regression_3/script.py
similarity index 90%
rename from src/metrics/anchor_regression/script.py
rename to src/metrics/regression_3/script.py
index a4687b358..b140d64c5 100644
--- a/src/metrics/anchor_regression/script.py
+++ b/src/metrics/regression_3/script.py
@@ -30,14 +30,14 @@
     sys.path.append(meta["resources_dir"])
 except:
     meta = {
-    "resources_dir":'src/metrics/anchor_regression/',
+    "resources_dir":'src/metrics/regression_3/',
     "util_dir": 'src/utils',
-    'helper_dir': 'src/metrics/anchor_regression/'
+    'helper_dir': 'src/metrics/regression_3/'
     }
     sys.path.append(meta["resources_dir"])
     sys.path.append(meta["util_dir"])
     sys.path.append(meta["helper_dir"])
-from helper import main as main_anchor 
+from helper import main as main_reg3
 from util import format_save_score
 
 
@@ -48,7 +48,7 @@
         par[key] = value
 
 if __name__ == "__main__":
-    output = main_anchor(par)
+    output = main_reg3(par)
 
     dataset_id = ad.read_h5ad(par['evaluation_data'], backed='r').uns['dataset_id']
     method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id']
diff --git a/src/metrics/sem/helper.py b/src/metrics/sem/helper.py
index 8685cfaa5..13747164a 100644
--- a/src/metrics/sem/helper.py
+++ b/src/metrics/sem/helper.py
@@ -112,9 +112,12 @@ def neumann_series(A: torch.Tensor, k: int = 2) -> torch.Tensor:
     Returns:
         Approximated inverse of I - A.
     """
-    B = torch.eye(A.shape[0], device=A.device, dtype=A.dtype)
-    for k in range(k):
-        B = B + torch.mm(B, A)
+    I = torch.eye(A.shape[0], device=A.device, dtype=A.dtype)
+    term = I.clone()
+    B = I.clone()
+    for _ in range(k):
+        term = term @ A
+        B = B + term
     return B
 
 
@@ -344,7 +347,6 @@ def main(par):
     print(f"Proportion of reporter genes: {np.mean(is_reporter)}")
     print(f"Use regulatory modes/signs: {use_signs}")
 
-    # Create a symmetric (causally-wrong) baseline GRN
     print(f"Creating baseline GRN")
     A_baseline = np.copy(A)
     for j in range(A_baseline.shape[1]):
diff --git a/src/metrics/vc/helper.py b/src/metrics/vc/helper.py
index ed715ee8a..3735f420d 100644
--- a/src/metrics/vc/helper.py
+++ b/src/metrics/vc/helper.py
@@ -1,4 +1,5 @@
 import os
+import traceback
 from typing import Tuple, Dict
 import sys
 import tqdm
@@ -8,7 +9,8 @@
 import pandas as pd
 import torch
 from scipy.sparse import csr_matrix
-from sklearn.model_selection import GroupKFold
+from scipy.stats import wilcoxon
+from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold
 from sklearn.preprocessing import StandardScaler, LabelEncoder
 from torch.utils.data import Dataset
 import anndata as ad
@@ -22,14 +24,15 @@
 NUMPY_DTYPE = np.float32
 
 # For reproducibility purposes
-seed = 0xCAFE
-os.environ['PYTHONHASHSEED'] = str(seed)
-random.seed(seed)
-np.random.seed(seed)
-torch.manual_seed(seed)
-torch.cuda.manual_seed(seed)
-torch.cuda.manual_seed_all(seed)
-torch.use_deterministic_algorithms(True)
+def set_seed():
+    seed = 0xCAFE
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.use_deterministic_algorithms(True)
 
 from util import read_prediction, manage_layer
 from dataset_config import DATASET_GROUPS
@@ -67,154 +70,6 @@ def create_control_matching(are_controls: np.ndarray, match_groups: np.ndarray)
     return control_map, match_groups
 
 
-def compute_pds_cell_level(X_true, X_pred, perturbations, gene_names, max_cells_per_pert=10):
-    """
-    Compute PDS at individual cell level (more challenging than mean-based PDS).
-    
-    For each individual predicted cell, find its rank when compared to 
-    true mean profiles of all perturbations.
-    """
-    unique_perts = np.unique(perturbations)
-    n_perts = len(unique_perts)
-    
-    print(f"Computing cell-level PDS for {n_perts} unique perturbations")
-    
-    if n_perts < 2:
-        return 0.0
-    
-    # Compute mean true expression profiles per perturbation
-    true_means = {}
-    for pert in unique_perts:
-        mask = (perturbations == pert)
-        if np.sum(mask) == 0:
-            continue
-        true_means[pert] = np.mean(X_true[mask, :], axis=0)
-    
-    all_scores = []
-    
-    # For each perturbation, sample some cells and compute their PDS
-    for pert in unique_perts:
-        if pert not in true_means:
-            continue
-            
-        # Get cells for this perturbation
-        mask = (perturbations == pert)
-        pert_indices = np.where(mask)[0]
-        
-        if len(pert_indices) == 0:
-            continue
-            
-        # Sample max_cells_per_pert cells to avoid bias from perturbations with many cells
-        # Use seeded random generator for reproducibility
-        cell_rng = np.random.RandomState(seed + int(pert))  # Different seed per perturbation
-        sampled_indices = cell_rng.choice(
-            pert_indices, 
-            size=min(max_cells_per_pert, len(pert_indices)), 
-            replace=False
-        )
-        
-        for cell_idx in sampled_indices:
-            # Get predicted profile for this individual cell
-            pred_vec = X_pred[cell_idx, :].copy()
-            
-            # Calculate distances to all true mean profiles
-            dists = []
-            for t in unique_perts:
-                if t not in true_means:
-                    continue
-                true_vec = true_means[t].copy()
-                
-                # Remove target gene if it exists
-                if str(pert) in gene_names:
-                    gene_idx = np.where(gene_names == str(pert))[0]
-                    if len(gene_idx) > 0:
-                        true_vec = np.delete(true_vec, gene_idx)
-                        pred_vec_temp = np.delete(pred_vec, gene_idx)
-                    else:
-                        pred_vec_temp = pred_vec
-                else:
-                    pred_vec_temp = pred_vec
-                
-                dist = cityblock(pred_vec_temp, true_vec)
-                dists.append((t, dist))
-            
-            # Sort by distance and find rank of correct perturbation
-            dists_sorted = sorted(dists, key=lambda x: x[1])
-            true_rank = next((i for i, (t, _) in enumerate(dists_sorted) if t == pert), n_perts-1)
-            
-            # Cell-level PDS
-            pds = 1 - (true_rank / (n_perts - 1)) if n_perts > 1 else 1.0
-            all_scores.append(pds)
-    
-    mean_pds = np.mean(all_scores) if all_scores else 0.0
-    print(f"Cell-level PDS scores: min={min(all_scores):.3f}, max={max(all_scores):.3f}, mean={mean_pds:.3f} (n_cells={len(all_scores)})")
-    return mean_pds
-
-
-def compute_pds(X_true, X_pred, perturbations, gene_names):
-    """
-    Compute both mean-level and cell-level PDS for comparison.
-    """
-    # Mean-level PDS (original approach)
-    unique_perts = np.unique(perturbations)
-    n_perts = len(unique_perts)
-    
-    print(f"Computing mean-level PDS for {n_perts} unique perturbations")
-    
-    if n_perts < 2:
-        return 0.0, 0.0
-    
-    # Compute mean expression profiles per perturbation
-    true_means = {}
-    pred_means = {}
-    
-    for pert in unique_perts:
-        mask = (perturbations == pert)
-        if np.sum(mask) == 0:
-            continue
-        true_means[pert] = np.mean(X_true[mask, :], axis=0)
-        pred_means[pert] = np.mean(X_pred[mask, :], axis=0)
-    
-    scores = {}
-    for pert in unique_perts:
-        if pert not in pred_means or pert not in true_means:
-            continue
-            
-        pred_vec = pred_means[pert].copy()
-        dists = []
-        
-        for t in unique_perts:
-            if t not in true_means:
-                continue
-            true_vec = true_means[t].copy()
-            
-            if str(pert) in gene_names:
-                gene_idx = np.where(gene_names == str(pert))[0]
-                if len(gene_idx) > 0:
-                    true_vec = np.delete(true_vec, gene_idx)
-                    pred_vec_temp = np.delete(pred_vec, gene_idx)
-                else:
-                    pred_vec_temp = pred_vec
-            else:
-                pred_vec_temp = pred_vec
-            
-            dist = cityblock(pred_vec_temp, true_vec)
-            dists.append((t, dist))
-        
-        dists_sorted = sorted(dists, key=lambda x: x[1])
-        true_rank = next((i for i, (t, _) in enumerate(dists_sorted) if t == pert), n_perts-1)
-        pds = 1 - (true_rank / (n_perts - 1)) if n_perts > 1 else 1.0
-        scores[pert] = pds
-    
-    mean_level_pds = np.mean(list(scores.values())) if scores else 0.0
-    print(f"Mean-level PDS: min={min(scores.values()):.3f}, max={max(scores.values()):.3f}, mean={mean_level_pds:.3f}")
-    
-    # Cell-level PDS (more challenging)
-    cell_level_pds = compute_pds_cell_level(X_true, X_pred, perturbations, gene_names)
-    
-    return mean_level_pds, cell_level_pds
-
-
 class GRNLayer(torch.nn.Module):
 
     def __init__(
@@ -223,93 +78,164 @@ def __init__(
             A_signs: torch.Tensor,
             signed: bool = True,
             inverse: bool = True,
-            alpha: float = 1.0
+            alpha: float = 0.1,
+            stable: bool = False,
+            bias: bool = True
     ):
         torch.nn.Module.__init__(self)
         self.n_genes: int = A_weights.size(1)
         self.A_weights: torch.nn.Parameter = A_weights
+        dtype = A_weights.dtype
+        device = A_weights.device
+        if bias:
+            self.b: torch.nn.Parameter = torch.nn.Parameter(torch.zeros((1, self.n_genes), dtype=dtype, device=device))
+        else:
+            self.b = None
         self.register_buffer('A_signs', A_signs.to(A_weights.device))
-        self.register_buffer('A_mask', (A_signs > 0).to(self.A_weights.dtype).to(A_weights.device))
-        self.register_buffer('I', torch.eye(self.n_genes, dtype=A_weights.dtype, device=A_weights.device))
+        self.register_buffer('A_mask', (A_signs != 0).to(self.A_weights.dtype).to(A_weights.device))
+        self.register_buffer('I', torch.eye(self.n_genes, dtype=dtype, device=device))
         self.signed: bool = signed
         self.inverse: bool = inverse
         self.alpha: float = alpha
+        self.stable: bool = stable
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         if self.signed:
             A = torch.abs(self.A_weights) * self.A_signs
+            assert torch.any(A < 0)
         else:
             A = self.A_weights * self.A_mask
         
         if self.inverse:
-            # For inverse transformation, use iterative solve to avoid memory issues
-            # Solve (I - alpha * A.t()) * y = x for y
-            ia = self.I - self.alpha * A.t()
-            
-            # Add small regularization to diagonal for numerical stability
-            ia = ia + 1e-6 * self.I
-            
-            # Use solve instead of inversion to save memory
-            try:
-                # Solve ia * y.t() = x.t() for y.t(), then transpose
-                result = torch.linalg.solve(ia, x.t()).t()
-                return result
-            except torch.linalg.LinAlgError:
-                # Fallback: simple linear transformation without inversion
-                print("Warning: Matrix solve failed, using simplified GRN transformation")
-                return torch.mm(x, A)
+            if self.stable:
+                # Approximation using Neumann series
+                B = GRNLayer.neumann_series(A.t(), self.alpha)
+                y = torch.mm(x, B)
+            else:
+                # For inverse transformation, use iterative solve to avoid memory issues
+                # Solve (I - alpha * A.t()) * y = x for y
+                ia = self.I - self.alpha * A.t()
+                
+                try:
+                    # Use solve instead of inversion to save memory
+                    y = torch.linalg.solve(ia, x.t()).t()
+                except torch.linalg.LinAlgError:
+                    # Fallback: approximation using Neumann series
+                    B = GRNLayer.neumann_series(A.t(), self.alpha)
+                    y = torch.mm(x, B)
         else:
             # Forward transformation: apply GRN directly
-            return torch.mm(x, A.t())
+            y = torch.mm(x, self.I - self.alpha * A.t())
+
+        # Add bias term
+        if self.b is not None:
+            y = y + self.b
+
+        return y
+
+    @staticmethod
+    def neumann_series(A: torch.Tensor, alpha: float, k: int = 2) -> torch.Tensor:
+        """Approximate the inverse of I - A using Neumann series.
+
+        Args:
+            A: the matrix for which to invert I - A.
+            k: the number of terms in the series. The higher, the more accurate.
+
+        Returns:
+            Approximated inverse of I - A.
+        """
+        I = torch.eye(A.shape[0], device=A.device, dtype=A.dtype)
+        M = alpha * A
+        term = I.clone()
+        B = I.clone()
+        for _ in range(k):
+            term = term @ M
+            B = B + term
+        return B
 
 
 class Model(torch.nn.Module):
-    def __init__(self, A: np.ndarray, n_perturbations: int, n_hidden: int = 64, signed: bool = True):
+
+    def __init__(self, A: np.ndarray, n_perturbations: int, n_hidden: int = 20, signed: bool = True):
+
+        # n_hidden needs to be small enough to prevent the NN from arbitrarily shifting the learning task
+        # from the GRN to the MLPs.
+
         torch.nn.Module.__init__(self)
         self.n_genes: int = A.shape[1]
         self.n_perturbations: int = n_perturbations
         self.n_hidden: int = n_hidden
-        self.perturbation_embedding = torch.nn.Embedding(n_perturbations, n_hidden)
+        self.signed: bool = signed
+
+        # Perturbation transformations defined in the latent space
+        #self.perturbation_embedding = torch.nn.Embedding(n_perturbations, n_hidden)
+        self.perturbation_embedding = torch.nn.Embedding(n_perturbations, n_hidden * n_hidden)
 
+        # First layer: GRN-informed transformation of control expression
         A_signs = torch.from_numpy(np.sign(A).astype(NUMPY_DTYPE))
         A_weights = np.copy(A).astype(NUMPY_DTYPE)
         A_weights /= (np.sqrt(self.n_genes) * float(np.std(A_weights)))
         A_weights = torch.nn.Parameter(torch.from_numpy(A_weights))
         # Ensure A_signs is on the same device as A_weights
         A_signs = A_signs.to(A_weights.device)
-
-        # First layer: GRN-informed transformation of control expression
-        self.grn_input_layer = GRNLayer(A_weights, A_signs, inverse=False, signed=signed, alpha=0.1)
+        self.grn_input_layer = GRNLayer(A_weights, A_signs, inverse=False, signed=signed)
         
-        # Middle layers: perturbation processing
+        # Middle layers: encode/decode between expression profiles and latent space
         self.encoder = torch.nn.Sequential(
+            torch.nn.LayerNorm(self.n_genes),
+            torch.nn.Dropout(p=0.4, inplace=True),
             torch.nn.PReLU(1),
             torch.nn.Linear(self.n_genes, self.n_hidden),
-            torch.nn.PReLU(1),
+            torch.nn.PReLU(self.n_hidden),
             torch.nn.Linear(self.n_hidden, self.n_hidden),
-            torch.nn.PReLU(1),
+            torch.nn.PReLU(self.n_hidden),
+            torch.nn.Linear(self.n_hidden, self.n_hidden),
+            torch.nn.PReLU(self.n_hidden),
         )
         
         self.decoder = torch.nn.Sequential(
+            torch.nn.LayerNorm(self.n_hidden),
             torch.nn.Linear(self.n_hidden, self.n_hidden),
-            torch.nn.PReLU(1),
+            torch.nn.PReLU(self.n_hidden),
+            torch.nn.Linear(self.n_hidden, self.n_hidden),
+            torch.nn.PReLU(self.n_hidden),
             torch.nn.Linear(self.n_hidden, self.n_genes),
             torch.nn.PReLU(1),
+            torch.nn.Dropout(p=0.4, inplace=True),
         )
         
         # Last layer: GRN-informed transformation to final expression
-        self.grn_output_layer = GRNLayer(A_weights, A_signs, inverse=True, signed=signed, alpha=0.1)
+        self.grn_output_layer = GRNLayer(A_weights, A_signs, inverse=True, signed=signed)
 
     def forward(self, x: torch.Tensor, pert: torch.LongTensor) -> torch.Tensor:
-        # Apply GRN transformation to control expression
+
+        # Encode each expression profile
         x = self.grn_input_layer(x)
         y = self.encoder(x)
+
+        # Each perturbation is a linear transformation in the latent space.
+        # Apply perturbation transform to the encoded profile.
         z = self.perturbation_embedding(pert)
-        y = y + z
+        #y = y + z
+        z = self.perturbation_embedding(pert).view(len(x), self.n_hidden, self.n_hidden)
+        y = torch.einsum('ij,ijk->ik', y, z)
+
+        # Decode each expression profile
         y = self.decoder(y)
         y = self.grn_output_layer(y)
         return y
 
+    def set_grn(self, A: np.ndarray) -> None:
+        signed = np.any(A < 0)
+        A_signs = torch.from_numpy(np.sign(A).astype(NUMPY_DTYPE))
+        A_weights = np.copy(A).astype(NUMPY_DTYPE)
+        A_weights /= (np.sqrt(self.n_genes) * float(np.std(A_weights)))
+        A_weights = torch.nn.Parameter(torch.from_numpy(A_weights))
+        # Ensure A_signs is on the same device as A_weights
+        A_signs = A_signs.to(A_weights.device)
+        self.grn_input_layer = GRNLayer(A_weights, A_signs, inverse=False, signed=self.signed)
+        self.grn_output_layer = GRNLayer(A_weights, A_signs, inverse=True, signed=self.signed)
+
 
 class PerturbationDataset(Dataset):
 
@@ -337,8 +263,9 @@ def __len__(self) -> int:
     def __getitem__(self, i: int) -> Tuple[torch.Tensor, int, torch.Tensor]:
         i = self.idx[i]
         y = torch.from_numpy(self.X[i, :])
-        group = int(self.match_groups[i])
 
+        # Find matched control
+        group = int(self.match_groups[i])
         if group in self.control_map:
             j = int(self.control_map[group])
         else:
@@ -347,75 +274,84 @@ def __getitem__(self, i: int) -> Tuple[torch.Tensor, int, torch.Tensor]:
 
         x = torch.from_numpy(self.X[j, :])
         p = int(self.perturbations[i])
-        return x, p, y
+        d_x = y - x
+        return x, p, d_x
 
 
+def coefficients_of_determination(y_target: np.ndarray, y_pred: np.ndarray, eps: float = 1e-20) -> np.ndarray:
+    residuals = np.square(y_target - y_pred)
+    ss_res = np.sum(residuals, axis=0) + eps
+    mean = np.mean(y_target, axis=0)[np.newaxis, :]
+    residuals = np.square(y_target - mean)
+    ss_tot = np.sum(residuals, axis=0) + eps
+    return 1 - ss_res / ss_tot
 
-def evaluate(A, train_data_loader, test_data_loader, n_perturbations: int) -> Tuple[float, float]:
-    # Training
 
-    signed = np.any(A < 0)
-    model = Model(A, n_perturbations, n_hidden=16, signed=signed)
+def evaluate(A, train_data_loader, test_data_loader, state_dict, n_perturbations: int, signed: bool = True) -> np.ndarray:
+    set_seed()
+    A = np.copy(A)
+    model = Model(A, n_perturbations, signed=signed)
     model = model.to(DEVICE)
-    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
+    model.load_state_dict(state_dict, strict=False)
+    model.set_grn(A)
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-6)
     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
         optimizer, mode='min', patience=5,
-        min_lr=1e-5, cooldown=3, factor=0.8
+        min_lr=1e-6, cooldown=3, factor=0.8
     )
-    pbar = tqdm.tqdm(range(100))  # Reduced epochs for faster testing
-    best_val_loss = float('inf')
-    best_ss_res = None
-    best_epoch = 0
+    pbar = tqdm.tqdm(range(300))
+    best_avg_r2, best_r2 = -np.inf, None
     model.train()
     for epoch in pbar:
         total_loss = 0
-        for x, pert, y in train_data_loader:
-            x, pert, y = x.to(DEVICE), pert.to(DEVICE), y.to(DEVICE)
+        y_target, y_pred = [], []
+        for x, pert, d_x in train_data_loader:
+            x, pert, d_x = x.to(DEVICE), pert.to(DEVICE), d_x.to(DEVICE)
+
+            # Reset gradients
             optimizer.zero_grad()
+
             # Model now predicts full perturbed expression directly
-            y_hat = model(x, pert)
-            loss = torch.mean(torch.square(y - y_hat))
+            d_x_hat = model(x, pert)
+            y_target.append(d_x.cpu().data.numpy())
+            y_pred.append(d_x_hat.cpu().data.numpy())
+
+            # Compute mean squared error
+            loss = torch.mean(torch.square(d_x - d_x_hat))
+            total_loss += loss.item() * len(x)
+
+            # Compute gradients (clip them to prevent divergence)
             loss.backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
+
+            # Update parameters
             optimizer.step()
-            total_loss += loss.item() * len(x)
-        pbar.set_description(str(total_loss))
+
         scheduler.step(total_loss)
+        r2_train = coefficients_of_determination(np.concatenate(y_target, axis=0), np.concatenate(y_pred, axis=0))
 
         model.eval()
-        ss_res = 0
+        y_target, y_pred = [], []
         with torch.no_grad():
-            for x, pert, y in test_data_loader:
-                x, pert, y = x.to(DEVICE), pert.to(DEVICE), y.to(DEVICE)
-                # Model predicts full perturbed expression
-                y_hat = model(x, pert)
-                residuals = torch.square(y - y_hat).cpu().data.numpy()
-                ss_res += np.sum(residuals, axis=0)
-        if np.sum(ss_res) < best_val_loss:
-            best_val_loss = np.sum(ss_res)
-            best_epoch = epoch
-            best_ss_res = ss_res
+            for x, pert, d_x in test_data_loader:
+                x, pert, d_x = x.to(DEVICE), pert.to(DEVICE), d_x.to(DEVICE)
+                d_x_hat = model(x, pert)
+                y_target.append(d_x.cpu().data.numpy())
+                y_pred.append(d_x_hat.cpu().data.numpy())
+        r2_test = coefficients_of_determination(np.concatenate(y_target, axis=0), np.concatenate(y_pred, axis=0))
+        avg_r2 = np.mean(r2_test)
+        if avg_r2 > best_avg_r2:
+            best_avg_r2 = avg_r2
+            best_r2 = r2_test
+        pbar.set_description(str(np.mean(r2_train)) + "  " + str(np.mean(r2_test)))
         model.train()
-    ss_res = best_ss_res
-
-    # Final evaluation with PDS
     model.eval()
 
-    ss_tot = 0
-
-    with torch.no_grad():
-        for x, pert, y in test_data_loader:
-            x, pert, y = x.to(DEVICE), pert.to(DEVICE), y.to(DEVICE)
-            y_hat = model(x, pert)
-
-            residuals = torch.square(y - torch.mean(y, dim=0).unsqueeze(0)).cpu().data.numpy()
-            ss_tot += np.sum(residuals, axis=0)
-
-    print(f"Best epoch: {best_epoch} ({best_val_loss})")
-    return best_ss_res, ss_tot
-
+    return best_r2
 
 
 def main(par):
+
     # Load evaluation data
     adata = ad.read_h5ad(par['evaluation_data'])
     dataset_id = adata.uns['dataset_id']
@@ -443,6 +379,14 @@ def main(par):
     cv_groups = encode_obs_cols(adata, par['cv_groups'])
     match_groups = encode_obs_cols(adata, par['match'])
     loose_match_groups = encode_obs_cols(adata, par['loose_match'])
+
+    # Get cell types
+    N_FOLDS = 5
+    try:
+        cell_types = np.squeeze(encode_obs_cols(adata, ["cell_type"]))
+    except Exception:
+        print(traceback.format_exc())
+        cell_types = np.random.randint(0, 5, size=len(X))
     
     # Set perturbations to first column (perturbation)
     perturbations = cv_groups[0]  # perturbation codes
@@ -513,25 +457,26 @@ def main(par):
     # Remove self-regulations
     np.fill_diagonal(A, 0)
 
-    # Create baseline model
-    A_baseline = np.copy(A)
-    for j in range(A.shape[1]):
-        np.random.shuffle(A[:j, j])
-        np.random.shuffle(A[j+1:, j])
-    assert np.any(A_baseline != A)
-
     # Mapping between gene expression profiles and their matched negative controls
-
     control_map, _ = create_control_matching(are_controls, match_groups)
     loose_control_map, _ = create_control_matching(are_controls, loose_match_groups)
 
-    ss_res = 0
-    ss_tot = 0
-    cv = GroupKFold(n_splits=5)
+    r2 = []
+    r2_baseline = []
+    cv = StratifiedGroupKFold(n_splits=N_FOLDS, shuffle=True, random_state=0xCAFE)
     
-    results = []
-    
-    for i, (train_index, test_index) in enumerate(cv.split(X, X, cv_groups)):
+    for i, (train_index, test_index) in enumerate(cv.split(X, perturbations, cell_types)):
+
+        if (len(train_index) == 0) or (len(test_index) == 0):
+            continue
+
+        # Create baseline model
+        A_baseline = np.copy(A)
+        for j in range(A_baseline.shape[1]):
+            np.random.shuffle(A_baseline[:j, j])
+            np.random.shuffle(A_baseline[j+1:, j])
+        assert np.any(A_baseline != A)
+
         # Center and scale dataset
         scaler = StandardScaler()
         scaler.fit(X[train_index, :])
@@ -539,45 +484,63 @@ def main(par):
 
         # Create data loaders
         n_perturbations = int(np.max(perturbations) + 1)
-        train_dataset = PerturbationDataset(
-            X_standardized,
-            train_index,
-            match_groups,
-            control_map,
-            loose_match_groups,
-            loose_control_map,
-            perturbations
-        )
-        train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=512)
-        test_dataset = PerturbationDataset(
-            X_standardized,
-            test_index,
-            match_groups,
-            control_map,
-            loose_match_groups,
-            loose_control_map,
-            perturbations
-        )
-        test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=512)
+        def create_data_loaders():
+            train_dataset = PerturbationDataset(
+                X_standardized,
+                train_index,
+                match_groups,
+                control_map,
+                loose_match_groups,
+                loose_control_map,
+                perturbations
+            )
+            train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=512, shuffle=True)
+            test_dataset = PerturbationDataset(
+                X_standardized,
+                test_index,
+                match_groups,
+                control_map,
+                loose_match_groups,
+                loose_control_map,
+                perturbations
+            )
+            test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=512)
+            return train_data_loader, test_data_loader
+
+        # For fair comparison, we first randomly initialize NN parameters, and use these same
+        # parameters to build both models (only the GRN weights will differ).
+        signed = np.any(A < 0)
+        model_template = Model(A, n_perturbations, signed=signed).to(DEVICE)
+        state_dict = model_template.state_dict()
 
         # Evaluate inferred GRN
-        res = evaluate(A, train_data_loader, test_data_loader, n_perturbations)
-        ss_res = ss_res + res[0]
-        ss_tot = ss_tot + res[1]
+        train_data_loader, test_data_loader = create_data_loaders()
+        r2.append(evaluate(A, train_data_loader, test_data_loader, state_dict, n_perturbations, signed=signed))
 
         # Evaluate baseline GRN (shuffled target genes)
-        #ss_tot = ss_tot + evaluate(A_baseline, train_data_loader, test_data_loader, n_perturbations)
+        train_data_loader, test_data_loader = create_data_loaders()
+        r2_baseline.append(evaluate(A_baseline, train_data_loader, test_data_loader, state_dict, n_perturbations, signed=signed))
+
+        print(f"Mean R2 (fold {i + 1})", np.mean(r2), np.mean(r2_baseline))
+
+    r2 = np.asarray(r2).flatten()
+    r2_baseline = np.asarray(r2_baseline).flatten()
 
-    r2 = 1 - ss_res / ss_tot
+    print("Mean R2", np.mean(r2), np.mean(r2_baseline))
+
+    if np.all(r2 == r2_baseline):
+        final_score = 0
+    else:
+        print(np.mean(r2), np.mean(r2_baseline))
+        p_value = wilcoxon(r2, r2_baseline, alternative="greater").pvalue
+        final_score = -np.log10(p_value)
 
-    final_score = np.mean(np.clip(r2, 0, 1))
     print(f"Method: {method_id}")
-    print(f"R2: {final_score}")
+    print(f"Final score: {final_score}")
 
     results = {
         'vc': [float(final_score)]
-
     }
 
     df_results = pd.DataFrame(results)
-    return df_results
\ No newline at end of file
+    return df_results
diff --git a/src/metrics/vc/improved_helper.py b/src/metrics/vc/improved_helper.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/metrics/vc/new_helper.py b/src/metrics/vc/new_helper.py
new file mode 100644
index 000000000..c3eccbaa9
--- /dev/null
+++ b/src/metrics/vc/new_helper.py
@@ -0,0 +1,299 @@
+import os
+import traceback
+from typing import Tuple, Dict
+import sys
+import tqdm
+import random
+import h5py
+import numpy as np
+import pandas as pd
+import torch
+import xgboost
+from scipy.sparse import csr_matrix
+from scipy.stats import wilcoxon
+from sklearn.decomposition import PCA
+from sklearn.linear_model import Ridge
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold
+from sklearn.preprocessing import StandardScaler, LabelEncoder, TargetEncoder
+from torch.utils.data import Dataset
+import anndata as ad
+import warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+
+os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' # For reproducibility purposes (on GPU)
+
+# Hyper-parameters
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+NUMPY_DTYPE = np.float32
+
+# For reproducibility purposes
+def set_seed():
+    seed = 0xCAFE
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.use_deterministic_algorithms(True)
+
+from util import read_prediction, manage_layer
+from dataset_config import DATASET_GROUPS
+from scipy.spatial.distance import cityblock
+
+
+def combine_multi_index(*arrays) -> np.array:
+    """Combine parallel label arrays into a single integer label per position."""
+    A = np.stack(arrays)
+    n_classes = tuple(int(A[i].max()) + 1 for i in range(A.shape[0]))
+    return np.ravel_multi_index(A, dims=n_classes, order='C')
+
+
+def encode_obs_cols(adata, cols):
+    encoded = []
+    for col in cols:
+        if col in adata.obs:
+            codes = LabelEncoder().fit_transform(adata.obs[col].values)
+            encoded.append(codes)
+    return encoded
+
+
+def create_control_matching(are_controls: np.ndarray, match_groups: np.ndarray) -> Tuple[Dict[int, int], np.ndarray]:
+    control_indices = np.where(are_controls)[0]
+    
+    if len(control_indices) == 0:
+        raise ValueError("No control samples found in dataset!")
+    
+    # First, try to create exact matching (original approach)
+    control_map = {}
+    for i, (is_control, group_id) in enumerate(zip(are_controls, match_groups)):
+        if is_control:
+            control_map[int(group_id)] = i
+
+    return control_map, match_groups
+
+
+class Model(object):
+
+    def __init__(self, A: np.ndarray):
+        self.n_genes: int = A.shape[1]
+        self.A: np.ndarray = A
+        self._te = [TargetEncoder() for _ in range(self.n_genes)]
+        self._models = [xgboost.XGBRegressor(n_estimators=100) for _ in range(self.n_genes)]
+        #self._models = [RandomForestRegressor() for _ in range(self.n_genes)]
+
+    def fit(self, X_train: np.ndarray, Y_train: np.ndarray, p_train: np.ndarray) -> None:
+        Z_train = []
+        for j in range(self.n_genes):
+            self._te[j].fit(p_train[:, np.newaxis], Y_train[:, j])
+            Z_train.append(np.squeeze(self._te[j].transform(p_train[:, np.newaxis])))
+        Z_train = np.asarray(Z_train).T
+        for j in tqdm.tqdm(range(self.n_genes), desc="Fit"):
+            mask = (self.A[:, j] != 0)
+            mask[j] = True
+            if np.any(mask):
+                self._models[j].fit(np.concatenate((X_train[:, mask], Z_train[:, mask]), axis=1), Y_train[:, j])
+
+    def predict(self, X_test: np.ndarray, p_test: np.ndarray) -> np.ndarray:
+        Z_test = []
+        for j in range(self.n_genes):
+            Z_test.append(np.squeeze(self._te[j].transform(p_test[:, np.newaxis])))
+        Z_test = np.asarray(Z_test).T
+        Y_hat = []
+        for j in tqdm.tqdm(range(self.n_genes), desc="Predict"):
+            mask = (self.A[:, j] != 0)
+            mask[j] = True
+            if np.any(mask):
+                Y_hat.append(self._models[j].predict(np.concatenate((X_test[:, mask], Z_test[:, mask]), axis=1)))
+            else:
+                Y_hat.append(np.zeros(len(X_test)))
+        return np.asarray(Y_hat).T
+
+
+def main(par):
+
+    # Load evaluation data
+    adata = ad.read_h5ad(par['evaluation_data'])
+    dataset_id = adata.uns['dataset_id']
+    method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id']
+    
+    # Configure dataset-specific groups
+    par['cv_groups'] = DATASET_GROUPS[dataset_id]['cv']
+    par['match'] = DATASET_GROUPS[dataset_id]['match'] 
+    par['loose_match'] = DATASET_GROUPS[dataset_id]['loose_match']
+
+    layer = manage_layer(adata, par)
+    X = adata.layers[layer]
+    if isinstance(X, csr_matrix):
+        X = X.toarray()
+    X = X.astype(np.float32)
+    
+    gene_names = adata.var_names
+    gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
+
+    # Get sample info
+    perturbations = None
+    cv_groups, match_groups, loose_match_groups = [], [], []
+    
+    # Encode observation columns
+    cv_groups = encode_obs_cols(adata, par['cv_groups'])
+    match_groups = encode_obs_cols(adata, par['match'])
+    loose_match_groups = encode_obs_cols(adata, par['loose_match'])
+
+    # Get cell types
+    N_FOLDS = 5
+    try:
+        cell_types = np.squeeze(encode_obs_cols(adata, ["cell_type"]))
+    except Exception:
+        print(traceback.format_exc())
+        cell_types = np.random.randint(0, 5, size=len(X))
+
+    # Set perturbations to first column (perturbation)
+    perturbations = cv_groups[0]  # perturbation codes
+    
+    # Groups used for cross-validation
+    cv_groups = combine_multi_index(*cv_groups)
+
+    # Groups used for matching with negative controls
+    match_groups = combine_multi_index(*match_groups)
+
+    # Groups used for loose matching with negative controls
+    loose_match_groups = combine_multi_index(*loose_match_groups)
+
+    # Check for control samples - this is required for perturbation evaluation
+    if "is_control" not in adata.obs.columns:
+        raise ValueError("Dataset must contain 'is_control' column for perturbation evaluation")
+    
+    are_controls = adata.obs["is_control"].values.astype(bool)
+    
+    # Check if we have any control samples
+    n_controls = np.sum(are_controls)
+    n_perturbed = np.sum(~are_controls)
+    
+    if n_controls == 0:
+        raise ValueError(f"No control samples found in dataset! Found {n_perturbed} perturbed samples but 0 controls. "
+                        "Perturbation evaluation requires control samples for comparison.")
+    
+    print(f"Found {n_controls} control samples and {n_perturbed} perturbed samples")
+
+    # Load inferred GRN
+    net = read_prediction(par)
+    sources = net["source"].to_numpy()
+    targets = net["target"].to_numpy()
+    weights = net["weight"].to_numpy()
+
+    A = np.zeros((len(gene_names), len(gene_names)), dtype=X.dtype)
+    for source, target, weight in zip(sources, targets, weights):
+        if (source in gene_dict) and (target in gene_dict):
+            i = gene_dict[source]
+            j = gene_dict[target]
+            A[i, j] = float(weight)
+
+    # Only consider the genes that are actually present in the inferred GRN
+    gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0))
+    X = X[:, gene_mask]
+    A = A[gene_mask, :][:, gene_mask]
+    gene_names = gene_names[gene_mask]
+
+    # Filter genes based on GRN instead of HVGs
+    # Keep all genes that are present in the GRN (already filtered above)
+    print(f"Using {len(gene_names)} genes present in the GRN")
+    
+    # Additional memory-aware gene filtering for very large GRNs
+    MAX_GENES_FOR_MEMORY = 3000  # Reduced further to avoid memory issues
+    if len(gene_names) > MAX_GENES_FOR_MEMORY:
+        print(f"Too many genes ({len(gene_names)}) for memory. Selecting top {MAX_GENES_FOR_MEMORY} by GRN connectivity.")
+        
+        # Select genes with highest connectivity in the GRN
+        gene_connectivity = np.sum(np.abs(A), axis=0) + np.sum(np.abs(A), axis=1)
+        top_gene_indices = np.argsort(gene_connectivity)[-MAX_GENES_FOR_MEMORY:]
+        
+        X = X[:, top_gene_indices]
+        A = A[top_gene_indices, :][:, top_gene_indices]
+        gene_names = gene_names[top_gene_indices]
+        
+        print(f"Final: Using {len(gene_names)} most connected genes for evaluation")
+
+    # Remove self-regulations
+    np.fill_diagonal(A, 0)
+
+    # Mapping between gene expression profiles and their matched negative controls
+    control_map, _ = create_control_matching(are_controls, match_groups)
+    loose_control_map, _ = create_control_matching(are_controls, loose_match_groups)
+
+    # Build dataset
+    X_, Y_ = [], []
+    for i, (match_group, loose_match_group) in enumerate(zip(match_groups, loose_match_groups)):
+        if match_group in control_map:
+            i_prime = control_map[match_group]
+        else:
+            i_prime = loose_control_map[loose_match_group]
+        X_.append(X[i_prime, :])
+        Y_.append(X[i, :])
+    X, Y = np.asarray(X_), np.asarray(Y_)
+    Y = Y - X
+    del X_, Y_
+
+    eps = 1e-20
+    r2, r2_baseline = [], []
+    cv = StratifiedGroupKFold(n_splits=N_FOLDS, shuffle=True, random_state=0xCAFE)
+    for i, (train_index, test_index) in enumerate(cv.split(X, perturbations, cell_types)):
+
+        if (len(train_index) == 0) or (len(test_index) == 0):
+            continue
+
+        # Create baseline model
+        A_baseline = np.copy(A)
+        for j in range(A_baseline.shape[1]):
+            np.random.shuffle(A_baseline[:j, j])
+            np.random.shuffle(A_baseline[j+1:, j])
+        assert np.any(A_baseline != A)
+
+        # Center and scale dataset
+        scaler = StandardScaler()
+        X_train = scaler.fit_transform(X[train_index, :])
+        X_test = scaler.transform(X[test_index, :])
+        scaler = StandardScaler()
+        Y_train = scaler.fit_transform(Y[train_index, :])
+        Y_test = scaler.transform(Y[test_index, :])
+        p_train = perturbations[train_index]
+        p_test = perturbations[test_index]
+
+        model = Model(A)
+        model.fit(X_train, Y_train, p_train)
+        Y_hat = model.predict(X_test, p_test)
+        ss_res = np.sum(np.square(Y_test - Y_hat)) + eps
+        ss_tot = np.sum(np.square(Y_test - np.mean(Y_test, axis=0)[np.newaxis, :])) + eps
+        r2.append(1 - ss_res / ss_tot)
+
+        model = Model(A_baseline)
+        model.fit(X_train, Y_train, p_train)
+        Y_hat = model.predict(X_test, p_test)
+        ss_res = np.sum(np.square(Y_test - Y_hat)) + eps
+        ss_tot = np.sum(np.square(Y_test - np.mean(Y_test, axis=0)[np.newaxis, :])) + eps
+        r2_baseline.append(1 - ss_res / ss_tot)
+
+        print("r2", np.mean(r2), np.mean(r2_baseline))
+
+    r2 = np.asarray(r2).flatten()
+    r2_baseline = np.asarray(r2_baseline).flatten()
+
+    print("R2", np.mean(r2), np.mean(r2_baseline))
+
+    if np.all(r2 == r2_baseline):
+        final_score = 0
+    else:
+        print(np.mean(r2), np.mean(r2_baseline))
+        p_value = wilcoxon(r2, r2_baseline, alternative="greater").pvalue
+        final_score = -np.log10(p_value)
+
+    print(f"Method: {method_id}")
+    print(f"R2: {final_score}")
+
+    results = {
+        'vc': [float(final_score)]
+    }
+
+    df_results = pd.DataFrame(results)
+    return df_results
diff --git a/src/metrics/vc/ultra_fast_helper.py b/src/metrics/vc/ultra_fast_helper.py
deleted file mode 100644
index e69de29bb..000000000

From 347a4e13e05182d9be3325a4216ceff28e80f778 Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Sun, 5 Oct 2025 14:45:01 +0200
Subject: [PATCH 03/19] Fix recovery_2 metric

---
 src/metrics/recovery_2/helper.py | 369 ++++++++++++++++++-------------
 src/metrics/vc/helper.py         |  28 +--
 2 files changed, 226 insertions(+), 171 deletions(-)

diff --git a/src/metrics/recovery_2/helper.py b/src/metrics/recovery_2/helper.py
index d28e53fe9..141c5081c 100644
--- a/src/metrics/recovery_2/helper.py
+++ b/src/metrics/recovery_2/helper.py
@@ -1,182 +1,237 @@
+from typing import Tuple, List, Optional
+
 import scipy
 import numpy as np
+from scipy.sparse import csr_matrix
+from scipy.stats import spearmanr, pearsonr, wilcoxon
+from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
 import pandas as pd
 import anndata as ad
 import warnings
-import scipy
 warnings.filterwarnings("ignore")
 from util import format_save_score, read_prediction
 
-from scipy.stats import spearmanr
 np.random.seed(0)
 from util import manage_layer
-def actual_calculation(adata, target_genes, group):
-    adata = adata[adata.obs['is_control'] == False] # only use non-control cells
-    target_genes = [g for g in target_genes if g in adata.var_names]
-    target_adata = adata[:, target_genes]
-    target_expr = pd.DataFrame(
-        target_adata.X.toarray() if scipy.sparse.issparse(target_adata.X) else target_adata.X,
-        index=target_adata.obs.index,
-        columns=target_adata.var_names
-    )
-    for g in group:
-        target_expr[g] = target_adata.obs[g].values
-    group_consistencies = {}
-    for g_vals, df_group in target_expr.groupby(group):
-        expr_data = df_group.drop(columns=group)
-        if len(expr_data) < 2:
-            continue  # skip if only one sample
-
-        rows = expr_data.values
-        corr_matrix = np.corrcoef(rows)
-        correlation = corr_matrix[np.triu_indices_from(corr_matrix, k=1)] 
-        correlation = correlation[~np.isnan(correlation)]
-        correlation = np.mean(correlation)
-        assert isinstance(correlation, float)
-        group_consistencies[g_vals] = correlation
-    return group_consistencies
-
-def calculate_target_gene_consistency(adata, net, group, min_targets_t=10, max_targets_t=20):
-    """
-    Calculate consistency score for each TF based on target gene expression across groups.
-    Returns a dictionary with TF as key and consistency score as value.
-    """
-    from scipy.stats import ttest_rel
-    
-    adata = adata[adata.obs['is_control'] == False]
-    all_genes = adata.var_names.tolist()
-    tf_size = net.groupby('source').size()
-    tfs = tf_size[(tf_size > min_targets_t)].index
-    net = net[net['source'].isin(tfs)]
-    
-    all_values = []
-    all_random_values = []
-    
-    for tf in tfs:
-        target_genes = net[net['source'] == tf]
-        target_genes = target_genes.sort_values(by='weight', ascending=False, key=abs).head(max_targets_t)
-        target_genes = target_genes['target'].tolist()
-            
-        actual_consistencies = actual_calculation(adata, target_genes, group)
-        
-        n_targets = len(target_genes)
-        random_targets = np.random.choice(all_genes, size=n_targets, replace=False)
-        random_consistencies = actual_calculation(adata, random_targets, group)
-
-        # Get consistency values for groups that exist in both actual and random
-        common_groups = actual_consistencies.keys()
-        if len(common_groups) < 2:
-            continue  # Need at least 2 groups for paired t-test
-            
-        actual_values = [actual_consistencies[g] for g in common_groups]
-        random_values = [random_consistencies[g] for g in common_groups]
-        assert len(actual_values) == len(random_values)
-        all_values.append(np.mean(actual_values))
-        all_random_values.append(np.mean(random_values))
-
-    return all_values, all_random_values
-
-def evaluate_setting(evaluation_adata, net, group, setting_name, n_tfs=None, max_targets_per_tf=None):
+
+
+def encode_obs_cols(adata, cols):
+    """Encode observation columns to integer codes."""
+    encoded = []
+    for col in cols:
+        if col in adata.obs:
+            codes = LabelEncoder().fit_transform(adata.obs[col].values)
+            encoded.append(codes)
+    return encoded
+
+
+def combine_multi_index(*arrays) -> np.array:
+    """Combine parallel label arrays into a single integer label per position."""
+    A = np.stack(arrays)
+    n_classes = tuple(int(A[i].max()) + 1 for i in range(A.shape[0]))
+    return np.ravel_multi_index(A, dims=n_classes, order='C')
+
+
+def evaluate_grn(
+        C: np.ndarray,
+        A: np.ndarray,
+        n_tfs: Optional[int] = None,
+        max_targets_per_tf: Optional[int] = None
+) -> float:
     """
     Evaluate a specific setting (recall, balanced, or precision)
     """
-    from scipy.stats import wilcoxon
+
+    # Compute the min and max correlation coefficients across groups
+    C_min = np.min(C, axis=0)
+    C_max = np.max(C, axis=0)
     
-    # Filter TFs if specified
-    if n_tfs is not None:
-        tf_counts = net.groupby('source').size().sort_values(ascending=False)
-        top_tfs = tf_counts.head(n_tfs).index
-        filtered_net = net[net['source'].isin(top_tfs)]
+    # Filter TFs if specified (keep the TFs with the most target genes)
+    if n_tfs is None:
+        tf_idx = np.arange(A.shape[1])
     else:
-        filtered_net = net.copy()
-    
-    # Set max_targets parameter
-    if max_targets_per_tf is None:
-        max_targets_per_tf = 50  # default value
-    
-    actual_values, random_values = calculate_target_gene_consistency(
-        evaluation_adata, filtered_net, group, max_targets_t=max_targets_per_tf
-    )
-    
-    # Convert to numpy arrays for arithmetic operations
-    actual_values = np.array(actual_values)
-    random_values = np.array(random_values)
-    
-    # Statistical test
-    res = wilcoxon(actual_values - random_values, zero_method='wilcox', alternative='greater')
-    p_value = res.pvalue
-    if p_value == 0:
-        p_value = 1e-300
-    final_score = -np.log10(p_value)
-    
-    result = {
-        'setting': setting_name,
-        'value_min': np.min(actual_values),
-        'value_random_min': np.min(random_values),
-        'value_median': np.median(actual_values),
-        'value_random_median': np.median(random_values),
-        'value_max': np.max(actual_values),
-        'value_random_max': np.max(random_values),
-        'value_mean': np.mean(actual_values),
-        'value_random_mean': np.mean(random_values),
-        'value_std': np.std(actual_values),
-        'value_random_std': np.std(random_values),
-        'n_values': len(actual_values),
-        'p_value': p_value,
-        'final_score': final_score,
-        'mean_difference': np.mean(actual_values - random_values)
-    }
-    
-    print(f"{setting_name}: {result}")
-    return result
+        tf_idx = np.argsort(np.sum(A != 0, axis=1))[-n_tfs:]
+    tf_mask = np.zeros(A.shape[1], dtype=bool)
+    tf_mask[tf_idx] = True
+
+    scores = []
+    for j in range(A.shape[1]):
+
+        if tf_mask[j]:
+
+            # Filter TGs if specified (keep the TGs with highest regulatory links)
+            if max_targets_per_tf is None:
+                tg_idx = np.where(A[j, :] != 0)[0]
+            else:
+                tg_idx = np.argsort(np.abs(A[j, :]))[-max_targets_per_tf:]
+                tg_idx = tg_idx[A[j, tg_idx] != 0]
+
+            # Compute scores
+            if len(tg_idx) == 0:
+                scores.append(np.nan)
+            else:
+                # Intuition behind the scoring scheme:
+                # our score is a weighted average of consistency scores,
+                # where the weights are proportional to regulatory weight magnitudes,
+                # and consistency scores are penalized by the max-min correlation spread.
+                # The more consistent correlation coefficients across groups, the higher
+                # the consistency scores.
+                weights = np.abs(A[j, tg_idx])
+                weights /= np.sum(weights)
+                consistency_scores = 2 - (C_max[j, tg_idx] - C_min[j, tg_idx])
+                tf_score = float(np.sum(weights * consistency_scores))
+                scores.append(tf_score)
+
+        else:
+            scores.append(np.nan)
+
+    return np.array(scores)
+
+
+def evaluate_setting(C: np.ndarray, A: np.ndarray, setting_name: str, **kwargs) -> float:
+
+    # Evaluate inferred GRN
+    scores = evaluate_grn(C, A, **kwargs)
+
+    # Create baseline GRN
+    A_baseline = np.copy(A)
+    for j in range(A.shape[1]):
+        np.random.shuffle(A_baseline[:j, j])
+        np.random.shuffle(A_baseline[j+1:, j])
+    assert np.any(A_baseline != A)
+
+    # Evaluate baseline GRN
+    scores_baseline = evaluate_grn(C, A_baseline, **kwargs)
+
+    # Skip NaNs
+    mask = ~np.logical_or(np.isnan(scores), np.isnan(scores_baseline))
+    scores = scores[mask]
+    scores_baseline = scores_baseline[mask]
+
+    # Compare results
+    if len(scores) == 0:
+        print(f"Mean corr. coef. for setting '{setting_name}': grn=nan, baseline=nan")
+        return 0.0
+    else:
+        print(f"Mean corr. coef. for setting '{setting_name}': grn={np.mean(scores):.3f}, baseline={np.mean(scores_baseline):.3f}")
+        try:
+            p_value = wilcoxon(scores, scores_baseline, alternative="greater", zero_method="pratt").pvalue
+            p_value = np.clip(p_value, 1e-300, 1)
+            print(f"p-value={p_value}")
+            #return -np.log10(p_value)
+            return np.mean(np.clip(scores - scores_baseline, 0, None))
+        except ValueError:
+            return 0.0
+
 
 def main(par):
-    evaluation_adata = ad.read_h5ad(par['evaluation_data'])
-    layer = manage_layer(evaluation_adata, par)
-    evaluation_adata.X = evaluation_adata.layers[layer]
-    net = read_prediction(par) # [source	target	weight]
+    # Load evaluation data
+    adata = ad.read_h5ad(par['evaluation_data'])
+    dataset_id = adata.uns['dataset_id']
+    method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id']
 
-    group = par['group']
-    for g in group:
-        assert g in evaluation_adata.obs.columns
+    # Manage layer
+    layer = manage_layer(adata, par)
+    X = adata.layers[layer]
+    if isinstance(X, csr_matrix):
+        X = X.toarray()
+    X = X.astype(np.float32)
 
-    # Evaluate three settings as specified in TODO
-    settings_results = []
-    
-    # 1. Recall setting: all TFs, all target genes (no filtering)
-    recall_result = evaluate_setting(
-        evaluation_adata, net, group, 
-        setting_name="recall", 
-        n_tfs=None,  # Use all TFs
-        max_targets_per_tf=None  # Use all target genes (up to default limit)
-    )
-    settings_results.append(recall_result)
-    
-    # 2. Balanced setting: top 100 TFs, top 100 target genes
-    balanced_result = evaluate_setting(
-        evaluation_adata, net, group, 
-        setting_name="balanced", 
-        n_tfs=100, 
-        max_targets_per_tf=100
-    )
-    settings_results.append(balanced_result)
-    
-    # 3. Precision setting: top 20 TFs, top 20 target genes
-    precision_result = evaluate_setting(
-        evaluation_adata, net, group, 
-        setting_name="precision", 
-        n_tfs=20, 
-        max_targets_per_tf=20
-    )
-    settings_results.append(precision_result)
+    gene_names = adata.var_names
+    gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
+
+    # Get groups
+    group_variables, group_names = [], []
+    if ("donor_id" in adata.obs) or ("plate_name" in adata.obs):
+        variables = encode_obs_cols(adata, ["donor_id", "plate_name"])
+        group_variables.append(combine_multi_index(*variables))
+        group_names.append("donor_id/plate_name")
+    if (len(group_variables) == 0) and ("cell_type" in adata.obs):
+        variables = encode_obs_cols(adata, ["cell_type"])
+        group_variables.append(combine_multi_index(*variables))
+        group_names.append("cell_type")
+    if len(group_variables) == 0:
+        variables = encode_obs_cols(adata, ["perturbation"])
+        group_variables.append(combine_multi_index(*variables))
+        group_names.append("perturbation")
+
+    # Load inferred GRN
+    df = read_prediction(par)
+    sources = df["source"].to_numpy()
+    targets = df["target"].to_numpy()
+    weights = df["weight"].to_numpy()
+    A = np.zeros((len(gene_names), len(gene_names)), dtype=X.dtype)
+    for source, target, weight in zip(sources, targets, weights):
+        if (source in gene_dict) and (target in gene_dict):
+            i = gene_dict[source]
+            j = gene_dict[target]
+            A[i, j] = float(weight)
+
+    # Only consider the genes that are actually present in the inferred GRN,
+    # and keep only the most-connected genes (for speed).
+    gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0))
+    X = X[:, gene_mask]
+    X = X.toarray() if isinstance(X, csr_matrix) else X
+    A = A[gene_mask, :][:, gene_mask]
+    gene_names = gene_names[gene_mask]
+
+    # Remove self-regulations
+    np.fill_diagonal(A, 0)
+    print(f"Evaluating {X.shape[1]} genes with {np.sum(A != 0)} regulatory links")
+
+    recall_results = []
+    balanced_results = []
+    precision_results = []
+    for group_name, groups in zip(group_names, group_variables):
+        print("\n" + "=" * 30)
+        print(f"Grouping samples by {group_name}")
+
+        # Compute correlation matrix for each group
+        S, C = [], []
+        for group in np.unique(groups):
+            mask = (groups == group)
+            if np.sum(mask) >= 3:
+                assert not np.any(np.isnan(X[mask, :]))
+                res = spearmanr(X[mask, :])
+                S.append(res.statistic)
+                C.append(res.pvalue)
+        S = np.nan_to_num(S, nan=0)
+        C = np.nan_to_num(C, nan=1)
+        #C = -np.sign(S) * np.log10(C)
+        #C = np.clip(C, -4, 4)
+        C = S
+        
+        # 1. Recall setting: all TFs, all target genes (no filtering)
+        recall_results.append(evaluate_setting(
+            C, A, "recall",
+            n_tfs=None,  # Use all TFs
+            max_targets_per_tf=None  # Use all target genes (up to default limit)
+        ))
+        
+        # 2. Balanced setting: top 100 TFs, top 100 target genes
+        balanced_results.append(evaluate_setting(
+            C, A, "balanced",
+            n_tfs=100, 
+            max_targets_per_tf=100
+        ))
+        
+        # 3. Precision setting: top 40 TFs, top 40 target genes
+        precision_results.append(evaluate_setting(
+            C, A, "precision",
+            n_tfs=40, 
+            max_targets_per_tf=40
+        ))
+
+    recovery_recall = np.mean(recall_results)
+    recovery_balanced = np.mean(balanced_results)
+    recovery_precision = np.mean(precision_results)
     
     # Create results DataFrame with all three scores
     results = pd.DataFrame({
-        'recovery_recall': [recall_result['final_score']],
-        'recovery_balanced': [balanced_result['final_score']],
-        'recovery_precision': [precision_result['final_score']],
+        'recovery_recall': [recovery_recall],
+        'recovery_balanced': [recovery_balanced],
+        'recovery_precision': [recovery_precision]
     })
+    print(results)
     
     return results
-
-
diff --git a/src/metrics/vc/helper.py b/src/metrics/vc/helper.py
index 3735f420d..bf224c46e 100644
--- a/src/metrics/vc/helper.py
+++ b/src/metrics/vc/helper.py
@@ -78,8 +78,8 @@ def __init__(
             A_signs: torch.Tensor,
             signed: bool = True,
             inverse: bool = True,
-            alpha: float = 0.1,
-            stable: bool = False,
+            alpha: float = 0.2,
+            stable: bool = True,
             bias: bool = True
     ):
         torch.nn.Module.__init__(self)
@@ -156,7 +156,7 @@ def neumann_series(A: torch.Tensor, alpha: float, k: int = 2) -> torch.Tensor:
 
 class Model(torch.nn.Module):
 
-    def __init__(self, A: np.ndarray, n_perturbations: int, n_hidden: int = 20, signed: bool = True):
+    def __init__(self, A: np.ndarray, n_perturbations: int, n_hidden: int = 16, signed: bool = True):
 
         # n_hidden needs to be small enough to prevent the NN from arbitrarily shifting the learning task
         # from the GRN to the MLPs.
@@ -215,10 +215,10 @@ def forward(self, x: torch.Tensor, pert: torch.LongTensor) -> torch.Tensor:
 
         # Each perturbation is a linear transformation in the latent space.
         # Apply perturbation transform to the encoded profile.
-        z = self.perturbation_embedding(pert)
-        #y = y + z
         z = self.perturbation_embedding(pert).view(len(x), self.n_hidden, self.n_hidden)
         y = torch.einsum('ij,ijk->ik', y, z)
+        #z = self.perturbation_embedding(pert)
+        #y = y + z
 
         # Decode each expression profile
         y = self.decoder(y)
@@ -391,8 +391,8 @@ def main(par):
     # Set perturbations to first column (perturbation)
     perturbations = cv_groups[0]  # perturbation codes
     
-    # Groups used for cross-validation
-    cv_groups = combine_multi_index(*cv_groups)
+    # Validation strategy: evaluate on unseen (perturbation, cell type) pairs.
+    cv_groups = combine_multi_index(cell_types, perturbations)
 
     # Groups used for matching with negative controls
     match_groups = combine_multi_index(*match_groups)
@@ -454,8 +454,8 @@ def main(par):
         
         print(f"Final: Using {len(gene_names)} most connected genes for evaluation")
 
-    # Remove self-regulations
-    np.fill_diagonal(A, 0)
+    # Add self-regulations
+    np.fill_diagonal(A, 1)
 
     # Mapping between gene expression profiles and their matched negative controls
     control_map, _ = create_control_matching(are_controls, match_groups)
@@ -465,7 +465,7 @@ def main(par):
     r2_baseline = []
     cv = StratifiedGroupKFold(n_splits=N_FOLDS, shuffle=True, random_state=0xCAFE)
     
-    for i, (train_index, test_index) in enumerate(cv.split(X, perturbations, cell_types)):
+    for i, (train_index, test_index) in enumerate(cv.split(X, perturbations, cv_groups)):
 
         if (len(train_index) == 0) or (len(test_index) == 0):
             continue
@@ -521,17 +521,14 @@ def create_data_loaders():
         train_data_loader, test_data_loader = create_data_loaders()
         r2_baseline.append(evaluate(A_baseline, train_data_loader, test_data_loader, state_dict, n_perturbations, signed=signed))
 
-        print(f"Mean R2 (fold {i + 1})", np.mean(r2), np.mean(r2_baseline))
+        break
 
     r2 = np.asarray(r2).flatten()
     r2_baseline = np.asarray(r2_baseline).flatten()
-
     print("Mean R2", np.mean(r2), np.mean(r2_baseline))
-
     if np.all(r2 == r2_baseline):
         final_score = 0
     else:
-        print(np.mean(r2), np.mean(r2_baseline))
         p_value = wilcoxon(r2, r2_baseline, alternative="greater").pvalue
         final_score = -np.log10(p_value)
 
@@ -539,6 +536,9 @@ def create_data_loaders():
     print(f"Final score: {final_score}")
 
     results = {
+        'r2': [float(np.mean(r2))],
+        'r2_baseline': [float(np.mean(r2_baseline))],
+        'r2_diff': [float(np.mean(r2)) - float(np.mean(r2_baseline))],
         'vc': [float(final_score)]
     }
 

From b63293977633ead2db40cb6a300092452d487075 Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Sun, 5 Oct 2025 19:43:46 +0200
Subject: [PATCH 04/19] recovery_2 metric: Add TG-TG correlations back

---
 src/metrics/recovery_2/helper.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/metrics/recovery_2/helper.py b/src/metrics/recovery_2/helper.py
index 141c5081c..2f42c2e0a 100644
--- a/src/metrics/recovery_2/helper.py
+++ b/src/metrics/recovery_2/helper.py
@@ -36,7 +36,9 @@ def evaluate_grn(
         C: np.ndarray,
         A: np.ndarray,
         n_tfs: Optional[int] = None,
-        max_targets_per_tf: Optional[int] = None
+        max_targets_per_tf: Optional[int] = None,
+        tf_tg: bool = True,
+        tg_tg: bool = True
 ) -> float:
     """
     Evaluate a specific setting (recall, balanced, or precision)
@@ -45,6 +47,7 @@ def evaluate_grn(
     # Compute the min and max correlation coefficients across groups
     C_min = np.min(C, axis=0)
     C_max = np.max(C, axis=0)
+    consistency_scores = 2 - (C_max - C_min)
     
     # Filter TFs if specified (keep the TFs with the most target genes)
     if n_tfs is None:
@@ -78,9 +81,12 @@ def evaluate_grn(
                 # the consistency scores.
                 weights = np.abs(A[j, tg_idx])
                 weights /= np.sum(weights)
-                consistency_scores = 2 - (C_max[j, tg_idx] - C_min[j, tg_idx])
-                tf_score = float(np.sum(weights * consistency_scores))
-                scores.append(tf_score)
+                overall_score = 0.0
+                if tf_tg:
+                    overall_score += float(np.sum(weights * consistency_scores[j, tg_idx]))
+                if tg_tg:
+                    overall_score += float(np.sum(weights * np.mean(consistency_scores[tg_idx, :][:, tg_idx], axis=0)))
+                scores.append(overall_score)
 
         else:
             scores.append(np.nan)
@@ -204,6 +210,7 @@ def main(par):
         # 1. Recall setting: all TFs, all target genes (no filtering)
         recall_results.append(evaluate_setting(
             C, A, "recall",
+            tf_tg=True, tg_tg=True,
             n_tfs=None,  # Use all TFs
             max_targets_per_tf=None  # Use all target genes (up to default limit)
         ))
@@ -211,6 +218,7 @@ def main(par):
         # 2. Balanced setting: top 100 TFs, top 100 target genes
         balanced_results.append(evaluate_setting(
             C, A, "balanced",
+            tf_tg=True, tg_tg=True,
             n_tfs=100, 
             max_targets_per_tf=100
         ))
@@ -218,6 +226,7 @@ def main(par):
         # 3. Precision setting: top 40 TFs, top 40 target genes
         precision_results.append(evaluate_setting(
             C, A, "precision",
+            tf_tg=True, tg_tg=True,
             n_tfs=40, 
             max_targets_per_tf=40
         ))

From 27c6fc390169e48721fe26f317aa84a4bd7bdf39 Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Thu, 9 Oct 2025 09:19:46 +0200
Subject: [PATCH 05/19] Fix recovery_2, regression_3 and sem metrics

---
 .../experimental/anchor_regression/helper.py  | 102 ++++--
 src/metrics/experimental/recovery_2/helper.py | 153 +++++----
 .../experimental/recovery_2/old_helper.py     | 292 ++++++++++++++++++
 .../experimental/regression_3/helper.py       |  76 ++---
 .../experimental/regression_3/script.py       |  14 +-
 src/metrics/experimental/sem/helper.py        |  36 ++-
 src/metrics/experimental/vc/helper.py         |  27 +-
 7 files changed, 548 insertions(+), 152 deletions(-)
 create mode 100644 src/metrics/experimental/recovery_2/old_helper.py

diff --git a/src/metrics/experimental/anchor_regression/helper.py b/src/metrics/experimental/anchor_regression/helper.py
index 9c29eb042..1c60dc904 100644
--- a/src/metrics/experimental/anchor_regression/helper.py
+++ b/src/metrics/experimental/anchor_regression/helper.py
@@ -75,8 +75,8 @@ def anchor_regression(
         X: np.ndarray,
         Z: np.ndarray,
         Y: np.ndarray,
-        l2_reg: float = 1e-2,
-        anchor_strength: float = 1.0
+        l2_reg: float = 1e-4,
+        gamma: float = 1.0
 ) -> np.ndarray:
     """Anchor regression for causal inference under confounding.
 
@@ -86,15 +86,15 @@ def anchor_regression(
             these environmental variables. Shape (n, z).
         Y: predicted variables, of shape (n, u).
         l2_reg: L2 regularization strength.
-        anchor_strength: Strength of anchor regularization. 
-                        0 = standard regression, higher = more anchor regularization.
+        gamma: Strength of anchor regularization. 
+            1 = standard regression, higher = more anchor regularization.
 
     Returns:
         Inferred parameters, of shape (d, u).
     """
 
     # Whitening transformation
-    W = get_whitening_transform(Z, gamma=anchor_strength)
+    W = get_whitening_transform(Z, gamma=gamma)
     X_t = W @ X
     Y_t = W @ Y
 
@@ -110,25 +110,50 @@ def compute_stabilities(
         X: np.ndarray,
         y: np.ndarray,
         Z: np.ndarray,
-        A: np.ndarray,
+        weights: np.ndarray,
         is_selected: np.ndarray,
         eps: float = 1e-50
 ) -> float:
-    theta0_signed = anchor_regression(X, Z, y, anchor_strength=1)
+    theta0_signed = anchor_regression(X, Z, y, gamma=1)
     theta0_signed = theta0_signed[is_selected]
-    theta0 = np.abs(theta0_signed)
-    theta0 /= np.sum(theta0)
+    #theta0 = np.abs(theta0_signed)
+    theta0 = theta0_signed / np.sum(np.abs(theta0_signed))
 
-    theta_signed = anchor_regression(X, Z, y, anchor_strength=20)
+    theta_signed = anchor_regression(X, Z, y, gamma=1.65)
     theta_signed = theta_signed[is_selected]
-    theta = np.abs(theta_signed)
-    theta /= np.sum(theta)
+    #theta = np.abs(theta_signed)
+    theta = theta_signed / np.sum(np.abs(theta_signed))
+
+    stabilities = np.clip(np.abs(theta0 - theta) / np.abs(theta0 + eps), 0, 1)
+    #stabilities[np.sign(theta0_signed) != np.sign(theta_signed)] = 0
+
+    weights = np.abs(weights[is_selected])
+    weights /= np.sum(weights)
+
+    return float(np.sum(weights * stabilities))
+
+
+def compute_stabilities_v2(
+        X: np.ndarray,
+        y: np.ndarray,
+        Z: np.ndarray,
+        A: np.ndarray,
+        is_selected: np.ndarray,
+        eps: float = 1e-50
+) -> Tuple[float, float]:
 
-    stabilities = np.clip((theta0 - theta) / (theta0 + eps), 0, 1)
-    stabilities[np.sign(theta0_signed) != np.sign(theta_signed)] = 0
+    theta0 = anchor_regression(X[:, is_selected], Z, y, gamma=1)
+    theta = anchor_regression(X[:, is_selected], Z, y, gamma=1.5)
+    s1 = np.clip(np.abs(theta0 - theta) / np.abs(theta0 + eps), 0, 1)
 
-    return stabilities
+    theta0 = anchor_regression(X[:, ~is_selected], Z, y, gamma=1)
+    theta = anchor_regression(X[:, ~is_selected], Z, y, gamma=1.5)
+    s2 = np.clip(np.abs(theta0 - theta) / np.abs(theta0 + eps), 0, 1)
 
+    s1 = np.mean(s1)
+    s2 = np.mean(s2)
+
+    return s1, s2
 
 
 def evaluate_gene_stability(
@@ -148,22 +173,21 @@ def evaluate_gene_stability(
         eps: Small epsilon for numerical stability.
         
     Returns:
-        Stability score for gene j
+        Stability score for gene j.
     """
     is_selected = np.array(A[:, j] != 0)
     if (not np.any(is_selected)) or np.all(is_selected):
-        return 0.0
+        return np.nan
     assert not is_selected[j]
 
     # Exclude target gene from features
     mask = np.ones(X.shape[1], dtype=bool)
     mask[j] = False
 
-    stabilities_selected = np.mean(compute_stabilities(X[:, mask], X[:, j], Z, A, is_selected[mask], eps=eps))
+    score = compute_stabilities(X[:, mask], X[:, j], Z, A[mask, j], is_selected[mask], eps=eps)
     #stabilities_non_selected = np.mean(compute_stabilities(X[:, mask], X[:, j], Z, A, ~is_selected[mask], eps=eps))
-
     #score = (stabilities_selected - stabilities_non_selected) / (stabilities_selected + stabilities_non_selected + eps)
-    score = np.mean(stabilities_selected)
+    #score = np.mean(stabilities_selected)
 
     return score
 
@@ -200,9 +224,12 @@ def main(par):
         raise ValueError(f"No anchor variables found in dataset for columns: {anchor_cols}")
     
     # One-hot encode anchor variables
-    Z = OneHotEncoder(sparse_output=False, dtype=np.float32).fit_transform(anchor_encoded.reshape(-1, 1))
+    Z = OneHotEncoder(drop="first", sparse_output=False, dtype=np.float32).fit_transform(anchor_encoded.reshape(-1, 1))
     print(f"Anchor matrix Z shape: {Z.shape}")
 
+    # Add intercept
+    Z = np.concatenate((Z, np.ones((len(Z), 1), dtype=np.float32)), axis=1)
+
     # Load inferred GRN
     df = read_prediction(par)
     sources = df["source"].to_numpy()
@@ -232,24 +259,43 @@ def main(par):
     np.fill_diagonal(A, 0)
     print(f"Evaluating {X.shape[1]} genes with {np.sum(A != 0)} regulatory links")
 
+    # Whether or not to take into account the regulatory modes (enhancer/inhibitor)
+    signed = np.any(A < 0)
+
     # Center and scale dataset
     scaler = StandardScaler()
     X = scaler.fit_transform(X)
 
-    # Create baseline GRN
+    # Create baseline model: for each TG, shuffle the TFs
     A_baseline = np.copy(A)
-    for j in range(A_baseline.shape[1]):
-        np.random.shuffle(A_baseline[:j, j])
-        np.random.shuffle(A_baseline[j+1:, j])
-    assert np.any(A != A_baseline)
-
+    if signed:
+        A_baseline *= (2 * (np.random.randint(0, 2) - 0.5))
+    tf_mask = np.any(A_baseline != 0, axis=1)
+    for j in range(A.shape[1]):
+        mask = np.copy(tf_mask)
+        mask[j] = False
+        if np.any(mask):
+            values = np.copy(A_baseline[mask, j])
+            np.random.shuffle(values)
+            A_baseline[mask, j] = values
+
+    # Compute gene stabilities
     scores, scores_baseline = [], []
     for j in tqdm.tqdm(range(X.shape[1]), desc="Evaluating gene stability"):
         scores.append(evaluate_gene_stability(X, Z, A, j))
+        scores_baseline.append(evaluate_gene_stability(X, Z, A_baseline, j))
     scores = np.array(scores)
+    scores_baseline = np.array(scores_baseline)
+
+    # Skip NaNs
+    mask = ~np.logical_or(np.isnan(scores), np.isnan(scores_baseline))
+    scores = scores[mask]
+    scores_baseline = scores_baseline[mask]
 
     # Calculate final score
-    final_score = np.mean(scores)
+    p_value = wilcoxon(scores, scores_baseline).pvalue
+    p_value = np.clip(p_value, 1e-300, 1)
+    final_score = -np.log10(p_value)
     print(f"Method: {method_id}")
     print(f"Anchor Regression Score: {final_score:.6f}")
 
diff --git a/src/metrics/experimental/recovery_2/helper.py b/src/metrics/experimental/recovery_2/helper.py
index 2f42c2e0a..c94a7c8d9 100644
--- a/src/metrics/experimental/recovery_2/helper.py
+++ b/src/metrics/experimental/recovery_2/helper.py
@@ -1,9 +1,10 @@
 from typing import Tuple, List, Optional
 
+import tqdm
 import scipy
 import numpy as np
 from scipy.sparse import csr_matrix
-from scipy.stats import spearmanr, pearsonr, wilcoxon
+from scipy.stats import spearmanr, pearsonr, wilcoxon, rankdata
 from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
 import pandas as pd
 import anndata as ad
@@ -38,16 +39,30 @@ def evaluate_grn(
         n_tfs: Optional[int] = None,
         max_targets_per_tf: Optional[int] = None,
         tf_tg: bool = True,
-        tg_tg: bool = True
+        tg_tg: bool = True,
+        signed: bool = True
 ) -> float:
     """
     Evaluate a specific setting (recall, balanced, or precision)
     """
 
-    # Compute the min and max correlation coefficients across groups
+    # Compute the min and max correlation coefficients across groups.
+    # If evaluation is signed, then the min-max spread occuring in the negative (-1, 0) range
+    # counts 3 times more if the inferred regulatory edge is positive, and vice versa.
     C_min = np.min(C, axis=0)
     C_max = np.max(C, axis=0)
-    consistency_scores = 2 - (C_max - C_min)
+    if signed:
+        is_positive = (A > 0)
+        is_negative = (A < 0)
+        pos_spread = np.clip(C_max, 0, 1) - np.clip(C_min, 0, 1)
+        neg_spread = np.clip(C_max, -1, 0) - np.clip(C_min, -1, 0)
+        spread = is_positive * (pos_spread + 3 * neg_spread)
+        spread += is_negative * (3 * pos_spread + neg_spread)
+        consistency_scores = 1 - 0.25 * spread
+        #consistency_scores = is_positive * C_min + is_negative * (-C_max)
+        #C_mean = np.mean(C, axis=0)
+    else:
+        consistency_scores = 1 - 0.5 * (C_max - C_min)
     
     # Filter TFs if specified (keep the TFs with the most target genes)
     if n_tfs is None:
@@ -58,7 +73,7 @@ def evaluate_grn(
     tf_mask[tf_idx] = True
 
     scores = []
-    for j in range(A.shape[1]):
+    for j in tqdm.tqdm(range(A.shape[1])):
 
         if tf_mask[j]:
 
@@ -82,10 +97,17 @@ def evaluate_grn(
                 weights = np.abs(A[j, tg_idx])
                 weights /= np.sum(weights)
                 overall_score = 0.0
+                n_terms = 0
                 if tf_tg:
                     overall_score += float(np.sum(weights * consistency_scores[j, tg_idx]))
+                    n_terms += 1
                 if tg_tg:
-                    overall_score += float(np.sum(weights * np.mean(consistency_scores[tg_idx, :][:, tg_idx], axis=0)))
+                    S = consistency_scores[tg_idx, :][:, tg_idx]
+                    np.fill_diagonal(S, 0)
+                    overall_score += float(np.sum(weights * np.sum(S, axis=0) / (len(S) - 1)))
+                    n_terms += 1
+                if n_terms > 0:
+                    overall_score /= n_terms
                 scores.append(overall_score)
 
         else:
@@ -96,15 +118,25 @@ def evaluate_grn(
 
 def evaluate_setting(C: np.ndarray, A: np.ndarray, setting_name: str, **kwargs) -> float:
 
+    # Whether or not to take into account the regulatory modes (enhancer/inhibitor)
+    signed = kwargs.get("signed", True)
+
     # Evaluate inferred GRN
     scores = evaluate_grn(C, A, **kwargs)
 
-    # Create baseline GRN
+    # Create baseline model: for each TG, shuffle the TFs
     A_baseline = np.copy(A)
+    if signed:
+        A_baseline *= (2 * (np.random.randint(0, 2) - 0.5))
+    tf_mask = np.any(A_baseline != 0, axis=1)
     for j in range(A.shape[1]):
-        np.random.shuffle(A_baseline[:j, j])
-        np.random.shuffle(A_baseline[j+1:, j])
-    assert np.any(A_baseline != A)
+        mask = np.copy(tf_mask)
+        mask[j] = False
+        if np.any(mask):
+            values = np.copy(A_baseline[mask, j])
+            np.random.shuffle(values)
+            A_baseline[mask, j] = values
+    # assert np.any(A_baseline != A)
 
     # Evaluate baseline GRN
     scores_baseline = evaluate_grn(C, A_baseline, **kwargs)
@@ -124,12 +156,20 @@ def evaluate_setting(C: np.ndarray, A: np.ndarray, setting_name: str, **kwargs)
             p_value = wilcoxon(scores, scores_baseline, alternative="greater", zero_method="pratt").pvalue
             p_value = np.clip(p_value, 1e-300, 1)
             print(f"p-value={p_value}")
-            #return -np.log10(p_value)
-            return np.mean(np.clip(scores - scores_baseline, 0, None))
+            return -np.log10(p_value)
         except ValueError:
             return 0.0
 
 
+def spearman_corrcoef(X: np.ndarray) -> np.ndarray:
+    R = np.apply_along_axis(rankdata, 0, X, method="average")
+    R -= R.mean(axis=0, keepdims=True)
+    R /= (R.std(axis=0, ddof=1, keepdims=True) + 1e-9)
+    C = (R.T @ R) / (R.shape[0] - 1)
+    np.fill_diagonal(C, 0.0)
+    return C
+
+
 def main(par):
     # Load evaluation data
     adata = ad.read_h5ad(par['evaluation_data'])
@@ -152,14 +192,9 @@ def main(par):
         variables = encode_obs_cols(adata, ["donor_id", "plate_name"])
         group_variables.append(combine_multi_index(*variables))
         group_names.append("donor_id/plate_name")
-    if (len(group_variables) == 0) and ("cell_type" in adata.obs):
-        variables = encode_obs_cols(adata, ["cell_type"])
-        group_variables.append(combine_multi_index(*variables))
-        group_names.append("cell_type")
     if len(group_variables) == 0:
-        variables = encode_obs_cols(adata, ["perturbation"])
-        group_variables.append(combine_multi_index(*variables))
-        group_names.append("perturbation")
+        group_variables.append(np.random.randint(0, 6, size=len(X)))
+        group_names.append("random_split")
 
     # Load inferred GRN
     df = read_prediction(par)
@@ -181,65 +216,69 @@ def main(par):
     A = A[gene_mask, :][:, gene_mask]
     gene_names = gene_names[gene_mask]
 
+    # Whether or not to take into account the regulatory modes (enhancer/inhibitor)
+    signed = np.any(A < 0)
+
     # Remove self-regulations
     np.fill_diagonal(A, 0)
     print(f"Evaluating {X.shape[1]} genes with {np.sum(A != 0)} regulatory links")
 
-    recall_results = []
-    balanced_results = []
-    precision_results = []
+    tftg_results = []
+    tgtg_results = []
+    tftg_tgtg_results = []
     for group_name, groups in zip(group_names, group_variables):
         print("\n" + "=" * 30)
         print(f"Grouping samples by {group_name}")
 
         # Compute correlation matrix for each group
-        S, C = [], []
+        C = []
         for group in np.unique(groups):
+            print(f"Computing correlations for group {group}")
             mask = (groups == group)
             if np.sum(mask) >= 3:
                 assert not np.any(np.isnan(X[mask, :]))
-                res = spearmanr(X[mask, :])
-                S.append(res.statistic)
-                C.append(res.pvalue)
-        S = np.nan_to_num(S, nan=0)
-        C = np.nan_to_num(C, nan=1)
-        #C = -np.sign(S) * np.log10(C)
-        #C = np.clip(C, -4, 4)
-        C = S
-        
-        # 1. Recall setting: all TFs, all target genes (no filtering)
-        recall_results.append(evaluate_setting(
-            C, A, "recall",
-            tf_tg=True, tg_tg=True,
-            n_tfs=None,  # Use all TFs
-            max_targets_per_tf=None  # Use all target genes (up to default limit)
+                #C.append(np.corrcoef(X[mask, :].T))
+                C.append(spearman_corrcoef(X[mask, :]))
+        C = np.asarray(C)
+
+        tftg_results.append(evaluate_setting(
+            C, A, "tftg",
+            tf_tg=True,
+            tg_tg=False,
+            n_tfs=100,
+            max_targets_per_tf=100,
+            signed=signed
         ))
-        
-        # 2. Balanced setting: top 100 TFs, top 100 target genes
-        balanced_results.append(evaluate_setting(
-            C, A, "balanced",
-            tf_tg=True, tg_tg=True,
-            n_tfs=100, 
-            max_targets_per_tf=100
+
+        tgtg_results.append(evaluate_setting(
+            C, A, "tgtg",
+            tf_tg=False,
+            tg_tg=True,
+            n_tfs=40, 
+            max_targets_per_tf=300,
+            signed=signed
         ))
         
-        # 3. Precision setting: top 40 TFs, top 40 target genes
-        precision_results.append(evaluate_setting(
-            C, A, "precision",
-            tf_tg=True, tg_tg=True,
-            n_tfs=40, 
-            max_targets_per_tf=40
+        tftg_tgtg_results.append(evaluate_setting(
+            C, A, "tftg+tgtg",
+            tf_tg=True,
+            tg_tg=True,
+            n_tfs=100, 
+            max_targets_per_tf=300,
+            signed=signed
         ))
 
-    recovery_recall = np.mean(recall_results)
-    recovery_balanced = np.mean(balanced_results)
-    recovery_precision = np.mean(precision_results)
+    tftg_score = np.mean(tftg_results)
+    tgtg_score = np.mean(tgtg_results)
+    tftg_tgtg_score = np.mean(tftg_tgtg_results)
+    final_score = (tftg_score + tgtg_score + tftg_tgtg_score) / 3.0
     
     # Create results DataFrame with all three scores
     results = pd.DataFrame({
-        'recovery_recall': [recovery_recall],
-        'recovery_balanced': [recovery_balanced],
-        'recovery_precision': [recovery_precision]
+        'tftg': [tftg_score],
+        'tgtg': [tgtg_score],
+        'tftg_tgtg': [tftg_tgtg_score],
+        'final_score': [final_score]
     })
     print(results)
     
diff --git a/src/metrics/experimental/recovery_2/old_helper.py b/src/metrics/experimental/recovery_2/old_helper.py
new file mode 100644
index 000000000..e885a24c4
--- /dev/null
+++ b/src/metrics/experimental/recovery_2/old_helper.py
@@ -0,0 +1,292 @@
+from typing import Tuple, List, Optional
+
+import tqdm
+import scipy
+import numpy as np
+from scipy.sparse import csr_matrix
+from scipy.stats import spearmanr, pearsonr, wilcoxon
+from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
+import pandas as pd
+import anndata as ad
+import warnings
+warnings.filterwarnings("ignore")
+from util import format_save_score, read_prediction
+
+np.random.seed(0)
+from util import manage_layer
+
+
+def encode_obs_cols(adata, cols):
+    """Encode observation columns to integer codes."""
+    encoded = []
+    for col in cols:
+        if col in adata.obs:
+            codes = LabelEncoder().fit_transform(adata.obs[col].values)
+            encoded.append(codes)
+    return encoded
+
+
+def combine_multi_index(*arrays) -> np.array:
+    """Combine parallel label arrays into a single integer label per position."""
+    A = np.stack(arrays)
+    n_classes = tuple(int(A[i].max()) + 1 for i in range(A.shape[0]))
+    return np.ravel_multi_index(A, dims=n_classes, order='C')
+
+
+def evaluate_grn(
+        C: np.ndarray,
+        A: np.ndarray,
+        n_tfs: Optional[int] = None,
+        max_targets_per_tf: Optional[int] = None,
+        tf_tg: bool = True,
+        tg_tg: bool = True,
+        signed: bool = True
+) -> float:
+    """
+    Evaluate a specific setting (recall, balanced, or precision)
+    """
+
+    # Compute the min and max correlation coefficients across groups.
+    # If evaluation is signed, then the min-max spread occuring in the negative (-1, 0) range
+    # counts 3 times more if the inferred regulatory edge is positive, and vice versa.
+    C_min = np.min(C, axis=0)
+    C_max = np.max(C, axis=0)
+
+    #C_min = np.quantile(C, 0.2, axis=0)
+    #C_max = np.quantile(C, 0.8, axis=0)
+    if signed:
+        is_positive = (A > 0)
+        is_negative = (A < 0)
+        #pos_spread = np.clip(C_max, 0, 1) - np.clip(C_min, 0, 1)
+        #neg_spread = np.clip(C_max, -1, 0) - np.clip(C_min, -1, 0)
+        #spread = is_positive * (pos_spread + 3 * neg_spread)
+        #spread += is_negative * (3 * pos_spread + neg_spread)
+        #consistency_scores = 1 - 0.25 * spread
+        consistency_scores = is_positive * C_min + is_negative * (-C_max)
+        #C_mean = np.mean(C, axis=0)
+        #consistency_scores = is_positive * C_mean + is_negative * (-C_mean)
+        #consistency_scores = 1 - 0.5 * (C_max - C_min)
+    else:
+        consistency_scores = 1 - 0.5 * (C_max - C_min)
+    #assert np.all(consistency_scores >= 0)
+    #assert np.all(consistency_scores <= 1)
+    
+    # Filter TFs if specified (keep the TFs with the most target genes)
+    if n_tfs is None:
+        tf_idx = np.arange(A.shape[1])
+    else:
+        tf_idx = np.argsort(np.sum(A != 0, axis=1))[-n_tfs:]
+    tf_mask = np.zeros(A.shape[1], dtype=bool)
+    tf_mask[tf_idx] = True
+
+    scores = []
+    for j in tqdm.tqdm(range(A.shape[1])):
+
+        if tf_mask[j]:
+
+            # Filter TGs if specified (keep the TGs with highest regulatory links)
+            if max_targets_per_tf is None:
+                tg_idx = np.where(A[j, :] != 0)[0]
+            else:
+                tg_idx = np.argsort(np.abs(A[j, :]))[-max_targets_per_tf:]
+                tg_idx = tg_idx[A[j, tg_idx] != 0]
+
+            # Compute scores
+            if len(tg_idx) == 0:
+                scores.append(np.nan)
+            else:
+                # Intuition behind the scoring scheme:
+                # our score is a weighted average of consistency scores,
+                # where the weights are proportional to regulatory weight magnitudes,
+                # and consistency scores are penalized by the max-min correlation spread.
+                # The more consistent correlation coefficients across groups, the higher
+                # the consistency scores.
+                weights = np.abs(A[j, tg_idx])
+                weights /= np.sum(weights)
+                overall_score = 0.0
+                n_terms = 0
+                if tf_tg:
+                    overall_score += float(np.sum(weights * consistency_scores[j, tg_idx]))
+                    n_terms += 1
+                if tg_tg:
+                    S = consistency_scores[tg_idx, :][:, tg_idx]
+                    np.fill_diagonal(S, 0)
+                    overall_score += float(np.sum(weights * np.sum(S, axis=0) / (len(S) - 1)))
+                    n_terms += 1
+                if n_terms > 0:
+                    overall_score /= n_terms
+                scores.append(overall_score)
+
+        else:
+            scores.append(np.nan)
+
+    return np.array(scores)
+
+
+def evaluate_setting(C: np.ndarray, A: np.ndarray, setting_name: str, **kwargs) -> float:
+
+    # Whether or not to take into account the regulatory modes (enhancer/inhibitor)
+    signed = kwargs.get("signed", True)
+
+    # Evaluate inferred GRN
+    scores = evaluate_grn(C, A, **kwargs)
+
+    # Create baseline model: for each TG, shuffle the TFs
+    A_baseline = np.copy(A)
+    if signed:
+        A_baseline *= (2 * (np.random.randint(0, 2) - 0.5))
+    tf_mask = np.any(A_baseline != 0, axis=1)
+    for j in range(A.shape[1]):
+        mask = np.copy(tf_mask)
+        mask[j] = False
+        if np.any(mask):
+            values = np.copy(A_baseline[mask, j])
+            np.random.shuffle(values)
+            A_baseline[mask, j] = values
+    # assert np.any(A_baseline != A)
+
+    # Evaluate baseline GRN
+    scores_baseline = evaluate_grn(C, A_baseline, **kwargs)
+
+    # Skip NaNs
+    mask = ~np.logical_or(np.isnan(scores), np.isnan(scores_baseline))
+    scores = scores[mask]
+    scores_baseline = scores_baseline[mask]
+
+    # Compare results
+    if len(scores) == 0:
+        print(f"Mean corr. coef. for setting '{setting_name}': grn=nan, baseline=nan")
+        return 0.0
+    else:
+        print(f"Mean corr. coef. for setting '{setting_name}': grn={np.mean(scores):.3f}, baseline={np.mean(scores_baseline):.3f}")
+        try:
+            p_value = wilcoxon(scores, scores_baseline, alternative="greater", zero_method="pratt").pvalue
+            p_value = np.clip(p_value, 1e-300, 1)
+            print(f"p-value={p_value}")
+            return -np.log10(p_value)
+            #return np.mean(np.clip(scores - scores_baseline, 0, None))
+        except ValueError:
+            return 0.0
+
+
+def spearman_corrcoef(X: np.ndarray) -> np.ndarray:
+    eps = np.random.uniform(0, 1e-30, size=X.shape)  # To break ties
+    #X = np.argsort(X + eps, axis=0)
+    C = np.corrcoef(X.T)
+    np.fill_diagonal(C, 0)
+    return C
+
+
+def main(par):
+    # Load evaluation data
+    adata = ad.read_h5ad(par['evaluation_data'])
+    dataset_id = adata.uns['dataset_id']
+    method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id']
+
+    # Manage layer
+    layer = manage_layer(adata, par)
+    X = adata.layers[layer]
+    if isinstance(X, csr_matrix):
+        X = X.toarray()
+    X = X.astype(np.float32)
+
+    gene_names = adata.var_names
+    gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
+
+    # Get groups
+    group_variables, group_names = [], []
+    if ("donor_id" in adata.obs) or ("plate_name" in adata.obs):
+        variables = encode_obs_cols(adata, ["donor_id", "plate_name"])
+        group_variables.append(combine_multi_index(*variables))
+        group_names.append("donor_id/plate_name")
+    if len(group_variables) == 0:
+        group_variables.append(np.random.randint(0, 6, size=len(X)))
+        group_names.append("random_split")
+
+    # Load inferred GRN
+    df = read_prediction(par)
+    sources = df["source"].to_numpy()
+    targets = df["target"].to_numpy()
+    weights = df["weight"].to_numpy()
+    A = np.zeros((len(gene_names), len(gene_names)), dtype=X.dtype)
+    for source, target, weight in zip(sources, targets, weights):
+        if (source in gene_dict) and (target in gene_dict):
+            i = gene_dict[source]
+            j = gene_dict[target]
+            A[i, j] = float(weight)
+
+    # Only consider the genes that are actually present in the inferred GRN,
+    # and keep only the most-connected genes (for speed).
+    gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0))
+    X = X[:, gene_mask]
+    X = X.toarray() if isinstance(X, csr_matrix) else X
+    A = A[gene_mask, :][:, gene_mask]
+    gene_names = gene_names[gene_mask]
+
+    # Whether or not to take into account the regulatory modes (enhancer/inhibitor)
+    signed = np.any(A < 0)
+
+    # Remove self-regulations
+    np.fill_diagonal(A, 0)
+    print(f"Evaluating {X.shape[1]} genes with {np.sum(A != 0)} regulatory links")
+
+    tftg_results = []
+    tgtg_results = []
+    tftg_tgtg_results = []
+    for group_name, groups in zip(group_names, group_variables):
+        print("\n" + "=" * 30)
+        print(f"Grouping samples by {group_name}")
+
+        # Compute correlation matrix for each group
+        C = []
+        for group in np.unique(groups):
+            print(f"Computing correlations for group {group}")
+            mask = (groups == group)
+            if np.sum(mask) >= 3:
+                assert not np.any(np.isnan(X[mask, :]))
+                #C.append(np.corrcoef(X[mask, :].T))
+                C.append(spearman_corrcoef(X[mask, :]))
+        C = np.asarray(C)
+
+        tftg_results.append(evaluate_setting(
+            C, A, "tftg",
+            tf_tg=True,
+            tg_tg=False,
+            n_tfs=100,
+            max_targets_per_tf=100,
+            signed=signed
+        ))
+
+        tgtg_results.append(evaluate_setting(
+            C, A, "tgtg",
+            tf_tg=False,
+            tg_tg=True,
+            n_tfs=40, 
+            max_targets_per_tf=300,
+            signed=signed
+        ))
+        
+        tftg_tgtg_results.append(evaluate_setting(
+            C, A, "tftg+tgtg",
+            tf_tg=True,
+            tg_tg=True,
+            n_tfs=100, 
+            max_targets_per_tf=300,
+            signed=signed
+        ))
+
+    tftg_score = np.mean(tftg_results)
+    tgtg_score = np.mean(tgtg_results)
+    tftg_tgtg_score = np.mean(tftg_tgtg_results)
+    final_score = (tftg_score + tgtg_score + tftg_tgtg_score) / 3.0
+    
+    # Create results DataFrame with all three scores
+    results = pd.DataFrame({
+        'tftg': [tftg_score],
+        'tgtg': [tgtg_score],
+        'tftg_tgtg': [tftg_tgtg_score],
+        'final_score': [final_score]
+    })
+    print(results)
+    
+    return results
diff --git a/src/metrics/experimental/regression_3/helper.py b/src/metrics/experimental/regression_3/helper.py
index 1858b9703..0a10ae334 100644
--- a/src/metrics/experimental/regression_3/helper.py
+++ b/src/metrics/experimental/regression_3/helper.py
@@ -7,7 +7,6 @@
 import numpy as np
 import pandas as pd
 import tqdm
-import xgboost
 from scipy.sparse.linalg import LinearOperator
 from scipy.stats import pearsonr, spearmanr, wilcoxon, ConstantInputWarning
 from scipy.sparse import csr_matrix
@@ -52,8 +51,7 @@ def compute_residual_correlations(
         y_test: np.ndarray,
         Z_test: np.ndarray
 ) -> np.ndarray:
-    model = xgboost.XGBRegressor(n_estimators=10)
-    #model = Ridge(alpha=10)
+    model = Ridge(alpha=10)
     model.fit(X_train, y_train)
     y_hat = model.predict(X_test)
     residuals = y_test - y_hat
@@ -122,19 +120,29 @@ def main(par):
     A = A[gene_mask, :][:, gene_mask]
     gene_names = gene_names[gene_mask]
 
+    # Whether or not to take into account the regulatory modes (enhancer/inhibitor)
+    signed = np.any(A < 0)
+
     # Remove self-regulations
     np.fill_diagonal(A, 0)
     print(f"Evaluating {X.shape[1]} genes with {np.sum(A != 0)} regulatory links")
 
-    # Create baseline model
-    A_baseline = np.copy(A)
-    for j in range(A.shape[1]):
-        np.random.shuffle(A_baseline[:j, j])
-        np.random.shuffle(A_baseline[j+1:, j])
-    assert np.any(A_baseline != A)
-
-    scores, baseline_scores = [], []
+    overall_scores = []
     for group in np.unique(anchor_encoded):
+        scores, baseline_scores = [], []
+
+        # Create baseline model: for each TG, shuffle the TFs
+        A_baseline = np.copy(A)
+        if signed:
+            A_baseline *= (2 * (np.random.randint(0, 2) - 0.5))
+        tf_mask = np.any(A_baseline != 0, axis=1)
+        for j in range(A.shape[1]):
+            mask = np.copy(tf_mask)
+            mask[j] = False
+            if np.any(mask):
+                values = np.copy(A_baseline[mask, j])
+                np.random.shuffle(values)
+                A_baseline[mask, j] = values
 
         # Train/test split
         mask = (anchor_encoded != group)
@@ -142,9 +150,9 @@ def main(par):
         X_test = X[~mask, :]
 
         # Standardize features
-        #scaler = StandardScaler()
-        #X_train = scaler.fit_transform(X_train)
-        #X_test = scaler.transform(X_test)
+        scaler = StandardScaler()
+        X_train = scaler.fit_transform(X_train)
+        X_test = scaler.transform(X_test)
 
         for j in tqdm.tqdm(range(X_train.shape[1])):
 
@@ -164,33 +172,31 @@ def main(par):
                 )
                 scores.append(np.mean(coefs))
 
-            # Evaluate baseline GRN
-            selected = (A_baseline[:, j] != 0)
-            unselected = ~np.copy(selected)
-            unselected[j] = False
-            coefs = compute_residual_correlations(
-                X_train[:, selected],
-                X_train[:, j],
-                X_test[:, selected],
-                X_test[:, j],
-                X_test[:, ~selected]
-            )
-            baseline_scores.append(np.mean(coefs))
-    scores = np.array(scores)
-    baseline_scores = np.array(baseline_scores)
-
-    p_value = wilcoxon(baseline_scores, scores, alternative="greater").pvalue
-    p_value = max(p_value, 1e-300)
+                # Evaluate baseline GRN
+                selected = (A_baseline[:, j] != 0)
+                coefs = compute_residual_correlations(
+                    X_train[:, selected],
+                    X_train[:, j],
+                    X_test[:, selected],
+                    X_test[:, j],
+                    X_test[:, ~selected]
+                )
+                baseline_scores.append(np.mean(coefs))
+
+        # Compute fold score
+        p_value = wilcoxon(baseline_scores, scores, alternative="greater").pvalue
+        p_value = max(p_value, 1e-300)
+        overall_scores.append(-np.log10(p_value))
 
     # Calculate final score
-    final_score = -np.log10(p_value)
-    print(f"Anchor Regression Score: {final_score:.6f}")
+    final_score = np.mean(overall_scores)
+    print(f"Regression Score: {final_score:.6f}")
     print(f"Method: {method_id}")
 
     # Return results as DataFrame
     results = {
-        'regression_3': [final_score]
+        'regression_3': [float(final_score)]
     }
 
     df_results = pd.DataFrame(results)
-    return df_results
\ No newline at end of file
+    return df_results
diff --git a/src/metrics/experimental/regression_3/script.py b/src/metrics/experimental/regression_3/script.py
index 7c660f603..6a6b03d0a 100644
--- a/src/metrics/experimental/regression_3/script.py
+++ b/src/metrics/experimental/regression_3/script.py
@@ -25,7 +25,7 @@
 }
 ## VIASH END
 try:
-  sys.path.append(meta["resources_dir"])
+    sys.path.append(meta["resources_dir"])
 except:
     meta = {
       "resources_dir":'src/metrics/regression_2/',
@@ -41,9 +41,9 @@
  
 
 if __name__ == '__main__':
-  print(par)
-  output = main(par)
-  print(output)
-  method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id']
-  dataset_id = ad.read_h5ad(par['evaluation_data'], backed='r').uns['dataset_id']
-  format_save_score(output, method_id, dataset_id, par['score'])
+    print(par)
+    output = main(par)
+    print(output)
+    method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id']
+    dataset_id = ad.read_h5ad(par['evaluation_data'], backed='r').uns['dataset_id']
+    format_save_score(output, method_id, dataset_id, par['score'])
diff --git a/src/metrics/experimental/sem/helper.py b/src/metrics/experimental/sem/helper.py
index 13747164a..bb2ec9e5b 100644
--- a/src/metrics/experimental/sem/helper.py
+++ b/src/metrics/experimental/sem/helper.py
@@ -23,7 +23,7 @@
 NUMPY_DTYPE = np.float32
 
 # Hyper-parameters
-MAX_N_ITER = 2000
+MAX_N_ITER = 500
 
 # For reproducibility purposes
 seed = 0xCAFE
@@ -181,7 +181,7 @@ def evaluate_grn(
     F = np.linalg.inv(np.eye(A.shape[0]) - A)
     F_CR = F[np.ix_(regulator_idx, reporter_idx)]
     Y_R = delta_X[:, reporter_idx]
-    lam = 0.1
+    lam = 0.001
     I = np.eye(len(regulator_idx), dtype=delta_X.dtype)
     M = F_CR @ F_CR.T + lam * I
     delta = np.zeros_like(delta_X)
@@ -210,8 +210,8 @@ def evaluate_grn(
         A_eff = torch.abs(A) * signs if signed else A * mask
         delta_X_hat = solve_sem(A_eff, delta_train)
         loss = torch.mean(torch.sum(torch.square(X_non_reporter - delta_X_hat[:, ~is_reporter]), dim=1))
-        loss = loss + 0.1 * torch.sum(torch.abs(A))
-        loss = loss + 0.1 * torch.sum(torch.square(A))
+        loss = loss + 0.001 * torch.sum(torch.abs(A))
+        loss = loss + 0.001 * torch.sum(torch.square(A))
         pbar.set_description(str(loss.item()))
 
         # Keep track of best solution
@@ -292,7 +292,7 @@ def main(par):
     gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0))
     in_degrees = np.sum(A != 0, axis=0)
     out_degrees = np.sum(A != 0, axis=1)
-    idx = np.argsort(np.maximum(out_degrees, in_degrees))[:-5000]
+    idx = np.argsort(np.maximum(out_degrees, in_degrees))[:-2000]
     gene_mask[idx] = False
     X = X[:, gene_mask]
     X = X.toarray() if isinstance(X, csr_matrix) else X
@@ -347,12 +347,19 @@ def main(par):
     print(f"Proportion of reporter genes: {np.mean(is_reporter)}")
     print(f"Use regulatory modes/signs: {use_signs}")
 
+    # Create baseline model: for each TG, shuffle the TFs
     print(f"Creating baseline GRN")
     A_baseline = np.copy(A)
-    for j in range(A_baseline.shape[1]):
-        np.random.shuffle(A_baseline[:j, j])
-        np.random.shuffle(A_baseline[j+1:, j])
-    assert np.any(A_baseline != A)
+    if use_signs:
+        A_baseline *= (2 * (np.random.randint(0, 2) - 0.5))
+    tf_mask = np.any(A_baseline != 0, axis=1)
+    for j in range(A.shape[1]):
+        mask = np.copy(tf_mask)
+        mask[j] = False
+        if np.any(mask):
+            values = np.copy(A_baseline[mask, j])
+            np.random.shuffle(values)
+            A_baseline[mask, j] = values
 
     # Evaluate inferred GRN
     print("\n======== Evaluate inferred GRN ========")
@@ -360,11 +367,7 @@ def main(par):
 
     # Evaluate baseline GRN
     print("\n======== Evaluate shuffled GRN ========")
-    n_repeats = 1
-    scores_baseline = np.zeros_like(scores)
-    for _ in range(n_repeats):  # Repeat for more robust estimation
-        scores_baseline += evaluate_grn(X_controls, delta_X, is_train, is_reporter, A_baseline, signed=use_signs)
-    scores_baseline /= n_repeats
+    scores_baseline = evaluate_grn(X_controls, delta_X, is_train, is_reporter, A_baseline, signed=use_signs)
 
     # Keep only the genes for which both GRNs got a score
     mask = ~np.logical_or(np.isnan(scores), np.isnan(scores_baseline))
@@ -375,7 +378,7 @@ def main(par):
     # Perform rank test between actual scores and baseline
     rr_all['spearman'] = float(np.mean(scores))
     rr_all['spearman_shuffled'] = float(np.mean(scores_baseline))
-    if np.std(scores - scores_baseline) == 0:
+    if np.all(scores - scores_baseline == 0):
         df_results = pd.DataFrame({'sem': [0.0]})
     else:
         res = wilcoxon(scores - scores_baseline, zero_method='wilcox', alternative='greater')
@@ -390,6 +393,9 @@ def main(par):
         print(f"Final score: {score}")
 
         results = {
+            'r2': [float(np.mean(scores))],
+            'r2_baseline': [float(np.mean(scores_baseline))],
+            'r2_diff': [float(np.mean(scores) - np.mean(scores_baseline))],
             'sem': [float(score)]
         }
 
diff --git a/src/metrics/experimental/vc/helper.py b/src/metrics/experimental/vc/helper.py
index bf224c46e..4d422da37 100644
--- a/src/metrics/experimental/vc/helper.py
+++ b/src/metrics/experimental/vc/helper.py
@@ -299,7 +299,7 @@ def evaluate(A, train_data_loader, test_data_loader, state_dict, n_perturbations
         optimizer, mode='min', patience=5,
         min_lr=1e-6, cooldown=3, factor=0.8
     )
-    pbar = tqdm.tqdm(range(300))
+    pbar = tqdm.tqdm(range(1000))
     best_avg_r2, best_r2 = -np.inf, None
     model.train()
     for epoch in pbar:
@@ -440,7 +440,7 @@ def main(par):
     print(f"Using {len(gene_names)} genes present in the GRN")
     
     # Additional memory-aware gene filtering for very large GRNs
-    MAX_GENES_FOR_MEMORY = 3000  # Reduced further to avoid memory issues
+    MAX_GENES_FOR_MEMORY = 2000  # Reduced further to avoid memory issues
     if len(gene_names) > MAX_GENES_FOR_MEMORY:
         print(f"Too many genes ({len(gene_names)}) for memory. Selecting top {MAX_GENES_FOR_MEMORY} by GRN connectivity.")
         
@@ -457,6 +457,9 @@ def main(par):
     # Add self-regulations
     np.fill_diagonal(A, 1)
 
+    # Check whether the inferred GRN contains signed predictions
+    signed = np.any(A < 0)
+
     # Mapping between gene expression profiles and their matched negative controls
     control_map, _ = create_control_matching(are_controls, match_groups)
     loose_control_map, _ = create_control_matching(are_controls, loose_match_groups)
@@ -470,12 +473,19 @@ def main(par):
         if (len(train_index) == 0) or (len(test_index) == 0):
             continue
 
-        # Create baseline model
+        # Create baseline model: for each TG, shuffle the TFs
+        print(f"Creating baseline GRN")
         A_baseline = np.copy(A)
-        for j in range(A_baseline.shape[1]):
-            np.random.shuffle(A_baseline[:j, j])
-            np.random.shuffle(A_baseline[j+1:, j])
-        assert np.any(A_baseline != A)
+        if signed:
+            A_baseline *= (2 * (np.random.randint(0, 2) - 0.5))
+        tf_mask = np.any(A_baseline != 0, axis=1)
+        for j in range(A.shape[1]):
+            mask = np.copy(tf_mask)
+            mask[j] = False
+            if np.any(mask):
+                values = np.copy(A_baseline[mask, j])
+                np.random.shuffle(values)
+                A_baseline[mask, j] = values
 
         # Center and scale dataset
         scaler = StandardScaler()
@@ -509,7 +519,6 @@ def create_data_loaders():
 
         # For fair comparison, we first randomly initialize NN parameters, and use these same
         # parameters to build both models (only the GRN weights will differ).
-        signed = np.any(A < 0)
         model_template = Model(A, n_perturbations, signed=signed).to(DEVICE)
         state_dict = model_template.state_dict()
 
@@ -521,8 +530,6 @@ def create_data_loaders():
         train_data_loader, test_data_loader = create_data_loaders()
         r2_baseline.append(evaluate(A_baseline, train_data_loader, test_data_loader, state_dict, n_perturbations, signed=signed))
 
-        break
-
     r2 = np.asarray(r2).flatten()
     r2_baseline = np.asarray(r2_baseline).flatten()
     print("Mean R2", np.mean(r2), np.mean(r2_baseline))

From fa5f99c08b0b7b8e3e5b773272da39a71b6b93d6 Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Thu, 9 Oct 2025 22:09:20 +0200
Subject: [PATCH 06/19] Minor changes

---
 .../experimental/anchor_regression/helper.py  | 48 ++++++++++++-------
 .../experimental/regression_3/helper.py       | 10 ++--
 src/metrics/experimental/sem/helper.py        |  2 +-
 3 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/src/metrics/experimental/anchor_regression/helper.py b/src/metrics/experimental/anchor_regression/helper.py
index 1c60dc904..9703825df 100644
--- a/src/metrics/experimental/anchor_regression/helper.py
+++ b/src/metrics/experimental/anchor_regression/helper.py
@@ -10,6 +10,7 @@
 from scipy.sparse.linalg import LinearOperator
 from scipy.stats import pearsonr, wilcoxon
 from scipy.sparse import csr_matrix
+from sklearn.model_selection import KFold
 from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
 import anndata as ad
 import warnings
@@ -106,6 +107,32 @@ def anchor_regression(
     return theta
 
 
+def cross_val(
+        cv,
+        X: np.ndarray,
+        y: np.ndarray,
+        Z: np.ndarray,
+        eps: float = 1e-50
+) -> float:
+    cv = KFold(5)
+    ss_res, ss_tot = 0, 0
+    y_mean = np.mean(y)
+    for idx_train, idx_test in cv.split(X, y):
+        X_train, X_test = X[idx_train, :], X[idx_test, :]
+        Z_train, Z_test = Z[idx_train, :], Z[idx_test, :]
+        y_train, y_test = y[idx_train], y[idx_test]
+        theta = anchor_regression(X_train, Z_train, y_train, gamma=1)
+        y_hat = np.dot(X_test, theta)
+        ss_res += np.sum(np.square(y_test - y_hat))
+        ss_tot += np.sum(np.square(y_test - y_mean))
+        break  # TODO
+    if ss_tot == 0:
+        r2 = 0
+    else:
+        r2 = 1 - ss_res / ss_tot
+    return float(r2)
+
+
 def compute_stabilities(
         X: np.ndarray,
         y: np.ndarray,
@@ -114,23 +141,10 @@ def compute_stabilities(
         is_selected: np.ndarray,
         eps: float = 1e-50
 ) -> float:
-    theta0_signed = anchor_regression(X, Z, y, gamma=1)
-    theta0_signed = theta0_signed[is_selected]
-    #theta0 = np.abs(theta0_signed)
-    theta0 = theta0_signed / np.sum(np.abs(theta0_signed))
-
-    theta_signed = anchor_regression(X, Z, y, gamma=1.65)
-    theta_signed = theta_signed[is_selected]
-    #theta = np.abs(theta_signed)
-    theta = theta_signed / np.sum(np.abs(theta_signed))
-
-    stabilities = np.clip(np.abs(theta0 - theta) / np.abs(theta0 + eps), 0, 1)
-    #stabilities[np.sign(theta0_signed) != np.sign(theta_signed)] = 0
-
-    weights = np.abs(weights[is_selected])
-    weights /= np.sum(weights)
-
-    return float(np.sum(weights * stabilities))
+    cv = KFold(5, random_state=0xCAFE, shuffle=True)
+    r2_selected = cross_val(cv, X[:, is_selected], y, Z)
+    r2_non_selected = cross_val(cv, X[:, ~is_selected], y, Z)
+    return r2_selected - r2_non_selected
 
 
 def compute_stabilities_v2(
diff --git a/src/metrics/experimental/regression_3/helper.py b/src/metrics/experimental/regression_3/helper.py
index 0a10ae334..9d997cd67 100644
--- a/src/metrics/experimental/regression_3/helper.py
+++ b/src/metrics/experimental/regression_3/helper.py
@@ -18,10 +18,10 @@
 warnings.filterwarnings("ignore", category=ConstantInputWarning)
 
 # For reproducibility purposes
-seed = 0xCAFE
-os.environ['PYTHONHASHSEED'] = str(seed)
-random.seed(seed)
-np.random.seed(seed)
+#seed = 0xCAFE
+#os.environ['PYTHONHASHSEED'] = str(seed)
+#random.seed(seed)
+#np.random.seed(seed)
 
 from util import read_prediction, manage_layer
 from dataset_config import DATASET_GROUPS
@@ -51,7 +51,7 @@ def compute_residual_correlations(
         y_test: np.ndarray,
         Z_test: np.ndarray
 ) -> np.ndarray:
-    model = Ridge(alpha=10)
+    model = Ridge(alpha=0.1)
     model.fit(X_train, y_train)
     y_hat = model.predict(X_test)
     residuals = y_test - y_hat
diff --git a/src/metrics/experimental/sem/helper.py b/src/metrics/experimental/sem/helper.py
index bb2ec9e5b..a3fccda2e 100644
--- a/src/metrics/experimental/sem/helper.py
+++ b/src/metrics/experimental/sem/helper.py
@@ -26,7 +26,7 @@
 MAX_N_ITER = 500
 
 # For reproducibility purposes
-seed = 0xCAFE
+seed = 0xCAFF
 os.environ['PYTHONHASHSEED'] = str(seed)
 random.seed(seed)
 np.random.seed(seed)

From ecc78286d70b9bd46077631c9ca9f1651befac22 Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Fri, 10 Oct 2025 23:42:33 +0200
Subject: [PATCH 07/19] Fix issues with tf_binding metric

---
 src/metrics/tf_binding/helper.py    | 110 +++++++++++++++++-----------
 src/metrics/tf_binding/run_local.py |  28 +++++++
 2 files changed, 95 insertions(+), 43 deletions(-)
 create mode 100644 src/metrics/tf_binding/run_local.py

diff --git a/src/metrics/tf_binding/helper.py b/src/metrics/tf_binding/helper.py
index b88199499..893f8f4c2 100644
--- a/src/metrics/tf_binding/helper.py
+++ b/src/metrics/tf_binding/helper.py
@@ -1,69 +1,93 @@
-
+import math
 import pandas as pd
-import anndata as ad
-import sys
 import numpy as np
 from sklearn.metrics import average_precision_score
 from tqdm import tqdm
 from util import read_prediction
 
+
 # For reproducibility
 seed = 42
 np.random.seed(seed)
 
 def main(par):
+
+    # Load predictions
     prediction = read_prediction(par)
-    test_data = ad.read_h5ad(par['evaluation_data'], backed='r')
-    evaluation_genes = test_data.var_names.tolist()
-    n_targets_total = len(evaluation_genes)
+    assert prediction.shape[0] > 0, 'No links found in the network'
+    if not {'source', 'target', 'weight'}.issubset(set(prediction.columns)):
+        raise ValueError("prediction must have columns: 'source', 'target', 'weight'")
 
+    # Load TF list and ground truth edges
     tf_all = np.loadtxt(par['tf_all'], dtype=str, delimiter=',', skiprows=1)
     true_graph = pd.read_csv(par['ground_truth'])
-    true_graph = true_graph[(true_graph['source'].isin(tf_all)) & (true_graph['target'].isin(evaluation_genes))]
-    assert prediction.shape[0] > 0, 'No links found in the network'
-    assert true_graph.shape[0] > 0, 'No links found in the ground truth'
-    
+    if not {'source', 'target'}.issubset(set(true_graph.columns)):
+        raise ValueError("ground_truth must have columns: 'source', 'target'")
+
+    # Keep only TFs that are in the provided TF list
+    true_graph = true_graph[true_graph['source'].isin(tf_all)]
+    assert true_graph.shape[0] > 0, 'No links found in the ground truth after filtering to TF list'
+
+    # Use the union of all targets that appear in either predictions or ground-truth
+    pred_targets = set(prediction['target'].astype(str).unique())
+    true_targets = set(true_graph['target'].astype(str).unique())
+    evaluation_genes = sorted(pred_targets.union(true_targets))
+    if len(evaluation_genes) == 0:
+        raise ValueError("Empty evaluation target set (no targets in predictions or ground truth).")
+
+    # Precompute lookup to avoid O(n^2)
+    gene_to_idx = {g: i for i, g in enumerate(evaluation_genes)}
+    n_targets_total = len(evaluation_genes)
+
     scores_model = []
+    # Iterate over TFs that actually have at least one positive in ground-truth
     for tf in tqdm(true_graph['source'].unique()):
+        # Positives for this TF
         true_edges = true_graph[true_graph['source'] == tf]
+        y_true = np.zeros(n_targets_total, dtype=int)
+        # Mark positives
+        for t in true_edges['target'].astype(str):
+            y_true[gene_to_idx[t]] = 1
+
+        # Scores over all candidate targets
+        y_score = np.zeros(n_targets_total, dtype=float)
         if tf in prediction['source'].unique():
             pred_edges = prediction[prediction['source'] == tf]
-            true_labels = true_edges['target'].isin(pred_edges['target']).astype(int)
-            pred_scores = pred_edges.set_index('target').reindex(true_edges['target'])['weight'].fillna(0)
-            if true_labels.sum() == 0:  # no positives
-                ap = 0.0
-            else:
-                ap = average_precision_score(true_labels, pred_scores)
-        else:
-            ap = float('nan')
-        n_targets = len(true_edges)
-
-        # ----- Analytic random baseline -----
-        # Extend true edges to all evaluation genes
-        true_labels_random = np.zeros(n_targets_total)
-        idx = [evaluation_genes.index(t) for t in true_edges['target']]
-        true_labels_random[idx] = 1
-        ap_random = true_labels_random.sum() / len(true_labels_random)
-
-        scores_model.append({'source': tf, 'ap': ap, 'n_targets': n_targets, 'ap_random': ap_random})
-    
-    scores_df = pd.DataFrame(scores_model)
-    print('Number of TFs in GRN:', len(scores_df[scores_df['ap'].notna()]))
 
-    # Compute weighted mean (ignoring NaNs)
-    valid = scores_df.dropna(subset=['ap'])
-    weighted_precision = np.average(valid['ap'], weights=valid['n_targets'])
+            # Fill scores where predicted
+            for tgt, w in zip(pred_edges['target'].astype(str), pred_edges['weight'].astype(float)):
+                idx = gene_to_idx.get(tgt, None)
+                if idx is not None:
+                    y_score[idx] = w
+
+            # Ensure numerical safety
+            y_score = np.nan_to_num(y_score, nan=0.0, posinf=0.0, neginf=0.0)
 
-    # Compute unweighted means (for reference)
-    precision = scores_df['ap'].mean(skipna=True)
-    precision_random = scores_df['ap_random'].mean(skipna=True)
-    recall = scores_df['ap'].fillna(0).mean()
+        # AP requires at least one positive
+        ap = average_precision_score(y_true, y_score) if (y_true.sum()) > 0 else 0.0
+
+        # Random baseline AUPRC is just the prevalence of positives
+        prevalence = y_true.mean()
+
+        scores_model.append({
+            'source': tf,
+            'ap': ap,
+            'n_targets_pos': int(y_true.sum()),
+            'prevalence': prevalence
+        })
+
+    scores_df = pd.DataFrame(scores_model)
+    valid = scores_df[scores_df['n_targets_pos'] > 0]
+    macro_ap = valid['ap'].mean() if len(valid) > 0 else np.nan
+    macro_ap_w = np.average(valid['ap'], weights=valid['n_targets_pos']) if len(valid) > 0 else np.nan
+    mean_prev = valid['prevalence'].mean() if len(valid) > 0 else np.nan
+    lift = macro_ap / mean_prev if (mean_prev is not None and mean_prev and not np.isnan(mean_prev)) else np.nan
 
     summary_df = pd.DataFrame([{
-        'precision': precision,
-        'precision_random': precision_random,
-        'recall': recall,
-        'weighted_precision': weighted_precision
+        # 'macro_ap': macro_ap,
+        #'macro_ap_weighted': macro_ap_w,
+        # 'mean_prevalence': mean_prev,
+        'lift_over_random': lift
     }])
 
-    return summary_df
\ No newline at end of file
+    return summary_df
diff --git a/src/metrics/tf_binding/run_local.py b/src/metrics/tf_binding/run_local.py
new file mode 100644
index 000000000..e509c1496
--- /dev/null
+++ b/src/metrics/tf_binding/run_local.py
@@ -0,0 +1,28 @@
+import subprocess
+
+import anndata as ad
+import pandas as pd
+
+
+for dataset in ["replogle", "norman", "adamson"]:
+    for method in ["negative_control", "pearson_corr", "positive_control", "ppcor", "portia", "scenic", "grnboost", "scprint", "scenicplus", "celloracle", "scglue", "figr", "granie"]:
+        
+        score_filepath = f"output/tf_binding/tf_binding.h5ad"
+        subprocess.call([
+            "python",
+            "src/metrics/tf_binding/script.py",
+            "--prediction", f"resources/results/{dataset}/{dataset}.{method}.{method}.prediction.h5ad",
+            "--evaluation_data", f"resources/grn_benchmark/evaluation_data/{dataset}_bulk.h5ad",
+            "--ground_truth", "resources/grn_benchmark/ground_truth/K562.csv",
+            "--score", score_filepath
+        ])
+
+        adata = ad.read_h5ad(score_filepath)
+        if "metric_values" in adata.uns:
+            metric_names = adata.uns["metric_ids"]
+            metric_values = adata.uns["metric_values"]
+            df = pd.DataFrame({"metric": metric_names, "value": metric_values})
+            df["dataset"] = dataset
+            df["method"] = method
+            df = df[["dataset", "method", "metric", "value"]]  # Reorder columns to match header
+            df.to_csv(f"output/tf_binding/tf_binding_scores_{dataset}.csv", mode="a", header=False, index=False)

From 33c324a77922d13b9a0043f7307838279de5508f Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Sat, 11 Oct 2025 11:54:46 +0200
Subject: [PATCH 08/19] Deterministic algorithm for GRN baseline creation

---
 .../experimental/anchor_regression/helper.py  |  15 +-
 src/metrics/experimental/recovery_2/helper.py |  16 +-
 .../experimental/regression_3/helper.py       |  15 +-
 src/metrics/experimental/sem/helper.py        |  16 +-
 src/metrics/experimental/vc/helper.py         |  18 +-
 src/utils/baseline.py                         | 171 ++++++++++++++++++
 6 files changed, 187 insertions(+), 64 deletions(-)
 create mode 100644 src/utils/baseline.py

diff --git a/src/metrics/experimental/anchor_regression/helper.py b/src/metrics/experimental/anchor_regression/helper.py
index 9703825df..f823406a6 100644
--- a/src/metrics/experimental/anchor_regression/helper.py
+++ b/src/metrics/experimental/anchor_regression/helper.py
@@ -24,6 +24,7 @@
 
 from util import read_prediction, manage_layer
 from dataset_config import DATASET_GROUPS
+from baseline import create_grn_baseline
 
 
 def encode_obs_cols(adata, cols):
@@ -280,18 +281,8 @@ def main(par):
     scaler = StandardScaler()
     X = scaler.fit_transform(X)
 
-    # Create baseline model: for each TG, shuffle the TFs
-    A_baseline = np.copy(A)
-    if signed:
-        A_baseline *= (2 * (np.random.randint(0, 2) - 0.5))
-    tf_mask = np.any(A_baseline != 0, axis=1)
-    for j in range(A.shape[1]):
-        mask = np.copy(tf_mask)
-        mask[j] = False
-        if np.any(mask):
-            values = np.copy(A_baseline[mask, j])
-            np.random.shuffle(values)
-            A_baseline[mask, j] = values
+    # Create baseline model
+    A_baseline = create_grn_baseline(A)
 
     # Compute gene stabilities
     scores, scores_baseline = [], []
diff --git a/src/metrics/experimental/recovery_2/helper.py b/src/metrics/experimental/recovery_2/helper.py
index c94a7c8d9..b896ef773 100644
--- a/src/metrics/experimental/recovery_2/helper.py
+++ b/src/metrics/experimental/recovery_2/helper.py
@@ -14,6 +14,7 @@
 
 np.random.seed(0)
 from util import manage_layer
+from baseline import create_grn_baseline
 
 
 def encode_obs_cols(adata, cols):
@@ -124,19 +125,8 @@ def evaluate_setting(C: np.ndarray, A: np.ndarray, setting_name: str, **kwargs)
     # Evaluate inferred GRN
     scores = evaluate_grn(C, A, **kwargs)
 
-    # Create baseline model: for each TG, shuffle the TFs
-    A_baseline = np.copy(A)
-    if signed:
-        A_baseline *= (2 * (np.random.randint(0, 2) - 0.5))
-    tf_mask = np.any(A_baseline != 0, axis=1)
-    for j in range(A.shape[1]):
-        mask = np.copy(tf_mask)
-        mask[j] = False
-        if np.any(mask):
-            values = np.copy(A_baseline[mask, j])
-            np.random.shuffle(values)
-            A_baseline[mask, j] = values
-    # assert np.any(A_baseline != A)
+    # Create baseline model
+    A_baseline = create_grn_baseline(A)
 
     # Evaluate baseline GRN
     scores_baseline = evaluate_grn(C, A_baseline, **kwargs)
diff --git a/src/metrics/experimental/regression_3/helper.py b/src/metrics/experimental/regression_3/helper.py
index 9d997cd67..caaa8bfbd 100644
--- a/src/metrics/experimental/regression_3/helper.py
+++ b/src/metrics/experimental/regression_3/helper.py
@@ -25,6 +25,7 @@
 
 from util import read_prediction, manage_layer
 from dataset_config import DATASET_GROUPS
+from baseline import create_grn_baseline
 
 
 def encode_obs_cols(adata, cols):
@@ -131,18 +132,8 @@ def main(par):
     for group in np.unique(anchor_encoded):
         scores, baseline_scores = [], []
 
-        # Create baseline model: for each TG, shuffle the TFs
-        A_baseline = np.copy(A)
-        if signed:
-            A_baseline *= (2 * (np.random.randint(0, 2) - 0.5))
-        tf_mask = np.any(A_baseline != 0, axis=1)
-        for j in range(A.shape[1]):
-            mask = np.copy(tf_mask)
-            mask[j] = False
-            if np.any(mask):
-                values = np.copy(A_baseline[mask, j])
-                np.random.shuffle(values)
-                A_baseline[mask, j] = values
+        # Create baseline model
+        A_baseline = create_grn_baseline(A)
 
         # Train/test split
         mask = (anchor_encoded != group)
diff --git a/src/metrics/experimental/sem/helper.py b/src/metrics/experimental/sem/helper.py
index a3fccda2e..87ebf42fb 100644
--- a/src/metrics/experimental/sem/helper.py
+++ b/src/metrics/experimental/sem/helper.py
@@ -37,6 +37,7 @@
 
 
 from util import read_prediction, manage_layer
+from baseline import create_grn_baseline
 from dataset_config import DATASET_GROUPS
 
 
@@ -347,19 +348,8 @@ def main(par):
     print(f"Proportion of reporter genes: {np.mean(is_reporter)}")
     print(f"Use regulatory modes/signs: {use_signs}")
 
-    # Create baseline model: for each TG, shuffle the TFs
-    print(f"Creating baseline GRN")
-    A_baseline = np.copy(A)
-    if use_signs:
-        A_baseline *= (2 * (np.random.randint(0, 2) - 0.5))
-    tf_mask = np.any(A_baseline != 0, axis=1)
-    for j in range(A.shape[1]):
-        mask = np.copy(tf_mask)
-        mask[j] = False
-        if np.any(mask):
-            values = np.copy(A_baseline[mask, j])
-            np.random.shuffle(values)
-            A_baseline[mask, j] = values
+    # Create baseline model
+    A_baseline = create_grn_baseline(A)
 
     # Evaluate inferred GRN
     print("\n======== Evaluate inferred GRN ========")
diff --git a/src/metrics/experimental/vc/helper.py b/src/metrics/experimental/vc/helper.py
index 4d422da37..61378bb66 100644
--- a/src/metrics/experimental/vc/helper.py
+++ b/src/metrics/experimental/vc/helper.py
@@ -9,6 +9,7 @@
 import pandas as pd
 import torch
 from scipy.sparse import csr_matrix
+from scipy.spatial.distance import cityblock
 from scipy.stats import wilcoxon
 from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold
 from sklearn.preprocessing import StandardScaler, LabelEncoder
@@ -36,7 +37,7 @@ def set_seed():
 
 from util import read_prediction, manage_layer
 from dataset_config import DATASET_GROUPS
-from scipy.spatial.distance import cityblock
+from baseline import create_grn_baseline
 
 
 def combine_multi_index(*arrays) -> np.array:
@@ -473,19 +474,8 @@ def main(par):
         if (len(train_index) == 0) or (len(test_index) == 0):
             continue
 
-        # Create baseline model: for each TG, shuffle the TFs
-        print(f"Creating baseline GRN")
-        A_baseline = np.copy(A)
-        if signed:
-            A_baseline *= (2 * (np.random.randint(0, 2) - 0.5))
-        tf_mask = np.any(A_baseline != 0, axis=1)
-        for j in range(A.shape[1]):
-            mask = np.copy(tf_mask)
-            mask[j] = False
-            if np.any(mask):
-                values = np.copy(A_baseline[mask, j])
-                np.random.shuffle(values)
-                A_baseline[mask, j] = values
+        # Create baseline model
+        A_baseline = create_grn_baseline(A)
 
         # Center and scale dataset
         scaler = StandardScaler()
diff --git a/src/utils/baseline.py b/src/utils/baseline.py
new file mode 100644
index 000000000..5d6232a0e
--- /dev/null
+++ b/src/utils/baseline.py
@@ -0,0 +1,171 @@
+from typing import List
+
+import numpy as np
+
+
+class WeightedDegree(object):
+
+    def __init__(self, node_idx: int, degree: int, weight: float):
+        """Class used to sort nodes by in-degree or out-degree.
+
+        Ties are broken by using the weights from the GRN.
+
+        Args:
+            node_idx
+            degree: in-degree or out-degree of a node.
+            weight: sum of weights of incoming or outcoming edges.
+        """
+        self.node_idx: int = int(node_idx)
+        self.degree: int = int(degree)
+        self.weight: float = float(weight)
+
+    def __lt__(self, other: "WeightedDegree") -> bool:
+        if self.degree < other.degree:
+            return True
+        elif self.degree == other.degree:
+            return self.weight < other.weight
+        else:
+            return False
+
+    def __eq__(self, other: "WeightedDegree") -> bool:
+        return (self.degree == other.degree) and (self.weight == other.weight)
+
+    @staticmethod
+    def from_grn(A: np.ndarray, incoming: bool = True) -> List["WeightedDegree"]:
+        if not incoming:
+            A = A.T
+        D = (A != 0)
+        degrees = np.sum(D, axis=0)
+        weights = np.sum(A, axis=0)
+        return [WeightedDegree(i, degrees[i], weights[i]) for i in range(len(degrees))]
+
+
+def create_grn_baseline(A):
+    """
+    Deterministic baseline for a directed simple graph.
+    Preserves in/out degree sequences of A (A may be weighted/signed; nonzeros indicate edges).
+    Returns a weighted B: same topology as the deterministic baseline, with the exact weights from A
+    reassigned deterministically to different edges (no randomness).
+    """
+
+    n = A.shape[0]
+
+    # Order nodes by degrees, with explicit tie-breaking by weight
+    in_degrees = WeightedDegree.from_grn(A, incoming=True)
+    out_degrees = WeightedDegree.from_grn(A, incoming=False)
+    in_order = np.argsort(in_degrees)[::-1]
+    out_order = np.argsort(out_degrees)[::-1]
+
+    # Baseline GRN
+    B = np.zeros_like(A)
+
+    for u in out_order:
+        k = out_degrees[u].degree
+        if k == 0:
+            continue
+        # Greedily fill from current in_order
+        picks = []
+        for v in in_order:
+            if v == u or B[u, v] == 1 or in_degrees[v].degree == 0:
+                continue
+            picks.append(v)
+            if len(picks) == k:
+                break
+
+        # Deterministic repair if not enough picks
+        if len(picks) < k:
+            # Try swaps: reassign some of u's future edges by freeing capacity deterministically
+            # Here, we do a deterministic second pass allowing one-time edge relocation
+            for v in in_order:
+                if v == u or in_degrees[v].degree == 0 or v in picks:
+                    continue
+
+                # Find w already chosen where we can swap capacity
+                swapped = False
+                for w in picks:
+                    # Check if there exists x != u with B[x, w]==1 and B[x, v]==0 to swap (x,w)->(x,v)
+                    # Deterministic scan by x id
+                    for x in range(n):
+                        if x == u: 
+                            continue
+                        if B[x, w] == 1 and B[x, v] == 0 and x != v and x != w:
+                            B[x, w] = 0
+                            B[x, v] = 1
+                            in_degrees[w].degree += 1
+                            in_degrees[v].degree -= 1
+                            picks.append(v)
+                            swapped = True
+                            break
+                    if swapped or len(picks) == k:
+                        break
+                if len(picks) == k:
+                    break
+
+        if len(picks) < k:
+            raise ValueError("Directed degree sequence not realizable with simple digraph under constraints.")
+
+        # Place edges for u
+        for v in picks:
+            B[u, v] = 1
+            in_degrees[v].degree -= 1
+        #out_degrees[v].degree = 0
+        out_degrees[u].degree = 0
+
+        # Stable re-sort by residual in-degree, with explicit tie-breaking by weight
+        in_order = np.argsort(in_degrees)[::-1]
+
+    # Zero diagonal guarantee
+    np.fill_diagonal(B, 0)
+
+    # Recompute initial incoming stats from A
+    init_in_degrees = WeightedDegree.from_grn(A, incoming=True)
+
+    # Convenience arrays for deterministic target ranking
+    # (higher degree first, then higher total incoming weight, then smaller id)
+    init_in_deg_arr = np.array([wd.degree for wd in init_in_degrees])
+    init_in_wgt_arr = np.array([wd.weight for wd in init_in_degrees])
+
+    for u in range(n):
+
+        # Outgoing weights in A
+        mask_A_u = (A[u, :] != 0)
+        if not np.any(mask_A_u):
+            continue
+        orig_targets = np.where(mask_A_u)[0]
+        W = A[u, orig_targets].astype(float)
+
+        # Sort weights deterministically: by |w| desc, then w asc, then orig target id asc
+        # (lexsort uses last key as primary)
+        w_keys_3 = np.abs(W) * -1.0       # primary: |w| descending
+        w_keys_2 = W                      # secondary: actual value ascending (keeps sign order stable)
+        w_keys_1 = orig_targets           # tertiary: original target id ascending
+        order_w = np.lexsort((w_keys_1, w_keys_2, w_keys_3))
+        W_sorted = W[order_w]
+
+        # Targets in B for this source, ranked by original A's incoming difficulty/salience
+        # Rank by: in-degree desc, then incoming-weight desc, then id asc
+        targets_B = np.where(B[u, :] == 1)[0]
+        if targets_B.size == 0:
+            continue
+        t_deg = init_in_deg_arr[targets_B]
+        t_wgt = init_in_wgt_arr[targets_B]
+        t_id  = targets_B
+        order_t = np.lexsort((t_id, -t_wgt, -t_deg))  # primary: -t_deg, then -t_wgt, then t_id
+        targets_ranked = targets_B[order_t]
+
+        # Sanity: degrees should match. If not, deterministically trim/pad.
+        kA = W_sorted.size
+        kB = targets_ranked.size
+        if kA > kB:
+            # Drop evenly from both ends to avoid bias
+            excess = kA - kB
+            left = excess // 2
+            right = excess - left
+            W_sorted = W_sorted[left: kA - right]
+        elif kA < kB:
+            W_sorted = np.concatenate([W_sorted, np.zeros(kB - kA, dtype=float)], axis=0)
+
+        # Assign exact weights to new edges deterministically
+        B[u, targets_ranked] = W_sorted
+
+    return B

From 1c29769f88cfb3d133216b50719606e86f7973f6 Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Sun, 12 Oct 2025 17:05:39 +0200
Subject: [PATCH 09/19] Use immediate early genes as reporter genes

---
 src/metrics/experimental/sem/helper.py | 50 ++++++++++++++++++++------
 1 file changed, 39 insertions(+), 11 deletions(-)

diff --git a/src/metrics/experimental/sem/helper.py b/src/metrics/experimental/sem/helper.py
index 87ebf42fb..ac27a32ad 100644
--- a/src/metrics/experimental/sem/helper.py
+++ b/src/metrics/experimental/sem/helper.py
@@ -26,7 +26,7 @@
 MAX_N_ITER = 500
 
 # For reproducibility purposes
-seed = 0xCAFF
+seed = 0xCADD
 os.environ['PYTHONHASHSEED'] = str(seed)
 random.seed(seed)
 np.random.seed(seed)
@@ -41,6 +41,35 @@
 from dataset_config import DATASET_GROUPS
 
 
+# List of immediate early genes (IEGs) - scCustomize v3.2.0
+IEG = [
+    "CCL7"  , "CCL2"    , "TNFSF9"  , "SIK1B"   , "RASD1"  ,  "NUP98",
+    "CEBPD" , "NFKBID"  , "NOCT"    , "FOS"     , "IER2"   ,  "HBEGF",
+    "RHEB"  , "PLAU"    , "IFNB1"   , "PCDH8"   , "WEE1"   ,  "FBXO33",
+    "PMAIP1", "DUSP1"   , "PLK2"    , "TSC22D1" , "MAP3K8" ,  "PIAS1",
+    "KLF10" , "BDNF"    , "CCN2"    , "TRIB1"   , "SOD2"   ,  "IER3",
+    "PLAT"  , "RCAN1"   , "ZFP36"   , "CCL5"    , "NFKBIA" ,  "NRN1",
+    "KLF2"  , "SERPINE1", "MAFF"    , "NCOA7"   , "GDF15"  ,  "LDLR",
+    "TNF"   , "CCRL2"   , "CCL18"   , "CCL3"    , "FOSL1"  ,  "DUSP2",
+    "INHBA" , "JUND"    , "NR4A1"   , "EGR3"    , "IRF1"   ,  "IFIT3",
+    "IFIT1B", "NR4A3"   , "PER2"    , "GADD45G" , "SOCS3"  ,  "TLR2",
+    "PELI1" , "IL1A"    , "RGS2"    , "CSF2"    , "F3"     ,  "APOLD1",
+    "ACKR4" , "BHLHE40" , "IL23A"   , "GBP2"    , "IL1B"   ,  "ARIH1",
+    "GBP2"  , "JUN"     , "NPTX2"   , "FOSB"    , "EGR1"   ,  "MBNL2",
+    "VCAM1" , "ZFP36L2" , "ARF4"    , "MMP13"   , "JUNB"   ,  "ZFP36L1",
+    "CCN1"  , "NPAS4"   , "PPP1R15A", "EGR4"    , "NFKBIZ" ,  "ARC",
+    "MCL1"  , "RGS1"    , "KLF6"    , "CLEC4E"  , "SGK1"   ,  "ARHGEF3",
+    "EGR2"  , "IFIT2"   , "ID2"     , "DUSP6"   , "SIK1"   ,  "DUSP5",
+    "NFIB"  , "SAA2"    , "SAA1"    , "THBS1"   , "FOSL2"  ,  "ICAM1",
+    "CXCL10", "CSRNP1"  , "BCL3"    , "MARCKSL1", "HOMER1" ,  "TRAF1",
+    "ATF3"  , "FLG"     , "SRF"     , "PIM1"    , "GADD45B",  "HES1",
+    "GEM"   , "TNFAIP3" , "PER1"    , "CREM"    , "CD69"   ,  "IL6",
+    "BTG2"  , "ACOD1"   , "CEBPB"   , "CXCL11"  , "IL12B"  ,  "NR4A2",
+    "PTGS2" , "IKBKE"   , "TXNIP"   , "CD83"    , "IER5"   ,  "IL10",
+    "MYC"   , "CXCL12"  , "SLC2A3"  , "CXCL1"
+]
+
+
 def encode_obs_cols(adata, cols):
     encoded = []
     for col in cols:
@@ -334,17 +363,16 @@ def main(par):
     is_train[train_idx] = True
 
     # Create a split between genes: reporter genes and evaluation genes.
-    # All TFs should be included in the reporter gene set.
-    # If the reporter gene set does not represent at least 50% of all genes,
-    # then we randomly add target genes until the 50% threshold is reached.
+    # All TFs and IEGs should be included in the reporter gene set.
     n_genes = A.shape[1]
-    reg_mask = np.asarray(A != 0).any(axis=1)  # TF mask
-    is_reporter = np.copy(reg_mask)
-    if np.mean(is_reporter) < 0.5 * len(is_reporter):
-        idx = np.where(~is_reporter)[0]
-        np.random.shuffle(idx)
-        idx = idx[:int(0.5 * len(is_reporter) - np.sum(is_reporter))]
-        is_reporter[idx] = True
+    reg_mask = np.asarray(A != 0).any(axis=1)
+    ieg_mask = np.asarray([gene_name in IEG for gene_name in gene_names], dtype=bool)
+    is_reporter = np.logical_or(reg_mask, ieg_mask)
+    #if np.mean(is_reporter) < 0.5 * len(is_reporter):
+    #    idx = np.where(~is_reporter)[0]
+    #    np.random.shuffle(idx)
+    #    idx = idx[:int(0.5 * len(is_reporter) - np.sum(is_reporter))]
+    #    is_reporter[idx] = True
     print(f"Proportion of reporter genes: {np.mean(is_reporter)}")
     print(f"Use regulatory modes/signs: {use_signs}")
 

From 8bce32ab642bc658415a12ff74239662f39cbee0 Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Fri, 17 Oct 2025 09:03:37 +0200
Subject: [PATCH 10/19] Minor change

---
 src/metrics/experimental/regression_3/helper.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/metrics/experimental/regression_3/helper.py b/src/metrics/experimental/regression_3/helper.py
index 25c0acea2..4a7c5b246 100644
--- a/src/metrics/experimental/regression_3/helper.py
+++ b/src/metrics/experimental/regression_3/helper.py
@@ -26,6 +26,7 @@
 
 from util import read_prediction, manage_layer
 from dataset_config import DATASET_GROUPS
+from baseline import create_grn_baseline
 
 
 def encode_obs_cols(adata, cols):
@@ -52,8 +53,8 @@ def compute_residual_correlations(
         y_test: np.ndarray,
         Z_test: np.ndarray
 ) -> np.ndarray:
-    model = xgboost.XGBRegressor(n_estimators=10)
-    #model = Ridge(alpha=10)
+    #model = xgboost.XGBRegressor(n_estimators=10)
+    model = Ridge(alpha=0.01)
     model.fit(X_train, y_train)
     y_hat = model.predict(X_test)
     residuals = y_test - y_hat
@@ -115,7 +116,7 @@ def main(par):
     gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0))
     in_degrees = np.sum(A != 0, axis=0)
     out_degrees = np.sum(A != 0, axis=1)
-    idx = np.argsort(np.maximum(out_degrees, in_degrees))[:-2000]
+    idx = np.argsort(np.maximum(out_degrees, in_degrees))[:-1000]
     gene_mask[idx] = False
     X = X[:, gene_mask]
     X = X.toarray() if isinstance(X, csr_matrix) else X
@@ -127,11 +128,7 @@ def main(par):
     print(f"Evaluating {X.shape[1]} genes with {np.sum(A != 0)} regulatory links")
 
     # Create baseline model
-    A_baseline = np.copy(A)
-    for j in range(A.shape[1]):
-        np.random.shuffle(A_baseline[:j, j])
-        np.random.shuffle(A_baseline[j+1:, j])
-    assert np.any(A_baseline != A)
+    A_baseline = create_grn_baseline(A)
 
     scores, baseline_scores = [], []
     for group in np.unique(anchor_encoded):

From 5651d342d3e3693a8d76c93fe09f4e3a5e3bc99f Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Mon, 20 Oct 2025 12:30:49 +0200
Subject: [PATCH 11/19] Minor changes

---
 .../experimental/anchor_regression/helper.py  |  51 +++---
 src/metrics/experimental/vc/helper.py         |   2 +-
 src/metrics/tf_binding/helper.py              | 170 +++++-------------
 src/metrics/tf_binding/run_local.py           |   8 +-
 4 files changed, 82 insertions(+), 149 deletions(-)

diff --git a/src/metrics/experimental/anchor_regression/helper.py b/src/metrics/experimental/anchor_regression/helper.py
index f823406a6..93145bbaf 100644
--- a/src/metrics/experimental/anchor_regression/helper.py
+++ b/src/metrics/experimental/anchor_regression/helper.py
@@ -77,7 +77,7 @@ def anchor_regression(
         X: np.ndarray,
         Z: np.ndarray,
         Y: np.ndarray,
-        l2_reg: float = 1e-4,
+        l2_reg: float = 1e-5,
         gamma: float = 1.0
 ) -> np.ndarray:
     """Anchor regression for causal inference under confounding.
@@ -115,7 +115,6 @@ def cross_val(
         Z: np.ndarray,
         eps: float = 1e-50
 ) -> float:
-    cv = KFold(5)
     ss_res, ss_tot = 0, 0
     y_mean = np.mean(y)
     for idx_train, idx_test in cv.split(X, y):
@@ -152,23 +151,23 @@ def compute_stabilities_v2(
         X: np.ndarray,
         y: np.ndarray,
         Z: np.ndarray,
-        A: np.ndarray,
         is_selected: np.ndarray,
         eps: float = 1e-50
-) -> Tuple[float, float]:
-
-    theta0 = anchor_regression(X[:, is_selected], Z, y, gamma=1)
-    theta = anchor_regression(X[:, is_selected], Z, y, gamma=1.5)
-    s1 = np.clip(np.abs(theta0 - theta) / np.abs(theta0 + eps), 0, 1)
+) -> float:
 
-    theta0 = anchor_regression(X[:, ~is_selected], Z, y, gamma=1)
-    theta = anchor_regression(X[:, ~is_selected], Z, y, gamma=1.5)
-    s2 = np.clip(np.abs(theta0 - theta) / np.abs(theta0 + eps), 0, 1)
+    theta0 = np.abs(anchor_regression(X, Z, y, gamma=1))
+    theta0 /= np.sum(theta0)
+    theta = np.abs(anchor_regression(X, Z, y, gamma=2))
+    theta /= np.sum(theta)
+    s1 = np.clip(np.abs(theta0[is_selected] - theta[is_selected]) / np.abs(theta0[is_selected] + eps), 0, 1)
+    s2 = np.clip(np.abs(theta0[~is_selected] - theta[~is_selected]) / np.abs(theta0[~is_selected] + eps), 0, 1)
 
     s1 = np.mean(s1)
     s2 = np.mean(s2)
 
-    return s1, s2
+    stability = (s1 - s2) / (s1 + s2 + eps)
+
+    return stability
 
 
 def evaluate_gene_stability(
@@ -199,27 +198,16 @@ def evaluate_gene_stability(
     mask = np.ones(X.shape[1], dtype=bool)
     mask[j] = False
 
-    score = compute_stabilities(X[:, mask], X[:, j], Z, A[mask, j], is_selected[mask], eps=eps)
-    #stabilities_non_selected = np.mean(compute_stabilities(X[:, mask], X[:, j], Z, A, ~is_selected[mask], eps=eps))
-    #score = (stabilities_selected - stabilities_non_selected) / (stabilities_selected + stabilities_non_selected + eps)
-    #score = np.mean(stabilities_selected)
-
-    return score
+    return compute_stabilities_v2(X[:, mask], X[:, j], Z, is_selected[mask])
 
 
 def main(par):
     """Main anchor regression evaluation function."""
+
     # Load evaluation data
     adata = ad.read_h5ad(par['evaluation_data'])
     dataset_id = adata.uns['dataset_id']
     method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id']
-    
-    # Get dataset-specific anchor variables
-    if dataset_id not in DATASET_GROUPS:
-        raise ValueError(f"Dataset {dataset_id} not found in DATASET_GROUPS")
-    
-    anchor_cols = DATASET_GROUPS[dataset_id].get('anchors', ['donor_id', 'plate_name'])
-    print(f"Using anchor variables: {anchor_cols}")
 
     # Manage layer
     layer = manage_layer(adata, par)
@@ -231,9 +219,22 @@ def main(par):
     gene_names = adata.var_names
     gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
 
+    # Get dataset-specific anchor variables
+    if dataset_id not in DATASET_GROUPS:
+        raise ValueError(f"Dataset {dataset_id} not found in DATASET_GROUPS")
+    anchor_cols = DATASET_GROUPS[dataset_id].get('anchors', ['donor_id', 'plate_name'])
+    print(f"Using anchor variables: {anchor_cols}")
+
     # Encode anchor variables
     anchor_variables = encode_obs_cols(adata, anchor_cols)
     anchor_encoded = combine_multi_index(*anchor_variables)
+
+    # Get CV groups
+    if "cell_type" in adata.obs:
+        cv_groups = LabelEncoder().fit_transform(adata.obs["cell_type"].values)
+    else:
+        np.random.randint(0, 5)
+        cv_groups = np.random.shuffle()
     
     if len(anchor_variables) == 0:
         raise ValueError(f"No anchor variables found in dataset for columns: {anchor_cols}")
diff --git a/src/metrics/experimental/vc/helper.py b/src/metrics/experimental/vc/helper.py
index 61378bb66..8e8c678a2 100644
--- a/src/metrics/experimental/vc/helper.py
+++ b/src/metrics/experimental/vc/helper.py
@@ -441,7 +441,7 @@ def main(par):
     print(f"Using {len(gene_names)} genes present in the GRN")
     
     # Additional memory-aware gene filtering for very large GRNs
-    MAX_GENES_FOR_MEMORY = 2000  # Reduced further to avoid memory issues
+    MAX_GENES_FOR_MEMORY = 500  # Reduced further to avoid memory issues
     if len(gene_names) > MAX_GENES_FOR_MEMORY:
         print(f"Too many genes ({len(gene_names)}) for memory. Selecting top {MAX_GENES_FOR_MEMORY} by GRN connectivity.")
         
diff --git a/src/metrics/tf_binding/helper.py b/src/metrics/tf_binding/helper.py
index 4b8978791..9d4574470 100644
--- a/src/metrics/tf_binding/helper.py
+++ b/src/metrics/tf_binding/helper.py
@@ -1,144 +1,72 @@
+from typing import Optional, Dict
 
-import math
-import pandas as pd
 import numpy as np
-from sklearn.metrics import average_precision_score
-from tqdm import tqdm
+import pandas as pd
+from sklearn.metrics import recall_score, precision_score, matthews_corrcoef, roc_auc_score, average_precision_score
 from util import read_prediction
-from scipy.stats import wilcoxon
-
-# Wilcoxon signed-rank test: AP vs prevalence
-def wilcoxon_logp(x, y):
-    try:
-        stat, p = wilcoxon(x, y, zero_method='wilcox', alternative='greater')
-        return -np.log10(p) if p > 0 else np.inf
-    except Exception:
-        return np.nan
 
 
 # For reproducibility
 seed = 42
 np.random.seed(seed)
 
-def main(par):
 
-    # Load predictions
-    prediction = read_prediction(par)
-    assert prediction.shape[0] > 0, 'No links found in the network'
-    if not {'source', 'target', 'weight'}.issubset(set(prediction.columns)):
-        raise ValueError("prediction must have columns: 'source', 'target', 'weight'")
+def load_grn_from_dataframe(
+        df: pd.DataFrame,
+        tf_dict: Dict[str, int],
+        tg_dict: Dict[str, int]
+) -> np.ndarray:
+    G = np.zeros((len(tf_dict), len(tg_dict)), dtype=float)
+    for tf_name, tg_name, weight in zip(df['source'], df['target'], df['weight']):
+        if tf_name not in tf_dict:
+            continue
+        if tg_name not in tg_dict:
+            continue
+        i = tf_dict[tf_name]
+        j = tg_dict[tg_name]
+        G[i, j] = weight
+    return G
+
+
+def main(par):
 
-    # Load TF list and ground truth edges
-    tf_all = np.loadtxt(par['tf_all'], dtype=str, delimiter=',', skiprows=1)
+    # Load ground-truth edges (consider only TFs listed in the file loaded hereabove)
     true_graph = pd.read_csv(par['ground_truth'])
     if not {'source', 'target'}.issubset(set(true_graph.columns)):
         raise ValueError("ground_truth must have columns: 'source', 'target'")
+    if 'weight' not in true_graph:
+        true_graph['weight'] = np.ones(len(true_graph))
 
-    # Keep only TFs that are in the provided TF list
-    true_graph = true_graph[true_graph['source'].isin(tf_all)]
-    assert true_graph.shape[0] > 0, 'No links found in the ground truth after filtering to TF list'
-
-    # Use the union of all targets that appear in either predictions or ground-truth
-    pred_targets = set(prediction['target'].astype(str).unique())
-    true_targets = set(true_graph['target'].astype(str).unique())
-    evaluation_genes = sorted(pred_targets.union(true_targets))
-    if len(evaluation_genes) == 0:
-        raise ValueError("Empty evaluation target set (no targets in predictions or ground truth).")
-
-    # Precompute lookup to avoid O(n^2)
-    gene_to_idx = {g: i for i, g in enumerate(evaluation_genes)}
-    n_targets_total = len(evaluation_genes)
-
-    scores_model = []
-    # Iterate over TFs that actually have at least one positive in ground-truth
-    for tf in tqdm(true_graph['source'].unique()):
-        # Positives for this TF
-        true_edges = true_graph[true_graph['source'] == tf]
-        y_true = np.zeros(n_targets_total, dtype=int)
-        
-        # Mark positives
-        for t in true_edges['target'].astype(str):
-            y_true[gene_to_idx[t]] = 1
-        assert y_true.sum() > 0, f'TF {tf} has no positive targets in ground truth'
-
-        # Scores over all candidate targets
-        y_score = np.zeros(n_targets_total, dtype=float)
-        if tf in prediction['source'].unique():
-            pred_edges = prediction[prediction['source'] == tf]
-
-            # Fill scores where predicted
-            for tgt, w in zip(pred_edges['target'].astype(str), pred_edges['weight'].abs().astype(float)):
-                idx = gene_to_idx.get(tgt, None)
-                if idx is not None:
-                    y_score[idx] = w
-
-            # Ensure numerical safety
-            y_score = np.nan_to_num(y_score, nan=0.0, posinf=0.0, neginf=0.0)
-            # AP requires at least one positive
-            ap = average_precision_score(y_true, y_score)
-        else:
-            ap = np.nan
-
-        # Random baseline AUPRC is just the prevalence of positives
-        prevalence = y_true.mean()
-
-        # Calculate recall@k for this TF
-        
-        if tf in prediction['source'].unique() and not np.isnan(ap):
-            gt_targets = set(true_edges['target'].astype(str))
-            k = len(gt_targets)  # Top k predictions to consider
-            # Get top k predicted targets for this TF
-            genes_in_grn = set(pred_edges.sort_values(by='weight', key=abs, ascending=False).head(k)['target'].astype(str).unique() )
-            
-            # Calculate recall@k
-            true_positives_in_top_k = len(gt_targets & genes_in_grn)
-            recall_at_k = true_positives_in_top_k / len(gt_targets)
-            
-            # Calculate random recall baseline
-            random_recall = len(genes_in_grn) / n_targets_total
-            lift_recall = recall_at_k / (random_recall + 1e-10)
-        else:
-            recall_at_k = np.nan
-            lift_recall = np.nan
-        
-        scores_model.append({
-            'source': tf,
-            'ap': ap,
-            'n_targets_pos': int(y_true.sum()),
-            'prevalence': prevalence,
-            'recall_at_k': recall_at_k,
-            'lift_recall': lift_recall
-        })
-
-        from scipy.stats import wilcoxon
-
-    scores_df = pd.DataFrame(scores_model)
-    ap_values_pred = scores_df['ap'].dropna().values
-    preval_all = scores_df['prevalence'].values
-    preval_pred = scores_df[~scores_df['ap'].isna()]['prevalence'].values
-    ap_pred_mean = ap_values_pred.mean() if len(ap_values_pred) > 0 else 0.0
-    preval_pred_mean = preval_pred.mean() if len(preval_pred) > 0 else 0.0
-    lift_pred = ap_pred_mean / (preval_pred_mean + 1e-10)
+    # Load inferred GRN
+    prediction = read_prediction(par)
+    assert prediction.shape[0] > 0, 'No links found in the network'
+    if not {'source', 'target', 'weight'}.issubset(set(prediction.columns)):
+        raise ValueError("prediction must have columns: 'source', 'target', 'weight'")
 
-    # Calculate mean recall metrics
-    recall_values_pred = scores_df['recall_at_k'].dropna().values
-    lift_recall_values_pred = scores_df['lift_recall'].dropna().values
-    lift_recall_values_all = scores_df['lift_recall'].fillna(1).values
+    # Intersect TF lists, intersect TG lists
+    tf_names = set(true_graph['source'].unique())  #.intersection(set(tf_all))
+    tg_names = set(true_graph['target'].unique())
+    #tf_names = set(true_graph['source'].unique()).intersection(set(prediction['source'].unique()))  #.intersection(set(tf_all))
+    #tg_names = set(true_graph['target'].unique()).intersection(set(prediction['target'].unique()))
+    tf_dict = {tf_name: i for i, tf_name in enumerate(tf_names)}
+    tg_dict = {tg_name: i for i, tg_name in enumerate(tg_names)}
 
-    mean_recall_at_k = recall_values_pred.mean() if len(recall_values_pred) > 0 else 0.0
-    mean_lift_recall = lift_recall_values_pred.mean() if len(lift_recall_values_pred) > 0 else 1
-    mean_lift_recall_all = lift_recall_values_all.mean() if len(lift_recall_values_all) > 0 else 1
+    # Reformat GRNs as NumPy arrays
+    A = load_grn_from_dataframe(true_graph, tf_dict, tg_dict)
+    G = load_grn_from_dataframe(prediction, tf_dict, tg_dict)
 
-    # Calculate TF coverage
-    n_tfs_in_gt = true_graph['source'].nunique()
-    n_tfs_in_grn = set(prediction['source'].unique()) & set(true_graph['source'].unique())
-    n_tfs_in_grn = len(n_tfs_in_grn)
+    G = np.abs(G)
+    G += np.random.uniform(0, 1e-20, size=G.shape)
 
-    recall = mean_lift_recall_all * mean_lift_recall
+    # Evaluate inferred GRN
+    tf_binding_precision = precision_score(A.flatten(), (G != 0).flatten())
+    tf_binding_recall = recall_score(A.flatten(), (G != 0).flatten())
+    final_score = roc_auc_score(A.flatten(), G.flatten())
 
     summary_df = pd.DataFrame([{
-        'tf_binding_precision': lift_pred,
-        'tf_binding_recall': recall
+        'tf_binding_precision': tf_binding_precision,
+        'tf_binding_recall': tf_binding_recall,
+        'final_score': final_score
     }])
 
-    return summary_df
\ No newline at end of file
+    return summary_df
diff --git a/src/metrics/tf_binding/run_local.py b/src/metrics/tf_binding/run_local.py
index e509c1496..41470757f 100644
--- a/src/metrics/tf_binding/run_local.py
+++ b/src/metrics/tf_binding/run_local.py
@@ -4,9 +4,13 @@
 import pandas as pd
 
 
-for dataset in ["replogle", "norman", "adamson"]:
-    for method in ["negative_control", "pearson_corr", "positive_control", "ppcor", "portia", "scenic", "grnboost", "scprint", "scenicplus", "celloracle", "scglue", "figr", "granie"]:
+for dataset in ["op"]:
+    #for method in ['granie']:
+    for method in ["negative_control", "granie", "ppcor", "portia", "pearson_corr", "positive_control", "scenic", "grnboost", "scprint", "scenicplus", "celloracle", "scglue", "figr"]:
         
+        print()
+        print(method)
+
         score_filepath = f"output/tf_binding/tf_binding.h5ad"
         subprocess.call([
             "python",

From 0f6cf0bb5eb97f3f978aa4fc4c47719858781738 Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Mon, 20 Oct 2025 12:33:11 +0200
Subject: [PATCH 12/19] Minor change

---
 src/metrics/tf_binding/run_local.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/metrics/tf_binding/run_local.py b/src/metrics/tf_binding/run_local.py
index 41470757f..0599d0d8f 100644
--- a/src/metrics/tf_binding/run_local.py
+++ b/src/metrics/tf_binding/run_local.py
@@ -4,7 +4,7 @@
 import pandas as pd
 
 
-for dataset in ["op"]:
+for dataset in ["replogle"]:
     #for method in ['granie']:
     for method in ["negative_control", "granie", "ppcor", "portia", "pearson_corr", "positive_control", "scenic", "grnboost", "scprint", "scenicplus", "celloracle", "scglue", "figr"]:
         

From 7db1968b856d38a0d94a8bb86ea6929508d06439 Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Thu, 23 Oct 2025 00:59:00 +0200
Subject: [PATCH 13/19] tf_binding GT: strand-aware promoter region

---
 src/metrics/tf_binding/acquire/script.py | 30 +++++++++++++-----------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/metrics/tf_binding/acquire/script.py b/src/metrics/tf_binding/acquire/script.py
index fb2efa5c5..b00445258 100644
--- a/src/metrics/tf_binding/acquire/script.py
+++ b/src/metrics/tf_binding/acquire/script.py
@@ -1,5 +1,6 @@
 import pandas as pd
 import os
+import numpy as np
 import urllib.request
 from tqdm import tqdm
 
@@ -77,11 +78,13 @@ def download_tss_file(tss_local_path):
     TSS_FILE = 'refGene.txt.gz'
     TSS_URL = f'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/{TSS_FILE}'
     TSS_PATH = os.path.join(local_path, TSS_FILE)
+
     if not os.path.exists(tss_local_path):
         print(f'Downloading TSS (refGene) from {TSS_URL} to {tss_local_path}')
+        os.makedirs(os.path.dirname(tss_local_path), exist_ok=True)
         urllib.request.urlretrieve(TSS_URL, tss_local_path)
 
-def genes_with_peaks_near_tss(peaks_df, tss_file, window_bp=1000):
+def genes_with_peaks_near_tss(peaks_df, tss_file, window_up=1000, window_down=100):
     if peaks_df is None or len(peaks_df) == 0:
         raise ValueError("Peaks DataFrame is empty or None.")
     
@@ -99,11 +102,11 @@ def genes_with_peaks_near_tss(peaks_df, tss_file, window_bp=1000):
     tx_end = pd.to_numeric(ref.iloc[:, 5], errors='coerce').fillna(0).astype(int)
     gene = ref.iloc[:, 12].astype(str)
     tss = tx_start.where(strand == '+', tx_end)
-    win_start = (tss - int(window_bp)).clip(lower=0)
-    win_end = tss + int(window_bp)
+    win_start = np.where(strand == '+', tss - window_up, tss - window_down)[0]
+    win_end = np.where(strand == '+', tss + window_down, tss + window_up)[0]
     genes_df = pd.DataFrame({
         'chrom': chrom,
-        'start': win_start.astype(int),
+        'start': np.clip(win_start, 0, None).astype(int), 
         'end': win_end.astype(int),
         'gene': gene.str.strip(),
     })
@@ -129,22 +132,22 @@ def genes_with_peaks_near_tss(peaks_df, tss_file, window_bp=1000):
 from tqdm import tqdm
 from concurrent.futures import ProcessPoolExecutor, as_completed
 
-def _process_tf(tf, peaks_df, tss_file, window_bp):
+def _process_tf(tf, peaks_df, tss_file):
     peaks = peaks_df[peaks_df['tf'] == tf]
     if peaks is None or len(peaks) == 0:
         return []
-    genes = genes_with_peaks_near_tss(peaks, tss_file, window_bp=window_bp)
+    genes = genes_with_peaks_near_tss(peaks, tss_file)
     if not genes:
         return []
     return [{'source': tf, 'target': g} for g in genes]
 
-def build_grn(peaks_df, tss_file, genome='hg38', window_bp=1000, max_workers=None):
+def build_grn(peaks_df, tss_file, genome='hg38', max_workers=None):
     tfs = peaks_df['tf'].unique()
     rows = []
 
     # Use ProcessPoolExecutor for parallel processing
     with ProcessPoolExecutor(max_workers=max_workers) as executor:
-        futures = {executor.submit(_process_tf, tf, peaks_df, tss_file, window_bp): tf for tf in tfs}
+        futures = {executor.submit(_process_tf, tf, peaks_df, tss_file): tf for tf in tfs}
         for future in tqdm(as_completed(futures), total=len(futures)):
             result = future.result()
             if result:
@@ -154,7 +157,6 @@ def build_grn(peaks_df, tss_file, genome='hg38', window_bp=1000, max_workers=Non
     return grn
 if __name__ == '__main__':
     genome = 'hg38'
-    window_bp = 1000
     qval = "50"
     local_path = 'resources/datasets_raw/chipseq/'
     tss_local_path = os.path.join(local_path, 'TSS.txt.gz')
@@ -172,16 +174,16 @@ def build_grn(peaks_df, tss_file, genome='hg38', window_bp=1000, max_workers=Non
             peaks_df = read_peak_file(peaks_file, source='chip_atlas')
             if True: 
                 # Filter for score == 1000
-                print('Filtering peaks with score > 0', flush=True)
+                print('Filtering peaks with score > 50', flush=True)
                 print(f'Original number of peaks: {len(peaks_df)}', flush=True)
-                peaks_df = peaks_df[peaks_df['score'] > 0]
+                peaks_df = peaks_df[peaks_df['score'] > 50]
                 print(f'Number of peaks after filtering: {len(peaks_df)}', flush=True)
-            grn = build_grn(peaks_df, tss_local_path, genome=genome, window_bp=window_bp, max_workers=10)
+            grn = build_grn(peaks_df, tss_local_path, genome=genome, max_workers=10)
             grn.to_csv(output_csv_path, index=False)
     
     if True: # remap
         print('Processing remap2', flush=True)
-        if False:
+        if True:
             print('Processing remap2022_all_macs2_hg38_v1_0', flush=True)
             peaks_file = f"{local_path}/remap/remap2022_all_macs2_hg38_v1_0.bed.gz"
             print('Reading peaks file', flush=True)
@@ -218,7 +220,7 @@ def build_grn(peaks_df, tss_file, genome='hg38', window_bp=1000, max_workers=Non
             df = df[df['score'] > 0]
             print("After filtering, ", df.shape[0], " peaks")
 
-            grn = build_grn(df, tss_local_path, genome=genome, window_bp=window_bp)
+            grn = build_grn(df, tss_local_path, genome=genome)
             output_csv_path = f'resources/grn_benchmark/ground_truth/{cell_type}_remap.csv'
             print(f'Writing GRN to {output_csv_path}', flush=True)
             grn.to_csv(output_csv_path, index=False)

From 71cc64f48a48a8400b8d749b2e7e06b122487756 Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Sat, 25 Oct 2025 17:29:45 +0200
Subject: [PATCH 14/19] Add BEELINE datasets

---
 src/methods/portia/script.py               |  4 +-
 src/metrics/gt/acquire/run_local.sh        | 13 ++++
 src/metrics/gt/acquire/script.py           | 19 ++++++
 src/metrics/gt/helper.py                   | 71 ++++++++++++++++++++
 src/metrics/gt/run_local.py                | 37 ++++++++++
 src/metrics/gt/run_local.sh                | 78 ++++++++++++++++++++++
 src/metrics/gt/script.py                   | 45 +++++++++++++
 src/metrics/tf_binding/acquire/script.py   |  4 +-
 src/process_data/beeline/config.novsh.yaml | 53 +++++++++++++++
 src/process_data/beeline/script.py         | 73 ++++++++++++++++++++
 src/utils/util.py                          | 48 +++++++++++++
 11 files changed, 440 insertions(+), 5 deletions(-)
 create mode 100644 src/metrics/gt/acquire/run_local.sh
 create mode 100644 src/metrics/gt/acquire/script.py
 create mode 100644 src/metrics/gt/helper.py
 create mode 100644 src/metrics/gt/run_local.py
 create mode 100644 src/metrics/gt/run_local.sh
 create mode 100644 src/metrics/gt/script.py
 create mode 100644 src/process_data/beeline/config.novsh.yaml
 create mode 100644 src/process_data/beeline/script.py

diff --git a/src/methods/portia/script.py b/src/methods/portia/script.py
index d2f0cfab5..2ab94a09b 100644
--- a/src/methods/portia/script.py
+++ b/src/methods/portia/script.py
@@ -44,19 +44,17 @@ def main(par):
   tf_names = [gene_name for gene_name in gene_names if (gene_name in tfs)]
   tf_idx = np.asarray([i for i, gene_name in enumerate(gene_names) if gene_name in tf_names], dtype=int)
 
-
   print('Inferring grn')
   dataset = pt.GeneExpressionDataset()
   
   for exp_id, data in enumerate(X):
       dataset.add(pt.Experiment(exp_id, data))
-  
+
   M_bar = pt.run(dataset, tf_idx=tf_idx, method='no-transform')
   ranked_scores = pt.rank_scores(M_bar, gene_names, limit=par['max_n_links'])
   sources, targets, weights = zip(*[(gene_a, gene_b, score) for gene_a, gene_b, score in ranked_scores])
 
   grn = pd.DataFrame({'source':sources, 'target':targets, 'weight':weights})
-  print(grn.shape)
   grn = grn[grn.source.isin(tf_names)]
 
   grn = process_links(grn, par)
diff --git a/src/metrics/gt/acquire/run_local.sh b/src/metrics/gt/acquire/run_local.sh
new file mode 100644
index 000000000..c735e7857
--- /dev/null
+++ b/src/metrics/gt/acquire/run_local.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+#SBATCH --job-name=beeline_data
+#SBATCH --output=logs/%j.out
+#SBATCH --error=logs/%j.err
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=2
+#SBATCH --time=10:00:00
+#SBATCH --mem=250GB
+#SBATCH --partition=cpu
+#SBATCH --mail-type=END,FAIL      
+#SBATCH --mail-user=jalil.nourisa@gmail.com   
+
+python src/metrics/tf_binding/acquire/script.py 
\ No newline at end of file
diff --git a/src/metrics/gt/acquire/script.py b/src/metrics/gt/acquire/script.py
new file mode 100644
index 000000000..5c4eaad6f
--- /dev/null
+++ b/src/metrics/gt/acquire/script.py
@@ -0,0 +1,19 @@
+import os
+import sys
+import io
+import zipfile
+import tempfile
+import requests
+from pathlib import Path
+
+
+sys.path.append("src/utils")
+from util import download_and_uncompress_zip
+
+
+if __name__ == "__main__":
+
+    download_and_uncompress_zip(
+        "https://zenodo.org/records/3701939/files/BEELINE-Networks.zip?download=1",
+        "resources/grn_benchmark/ground_truth/beeline"
+    )
diff --git a/src/metrics/gt/helper.py b/src/metrics/gt/helper.py
new file mode 100644
index 000000000..2e2adc853
--- /dev/null
+++ b/src/metrics/gt/helper.py
@@ -0,0 +1,71 @@
+from typing import Optional, Dict
+
+import numpy as np
+import pandas as pd
+from sklearn.metrics import recall_score, precision_score, matthews_corrcoef, roc_auc_score, average_precision_score
+from util import read_prediction
+
+
+# For reproducibility
+seed = 42
+np.random.seed(seed)
+
+
+def load_grn_from_dataframe(
+        df: pd.DataFrame,
+        tf_dict: Dict[str, int],
+        tg_dict: Dict[str, int]
+) -> np.ndarray:
+    G = np.zeros((len(tf_dict), len(tg_dict)), dtype=float)
+    for tf_name, tg_name, weight in zip(df['source'], df['target'], df['weight']):
+        if tf_name not in tf_dict:
+            continue
+        if tg_name not in tg_dict:
+            continue
+        i = tf_dict[tf_name]
+        j = tg_dict[tg_name]
+        G[i, j] = weight
+    return G
+
+
+def main(par):
+
+    # Load ground-truth edges (consider only TFs listed in the file loaded hereabove)
+    true_graph = pd.read_csv(par['ground_truth'])
+    if not {'Gene1', 'Gene2'}.issubset(set(true_graph.columns)):
+        raise ValueError("ground_truth must have columns: 'Gene1', 'Gene2'")
+    if 'weight' not in true_graph:
+        true_graph['weight'] = np.ones(len(true_graph))
+    true_graph.rename(columns={"Gene1": "source", "Gene2": "target"}, copy=False, inplace=True)
+
+    # Load inferred GRN
+    prediction = read_prediction(par)
+    assert prediction.shape[0] > 0, 'No links found in the network'
+    if not {'source', 'target', 'weight'}.issubset(set(prediction.columns)):
+        raise ValueError("prediction must have columns: 'source', 'target', 'weight'")
+
+    # Intersect TF lists, intersect TG lists
+    tf_names = set(true_graph['source'].unique())  #.intersection(set(tf_all))
+    tg_names = set(true_graph['target'].unique())
+    #tf_names = set(true_graph['source'].unique()).intersection(set(prediction['source'].unique()))  #.intersection(set(tf_all))
+    #tg_names = set(true_graph['target'].unique()).intersection(set(prediction['target'].unique()))
+    tf_dict = {tf_name: i for i, tf_name in enumerate(tf_names)}
+    tg_dict = {tg_name: i for i, tg_name in enumerate(tg_names)}
+
+    # Reformat GRNs as NumPy arrays
+    A = load_grn_from_dataframe(true_graph, tf_dict, tg_dict)
+    G = load_grn_from_dataframe(prediction, tf_dict, tg_dict)
+    G = np.abs(G)
+
+    # Evaluate inferred GRN
+    tf_binding_precision = precision_score((A != 0).flatten(), (G != 0).flatten())
+    tf_binding_recall = recall_score((A != 0).flatten(), (G != 0).flatten())
+    final_score = roc_auc_score(A.flatten(), G.flatten())
+
+    summary_df = pd.DataFrame([{
+        'tf_binding_precision': tf_binding_precision,
+        'tf_binding_recall': tf_binding_recall,
+        'final_score': final_score
+    }])
+
+    return summary_df
diff --git a/src/metrics/gt/run_local.py b/src/metrics/gt/run_local.py
new file mode 100644
index 000000000..3d8e1898e
--- /dev/null
+++ b/src/metrics/gt/run_local.py
@@ -0,0 +1,37 @@
+import subprocess
+
+import anndata as ad
+import pandas as pd
+
+
+results = {"dataset": [], "method": [], "score": []}
+for dataset in ["beeline_hESC", "beeline_hHep", "beeline_mDC", "beeline_mESC", "beeline_mESC-E", "beeline_mHSC-E", "beeline_mHSC-L"]:
+    #for method in ["negative_control", "granie", "ppcor", "portia", "pearson_corr", "positive_control", "scenic", "grnboost", "scprint", "scenicplus", "celloracle", "scglue", "figr"]:
+    #for method in ['granie', 'ppcor', 'portia', 'negative_control']:
+    for method in ['portia']:
+        
+        print()
+        print(method)
+
+        score_filepath = f"output/gt/gt.h5ad"
+        if "_h" in dataset:
+            gt = f"human/{dataset.split('_')[1]}-ChIP-seq-network"
+        else:
+            gt = f"mouse/{dataset.split('_')[1]}-ChIP-seq-network"
+        subprocess.call([
+            "python",
+            "src/metrics/gt/script.py",
+            "--prediction", f"resources/results/{dataset}/{method}.h5ad",
+            "--evaluation_data", f'resources/grn_benchmark/inference_data/{dataset}.h5ad',
+            "--ground_truth", f"resources/grn_benchmark/ground_truth/beeline/Networks/{gt}.csv",
+            "--score", score_filepath
+        ])
+
+        adata = ad.read_h5ad(score_filepath)
+        if "metric_values" in adata.uns:
+            metric_names = adata.uns["metric_ids"]
+            metric_values = adata.uns["metric_values"]
+            print(metric_values)
+
+df = pd.DataFrame(results)
+df.to_csv(f"output/gt/gt_scores_beeline.csv", header=True, index=False)
diff --git a/src/metrics/gt/run_local.sh b/src/metrics/gt/run_local.sh
new file mode 100644
index 000000000..0b2ef233a
--- /dev/null
+++ b/src/metrics/gt/run_local.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#SBATCH --job-name=tf_binding
+#SBATCH --output=logs/%j.out
+#SBATCH --error=logs/%j.err
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=20
+#SBATCH --time=30:00:00
+#SBATCH --mem=250GB
+#SBATCH --partition=cpu
+#SBATCH --mail-type=END,FAIL      
+#SBATCH --mail-user=jalil.nourisa@gmail.com   
+
+set -euo pipefail
+
+save_dir="output/tf_binding"
+mkdir -p "$save_dir"
+
+# datasets to process
+datasets=(  'replogle' 'norman'  'adamson' ) # 'xaira_HCT116' 'replogle' 'norman'  'adamson') #"300BCG" "ibd" 'parsebioscience''op' "300BCG" 'parsebioscience'   'replogle' 'norman' 'adamson'
+# methods to process
+methods=( "pearson_corr" "negative_control" "positive_control" "ppcor" "portia" "scenic" "grnboost" "scprint" "scenicplus" "celloracle" "scglue" "figr" "granie")
+
+for dataset in "${datasets[@]}"; do
+    echo -e "\n\nProcessing dataset: $dataset\n"
+    
+    # Create separate CSV file for each dataset
+    dataset_csv="${save_dir}/tf_binding_scores_${dataset}.csv"
+    echo "dataset,method,metric,value" > "$dataset_csv"
+
+    evaluation_data="resources/grn_benchmark/evaluation_data/${dataset}_bulk.h5ad"
+
+    for method in "${methods[@]}"; do
+        prediction="resources/results/${dataset}/${dataset}.${method}.${method}.prediction.h5ad"
+        score="${save_dir}/tf_binding_${dataset}_${method}.h5ad"
+
+        if [[ ! -f "$prediction" ]]; then
+            echo "File not found: $prediction, skipping..."
+            continue
+        fi
+        if [[ "$dataset" == "replogle" || "$dataset" == "norman" || "$dataset" == "adamson" ]]; then
+            ground_truth="resources/grn_benchmark/ground_truth/K562_remap.csv"
+        elif [[ "$dataset" == "xaira_HEK293T" ]]; then
+            ground_truth="resources/grn_benchmark/ground_truth/HEK293T_remap.csv"
+        elif [[ "$dataset" == "xaira_HCT116" ]]; then
+            ground_truth="resources/grn_benchmark/ground_truth/HCT116_chipatlas.csv"
+        else
+            echo "No ground truth available for dataset: $dataset, skipping..."
+            continue
+        fi
+
+        echo -e "\nProcessing method: $method\n"
+        python src/metrics/tf_binding/script.py \
+            --prediction "$prediction" \
+            --evaluation_data "$evaluation_data" \
+            --ground_truth "$ground_truth" \
+            --score "$score"
+
+        # Extract metrics from the .h5ad and append to CSV
+        python -u - <<EOF
+import anndata as ad
+import pandas as pd
+
+adata = ad.read_h5ad("${score}")
+if "metric_values" in adata.uns:
+    metric_names = adata.uns["metric_ids"]
+    metric_values = adata.uns["metric_values"]
+    df = pd.DataFrame({"metric": metric_names, "value": metric_values})
+    df["dataset"] = "${dataset}"
+    df["method"] = "${method}"
+    df = df[["dataset", "method", "metric", "value"]]  # Reorder columns to match header
+    df.to_csv("${dataset_csv}", mode="a", header=False, index=False)
+EOF
+
+    done
+    echo -e "\nResults for dataset $dataset collected in: $dataset_csv"
+done
+
+echo -e "\nAll dataset results saved in separate CSV files in: $save_dir"
\ No newline at end of file
diff --git a/src/metrics/gt/script.py b/src/metrics/gt/script.py
new file mode 100644
index 000000000..c6398f710
--- /dev/null
+++ b/src/metrics/gt/script.py
@@ -0,0 +1,45 @@
+import os
+import sys
+import anndata as ad
+import numpy as np
+import pandas as pd
+
+
+## VIASH START
+par = {
+    'prediction': f'resources/results/op/op.pearson_corr.pearson_corr.prediction.h5ad',
+    'dataset_name': f'resources/grn_benchmark/inference_data/beeline_hESC.h5ad',
+    'ground_truth': f'resources/grn_benchmark/ground_truth/pbmc.csv',
+    'layer': 'lognorm',
+    "max_n_links": 50000,
+    'num_workers': 20,
+    'tf_all': 'resources/grn_benchmark/prior/tf_all.csv',    
+    'score': 'output/score.h5ad',
+    'genes_n': 5000
+}
+## VIASH END
+
+try:
+    sys.path.append(meta["resources_dir"])
+except:
+    meta = {
+    "resources_dir":'src/metrics/gt',
+    "util_dir": 'src/utils',
+    }
+    sys.path.append(meta["resources_dir"])
+    sys.path.append(meta["util_dir"])
+from helper import main as main 
+from util import format_save_score, parse_args
+
+args = parse_args(par)
+
+
+if __name__ == "__main__":
+    output = main(par)
+    
+    # print(output)
+
+    dataset_id = ad.read_h5ad(par['evaluation_data'], backed='r').uns['dataset_id']
+    method_id = ad.read_h5ad(par['prediction'], backed='r').uns['method_id']
+    os.makedirs(os.path.dirname(par['score']), exist_ok=True)
+    format_save_score(output, method_id, dataset_id, par['score'])
\ No newline at end of file
diff --git a/src/metrics/tf_binding/acquire/script.py b/src/metrics/tf_binding/acquire/script.py
index b00445258..dc678e50d 100644
--- a/src/metrics/tf_binding/acquire/script.py
+++ b/src/metrics/tf_binding/acquire/script.py
@@ -174,9 +174,9 @@ def build_grn(peaks_df, tss_file, genome='hg38', max_workers=None):
             peaks_df = read_peak_file(peaks_file, source='chip_atlas')
             if True: 
                 # Filter for score == 1000
-                print('Filtering peaks with score > 50', flush=True)
+                print('Filtering peaks with score > 0', flush=True)
                 print(f'Original number of peaks: {len(peaks_df)}', flush=True)
-                peaks_df = peaks_df[peaks_df['score'] > 50]
+                peaks_df = peaks_df[peaks_df['score'] > 0]
                 print(f'Number of peaks after filtering: {len(peaks_df)}', flush=True)
             grn = build_grn(peaks_df, tss_local_path, genome=genome, max_workers=10)
             grn.to_csv(output_csv_path, index=False)
diff --git a/src/process_data/beeline/config.novsh.yaml b/src/process_data/beeline/config.novsh.yaml
new file mode 100644
index 000000000..922e7a7de
--- /dev/null
+++ b/src/process_data/beeline/config.novsh.yaml
@@ -0,0 +1,53 @@
+
+name: beeline
+namespace: "process_data"
+info:
+  label: Process BEELINE dataset
+  summary: "Process sourced BEELINE data to generate inference and evaluation datasets"
+
+arguments:
+  - name: --replogle_gwps
+    type: file
+    required: true
+    direction: input
+  - name: --tf_all
+    type: file
+    required: true
+    direction: input
+  - name: --replogle_test_perturbs
+    type: file
+    required: true
+    direction: input
+  - name: --replogle_gwps_test_sc
+    type: file
+    required: true
+    direction: output
+  - name: --replogle_gwps_train_sc
+    type: file
+    required: true
+    direction: output
+  - name: --replogle_gwps_train_sc_subset
+    type: file
+    required: true
+    direction: output
+
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/util.py
+    dest: util.py
+
+engines:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    setup:
+      - type: python
+        packages: [ numpy==1.26.4 ]
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime, midmem, midcpu]
diff --git a/src/process_data/beeline/script.py b/src/process_data/beeline/script.py
new file mode 100644
index 000000000..72c03dfd7
--- /dev/null
+++ b/src/process_data/beeline/script.py
@@ -0,0 +1,73 @@
+
+import os 
+import anndata as ad 
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import sys
+import scanpy as sc
+from sklearn.model_selection import train_test_split 
+
+from scipy.sparse import csr_matrix
+
+## VIASH START
+par = {
+}
+## VIASH END
+
+try:
+    sys.path.append(meta["resources_dir"])
+except:
+    meta = {
+        'resources_dir': 'src/process_data/',
+        'util_dir': 'src/utils',
+    }
+    sys.path.append(meta["resources_dir"])
+    sys.path.append(meta["util_dir"])
+
+from helper_data import sum_by
+from util import download_and_uncompress_zip
+
+
+def add_metadata(adata, name: str):
+    adata.uns['dataset_summary'] = 'Processed experimental single-cell gene expression datasets used in BEELINE.'
+    adata.uns['dataset_description'] = 'Processed experimental single-cell gene expression datasets used in BEELINE.'
+    adata.uns['data_reference'] = "@dataset{aditya_pratapa_2020_3701939,\nauthor       = {Aditya Pratapa and Amogh Jalihal and Jeffrey Law and Aditya Bharadwaj and T M Murali},\n title        = {Benchmarking algorithms for gene regulatory network inference from single-cell transcriptomic data },\nmonth        = mar,\nyear         = 2020,\npublisher    = {Zenodo},\ndoi          = {10.5281/zenodo.3701939},\nurl          = {https://doi.org/10.5281/zenodo.3701939},\n}"
+    adata.uns['data_url'] = 'https://zenodo.org/records/3701939'
+    adata.uns['dataset_id'] = f'beeline_{name}'
+    adata.uns['dataset_name'] = f'BEELINE_{name}'
+    adata.uns['dataset_organism'] = 'human' if name.startswith('h') else 'mouse'
+    adata.uns['normalization_id'] = 'X_norm'
+    return adata
+
+
+if __name__ == '__main__':
+
+    # - get the data
+    download_and_uncompress_zip(
+        "https://zenodo.org/records/3701939/files/BEELINE-data.zip?download=1",
+        "resources/datasets_raw/beeline"
+    )
+
+    # Convert CSV files to H5AD
+    for name in ["hESC", "hHep", "mDC", "mESC", "mHSC-E", "mHSC-GM", "mHSC-L"]:
+        filepath = f"resources/datasets_raw/beeline/BEELINE-data/inputs/scRNA-Seq/{name}/ExpressionData.csv"
+
+        df = pd.read_csv(filepath, index_col=0).transpose()
+        adata = ad.AnnData(X=df.values, obs=df.index.to_frame(index=False), var=pd.DataFrame(index=df.columns))
+        adata.var.index.name = "gene_name"
+        adata.layers['X_norm'] = adata.X.copy()
+
+        # Make all obs/var column names strings and clean up a bit
+        adata.obs.columns = pd.Index(adata.obs.columns).map(str).str.replace("/", "_")
+        adata.var.columns = pd.Index(adata.var.columns).map(str).str.replace("/", "_")
+
+        # - filter genes and cells
+        sc.pp.filter_cells(adata, min_genes=100)
+        sc.pp.filter_genes(adata, min_cells=10)
+
+        # - add metadata
+        adata = add_metadata(adata, name)
+
+        # - save
+        adata.write(f"resources/grn_benchmark/inference_data/beeline_{name}.h5ad")
diff --git a/src/utils/util.py b/src/utils/util.py
index e54ef497d..14566bf2f 100644
--- a/src/utils/util.py
+++ b/src/utils/util.py
@@ -1,9 +1,16 @@
+import io
+import os
+import zipfile
+import tempfile
+import requests
+from pathlib import Path
 import pandas as pd
 import anndata as ad
 import numpy as np
 from tqdm import tqdm
 import scipy.sparse as sp
 
+
 def naming_convention(dataset, method): 
     if (dataset in ['replogle', 'parsescience', 'xaira_HEK293T']) & (method in ['scprint']):
         dataset = f'{dataset}_sc'
@@ -400,6 +407,47 @@ def download_annotation(par):
             print(f"Failed to download the gencode.v45.annotation.gtf.gz. Status code: {response.status_code}")
         print("Downloading prior ended")
 
+def download_and_uncompress_zip(url: str, folder: str) -> None:
+
+    chunk_size = 1 << 20
+    dest = Path(folder)
+    dest.mkdir(parents=True, exist_ok=True)
+
+    # Stream download -> seekable temp file (ZipFile needs seekable)
+    with requests.get(url, stream=True, timeout=60) as r:
+        r.raise_for_status()
+        with tempfile.TemporaryFile() as tf:
+            for chunk in r.iter_content(chunk_size=chunk_size):
+                if chunk:  # skip keep-alives
+                    tf.write(chunk)
+            tf.seek(0)
+
+            with zipfile.ZipFile(tf) as zf:
+                for member in zf.infolist():
+                    print(member)
+                    # Normalize path and prevent traversal outside `dest`
+                    extracted_path = dest / Path(member.filename).as_posix()
+                    normalized = extracted_path.resolve()
+
+                    if not str(normalized).startswith(str(dest.resolve()) + os.sep) and normalized != dest.resolve():
+                        raise RuntimeError(f"Unsafe path in ZIP: {member.filename}")
+
+                    if member.is_dir():
+                        normalized.mkdir(parents=True, exist_ok=True)
+                        continue
+
+                    # Ensure parent dirs exist
+                    normalized.parent.mkdir(parents=True, exist_ok=True)
+
+                    # Extract the file
+                    with zf.open(member, "r") as source, open(normalized, "wb") as target:
+                        while True:
+                            chunk = source.read(1 << 20)
+                            if not chunk:
+                                break
+                            target.write(chunk)
+
+
 def read_gtf_as_df(gtf_path: str) -> pd.DataFrame:
     """
     Read a GTF/GFF3 file (plain or gzipped) and return gene-level annotation

From b79593c15e79f5fafdce47743365e9005eab28af Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Mon, 27 Oct 2025 13:45:33 +0100
Subject: [PATCH 15/19] Improve anchor_regression metric

---
 .../experimental/anchor_regression/helper.py  | 49 ++-----------------
 1 file changed, 5 insertions(+), 44 deletions(-)

diff --git a/src/metrics/experimental/anchor_regression/helper.py b/src/metrics/experimental/anchor_regression/helper.py
index 93145bbaf..5bdfa9a65 100644
--- a/src/metrics/experimental/anchor_regression/helper.py
+++ b/src/metrics/experimental/anchor_regression/helper.py
@@ -77,7 +77,7 @@ def anchor_regression(
         X: np.ndarray,
         Z: np.ndarray,
         Y: np.ndarray,
-        l2_reg: float = 1e-5,
+        l2_reg: float = 1e-6,
         gamma: float = 1.0
 ) -> np.ndarray:
     """Anchor regression for causal inference under confounding.
@@ -108,46 +108,7 @@ def anchor_regression(
     return theta
 
 
-def cross_val(
-        cv,
-        X: np.ndarray,
-        y: np.ndarray,
-        Z: np.ndarray,
-        eps: float = 1e-50
-) -> float:
-    ss_res, ss_tot = 0, 0
-    y_mean = np.mean(y)
-    for idx_train, idx_test in cv.split(X, y):
-        X_train, X_test = X[idx_train, :], X[idx_test, :]
-        Z_train, Z_test = Z[idx_train, :], Z[idx_test, :]
-        y_train, y_test = y[idx_train], y[idx_test]
-        theta = anchor_regression(X_train, Z_train, y_train, gamma=1)
-        y_hat = np.dot(X_test, theta)
-        ss_res += np.sum(np.square(y_test - y_hat))
-        ss_tot += np.sum(np.square(y_test - y_mean))
-        break  # TODO
-    if ss_tot == 0:
-        r2 = 0
-    else:
-        r2 = 1 - ss_res / ss_tot
-    return float(r2)
-
-
 def compute_stabilities(
-        X: np.ndarray,
-        y: np.ndarray,
-        Z: np.ndarray,
-        weights: np.ndarray,
-        is_selected: np.ndarray,
-        eps: float = 1e-50
-) -> float:
-    cv = KFold(5, random_state=0xCAFE, shuffle=True)
-    r2_selected = cross_val(cv, X[:, is_selected], y, Z)
-    r2_non_selected = cross_val(cv, X[:, ~is_selected], y, Z)
-    return r2_selected - r2_non_selected
-
-
-def compute_stabilities_v2(
         X: np.ndarray,
         y: np.ndarray,
         Z: np.ndarray,
@@ -157,10 +118,10 @@ def compute_stabilities_v2(
 
     theta0 = np.abs(anchor_regression(X, Z, y, gamma=1))
     theta0 /= np.sum(theta0)
-    theta = np.abs(anchor_regression(X, Z, y, gamma=2))
+    theta = np.abs(anchor_regression(X, Z, y, gamma=1.2))
     theta /= np.sum(theta)
-    s1 = np.clip(np.abs(theta0[is_selected] - theta[is_selected]) / np.abs(theta0[is_selected] + eps), 0, 1)
-    s2 = np.clip(np.abs(theta0[~is_selected] - theta[~is_selected]) / np.abs(theta0[~is_selected] + eps), 0, 1)
+    s1 = theta[is_selected] * theta0[is_selected]
+    s2 = theta[~is_selected] * theta0[~is_selected]
 
     s1 = np.mean(s1)
     s2 = np.mean(s2)
@@ -198,7 +159,7 @@ def evaluate_gene_stability(
     mask = np.ones(X.shape[1], dtype=bool)
     mask[j] = False
 
-    return compute_stabilities_v2(X[:, mask], X[:, j], Z, is_selected[mask])
+    return compute_stabilities(X[:, mask], X[:, j], Z, is_selected[mask])
 
 
 def main(par):

From c6f14f8c6ea0f4156c1e9acfc7448117dd9da743 Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Fri, 31 Oct 2025 17:43:41 +0100
Subject: [PATCH 16/19] Minor change

---
 src/metrics/experimental/regression_3/helper.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/metrics/experimental/regression_3/helper.py b/src/metrics/experimental/regression_3/helper.py
index 4a7c5b246..97e47d5bd 100644
--- a/src/metrics/experimental/regression_3/helper.py
+++ b/src/metrics/experimental/regression_3/helper.py
@@ -53,8 +53,9 @@ def compute_residual_correlations(
         y_test: np.ndarray,
         Z_test: np.ndarray
 ) -> np.ndarray:
+    model = xgboost.XGBRegressor(n_estimators=10)
     #model = xgboost.XGBRegressor(n_estimators=10)
-    model = Ridge(alpha=0.01)
+    model = Ridge(alpha=1)
     model.fit(X_train, y_train)
     y_hat = model.predict(X_test)
     residuals = y_test - y_hat

From 0def7efbc3c6a06af7d5336f94a1826adaf332b8 Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Mon, 3 Nov 2025 23:20:47 +0100
Subject: [PATCH 17/19] Fix GRN baseline generation algorithm

---
 src/utils/util.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/utils/util.py b/src/utils/util.py
index 2837c07d2..94c907c8f 100644
--- a/src/utils/util.py
+++ b/src/utils/util.py
@@ -568,6 +568,10 @@ def create_grn_baseline(A):
     reassigned deterministically to different edges (no randomness).
     """
 
+    # Ensure no self-regulation
+    A = np.copy(A)
+    np.fill_diagonal(A, 0)
+
     n = A.shape[0]
 
     # Order nodes by degrees, with explicit tie-breaking by weight
@@ -612,7 +616,7 @@ def create_grn_baseline(A):
                             B[x, w] = 0
                             B[x, v] = 1
                             in_degrees[w].degree += 1
-                            in_degrees[v].degree -= 1
+                            #in_degrees[v].degree -= 1
                             picks.append(v)
                             swapped = True
                             break
@@ -621,14 +625,13 @@ def create_grn_baseline(A):
                 if len(picks) == k:
                     break
 
-        if len(picks) < k:
-            raise ValueError("Directed degree sequence not realizable with simple digraph under constraints.")
+        #if len(picks) < k:
+        #    raise ValueError("Directed degree sequence not realizable with simple digraph under constraints.")
 
         # Place edges for u
         for v in picks:
             B[u, v] = 1
             in_degrees[v].degree -= 1
-        #out_degrees[v].degree = 0
         out_degrees[u].degree = 0
 
         # Stable re-sort by residual in-degree, with explicit tie-breaking by weight
@@ -688,4 +691,12 @@ def create_grn_baseline(A):
         # Assign exact weights to new edges deterministically
         B[u, targets_ranked] = W_sorted
 
-    return B
\ No newline at end of file
+    # Quality check
+    #target_in_degrees = np.sum((A != 0), axis=0)
+    #target_out_degrees = np.sum((A != 0), axis=1)
+    #in_degrees = np.sum((B != 0), axis=0)
+    #out_degrees = np.sum((B != 0), axis=1)
+    #print(target_in_degrees - in_degrees)
+    #print(target_out_degrees - out_degrees)
+
+    return B

From 83b2cda18e150d987d95da12e87ade53199ceafe Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Tue, 4 Nov 2025 09:38:52 +0100
Subject: [PATCH 18/19] Dictys: fix version conflicts + improve use of
 subprocess

---
 .gitignore                        |   1 +
 dockers/dictys_v4/Dockerfile      | 122 +++++++++++++++++++++++++++
 dockers/dictys_v4/constraints.txt |   3 +
 src/methods/dictys/helper.py      | 134 ++++++++++++++++++++----------
 4 files changed, 218 insertions(+), 42 deletions(-)
 create mode 100644 dockers/dictys_v4/Dockerfile
 create mode 100644 dockers/dictys_v4/constraints.txt

diff --git a/.gitignore b/.gitignore
index d3f14a9ca..bfed5cdc6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@
 
 # related to files
 .pybiomart.sqlite
+.venv/
 logs/
 params*
 resources*
diff --git a/dockers/dictys_v4/Dockerfile b/dockers/dictys_v4/Dockerfile
new file mode 100644
index 000000000..bf3cfa0e4
--- /dev/null
+++ b/dockers/dictys_v4/Dockerfile
@@ -0,0 +1,122 @@
+FROM ubuntu:22.04
+
+ARG DEBIAN_FRONTEND=noninteractive
+ENV TZ="America/New_York"
+
+# Base OS deps (build tools + common libs) + libpng for matplotlib
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl ca-certificates git unzip zip \
+    build-essential pkg-config \
+    zlib1g-dev libbz2-dev liblzma-dev \
+    libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libsqlite3-dev \
+    libfreetype6-dev libpng-dev \
+    python3-venv python3-distutils python3-dev \
+    # bio tools via apt instead of building
+    samtools tabix \
+    perl \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install CPython 3.9.17 from source
+ARG PYTHON_VERSION=3.9.17
+RUN set -eux; \
+    cd /tmp; \
+    curl -fsSLO https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz; \
+    tar -xzf Python-${PYTHON_VERSION}.tgz; \
+    cd Python-${PYTHON_VERSION}; \
+    ./configure --enable-optimizations; \
+    make -j"$(nproc)"; \
+    make install; \
+    cd /; rm -rf /tmp/Python-${PYTHON_VERSION}*; \
+    ln -s /usr/local/bin/python3 /usr/local/bin/python; \
+    ln -s /usr/local/bin/pip3 /usr/local/bin/pip
+
+# Make constraints global for all pip installs
+COPY constraints.txt /tmp/constraints.txt
+ENV PIP_CONSTRAINT=/tmp/constraints.txt \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_DEFAULT_TIMEOUT=180
+
+# Clean any existing numpy/matplotlib remnants aggressively
+RUN python - <<'PY'
+import sys, site, pkgutil, shutil, pathlib
+paths = set(site.getsitepackages() + [site.getusersitepackages()])
+for p in list(paths):
+    if not p: 
+        continue
+    for name in ("numpy", "matplotlib"):
+        for m in pathlib.Path(p).glob(name):
+            shutil.rmtree(m, ignore_errors=True)
+        for m in pathlib.Path(p).glob(f"{name}-*.dist-info"):
+            shutil.rmtree(m, ignore_errors=True)
+        for m in pathlib.Path(p).glob(f"{name}-*.egg-info"):
+            shutil.rmtree(m, ignore_errors=True)
+print("Cleaned numpy/matplotlib from:", *paths, sep="\n  - ")
+PY
+
+# Install bedtools
+RUN apt-get update && apt-get install -y --no-install-recommends bedtools \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install tools + exact pins
+RUN python -m pip install --no-cache-dir -U pip setuptools wheel \
+ && pip install --no-cache-dir --upgrade --force-reinstall \
+      "numpy==1.26.4" "matplotlib==3.8.4" "cython<3"
+
+# Install MACS2. Build-from-source packages must reuse our pinned toolchain
+RUN pip install --no-cache-dir --no-build-isolation MACS2==2.2.9.1
+
+# Install Dictys without dependencies (we'll install them manually right after)
+RUN pip install --no-cache-dir --no-build-isolation --no-deps \
+  git+https://github.com/pinellolab/dictys.git@a82930fe8030af2785f9069ef5e909e49acc866f
+
+# Install Dictys dependencies and more
+RUN pip install --no-cache-dir --prefer-binary \
+  pandas scipy networkx h5py threadpoolctl joblib \
+  jupyter jupyterlab adjustText pyro-ppl docutils requests
+
+# Install pyDNase and anndata without dependencies so it can't pin matplotlib<2
+RUN pip install --no-cache-dir --no-build-isolation --no-deps pyDNase clint pysam packaging array_api_compat legacy-api-wrap zarr natsort anndata
+
+# Install pybedtools version that works with cython<3
+RUN pip install --no-cache-dir --no-build-isolation "pybedtools==0.9.1"
+
+# Install pytorch
+# RUN pip install --no-cache-dir --prefer-binary --index-url https://download.pytorch.org/whl/cpu torch
+
+# HOMER prerequisites
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    wget perl unzip build-essential zlib1g-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install HOMER core + hg38 genome
+RUN set -eux; \
+    mkdir -p /opt/homer && cd /opt/homer; \
+    curl -fsSLO http://homer.ucsd.edu/homer/configureHomer.pl; \
+    chmod +x configureHomer.pl; \
+    perl configureHomer.pl -install homer; \
+    perl configureHomer.pl -install homerTools; \
+    perl configureHomer.pl -install hg38
+ENV PATH="/opt/homer/bin:${PATH}"
+
+# hg38 annotations
+RUN set -eux; \
+    cd /opt/homer; \
+    grep "hg38" update.txt > tmp.txt && mv tmp.txt update.txt; \
+    cd update && ./updateUCSCGenomeAnnotations.pl ../update.txt
+
+# Install CUDA
+# RUN curl -fsSLo /etc/apt/preferences.d/cuda-repository-pin-600 \
+#       https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
+#     curl -fsSLo /usr/share/keyrings/nvidia-cuda.gpg \
+#       https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \
+#     echo "deb [signed-by=/usr/share/keyrings/nvidia-cuda.gpg] http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" \
+#       > /etc/apt/sources.list.d/cuda.list && \
+#     apt-get update && apt-get install -y --no-install-recommends cuda && \
+#     rm -rf /var/lib/apt/lists/*
+
+# Install AWS CLI
+RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
+    unzip awscliv2.zip && \
+    ./aws/install
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/dockers/dictys_v4/constraints.txt b/dockers/dictys_v4/constraints.txt
new file mode 100644
index 000000000..c5b8e303b
--- /dev/null
+++ b/dockers/dictys_v4/constraints.txt
@@ -0,0 +1,3 @@
+numpy==1.26.4
+matplotlib<3.9
+cython<3
\ No newline at end of file
diff --git a/src/methods/dictys/helper.py b/src/methods/dictys/helper.py
index 90cde9b03..776929351 100644
--- a/src/methods/dictys/helper.py
+++ b/src/methods/dictys/helper.py
@@ -1,6 +1,8 @@
 import os
 os.environ["MKL_SERVICE_FORCE_INTEL"] = "1"
 os.environ["MKL_THREADING_LAYER"] = "GNU"
+import shutil
+from typing import Optional, List
 
 import numpy as np
 import pandas as pd
@@ -11,6 +13,27 @@
 warnings.filterwarnings("ignore")
 
 
+OVERRIDE_DOWNLOAD = False
+
+
+def run_cmd(cmd: List[str], cwd: Optional[str] = None) -> None:
+    kwargs = {}
+    if cwd is not None:
+        kwargs['cwd'] = cwd
+    with subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+        **kwargs
+    ) as proc:
+        for line in proc.stdout:
+            print(line, end="")
+        rc = proc.wait()
+    if rc != 0:
+        raise RuntimeError(f"Command {cmd} failed with exit code {rc}")
+
 
 def define_vars(par):
     os.makedirs(par['temp_dir'], exist_ok=True)
@@ -28,6 +51,7 @@ def define_vars(par):
     par['bams_dir'] = f"{par['data_dir']}/bams/"
 
     par['gene_bed'] = f"{par['data_dir']}/gene.bed"
+    par['make_dir'] = f"{par['temp_dir']}/makefiles"
     
 
 def extract_exp(par):
@@ -88,6 +112,7 @@ def extract_atac(par):
     print(f'Sort and compress tsv file {frags_path}')
     os.system(f"sort -k1,1 -k2,2n {temp_path} | bgzip -c > {frags_path}")
 
+
 def create_bam(par):
     print('Creating BAM file from fragments', flush=True)
     cmd = f"python {par['frag_to_bam']} --fnames {par['frags_path']} --barcodes {par['barcodes']}"
@@ -107,9 +132,24 @@ def bam_to_bams(par):
             - 'bams_dir': path to output folder for per-cell BAMs
             - 'exp_path': path to reference expression file
     """
+
+    print('Delete temp BAM directories', flush=True)
+    folders = [
+        par['bams_dir'],
+        os.path.join(par['bams_dir'], '..', 'bams_text'),
+        os.path.join(par['bams_dir'], '..', 'bams_header')
+    ]
+    for folder in folders:
+        if os.path.exists(folder):
+            shutil.rmtree(folder)
+
     print('Splitting BAM into per-cell BAMs', flush=True)
-    cmd = f"bash dictys_helper split_bam.sh {par['bam_name']} {par['bams_dir']} --section CB:Z: --ref_expression {par['exp_path']}"
-    run_cmd(cmd)
+    run_cmd([
+        "bash", "dictys_helper", "split_bam.sh", par['bam_name'], par['bams_dir'],
+        "--section", "CB:Z:", "--ref_expression", par['exp_path']
+    ])
+
+
 def extrac_clusters(par):
     print('Extracting clusters', flush=True)
     subsets = f"{par['data_dir']}/subsets.txt"
@@ -127,15 +167,6 @@ def extrac_clusters(par):
     subprocess.run(cp, shell=True, check=True)
     print('Extracting clusters successful', flush=True)
 
-def run_cmd(cmd):
-    try:
-        result = subprocess.run(cmd, check=True, text=True, capture_output=True, shell=True)
-        print("STDOUT:", result.stdout)
-        print("STDERR:", result.stderr)
-    except subprocess.CalledProcessError as e:
-        print("Command failed with exit code", e.returncode)
-        print("STDOUT:", e.stdout)
-        print("STDERR:", e.stderr)
 
 def download_file(url, dest):
     import requests
@@ -145,21 +176,27 @@ def download_file(url, dest):
         with open(dest, "wb") as f:
             for chunk in r.iter_content(chunk_size=8192):
                 f.write(chunk)
+
+
 def get_priors(par):
     import gzip
     import shutil
     # - get the genome
     print('Getting genome ...', flush=True)
-    os.makedirs(f"{par['data_dir']}/genome/", exist_ok=True)
-    cmd = f"aws s3 cp s3://openproblems-data/resources/grn/supp_data/genome/genome.fa {par['data_dir']}/genome/ --no-sign-request"
-    try:
-        run_cmd(cmd)
-    except:
+    if OVERRIDE_DOWNLOAD or (not os.path.exists(f"{par['data_dir']}/genome/genome.fa")):
+        os.makedirs(f"{par['data_dir']}/genome/", exist_ok=True)
         try:
-            cmd = f"cp resources/supp_data/genome/genome.fa {par['data_dir']}/genome/"
-            run_cmd(cmd)
+            run_cmd([
+                "aws", "s3", "cp", "s3://openproblems-data/resources/grn/supp_data/genome/genome.fa",
+                f"{par['data_dir']}/genome/", "--no-sign-request"
+            ])
         except:
-            raise ValueError("Could not get the genome")
+            try:
+                run_cmd([
+                    "cp", "resources/supp_data/genome/genome.fa", f"{par['data_dir']}/genome/"
+                ])
+            except:
+                raise ValueError("Could not get the reference genome")
     
     # - get gene annotation       
     print('Getting gene annotation ...', flush=True)
@@ -167,29 +204,33 @@ def get_priors(par):
     gtf_gz = data_dir / "gene.gtf.gz"
     gtf =  data_dir / "gene.gtf"
     url = "http://ftp.ensembl.org/pub/release-107/gtf/homo_sapiens/Homo_sapiens.GRCh38.107.gtf.gz"
-    download_file(url, gtf_gz)
+    if OVERRIDE_DOWNLOAD or (not os.path.exists(gtf_gz)):
+        download_file(url, gtf_gz)
 
     with gzip.open(gtf_gz, "rb") as f_in, open(gtf, "wb") as f_out:
         shutil.copyfileobj(f_in, f_out)
     gtf_gz.unlink()
 
-    cmd = f"bash dictys_helper gene_gtf.sh {gtf} {par['gene_bed']}"
     print('Making bed files for gene annotation ...', flush=True)
-    run_cmd(cmd)
+    run_cmd([
+        "bash", "dictys_helper", "gene_gtf.sh", gtf, par['gene_bed']
+    ])
 
     print('Downloading motif file...', flush=True)
 
     url='https://hocomoco11.autosome.org/final_bundle/hocomoco11/full/HUMAN/mono/HOCOMOCOv11_full_HUMAN_mono_homer_format_0.0001.motif'
     motif_file = data_dir / 'motifs.motif'
-    download_file(url, motif_file)
-    
+    if OVERRIDE_DOWNLOAD or (not os.path.exists(motif_file)):
+        download_file(url, motif_file)
+
+
 def configure(par):
     import json
     device='cuda:0' #cuda:0 , cpu
-    par['make_dir'] = f"{par['temp_dir']}/makefiles"
     os.makedirs(par['make_dir'], exist_ok=True)
-    cmd = f"cd {par['make_dir']} && bash dictys_helper makefile_template.sh common.mk config.mk env_none.mk static.mk"
-    run_cmd(cmd)
+    run_cmd([
+        "bash", "dictys_helper", "makefile_template.sh", "common.mk", "config.mk", "env_none.mk", "static.mk"
+    ], cwd=par['make_dir'])
     
     json_arg = json.dumps({
         "DEVICE": device,
@@ -197,14 +238,23 @@ def configure(par):
         "JOINT": "1"
     })
 
-    cmd = f"cd {par['make_dir']} && bash dictys_helper makefile_update.py config.mk '{json_arg}'"
-    run_cmd(cmd)
-    cmd = f"cd {par['temp_dir']} && bash dictys_helper makefile_check.py"
-    run_cmd(cmd)
+    run_cmd([
+        "bash", "dictys_helper", "makefile_update.py", "config.mk", json_arg
+    ], cwd=par['make_dir'])
+
+    run_cmd([
+        "bash", "dictys_helper", "makefile_check.py", "--dir_makefiles", par['make_dir'],
+        "--dir_data", par['data_dir']
+    ])
+
+
 def infer_grn(par):
     print('Inferring GRNs', flush=True)
-    cmd = f"cd {par['temp_dir']} && bash dictys_helper network_inference.sh -j {par['num_workers']} -J 1 static"
-    run_cmd(cmd)
+    run_cmd([
+        "bash", "dictys_helper", "network_inference.sh", "-j", str(par['num_workers']), "-J", "1", "static"
+    ], cwd=par['temp_dir'])
+
+
 def export_net(par):
     from util import process_links
     from dictys.net import network
@@ -224,14 +274,14 @@ def export_net(par):
     output.write(par['prediction'])
 
 def main(par):
-    define_vars(par)
 
-    extract_exp(par)
-    extract_atac(par)
-    create_bam(par)
-    bam_to_bams(par)
-    extrac_clusters(par)
-    get_priors(par)
-    configure(par)
+    define_vars(par)
+    #extract_exp(par)
+    #extract_atac(par)
+    #create_bam(par)
+    #bam_to_bams(par)
+    #extrac_clusters(par)
+    #get_priors(par)
+    #configure(par)
     infer_grn(par)
-    export_net(par)
+    #export_net(par)

From 7122ef445e3eb7356f437e1179c78d6fd13f39a4 Mon Sep 17 00:00:00 2001
From: AntoinePassemiers <antoine.passemiers@gmail.com>
Date: Tue, 4 Nov 2025 10:29:54 +0100
Subject: [PATCH 19/19] Dictys: minor change

---
 src/methods/dictys/helper.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/methods/dictys/helper.py b/src/methods/dictys/helper.py
index 776929351..46c28901e 100644
--- a/src/methods/dictys/helper.py
+++ b/src/methods/dictys/helper.py
@@ -276,12 +276,12 @@ def export_net(par):
 def main(par):
 
     define_vars(par)
-    #extract_exp(par)
-    #extract_atac(par)
-    #create_bam(par)
-    #bam_to_bams(par)
-    #extrac_clusters(par)
-    #get_priors(par)
-    #configure(par)
+    extract_exp(par)
+    extract_atac(par)
+    create_bam(par)
+    bam_to_bams(par)
+    extrac_clusters(par)
+    get_priors(par)
+    configure(par)
     infer_grn(par)
-    #export_net(par)
+    export_net(par)