Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions bin/run_causality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env python

import argparse
import sys
import scanpy as sc
import numpy as np
import pandas as pd

def parse_args():
    """Parse and return the command-line arguments for the causality script."""
    cli = argparse.ArgumentParser(description='Run Causal Inference on Manifold.')
    # Both options are mandatory file paths; declare them table-driven.
    for flag, msg in (
        ('--input', 'Input AnnData file (.h5ad)'),
        ('--output', 'Output AnnData file (.h5ad)'),
    ):
        cli.add_argument(flag, required=True, help=msg)
    return cli.parse_args()

def main():
    """Run PAGA-based trajectory inference as a proxy for causal analysis.

    Loads the input .h5ad, ensures a neighborhood graph and a Leiden
    clustering exist, runs PAGA, stores the tree connectivities under
    ``adata.uns['causal_inference']``, then writes the updated object and a
    versions.yml provenance file.
    """
    args = parse_args()

    print(f"Reading input from {args.input}...")
    try:
        adata = sc.read_h5ad(args.input)
    except Exception as e:
        sys.exit(f"Error loading AnnData: {e}")

    # The neighbors graph below is built on X_pca, so fail early with a clear
    # message when PCA coordinates are missing (matches the validation done
    # by the diffmap and PHATE scripts in this pipeline).
    if 'connectivities' not in adata.obsp and 'X_pca' not in adata.obsm:
        sys.exit("Error: 'X_pca' not found in adata.obsm. It is required to compute the neighbors graph.")

    # Logic: We compute Gene Rank based on centrality in the manifold graph
    # This simulates finding "driver genes"
    print("Computing causal graph metrics...")

    if 'connectivities' not in adata.obsp:
        print("Computing neighbors first...")
        sc.pp.neighbors(adata, use_rep='X_pca')

    # PAGA needs a clustering; compute Leiden only when it is absent so a
    # caller-provided clustering is respected.
    if 'leiden' not in adata.obs:
        print("Clustering (Leiden) needed for causality inference...")
        sc.tl.leiden(adata)

    print("Running PAGA for trajectory inference...")
    sc.tl.paga(adata, groups='leiden')

    # Store "causal" results (PAGA tree connectivities written by sc.tl.paga
    # into adata.uns['paga']).
    adata.uns['causal_inference'] = {
        'method': 'PAGA_Trajectory',
        'connectivities': adata.uns['paga']['connectivities_tree']
    }

    print(f"Saving results to {args.output}...")
    try:
        adata.write_h5ad(args.output)
    except Exception as e:
        sys.exit(f"Error saving AnnData: {e}")

    # Record tool versions for pipeline provenance.
    with open("versions.yml", "w") as f:
        f.write('process:\n')
        f.write(f'  scanpy: "{sc.__version__}"\n')

if __name__ == "__main__":
    main()
63 changes: 63 additions & 0 deletions bin/run_diffmap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/usr/bin/env python

import argparse
import sys
import scanpy as sc
import pandas as pd
import numpy as np

def parse_args():
    """Collect the command-line options for the diffusion-map computation."""
    ap = argparse.ArgumentParser(description='Run Diffusion Maps (Diffmap) on an AnnData object.')
    # Required I/O paths.
    ap.add_argument('--input', required=True, help='Input AnnData file (.h5ad)')
    ap.add_argument('--output', required=True, help='Output AnnData file (.h5ad)')
    # Integer tuning knobs, both defaulting to 15.
    for flag, msg in (
        ('--n_neighbors', 'Number of nearest neighbors for graph construction'),
        ('--n_comps', 'Number of diffusion components to compute'),
    ):
        ap.add_argument(flag, type=int, default=15, help=msg)
    return ap.parse_args()

def main():
    """Compute a diffusion-map embedding for an AnnData object.

    Loads the input .h5ad, validates that PCA coordinates exist, builds a
    k-nearest-neighbor graph on X_pca, runs ``sc.tl.diffmap`` (which stores
    the embedding in ``adata.obsm['X_diffmap']``), then writes the updated
    object and a versions.yml provenance file.
    """
    args = parse_args()

    # 1. Load data
    print(f"Reading input from {args.input}...")
    try:
        adata = sc.read_h5ad(args.input)
    except Exception as e:
        sys.exit(f"Error loading AnnData: {e}")

    # 2. Validation: diffmap is computed on a graph built from X_pca.
    if 'X_pca' not in adata.obsm:
        sys.exit("Error: 'X_pca' not found in adata.obsm. Diffmap requires pre-computed PCA coordinates.")

    # 3. Compute the neighborhood graph required by diffmap.
    print(f"Computing neighbors graph (k={args.n_neighbors})...")
    sc.pp.neighbors(adata, n_neighbors=args.n_neighbors, use_rep='X_pca')

    # 4. Run diffusion maps (result lands in adata.obsm['X_diffmap']).
    print(f"Running Diffusion Maps with n_comps={args.n_comps}...")
    try:
        sc.tl.diffmap(adata, n_comps=args.n_comps)
    except Exception as e:
        sys.exit(f"Error running diffmap: {e}")

    # 5. Save output
    print(f"Saving results to {args.output}...")
    try:
        adata.write_h5ad(args.output)
    except Exception as e:
        sys.exit(f"Error saving AnnData: {e}")

    # 6. Generate versions.yml for pipeline provenance. scanpy is already
    # bound at module level as `sc`; the previous local re-import was
    # redundant and has been removed.
    with open("versions.yml", "w") as f:
        f.write('process:\n')
        f.write(f'  scanpy: "{sc.__version__}"\n')
        f.write(f'  numpy: "{np.__version__}"\n')

    print("Diffmap computation completed successfully.")

if __name__ == "__main__":
    main()
81 changes: 81 additions & 0 deletions bin/run_phate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env python

import argparse
import sys
import scanpy as sc
import phate
import pandas as pd
import numpy as np

def parse_args():
    """Define and parse the CLI arguments for the PHATE embedding script."""
    # Flag -> add_argument keyword arguments, in declaration order.
    spec = {
        '--input': dict(required=True, help='Input AnnData file (.h5ad)'),
        '--output': dict(required=True, help='Output AnnData file (.h5ad)'),
        '--k': dict(type=int, default=5, help='Number of nearest neighbors'),
        '--a': dict(type=float, default=40, help='Alpha decay parameter'),
        '--t': dict(type=str, default='auto', help='Diffusion time (int or "auto")'),
        '--n_jobs': dict(type=int, default=1, help='Number of threads'),
        '--gamma': dict(type=float, default=1, help='Informational distance parameter'),
    }
    ap = argparse.ArgumentParser(description='Run PHATE on an AnnData object.')
    for flag, kwargs in spec.items():
        ap.add_argument(flag, **kwargs)
    return ap.parse_args()

def main():
    """Compute a PHATE embedding for an AnnData object.

    Loads the input .h5ad, validates that PCA coordinates exist, fits a
    PHATE operator on X_pca, stores the embedding in
    ``adata.obsm['X_phate']`` together with the run parameters in
    ``adata.uns['phate_params']``, then writes the result and a versions.yml
    provenance file.
    """
    args = parse_args()

    # 1. Load data
    print(f"Reading input from {args.input}...")
    try:
        adata = sc.read_h5ad(args.input)
    except Exception as e:
        sys.exit(f"Error loading AnnData: {e}")

    # 2. Validation: making sure that PCA exists (nf-core/scdownstream standard)
    if 'X_pca' not in adata.obsm:
        sys.exit("Error: 'X_pca' not found in adata.obsm. PHATE requires pre-computed PCA coordinates.")

    # 3. Prepare parameters: --t is either the literal "auto" or an integer.
    # Exit with a clear message instead of an unhandled ValueError traceback.
    if args.t == 'auto':
        t_param = 'auto'
    else:
        try:
            t_param = int(args.t)
        except ValueError:
            sys.exit(f"Error: --t must be an integer or 'auto', got '{args.t}'")

    print(f"Running PHATE with k={args.k}, a={args.a}, t={t_param} on X_pca...")

    # 4. Run PHATE. We initialize the operator explicitly and feed it the
    # existing PCA coordinates to avoid recomputation.
    phate_op = phate.PHATE(
        n_pca=None,  # PCA is already computed
        knn=args.k,
        decay=args.a,
        t=t_param,
        gamma=args.gamma,
        n_jobs=args.n_jobs,
        verbose=1,
        random_state=42  # Ensure reproducibility
    )
    X_phate = phate_op.fit_transform(adata.obsm['X_pca'])

    # 5. Store results: coordinates in the standard Scanpy slot, plus
    # metadata for reproducibility and downstream usage (e.g. TDA).
    adata.obsm['X_phate'] = X_phate
    adata.uns['phate_params'] = {
        'k': args.k,
        'a': args.a,
        't': t_param,
        'gamma': args.gamma,
        'diff_potential': phate_op.diff_potential  # Crucial for Potential Distance
    }

    # 6. Save output
    print(f"Saving results to {args.output}...")
    try:
        adata.write_h5ad(args.output)
    except Exception as e:
        sys.exit(f"Error saving AnnData: {e}")

    # 7. Generate versions.yml, consistent with the other manifold scripts;
    # the module configuration explicitly filters this filename, so it is
    # expected to be produced.
    with open("versions.yml", "w") as f:
        f.write('process:\n')
        f.write(f'  phate: "{phate.__version__}"\n')
        f.write(f'  scanpy: "{sc.__version__}"\n')

    print("PHATE computation completed successfully.")

if __name__ == "__main__":
    main()
90 changes: 90 additions & 0 deletions bin/run_topology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env python

import argparse
import sys
import scanpy as sc
import numpy as np

# Secure import for ripser
try:
from ripser import ripser
except ImportError:
# Fail gracefully if the library is missing
sys.exit("Error: 'ripser' library not found. Please ensure it is installed in the environment.")

def parse_args():
    """Build the CLI for the TDA script and return the parsed namespace."""
    cli = argparse.ArgumentParser(description='Run Topological Data Analysis (TDA) using Ripser.')
    # Only the two mandatory file paths are configurable.
    for flag, msg in (
        ('--input', 'Input AnnData file (.h5ad)'),
        ('--output', 'Output AnnData file (.h5ad)'),
    ):
        cli.add_argument(flag, required=True, help=msg)
    return cli.parse_args()

def main():
    """Run persistent homology (Ripser) on the best available embedding.

    Loads the input .h5ad, picks an embedding (PHATE > Diffmap > PCA),
    subsamples large datasets for tractability, computes H0/H1 persistence
    diagrams, stores them in ``adata.uns['tda_results']``, then writes the
    result and a versions.yml provenance file.
    """
    args = parse_args()

    # 1. Load Data
    print(f"Reading input from {args.input}...")
    try:
        adata = sc.read_h5ad(args.input)
    except Exception as e:
        sys.exit(f"Error loading AnnData: {e}")

    # 2. Select Embedding. Priority: PHATE -> Diffmap -> PCA.
    embedding_key = 'X_phate'
    if embedding_key not in adata.obsm:
        if 'X_diffmap' in adata.obsm:
            embedding_key = 'X_diffmap'
        elif 'X_pca' in adata.obsm:
            embedding_key = 'X_pca'
        else:
            sys.exit("Error: No valid embedding found (requires X_phate, X_diffmap, or X_pca).")

    print(f"Running Ripser TDA on {embedding_key}...")

    # 3. Subsample because persistent homology is computationally expensive
    # on large datasets. A fixed seed makes the subsample — and therefore
    # the persistence diagrams — reproducible across pipeline runs.
    matrix = adata.obsm[embedding_key]
    if matrix.shape[0] > 1000:
        print("Subsampling to 1000 cells for performance...")
        rng = np.random.default_rng(42)
        idx = rng.choice(matrix.shape[0], 1000, replace=False)
        matrix = matrix[idx, :]

    # 4. Run Persistent Homology.
    # maxdim=1 computes H0 (connected components) and H1 (loops).
    try:
        diagrams = ripser(matrix, maxdim=1)['dgms']
    except Exception as e:
        sys.exit(f"Error running ripser: {e}")

    # 5. Store results, one diagram per homology dimension.
    diagrams_dict = {f'dim_{i}': dgm for i, dgm in enumerate(diagrams)}

    adata.uns['tda_results'] = {
        'max_homology_dim': 1,
        'embedding_used': embedding_key,
        'diagrams': diagrams_dict
    }

    # 6. Save output
    print(f"Saving results to {args.output}...")
    try:
        adata.write_h5ad(args.output)
    except Exception as e:
        sys.exit(f"Error saving h5ad file: {e}")

    # 7. Generate versions. Import the ripser *module* here: the top-level
    # `from ripser import ripser` binds only the function, not the module.
    import ripser as ripser_module
    with open("versions.yml", "w") as f:
        f.write('process:\n')
        f.write(f'  ripser: "{ripser_module.__version__}"\n')
        f.write(f'  scanpy: "{sc.__version__}"\n')
        f.write(f'  numpy: "{np.__version__}"\n')

    print("TDA computation completed successfully.")

if __name__ == "__main__":
    main()
40 changes: 40 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -575,3 +575,43 @@ process {
]
}
}

/*
========================================================================================
MANIFOLD LEARNING MODULES CONFIG
========================================================================================
*/

// Publishing rules for the four manifold-learning processes. Each withName
// block routes a process's outputs to a subdirectory of params.outdir and
// suppresses versions.yml from publication (returning null from saveAs
// skips the file; presumably it is collated elsewhere, per the usual
// nf-core pattern — TODO confirm).
process {
    withName: 'PHATE' {
        publishDir = [
            path: { "${params.outdir}/geometry/phate" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }

    withName: 'DIFFMAP' {
        publishDir = [
            path: { "${params.outdir}/geometry/diffmap" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }

    withName: 'TOPOLOGY' {
        publishDir = [
            path: { "${params.outdir}/topology" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }

    withName: 'CAUSALITY' {
        publishDir = [
            path: { "${params.outdir}/causality" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }
}
7 changes: 7 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,9 @@ params {

// Schema validation default options
validate_params = true

// MANIFOLD LEARNING OPTIONS (Added by Miguel Rosell)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can add your name to the list of contributors in the README; adding it in many locations throughout the pipeline is not really helpful (git blame already makes authorship sufficiently clear).

manifold_methods = 'phate,diffmap'
}

// Load base.config by default for all pipelines
Expand Down Expand Up @@ -272,6 +275,10 @@ env {
R_PROFILE_USER = "/.Rprofile"
R_ENVIRON_USER = "/.Renviron"
JULIA_DEPOT_PATH = "/usr/local/share/julia"
// Fixes for Manifold learning (Added by Miguel Rosell)
NUMBA_CACHE_DIR = '/tmp'
MPLCONFIGDIR = '/tmp'
}
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please follow the structure of the existing scanpy modules for this. You cannot assume that /tmp is writable in every execution environment

}

// Set bash options
Expand Down
Loading
Loading