From 70f122ba36f6eb7258f1381886a598b80c3c123d Mon Sep 17 00:00:00 2001 From: Dan Lesperance Date: Thu, 21 May 2026 11:30:52 -0400 Subject: [PATCH 01/21] Create SeuratUploader.py Create SeuratUploader, now adding functions for reduction to metadata and layer_to_X --- lib/gear/SeuratUploader.py | 169 +++++++++++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 lib/gear/SeuratUploader.py diff --git a/lib/gear/SeuratUploader.py b/lib/gear/SeuratUploader.py new file mode 100644 index 00000000..3b9eb810 --- /dev/null +++ b/lib/gear/SeuratUploader.py @@ -0,0 +1,169 @@ +import argparse + +import rpy2.robjects as ro +from rpy2.robjects.packages import importr +import rpy2.rinterface_lib.callbacks as r_cbs +import rpy2.robjects.packages as rpackages +import sys +import mygene +import pandas as pd +import scanpy +import os +import argparse + + +def silent_handler(s:str) -> None: + # way to bypass the R stderr output + pass + +def argument_parser(): + parser = argparse.ArgumentParser(usage="%(prog)s -r [RDS Object] -s [Share ID]",add_help=True) + parser.add_argument('-r', '--rds', required=True, type=str) + parser.add_argument('-s', '--share-id', required=True, type=str) + args = vars(parser.parse_args()) + return args + +def r_package_installer() -> None: + utils = rpackages.importr('utils') + # Install BiocManager if not installed + if not rpackages.isinstalled('BiocManager'): + utils.install_packages('BiocManager') + # Import BiocManager + BiocManager = importr('BiocManager') + # Install Seurat, anndataR and rhdf5 + if not rpackages.isinstalled('reticulate'): + utils.install_packages('reticulate') + if not rpackages.isinstalled('Seurat'): + utils.install_packages('Seurat') + if not rpackages.isinstalled('anndataR'): + BiocManager.install('anndataR') + if not rpackages.isinstalled('rhdf5'): + BiocManager.install('rhdf5') + + +def r_package_importer(package_name:str): + """ + Import installed package, if not installed return message + Input: + package_name: R package name to import + Output: + The R package that was imported or if there's an error the message will be returned + """ + importErrorMessage = "" + try: + pkg = importr(package_name) + return pkg + except: + importErrorMessage += f"{package_name} not installed or can not be imported" + sys.exit(importErrorMessage) + + + +def seurat_to_anndata(file_path: str, share_name: str, output_dir: str = "."): + """ + file_path: path to rds or rdata file + share_name: final h5ad string name to be expected (without h5ad) + output_dir: directory to write the temporary h5ad file into + + return: + absolute path to tmp h5ad, or False on failure + """ + # Suppress R console output and ensure required packages are loaded, + # since this function may be called as a module in cgi script (not via main()). + r_cbs.consolewrite_print = silent_handler + r_cbs.consolewrite_warnerror = silent_handler + # Import required R packages + base = rpackages.importr('base') + r_package_importer('Seurat') + r_package_importer('rhdf5') + r_package_importer('anndataR') + # Use R's readRDS to load the object. + # The result is an R object within the Python environment. + r_seurat_obj = base.readRDS(file_path) + ro.globalenv['seurat_obj'] = r_seurat_obj + # Using anndataR write out a converted h5ad + ro.r('adata <- as_AnnData(seurat_obj)') + output_path = os.path.join(output_dir, f'tmp_{share_name}.h5ad') + try: + ro.r(f'write_h5ad(adata, "{output_path}")') + return output_path + # In cases where the write fails we will assume the h5ad already exists + except: + print(f"h5ad name already exists {output_path}") + return False + + +def openh5ad(h5ad_name): + """Just open the supplied h5ad file""" + adata = scanpy.read_h5ad(h5ad_name) + return adata + +def genes_to_ensembl(adata, taxid=None): + # We are calling an external API for genes to ensembl mapping + # Potentially problematic down the road if this shuts down + if taxid is None: + return None + genes = adata.var.index.tolist() + mg = mygene.MyGeneInfo() + mg_genes = mg.querymany(genes, scopes="symbol", fields="ensembl.gene", species=f"{taxid}") + ensembl_mapping_dict = {} + for mg_gene in mg_genes: + gene_name = mg_gene['query'] + if 'ensembl' in mg_gene.keys(): + if isinstance(mg_gene['ensembl'],list): + # Currently taking first value, not sure of a better way to handle one gene having multiple ensembl IDs + ensembl_mapping_dict[gene_name] = mg_gene['ensembl'][0]['gene'] + else: + ensembl_mapping_dict[gene_name] = mg_gene['ensembl']['gene'] + count = 0 + # We still need an ensembl id for the genes that do not actually have them. + # So here we create a FAKE# for each one so that it can be searchable in gEAR + for gene in genes: + if gene not in ensembl_mapping_dict.keys(): + ensembl_mapping_dict[gene] = f"Fake{count}" + count += 1 + # Overwrite the current adata.var + adata.var = pd.DataFrame( + index=list(ensembl_mapping_dict.values()), data={"gene_names": list(ensembl_mapping_dict.keys())} + ) + return adata + + +def reduction_to_metadata(adata): + # Discussion with Carlo and Brian resulted in us determining we would like to + # take the first 2 values of each reduction + # PCA in the future, and potentially other reductions may need more + for reduction in adata.obsm: + if adata.obsm[reduction].shape[1] > 1: + for i in range(2): + adata.obs[f'{reduction}_{i+1}'] = adata.obsm[reduction][:,i] + return adata + + +def layer_to_X(adata, layer_name): + # Possibility for Seurat -> Anndata conversion doesn not create the X matrix. + # Use adata.layers['data'] as X + adata.X = adata.layers[layer_name] + return adata + +def main(): + arguments = argument_parser() + # Args + rds_path = arguments['rds'] + share_name = arguments['share_id'] + r_package_installer() + # Take the RDS and output the most basic h5ad + h5ad_name = seurat_to_anndata(rds_path,share_name) + # Below are some changes and checks to the h5ad to correctly format for gEAR + if h5ad_name: + adata = openh5ad(f'tmp_{h5ad_name}') + adata = genes_to_ensembl(adata) + if adata is None: + sys.exit("TaxID not supplied") + adata = reduction_to_metadata(adata) + adata.write({h5ad_name.replace('tmp_','')}) + os.remove(f'tmp_{h5ad_name}') + + +if __name__ == "__main__": + main() From db8bb98040a19e7e0e6f9a1fa45126fdc3a765c0 Mon Sep 17 00:00:00 2001 From: Dan Lesperance Date: Thu, 21 May 2026 11:44:31 -0400 Subject: [PATCH 02/21] Update process_uploaded_expression_dataset.cgi Add functionality for SeuratUpload --- .../process_uploaded_expression_dataset.cgi | 189 +++++++++--------- 1 file changed, 91 insertions(+), 98 deletions(-) diff --git a/www/cgi/process_uploaded_expression_dataset.cgi b/www/cgi/process_uploaded_expression_dataset.cgi index 4fcb35bf..64675d3f 100755 --- a/www/cgi/process_uploaded_expression_dataset.cgi +++ b/www/cgi/process_uploaded_expression_dataset.cgi @@ -38,9 +38,8 @@ sys.stdout = open(os.devnull, 'w') lib_path = Path(__file__).resolve().parents[2] / 'lib' sys.path.append(str(lib_path)) import geardb -from gear.primary_analysis import add_primary_analysis_to_dataset, PrimaryAnalysisProcessingError from gear.spatialhandler import SPATIALTYPE2CLASS -from gear.utils import update_adata_with_ensembl_ids +import gear.SeuratUploader as SeuratUploader share_uid = None session_id = None @@ -74,8 +73,9 @@ def main(): result['message'] = 'User ID not found. Please log in to continue.' return result - # values are mex_3tab, excel, rdata, h5ad - dataset_formats = ['mex_3tab', 'excel', 'rdata', 'h5ad', 'spatial'] + # values are mex_3tab, excel, rdata, h5ad, rds, or spatial formats + # Removed rdata from list as rdata will be difficult to process efficiently + dataset_formats = ['mex_3tab', 'excel', 'h5ad', 'spatial','rds'] dataset_upload_dir = Path(user_upload_file_base) / session_id / share_uid # quickly write the status so the page doesn't error out @@ -107,13 +107,9 @@ def main(): return with open(metadata_file, 'r') as f: metadata = json.load(f) - dataset_uid = metadata.get('dataset_uid', '') - dataset_type = metadata.get('dataset_type', '') # Update metadata for downstream uses metadata["dataset_format"] = dataset_format - metadata["perform_primary_analysis"] = True if dataset_type in ['single-cell-rnaseq', 'spatial'] else False - with open(metadata_file, 'w') as f: json.dump(metadata, f, indent=4) @@ -147,36 +143,28 @@ def main(): # CHILD CONTINUES FROM HERE status['process_id'] = os.getpid() - + # new child command if dataset_format == 'mex_3tab': - process_mex_3tab(dataset_upload_dir, metadata["perform_primary_analysis"]) + process_mex_3tab(dataset_upload_dir) elif dataset_format == 'excel': - process_excel(dataset_upload_dir, metadata["perform_primary_analysis"]) + process_excel(dataset_upload_dir) elif dataset_format == "h5ad": - process_h5ad(dataset_upload_dir, metadata["perform_primary_analysis"]) + process_h5ad(dataset_upload_dir) + elif dataset_format == 'rds' or dataset_format=='rdata': + process_seurat(dataset_upload_dir) elif dataset_format == "spatial": - process_spatial(dataset_upload_dir, spatial_format, metadata["perform_primary_analysis"]) + process_spatial(dataset_upload_dir, spatial_format) else: result["success"] = 0 result["message"] = f"Unsupported dataset format: {dataset_format}" return result - if metadata["perform_primary_analysis"]: - try: - result["success"] = add_primary_analysis_to_dataset(dataset_uid, share_uid, dataset_upload_dir, dataset_format) - except PrimaryAnalysisProcessingError as e: - write_status(dataset_upload_dir, 'error', f"Error during primary analysis: {str(e)}") - return result - - status["progress"] = 100 - write_status(dataset_upload_dir, 'complete', "Dataset processed successfully.") - result["success"] = 1 result["message"] = "Dataset processed successfully." return result -def process_h5ad(upload_dir: Path, perform_primary_analysis: bool) -> None: +def process_h5ad(upload_dir: Path) -> None: """ Processes an uploaded .h5ad (AnnData) file in the specified upload directory by performing the following steps: 1. Reads the .h5ad file as an AnnData object. @@ -194,43 +182,15 @@ def process_h5ad(upload_dir: Path, perform_primary_analysis: bool) -> None: # If the file is an h5ad, it should be formatted as an AnnData object already. # But we still want to do some sanitization of the obs dataframe. - # TODO: Read in chunks to save memory - write_status(upload_dir, 'processing', 'Initializing dataset processing.') filepath = upload_dir / f"{share_uid}.h5ad" adata = anndata.read_h5ad(filepath) obs = adata.obs - total_steps = 4 if perform_primary_analysis else 3 - step_counter = 1 - status["progress"] = int((step_counter / total_steps) * 100) - write_status(upload_dir, 'processing', 'Sanitizing AnnData object') - categorize_observation_columns(obs) adata.obs = sanitize_obs_for_h5ad(obs) - if "gene_symbol" not in adata.var.columns: - # get organism_id by converting sample_taxid - metadata_file = upload_dir / 'metadata.json' - if not metadata_file.is_file(): - write_status(upload_dir, 'error', "No metadata JSON file found.") - return - - with open(metadata_file, 'r') as f: - metadata = json.load(f) - sample_taxid = metadata.get("sample_taxid", None) - organism_id=geardb.get_organism_id_by_taxon_id(sample_taxid) - if not organism_id: - write_status(upload_dir, 'error', "Could not determine organism ID from sample taxonomic ID.") - return - - adata = update_adata_with_ensembl_ids(adata, organism_id, "UNMAPPED_") - - step_counter += 1 - status["progress"] = int((step_counter / total_steps) * 100) - write_status(upload_dir, 'processing', 'Writing sanitized data to new H5AD.') - h5ad_path = upload_dir / f"{share_uid}.new.h5ad" adata.write(h5ad_path) @@ -238,11 +198,66 @@ def process_h5ad(upload_dir: Path, perform_primary_analysis: bool) -> None: filepath.unlink() # remove original h5ad_path.rename(filepath) # rename new to original name - step_counter += 1 - status["progress"] = int((step_counter / total_steps) * 100) - write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}") + write_status(upload_dir, 'complete', 'Dataset processed successfully.') -def process_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None: +def process_seurat(upload_dir: Path) -> None: + # Take in an RDS file, convert to anndata, update the obs metadata based on reductions, + # convert gene symbols to ensemble IDs, and write to an updated h5ad file. + write_status(upload_dir, "processing", "Initializing dataset processing.") + seurat_filepath = upload_dir / f"{share_uid}.rds" + + # seurat to anndata uses rpy2 to convert the RDS to anndata + adata_filepath = SeuratUploader.seurat_to_anndata(str(seurat_filepath), share_uid, str(upload_dir)) + if not adata_filepath: + write_status(upload_dir, 'error', 'Failed to convert RDS to h5ad.') + return + try: + adata = anndata.read_h5ad(adata_filepath) + except Exception as e: + write_status(upload_dir, 'error', f'Failed to read h5ad: {str(e)}') + return + + + # Update obs metadata based on reductions + try: + adata = SeuratUploader.reduction_to_metadata(adata) + except Exception as e: + write_status(upload_dir, 'error', f'Failed to update Reductions to metadata: {str(e)}') + return + # Convert gene symbols to ensemble IDs + metadata_file = upload_dir / 'metadata.json' + if not metadata_file.is_file(): + write_status(upload_dir, 'error', "No metadata JSON file found.") + + # get organism_id by converting sample_taxid(needed for some but not all spatial handlers) + with open(metadata_file, 'r') as f: + metadata = json.load(f) + + sample_taxid = metadata.get("sample_taxid", None) + try: + adata = SeuratUploader.genes_to_ensembl(adata,sample_taxid) + except Exception as e: + write_status(upload_dir, 'error', f'Failed to convert genes to Ensembl: {str(e)}') + return + if adata.X is None: + adata = SeuratUploader.layer_to_X(adata, layer_name='data') + h5ad_path = upload_dir / f"{share_uid}.new.h5ad" + try: + adata.write(h5ad_path) + + # Replace the original file with the sanitized one + seurat_filepath.unlink() + Path(adata_filepath).unlink() + h5ad_path.rename(upload_dir / f"{share_uid}.h5ad") + except Exception as e: + write_status(upload_dir, 'error', f'Failed to write h5ad or during cleanup: {str(e)}') + return + + write_status(upload_dir, "complete", "Dataset processed successfully.") + + + +def process_3tab(upload_dir: Path) -> None: import subprocess chunk_size = 500 @@ -285,21 +300,8 @@ def process_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None: adata = sc.AnnData(obs=var, var=obs) reader = pd.read_csv(expression_matrix_path, sep='\t', index_col=0, chunksize=chunk_size) - # Count rows safely without shell execution (https://github.com/IGS/gEAR/security/code-scanning/229) - try: - result = subprocess.run( - ['/usr/bin/wc', '-l', str(expression_matrix_path)], - capture_output=True, - text=True, - check=True - ) - total_rows = int(result.stdout.split()[0]) - except (subprocess.CalledProcessError, ValueError, FileNotFoundError): - # Fallback to Python if wc fails - total_rows = sum(1 for _ in open(expression_matrix_path)) - 1 - - if perform_primary_analysis: - total_rows += 1 # account for the additional primary analysis step that will be performed after this + # This can be an order of magnitude faster than the using python alone + total_rows = int(subprocess.check_output(f"/usr/bin/wc -l {expression_matrix_path}", shell=True).split()[0]) expression_matrix = [] rows_read = 0 @@ -313,8 +315,9 @@ def process_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None: expression_matrix.append(sparse.csr_matrix(chunk.values)) status['progress'] = percentage - message = f"Processed {rows_read}/{total_rows} expression matrix chunks ..." - write_status(upload_dir, 'processing', message) + status['message'] = f"Processed {rows_read}/{total_rows} expression matrix chunks ..." + with open(upload_dir / "status.json", 'w') as f: + f.write(json.dumps(status)) adata.X = sparse.vstack(expression_matrix) # type: ignore except Exception: @@ -345,8 +348,10 @@ def process_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None: rows_read += chunk_size percentage = int((rows_read / total_rows) * 100) - message = f"Processed {rows_read}/{total_rows} expression matrix chunks ..." - write_status(upload_dir, 'processing', message) + status['progress'] = percentage + status['message'] = f"Processed {rows_read}/{total_rows} expression matrix chunks ..." + with open(upload_dir / "status.json", 'w') as f: + f.write(json.dumps(status)) except Exception: #print(f"\nError in chunk {chunk_index}: {inner_e}") @@ -363,18 +368,19 @@ def process_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None: #print("Collected chunk shapes:") #for i, shape in enumerate(chunk_shapes): # print(f" Chunk {i+1}: {shape}") + raise + adata = adata.transpose() adata.obs = sanitize_obs_for_h5ad(adata.obs) h5ad_path = upload_dir / f"{share_uid}.h5ad" adata.write(h5ad_path) - # Progress is accounted for in chunk processing - write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}") + write_status(upload_dir, 'complete', 'Dataset processed successfully.') -def process_excel(upload_dir: Path, perform_primary_analysis: bool) -> None: +def process_excel(upload_dir: Path) -> None: filepath = upload_dir / f"{share_uid}.xlsx" write_status(upload_dir, 'processing', 'Initializing dataset processing.') @@ -454,15 +460,12 @@ def process_excel(upload_dir: Path, perform_primary_analysis: bool) -> None: h5ad_path = upload_dir / f"{share_uid}.h5ad" adata.write(h5ad_path) - total_steps = 2 if perform_primary_analysis else 1 - status["progress"] = int((1 / total_steps) * 100) - write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}") + write_status(upload_dir, 'complete', 'Dataset processed successfully.') - -def process_mex(upload_dir: Path, perform_primary_analysis: bool) -> None: +def process_mex(upload_dir: Path) -> None: pass -def process_mex_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None: +def process_mex_3tab(upload_dir: Path) -> None: # Extract the file import tarfile compression_format = None @@ -541,11 +544,11 @@ def process_mex_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None: # Call the appropriate function if dataset_type == 'threetab': - process_3tab(upload_dir, perform_primary_analysis) + process_3tab(upload_dir) elif dataset_type == 'mex': - process_mex(upload_dir, perform_primary_analysis) + process_mex(upload_dir) -def process_spatial(upload_dir: Path, spatial_format: str, perform_primary_analysis: bool) -> None: +def process_spatial(upload_dir: Path, spatial_format: str) -> None: """ Processes a spatial transcriptomics dataset uploaded to a specified directory. @@ -562,9 +565,6 @@ def process_spatial(upload_dir: Path, spatial_format: str, perform_primary_analy Raises: Writes error status if the metadata file is missing or if reading/converting the spatial file fails. """ - - write_status(upload_dir, 'processing', 'Initializing dataset processing.') - spatial_obj = SPATIALTYPE2CLASS[spatial_format]() # instantiate the appropriate handler class metadata_file = upload_dir / 'metadata.json' if not metadata_file.is_file(): @@ -594,16 +594,9 @@ def process_spatial(upload_dir: Path, spatial_format: str, perform_primary_analy import shutil shutil.rmtree(output_path) - total_steps = 3 if perform_primary_analysis else 2 - step_counter = 1 - status["progress"] = int((step_counter / total_steps) * 100) write_status(upload_dir, 'processing', 'Writing Zarr store') - spatial_obj.write_to_zarr(filepath=output_path) - - step_counter += 1 - status["progress"] = int((step_counter / total_steps) * 100) - write_status(upload_dir, 'processing', f"Finished processing spatial dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}") + write_status(upload_dir, 'complete', 'Dataset processed successfully.') def sanitize_obs_for_h5ad(obs_df: pd.DataFrame) -> pd.DataFrame: for col in obs_df.columns: From eaa4612e4bcc84db2b354a6d111c0ee1eccbf644 Mon Sep 17 00:00:00 2001 From: Dan Lesperance Date: Fri, 22 May 2026 07:41:21 -0400 Subject: [PATCH 03/21] Update BioCManager to 3.22 --- docker/install_bioc.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/install_bioc.R b/docker/install_bioc.R index b1dd8025..fd0ad770 100755 --- a/docker/install_bioc.R +++ b/docker/install_bioc.R @@ -3,7 +3,7 @@ # Install required packages tryCatch( { install.packages(c("BiocManager", "remotes"), dependencies=NA, repos="http://lib.stat.cmu.edu/R/CRAN/") - BiocManager::install(version = "3.21", ask=FALSE) + BiocManager::install(version = "3.22", ask=FALSE) }, error = function(e) { message("Error: ", e$message) quit(status = 1, save = "no") From 663753cb3098761d18a28b894739c3cc47c3cb0a Mon Sep 17 00:00:00 2001 From: Dan Lesperance Date: Fri, 22 May 2026 07:55:22 -0400 Subject: [PATCH 04/21] add packages necessary for SeuratUploader seurat5 httpuv hdf5r rhdf5 anndataR --- docker/install_packages.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docker/install_packages.R b/docker/install_packages.R index f04948ed..d6b348bd 100755 --- a/docker/install_packages.R +++ b/docker/install_packages.R @@ -9,7 +9,12 @@ library(remotes) # for install_version tryCatch( { remotes::install_version("reticulate", version="1.46.0", repos="https://cloud.r-project.org/", ask=FALSE, dependencies=NA) # Sanity check with rpy2 remotes::install_github("ctlab/fgsea") # needed for projectR - remotes::install_github("genesofeve/projectR@d3dd79e2b14172a9561059d58462c97f0a78d4c8") # version 1.23.2 + remotes::install_github("genesofeve/projectR@d3dd79e2b14172a9561059d58462c97f0a78d4c8") + remotes::install_github("satijalab/seurat", "seurat5", quiet = TRUE, ask=FALSE) + install.packages('httpuv', ask=FALSE) + install.packages("hdf5r",dependencies=TRUE, ask=FALSE) + BiocManager::install("rhdf5",ask=FALSE) + BiocManager::install("anndataR", ask=FALSE)# version 1.23.2 BiocManager::install("biomaRt", ask=FALSE) # version 2.60.0 remotes::install_github("CHuanSite/SJD") }, error = function(e) { From 24e8794c4434fc4cfc0c83fe851bc1f13a7a6415 Mon Sep 17 00:00:00 2001 From: Dan Lesperance Date: Fri, 22 May 2026 07:58:49 -0400 Subject: [PATCH 05/21] Update Dockerfile.r --- docker/Dockerfile.r | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/Dockerfile.r b/docker/Dockerfile.r index a2c2b3cb..3015356e 100644 --- a/docker/Dockerfile.r +++ b/docker/Dockerfile.r @@ -32,6 +32,8 @@ RUN apt -qq update \ tzdata \ git \ unzip \ + libgfortran5 \ + libhdf5-dev \ && apt -qq clean autoclean \ && apt -qq autoremove -y \ && rm -rf /var/lib/apt/lists/* From 93adac2847a8a0344cc43028aa81592988b17184 Mon Sep 17 00:00:00 2001 From: Dan Lesperance Date: Fri, 22 May 2026 08:06:48 -0400 Subject: [PATCH 06/21] Enabled Seurat/RDS --- www/upload_dataset.html | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/www/upload_dataset.html b/www/upload_dataset.html index ac1ca58e..ebbaf30e 100644 --- a/www/upload_dataset.html +++ b/www/upload_dataset.html @@ -666,9 +666,9 @@

MS Excel

- +
-

Rdata / Seurat

+

RDS / Seurat

This is a binary format used by the Seurat package in R. If you've already been working with your dataset in R, including clustering and other analyses, this is @@ -682,21 +682,21 @@

Rdata / Seurat

-
From cc672065674e8216d96368b81470f9cc5db239cb Mon Sep 17 00:00:00 2001 From: Dan Lesperance Date: Wed, 27 May 2026 14:49:13 -0400 Subject: [PATCH 07/21] SeuratUploader: Change gene_names to gene_symbol --- lib/gear/SeuratUploader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/gear/SeuratUploader.py b/lib/gear/SeuratUploader.py index 3b9eb810..ecbb8b70 100644 --- a/lib/gear/SeuratUploader.py +++ b/lib/gear/SeuratUploader.py @@ -124,7 +124,7 @@ def genes_to_ensembl(adata, taxid=None): count += 1 # Overwrite the current adata.var adata.var = pd.DataFrame( - index=list(ensembl_mapping_dict.values()), data={"gene_names": list(ensembl_mapping_dict.keys())} + index=list(ensembl_mapping_dict.values()), data={"gene_symbol": list(ensembl_mapping_dict.keys())} ) return adata From b398e828844e48e3d861aa73a1d1fc22fc28b559 Mon Sep 17 00:00:00 2001 From: Dan Lesperance Date: Thu, 28 May 2026 10:06:20 -0400 Subject: [PATCH 08/21] Add packages to apt-get for Seurat --- docker/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index 87882e78..c4a8b11f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -45,6 +45,9 @@ RUN apt -qq update \ libpcre2-dev \ fonts-roboto \ fontconfig \ + libgfortran5 \ + libuv1 \ + libhdf5-dev \ && apt -qq clean autoclean \ && apt -qq autoremove -y \ && rm -rf /var/lib/apt/lists/* From 0d9456c00bfe5c8493d79930022adad9fb1b295d Mon Sep 17 00:00:00 2001 From: Dan Lesperance Date: Thu, 28 May 2026 10:09:34 -0400 Subject: [PATCH 09/21] Add RDATA to list of dataset formats --- www/cgi/process_uploaded_expression_dataset.cgi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/www/cgi/process_uploaded_expression_dataset.cgi b/www/cgi/process_uploaded_expression_dataset.cgi index 64675d3f..afeae0c4 100755 --- a/www/cgi/process_uploaded_expression_dataset.cgi +++ b/www/cgi/process_uploaded_expression_dataset.cgi @@ -75,7 +75,7 @@ def main(): # values are mex_3tab, excel, rdata, h5ad, rds, or spatial formats # Removed rdata from list as rdata will be difficult to process efficiently - dataset_formats = ['mex_3tab', 'excel', 'h5ad', 'spatial','rds'] + dataset_formats = ['mex_3tab', 'excel', 'h5ad', 'spatial','rds','rdata'] dataset_upload_dir = Path(user_upload_file_base) / session_id / share_uid # quickly write the status so the page doesn't error out @@ -91,7 +91,7 @@ def main(): return result if dataset_format not in dataset_formats: - result['message'] = 'Unsupported dataset format.' + result['message'] = f'Unsupported dataset format: {dataset_format} ' write_status(dataset_upload_dir, 'error', result['message']) return result From 04411ad13e263a4b038482ce5da1feb83bba2664 Mon Sep 17 00:00:00 2001 From: Dan Lesperance Date: Thu, 28 May 2026 10:15:26 -0400 Subject: [PATCH 10/21] Change button text from 'Selected' to 'Choose' --- www/upload_dataset.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/www/upload_dataset.html b/www/upload_dataset.html index ebbaf30e..39d1e3f7 100644 --- a/www/upload_dataset.html +++ b/www/upload_dataset.html @@ -696,7 +696,7 @@

RDS / Seurat

- Selected + Choose
From ffed7345593f06fcf7e69a6e899717e6a536b7bc Mon Sep 17 00:00:00 2001 From: dlespera Date: Thu, 28 May 2026 13:46:01 -0400 Subject: [PATCH 11/21] Package installation finalization --- docker/Dockerfile | 8 ++++---- docker/install_packages.R | 4 ++-- docker/requirements.txt | 8 ++++++++ docs/developer/setup/python.md | 13 ++++++++++++- 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index c4a8b11f..038b65ba 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -58,12 +58,12 @@ RUN fc-cache -f -v ENV LLVM_CONFIG=/usr/bin/llvm-config-14 # Copy compiled Python from builder stage -COPY --from=adkinsrs/gear-python-base:2026-04-27 /opt/Python-${PYTHON_FULL_VERSION} /opt/Python-${PYTHON_FULL_VERSION} +COPY --from=gear-python-base:latest /opt/Python-${PYTHON_FULL_VERSION} /opt/Python-${PYTHON_FULL_VERSION} # Copy compiled R from r-builder stage -COPY --from=adkinsrs/gear-r-base:2026-04-27 /usr/local/lib/R /usr/local/lib/R -COPY --from=adkinsrs/gear-r-base:2026-04-27 /usr/local/bin/R /usr/local/bin/R -COPY --from=adkinsrs/gear-r-base:2026-04-27 /usr/local/bin/Rscript /usr/local/bin/Rscript +COPY --from=gear-r-base:latest /usr/local/lib/R /usr/local/lib/R +COPY --from=gear-r-base:latest /usr/local/bin/R /usr/local/bin/R +COPY --from=gear-r-base:latest /usr/local/bin/Rscript /usr/local/bin/Rscript # Link Python and shared library RUN mkdir -p /opt/bin \ diff --git a/docker/install_packages.R b/docker/install_packages.R index d6b348bd..55a1397a 100755 --- a/docker/install_packages.R +++ b/docker/install_packages.R @@ -11,8 +11,8 @@ tryCatch( { remotes::install_github("ctlab/fgsea") # needed for projectR remotes::install_github("genesofeve/projectR@d3dd79e2b14172a9561059d58462c97f0a78d4c8") remotes::install_github("satijalab/seurat", "seurat5", quiet = TRUE, ask=FALSE) - install.packages('httpuv', ask=FALSE) - install.packages("hdf5r",dependencies=TRUE, ask=FALSE) + install.packages('httpuv', ask=FALSE, repos="https://cloud.r-project.org/") + install.packages("hdf5r",dependencies=TRUE, ask=FALSE, repos="https://cloud.r-project.org/") BiocManager::install("rhdf5",ask=FALSE) BiocManager::install("anndataR", ask=FALSE)# version 1.23.2 BiocManager::install("biomaRt", ask=FALSE) # version 2.60.0 diff --git a/docker/requirements.txt b/docker/requirements.txt index 94021d71..3839102b 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -44,3 +44,11 @@ spatialdata_io==0.6.0 shadows==0.1a2 tables==3.11.1 watchfiles==1.1.1 +Bio==1.8.3 +biopython==1.87 +biothings-client==0.5.0 +gprofiler-official==1.0.0 +h11==0.16.0 +httpcore==1.0.9 +httpx==0.28.1 +mygene==3.2.2 diff --git a/docs/developer/setup/python.md b/docs/developer/setup/python.md index 1333a1e8..6aad6db2 100644 --- a/docs/developer/setup/python.md +++ b/docs/developer/setup/python.md @@ -38,6 +38,9 @@ fixed paths have worked fine for decades. libicu-dev \ libdeflate-dev \ libssl3 \ + libgfortran5 \ + libuv1 \ + libhdf5-dev \ pkg-config \ llvm \ apache2 \ @@ -148,7 +151,15 @@ I cannot add comments to the bash code without breaking the command. So consult spatialdata_io==0.6.0 \ shadows==0.1a2 \ tables==3.11.1 \ - watchfiles==1.1.1 + watchfiles==1.1.1 \ + Bio==1.8.3 \ + biopython==1.87 \ + biothings-client==0.5.0 \ + gprofiler-official==1.0.0 \ + h11==0.16.0 \ + httpcore==1.0.9 \ + httpx==0.28.1 \ + mygene==3.2.2\ ./pip3 install git+https://github.com/adkinsrs/diffxpy.git@ffd828c280882ca98adc6e42c934625fab0011f6 ./pip3 uninstall dask-expr -y From f18e5690baf1ea8307b270480bd572e506e5e30f Mon Sep 17 00:00:00 2001 From: adkinsrs Date: Wed, 3 Jun 2026 10:25:26 -0400 Subject: [PATCH 12/21] Update requirements and setup documentation for additional packages --- docker/requirements.txt | 16 +++++++--------- docs/developer/setup/python.md | 14 ++++++-------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/docker/requirements.txt b/docker/requirements.txt index 3839102b..7aaea630 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -2,6 +2,8 @@ aiohttp==3.13.5 aiohttp_retry==2.9.1 anndata==0.12.11 biocode==0.10.0 +biopython==1.87 +biothings-client==0.5.0 cairosvg==2.7.1 colorcet==3.1.0 datashader==0.19.0 @@ -10,7 +12,10 @@ Flask==3.1.3 Flask-RESTful==0.3.9 google-analytics-data==0.21.0 gosling==0.3.0 +h11==0.16.0 hic2cool==0.8.3 +httpcore==1.0.9 +httpx==0.28.1 jupyterlab==4.0.5 jupyter==1.0.0 kaleido==0.2.1 @@ -20,6 +25,7 @@ llvmlite==0.47.0 matplotlib==3.10.7 mod-wsgi==5.0.2 more_itertools==11.0.2 +mygene==3.2.2 mysql-connector-python==8.0.28 numba==0.65.0 numpy==2.4.0 @@ -43,12 +49,4 @@ spatialdata==0.7.2 spatialdata_io==0.6.0 shadows==0.1a2 tables==3.11.1 -watchfiles==1.1.1 -Bio==1.8.3 -biopython==1.87 -biothings-client==0.5.0 -gprofiler-official==1.0.0 -h11==0.16.0 -httpcore==1.0.9 -httpx==0.28.1 -mygene==3.2.2 +watchfiles==1.1.1 \ No newline at end of file diff --git a/docs/developer/setup/python.md b/docs/developer/setup/python.md index 6aad6db2..355a1d22 100644 --- a/docs/developer/setup/python.md +++ b/docs/developer/setup/python.md @@ -111,6 +111,8 @@ I cannot add comments to the bash code without breaking the command. So consult aiohttp_retry==2.9.1 \ anndata==0.12.11 \ biocode==0.10.0 \ + biopython==1.87 \ + biothings-client==0.5.0 \ cairosvg==2.7.1 \ colorcet==3.1.0 \ datashader==0.19.0 \ @@ -118,7 +120,10 @@ I cannot add comments to the bash code without breaking the command. So consult Flask-RESTful==0.3.9 \ google-analytics-data==0.21.0 \ gosling==0.3.0 \ + h11==0.16.0 \ hic2cool==0.8.3 \ + httpcore==1.0.9 \ + httpx==0.28.1 \ jupyterlab==4.0.5 \ jupyter==1.0.0 \ kaleido==0.2.1 \ @@ -128,6 +133,7 @@ I cannot add comments to the bash code without breaking the command. So consult matplotlib==3.10.7 \ mod-wsgi==5.0.2 \ more_itertools==11.0.2 \ + mygene==3.2.2 \ mysql-connector-python==8.0.28 \ numba==0.65.0 \ numpy==2.4.0 \ @@ -152,14 +158,6 @@ I cannot add comments to the bash code without breaking the command. So consult shadows==0.1a2 \ tables==3.11.1 \ watchfiles==1.1.1 \ - Bio==1.8.3 \ - biopython==1.87 \ - biothings-client==0.5.0 \ - gprofiler-official==1.0.0 \ - h11==0.16.0 \ - httpcore==1.0.9 \ - httpx==0.28.1 \ - mygene==3.2.2\ ./pip3 install git+https://github.com/adkinsrs/diffxpy.git@ffd828c280882ca98adc6e42c934625fab0011f6 ./pip3 uninstall dask-expr -y From 1b841b20e46b2bce97c6a633e777c68bd5114fe6 Mon Sep 17 00:00:00 2001 From: adkinsrs Date: Wed, 3 Jun 2026 10:30:33 -0400 Subject: [PATCH 13/21] Fix formatting in Python setup instructions for package installation --- docs/developer/setup/python.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/developer/setup/python.md b/docs/developer/setup/python.md index 355a1d22..6fc2d30e 100644 --- a/docs/developer/setup/python.md +++ b/docs/developer/setup/python.md @@ -157,7 +157,7 @@ I cannot add comments to the bash code without breaking the command. So consult spatialdata_io==0.6.0 \ shadows==0.1a2 \ tables==3.11.1 \ - watchfiles==1.1.1 \ + watchfiles==1.1.1 ./pip3 install git+https://github.com/adkinsrs/diffxpy.git@ffd828c280882ca98adc6e42c934625fab0011f6 ./pip3 uninstall dask-expr -y From 2f9a249b2cff5e126f259867f62a1cd5efd42813 Mon Sep 17 00:00:00 2001 From: adkinsrs Date: Wed, 3 Jun 2026 14:59:46 -0400 Subject: [PATCH 14/21] Fixing squashed commits already in devel. Adjusting permissions of lib/gear files to be uniform --- lib/gear/SeuratUploader.py | 13 +- lib/gear/__init__.py | 0 lib/gear/analysis.py | 0 lib/gear/dataarchive.py | 0 lib/gear/db.py | 0 lib/gear/mg_plotting.py | 0 lib/gear/orthology.py | 0 lib/gear/plotting.py | 0 lib/gear/primary_analysis.py | 0 lib/gear/serverconfig.py | 0 lib/gear/spatialhandler.py | 0 lib/gear/trackhub.py | 0 lib/gear/userhistory.py | 0 lib/gear/utils.py | 0 .../process_uploaded_expression_dataset.cgi | 136 +++++++++++++----- www/upload_dataset.html | 2 +- 16 files changed, 107 insertions(+), 44 deletions(-) mode change 100644 => 100755 lib/gear/SeuratUploader.py mode change 100644 => 100755 lib/gear/__init__.py mode change 100644 => 100755 lib/gear/analysis.py mode change 100644 => 100755 lib/gear/dataarchive.py mode change 100644 => 100755 lib/gear/db.py mode change 100644 => 100755 lib/gear/mg_plotting.py mode change 100644 => 100755 lib/gear/orthology.py mode change 100644 => 100755 lib/gear/plotting.py mode change 100644 => 100755 lib/gear/primary_analysis.py mode change 100644 => 100755 lib/gear/serverconfig.py mode change 100644 => 100755 lib/gear/spatialhandler.py mode change 100644 => 100755 lib/gear/trackhub.py mode change 100644 => 100755 lib/gear/userhistory.py mode change 100644 => 100755 lib/gear/utils.py diff --git a/lib/gear/SeuratUploader.py b/lib/gear/SeuratUploader.py old mode 100644 new mode 100755 index ecbb8b70..050f665f --- a/lib/gear/SeuratUploader.py +++ b/lib/gear/SeuratUploader.py @@ -1,15 +1,14 @@ import argparse - -import rpy2.robjects as ro -from rpy2.robjects.packages import importr -import rpy2.rinterface_lib.callbacks as r_cbs -import rpy2.robjects.packages as rpackages +import os import sys + import mygene import pandas as pd +import rpy2.rinterface_lib.callbacks as r_cbs +import rpy2.robjects as ro +import rpy2.robjects.packages as rpackages import scanpy -import os -import argparse +from rpy2.robjects.packages import importr def silent_handler(s:str) -> None: diff --git a/lib/gear/__init__.py b/lib/gear/__init__.py old mode 100644 new mode 100755 diff --git a/lib/gear/analysis.py b/lib/gear/analysis.py old mode 100644 new mode 100755 diff --git a/lib/gear/dataarchive.py b/lib/gear/dataarchive.py old mode 100644 new mode 100755 diff --git a/lib/gear/db.py b/lib/gear/db.py old mode 100644 new mode 100755 diff --git a/lib/gear/mg_plotting.py b/lib/gear/mg_plotting.py old mode 100644 new mode 100755 diff --git a/lib/gear/orthology.py b/lib/gear/orthology.py old mode 100644 new mode 100755 diff --git a/lib/gear/plotting.py b/lib/gear/plotting.py old mode 100644 new mode 100755 diff --git a/lib/gear/primary_analysis.py b/lib/gear/primary_analysis.py old mode 100644 new mode 100755 diff --git a/lib/gear/serverconfig.py b/lib/gear/serverconfig.py old mode 100644 new mode 100755 diff --git a/lib/gear/spatialhandler.py b/lib/gear/spatialhandler.py old mode 100644 new mode 100755 diff --git a/lib/gear/trackhub.py b/lib/gear/trackhub.py old mode 100644 new mode 100755 diff --git a/lib/gear/userhistory.py b/lib/gear/userhistory.py old mode 100644 new mode 100755 diff --git a/lib/gear/utils.py b/lib/gear/utils.py old mode 100644 new mode 100755 diff --git a/www/cgi/process_uploaded_expression_dataset.cgi b/www/cgi/process_uploaded_expression_dataset.cgi index afeae0c4..583a508f 100755 --- a/www/cgi/process_uploaded_expression_dataset.cgi +++ b/www/cgi/process_uploaded_expression_dataset.cgi @@ -37,9 +37,14 @@ sys.stdout = open(os.devnull, 'w') lib_path = Path(__file__).resolve().parents[2] / 'lib' sys.path.append(str(lib_path)) +import gear.seuratuploader as SeuratUploader import geardb +from gear.primary_analysis import ( + PrimaryAnalysisProcessingError, + add_primary_analysis_to_dataset, +) from gear.spatialhandler import SPATIALTYPE2CLASS -import gear.SeuratUploader as SeuratUploader +from gear.utils import update_adata_with_ensembl_ids share_uid = None session_id = None @@ -73,9 +78,8 @@ def main(): result['message'] = 'User ID not found. Please log in to continue.' return result - # values are mex_3tab, excel, rdata, h5ad, rds, or spatial formats - # Removed rdata from list as rdata will be difficult to process efficiently - dataset_formats = ['mex_3tab', 'excel', 'h5ad', 'spatial','rds','rdata'] + # values are mex_3tab, excel, h5ad, rds, or spatial formats + dataset_formats = ['mex_3tab', 'excel', 'h5ad', 'spatial','rds'] dataset_upload_dir = Path(user_upload_file_base) / session_id / share_uid # quickly write the status so the page doesn't error out @@ -107,9 +111,12 @@ def main(): return with open(metadata_file, 'r') as f: metadata = json.load(f) + dataset_uid = metadata.get('dataset_uid', '') + dataset_type = metadata.get('dataset_type', '') # Update metadata for downstream uses metadata["dataset_format"] = dataset_format + metadata["perform_primary_analysis"] = True if dataset_type in ['single-cell-rnaseq', 'spatial'] else False with open(metadata_file, 'w') as f: json.dump(metadata, f, indent=4) @@ -143,28 +150,39 @@ def main(): # CHILD CONTINUES FROM HERE status['process_id'] = os.getpid() - + # new child command if dataset_format == 'mex_3tab': - process_mex_3tab(dataset_upload_dir) + process_mex_3tab(dataset_upload_dir, metadata["perform_primary_analysis"]) elif dataset_format == 'excel': - process_excel(dataset_upload_dir) + process_excel(dataset_upload_dir, metadata["perform_primary_analysis"]) elif dataset_format == "h5ad": - process_h5ad(dataset_upload_dir) - elif dataset_format == 'rds' or dataset_format=='rdata': - process_seurat(dataset_upload_dir) + process_h5ad(dataset_upload_dir, metadata["perform_primary_analysis"]) + elif dataset_format == 'rds': + process_seurat(dataset_upload_dir, metadata["perform_primary_analysis"]) elif dataset_format == "spatial": - process_spatial(dataset_upload_dir, spatial_format) + process_spatial(dataset_upload_dir, spatial_format, metadata["perform_primary_analysis"]) else: result["success"] = 0 result["message"] = f"Unsupported dataset format: {dataset_format}" return result + + if metadata["perform_primary_analysis"]: + try: + result["success"] = add_primary_analysis_to_dataset(dataset_uid, share_uid, dataset_upload_dir, dataset_format) + except PrimaryAnalysisProcessingError as e: + write_status(dataset_upload_dir, 'error', f"Error during primary analysis: {str(e)}") + return result + + status["progress"] = 100 + write_status(dataset_upload_dir, 'complete', "Dataset processed successfully.") + result["success"] = 1 result["message"] = "Dataset processed successfully." return result -def process_h5ad(upload_dir: Path) -> None: +def process_h5ad(upload_dir: Path, perform_primary_analysis: bool) -> None: """ Processes an uploaded .h5ad (AnnData) file in the specified upload directory by performing the following steps: 1. Reads the .h5ad file as an AnnData object. @@ -182,15 +200,43 @@ def process_h5ad(upload_dir: Path) -> None: # If the file is an h5ad, it should be formatted as an AnnData object already. # But we still want to do some sanitization of the obs dataframe. + # TODO: Read in chunks to save memory + write_status(upload_dir, 'processing', 'Initializing dataset processing.') filepath = upload_dir / f"{share_uid}.h5ad" adata = anndata.read_h5ad(filepath) obs = adata.obs + total_steps = 4 if perform_primary_analysis else 3 + step_counter = 1 + status["progress"] = int((step_counter / total_steps) * 100) + write_status(upload_dir, 'processing', 'Sanitizing AnnData object') + categorize_observation_columns(obs) adata.obs = sanitize_obs_for_h5ad(obs) + if "gene_symbol" not in adata.var.columns: + # get organism_id by converting sample_taxid + metadata_file = upload_dir / 'metadata.json' + if not metadata_file.is_file(): + write_status(upload_dir, 'error', "No metadata JSON file found.") + return + + with open(metadata_file, 'r') as f: + metadata = json.load(f) + sample_taxid = metadata.get("sample_taxid", None) + organism_id=geardb.get_organism_id_by_taxon_id(sample_taxid) + if not organism_id: + write_status(upload_dir, 'error', "Could not determine organism ID from sample taxonomic ID.") + return + + adata = update_adata_with_ensembl_ids(adata, organism_id, "UNMAPPED_") + + step_counter += 1 + status["progress"] = int((step_counter / total_steps) * 100) + write_status(upload_dir, 'processing', 'Writing sanitized data to new H5AD.') + h5ad_path = upload_dir / f"{share_uid}.new.h5ad" adata.write(h5ad_path) @@ -198,9 +244,14 @@ def process_h5ad(upload_dir: Path) -> None: filepath.unlink() # remove original h5ad_path.rename(filepath) # rename new to original name - write_status(upload_dir, 'complete', 'Dataset processed successfully.') + step_counter += 1 + status["progress"] = int((step_counter / total_steps) * 100) + write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}")\ + +def process_seurat(upload_dir: Path, perform_primary_analysis: bool) -> None: + total_steps = 2 if perform_primary_analysis else 1 + step_counter = 1 -def process_seurat(upload_dir: Path) -> None: # Take in an RDS file, convert to anndata, update the obs metadata based on reductions, # convert gene symbols to ensemble IDs, and write to an updated h5ad file. write_status(upload_dir, "processing", "Initializing dataset processing.") @@ -228,14 +279,16 @@ def process_seurat(upload_dir: Path) -> None: metadata_file = upload_dir / 'metadata.json' if not metadata_file.is_file(): write_status(upload_dir, 'error', "No metadata JSON file found.") - + return # get organism_id by converting sample_taxid(needed for some but not all spatial handlers) with open(metadata_file, 'r') as f: metadata = json.load(f) - + sample_taxid = metadata.get("sample_taxid", None) try: adata = SeuratUploader.genes_to_ensembl(adata,sample_taxid) + if adata is None: + raise Exception("genes_to_ensembl returned None") except Exception as e: write_status(upload_dir, 'error', f'Failed to convert genes to Ensembl: {str(e)}') return @@ -248,16 +301,16 @@ def process_seurat(upload_dir: Path) -> None: # Replace the original file with the sanitized one seurat_filepath.unlink() Path(adata_filepath).unlink() - h5ad_path.rename(upload_dir / f"{share_uid}.h5ad") + h5ad_path.rename(upload_dir / f"{share_uid}.h5ad") except Exception as e: write_status(upload_dir, 'error', f'Failed to write h5ad or during cleanup: {str(e)}') return - write_status(upload_dir, "complete", "Dataset processed successfully.") - + step_counter += 1 + status["progress"] = int((step_counter / total_steps) * 100) + write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}") - -def process_3tab(upload_dir: Path) -> None: +def process_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None: import subprocess chunk_size = 500 @@ -315,9 +368,8 @@ def process_3tab(upload_dir: Path) -> None: expression_matrix.append(sparse.csr_matrix(chunk.values)) status['progress'] = percentage - status['message'] = f"Processed {rows_read}/{total_rows} expression matrix chunks ..." - with open(upload_dir / "status.json", 'w') as f: - f.write(json.dumps(status)) + message = f"Processed {rows_read}/{total_rows} expression matrix chunks ..." + write_status(upload_dir, 'processing', message) adata.X = sparse.vstack(expression_matrix) # type: ignore except Exception: @@ -349,9 +401,8 @@ def process_3tab(upload_dir: Path) -> None: percentage = int((rows_read / total_rows) * 100) status['progress'] = percentage - status['message'] = f"Processed {rows_read}/{total_rows} expression matrix chunks ..." - with open(upload_dir / "status.json", 'w') as f: - f.write(json.dumps(status)) + message = f"Processed {rows_read}/{total_rows} expression matrix chunks ..." + write_status(upload_dir, 'processing', message) except Exception: #print(f"\nError in chunk {chunk_index}: {inner_e}") @@ -378,9 +429,10 @@ def process_3tab(upload_dir: Path) -> None: h5ad_path = upload_dir / f"{share_uid}.h5ad" adata.write(h5ad_path) - write_status(upload_dir, 'complete', 'Dataset processed successfully.') + # Progress is accounted for in chunk processing + write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}") -def process_excel(upload_dir: Path) -> None: +def process_excel(upload_dir: Path, perform_primary_analysis: bool) -> None: filepath = upload_dir / f"{share_uid}.xlsx" write_status(upload_dir, 'processing', 'Initializing dataset processing.') @@ -460,12 +512,14 @@ def process_excel(upload_dir: Path) -> None: h5ad_path = upload_dir / f"{share_uid}.h5ad" adata.write(h5ad_path) - write_status(upload_dir, 'complete', 'Dataset processed successfully.') + total_steps = 2 if perform_primary_analysis else 1 + status["progress"] = int((1 / total_steps) * 100) + write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}") -def process_mex(upload_dir: Path) -> None: +def process_mex(upload_dir: Path, perform_primary_analysis: bool) -> None: pass -def process_mex_3tab(upload_dir: Path) -> None: +def process_mex_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None: # Extract the file import tarfile compression_format = None @@ -544,11 +598,11 @@ def process_mex_3tab(upload_dir: Path) -> None: # Call the appropriate function if dataset_type == 'threetab': - process_3tab(upload_dir) + process_3tab(upload_dir, perform_primary_analysis) elif dataset_type == 'mex': - process_mex(upload_dir) + process_mex(upload_dir, perform_primary_analysis) -def process_spatial(upload_dir: Path, spatial_format: str) -> None: +def process_spatial(upload_dir: Path, spatial_format: str, perform_primary_analysis: bool) -> None: """ Processes a spatial transcriptomics dataset uploaded to a specified directory. @@ -565,6 +619,9 @@ def process_spatial(upload_dir: Path, spatial_format: str) -> None: Raises: Writes error status if the metadata file is missing or if reading/converting the spatial file fails. """ + + write_status(upload_dir, 'processing', 'Initializing dataset processing.') + spatial_obj = SPATIALTYPE2CLASS[spatial_format]() # instantiate the appropriate handler class metadata_file = upload_dir / 'metadata.json' if not metadata_file.is_file(): @@ -594,9 +651,16 @@ def process_spatial(upload_dir: Path, spatial_format: str) -> None: import shutil shutil.rmtree(output_path) + total_steps = 3 if perform_primary_analysis else 2 + step_counter = 1 + status["progress"] = int((step_counter / total_steps) * 100) + write_status(upload_dir, 'processing', 'Writing Zarr store') spatial_obj.write_to_zarr(filepath=output_path) - write_status(upload_dir, 'complete', 'Dataset processed successfully.') + + step_counter += 1 + status["progress"] = int((step_counter / total_steps) * 100) + write_status(upload_dir, 'processing', f"Finished processing spatial dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}") def sanitize_obs_for_h5ad(obs_df: pd.DataFrame) -> pd.DataFrame: for col in obs_df.columns: diff --git a/www/upload_dataset.html b/www/upload_dataset.html index 39d1e3f7..f17a0077 100644 --- a/www/upload_dataset.html +++ b/www/upload_dataset.html @@ -692,7 +692,7 @@

RDS / Seurat

-