diff --git a/docker/Dockerfile b/docker/Dockerfile index 87882e78..038b65ba 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -45,6 +45,9 @@ RUN apt -qq update \ libpcre2-dev \ fonts-roboto \ fontconfig \ + libgfortran5 \ + libuv1 \ + libhdf5-dev \ && apt -qq clean autoclean \ && apt -qq autoremove -y \ && rm -rf /var/lib/apt/lists/* @@ -55,12 +58,12 @@ RUN fc-cache -f -v ENV LLVM_CONFIG=/usr/bin/llvm-config-14 # Copy compiled Python from builder stage -COPY --from=adkinsrs/gear-python-base:2026-04-27 /opt/Python-${PYTHON_FULL_VERSION} /opt/Python-${PYTHON_FULL_VERSION} +COPY --from=gear-python-base:latest /opt/Python-${PYTHON_FULL_VERSION} /opt/Python-${PYTHON_FULL_VERSION} # Copy compiled R from r-builder stage -COPY --from=adkinsrs/gear-r-base:2026-04-27 /usr/local/lib/R /usr/local/lib/R -COPY --from=adkinsrs/gear-r-base:2026-04-27 /usr/local/bin/R /usr/local/bin/R -COPY --from=adkinsrs/gear-r-base:2026-04-27 /usr/local/bin/Rscript /usr/local/bin/Rscript +COPY --from=gear-r-base:latest /usr/local/lib/R /usr/local/lib/R +COPY --from=gear-r-base:latest /usr/local/bin/R /usr/local/bin/R +COPY --from=gear-r-base:latest /usr/local/bin/Rscript /usr/local/bin/Rscript # Link Python and shared library RUN mkdir -p /opt/bin \ diff --git a/docker/Dockerfile.r b/docker/Dockerfile.r index a2c2b3cb..3015356e 100644 --- a/docker/Dockerfile.r +++ b/docker/Dockerfile.r @@ -32,6 +32,8 @@ RUN apt -qq update \ tzdata \ git \ unzip \ + libgfortran5 \ + libhdf5-dev \ && apt -qq clean autoclean \ && apt -qq autoremove -y \ && rm -rf /var/lib/apt/lists/* diff --git a/docker/install_bioc.R b/docker/install_bioc.R index b1dd8025..fd0ad770 100755 --- a/docker/install_bioc.R +++ b/docker/install_bioc.R @@ -3,7 +3,7 @@ # Install required packages tryCatch( { install.packages(c("BiocManager", "remotes"), dependencies=NA, repos="http://lib.stat.cmu.edu/R/CRAN/") - BiocManager::install(version = "3.21", ask=FALSE) + BiocManager::install(version = "3.22", ask=FALSE) }, error = 
function(e) { message("Error: ", e$message) quit(status = 1, save = "no") diff --git a/docker/install_packages.R b/docker/install_packages.R index f04948ed..55a1397a 100755 --- a/docker/install_packages.R +++ b/docker/install_packages.R @@ -9,7 +9,12 @@ library(remotes) # for install_version tryCatch( { remotes::install_version("reticulate", version="1.46.0", repos="https://cloud.r-project.org/", ask=FALSE, dependencies=NA) # Sanity check with rpy2 remotes::install_github("ctlab/fgsea") # needed for projectR remotes::install_github("genesofeve/projectR@d3dd79e2b14172a9561059d58462c97f0a78d4c8") # version 1.23.2 + remotes::install_github("satijalab/seurat", "seurat5", quiet = TRUE, ask=FALSE) + install.packages('httpuv', ask=FALSE, repos="https://cloud.r-project.org/") + install.packages("hdf5r",dependencies=TRUE, ask=FALSE, repos="https://cloud.r-project.org/") + BiocManager::install("rhdf5",ask=FALSE) + BiocManager::install("anndataR", ask=FALSE) BiocManager::install("biomaRt", ask=FALSE) # version 2.60.0 remotes::install_github("CHuanSite/SJD") }, error = function(e) { diff --git a/docker/requirements.txt b/docker/requirements.txt index 2d017dbd..90d5dce5 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -3,6 +3,8 @@ aiohttp_retry==2.9.1 anndata==0.12.11 bio==1.8.3 biocode==0.10.0 +biopython==1.87 +biothings-client==0.5.0 cairosvg==2.7.1 colorcet==3.1.0 datashader==0.19.0 @@ -11,7 +13,10 @@ Flask==3.1.3 Flask-RESTful==0.3.9 google-analytics-data==0.21.0 gosling==0.3.0 +h11==0.16.0 hic2cool==0.8.3 +httpcore==1.0.9 +httpx==0.28.1 jupyterlab==4.0.5 jupyter==1.0.0 kaleido==0.2.1 @@ -21,6 +26,7 @@ llvmlite==0.47.0 matplotlib==3.10.7 mod-wsgi==5.0.2 more_itertools==11.0.2 +mygene==3.2.2 mysql-connector-python==8.0.28 numba==0.65.0 numpy==2.4.0 @@ -44,4 +50,4 @@ spatialdata==0.7.2 spatialdata_io==0.6.0 shadows==0.1a2 tables==3.11.1
-watchfiles==1.1.1 +watchfiles==1.1.1 \ No newline at end of file diff --git a/docs/developer/setup/docker.md b/docs/developer/setup/docker.md index 281f176f..87efba02 100644 --- a/docs/developer/setup/docker.md +++ b/docs/developer/setup/docker.md @@ -66,7 +66,7 @@ This file is dedicated entirely to compiling Python 3.x and installing requireme **RPy2**: The "rpy2" package is actually built in the final Docker (umgear) image, due to some dependencies on R. -**The output**: This is currently built and pushed as adkinsrs/gear-python-base:YYYY-MM-DD +**The output**: This is currently built and pushed as adkinsrs/gear-python-base:YYYY-MM-DD and also tagged with the "latest" tag. #### Dockerfile.r (The R Base) @@ -74,15 +74,17 @@ This file is dedicated entirely to compiling R and running your Bioconductor scr **When you build it**: Almost never. Only touch this if the team specifically requests a new version of Bioconductor or a brand-new R system library. -**The output**: This is currently built and pushed as adkinsrs/gear-r-base:YYYY-MM-DD +**The output**: This is currently built and pushed as adkinsrs/gear-r-base:YYYY-MM-DD and also tagged with the "latest" tag. #### Dockerfile (The Final App) This is your main daily-driver file. It starts with a clean Ubuntu image, uses COPY --from=... to pull in the pre-compiled folders from your registry, installs Apache, and copies over your Flask API and HTML/JS files. +Currently the inherited R and Python images are set to use the "latest" tag of a locally built image, as most of the time we want the most up-to-date version. If for some reason you need an earlier version, edit the Dockerfile to use one of the existing `adkinsrs/gear-python-base:YYYY-MM-DD` or `adkinsrs/gear-r-base:YYYY-MM-DD` tags stored in Docker Hub. + **When you build it**: Every time you update the website, tweak the Apache configuration, or change a CGI script. Anything gEAR-code related, basically. -**The output**: This builds in seconds and becomes your final production image.
This is pushed as adkinsrs/umgear:YYYY-MM-DD +**The output**: This builds in seconds and becomes your final production image. This is pushed as adkinsrs/umgear:YYYY-MM-DD and also tagged with the "latest" tag. ## Starting the stack diff --git a/docs/developer/setup/python.md b/docs/developer/setup/python.md index 14d16fa7..c55d0590 100644 --- a/docs/developer/setup/python.md +++ b/docs/developer/setup/python.md @@ -38,6 +38,9 @@ fixed paths have worked fine for decades. libicu-dev \ libdeflate-dev \ libssl3 \ + libgfortran5 \ + libuv1 \ + libhdf5-dev \ pkg-config \ llvm \ apache2 \ @@ -109,6 +112,8 @@ I cannot add comments to the bash code without breaking the command. So consult anndata==0.12.11 \ bio==1.8.3 \ biocode==0.10.0 \ + biopython==1.87 \ + biothings-client==0.5.0 \ cairosvg==2.7.1 \ colorcet==3.1.0 \ datashader==0.19.0 \ @@ -116,7 +121,10 @@ I cannot add comments to the bash code without breaking the command. So consult Flask-RESTful==0.3.9 \ google-analytics-data==0.21.0 \ gosling==0.3.0 \ + h11==0.16.0 \ hic2cool==0.8.3 \ + httpcore==1.0.9 \ + httpx==0.28.1 \ jupyterlab==4.0.5 \ jupyter==1.0.0 \ kaleido==0.2.1 \ @@ -126,6 +134,7 @@ I cannot add comments to the bash code without breaking the command. 
So consult matplotlib==3.10.7 \ mod-wsgi==5.0.2 \ more_itertools==11.0.2 \ + mygene==3.2.2 \ mysql-connector-python==8.0.28 \ numba==0.65.0 \ numpy==2.4.0 \ diff --git a/lib/gear/__init__.py b/lib/gear/__init__.py old mode 100644 new mode 100755 diff --git a/lib/gear/analysis.py b/lib/gear/analysis.py old mode 100644 new mode 100755 diff --git a/lib/gear/dataarchive.py b/lib/gear/dataarchive.py old mode 100644 new mode 100755 diff --git a/lib/gear/db.py b/lib/gear/db.py old mode 100644 new mode 100755 diff --git a/lib/gear/mg_plotting.py b/lib/gear/mg_plotting.py old mode 100644 new mode 100755 index 75c36fd1..eae1556b --- a/lib/gear/mg_plotting.py +++ b/lib/gear/mg_plotting.py @@ -1185,7 +1185,7 @@ def update_stacked_violin_annotations(fig, primary_groups, color_map): # Am attempting to do this based on the assumption that row facet titles will never have yanchor of bottom # (or y-pos of 1) or have certain text shared with the axes titles lambda a: a.update( - font=dict(color=color_map[a.text]) + font=dict(color=color_map.get(a.text, "black")) , textangle=0 , x=0 , xanchor="right" diff --git a/lib/gear/orthology.py b/lib/gear/orthology.py old mode 100644 new mode 100755 diff --git a/lib/gear/plotting.py b/lib/gear/plotting.py old mode 100644 new mode 100755 diff --git a/lib/gear/primary_analysis.py b/lib/gear/primary_analysis.py old mode 100644 new mode 100755 diff --git a/lib/gear/serverconfig.py b/lib/gear/serverconfig.py old mode 100644 new mode 100755 diff --git a/lib/gear/seuratuploader.py b/lib/gear/seuratuploader.py new file mode 100755 index 00000000..a8ca468e --- /dev/null +++ b/lib/gear/seuratuploader.py @@ -0,0 +1,172 @@ +import argparse +import os +import sys + +import mygene +import pandas as pd +import rpy2.rinterface_lib.callbacks as r_cbs +import rpy2.robjects as ro +import rpy2.robjects.packages as rpackages +import scanpy +from rpy2.robjects.packages import importr + + +def silent_handler(s:str) -> None: + # way to bypass the R stderr output + 
pass + +def argument_parser(): + parser = argparse.ArgumentParser(usage="%(prog)s -r [RDS Object] -s [Share ID]",add_help=True) + parser.add_argument('-r', '--rds', required=True, type=str) + parser.add_argument('-s', '--share-id', required=True, type=str) + args = vars(parser.parse_args()) + return args + +def r_package_installer() -> None: + utils = rpackages.importr('utils') + # Install BiocManager if not installed + if not rpackages.isinstalled('BiocManager'): + utils.install_packages('BiocManager') + # Import BiocManager + BiocManager = importr('BiocManager') + # Install Seurat, anndataR and rhdf5 + if not rpackages.isinstalled('reticulate'): + utils.install_packages('reticulate') + if not rpackages.isinstalled('Seurat'): + utils.install_packages('Seurat') + if not rpackages.isinstalled('anndataR'): + BiocManager.install('anndataR') + if not rpackages.isinstalled('rhdf5'): + BiocManager.install('rhdf5') + + +def r_package_importer(package_name:str): + """ + Import installed package, if not installed return message + Input: + package_name: R package name to import + Output: + The R package that was imported or if there's an error the message will be returned + """ + importErrorMessage = "" + try: + pkg = importr(package_name) + return pkg + except Exception: + importErrorMessage += f"{package_name} not installed or can not be imported" + sys.exit(importErrorMessage) + + + +def seurat_to_anndata(file_path: str, share_name: str, output_dir: str = "."): + """ + file_path: path to rds or rdata file + share_name: final h5ad string name to be expected (without h5ad) + output_dir: directory to write the temporary h5ad file into + + return: + absolute path to tmp h5ad, or False on failure + """ + # Suppress R console output and ensure required packages are loaded, + # since this function may be called as a module in cgi script (not via main()). 
+ r_cbs.consolewrite_print = silent_handler + r_cbs.consolewrite_warnerror = silent_handler + # Import required R packages + base = rpackages.importr('base') + r_package_importer('Seurat') + r_package_importer('rhdf5') + r_package_importer('anndataR') + # Use R's readRDS to load the object. + # The result is an R object within the Python environment. + r_seurat_obj = base.readRDS(file_path) + ro.globalenv['seurat_obj'] = r_seurat_obj + # Using anndataR write out a converted h5ad + ro.r('adata <- as_AnnData(seurat_obj)') + output_path = os.path.join(output_dir, f'tmp_{share_name}.h5ad') + try: + ro.r(f'write_h5ad(adata, "{output_path}")') + return output_path + # In cases where the write fails we will assume the h5ad already exists + except Exception: + print(f"h5ad name already exists {output_path}") + raise + +def openh5ad(h5ad_name): + """Just open the supplied h5ad file""" + adata = scanpy.read_h5ad(h5ad_name) + return adata + +def genes_to_ensembl(adata, taxid=None): + # We are calling an external API for genes to ensembl mapping + # Potentially problematic down the road if this shuts down + if taxid is None: + return None + genes = adata.var.index.tolist() + try: + # TODO: Perhaps add a retry mechanism in case the API returns 500 + mg = mygene.MyGeneInfo() + mg_genes = mg.querymany(genes, scopes="symbol", fields="ensembl.gene", species=f"{taxid}") + except Exception as e: + print(f"Error occurred while querying MyGene: {e}", file=sys.stderr) + raise + ensembl_mapping_dict = {} + for mg_gene in mg_genes: + gene_name = mg_gene['query'] + if 'ensembl' in mg_gene.keys(): + if isinstance(mg_gene['ensembl'],list): + # Currently taking first value, not sure of a better way to handle one gene having multiple ensembl IDs + ensembl_mapping_dict[gene_name] = mg_gene['ensembl'][0]['gene'] + else: + ensembl_mapping_dict[gene_name] = mg_gene['ensembl']['gene'] + count = 0 + # We still need an ensembl id for the genes that do not actually have them. 
+ # So here we create a FAKE# for each one so that it can be searchable in gEAR + for gene in genes: + if gene not in ensembl_mapping_dict.keys(): + ensembl_mapping_dict[gene] = f"Fake{count}" + count += 1 + # Rebuild adata.var in the original gene order so its rows stay aligned with the columns of X + adata.var = pd.DataFrame( + index=[ensembl_mapping_dict[gene] for gene in genes], data={"gene_symbol": genes} + ) + return adata + + +def reduction_to_metadata(adata): + # Discussion with Carlo and Brian resulted in us determining we would like to + # take the first 2 values of each reduction + # PCA in the future, and potentially other reductions may need more + for reduction in adata.obsm: + if adata.obsm[reduction].shape[1] > 1: + for i in range(2): + adata.obs[f'{reduction}_{i+1}'] = adata.obsm[reduction][:,i] + return adata + + +def layer_to_X(adata, layer_name): + # Possibility for Seurat -> Anndata conversion does not create the X matrix. + # Use adata.layers['data'] as X + adata.X = adata.layers[layer_name] + return adata + +def main(): + arguments = argument_parser() + # Args + rds_path = arguments['rds'] + share_name = arguments['share_id'] + r_package_installer() + # Take the RDS and output the most basic h5ad; the return value is the full path to tmp_<share>.h5ad + h5ad_name = seurat_to_anndata(rds_path,share_name) + # Below are some changes and checks to the h5ad to correctly format for gEAR + if h5ad_name: + adata = openh5ad(h5ad_name) + adata = genes_to_ensembl(adata) + if adata is None: + sys.exit("TaxID not supplied") + adata = reduction_to_metadata(adata) + adata.write(os.path.join(os.path.dirname(h5ad_name), f'{share_name}.h5ad')) + os.remove(h5ad_name) + + +if __name__ == "__main__": + main() diff --git a/lib/gear/spatialhandler.py b/lib/gear/spatialhandler.py old mode 100644 new mode 100755 diff --git a/lib/gear/trackhub.py b/lib/gear/trackhub.py old mode 100644 new mode 100755 diff --git a/lib/gear/userhistory.py b/lib/gear/userhistory.py old mode 100644 new mode 100755 diff --git a/lib/gear/utils.py b/lib/gear/utils.py old mode 100644 new mode 100755
diff --git a/www/cgi/finalize_uploaded_expression_dataset.cgi b/www/cgi/finalize_uploaded_expression_dataset.cgi index ca52ac21..a7694e8b 100755 --- a/www/cgi/finalize_uploaded_expression_dataset.cgi +++ b/www/cgi/finalize_uploaded_expression_dataset.cgi @@ -256,6 +256,18 @@ def main() -> dict: result['message'] = 'Error migrating Excel file: {}'.format(str(e)) return result + elif dataset_format == 'rds': + # migrate the RDS file + rds_file = dataset_upload_dir / f'{share_uid}.rds' + rds_dest = dataset_final_dir / f'{dataset_id}.rds' + + try: + shutil.move(rds_file, rds_dest) + result['userdata_migrated'] = 1 + except Exception as e: + result['message'] = 'Error migrating RDS file: {}'.format(str(e)) + return result + elif dataset_format == "spatial": # migrate the spatial tarball spatial_src = dataset_upload_dir / f'{share_uid}.tar.gz' diff --git a/www/cgi/process_uploaded_expression_dataset.cgi b/www/cgi/process_uploaded_expression_dataset.cgi index 4fcb35bf..aec21e16 100755 --- a/www/cgi/process_uploaded_expression_dataset.cgi +++ b/www/cgi/process_uploaded_expression_dataset.cgi @@ -37,8 +37,12 @@ sys.stdout = open(os.devnull, 'w') lib_path = Path(__file__).resolve().parents[2] / 'lib' sys.path.append(str(lib_path)) +import gear.seuratuploader as SeuratUploader import geardb -from gear.primary_analysis import add_primary_analysis_to_dataset, PrimaryAnalysisProcessingError +from gear.primary_analysis import ( + PrimaryAnalysisProcessingError, + add_primary_analysis_to_dataset, +) from gear.spatialhandler import SPATIALTYPE2CLASS from gear.utils import update_adata_with_ensembl_ids @@ -60,10 +64,10 @@ def main(): global session_id form = cgi.FieldStorage() - share_uid = form.getvalue('share_uid') - session_id = form.getvalue('session_id') - dataset_format = form.getvalue('dataset_format') - spatial_format = form.getvalue('spatial_format') # may be None + share_uid = form.getfirst('share_uid') + session_id = form.getfirst('session_id') + dataset_format = 
form.getfirst('dataset_format') + spatial_format = form.getfirst('spatial_format') # may be None if share_uid is None or session_id is None or dataset_format is None: result['message'] = 'Missing one or more required parameters.' @@ -74,8 +78,8 @@ def main(): result['message'] = 'User ID not found. Please log in to continue.' return result - # values are mex_3tab, excel, rdata, h5ad - dataset_formats = ['mex_3tab', 'excel', 'rdata', 'h5ad', 'spatial'] + # values are mex_3tab, excel, h5ad, rds, or spatial formats + dataset_formats = ['mex_3tab', 'excel', 'h5ad', 'spatial','rds'] dataset_upload_dir = Path(user_upload_file_base) / session_id / share_uid # quickly write the status so the page doesn't error out @@ -91,7 +95,7 @@ def main(): return result if dataset_format not in dataset_formats: - result['message'] = 'Unsupported dataset format.' + result['message'] = f'Unsupported dataset format: {dataset_format} ' write_status(dataset_upload_dir, 'error', result['message']) return result @@ -113,7 +117,6 @@ def main(): # Update metadata for downstream uses metadata["dataset_format"] = dataset_format metadata["perform_primary_analysis"] = True if dataset_type in ['single-cell-rnaseq', 'spatial'] else False - with open(metadata_file, 'w') as f: json.dump(metadata, f, indent=4) @@ -155,6 +158,8 @@ def main(): process_excel(dataset_upload_dir, metadata["perform_primary_analysis"]) elif dataset_format == "h5ad": process_h5ad(dataset_upload_dir, metadata["perform_primary_analysis"]) + elif dataset_format == 'rds': + process_seurat(dataset_upload_dir, metadata["perform_primary_analysis"]) elif dataset_format == "spatial": process_spatial(dataset_upload_dir, spatial_format, metadata["perform_primary_analysis"]) else: @@ -162,6 +167,11 @@ def main(): result["message"] = f"Unsupported dataset format: {dataset_format}" return result + # If the functions error, we do not want them to try to do primary analysis. So exit and let the user see the original error message. 
+ if status.get("status", "error") == "error": + result["success"] = 0 + return result + if metadata["perform_primary_analysis"]: try: result["success"] = add_primary_analysis_to_dataset(dataset_uid, share_uid, dataset_upload_dir, dataset_format) @@ -238,6 +248,85 @@ def process_h5ad(upload_dir: Path, perform_primary_analysis: bool) -> None: filepath.unlink() # remove original h5ad_path.rename(filepath) # rename new to original name + step_counter += 1 + status["progress"] = int((step_counter / total_steps) * 100) + write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}") + +def process_seurat(upload_dir: Path, perform_primary_analysis: bool) -> None: + total_steps = 7 if perform_primary_analysis else 6 + step_counter = 1 + status["progress"] = int((step_counter / total_steps) * 100) + + # Take in an RDS file, convert to anndata, update the obs metadata based on reductions, + # convert gene symbols to ensemble IDs, and write to an updated h5ad file.
+ write_status(upload_dir, "processing", "Initializing dataset processing.") + seurat_filepath = upload_dir / f"{share_uid}.rds" + + # seurat to anndata uses rpy2 to convert the RDS to anndata + # filepath name has "tmp_" appended in front + adata_filepath = SeuratUploader.seurat_to_anndata(str(seurat_filepath), share_uid, str(upload_dir)) + if not adata_filepath: + write_status(upload_dir, 'error', 'Failed to convert RDS to h5ad.') + return + + step_counter += 1 + status["progress"] = int((step_counter / total_steps) * 100) + write_status(upload_dir, 'processing', 'Reading converted h5ad file.') + try: + adata = anndata.read_h5ad(adata_filepath) + except Exception as e: + write_status(upload_dir, 'error', f'Failed to read h5ad: {str(e)}') + return + + # Update obs metadata based on reductions + step_counter += 1 + status["progress"] = int((step_counter / total_steps) * 100) + write_status(upload_dir, 'processing', 'Updating metadata from reductions.') + try: + adata = SeuratUploader.reduction_to_metadata(adata) + except Exception as e: + write_status(upload_dir, 'error', f'Failed to update Reductions to metadata: {str(e)}') + return + + # Convert gene symbols to ensemble IDs + metadata_file = upload_dir / 'metadata.json' + if not metadata_file.is_file(): + write_status(upload_dir, 'error', "No metadata JSON file found.") + return + # get organism_id by converting sample_taxid(needed for some but not all spatial handlers) + with open(metadata_file, 'r') as f: + metadata = json.load(f) + + step_counter += 1 + status["progress"] = int((step_counter / total_steps) * 100) + write_status(upload_dir, 'processing', 'Converting gene symbols to Ensembl IDs.') + sample_taxid = metadata.get("sample_taxid", None) + try: + adata = SeuratUploader.genes_to_ensembl(adata, sample_taxid) + if adata is None: + raise Exception("genes_to_ensembl returned None") + except Exception as e: + write_status(upload_dir, 'error', f'Failed to convert genes to Ensembl: {str(e)}') + return + + 
step_counter += 1 + status["progress"] = int((step_counter / total_steps) * 100) + write_status(upload_dir, 'processing', 'Writing final h5ad file.') + if adata.X is None: + # TODO: This is currently not an option in the UI, but was suggested to be one by @jorvis + adata = SeuratUploader.layer_to_X(adata, layer_name='data') + h5ad_path = upload_dir / f"{share_uid}.new.h5ad" + try: + adata.write(h5ad_path) + + # Replace the original file with the sanitized one + #seurat_filepath.unlink() + Path(adata_filepath).unlink() + h5ad_path.rename(upload_dir / f"{share_uid}.h5ad") + except Exception as e: + write_status(upload_dir, 'error', f'Failed to write h5ad or during cleanup: {str(e)}') + return + step_counter += 1 status["progress"] = int((step_counter / total_steps) * 100) write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}") @@ -285,21 +374,8 @@ def process_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None: adata = sc.AnnData(obs=var, var=obs) reader = pd.read_csv(expression_matrix_path, sep='\t', index_col=0, chunksize=chunk_size) - # Count rows safely without shell execution (https://github.com/IGS/gEAR/security/code-scanning/229) - try: - result = subprocess.run( - ['/usr/bin/wc', '-l', str(expression_matrix_path)], - capture_output=True, - text=True, - check=True - ) - total_rows = int(result.stdout.split()[0]) - except (subprocess.CalledProcessError, ValueError, FileNotFoundError): - # Fallback to Python if wc fails - total_rows = sum(1 for _ in open(expression_matrix_path)) - 1 - - if perform_primary_analysis: - total_rows += 1 # account for the additional primary analysis step that will be performed after this + # wc is much faster than counting in Python; pass an argv list (no shell) so the upload path is never interpreted (code-scanning/229) + total_rows = int(subprocess.check_output(["/usr/bin/wc", "-l", str(expression_matrix_path)]).split()[0]) expression_matrix = [] rows_read = 0 @@ -345,6 +421,7 @@ def
process_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None: rows_read += chunk_size percentage = int((rows_read / total_rows) * 100) + status['progress'] = percentage message = f"Processed {rows_read}/{total_rows} expression matrix chunks ..." write_status(upload_dir, 'processing', message) @@ -363,8 +440,10 @@ def process_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None: #print("Collected chunk shapes:") #for i, shape in enumerate(chunk_shapes): # print(f" Chunk {i+1}: {shape}") + raise + adata = adata.transpose() adata.obs = sanitize_obs_for_h5ad(adata.obs) @@ -458,7 +537,6 @@ def process_excel(upload_dir: Path, perform_primary_analysis: bool) -> None: status["progress"] = int((1 / total_steps) * 100) write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}") - def process_mex(upload_dir: Path, perform_primary_analysis: bool) -> None: pass @@ -597,8 +675,8 @@ def process_spatial(upload_dir: Path, spatial_format: str, perform_primary_analy total_steps = 3 if perform_primary_analysis else 2 step_counter = 1 status["progress"] = int((step_counter / total_steps) * 100) - write_status(upload_dir, 'processing', 'Writing Zarr store') + write_status(upload_dir, 'processing', 'Writing Zarr store') spatial_obj.write_to_zarr(filepath=output_path) step_counter += 1 diff --git a/www/cgi/store_expression_dataset.cgi b/www/cgi/store_expression_dataset.cgi index 54bb1f26..16bd0b7c 100755 --- a/www/cgi/store_expression_dataset.cgi +++ b/www/cgi/store_expression_dataset.cgi @@ -19,10 +19,10 @@ import geardb def main(): print('Content-Type: application/json\n\n') form = cgi.FieldStorage() - session_id = form.getvalue('session_id') - share_uid = form.getvalue('share_uid') - dataset_format = form.getvalue('dataset_format') - spatial_format = form.getvalue('spatial_format') # may be None + session_id = form.getfirst('session_id') + share_uid = form.getfirst('share_uid') + 
dataset_format = form.getfirst('dataset_format') + spatial_format = form.getfirst('spatial_format') # may be None if not share_uid: # should never happen error_msg = f"Unexpected missing share_uid in store_expression_dataset.cgi. session_id={session_id!r}" @@ -66,6 +66,11 @@ def main(): result['message'] = 'Invalid file extension for H5AD format. Expected .h5ad' return result + if dataset_format == "rds": + if not filename.endswith('.rds'): + result['message'] = 'Invalid file extension for RDS format. Expected .rds' + return result + if dataset_format == 'spatial': if not filename.endswith('tar.gz'): result['message'] = 'Invalid file extension for Spatial format. Expected .tar.gz' diff --git a/www/upload_dataset.html b/www/upload_dataset.html index ac1ca58e..a34a0a0d 100644 --- a/www/upload_dataset.html +++ b/www/upload_dataset.html @@ -666,9 +666,9 @@

MS Excel

- +
-

Rdata / Seurat

+

RDS / Seurat

This is a binary format used by the Seurat package in R. If you've already been working with your dataset in R, including clustering and other analyses, this is @@ -682,21 +682,21 @@

Rdata / Seurat

-