From 70f122ba36f6eb7258f1381886a598b80c3c123d Mon Sep 17 00:00:00 2001
From: Dan Lesperance <danlesperance12@gmail.com>
Date: Thu, 21 May 2026 11:30:52 -0400
Subject: [PATCH 01/21] Create SeuratUploader.py

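Create SeuratUploader with helpers to convert a Seurat RDS file to h5ad
(seurat_to_anndata), map gene symbols to Ensembl IDs (genes_to_ensembl),
copy the first two components of each reduction into obs metadata
(reduction_to_metadata), and use a layer as X (layer_to_X).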
---
 lib/gear/SeuratUploader.py | 169 +++++++++++++++++++++++++++++++++++++
 1 file changed, 169 insertions(+)
 create mode 100644 lib/gear/SeuratUploader.py

diff --git a/lib/gear/SeuratUploader.py b/lib/gear/SeuratUploader.py
new file mode 100644
index 00000000..3b9eb810
--- /dev/null
+++ b/lib/gear/SeuratUploader.py
@@ -0,0 +1,169 @@
+import argparse
+
+import rpy2.robjects as ro
+from rpy2.robjects.packages import importr
+import rpy2.rinterface_lib.callbacks as r_cbs
+import rpy2.robjects.packages as rpackages
+import sys
+import mygene
+import pandas as pd
+import scanpy
+import os
+import argparse
+
+
+def silent_handler(s:str) -> None:
+    # way to bypass the R stderr output
+    pass
+
+def argument_parser():
+    parser = argparse.ArgumentParser(usage="%(prog)s -r [RDS Object] -s [Share ID]",add_help=True)
+    parser.add_argument('-r', '--rds', required=True, type=str)
+    parser.add_argument('-s', '--share-id', required=True, type=str)
+    args = vars(parser.parse_args())
+    return args
+
+def r_package_installer() -> None:
+    utils = rpackages.importr('utils')
+    # Install BiocManager if not installed
+    if not rpackages.isinstalled('BiocManager'):
+        utils.install_packages('BiocManager')
+    # Import BiocManager
+    BiocManager = importr('BiocManager')
+    # Install Seurat, anndataR and rhdf5
+    if not rpackages.isinstalled('reticulate'):
+        utils.install_packages('reticulate')
+    if not rpackages.isinstalled('Seurat'):
+        utils.install_packages('Seurat')
+    if not rpackages.isinstalled('anndataR'):
+        BiocManager.install('anndataR')
+    if not rpackages.isinstalled('rhdf5'):
+        BiocManager.install('rhdf5')
+
+
+def r_package_importer(package_name:str):
+    """
+    Import installed package, if not installed return message
+    Input:
+        package_name: R package name to import
+    Output:
+        The R package that was imported or if there's an error the message will be returned
+    """
+    importErrorMessage = ""
+    try:
+        pkg = importr(package_name)
+        return pkg
+    except:
+        importErrorMessage += f"{package_name} not installed or can not be imported"
+        sys.exit(importErrorMessage)
+
+
+
+def seurat_to_anndata(file_path: str, share_name: str, output_dir: str = "."):
+    """
+    file_path: path to rds or rdata file
+    share_name: final h5ad string name to be expected (without h5ad)
+    output_dir: directory to write the temporary h5ad file into
+
+    return:
+        absolute path to tmp h5ad, or False on failure
+    """
+    # Suppress R console output and ensure required packages are loaded,
+    # since this function may be called as a module in cgi script (not via main()).
+    r_cbs.consolewrite_print = silent_handler
+    r_cbs.consolewrite_warnerror = silent_handler
+    # Import required R packages
+    base = rpackages.importr('base')
+    r_package_importer('Seurat')
+    r_package_importer('rhdf5')
+    r_package_importer('anndataR')
+    # Use R's readRDS to load the object.
+    # The result is an R object within the Python environment.
+    r_seurat_obj = base.readRDS(file_path)
+    ro.globalenv['seurat_obj'] = r_seurat_obj
+    # Using anndataR write out a converted h5ad
+    ro.r('adata <- as_AnnData(seurat_obj)')
+    output_path = os.path.join(output_dir, f'tmp_{share_name}.h5ad')
+    try:
+        ro.r(f'write_h5ad(adata, "{output_path}")')
+        return output_path
+    # In cases where the write fails we will assume the h5ad already exists
+    except:
+        print(f"h5ad name already exists {output_path}")
+        return False
+
+
+def openh5ad(h5ad_name):
+    """Just open the supplied h5ad file"""
+    adata = scanpy.read_h5ad(h5ad_name)
+    return adata
+
+def genes_to_ensembl(adata, taxid=None):
+    # We are calling an external API for genes to ensembl mapping
+    # Potentially problematic down the road if this shuts down
+    if taxid is None:
+        return None
+    genes = adata.var.index.tolist()
+    mg = mygene.MyGeneInfo()
+    mg_genes = mg.querymany(genes, scopes="symbol", fields="ensembl.gene", species=f"{taxid}")
+    ensembl_mapping_dict = {}
+    for mg_gene in mg_genes:
+        gene_name = mg_gene['query']
+        if 'ensembl' in mg_gene.keys():
+            if isinstance(mg_gene['ensembl'],list):
+                # Currently taking first value, not sure of a better way to handle one gene having multiple ensembl IDs
+                ensembl_mapping_dict[gene_name] = mg_gene['ensembl'][0]['gene']
+            else:
+                ensembl_mapping_dict[gene_name] = mg_gene['ensembl']['gene']
+    count = 0
+    # We still need an ensembl id for the genes that do not actually have them.
+    # So here we create a FAKE# for each one so that it can be searchable in gEAR
+    for gene in genes:
+        if gene not in ensembl_mapping_dict.keys():
+            ensembl_mapping_dict[gene] = f"Fake{count}"
+            count += 1
+    # Overwrite the current adata.var
+    adata.var = pd.DataFrame(
+        index=list(ensembl_mapping_dict.values()), data={"gene_names": list(ensembl_mapping_dict.keys())}
+    )
+    return adata
+
+
+def reduction_to_metadata(adata):
+    # Discussion with Carlo and Brian resulted in us determining we would like to
+    # take the first 2 values of each reduction
+    # PCA in the future, and potentially other reductions may need more
+    for reduction in adata.obsm:
+        if adata.obsm[reduction].shape[1] > 1:
+            for i in range(2):
+                adata.obs[f'{reduction}_{i+1}'] = adata.obsm[reduction][:,i]
+    return adata
+
+
+def layer_to_X(adata, layer_name):
+    # Possibility for Seurat -> Anndata conversion doesn not create the X matrix.
+    # Use adata.layers['data'] as X
+    adata.X = adata.layers[layer_name]
+    return adata
+
+def main():
+    arguments = argument_parser()
+    # Args
+    rds_path = arguments['rds']
+    share_name = arguments['share_id']
+    r_package_installer()
+    # Take the RDS and output the most basic h5ad
+    h5ad_name = seurat_to_anndata(rds_path,share_name)
+    # Below are some changes and checks to the h5ad to correctly format for gEAR
+    if h5ad_name:
+        adata = openh5ad(f'tmp_{h5ad_name}')
+        adata = genes_to_ensembl(adata)
+        if adata is None:
+            sys.exit("TaxID not supplied")
+        adata = reduction_to_metadata(adata)
+        adata.write({h5ad_name.replace('tmp_','')})
+        os.remove(f'tmp_{h5ad_name}')
+
+
+if __name__ == "__main__":
+    main()

From db8bb98040a19e7e0e6f9a1fa45126fdc3a765c0 Mon Sep 17 00:00:00 2001
From: Dan Lesperance <danlesperance12@gmail.com>
Date: Thu, 21 May 2026 11:44:31 -0400
Subject: [PATCH 02/21] Update process_uploaded_expression_dataset.cgi

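Add Seurat (RDS) upload processing via SeuratUploader.

This also removes the primary-analysis step and the per-step progress
updates from the upload processors.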
---
 .../process_uploaded_expression_dataset.cgi   | 189 +++++++++---------
 1 file changed, 91 insertions(+), 98 deletions(-)

diff --git a/www/cgi/process_uploaded_expression_dataset.cgi b/www/cgi/process_uploaded_expression_dataset.cgi
index 4fcb35bf..64675d3f 100755
--- a/www/cgi/process_uploaded_expression_dataset.cgi
+++ b/www/cgi/process_uploaded_expression_dataset.cgi
@@ -38,9 +38,8 @@ sys.stdout = open(os.devnull, 'w')
 lib_path = Path(__file__).resolve().parents[2] / 'lib'
 sys.path.append(str(lib_path))
 import geardb
-from gear.primary_analysis import add_primary_analysis_to_dataset, PrimaryAnalysisProcessingError
 from gear.spatialhandler import SPATIALTYPE2CLASS
-from gear.utils import update_adata_with_ensembl_ids
+import gear.SeuratUploader as SeuratUploader
 
 share_uid = None
 session_id = None
@@ -74,8 +73,9 @@ def main():
         result['message'] = 'User ID not found. Please log in to continue.'
         return result
 
-    # values are mex_3tab, excel, rdata, h5ad
-    dataset_formats = ['mex_3tab', 'excel', 'rdata', 'h5ad', 'spatial']
+    # values are mex_3tab, excel, rdata, h5ad, rds, or spatial formats
+    # Removed rdata from list as rdata will be difficult to process efficiently
+    dataset_formats = ['mex_3tab', 'excel', 'h5ad', 'spatial','rds']
     dataset_upload_dir = Path(user_upload_file_base) / session_id / share_uid
 
     # quickly write the status so the page doesn't error out
@@ -107,13 +107,9 @@ def main():
         return
     with open(metadata_file, 'r') as f:
         metadata = json.load(f)
-        dataset_uid = metadata.get('dataset_uid', '')
-        dataset_type = metadata.get('dataset_type', '')
 
     # Update metadata for downstream uses
     metadata["dataset_format"] = dataset_format
-    metadata["perform_primary_analysis"] = True if dataset_type in ['single-cell-rnaseq', 'spatial'] else False
-
     with open(metadata_file, 'w') as f:
         json.dump(metadata, f, indent=4)
 
@@ -147,36 +143,28 @@ def main():
         # CHILD CONTINUES FROM HERE
 
     status['process_id'] = os.getpid()
-
+    
     # new child command
     if dataset_format == 'mex_3tab':
-        process_mex_3tab(dataset_upload_dir, metadata["perform_primary_analysis"])
+        process_mex_3tab(dataset_upload_dir)
     elif dataset_format == 'excel':
-        process_excel(dataset_upload_dir, metadata["perform_primary_analysis"])
+        process_excel(dataset_upload_dir)
     elif dataset_format == "h5ad":
-        process_h5ad(dataset_upload_dir, metadata["perform_primary_analysis"])
+        process_h5ad(dataset_upload_dir)
+    elif dataset_format == 'rds' or dataset_format=='rdata':
+        process_seurat(dataset_upload_dir)
     elif dataset_format == "spatial":
-        process_spatial(dataset_upload_dir, spatial_format, metadata["perform_primary_analysis"])
+        process_spatial(dataset_upload_dir, spatial_format)
     else:
         result["success"] = 0
         result["message"] = f"Unsupported dataset format: {dataset_format}"
         return result
 
-    if metadata["perform_primary_analysis"]:
-        try:
-            result["success"] = add_primary_analysis_to_dataset(dataset_uid, share_uid, dataset_upload_dir, dataset_format)
-        except PrimaryAnalysisProcessingError as e:
-            write_status(dataset_upload_dir, 'error', f"Error during primary analysis: {str(e)}")
-            return result
-
-    status["progress"] = 100
-    write_status(dataset_upload_dir, 'complete', "Dataset processed successfully.")
-
     result["success"] = 1
     result["message"] = "Dataset processed successfully."
     return result
 
-def process_h5ad(upload_dir: Path, perform_primary_analysis: bool) -> None:
+def process_h5ad(upload_dir: Path) -> None:
     """
     Processes an uploaded .h5ad (AnnData) file in the specified upload directory by performing the following steps:
     1. Reads the .h5ad file as an AnnData object.
@@ -194,43 +182,15 @@ def process_h5ad(upload_dir: Path, perform_primary_analysis: bool) -> None:
     # If the file is an h5ad, it should be formatted as an AnnData object already.
     # But we still want to do some sanitization of the obs dataframe.
 
-    # TODO: Read in chunks to save memory
-
     write_status(upload_dir, 'processing', 'Initializing dataset processing.')
 
     filepath = upload_dir / f"{share_uid}.h5ad"
     adata = anndata.read_h5ad(filepath)
     obs = adata.obs
 
-    total_steps = 4 if perform_primary_analysis else 3
-    step_counter = 1
-    status["progress"] = int((step_counter / total_steps) * 100)
-    write_status(upload_dir, 'processing', 'Sanitizing AnnData object')
-
     categorize_observation_columns(obs)
     adata.obs = sanitize_obs_for_h5ad(obs)
 
-    if "gene_symbol" not in adata.var.columns:
-        # get organism_id by converting sample_taxid
-        metadata_file = upload_dir / 'metadata.json'
-        if not metadata_file.is_file():
-            write_status(upload_dir, 'error', "No metadata JSON file found.")
-            return
-
-        with open(metadata_file, 'r') as f:
-            metadata = json.load(f)
-        sample_taxid = metadata.get("sample_taxid", None)
-        organism_id=geardb.get_organism_id_by_taxon_id(sample_taxid)
-        if not organism_id:
-            write_status(upload_dir, 'error', "Could not determine organism ID from sample taxonomic ID.")
-            return
-
-        adata = update_adata_with_ensembl_ids(adata, organism_id, "UNMAPPED_")
-
-    step_counter += 1
-    status["progress"] = int((step_counter / total_steps) * 100)
-    write_status(upload_dir, 'processing', 'Writing sanitized data to new H5AD.')
-
     h5ad_path = upload_dir / f"{share_uid}.new.h5ad"
     adata.write(h5ad_path)
 
@@ -238,11 +198,66 @@ def process_h5ad(upload_dir: Path, perform_primary_analysis: bool) -> None:
     filepath.unlink()  # remove original
     h5ad_path.rename(filepath)  # rename new to original name
 
-    step_counter += 1
-    status["progress"] = int((step_counter / total_steps) * 100)
-    write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}")
+    write_status(upload_dir, 'complete', 'Dataset processed successfully.')
 
-def process_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None:
+def process_seurat(upload_dir: Path) -> None:
+    # Take in an RDS file, convert to anndata, update the obs metadata based on reductions,
+    # convert gene symbols to ensemble IDs, and write to an updated h5ad file.
+    write_status(upload_dir, "processing", "Initializing dataset processing.")
+    seurat_filepath = upload_dir / f"{share_uid}.rds"
+
+    # seurat to anndata uses rpy2 to convert the RDS to anndata
+    adata_filepath = SeuratUploader.seurat_to_anndata(str(seurat_filepath), share_uid, str(upload_dir))
+    if not adata_filepath:
+        write_status(upload_dir, 'error', 'Failed to convert RDS to h5ad.')
+        return
+    try:
+        adata = anndata.read_h5ad(adata_filepath)
+    except Exception as e:
+        write_status(upload_dir, 'error', f'Failed to read h5ad: {str(e)}')
+        return
+
+
+    # Update obs metadata based on reductions
+    try:
+        adata = SeuratUploader.reduction_to_metadata(adata)
+    except Exception as e:
+        write_status(upload_dir, 'error', f'Failed to update Reductions to metadata: {str(e)}')
+        return
+    # Convert gene symbols to ensemble IDs
+    metadata_file = upload_dir / 'metadata.json'
+    if not metadata_file.is_file():
+        write_status(upload_dir, 'error', "No metadata JSON file found.")
+
+    # get organism_id by converting sample_taxid(needed for some but not all spatial handlers)
+    with open(metadata_file, 'r') as f:
+        metadata = json.load(f)
+    
+    sample_taxid = metadata.get("sample_taxid", None)
+    try:
+        adata = SeuratUploader.genes_to_ensembl(adata,sample_taxid)
+    except Exception as e:
+        write_status(upload_dir, 'error', f'Failed to convert genes to Ensembl: {str(e)}')
+        return
+    if adata.X is None:
+        adata = SeuratUploader.layer_to_X(adata, layer_name='data')
+    h5ad_path = upload_dir / f"{share_uid}.new.h5ad"
+    try:
+        adata.write(h5ad_path)
+
+        # Replace the original file with the sanitized one
+        seurat_filepath.unlink()
+        Path(adata_filepath).unlink()
+        h5ad_path.rename(upload_dir / f"{share_uid}.h5ad")  
+    except Exception as e:
+        write_status(upload_dir, 'error', f'Failed to write h5ad or during cleanup: {str(e)}')
+        return
+
+    write_status(upload_dir, "complete", "Dataset processed successfully.")
+
+
+
+def process_3tab(upload_dir: Path) -> None:
     import subprocess
 
     chunk_size = 500
@@ -285,21 +300,8 @@ def process_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None:
     adata = sc.AnnData(obs=var, var=obs)
     reader = pd.read_csv(expression_matrix_path, sep='\t', index_col=0, chunksize=chunk_size)
 
-    # Count rows safely without shell execution (https://github.com/IGS/gEAR/security/code-scanning/229)
-    try:
-        result = subprocess.run(
-            ['/usr/bin/wc', '-l', str(expression_matrix_path)],
-            capture_output=True,
-            text=True,
-            check=True
-        )
-        total_rows = int(result.stdout.split()[0])
-    except (subprocess.CalledProcessError, ValueError, FileNotFoundError):
-        # Fallback to Python if wc fails
-        total_rows = sum(1 for _ in open(expression_matrix_path)) - 1
-
-    if perform_primary_analysis:
-        total_rows += 1  # account for the additional primary analysis step that will be performed after this
+    # This can be an order of magnitude faster than the using python alone
+    total_rows = int(subprocess.check_output(f"/usr/bin/wc -l {expression_matrix_path}", shell=True).split()[0])
 
     expression_matrix = []
     rows_read = 0
@@ -313,8 +315,9 @@ def process_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None:
             expression_matrix.append(sparse.csr_matrix(chunk.values))
 
             status['progress'] = percentage
-            message = f"Processed {rows_read}/{total_rows} expression matrix chunks ..."
-            write_status(upload_dir, 'processing', message)
+            status['message'] = f"Processed {rows_read}/{total_rows} expression matrix chunks ..."
+            with open(upload_dir / "status.json", 'w') as f:
+                f.write(json.dumps(status))
 
         adata.X = sparse.vstack(expression_matrix) # type: ignore
     except Exception:
@@ -345,8 +348,10 @@ def process_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None:
                 rows_read += chunk_size
                 percentage = int((rows_read / total_rows) * 100)
 
-                message = f"Processed {rows_read}/{total_rows} expression matrix chunks ..."
-                write_status(upload_dir, 'processing', message)
+                status['progress'] = percentage
+                status['message'] = f"Processed {rows_read}/{total_rows} expression matrix chunks ..."
+                with open(upload_dir / "status.json", 'w') as f:
+                    f.write(json.dumps(status))
 
             except Exception:
                 #print(f"\nError in chunk {chunk_index}: {inner_e}")
@@ -363,18 +368,19 @@ def process_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None:
             #print("Collected chunk shapes:")
             #for i, shape in enumerate(chunk_shapes):
             #    print(f"  Chunk {i+1}: {shape}")
+
             raise
 
+
     adata = adata.transpose()
     adata.obs = sanitize_obs_for_h5ad(adata.obs)
 
     h5ad_path = upload_dir / f"{share_uid}.h5ad"
     adata.write(h5ad_path)
 
-    # Progress is accounted for in chunk processing
-    write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}")
+    write_status(upload_dir, 'complete', 'Dataset processed successfully.')
 
-def process_excel(upload_dir: Path, perform_primary_analysis: bool) -> None:
+def process_excel(upload_dir: Path) -> None:
     filepath = upload_dir / f"{share_uid}.xlsx"
 
     write_status(upload_dir, 'processing', 'Initializing dataset processing.')
@@ -454,15 +460,12 @@ def process_excel(upload_dir: Path, perform_primary_analysis: bool) -> None:
     h5ad_path = upload_dir / f"{share_uid}.h5ad"
     adata.write(h5ad_path)
 
-    total_steps = 2 if perform_primary_analysis else 1
-    status["progress"] = int((1 / total_steps) * 100)
-    write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}")
+    write_status(upload_dir, 'complete', 'Dataset processed successfully.')
 
-
-def process_mex(upload_dir: Path, perform_primary_analysis: bool) -> None:
+def process_mex(upload_dir: Path) -> None:
     pass
 
-def process_mex_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None:
+def process_mex_3tab(upload_dir: Path) -> None:
     # Extract the file
     import tarfile
     compression_format = None
@@ -541,11 +544,11 @@ def process_mex_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None:
 
     # Call the appropriate function
     if dataset_type == 'threetab':
-        process_3tab(upload_dir, perform_primary_analysis)
+        process_3tab(upload_dir)
     elif dataset_type == 'mex':
-        process_mex(upload_dir, perform_primary_analysis)
+        process_mex(upload_dir)
 
-def process_spatial(upload_dir: Path, spatial_format: str, perform_primary_analysis: bool) -> None:
+def process_spatial(upload_dir: Path, spatial_format: str) -> None:
     """
     Processes a spatial transcriptomics dataset uploaded to a specified directory.
 
@@ -562,9 +565,6 @@ def process_spatial(upload_dir: Path, spatial_format: str, perform_primary_analy
     Raises:
         Writes error status if the metadata file is missing or if reading/converting the spatial file fails.
     """
-
-    write_status(upload_dir, 'processing', 'Initializing dataset processing.')
-
     spatial_obj = SPATIALTYPE2CLASS[spatial_format]()   # instantiate the appropriate handler class
     metadata_file = upload_dir / 'metadata.json'
     if not metadata_file.is_file():
@@ -594,16 +594,9 @@ def process_spatial(upload_dir: Path, spatial_format: str, perform_primary_analy
         import shutil
         shutil.rmtree(output_path)
 
-    total_steps = 3 if perform_primary_analysis else 2
-    step_counter = 1
-    status["progress"] = int((step_counter / total_steps) * 100)
     write_status(upload_dir, 'processing', 'Writing Zarr store')
-
     spatial_obj.write_to_zarr(filepath=output_path)
-
-    step_counter += 1
-    status["progress"] = int((step_counter / total_steps) * 100)
-    write_status(upload_dir, 'processing', f"Finished processing spatial dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}")
+    write_status(upload_dir, 'complete', 'Dataset processed successfully.')
 
 def sanitize_obs_for_h5ad(obs_df: pd.DataFrame) -> pd.DataFrame:
     for col in obs_df.columns:

From eaa4612e4bcc84db2b354a6d111c0ee1eccbf644 Mon Sep 17 00:00:00 2001
From: Dan Lesperance <danlesperance12@gmail.com>
Date: Fri, 22 May 2026 07:41:21 -0400
Subject: [PATCH 03/21] Update Bioconductor version to 3.22

---
 docker/install_bioc.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/install_bioc.R b/docker/install_bioc.R
index b1dd8025..fd0ad770 100755
--- a/docker/install_bioc.R
+++ b/docker/install_bioc.R
@@ -3,7 +3,7 @@
 # Install required packages
 tryCatch( {
     install.packages(c("BiocManager", "remotes"), dependencies=NA, repos="http://lib.stat.cmu.edu/R/CRAN/")
-    BiocManager::install(version = "3.21", ask=FALSE)
+    BiocManager::install(version = "3.22", ask=FALSE)
     }, error = function(e) {
         message("Error: ", e$message)
         quit(status = 1, save = "no")

From 663753cb3098761d18a28b894739c3cc47c3cb0a Mon Sep 17 00:00:00 2001
From: Dan Lesperance <danlesperance12@gmail.com>
Date: Fri, 22 May 2026 07:55:22 -0400
Subject: [PATCH 04/21] add packages necessary for SeuratUploader

seurat5
httpuv
hdf5r
rhdf5
anndataR
---
 docker/install_packages.R | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/docker/install_packages.R b/docker/install_packages.R
index f04948ed..d6b348bd 100755
--- a/docker/install_packages.R
+++ b/docker/install_packages.R
@@ -9,7 +9,12 @@ library(remotes)    # for install_version
 tryCatch( {
     remotes::install_version("reticulate", version="1.46.0", repos="https://cloud.r-project.org/", ask=FALSE, dependencies=NA) # Sanity check with rpy2
     remotes::install_github("ctlab/fgsea")   # needed for projectR
-    remotes::install_github("genesofeve/projectR@d3dd79e2b14172a9561059d58462c97f0a78d4c8")  # version 1.23.2
+    remotes::install_github("genesofeve/projectR@d3dd79e2b14172a9561059d58462c97f0a78d4c8")
+    remotes::install_github("satijalab/seurat", "seurat5", quiet = TRUE, ask=FALSE)
+    install.packages('httpuv', ask=FALSE)
+    install.packages("hdf5r",dependencies=TRUE, ask=FALSE)
+    BiocManager::install("rhdf5",ask=FALSE)
+    BiocManager::install("anndataR", ask=FALSE)# version 1.23.2
     BiocManager::install("biomaRt", ask=FALSE) # version 2.60.0
     remotes::install_github("CHuanSite/SJD")
     }, error = function(e) {

From 24e8794c4434fc4cfc0c83fe851bc1f13a7a6415 Mon Sep 17 00:00:00 2001
From: Dan Lesperance <danlesperance12@gmail.com>
Date: Fri, 22 May 2026 07:58:49 -0400
Subject: [PATCH 05/21] Update Dockerfile.r

---
 docker/Dockerfile.r | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docker/Dockerfile.r b/docker/Dockerfile.r
index a2c2b3cb..3015356e 100644
--- a/docker/Dockerfile.r
+++ b/docker/Dockerfile.r
@@ -32,6 +32,8 @@ RUN apt -qq update \
   tzdata \
   git \
   unzip \
+  libgfortran5 \
+  libhdf5-dev \
   && apt -qq clean autoclean \
   && apt -qq autoremove -y \
   && rm -rf /var/lib/apt/lists/*

From 93adac2847a8a0344cc43028aa81592988b17184 Mon Sep 17 00:00:00 2001
From: Dan Lesperance <danlesperance12@gmail.com>
Date: Fri, 22 May 2026 08:06:48 -0400
Subject: [PATCH 06/21] Enabled Seurat/RDS

---
 www/upload_dataset.html | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/www/upload_dataset.html b/www/upload_dataset.html
index ac1ca58e..ebbaf30e 100644
--- a/www/upload_dataset.html
+++ b/www/upload_dataset.html
@@ -666,9 +666,9 @@ <h3>MS Excel</h3>
                       </div>
                     </div>
                     <div class="column format-column is-3 is-flex">
-                      <!-- Rdata/Seurat -->
+                      <!-- RDS/Seurat -->
                       <div class="box format-selection-box">
-                        <h3>Rdata / Seurat</h3>
+                        <h3>RDS / Seurat</h3>
                         <p class="format-description">
                           This is a binary format used by the Seurat package in R. If you've already been
                           working with your dataset in R, including clustering and other analyses, this is
@@ -682,21 +682,21 @@ <h3>Rdata / Seurat</h3>
                         <ul class="format-documentation-links">
                           <li>
                             <span class="icon-text"><span class="icon"><i class="mdi mdi-open-in-new"></i></span>
-                              <span><a href="https://bioinformatics.ccr.cancer.gov/docs/getting-started-with-scrna-seq/IntroToR_Seurat/" target="_blank">Rdata info</a></span>
+                              <span><a href="https://bioinformatics.ccr.cancer.gov/docs/getting-started-with-scrna-seq/IntroToR_Seurat/" target="_blank">RDS info</a></span>
                             </span>
                           </li>
                           <li>
                             <span class="icon-text"><span class="icon"><i class="mdi mdi-download"></i></span>
-                              <span>Rdata example</span>
+                              <span>RDS example</span>
                             </span>
                           </li>
                         </ul>
                         <div class="control">
-                          <button data-format="rdata" class="button is-primary is-fullwidth format-selector" disabled>
+                          <button data-format="rdata" class="button is-primary is-fullwidth format-selector">
                             <span class="icon">
                               <i class="mdi mdi-cancel"></i>
                             </span>
-                            <span class="format-status">Not yet available</span>
+                            <span class="format-status">Selected</span>
                           </button>
                         </div>
                       </div>

From cc672065674e8216d96368b81470f9cc5db239cb Mon Sep 17 00:00:00 2001
From: Dan Lesperance <danlesperance12@gmail.com>
Date: Wed, 27 May 2026 14:49:13 -0400
Subject: [PATCH 07/21] SeuratUploader: Change gene_names to gene_symbol

---
 lib/gear/SeuratUploader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/gear/SeuratUploader.py b/lib/gear/SeuratUploader.py
index 3b9eb810..ecbb8b70 100644
--- a/lib/gear/SeuratUploader.py
+++ b/lib/gear/SeuratUploader.py
@@ -124,7 +124,7 @@ def genes_to_ensembl(adata, taxid=None):
             count += 1
     # Overwrite the current adata.var
     adata.var = pd.DataFrame(
-        index=list(ensembl_mapping_dict.values()), data={"gene_names": list(ensembl_mapping_dict.keys())}
+        index=list(ensembl_mapping_dict.values()), data={"gene_symbol": list(ensembl_mapping_dict.keys())}
     )
     return adata
 

From b398e828844e48e3d861aa73a1d1fc22fc28b559 Mon Sep 17 00:00:00 2001
From: Dan Lesperance <danlesperance12@gmail.com>
Date: Thu, 28 May 2026 10:06:20 -0400
Subject: [PATCH 08/21] Add packages to apt-get for Seurat

---
 docker/Dockerfile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 87882e78..c4a8b11f 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -45,6 +45,9 @@ RUN apt -qq update \
   libpcre2-dev \
   fonts-roboto \
   fontconfig \
+  libgfortran5 \
+  libuv1 \
+  libhdf5-dev \
   && apt -qq clean autoclean \
   && apt -qq autoremove -y \
   && rm -rf /var/lib/apt/lists/*

From 0d9456c00bfe5c8493d79930022adad9fb1b295d Mon Sep 17 00:00:00 2001
From: Dan Lesperance <danlesperance12@gmail.com>
Date: Thu, 28 May 2026 10:09:34 -0400
Subject: [PATCH 09/21] Add RDATA to list of dataset formats

---
 www/cgi/process_uploaded_expression_dataset.cgi | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/www/cgi/process_uploaded_expression_dataset.cgi b/www/cgi/process_uploaded_expression_dataset.cgi
index 64675d3f..afeae0c4 100755
--- a/www/cgi/process_uploaded_expression_dataset.cgi
+++ b/www/cgi/process_uploaded_expression_dataset.cgi
@@ -75,7 +75,7 @@ def main():
 
     # values are mex_3tab, excel, rdata, h5ad, rds, or spatial formats
     # Removed rdata from list as rdata will be difficult to process efficiently
-    dataset_formats = ['mex_3tab', 'excel', 'h5ad', 'spatial','rds']
+    dataset_formats = ['mex_3tab', 'excel', 'h5ad', 'spatial','rds','rdata']
     dataset_upload_dir = Path(user_upload_file_base) / session_id / share_uid
 
     # quickly write the status so the page doesn't error out
@@ -91,7 +91,7 @@ def main():
         return result
 
     if dataset_format not in dataset_formats:
-        result['message'] = 'Unsupported dataset format.'
+        result['message'] = f'Unsupported dataset format: {dataset_format} '
         write_status(dataset_upload_dir, 'error', result['message'])
         return result
 

From 04411ad13e263a4b038482ce5da1feb83bba2664 Mon Sep 17 00:00:00 2001
From: Dan Lesperance <danlesperance12@gmail.com>
Date: Thu, 28 May 2026 10:15:26 -0400
Subject: [PATCH 10/21] Change button text from 'Selected' to 'Choose'

---
 www/upload_dataset.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/www/upload_dataset.html b/www/upload_dataset.html
index ebbaf30e..39d1e3f7 100644
--- a/www/upload_dataset.html
+++ b/www/upload_dataset.html
@@ -696,7 +696,7 @@ <h3>RDS / Seurat</h3>
                             <span class="icon">
                               <i class="mdi mdi-cancel"></i>
                             </span>
-                            <span class="format-status">Selected</span>
+                            <span class="format-status">Choose</span>
                           </button>
                         </div>
                       </div>

From ffed7345593f06fcf7e69a6e899717e6a536b7bc Mon Sep 17 00:00:00 2001
From: dlespera <danlesperance12@gmail.com>
Date: Thu, 28 May 2026 13:46:01 -0400
Subject: [PATCH 11/21] Package installation finalization

---
 docker/Dockerfile              |  8 ++++----
 docker/install_packages.R      |  4 ++--
 docker/requirements.txt        |  8 ++++++++
 docs/developer/setup/python.md | 13 ++++++++++++-
 4 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index c4a8b11f..038b65ba 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -58,12 +58,12 @@ RUN fc-cache -f -v
 ENV LLVM_CONFIG=/usr/bin/llvm-config-14
 
 # Copy compiled Python from builder stage
-COPY --from=adkinsrs/gear-python-base:2026-04-27 /opt/Python-${PYTHON_FULL_VERSION} /opt/Python-${PYTHON_FULL_VERSION}
+COPY --from=gear-python-base:latest /opt/Python-${PYTHON_FULL_VERSION} /opt/Python-${PYTHON_FULL_VERSION}
 
 # Copy compiled R from r-builder stage
-COPY --from=adkinsrs/gear-r-base:2026-04-27 /usr/local/lib/R /usr/local/lib/R
-COPY --from=adkinsrs/gear-r-base:2026-04-27 /usr/local/bin/R /usr/local/bin/R
-COPY --from=adkinsrs/gear-r-base:2026-04-27 /usr/local/bin/Rscript /usr/local/bin/Rscript
+COPY --from=gear-r-base:latest /usr/local/lib/R /usr/local/lib/R
+COPY --from=gear-r-base:latest /usr/local/bin/R /usr/local/bin/R
+COPY --from=gear-r-base:latest /usr/local/bin/Rscript /usr/local/bin/Rscript
 
 # Link Python and shared library
 RUN mkdir -p /opt/bin \
diff --git a/docker/install_packages.R b/docker/install_packages.R
index d6b348bd..55a1397a 100755
--- a/docker/install_packages.R
+++ b/docker/install_packages.R
@@ -11,8 +11,8 @@ tryCatch( {
     remotes::install_github("ctlab/fgsea")   # needed for projectR
     remotes::install_github("genesofeve/projectR@d3dd79e2b14172a9561059d58462c97f0a78d4c8")
     remotes::install_github("satijalab/seurat", "seurat5", quiet = TRUE, ask=FALSE)
-    install.packages('httpuv', ask=FALSE)
-    install.packages("hdf5r",dependencies=TRUE, ask=FALSE)
+    install.packages('httpuv', ask=FALSE, repos="https://cloud.r-project.org/")
+    install.packages("hdf5r",dependencies=TRUE, ask=FALSE, repos="https://cloud.r-project.org/")
     BiocManager::install("rhdf5",ask=FALSE)
     BiocManager::install("anndataR", ask=FALSE)# version 1.23.2
     BiocManager::install("biomaRt", ask=FALSE) # version 2.60.0
diff --git a/docker/requirements.txt b/docker/requirements.txt
index 94021d71..3839102b 100644
--- a/docker/requirements.txt
+++ b/docker/requirements.txt
@@ -44,3 +44,11 @@ spatialdata_io==0.6.0
 shadows==0.1a2
 tables==3.11.1
 watchfiles==1.1.1
+Bio==1.8.3 
+biopython==1.87 
+biothings-client==0.5.0 
+gprofiler-official==1.0.0 
+h11==0.16.0 
+httpcore==1.0.9 
+httpx==0.28.1 
+mygene==3.2.2
diff --git a/docs/developer/setup/python.md b/docs/developer/setup/python.md
index 1333a1e8..6aad6db2 100644
--- a/docs/developer/setup/python.md
+++ b/docs/developer/setup/python.md
@@ -38,6 +38,9 @@ fixed paths have worked fine for decades.
         libicu-dev \
         libdeflate-dev \
         libssl3 \
+        libgfortran5 \
+        libuv1 \
+        libhdf5-dev \
         pkg-config \
         llvm \
         apache2 \
@@ -148,7 +151,15 @@ I cannot add comments to the bash code without breaking the command.  So consult
     spatialdata_io==0.6.0 \
     shadows==0.1a2 \
     tables==3.11.1 \
-    watchfiles==1.1.1
+    watchfiles==1.1.1 \
+    Bio==1.8.3 \ 
+    biopython==1.87 \ 
+    biothings-client==0.5.0 \ 
+    gprofiler-official==1.0.0 \
+    h11==0.16.0 \
+    httpcore==1.0.9 \ 
+    httpx==0.28.1 \
+    mygene==3.2.2\
     ./pip3 install git+https://github.com/adkinsrs/diffxpy.git@ffd828c280882ca98adc6e42c934625fab0011f6
     ./pip3 uninstall dask-expr -y
 

From f18e5690baf1ea8307b270480bd572e506e5e30f Mon Sep 17 00:00:00 2001
From: adkinsrs <sadkins@som.umaryland.edu>
Date: Wed, 3 Jun 2026 10:25:26 -0400
Subject: [PATCH 12/21] Update requirements and setup documentation for
 additional packages

---
 docker/requirements.txt        | 16 +++++++---------
 docs/developer/setup/python.md | 14 ++++++--------
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/docker/requirements.txt b/docker/requirements.txt
index 3839102b..7aaea630 100644
--- a/docker/requirements.txt
+++ b/docker/requirements.txt
@@ -2,6 +2,8 @@ aiohttp==3.13.5
 aiohttp_retry==2.9.1
 anndata==0.12.11
 biocode==0.10.0
+biopython==1.87
+biothings-client==0.5.0
 cairosvg==2.7.1
 colorcet==3.1.0
 datashader==0.19.0
@@ -10,7 +12,10 @@ Flask==3.1.3
 Flask-RESTful==0.3.9
 google-analytics-data==0.21.0
 gosling==0.3.0
+h11==0.16.0
 hic2cool==0.8.3
+httpcore==1.0.9
+httpx==0.28.1
 jupyterlab==4.0.5
 jupyter==1.0.0
 kaleido==0.2.1
@@ -20,6 +25,7 @@ llvmlite==0.47.0
 matplotlib==3.10.7
 mod-wsgi==5.0.2
 more_itertools==11.0.2
+mygene==3.2.2
 mysql-connector-python==8.0.28
 numba==0.65.0
 numpy==2.4.0
@@ -43,12 +49,4 @@ spatialdata==0.7.2
 spatialdata_io==0.6.0
 shadows==0.1a2
 tables==3.11.1
-watchfiles==1.1.1
-Bio==1.8.3 
-biopython==1.87 
-biothings-client==0.5.0 
-gprofiler-official==1.0.0 
-h11==0.16.0 
-httpcore==1.0.9 
-httpx==0.28.1 
-mygene==3.2.2
+watchfiles==1.1.1
\ No newline at end of file
diff --git a/docs/developer/setup/python.md b/docs/developer/setup/python.md
index 6aad6db2..355a1d22 100644
--- a/docs/developer/setup/python.md
+++ b/docs/developer/setup/python.md
@@ -111,6 +111,8 @@ I cannot add comments to the bash code without breaking the command.  So consult
     aiohttp_retry==2.9.1 \
     anndata==0.12.11 \
     biocode==0.10.0 \
+    biopython==1.87 \
+    biothings-client==0.5.0 \
     cairosvg==2.7.1 \
     colorcet==3.1.0 \
     datashader==0.19.0 \
@@ -118,7 +120,10 @@ I cannot add comments to the bash code without breaking the command.  So consult
     Flask-RESTful==0.3.9 \
     google-analytics-data==0.21.0 \
     gosling==0.3.0 \
+    h11==0.16.0 \
     hic2cool==0.8.3 \
+    httpcore==1.0.9 \
+    httpx==0.28.1 \
     jupyterlab==4.0.5 \
     jupyter==1.0.0 \
     kaleido==0.2.1 \
@@ -128,6 +133,7 @@ I cannot add comments to the bash code without breaking the command.  So consult
     matplotlib==3.10.7 \
     mod-wsgi==5.0.2 \
     more_itertools==11.0.2 \
+    mygene==3.2.2 \
     mysql-connector-python==8.0.28 \
     numba==0.65.0 \
     numpy==2.4.0 \
@@ -152,14 +158,6 @@ I cannot add comments to the bash code without breaking the command.  So consult
     shadows==0.1a2 \
     tables==3.11.1 \
     watchfiles==1.1.1 \
-    Bio==1.8.3 \ 
-    biopython==1.87 \ 
-    biothings-client==0.5.0 \ 
-    gprofiler-official==1.0.0 \
-    h11==0.16.0 \
-    httpcore==1.0.9 \ 
-    httpx==0.28.1 \
-    mygene==3.2.2\
     ./pip3 install git+https://github.com/adkinsrs/diffxpy.git@ffd828c280882ca98adc6e42c934625fab0011f6
     ./pip3 uninstall dask-expr -y
 

From 1b841b20e46b2bce97c6a633e777c68bd5114fe6 Mon Sep 17 00:00:00 2001
From: adkinsrs <sadkins@som.umaryland.edu>
Date: Wed, 3 Jun 2026 10:30:33 -0400
Subject: [PATCH 13/21] Fix formatting in Python setup instructions for package
 installation

---
 docs/developer/setup/python.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/developer/setup/python.md b/docs/developer/setup/python.md
index 355a1d22..6fc2d30e 100644
--- a/docs/developer/setup/python.md
+++ b/docs/developer/setup/python.md
@@ -157,7 +157,7 @@ I cannot add comments to the bash code without breaking the command.  So consult
     spatialdata_io==0.6.0 \
     shadows==0.1a2 \
     tables==3.11.1 \
-    watchfiles==1.1.1 \
+    watchfiles==1.1.1
     ./pip3 install git+https://github.com/adkinsrs/diffxpy.git@ffd828c280882ca98adc6e42c934625fab0011f6
     ./pip3 uninstall dask-expr -y
 

From 2f9a249b2cff5e126f259867f62a1cd5efd42813 Mon Sep 17 00:00:00 2001
From: adkinsrs <sadkins@som.umaryland.edu>
Date: Wed, 3 Jun 2026 14:59:46 -0400
Subject: [PATCH 14/21] Fixing squashed commits already in devel.  Adjusting
 permissions of lib/gear files to be uniform

---
 lib/gear/SeuratUploader.py                    |  13 +-
 lib/gear/__init__.py                          |   0
 lib/gear/analysis.py                          |   0
 lib/gear/dataarchive.py                       |   0
 lib/gear/db.py                                |   0
 lib/gear/mg_plotting.py                       |   0
 lib/gear/orthology.py                         |   0
 lib/gear/plotting.py                          |   0
 lib/gear/primary_analysis.py                  |   0
 lib/gear/serverconfig.py                      |   0
 lib/gear/spatialhandler.py                    |   0
 lib/gear/trackhub.py                          |   0
 lib/gear/userhistory.py                       |   0
 lib/gear/utils.py                             |   0
 .../process_uploaded_expression_dataset.cgi   | 136 +++++++++++++-----
 www/upload_dataset.html                       |   2 +-
 16 files changed, 107 insertions(+), 44 deletions(-)
 mode change 100644 => 100755 lib/gear/SeuratUploader.py
 mode change 100644 => 100755 lib/gear/__init__.py
 mode change 100644 => 100755 lib/gear/analysis.py
 mode change 100644 => 100755 lib/gear/dataarchive.py
 mode change 100644 => 100755 lib/gear/db.py
 mode change 100644 => 100755 lib/gear/mg_plotting.py
 mode change 100644 => 100755 lib/gear/orthology.py
 mode change 100644 => 100755 lib/gear/plotting.py
 mode change 100644 => 100755 lib/gear/primary_analysis.py
 mode change 100644 => 100755 lib/gear/serverconfig.py
 mode change 100644 => 100755 lib/gear/spatialhandler.py
 mode change 100644 => 100755 lib/gear/trackhub.py
 mode change 100644 => 100755 lib/gear/userhistory.py
 mode change 100644 => 100755 lib/gear/utils.py

diff --git a/lib/gear/SeuratUploader.py b/lib/gear/SeuratUploader.py
old mode 100644
new mode 100755
index ecbb8b70..050f665f
--- a/lib/gear/SeuratUploader.py
+++ b/lib/gear/SeuratUploader.py
@@ -1,15 +1,14 @@
 import argparse
-
-import rpy2.robjects as ro
-from rpy2.robjects.packages import importr
-import rpy2.rinterface_lib.callbacks as r_cbs
-import rpy2.robjects.packages as rpackages
+import os
 import sys
+
 import mygene
 import pandas as pd
+import rpy2.rinterface_lib.callbacks as r_cbs
+import rpy2.robjects as ro
+import rpy2.robjects.packages as rpackages
 import scanpy
-import os
-import argparse
+from rpy2.robjects.packages import importr
 
 
 def silent_handler(s:str) -> None:
diff --git a/lib/gear/__init__.py b/lib/gear/__init__.py
old mode 100644
new mode 100755
diff --git a/lib/gear/analysis.py b/lib/gear/analysis.py
old mode 100644
new mode 100755
diff --git a/lib/gear/dataarchive.py b/lib/gear/dataarchive.py
old mode 100644
new mode 100755
diff --git a/lib/gear/db.py b/lib/gear/db.py
old mode 100644
new mode 100755
diff --git a/lib/gear/mg_plotting.py b/lib/gear/mg_plotting.py
old mode 100644
new mode 100755
diff --git a/lib/gear/orthology.py b/lib/gear/orthology.py
old mode 100644
new mode 100755
diff --git a/lib/gear/plotting.py b/lib/gear/plotting.py
old mode 100644
new mode 100755
diff --git a/lib/gear/primary_analysis.py b/lib/gear/primary_analysis.py
old mode 100644
new mode 100755
diff --git a/lib/gear/serverconfig.py b/lib/gear/serverconfig.py
old mode 100644
new mode 100755
diff --git a/lib/gear/spatialhandler.py b/lib/gear/spatialhandler.py
old mode 100644
new mode 100755
diff --git a/lib/gear/trackhub.py b/lib/gear/trackhub.py
old mode 100644
new mode 100755
diff --git a/lib/gear/userhistory.py b/lib/gear/userhistory.py
old mode 100644
new mode 100755
diff --git a/lib/gear/utils.py b/lib/gear/utils.py
old mode 100644
new mode 100755
diff --git a/www/cgi/process_uploaded_expression_dataset.cgi b/www/cgi/process_uploaded_expression_dataset.cgi
index afeae0c4..583a508f 100755
--- a/www/cgi/process_uploaded_expression_dataset.cgi
+++ b/www/cgi/process_uploaded_expression_dataset.cgi
@@ -37,9 +37,14 @@ sys.stdout = open(os.devnull, 'w')
 
 lib_path = Path(__file__).resolve().parents[2] / 'lib'
 sys.path.append(str(lib_path))
+import gear.seuratuploader as SeuratUploader
 import geardb
+from gear.primary_analysis import (
+    PrimaryAnalysisProcessingError,
+    add_primary_analysis_to_dataset,
+)
 from gear.spatialhandler import SPATIALTYPE2CLASS
-import gear.SeuratUploader as SeuratUploader
+from gear.utils import update_adata_with_ensembl_ids
 
 share_uid = None
 session_id = None
@@ -73,9 +78,8 @@ def main():
         result['message'] = 'User ID not found. Please log in to continue.'
         return result
 
-    # values are mex_3tab, excel, rdata, h5ad, rds, or spatial formats
-    # Removed rdata from list as rdata will be difficult to process efficiently
-    dataset_formats = ['mex_3tab', 'excel', 'h5ad', 'spatial','rds','rdata']
+    # values are mex_3tab, excel, h5ad, rds, or spatial formats
+    dataset_formats = ['mex_3tab', 'excel', 'h5ad', 'spatial','rds']
     dataset_upload_dir = Path(user_upload_file_base) / session_id / share_uid
 
     # quickly write the status so the page doesn't error out
@@ -107,9 +111,12 @@ def main():
         return
     with open(metadata_file, 'r') as f:
         metadata = json.load(f)
+        dataset_uid = metadata.get('dataset_uid', '')
+        dataset_type = metadata.get('dataset_type', '')
 
     # Update metadata for downstream uses
     metadata["dataset_format"] = dataset_format
+    metadata["perform_primary_analysis"] = True if dataset_type in ['single-cell-rnaseq', 'spatial'] else False
     with open(metadata_file, 'w') as f:
         json.dump(metadata, f, indent=4)
 
@@ -143,28 +150,39 @@ def main():
         # CHILD CONTINUES FROM HERE
 
     status['process_id'] = os.getpid()
-    
+
     # new child command
     if dataset_format == 'mex_3tab':
-        process_mex_3tab(dataset_upload_dir)
+        process_mex_3tab(dataset_upload_dir, metadata["perform_primary_analysis"])
     elif dataset_format == 'excel':
-        process_excel(dataset_upload_dir)
+        process_excel(dataset_upload_dir, metadata["perform_primary_analysis"])
     elif dataset_format == "h5ad":
-        process_h5ad(dataset_upload_dir)
-    elif dataset_format == 'rds' or dataset_format=='rdata':
-        process_seurat(dataset_upload_dir)
+        process_h5ad(dataset_upload_dir, metadata["perform_primary_analysis"])
+    elif dataset_format == 'rds':
+        process_seurat(dataset_upload_dir, metadata["perform_primary_analysis"])
     elif dataset_format == "spatial":
-        process_spatial(dataset_upload_dir, spatial_format)
+        process_spatial(dataset_upload_dir, spatial_format, metadata["perform_primary_analysis"])
     else:
         result["success"] = 0
         result["message"] = f"Unsupported dataset format: {dataset_format}"
         return result
 
+
+    if metadata["perform_primary_analysis"]:
+        try:
+            result["success"] = add_primary_analysis_to_dataset(dataset_uid, share_uid, dataset_upload_dir, dataset_format)
+        except PrimaryAnalysisProcessingError as e:
+            write_status(dataset_upload_dir, 'error', f"Error during primary analysis: {str(e)}")
+            return result
+
+    status["progress"] = 100
+    write_status(dataset_upload_dir, 'complete', "Dataset processed successfully.")
+
     result["success"] = 1
     result["message"] = "Dataset processed successfully."
     return result
 
-def process_h5ad(upload_dir: Path) -> None:
+def process_h5ad(upload_dir: Path, perform_primary_analysis: bool) -> None:
     """
     Processes an uploaded .h5ad (AnnData) file in the specified upload directory by performing the following steps:
     1. Reads the .h5ad file as an AnnData object.
@@ -182,15 +200,43 @@ def process_h5ad(upload_dir: Path) -> None:
     # If the file is an h5ad, it should be formatted as an AnnData object already.
     # But we still want to do some sanitization of the obs dataframe.
 
+    # TODO: Read in chunks to save memory
+
     write_status(upload_dir, 'processing', 'Initializing dataset processing.')
 
     filepath = upload_dir / f"{share_uid}.h5ad"
     adata = anndata.read_h5ad(filepath)
     obs = adata.obs
 
+    total_steps = 4 if perform_primary_analysis else 3
+    step_counter = 1
+    status["progress"] = int((step_counter / total_steps) * 100)
+    write_status(upload_dir, 'processing', 'Sanitizing AnnData object')
+
     categorize_observation_columns(obs)
     adata.obs = sanitize_obs_for_h5ad(obs)
 
+    if "gene_symbol" not in adata.var.columns:
+        # get organism_id by converting sample_taxid
+        metadata_file = upload_dir / 'metadata.json'
+        if not metadata_file.is_file():
+            write_status(upload_dir, 'error', "No metadata JSON file found.")
+            return
+
+        with open(metadata_file, 'r') as f:
+            metadata = json.load(f)
+        sample_taxid = metadata.get("sample_taxid", None)
+        organism_id=geardb.get_organism_id_by_taxon_id(sample_taxid)
+        if not organism_id:
+            write_status(upload_dir, 'error', "Could not determine organism ID from sample taxonomic ID.")
+            return
+
+        adata = update_adata_with_ensembl_ids(adata, organism_id, "UNMAPPED_")
+
+    step_counter += 1
+    status["progress"] = int((step_counter / total_steps) * 100)
+    write_status(upload_dir, 'processing', 'Writing sanitized data to new H5AD.')
+
     h5ad_path = upload_dir / f"{share_uid}.new.h5ad"
     adata.write(h5ad_path)
 
@@ -198,9 +244,14 @@ def process_h5ad(upload_dir: Path) -> None:
     filepath.unlink()  # remove original
     h5ad_path.rename(filepath)  # rename new to original name
 
-    write_status(upload_dir, 'complete', 'Dataset processed successfully.')
+    step_counter += 1
+    status["progress"] = int((step_counter / total_steps) * 100)
+    write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}")
+
+def process_seurat(upload_dir: Path, perform_primary_analysis: bool) -> None:
+    total_steps = 2 if perform_primary_analysis else 1
+    step_counter = 1
 
-def process_seurat(upload_dir: Path) -> None:
     # Take in an RDS file, convert to anndata, update the obs metadata based on reductions,
     # convert gene symbols to ensemble IDs, and write to an updated h5ad file.
     write_status(upload_dir, "processing", "Initializing dataset processing.")
@@ -228,14 +279,16 @@ def process_seurat(upload_dir: Path) -> None:
     metadata_file = upload_dir / 'metadata.json'
     if not metadata_file.is_file():
         write_status(upload_dir, 'error', "No metadata JSON file found.")
-
+        return
     # get organism_id by converting sample_taxid(needed for some but not all spatial handlers)
     with open(metadata_file, 'r') as f:
         metadata = json.load(f)
-    
+
     sample_taxid = metadata.get("sample_taxid", None)
     try:
         adata = SeuratUploader.genes_to_ensembl(adata,sample_taxid)
+        if adata is None:
+            raise Exception("genes_to_ensembl returned None")
     except Exception as e:
         write_status(upload_dir, 'error', f'Failed to convert genes to Ensembl: {str(e)}')
         return
@@ -248,16 +301,16 @@ def process_seurat(upload_dir: Path) -> None:
         # Replace the original file with the sanitized one
         seurat_filepath.unlink()
         Path(adata_filepath).unlink()
-        h5ad_path.rename(upload_dir / f"{share_uid}.h5ad")  
+        h5ad_path.rename(upload_dir / f"{share_uid}.h5ad")
     except Exception as e:
         write_status(upload_dir, 'error', f'Failed to write h5ad or during cleanup: {str(e)}')
         return
 
-    write_status(upload_dir, "complete", "Dataset processed successfully.")
-
+    step_counter += 1
+    status["progress"] = int((step_counter / total_steps) * 100)
+    write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}")
 
-
-def process_3tab(upload_dir: Path) -> None:
+def process_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None:
     import subprocess
 
     chunk_size = 500
@@ -315,9 +368,8 @@ def process_3tab(upload_dir: Path) -> None:
             expression_matrix.append(sparse.csr_matrix(chunk.values))
 
             status['progress'] = percentage
-            status['message'] = f"Processed {rows_read}/{total_rows} expression matrix chunks ..."
-            with open(upload_dir / "status.json", 'w') as f:
-                f.write(json.dumps(status))
+            message = f"Processed {rows_read}/{total_rows} expression matrix chunks ..."
+            write_status(upload_dir, 'processing', message)
 
         adata.X = sparse.vstack(expression_matrix) # type: ignore
     except Exception:
@@ -349,9 +401,8 @@ def process_3tab(upload_dir: Path) -> None:
                 percentage = int((rows_read / total_rows) * 100)
 
                 status['progress'] = percentage
-                status['message'] = f"Processed {rows_read}/{total_rows} expression matrix chunks ..."
-                with open(upload_dir / "status.json", 'w') as f:
-                    f.write(json.dumps(status))
+                message = f"Processed {rows_read}/{total_rows} expression matrix chunks ..."
+                write_status(upload_dir, 'processing', message)
 
             except Exception:
                 #print(f"\nError in chunk {chunk_index}: {inner_e}")
@@ -378,9 +429,10 @@ def process_3tab(upload_dir: Path) -> None:
     h5ad_path = upload_dir / f"{share_uid}.h5ad"
     adata.write(h5ad_path)
 
-    write_status(upload_dir, 'complete', 'Dataset processed successfully.')
+    # Progress is accounted for in chunk processing
+    write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}")
 
-def process_excel(upload_dir: Path) -> None:
+def process_excel(upload_dir: Path, perform_primary_analysis: bool) -> None:
     filepath = upload_dir / f"{share_uid}.xlsx"
 
     write_status(upload_dir, 'processing', 'Initializing dataset processing.')
@@ -460,12 +512,14 @@ def process_excel(upload_dir: Path) -> None:
     h5ad_path = upload_dir / f"{share_uid}.h5ad"
     adata.write(h5ad_path)
 
-    write_status(upload_dir, 'complete', 'Dataset processed successfully.')
+    total_steps = 2 if perform_primary_analysis else 1
+    status["progress"] = int((1 / total_steps) * 100)
+    write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}")
 
-def process_mex(upload_dir: Path) -> None:
+def process_mex(upload_dir: Path, perform_primary_analysis: bool) -> None:
     pass
 
-def process_mex_3tab(upload_dir: Path) -> None:
+def process_mex_3tab(upload_dir: Path, perform_primary_analysis: bool) -> None:
     # Extract the file
     import tarfile
     compression_format = None
@@ -544,11 +598,11 @@ def process_mex_3tab(upload_dir: Path) -> None:
 
     # Call the appropriate function
     if dataset_type == 'threetab':
-        process_3tab(upload_dir)
+        process_3tab(upload_dir, perform_primary_analysis)
     elif dataset_type == 'mex':
-        process_mex(upload_dir)
+        process_mex(upload_dir, perform_primary_analysis)
 
-def process_spatial(upload_dir: Path, spatial_format: str) -> None:
+def process_spatial(upload_dir: Path, spatial_format: str, perform_primary_analysis: bool) -> None:
     """
     Processes a spatial transcriptomics dataset uploaded to a specified directory.
 
@@ -565,6 +619,9 @@ def process_spatial(upload_dir: Path, spatial_format: str) -> None:
     Raises:
         Writes error status if the metadata file is missing or if reading/converting the spatial file fails.
     """
+
+    write_status(upload_dir, 'processing', 'Initializing dataset processing.')
+
     spatial_obj = SPATIALTYPE2CLASS[spatial_format]()   # instantiate the appropriate handler class
     metadata_file = upload_dir / 'metadata.json'
     if not metadata_file.is_file():
@@ -594,9 +651,16 @@ def process_spatial(upload_dir: Path, spatial_format: str) -> None:
         import shutil
         shutil.rmtree(output_path)
 
+    total_steps = 3 if perform_primary_analysis else 2
+    step_counter = 1
+    status["progress"] = int((step_counter / total_steps) * 100)
+
     write_status(upload_dir, 'processing', 'Writing Zarr store')
     spatial_obj.write_to_zarr(filepath=output_path)
-    write_status(upload_dir, 'complete', 'Dataset processed successfully.')
+
+    step_counter += 1
+    status["progress"] = int((step_counter / total_steps) * 100)
+    write_status(upload_dir, 'processing', f"Finished processing spatial dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}")
 
 def sanitize_obs_for_h5ad(obs_df: pd.DataFrame) -> pd.DataFrame:
     for col in obs_df.columns:
diff --git a/www/upload_dataset.html b/www/upload_dataset.html
index 39d1e3f7..f17a0077 100644
--- a/www/upload_dataset.html
+++ b/www/upload_dataset.html
@@ -692,7 +692,7 @@ <h3>RDS / Seurat</h3>
                           </li>
                         </ul>
                         <div class="control">
-                          <button data-format="rdata" class="button is-primary is-fullwidth format-selector">
+                          <button data-format="rds" class="button is-primary is-fullwidth format-selector">
                             <span class="icon">
                               <i class="mdi mdi-cancel"></i>
                             </span>

From 9b6b51feed501133550bcaa52d1016ec70aa1ebf Mon Sep 17 00:00:00 2001
From: Shaun Adkins <sadkins@som.umaryland.edu>
Date: Wed, 3 Jun 2026 15:01:39 -0400
Subject: [PATCH 15/21] Rename SeuratUploader.py to seuratuploader.py

---
 lib/gear/{SeuratUploader.py => seuratuploader.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename lib/gear/{SeuratUploader.py => seuratuploader.py} (100%)

diff --git a/lib/gear/SeuratUploader.py b/lib/gear/seuratuploader.py
similarity index 100%
rename from lib/gear/SeuratUploader.py
rename to lib/gear/seuratuploader.py

From a1c923e228b1cde497b74a4655cd069bfd7a53ab Mon Sep 17 00:00:00 2001
From: adkinsrs <sadkins@som.umaryland.edu>
Date: Wed, 3 Jun 2026 15:19:52 -0400
Subject: [PATCH 16/21] Enhance error handling and parameter retrieval in
 dataset processing scripts

- Updated exception handling to specify Exception in r_package_importer and seurat_to_anndata functions.
- Added RDS file migration support in finalize_uploaded_expression_dataset.cgi.
- Changed form parameter retrieval method to getfirst in process_uploaded_expression_dataset.cgi and store_expression_dataset.cgi for consistency.
- Added validation for RDS file extension in store_expression_dataset.cgi.
---
 lib/gear/seuratuploader.py                       |  4 ++--
 www/cgi/finalize_uploaded_expression_dataset.cgi | 12 ++++++++++++
 www/cgi/process_uploaded_expression_dataset.cgi  |  8 ++++----
 www/cgi/store_expression_dataset.cgi             | 13 +++++++++----
 4 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/lib/gear/seuratuploader.py b/lib/gear/seuratuploader.py
index 050f665f..8d7872df 100755
--- a/lib/gear/seuratuploader.py
+++ b/lib/gear/seuratuploader.py
@@ -52,7 +52,7 @@ def r_package_importer(package_name:str):
     try:
         pkg = importr(package_name)
         return pkg
-    except:
+    except Exception:
         importErrorMessage += f"{package_name} not installed or can not be imported"
         sys.exit(importErrorMessage)
 
@@ -87,7 +87,7 @@ def seurat_to_anndata(file_path: str, share_name: str, output_dir: str = "."):
         ro.r(f'write_h5ad(adata, "{output_path}")')
         return output_path
     # In cases where the write fails we will assume the h5ad already exists
-    except:
+    except Exception:
         print(f"h5ad name already exists {output_path}")
         return False
 
diff --git a/www/cgi/finalize_uploaded_expression_dataset.cgi b/www/cgi/finalize_uploaded_expression_dataset.cgi
index ca52ac21..a7694e8b 100755
--- a/www/cgi/finalize_uploaded_expression_dataset.cgi
+++ b/www/cgi/finalize_uploaded_expression_dataset.cgi
@@ -256,6 +256,18 @@ def main() -> dict:
             result['message'] = 'Error migrating Excel file: {}'.format(str(e))
             return result
 
+    elif dataset_format == 'rds':
+        # migrate the RDS file
+        rds_file = dataset_upload_dir / f'{share_uid}.rds'
+        rds_dest = dataset_final_dir / f'{dataset_id}.rds'
+
+        try:
+            shutil.move(rds_file, rds_dest)
+            result['userdata_migrated'] = 1
+        except Exception as e:
+            result['message'] = 'Error migrating RDS file: {}'.format(str(e))
+            return result
+
     elif dataset_format == "spatial":
         # migrate the spatial tarball
         spatial_src = dataset_upload_dir / f'{share_uid}.tar.gz'
diff --git a/www/cgi/process_uploaded_expression_dataset.cgi b/www/cgi/process_uploaded_expression_dataset.cgi
index 583a508f..5769d5ae 100755
--- a/www/cgi/process_uploaded_expression_dataset.cgi
+++ b/www/cgi/process_uploaded_expression_dataset.cgi
@@ -64,10 +64,10 @@ def main():
     global session_id
 
     form = cgi.FieldStorage()
-    share_uid = form.getvalue('share_uid')
-    session_id = form.getvalue('session_id')
-    dataset_format = form.getvalue('dataset_format')
-    spatial_format = form.getvalue('spatial_format')  # may be None
+    share_uid = form.getfirst('share_uid')
+    session_id = form.getfirst('session_id')
+    dataset_format = form.getfirst('dataset_format')
+    spatial_format = form.getfirst('spatial_format')  # may be None
 
     if share_uid is None or session_id is None or dataset_format is None:
         result['message'] = 'Missing one or more required parameters.'
diff --git a/www/cgi/store_expression_dataset.cgi b/www/cgi/store_expression_dataset.cgi
index 54bb1f26..e16b79fb 100755
--- a/www/cgi/store_expression_dataset.cgi
+++ b/www/cgi/store_expression_dataset.cgi
@@ -19,10 +19,10 @@ import geardb
 def main():
     print('Content-Type: application/json\n\n')
     form = cgi.FieldStorage()
-    session_id = form.getvalue('session_id')
-    share_uid = form.getvalue('share_uid')
-    dataset_format = form.getvalue('dataset_format')
-    spatial_format = form.getvalue('spatial_format')  # may be None
+    session_id = form.getfirst('session_id')
+    share_uid = form.getfirst('share_uid')
+    dataset_format = form.getfirst('dataset_format')
+    spatial_format = form.getfirst('spatial_format')  # may be None
 
     if not share_uid: # should never happen
         error_msg = f"Unexpected missing share_uid in store_expression_dataset.cgi. session_id={session_id!r}"
@@ -66,6 +66,11 @@ def main():
             result['message'] = 'Invalid file extension for H5AD format. Expected .h5ad'
             return result
 
+    if dataset_format == "rds":
+        if not filename.lower().endswith('rds'):
+            result['message'] = 'Invalid file extension for RDS format. Expected .rds'
+            return result
+
     if dataset_format == 'spatial':
         if not filename.endswith('tar.gz'):
             result['message'] = 'Invalid file extension for Spatial format. Expected .tar.gz'

From 556876143787d3655e52ff1c996dd818cdb13991 Mon Sep 17 00:00:00 2001
From: adkinsrs <sadkins@som.umaryland.edu>
Date: Wed, 3 Jun 2026 15:31:54 -0400
Subject: [PATCH 17/21] Fix file extension check for RDS format in
 store_expression_dataset.cgi

---
 www/cgi/store_expression_dataset.cgi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/www/cgi/store_expression_dataset.cgi b/www/cgi/store_expression_dataset.cgi
index e16b79fb..16bd0b7c 100755
--- a/www/cgi/store_expression_dataset.cgi
+++ b/www/cgi/store_expression_dataset.cgi
@@ -67,7 +67,7 @@ def main():
             return result
 
     if dataset_format == "rds":
-        if not filename.lower().endswith('rds'):
+        if not filename.endswith('rds'):
             result['message'] = 'Invalid file extension for RDS format. Expected .rds'
             return result
 

From ee9ef8c93df5ee8d2fb069530cb0008a59e0d6a9 Mon Sep 17 00:00:00 2001
From: adkinsrs <sadkins@som.umaryland.edu>
Date: Thu, 4 Jun 2026 08:12:57 -0400
Subject: [PATCH 18/21] Update Docker output descriptions to include "latest"
 tag for images

---
 docs/developer/setup/docker.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/docs/developer/setup/docker.md b/docs/developer/setup/docker.md
index 281f176f..d307a053 100644
--- a/docs/developer/setup/docker.md
+++ b/docs/developer/setup/docker.md
@@ -66,7 +66,7 @@ This file is dedicated entirely to compiling Python 3.x and installing requireme
 
 **RPy2**: The "rpy2" package is actually built in the final Docker (umgear) image, due to some dependencies on R.
 
-**The output**: This is currently built and pushed as adkinsrs/gear-python-base:YYYY-MM-DD
+**The output**: This is currently built and pushed as adkinsrs/gear-python-base:YYYY-MM-DD and also tagged with the "latest" tag.
 
 #### Dockerfile.r (The R Base)
 
@@ -74,15 +74,17 @@ This file is dedicated entirely to compiling R and running your Bioconductor scr
 
 **When you build it**: Almost never. Only touch this if the team specifically requests a new version of Bioconductor or a brand-new R system library.
 
-**The output**: This is currently built and pushed as adkinsrs/gear-r-base:YYYY-MM-DD
+**The output**: This is currently built and pushed as adkinsrs/gear-r-base:YYYY-MM-DD and also tagged with the "latest" tag.
 
 #### Dockerfile (The Final App)
 
 This is your main daily-driver file. It starts with a clean Ubuntu image, uses COPY --from=... to pull in the pre-compiled folders from your registry, installs Apache, and copies over your Flask API and HTML/JS files.
 
+Currently the inherited R and Python images are set to use the "latest" tag, as most of the time we want the most up-to-date version. If for some reason you need an earlier version, edit the Dockerfile to use one of the existing YYYY-MM-DD tags stored in Docker Hub.
+
 **When you build it**: Every time you update the website, tweak the Apache configuration, or change a CGI script.  Anything gEAR-code related, basically.
 
-**The output**: This builds in seconds and becomes your final production image.  This is pushed as adkinsrs/umgear:YYYY-MM-DD
+**The output**: This builds in seconds and becomes your final production image.  This is pushed as adkinsrs/umgear:YYYY-MM-DD and also tagged with the "latest" tag.
 
 ## Starting the stack
 

From df93f5de40c19e6d632df31e6ef44bcefdcfaa4c Mon Sep 17 00:00:00 2001
From: adkinsrs <sadkins@som.umaryland.edu>
Date: Thu, 4 Jun 2026 08:18:37 -0400
Subject: [PATCH 19/21] Clarify Docker image tag description in setup
 documentation

---
 docs/developer/setup/docker.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/developer/setup/docker.md b/docs/developer/setup/docker.md
index d307a053..87efba02 100644
--- a/docs/developer/setup/docker.md
+++ b/docs/developer/setup/docker.md
@@ -80,7 +80,7 @@ This file is dedicated entirely to compiling R and running your Bioconductor scr
 
 This is your main daily-driver file. It starts with a clean Ubuntu image, uses COPY --from=... to pull in the pre-compiled folders from your registry, installs Apache, and copies over your Flask API and HTML/JS files.
 
-Currently the inherited R and Python images are set to use the "latest" tag, as most of the time we want the most up-to-date version. If for some reason you need an earlier version, edit the Dockerfile to use one of the existing YYYY-MM-DD tags stored in Docker Hub.
+Currently the inherited R and Python images are set to use the "latest" tag of a locally built image, as most of the time we want the most up-to-date version. If for some reason you need an earlier version, edit the Dockerfile to use one of the existing `adkinsrs/<image>:YYYY-MM-DD` tags stored in Docker Hub.
 
 **When you build it**: Every time you update the website, tweak the Apache configuration, or change a CGI script.  Anything gEAR-code related, basically.
 

From e450944cdae258696dfbdbe0ccdff6fa384c0ed6 Mon Sep 17 00:00:00 2001
From: adkinsrs <sadkins@som.umaryland.edu>
Date: Thu, 4 Jun 2026 10:24:43 -0400
Subject: [PATCH 20/21] Fix font color assignment in
 update_stacked_violin_annotations to default to black

---
 lib/gear/mg_plotting.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/gear/mg_plotting.py b/lib/gear/mg_plotting.py
index 75c36fd1..eae1556b 100755
--- a/lib/gear/mg_plotting.py
+++ b/lib/gear/mg_plotting.py
@@ -1185,7 +1185,7 @@ def update_stacked_violin_annotations(fig, primary_groups, color_map):
         # Am attempting to do this based on the assumption that row facet titles will never have yanchor of bottom
         # (or y-pos of 1) or have certain text shared with the axes titles
         lambda a: a.update(
-            font=dict(color=color_map[a.text])
+            font=dict(color=color_map.get(a.text, "black"))
             , textangle=0
             , x=0
             , xanchor="right"

From 4d59376b2d1eb422c39c5240da674b16c5213365 Mon Sep 17 00:00:00 2001
From: adkinsrs <sadkins@som.umaryland.edu>
Date: Thu, 4 Jun 2026 11:45:03 -0400
Subject: [PATCH 21/21] Refactor error handling in SeuratUploader and update
 progress tracking in process_seurat function

---
 lib/gear/seuratuploader.py                    | 12 +++++---
 .../process_uploaded_expression_dataset.cgi   | 29 ++++++++++++++++---
 www/upload_dataset.html                       |  2 +-
 3 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/lib/gear/seuratuploader.py b/lib/gear/seuratuploader.py
index 8d7872df..a8ca468e 100755
--- a/lib/gear/seuratuploader.py
+++ b/lib/gear/seuratuploader.py
@@ -89,8 +89,7 @@ def seurat_to_anndata(file_path: str, share_name: str, output_dir: str = "."):
     # In cases where the write fails we will assume the h5ad already exists
     except Exception:
         print(f"h5ad name already exists {output_path}")
-        return False
-
+        raise
 
 def openh5ad(h5ad_name):
     """Just open the supplied h5ad file"""
@@ -103,8 +102,13 @@ def genes_to_ensembl(adata, taxid=None):
     if taxid is None:
         return None
     genes = adata.var.index.tolist()
-    mg = mygene.MyGeneInfo()
-    mg_genes = mg.querymany(genes, scopes="symbol", fields="ensembl.gene", species=f"{taxid}")
+    try:
+        # TODO: Perhaps add a retry mechanism in case the API returns 500
+        mg = mygene.MyGeneInfo()
+        mg_genes = mg.querymany(genes, scopes="symbol", fields="ensembl.gene", species=f"{taxid}")
+    except Exception as e:
+        print(f"Error occurred while querying MyGene: {e}", file=sys.stderr)
+        raise
     ensembl_mapping_dict = {}
     for mg_gene in mg_genes:
         gene_name = mg_gene['query']
diff --git a/www/cgi/process_uploaded_expression_dataset.cgi b/www/cgi/process_uploaded_expression_dataset.cgi
index 5769d5ae..aec21e16 100755
--- a/www/cgi/process_uploaded_expression_dataset.cgi
+++ b/www/cgi/process_uploaded_expression_dataset.cgi
@@ -167,6 +167,10 @@ def main():
         result["message"] = f"Unsupported dataset format: {dataset_format}"
         return result
 
+    # If the format-specific processing step reported an error, skip primary analysis and return now so the user sees the original error message.
+    if status.get("status", "error") == "error":
+        result["success"] = 0
+        return result
 
     if metadata["perform_primary_analysis"]:
         try:
@@ -249,8 +253,9 @@ def process_h5ad(upload_dir: Path, perform_primary_analysis: bool) -> None:
     write_status(upload_dir, 'processing', f"Finished processing dataset. {'Performing primary analysis...' if perform_primary_analysis else ''}")\
 
 def process_seurat(upload_dir: Path, perform_primary_analysis: bool) -> None:
-    total_steps = 2 if perform_primary_analysis else 1
+    total_steps = 7 if perform_primary_analysis else 6
     step_counter = 1
+    status["progress"] = int((step_counter / total_steps) * 100)
 
     # Take in an RDS file, convert to anndata, update the obs metadata based on reductions,
     # convert gene symbols to ensemble IDs, and write to an updated h5ad file.
@@ -258,23 +263,31 @@ def process_seurat(upload_dir: Path, perform_primary_analysis: bool) -> None:
     seurat_filepath = upload_dir / f"{share_uid}.rds"
 
     # seurat to anndata uses rpy2 to convert the RDS to anndata
+    # the returned filepath's name is prefixed with "tmp_"
     adata_filepath = SeuratUploader.seurat_to_anndata(str(seurat_filepath), share_uid, str(upload_dir))
     if not adata_filepath:
         write_status(upload_dir, 'error', 'Failed to convert RDS to h5ad.')
         return
+
+    step_counter += 1
+    status["progress"] = int((step_counter / total_steps) * 100)
+    write_status(upload_dir, 'processing', 'Reading converted h5ad file.')
     try:
         adata = anndata.read_h5ad(adata_filepath)
     except Exception as e:
         write_status(upload_dir, 'error', f'Failed to read h5ad: {str(e)}')
         return
 
-
     # Update obs metadata based on reductions
+    step_counter += 1
+    status["progress"] = int((step_counter / total_steps) * 100)
+    write_status(upload_dir, 'processing', 'Updating metadata from reductions.')
     try:
         adata = SeuratUploader.reduction_to_metadata(adata)
     except Exception as e:
         write_status(upload_dir, 'error', f'Failed to update Reductions to metadata: {str(e)}')
         return
+
     # Convert gene symbols to ensemble IDs
     metadata_file = upload_dir / 'metadata.json'
     if not metadata_file.is_file():
@@ -284,22 +297,30 @@ def process_seurat(upload_dir: Path, perform_primary_analysis: bool) -> None:
     with open(metadata_file, 'r') as f:
         metadata = json.load(f)
 
+    step_counter += 1
+    status["progress"] = int((step_counter / total_steps) * 100)
+    write_status(upload_dir, 'processing', 'Converting gene symbols to Ensembl IDs.')
     sample_taxid = metadata.get("sample_taxid", None)
     try:
-        adata = SeuratUploader.genes_to_ensembl(adata,sample_taxid)
+        adata = SeuratUploader.genes_to_ensembl(adata, sample_taxid)
         if adata is None:
             raise Exception("genes_to_ensembl returned None")
     except Exception as e:
         write_status(upload_dir, 'error', f'Failed to convert genes to Ensembl: {str(e)}')
         return
+
+    step_counter += 1
+    status["progress"] = int((step_counter / total_steps) * 100)
+    write_status(upload_dir, 'processing', 'Writing final h5ad file.')
     if adata.X is None:
+        # TODO: This is currently not an option in the UI, but was suggested to be one by @jorvis
         adata = SeuratUploader.layer_to_X(adata, layer_name='data')
     h5ad_path = upload_dir / f"{share_uid}.new.h5ad"
     try:
         adata.write(h5ad_path)
 
         # Replace the original file with the sanitized one
-        seurat_filepath.unlink()
+        #seurat_filepath.unlink()
         Path(adata_filepath).unlink()
         h5ad_path.rename(upload_dir / f"{share_uid}.h5ad")
     except Exception as e:
diff --git a/www/upload_dataset.html b/www/upload_dataset.html
index f17a0077..a34a0a0d 100644
--- a/www/upload_dataset.html
+++ b/www/upload_dataset.html
@@ -694,7 +694,7 @@ <h3>RDS / Seurat</h3>
                         <div class="control">
                           <button data-format="rds" class="button is-primary is-fullwidth format-selector">
                             <span class="icon">
-                              <i class="mdi mdi-cancel"></i>
+                              <i class="mdi mdi-checkbox-blank-outline"></i>
                             </span>
                             <span class="format-status">Choose</span>
                           </button>