PMBio · bfclarke · Apr 20, 2026 · Apr 20, 2026 · Apr 20, 2026 · Apr 24, 2026
diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py
@@ -2215,16 +2215,14 @@ def merge_absplice_scores(
     if verbose:
         logger.info(f"Loading and transforming {genes}...")
 
-    con.execute(
-        f"""
+    con.execute(f"""
         CREATE TEMP TABLE genes_mod AS 
         SELECT 
             id AS gene_id,
             split_part(gene, '.', 1) AS gene,
             split_part(gene, '.', 2) AS exon
         FROM read_parquet('{genes}');
-    """
-    )
+    """)
 
     if verbose:
         logger.info(f"Loading {annotations}...")
@@ -2237,60 +2235,49 @@ def merge_absplice_scores(
     exclude_columns = {"chrom", "pos", "ref", "alt", "AbSplice_DNA"}
     selected_columns = [col for col in all_columns if col not in exclude_columns]
     select_clause = ", ".join(selected_columns)
-    con.execute(
-        f"""
+    con.execute(f"""
         CREATE TEMP TABLE annotations AS 
         SELECT {select_clause}
         FROM read_parquet('{annotations}');
-    """
-    )
+    """)
 
     if verbose:
         logger.info("Merging genes_mod with annotations...")
-    con.execute(
-        """
+    con.execute("""
         CREATE TEMP TABLE merged1 AS
         SELECT a.*, g.gene, g.exon
         FROM annotations a
         LEFT JOIN genes_mod g
         ON a.gene_id = g.gene_id;
-    """
-    )
+    """)
 
     if verbose:
         logger.info(f"Loading {variants}...")
-    con.execute(
-        f"""
+    con.execute(f"""
         CREATE TEMP TABLE variants AS 
         SELECT * FROM read_parquet('{variants}');
-    """
-    )
+    """)
 
     if verbose:
         logger.info("Merging merged1 with variants...")
-    con.execute(
-        """
+    con.execute("""
         CREATE TEMP TABLE merged2 AS
         SELECT m1.*, v.chrom, v.pos, v.ref, v.alt
         FROM merged1 m1
         LEFT JOIN variants v
         ON m1.id = v.id;
-    """
-    )
+    """)
 
     if verbose:
         logger.info(f"Loading {ab_splice_agg_score_file}...")
-    con.execute(
-        f"""
+    con.execute(f"""
         CREATE TEMP TABLE splice AS 
         SELECT * FROM read_parquet('{ab_splice_agg_score_file}');
-    """
-    )
+    """)
 
     if verbose:
         logger.info("Merging merged2 with absplice scores...")
-    con.execute(
-        """
+    con.execute("""
         CREATE TEMP TABLE final_merge AS
         SELECT m2.*, s.AbSplice_DNA
         FROM merged2 m2
@@ -2300,26 +2287,21 @@ def merge_absplice_scores(
             AND m2.ref = s.ref
             AND m2.alt = s.alt
             AND m2.gene = s.gene_id;
-    """
-    )
+    """)
 
     if fill_nan:
         if verbose:
             print(f"Filling NULL values in splice column with 0...")
-        con.execute(
-            f"""
+        con.execute(f"""
             UPDATE final_merge
             SET AbSplice_DNA = 0
             WHERE AbSplice_DNA IS NULL;
-        """
-        )
+        """)
 
     if verbose:
         logger.info(f"Saving final merged table to {output}...")
-    con.execute(
-        f"""
+    con.execute(f"""
         COPY final_merge TO '{output}' (FORMAT 'parquet');
-    """
-    )
+    """)
 
     logger.info(f"Successfully saved final merged table to {output}")
diff --git a/deeprvat/deeprvat/common_variant_condition_utils.py b/deeprvat/deeprvat/common_variant_condition_utils.py
@@ -15,7 +15,6 @@
 import shutil
 import os
 
-
 compression_level = 1
 
 logging.basicConfig(

diff --git a/deeprvat/seed_gene_discovery/evaluate.py b/deeprvat/seed_gene_discovery/evaluate.py
@@ -44,10 +44,8 @@ def evaluate_(associations: Dict[str, pd.DataFrame], alpha: float):
             logger.info("Only using genes with EAC > 50")
             result = result.query("EAC > 50")
             if result["pval"].isna().sum() > 0:
-                logger.info(
-                    "Attention: still NA pvals after EAC filtering\
-                            Removing remaining NA pvals"
-                )
+                logger.info("Attention: still NA pvals after EAC filtering\
+                            Removing remaining NA pvals")
                 result = result.dropna(subset=["pval"])
             corrected_result = pval_correction(
                 result, alpha, correction_type=correction_type

diff --git a/deeprvat/seed_gene_discovery/seed_gene_discovery.py b/deeprvat/seed_gene_discovery/seed_gene_discovery.py
@@ -557,10 +557,8 @@ def make_dataset_(
         batch_size = len(dataset)
 
     if "batch_size" in data_config["dataloader_config"].keys():
-        raise ValueError(
-            """You can't specify the batch_size in the 
-                         dataloader config for the seed gene discovery"""
-        )
+        raise ValueError("""You can't specify the batch_size in the 
+                         dataloader config for the seed gene discovery""")
 
     logger.info(f"Read dataset, batch size {batch_size}")
     dataloader = DataLoader(

diff --git a/deeprvat/utils.py b/deeprvat/utils.py
@@ -14,7 +14,6 @@
 from sklearn.preprocessing import quantile_transform
 from statsmodels.stats.multitest import fdrcorrection
 
-
 logging.basicConfig(
     format="[%(asctime)s] %(levelname)s:%(name)s: %(message)s",
     level=logging.INFO,

diff --git a/docs/index.rst b/docs/index.rst
@@ -8,7 +8,7 @@ Welcome to DeepRVAT's documentation!
 
 Rare variant association testing using deep learning and data-driven gene impairment scores.
 
-_Coming soon:_ Overview of the DeepRVAT methodaster
+_Coming soon:_ Overview of the DeepRVAT method
 
 
 How to use this documentation

diff --git a/docs/pretrained_models.md b/docs/pretrained_models.md
@@ -2,10 +2,23 @@
 
 For using the pretrained DeepRVAT model provided as part of the package, or a custom pretrained model, we have setup pipelines for running only the association testing stage. This includes creating the association dataset files, computing gene impairment scores, regression, and evaluation. 
 
+_**Important note:**_ DeepRVAT currently supports association testing using REGENIE or SEAK. Because REGENIE is actively maintained and gives better control for ancestry effects, as well as other aspects of genetic background, association testing with SEAK is _deprecated_ and will be removed in a future version. Note especially that for binary phenotypes, REGENIE _must_ be used.
 
-## Configuration and input files
 
-Configuration parameters must be specified in `deeprvat_input_pretrained_models_config.yaml`, see [example file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_input_pretrained_models_config.yaml). For details on the meanings of the parameters and the format of input files, see [here](input_data).
+## Association testing with REGENIE
+
+For specifics on input files and options for REGENIE steps 1 and 2, please refer to the [REGENIE documentation](https://rgcgithub.github.io/regenie/).
+
+### Configuration and input files
+
+Configuration parameters must be specified in `deeprvat_input_pretrained_models_config.yaml` (rename the example file). For details on the meanings of the parameters and the format of input files, see [here](input_data).
+
+For an example, refer to [this configuration file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_input_config_regenie.yaml) (rename or link it to `deeprvat_input_pretrained_models_config.yaml` in your working directory), which includes REGENIE-specific parameters for steps 1 and 2. Importantly, specify
+```
+regenie_options:
+  regenie_exp: True
+  gtf_file: path/to/annotation_file.gtf.gz
+```
 
 To use pretrained models, you must specify `use_pretrained_models: True` in your `deeprvat_input_pretrained_models_config.yaml` configuration file. Additionally, provide the path to pretrained models (an output of the training pipeline) in the parameter `pretrained_model_path`. Within the `pretrained_model_path` directory, there must be a `config.yaml` file in that directory with the following set of specified keys that were used for training the pretrained models; `rare_variant_annotations`, `training_data_thresholds`, and `model` . See [example file](https://github.com/PMBio/deeprvat/blob/main/pretrained_models/model_config.yaml).
 
@@ -35,59 +48,23 @@ cv_options (optional)
 
 Note that the file specified by `annotation_filename` must contain a column corresponding to each annotation in the list `rare_variant_annotations` from `deeprvat/pretrained_models/model_config.yaml`. 
 
-
-## Executing the pipeline
-
-```
-snakemake -j 1 --snakefile [path_to_deeprvat]/pipelines/association_testing_pretrained.snakefile
-```
-
-Replace `[path_to_deeprvat]` with the path to your copy of the DeepRVAT repository.
-
-
-## Running the association testing pipeline with REGENIE
-
-_Coming soon_
-
-<!---
-
-#### Input data
-For running with REGENIE, in addition to the default input data, the following REGENIE specific files should also be included in your `experiment` directory:
-
-
-To run REGENIE Step 1
-- `.sample` Inclusion file that lists individuals to retain in the analysis
-- `.sniplist` Inclusion file that lists IDs of variants to keep
-- `.bgen` input genetic data file
-- `.bgen.bgi` index bgi file corresponding to input BGEN file
-
-For these REGENIE specific files, please refer to the [REGENIE documentation](https://rgcgithub.github.io/regenie/).
-
-For running REGENIE Step 2:
-- `gtf file` gencode gene annotation gtf file 
-- `keep_samples.txt` (optional file of samples to include)
-- `protein_coding_genes.parquet`
-
-#### Config file
-
-Use the `[path_to_deeprvat]/example/config_regenie.yaml` as `config.yaml` which includes REGENIE specific parameters. 
-You can set any parameter explained in the [REGENIE documentation](https://rgcgithub.github.io/regenie/) via this config.
-Most importantly, for association testing of binary traits use:
+You can set any parameter explained in the [REGENIE documentation](https://rgcgithub.github.io/regenie/) via the config.
+Most importantly, for association testing of binary traits use `--bt` in Step 2. For imbalanced traits, we also recommend Firth logistic regression:
 ```
 step_2:
         options:
             - "--bt"
             - "--firth --approx --pThresh 0.01"
 
 ```
-and for quantitative traits:
+For quantitative traits:
 ```
 step_2:
         options:
             - "--qt"
 ```
 
-#### Run REGENIE
+### Run REGENIE
 
 
 ```
@@ -97,24 +74,37 @@ snakemake -j 1 --snakefile [path_to_deeprvat]/pipelines/association_testing_pret
 ```
 
 
-#### Testing multiple sub-chohorts
-For testing multiple sub-cohorts, remember that REGENIE Step 1 (compute intense) only needs to be executed once per sample and phenotype. We suggest running REGENIE Step 1 on all samples and phenotypes initially and then linking the output as `regenie_output/step1/` in each experiment directory for testing a sub-cohort.
+<!-- ### Testing multiple sub-cohorts -->
 
-Samples to be considered when testing sub-cohorts can be provided via `keep_samples.txt` which look like 
+<!-- For testing multiple sub-cohorts, remember that REGENIE Step 1 (compute intense) only needs to be executed once per sample and phenotype. We suggest running REGENIE Step 1 on all samples and phenotypes initially and then linking the output as `regenie_output/step1/` in each experiment directory for testing a sub-cohort. -->
 
-``` 
-12345 12345
-56789 56789
-````
-for keeping two samples with ids `12345` and `56789`
+<!-- Samples to be considered when testing sub-cohorts can be provided via `keep_samples.txt` which look like  -->
 
-### Running the association testing pipeline with SEAK
+<!-- ```  -->
+<!-- 12345 12345 -->
+<!-- 56789 56789 -->
+<!-- ```` -->
+<!-- for keeping two samples with ids `12345` and `56789` -->
 
-```shell
-cd experiment
-ln -s [path_to_deeprvat]/pretrained_models
+## Association testing with SEAK
+
+### Configuration file
+
+Follow the instructions as above, but specify
+```
+regenie_options:
+  regenie_exp: False
+```
+
+For an example, see [this configuration file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_input_pretrained_models_config.yaml)
+
+### Executing the pipeline
+
+```
 snakemake -j 1 --snakefile [path_to_deeprvat]/pipelines/association_testing_pretrained.snakefile
 ```
 
---->
+Replace `[path_to_deeprvat]` with the path to your copy of the DeepRVAT repository.
+
+
 
diff --git a/example/config/deeprvat_input_pretrained_models_config.yaml b/example/config/deeprvat_input_pretrained_models_config.yaml
@@ -40,13 +40,14 @@ covariates: #x_phenotypes
   - genetic_PC_19
   - genetic_PC_20
 
+# Filters for variants to be included in association testing
+# The column names used here must be in annotation.parquet
 association_testing_data_thresholds:
   MAF: "< 1e-3"
   CADD_PHRED: "> 5"
 
 #DeepRVAT model settings
 n_repeats: 30
-y_transformation: quantile_transform
 
 # Results evaluation settings
 evaluation:
@@ -57,24 +58,18 @@ evaluation:
 #sample_files:
 #  association_testing: association_testing_samples.pkl
 
-#Additional settings if using the CV pipeline
-cv_options:
-  cv_exp: False
-  #cv_path: sample_files
-  #n_folds: 5
-
 #Additional settings if using the REGENIE integration
 regenie_options:
-  regenie_exp: False
-  # gtf_file: gencode.v38.basic.annotation.gtf.gz
-  # step_1:
-  #     bgen: imputation.bgen
-  #     snplist: imputation.snplist
-  #     bsize: 1000
-  #     options:
-  #         - "--sample imputation.sample"
-  #         - "--qt"
-  # step_2:
-  #     bsize: 400
-  #     options:
-  #         - "--qt"
+  regenie_exp: True
+  gtf_file: gencode.v38.basic.annotation.gtf.gz
+  step_1:
+      bgen: imputation.bgen
+      snplist: imputation.snplist
+      bsize: 1000
+      options:
+          - "--sample imputation.sample"
+          - "--qt"
+  step_2:
+      bsize: 400
+      options:
+          - "--qt"
diff --git a/pipelines/resources/spliceai.py b/pipelines/resources/spliceai.py
@@ -1,7 +1,6 @@
 from spliceai_rocksdb.spliceAI import SpliceAI
 import snakemake
 
-
 if snakemake.params["lookup_only"]:
     model = SpliceAI(db_path=snakemake.params["db_path"])
 else: