diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py index 1bede950..a6e8772b 100644 --- a/deeprvat/annotations/annotations.py +++ b/deeprvat/annotations/annotations.py @@ -2215,16 +2215,14 @@ def merge_absplice_scores( if verbose: logger.info(f"Loading and transforming {genes}...") - con.execute( - f""" + con.execute(f""" CREATE TEMP TABLE genes_mod AS SELECT id AS gene_id, split_part(gene, '.', 1) AS gene, split_part(gene, '.', 2) AS exon FROM read_parquet('{genes}'); - """ - ) + """) if verbose: logger.info(f"Loading {annotations}...") @@ -2237,60 +2235,49 @@ def merge_absplice_scores( exclude_columns = {"chrom", "pos", "ref", "alt", "AbSplice_DNA"} selected_columns = [col for col in all_columns if col not in exclude_columns] select_clause = ", ".join(selected_columns) - con.execute( - f""" + con.execute(f""" CREATE TEMP TABLE annotations AS SELECT {select_clause} FROM read_parquet('{annotations}'); - """ - ) + """) if verbose: logger.info("Merging genes_mod with annotations...") - con.execute( - """ + con.execute(""" CREATE TEMP TABLE merged1 AS SELECT a.*, g.gene, g.exon FROM annotations a LEFT JOIN genes_mod g ON a.gene_id = g.gene_id; - """ - ) + """) if verbose: logger.info(f"Loading {variants}...") - con.execute( - f""" + con.execute(f""" CREATE TEMP TABLE variants AS SELECT * FROM read_parquet('{variants}'); - """ - ) + """) if verbose: logger.info("Merging merged1 with variants...") - con.execute( - """ + con.execute(""" CREATE TEMP TABLE merged2 AS SELECT m1.*, v.chrom, v.pos, v.ref, v.alt FROM merged1 m1 LEFT JOIN variants v ON m1.id = v.id; - """ - ) + """) if verbose: logger.info(f"Loading {ab_splice_agg_score_file}...") - con.execute( - f""" + con.execute(f""" CREATE TEMP TABLE splice AS SELECT * FROM read_parquet('{ab_splice_agg_score_file}'); - """ - ) + """) if verbose: logger.info("Merging merged2 with absplice scores...") - con.execute( - """ + con.execute(""" CREATE TEMP TABLE final_merge AS SELECT m2.*, s.AbSplice_DNA FROM merged2 m2 @@ -2300,26 +2287,21 @@ def merge_absplice_scores( AND m2.ref = s.ref AND m2.alt = s.alt AND m2.gene = s.gene_id; - """ - ) + """) if fill_nan: if verbose: print(f"Filling NULL values in splice column with 0...") - con.execute( - f""" + con.execute(f""" UPDATE final_merge SET AbSplice_DNA = 0 WHERE AbSplice_DNA IS NULL; - """ - ) + """) if verbose: logger.info(f"Saving final merged table to {output}...") - con.execute( - f""" + con.execute(f""" COPY final_merge TO '{output}' (FORMAT 'parquet'); - """ - ) + """) logger.info(f"Successfully saved final merged table to {output}") diff --git a/deeprvat/deeprvat/common_variant_condition_utils.py b/deeprvat/deeprvat/common_variant_condition_utils.py index 566f30a6..01aca89d 100644 --- a/deeprvat/deeprvat/common_variant_condition_utils.py +++ b/deeprvat/deeprvat/common_variant_condition_utils.py @@ -15,7 +15,6 @@ import shutil import os - compression_level = 1 logging.basicConfig( diff --git a/deeprvat/seed_gene_discovery/evaluate.py b/deeprvat/seed_gene_discovery/evaluate.py index 05ebb087..c1d9483e 100644 --- a/deeprvat/seed_gene_discovery/evaluate.py +++ b/deeprvat/seed_gene_discovery/evaluate.py @@ -44,10 +44,8 @@ def evaluate_(associations: Dict[str, pd.DataFrame], alpha: float): logger.info("Only using genes with EAC > 50") result = result.query("EAC > 50") if result["pval"].isna().sum() > 0: - logger.info( - "Attention: still NA pvals after EAC filtering\ - Removing remaining NA pvals" - ) + logger.info("Attention: still NA pvals after EAC filtering\ + Removing remaining NA pvals") result = result.dropna(subset=["pval"]) corrected_result = pval_correction( result, alpha, correction_type=correction_type diff --git a/deeprvat/seed_gene_discovery/seed_gene_discovery.py b/deeprvat/seed_gene_discovery/seed_gene_discovery.py index 208152cc..80164d73 100644 --- a/deeprvat/seed_gene_discovery/seed_gene_discovery.py +++ b/deeprvat/seed_gene_discovery/seed_gene_discovery.py @@ -557,10 +557,8 @@ def make_dataset_( batch_size = len(dataset) if "batch_size" in data_config["dataloader_config"].keys(): - raise ValueError( - """You can't specify the batch_size in the - dataloader config for the seed gene discovery""" - ) + raise ValueError("""You can't specify the batch_size in the + dataloader config for the seed gene discovery""") logger.info(f"Read dataset, batch size {batch_size}") dataloader = DataLoader( diff --git a/deeprvat/utils.py b/deeprvat/utils.py index e515a7ec..d926b736 100644 --- a/deeprvat/utils.py +++ b/deeprvat/utils.py @@ -14,7 +14,6 @@ from sklearn.preprocessing import quantile_transform from statsmodels.stats.multitest import fdrcorrection - logging.basicConfig( format="[%(asctime)s] %(levelname)s:%(name)s: %(message)s", level=logging.INFO, diff --git a/docs/index.rst b/docs/index.rst index c9d1700f..87cc8f0d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,7 +8,7 @@ Welcome to DeepRVAT's documentation! Rare variant association testing using deep learning and data-driven gene impairment scores. -_Coming soon:_ Overview of the DeepRVAT methodaster +_Coming soon:_ Overview of the DeepRVAT method How to use this documentation diff --git a/docs/pretrained_models.md b/docs/pretrained_models.md index ae48d946..00b8b9ca 100644 --- a/docs/pretrained_models.md +++ b/docs/pretrained_models.md @@ -2,10 +2,23 @@ For using the pretrained DeepRVAT model provided as part of the package, or a custom pretrained model, we have setup pipelines for running only the association testing stage. This includes creating the association dataset files, computing gene impairment scores, regression, and evaluation. +_**Important note:**_ DeepRVAT currently supports association testing using REGENIE or SEAK. Because REGENIE is actively maintained and gives better control for ancestry effects, as well as other aspects of genetic background, association testing with SEAK is _deprecated_ and will be removed in a future version. Note especially that for binary phenotypes, REGENIE _must_ be used. -## Configuration and input files -Configuration parameters must be specified in `deeprvat_input_pretrained_models_config.yaml`, see [example file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_input_pretrained_models_config.yaml). For details on the meanings of the parameters and the format of input files, see [here](input_data). +## Association testing with REGENIE + +For specifics on input files and options for REGENIE steps 1 and 2, please refer to the [REGENIE documentation](https://rgcgithub.github.io/regenie/). + +### Configuration and input files + +Configuration parameters must be specified in `deeprvat_input_pretrained_models_config.yaml` (rename the example file). For details on the meanings of the parameters and the format of input files, see [here](input_data). + +For an example, refer to [this configuration file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_input_config_regenie.yaml) (rename or link it to `deeprvat_input_pretrained_models_config.yaml` in your working directory), which includes REGENIE-specific parameters for steps 1 and 2. Importantly, specify +``` +regenie_options: + regenie_exp: True + gtf_file: path/to/annotation_file.gtf.gz +``` To use pretrained models, you must specify `use_pretrained_models: True` in your `deeprvat_input_pretrained_models_config.yaml` configuration file. Additionally, provide the path to pretrained models (an output of the training pipeline) in the parameter `pretrained_model_path`. Within the `pretrained_model_path` directory, there must be a `config.yaml` file in that directory with the following set of specified keys that were used for training the pretrained models; `rare_variant_annotations`, `training_data_thresholds`, and `model` . See [example file](https://github.com/PMBio/deeprvat/blob/main/pretrained_models/model_config.yaml). @@ -35,44 +48,8 @@ cv_options (optional) Note that the file specified by `annotation_filename` must contain a column corresponding to each annotation in the list `rare_variant_annotations` from `deeprvat/pretrained_models/model_config.yaml`. - -## Executing the pipeline - -``` -snakemake -j 1 --snakefile [path_to_deeprvat]/pipelines/association_testing_pretrained.snakefile -``` - -Replace `[path_to_deeprvat]` with the path to your copy of the DeepRVAT repository. - - -## Running the association testing pipeline with REGENIE - -_Coming soon_ - - -Samples to be considered when testing sub-cohorts can be provided via `keep_samples.txt` which look like + -``` -12345 12345 -56789 56789 -```` -for keeping two samples with ids `12345` and `56789` + -### Running the association testing pipeline with SEAK + + + + + -```shell -cd experiment -ln -s [path_to_deeprvat]/pretrained_models +## Association testing with SEAK + +### Configuration file + +Follow the instructions as above, but specify +``` +regenie_options: + regenie_exp: False +``` + +For an example, see [this configuration file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_input_pretrained_models_config.yaml) + +### Executing the pipeline + +``` snakemake -j 1 --snakefile [path_to_deeprvat]/pipelines/association_testing_pretrained.snakefile ``` ----> +Replace `[path_to_deeprvat]` with the path to your copy of the DeepRVAT repository. + + diff --git a/example/config/deeprvat_input_pretrained_models_config.yaml b/example/config/deeprvat_input_pretrained_models_config.yaml index 66d4c8c6..525200d3 100644 --- a/example/config/deeprvat_input_pretrained_models_config.yaml +++ b/example/config/deeprvat_input_pretrained_models_config.yaml @@ -40,13 +40,14 @@ covariates: #x_phenotypes - genetic_PC_19 - genetic_PC_20 +# Filters for variants to be included in association testing +# The column names used here must be in annotation.parquet association_testing_data_thresholds: MAF: "< 1e-3" CADD_PHRED: "> 5" #DeepRVAT model settings n_repeats: 30 -y_transformation: quantile_transform # Results evaluation settings evaluation: @@ -57,24 +58,18 @@ evaluation: #sample_files: # association_testing: association_testing_samples.pkl -#Additional settings if using the CV pipeline -cv_options: - cv_exp: False - #cv_path: sample_files - #n_folds: 5 - #Additional settings if using the REGENIE integration regenie_options: - regenie_exp: False - # gtf_file: gencode.v38.basic.annotation.gtf.gz - # step_1: - # bgen: imputation.bgen - # snplist: imputation.snplist - # bsize: 1000 - # options: - # - "--sample imputation.sample" - # - "--qt" - # step_2: - # bsize: 400 - # options: - # - "--qt" + regenie_exp: True + gtf_file: gencode.v38.basic.annotation.gtf.gz + step_1: + bgen: imputation.bgen + snplist: imputation.snplist + bsize: 1000 + options: + - "--sample imputation.sample" + - "--qt" + step_2: + bsize: 400 + options: + - "--qt" diff --git a/pipelines/resources/spliceai.py b/pipelines/resources/spliceai.py index d4b4b7c5..6d242968 100644 --- a/pipelines/resources/spliceai.py +++ b/pipelines/resources/spliceai.py @@ -1,7 +1,6 @@ from spliceai_rocksdb.spliceAI import SpliceAI import snakemake - if snakemake.params["lookup_only"]: model = SpliceAI(db_path=snakemake.params["db_path"]) else: