Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 18 additions & 36 deletions deeprvat/annotations/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -2215,16 +2215,14 @@ def merge_absplice_scores(
if verbose:
logger.info(f"Loading and transforming {genes}...")

con.execute(
f"""
con.execute(f"""
CREATE TEMP TABLE genes_mod AS
SELECT
id AS gene_id,
split_part(gene, '.', 1) AS gene,
split_part(gene, '.', 2) AS exon
FROM read_parquet('{genes}');
"""
)
""")

if verbose:
logger.info(f"Loading {annotations}...")
Expand All @@ -2237,60 +2235,49 @@ def merge_absplice_scores(
exclude_columns = {"chrom", "pos", "ref", "alt", "AbSplice_DNA"}
selected_columns = [col for col in all_columns if col not in exclude_columns]
select_clause = ", ".join(selected_columns)
con.execute(
f"""
con.execute(f"""
CREATE TEMP TABLE annotations AS
SELECT {select_clause}
FROM read_parquet('{annotations}');
"""
)
""")

if verbose:
logger.info("Merging genes_mod with annotations...")
con.execute(
"""
con.execute("""
CREATE TEMP TABLE merged1 AS
SELECT a.*, g.gene, g.exon
FROM annotations a
LEFT JOIN genes_mod g
ON a.gene_id = g.gene_id;
"""
)
""")

if verbose:
logger.info(f"Loading {variants}...")
con.execute(
f"""
con.execute(f"""
CREATE TEMP TABLE variants AS
SELECT * FROM read_parquet('{variants}');
"""
)
""")

if verbose:
logger.info("Merging merged1 with variants...")
con.execute(
"""
con.execute("""
CREATE TEMP TABLE merged2 AS
SELECT m1.*, v.chrom, v.pos, v.ref, v.alt
FROM merged1 m1
LEFT JOIN variants v
ON m1.id = v.id;
"""
)
""")

if verbose:
logger.info(f"Loading {ab_splice_agg_score_file}...")
con.execute(
f"""
con.execute(f"""
CREATE TEMP TABLE splice AS
SELECT * FROM read_parquet('{ab_splice_agg_score_file}');
"""
)
""")

if verbose:
logger.info("Merging merged2 with absplice scores...")
con.execute(
"""
con.execute("""
CREATE TEMP TABLE final_merge AS
SELECT m2.*, s.AbSplice_DNA
FROM merged2 m2
Expand All @@ -2300,26 +2287,21 @@ def merge_absplice_scores(
AND m2.ref = s.ref
AND m2.alt = s.alt
AND m2.gene = s.gene_id;
"""
)
""")

if fill_nan:
if verbose:
print(f"Filling NULL values in splice column with 0...")
con.execute(
f"""
con.execute(f"""
UPDATE final_merge
SET AbSplice_DNA = 0
WHERE AbSplice_DNA IS NULL;
"""
)
""")

if verbose:
logger.info(f"Saving final merged table to {output}...")
con.execute(
f"""
con.execute(f"""
COPY final_merge TO '{output}' (FORMAT 'parquet');
"""
)
""")

logger.info(f"Successfully saved final merged table to {output}")
1 change: 0 additions & 1 deletion deeprvat/deeprvat/common_variant_condition_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import shutil
import os


compression_level = 1

logging.basicConfig(
Expand Down
6 changes: 2 additions & 4 deletions deeprvat/seed_gene_discovery/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,8 @@ def evaluate_(associations: Dict[str, pd.DataFrame], alpha: float):
logger.info("Only using genes with EAC > 50")
result = result.query("EAC > 50")
if result["pval"].isna().sum() > 0:
logger.info(
"Attention: still NA pvals after EAC filtering\
Removing remaining NA pvals"
)
logger.info("Attention: still NA pvals after EAC filtering\
Removing remaining NA pvals")
result = result.dropna(subset=["pval"])
corrected_result = pval_correction(
result, alpha, correction_type=correction_type
Expand Down
6 changes: 2 additions & 4 deletions deeprvat/seed_gene_discovery/seed_gene_discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,10 +557,8 @@ def make_dataset_(
batch_size = len(dataset)

if "batch_size" in data_config["dataloader_config"].keys():
raise ValueError(
"""You can't specify the batch_size in the
dataloader config for the seed gene discovery"""
)
raise ValueError("""You can't specify the batch_size in the
dataloader config for the seed gene discovery""")

logger.info(f"Read dataset, batch size {batch_size}")
dataloader = DataLoader(
Expand Down
1 change: 0 additions & 1 deletion deeprvat/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from sklearn.preprocessing import quantile_transform
from statsmodels.stats.multitest import fdrcorrection


logging.basicConfig(
format="[%(asctime)s] %(levelname)s:%(name)s: %(message)s",
level=logging.INFO,
Expand Down
2 changes: 1 addition & 1 deletion docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Welcome to DeepRVAT's documentation!

Rare variant association testing using deep learning and data-driven gene impairment scores.

_Coming soon:_ Overview of the DeepRVAT methodaster
_Coming soon:_ Overview of the DeepRVAT method


How to use this documentation
Expand Down
100 changes: 45 additions & 55 deletions docs/pretrained_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,23 @@

For using the pretrained DeepRVAT model provided as part of the package, or a custom pretrained model, we have setup pipelines for running only the association testing stage. This includes creating the association dataset files, computing gene impairment scores, regression, and evaluation.

_**Important note:**_ DeepRVAT currently supports association testing using REGENIE or SEAK. Because REGENIE is actively maintained and gives better control for ancestry effects, as well as other aspects of genetic background, association testing with SEAK is _deprecated_ and will be removed in a future version. Note especially that for binary phenotypes, REGENIE _must_ be used.

## Configuration and input files

Configuration parameters must be specified in `deeprvat_input_pretrained_models_config.yaml`, see [example file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_input_pretrained_models_config.yaml). For details on the meanings of the parameters and the format of input files, see [here](input_data).
## Association testing with REGENIE

For specifics on input files and options for REGENIE steps 1 and 2, please refer to the [REGENIE documentation](https://rgcgithub.github.io/regenie/).

### Configuration and input files

Configuration parameters must be specified in `deeprvat_input_pretrained_models_config.yaml` (rename the example file). For details on the meanings of the parameters and the format of input files, see [here](input_data).

For an example, refer to [this configuration file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_input_config_regenie.yaml) (rename or link it to `deeprvat_input_pretrained_models_config.yaml` in your working directory), which includes REGENIE-specific parameters for steps 1 and 2. Importantly, specify
```
regenie_options:
regenie_exp: True
gtf_file: path/to/annotation_file.gtf.gz
```

To use pretrained models, you must specify `use_pretrained_models: True` in your `deeprvat_input_pretrained_models_config.yaml` configuration file. Additionally, provide the path to pretrained models (an output of the training pipeline) in the parameter `pretrained_model_path`. Within the `pretrained_model_path` directory, there must be a `config.yaml` file in that directory with the following set of specified keys that were used for training the pretrained models; `rare_variant_annotations`, `training_data_thresholds`, and `model` . See [example file](https://github.com/PMBio/deeprvat/blob/main/pretrained_models/model_config.yaml).

Expand Down Expand Up @@ -35,59 +48,23 @@ cv_options (optional)

Note that the file specified by `annotation_filename` must contain a column corresponding to each annotation in the list `rare_variant_annotations` from `deeprvat/pretrained_models/model_config.yaml`.


## Executing the pipeline

```
snakemake -j 1 --snakefile [path_to_deeprvat]/pipelines/association_testing_pretrained.snakefile
```

Replace `[path_to_deeprvat]` with the path to your copy of the DeepRVAT repository.


## Running the association testing pipeline with REGENIE

_Coming soon_

<!---

#### Input data
For running with REGENIE, in addition to the default input data, the following REGENIE specific files should also be included in your `experiment` directory:


To run REGENIE Step 1
- `.sample` Inclusion file that lists individuals to retain in the analysis
- `.sniplist` Inclusion file that lists IDs of variants to keep
- `.bgen` input genetic data file
- `.bgen.bgi` index bgi file corresponding to input BGEN file

For these REGENIE specific files, please refer to the [REGENIE documentation](https://rgcgithub.github.io/regenie/).

For running REGENIE Step 2:
- `gtf file` gencode gene annotation gtf file
- `keep_samples.txt` (optional file of samples to include)
- `protein_coding_genes.parquet`

#### Config file

Use the `[path_to_deeprvat]/example/config_regenie.yaml` as `config.yaml` which includes REGENIE specific parameters.
You can set any parameter explained in the [REGENIE documentation](https://rgcgithub.github.io/regenie/) via this config.
Most importantly, for association testing of binary traits use:
You can set any parameter explained in the [REGENIE documentation](https://rgcgithub.github.io/regenie/) via the config.
Most importantly, for association testing of binary traits use `--bt` in Step 2. For imbalanced traits, we also recommend Firth logistic regression:
```
step_2:
options:
- "--bt"
- "--firth --approx --pThresh 0.01"

```
and for quantitative traits:
For quantitative traits:
```
step_2:
options:
- "--qt"
```

#### Run REGENIE
### Run REGENIE


```
Expand All @@ -97,24 +74,37 @@ snakemake -j 1 --snakefile [path_to_deeprvat]/pipelines/association_testing_pret
```


#### Testing multiple sub-chohorts
For testing multiple sub-cohorts, remember that REGENIE Step 1 (compute intense) only needs to be executed once per sample and phenotype. We suggest running REGENIE Step 1 on all samples and phenotypes initially and then linking the output as `regenie_output/step1/` in each experiment directory for testing a sub-cohort.
<!-- ### Testing multiple sub-cohorts -->

Samples to be considered when testing sub-cohorts can be provided via `keep_samples.txt` which look like
<!-- For testing multiple sub-cohorts, remember that REGENIE Step 1 (compute intense) only needs to be executed once per sample and phenotype. We suggest running REGENIE Step 1 on all samples and phenotypes initially and then linking the output as `regenie_output/step1/` in each experiment directory for testing a sub-cohort. -->

```
12345 12345
56789 56789
````
for keeping two samples with ids `12345` and `56789`
<!-- Samples to be considered when testing sub-cohorts can be provided via `keep_samples.txt` which look like -->

### Running the association testing pipeline with SEAK
<!-- ``` -->
<!-- 12345 12345 -->
<!-- 56789 56789 -->
<!-- ```` -->
<!-- for keeping two samples with ids `12345` and `56789` -->

```shell
cd experiment
ln -s [path_to_deeprvat]/pretrained_models
## Association testing with SEAK

### Configuration file

Follow the instructions as above, but specify
```
regenie_options:
regenie_exp: False
```

For an example, see [this configuration file](https://github.com/PMBio/deeprvat/blob/main/example/config/deeprvat_input_pretrained_models_config.yaml)

### Executing the pipeline

```
snakemake -j 1 --snakefile [path_to_deeprvat]/pipelines/association_testing_pretrained.snakefile
```

--->
Replace `[path_to_deeprvat]` with the path to your copy of the DeepRVAT repository.



35 changes: 15 additions & 20 deletions example/config/deeprvat_input_pretrained_models_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,14 @@ covariates: #x_phenotypes
- genetic_PC_19
- genetic_PC_20

# Filters for variants to be included in association testing
# The column names used here must be in annotation.parquet
association_testing_data_thresholds:
MAF: "< 1e-3"
CADD_PHRED: "> 5"

#DeepRVAT model settings
n_repeats: 30
y_transformation: quantile_transform

# Results evaluation settings
evaluation:
Expand All @@ -57,24 +58,18 @@ evaluation:
#sample_files:
# association_testing: association_testing_samples.pkl

#Additional settings if using the CV pipeline
cv_options:
cv_exp: False
#cv_path: sample_files
#n_folds: 5

#Additional settings if using the REGENIE integration
regenie_options:
regenie_exp: False
# gtf_file: gencode.v38.basic.annotation.gtf.gz
# step_1:
# bgen: imputation.bgen
# snplist: imputation.snplist
# bsize: 1000
# options:
# - "--sample imputation.sample"
# - "--qt"
# step_2:
# bsize: 400
# options:
# - "--qt"
regenie_exp: True
gtf_file: gencode.v38.basic.annotation.gtf.gz
step_1:
bgen: imputation.bgen
snplist: imputation.snplist
bsize: 1000
options:
- "--sample imputation.sample"
- "--qt"
step_2:
bsize: 400
options:
- "--qt"
1 change: 0 additions & 1 deletion pipelines/resources/spliceai.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from spliceai_rocksdb.spliceAI import SpliceAI
import snakemake


if snakemake.params["lookup_only"]:
model = SpliceAI(db_path=snakemake.params["db_path"])
else:
Expand Down
Loading