diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e7a8fd5d..76da1d0ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` +- Add SNV scoring by MIVMIR, GICAM models [#812](https://github.com/nf-core/raredisease/pull/812) + ### `Changed` - Replace `ch_publish`/`subworkflow_results` with named typed channel emits for qc_bam subworkflow [#853](https://github.com/nf-core/raredisease/pull/853) diff --git a/README.md b/README.md index 8cdfe1045..177c0f875 100644 --- a/README.md +++ b/README.md @@ -175,7 +175,7 @@ For more details about the output files and reports, please refer to the nf-core/raredisease was written in a collaboration between the Clinical Genomics nodes in Sweden, with major contributions from [Ramprasad Neethiraj](https://github.com/ramprasadn), [Anders Jemt](https://github.com/jemten), [Lucia Pena Perez](https://github.com/Lucpen), and [Mei Wu](https://github.com/projectoriented) at Clinical Genomics Stockholm. -Additional contributors were [Sima Rahimi](https://github.com/sima-r), [Gwenna Breton](https://github.com/Gwennid) and [Emma Västerviga](https://github.com/EmmaCAndersson) (Clinical Genomics Gothenburg); [Halfdan Rydbeck](https://github.com/hrydbeck) and [Lauri Mesilaakso](https://github.com/ljmesi) (Clinical Genomics Linköping); [Subazini Thankaswamy Kosalai](https://github.com/sysbiocoder) (Clinical Genomics Örebro); [Annick Renevey](https://github.com/rannick), [Peter Pruisscher](https://github.com/peterpru) and [Eva Caceres](https://github.com/fevac) (Clinical Genomics Stockholm); [Ryan Kennedy](https://github.com/ryanjameskennedy) (Clinical Genomics Lund); [Anders Sune Pedersen](https://github.com/asp8200) (Danish National Genome Center) and [Lucas Taniguti](https://github.com/lmtani). +Additional contributors were [Sima Rahimi](https://github.com/sima-r), [Gwenna Breton](https://github.com/Gwennid) and [Emma Västerviga](https://github.com/EmmaCAndersson) (Clinical Genomics Gothenburg); [Halfdan Rydbeck](https://github.com/hrydbeck) and [Lauri Mesilaakso](https://github.com/ljmesi) (Clinical Genomics Linköping); [Subazini Thankaswamy Kosalai](https://github.com/sysbiocoder) (Clinical Genomics Örebro); [Annick Renevey](https://github.com/rannick), [Peter Pruisscher](https://github.com/peterpru), [Eva Caceres](https://github.com/fevac) and [Tor Björgen](https://github.com/torbjorgen) (Clinical Genomics Stockholm); [Ryan Kennedy](https://github.com/ryanjameskennedy) (Clinical Genomics Lund); [Anders Sune Pedersen](https://github.com/asp8200) (Danish National Genome Center) and [Lucas Taniguti](https://github.com/lmtani). We thank the nf-core community for their extensive assistance in the development of this pipeline. diff --git a/assets/rank_model_genmod_gicam.ini b/assets/rank_model_genmod_gicam.ini new file mode 100644 index 000000000..72730c8d5 --- /dev/null +++ b/assets/rank_model_genmod_gicam.ini @@ -0,0 +1,121 @@ +[Version] + version = 1 + name = rank_model_for_gicam + +[Categories] + + [[inheritance_models]] + category_aggregation = min + + [[variant_call_quality_filter]] + category_aggregation = sum + +[model_score] + category = variant_call_quality_filter + data_type = integer + description = Inheritance model score + field = INFO + info_key = ModelScore + record_rule = min + separators = ',',':', + + [[not_reported]] + score = 0 + + [[low_qual]] + score = -5 + lower = 0 + upper = 10 + + [[medium_qual]] + score = -2 + lower = 10 + upper = 20 + + [[high_qual]] + score = 0 + lower = 20 + upper = 300 + +[genetic_models] + category = inheritance_models + data_type = string + description = Inheritance models followed for the variant + field = INFO + info_key = GeneticModels + record_rule = max + separators = ',', ':', '|', + + [[ad]] + priority = 1 + score = 1 + string = 'AD' + + [[ad_dn]] + score = 1 + priority = 1 + string = 'AD_dn' + + [[ar]] + score = 1 + priority = 1 + string = 'AR_hom' + + [[ar_dn]] + score = 1 + priority = 1 + string = 'AR_hom_dn' + + [[ar_comp]] + score = 1 + priority = 1 + string = 'AR_comp' + + [[ar_comp_dn]] + score = 1 + priority = 1 + string = 'AR_comp_dn' + + [[xr]] + score = 1 + priority = 1 + string = 'XR' + + [[xr_dn]] + score = 1 + priority = 1 + string = 'XR_dn' + + [[xd]] + score = 1 + priority = 1 + string = 'XD' + + [[xd_dn]] + score = 1 + priority = 1 + string = 'XD_dn' + + [[not_reported]] + score = -12 + +[filter] + category = variant_call_quality_filter + data_type = string + description = The filters for the variant + field = FILTER + record_rule = min + separators = ';', + + [[not_reported]] + score = 0 + + [[pass]] + score = 3 + priority = 1 + string = 'PASS' + + [[dot]] + score = 3 + priority = 2 + string = '.' diff --git a/conf/modules/rank_variants.config b/conf/modules/rank_variants.config index 0c31a1cd8..70ddbc8f7 100644 --- a/conf/modules/rank_variants.config +++ b/conf/modules/rank_variants.config @@ -80,9 +80,28 @@ process { withName: '.*RANK_VARIANTS_SNV:TABIX_BGZIPTABIX' { ext.args2 = '-p vcf' - ext.prefix = { "${meta.id}_snv_ranked_${meta.set}" } + // Avoid BCFTOOLS i/o nameclash by appending genmod_ when ranking with mivmir, gicam + // VCF name is overridden to fit naming convention BCFTOOLS_APPEND_MIVMIR_GICAM + ext.prefix = { "${meta.id}_snv_ranked_${params.rank_with_mivmir_gicam ? 'genmod_': ''}${meta.set}" } + } + + withName: '.*RANK_VARIANTS_SNV:MIVMIR_INFER' { + // VCF key name change tweak for GRCh38 + ext.args = '--override_vcf_input_keys GNOMADAF_popmax:GNOMADAF_grpmax' + } + + withName: '.*RANK_VARIANTS_SNV:TABIX_BGZIPTABIX_GICAM' { + ext.prefix = { "${meta.id}_snv_ranked_gicam_${meta.set}" } } + withName: '.*RANK_VARIANTS_SNV:BCFTOOLS_APPEND_MIVMIR_GICAM' { + ext.args = { [ + '--columns MivmirScore,MivmirExplanation,GicamScore', + '--write-index=tbi', + '--output-type z' + ].join(' ') } + ext.prefix = { "${meta.id}_snv_ranked_${meta.set}" } + } } // diff --git a/docs/output.md b/docs/output.md index aa0870745..7ca560cde 100644 --- a/docs/output.md +++ b/docs/output.md @@ -69,6 +69,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Filtering and ranking](#filtering-and-ranking) - [Filter_vep](#filter_vep) - [GENMOD](#genmod) + - [MIVMIR, GICAM](#mivmir-gicam) - [Mobile element analysis](#mobile-element-analysis) - [Calling mobile elements](#calling-mobile-elements) - [Annotating mobile elements](#annotating-mobile-elements) @@ -527,6 +528,101 @@ We recommend using vcfanno to annotate SNVs with precomputed CADD scores (files +#### MIVMIR, GICAM + +[MIVMIR](../modules/local/mivmir/meta.yml) and [GICAM](../modules/local/gicam/meta.yml) are two machine learning models used to +infer a pathogenicity score for SNVs, INDELs. In essence, MIVMIR infer SNV pathogenicity and GICAM improves precision +for duo, trio, ... analysis. + +The models are enabled as a single entity, to achieve the best performance. +GICAM has a dependency on MIVMIR, acting as an input feature to GICAM. + +MIVMIR, GICAM can be enabled by setting the `--rank_with_mivmir_gicam` feature flag. + +Only `/rank_and_filter/_snv_ranked_.vcf.gz` contains the output annotations. + +MIVMIR and GICAM [source code and performance metrics available here](https://github.com/Clinical-Genomics/rdds/tree/master/src/rdds/). + +##### MIVMIR + +MIVMIR is a deep neural network regression type model trained on ClinVar pathogenic +variants and an Ashkenazim trio negative background. The purpose of the model +is to infer pathogenicity/ rank scores from a set of input annotations. +The model is scoring every variant in isolation. + +Model is optimized to generalize well on Scilifelab Clinical Genomics Stockholm cohort, +and does have some custom input features (`SWEGENAF`, `Frq`) as by our +local pipeline configuration. + +> It is possible to run model inference on variants without SWEGENAF, Frq +> annotations. These annotations are dropped out during training to simulate +> lack of previous clinical evidence in order to regularize the model. +> However, such configuration is highly experimental, unsupported +> and at your own discretion. + +###### Requirements + +MIVMIR relies on the following VCF annotations/keys as variant feature inputs: + +- `CSQ/PolyPhen` +- `CSQ/SIFT` +- `CSQ/CLINVAR_CLNREVSTAT` +- `CSQ/CLINVAR_CLNSIG` +- `CSQ/MaxEntScan_alt` +- `CSQ/MaxEntScan_diff` +- `CSQ/MES-SWA_acceptor_alt` +- `CSQ/MES-SWA_donor_alt` +- `CSQ/MES-SWA_donor_diff` +- `CSQ/SpliceAI_pred_DS_AL` +- `CSQ/SpliceAI_pred_DS_DG` +- `CSQ/SpliceAI_pred_DS_DL` +- `CSQ/REVEL_score` +- `CSQ/LoFtool` +- `CSQ/GERP++_RS` +- `CSQ/phastCons100way_vertebrate` +- `CSQ/phyloP100way_vertebrate` +- `CSQ/SpliceAI_pred_DS_AG` +- `most_severe_consequence` +- `CADD` +- `SWEGENAF` ([SweGen Variant Frequency Dataset](https://swefreq.nbis.se/dataset/SweGen)) +- `GNOMADAF_popmax` +- `Frq` (variant frequency from a local database) + +> If any annotation is missing, it will be **_*silently*_** interpreted as `''` or `0.0` depending on data type. + +> For GRCh38, GnomAD has renamed the `GNOMADAF_popmax` to `GNOMADAF_grpmax`. +> Adjust this via the `--override_vcf_input_keys [OLD_KEY:NEW_KEY],[...]` flag. + +The tool adds two keys to the VCF + +- `MivmirScore`, a score in range (0, 1) where 1.0 inferred pathogenic. +- `MivmirExplanation`, a key value array type str-float explaining the additive contribution of each annotation + to the final variant score. The explanations are sorted in decreasing order, the largest input feature + contributor is listed first. This key is only available for high scoring variants for compute performance + reasons. + +##### GICAM + +Model for SNV ranking in conjunction with MIVMIR. + +This model improves precision for duo, trio, ... analysis situations by reducing MIVMIR scores for variants that's +not following the appropriate GENMOD genetic inheritance model. Applied to MIVMIR scores as a post-processing step. + +###### Requirements + +GICAM relies on the following VCF annotations/keys as variant feature inputs: + +- `MivmirScore`, (0, 1) +- `RankScoreNormalized`, (0, 1) as produced by GENMOD using rank_model_genmod_gicam.ini custom scoring config + +The tool adds one key to the VCF + +- `GicamScore` (0, 1) where 1.0 inferred pathogenic. + +> GICAM is optimized for the GENMOD scoring config present in the gicam module directory, that +> generates RankScoreNormalized. +> Modifications to the GENMOD scoring config will break inference (unless GICAM is first retrained on the new config). + ### Mobile element analysis #### Calling mobile elements diff --git a/docs/usage.md b/docs/usage.md index 867fba784..4f82c9f55 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -290,7 +290,7 @@ The mandatory and optional parameters for each category are tabulated below. | vcfanno_toml3 | vep_filters/vep_filters_scout_fmt10 | | vep_cache_version | cadd_resources11 | | vep_cache4 | run_vcfanno_db_sanity_check12 | -| gnomad_af5 | | +| gnomad_af5 | rank_with_mivmir_gicam13 | | score_config_snv6 | | | variant_consequences_snv7 | | | vep_plugin_files8 | | @@ -310,6 +310,7 @@ no header and the following columns: `CHROM POS REF_ALLELE,ALT_ALLELE AF`. Sampl 10 This file contains a list of candidate genes (with [HGNC](https://www.genenames.org/) IDs) that is used to split the variants into candidate variants and research variants. Research variants contain all the variants, while candidate variants are a subset of research variants and are associated with candidate genes. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/hgnc.txt). Not required if `--skip_subworkflows generate_clinical_set` is set.
11Path to a folder containing cadd annotations. Equivalent of the data/annotations/ folder described [here](https://github.com/kircherlab/CADD-scripts/#manual-installation), and it is used to calculate CADD scores for small indels.
12When set to `true`, each vcfanno database file listed in `vcfanno_resources` is checked for records (non-header lines). Any database with zero records is removed from the vcfanno TOML config before annotation runs to prevent vcfanno from crashing on default resource files. Default: `false`.
+13 Enable variant SNV-INDEL scoring using MIVMIR, GICAM machine learning models. :::note We use CADD only to annotate small indels. To annotate SNVs with precomputed CADD scores, pass the file containing CADD scores as a resource to vcfanno instead. Files containing the precomputed CADD scores for SNVs can be downloaded from [here](https://cadd.gs.washington.edu/download) (download files listed under the description: "All possible SNVs of GRCh3<7/8>/hg3<7/8>") diff --git a/main.nf b/main.nf index a803c290d..f3f374731 100644 --- a/main.nf +++ b/main.nf @@ -110,6 +110,7 @@ workflow NFCORE_RAREDISEASE { val_readcount_intervals val_reduced_penetrance val_rtg_truthvcfs + val_rank_with_mivmir_gicam val_run_mt_for_wes val_run_rtgvcfeval val_run_vcfanno_db_sanity_check @@ -220,6 +221,8 @@ workflow NFCORE_RAREDISEASE { ch_score_config_mt = channelFromPath(val_score_config_mt, true) ch_score_config_snv = channelFromPath(val_score_config_snv, true) ch_score_config_sv = channelFromPath(val_score_config_sv, true) + // ch_genmod_gicam_score_config is integral to GICAM inference; it cannot be changed without retraining gicam + ch_score_config_genmod_gicam = channel.fromPath("$projectDir/assets/rank_model_genmod_gicam.ini", checkIfExists: true).collect() ch_vcf2cytosure_blacklist = channelFromPath(val_vcf2cytosure_blacklist, true) ch_vcfanno_lua = channelFromPath(val_vcfanno_lua, true) ch_vcfanno_toml = channelFromPath(val_vcfanno_toml, true) @@ -436,6 +439,7 @@ workflow NFCORE_RAREDISEASE { ch_score_config_mt, ch_score_config_snv, ch_score_config_sv, + ch_score_config_genmod_gicam, ch_sdf, ch_sentieon_pcr_indel_model, ch_subdepth, @@ -518,6 +522,7 @@ workflow NFCORE_RAREDISEASE { val_multiqc_samples, val_outdir, val_platform, + val_rank_with_mivmir_gicam, val_run_mt_for_wes, val_run_rtgvcfeval, val_run_vcfanno_db_sanity_check, @@ -673,6 +678,7 @@ workflow { params.readcount_intervals, params.reduced_penetrance, params.rtg_truthvcfs, + params.rank_with_mivmir_gicam, params.run_mt_for_wes, params.run_rtgvcfeval, params.run_vcfanno_db_sanity_check, diff --git a/modules/local/gicam/main.nf b/modules/local/gicam/main.nf new file mode 100644 index 000000000..3658321db --- /dev/null +++ b/modules/local/gicam/main.nf @@ -0,0 +1,33 @@ +process GICAM_INFER { + // https://github.com/Clinical-Genomics/rdds/tree/master/src/rdds/gicam + + tag "${meta.id}" + label 'process_high' + + container "docker.io/clinicalgenomics/rdds_mivmir:v1.13.0" + + beforeScript "mkdir ${task.workDir}/rdds-tmp" + afterScript "rm -r ${task.workDir}/rdds-tmp" + containerOptions {[ + workflow.containerEngine.equals("singularity") ? "--bind ${task.workDir}/rdds-tmp:/rdds/tmp" : "", + workflow.containerEngine.equals("docker") ? "--tmpfs /rdds/tmp": "", + "" + ].minus("").join(" ")} + + input: + tuple val(meta), path(input_vcf) + + output: + tuple val(meta), path('*-predictions.vcf'), emit: vcf + tuple val("${task.process}"), val('gicam'), val('v1.13.0'), topic: versions, emit: versions_gicam + + when: + task.ext.when == null || task.ext.when + + script: + """ + . /opt/pyenv/bin/activate + export PYTHONPATH=/rdds/src + python3 -m rdds.gicam infer-vcf --cpu_cores ${task.cpus} ${input_vcf} + """ +} diff --git a/modules/local/gicam/meta.yml b/modules/local/gicam/meta.yml new file mode 100644 index 000000000..70b98cad6 --- /dev/null +++ b/modules/local/gicam/meta.yml @@ -0,0 +1,64 @@ +name: gicam +description: "Machine learning tool to improve precision for duo, trio, ... analysis" +keywords: + - score + - ranking + - gicam +tools: + - mivmir: + description: Refer to the docs/ section for more information. + homepage: https://github.com/clinicalgenomics/rdds + documentation: https://github.com/Clinical-Genomics/rdds/tree/master/src/rdds/gicam + doi: "" + licence: ["MIT"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input_vcf: + type: file + description: vcf file + pattern: "*.{vcf}" + ontologies: [] +output: + vcf: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*-predictions.vcf": + type: file + description: Scored output VCF file + pattern: "*.{vcf}" + ontologies: [] + versions_gicam: + - - "${task.process}": + type: string + description: The process the versions were collected from + - "gicam": + type: string + description: The tool name + - "version": + type: string + description: Tool version + +topics: + versions: + - - "${task.process}": + type: string + description: The process the versions were collected from + - "gicam": + type: string + description: The tool name + - "version": + type: eval + description: Tool version + +authors: + - "@torbjorgen" +maintainers: + - "@torbjorgen" diff --git a/modules/local/gicam/tests/main.nf.test b/modules/local/gicam/tests/main.nf.test new file mode 100644 index 000000000..1ac805ce4 --- /dev/null +++ b/modules/local/gicam/tests/main.nf.test @@ -0,0 +1,28 @@ +nextflow_process { + + name "Test Process GICAM_INFER" + script "modules/local/gicam/main.nf" + process "GICAM_INFER" + + test("Test GICAM inference on annotated VCF") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + // VCF as annotated by rank_variants subworkflow (including mivmir and genmod custom inheritance config), prior to gicam inference call + file(params.pipelines_testdata_base_path + 'testdata/justhusky_snv_gicam.vcf', checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/modules/local/gicam/tests/main.nf.test.snap b/modules/local/gicam/tests/main.nf.test.snap new file mode 100644 index 000000000..4518194e4 --- /dev/null +++ b/modules/local/gicam/tests/main.nf.test.snap @@ -0,0 +1,45 @@ +{ + "Test GICAM inference on annotated VCF": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "justhusky_snv_gicam-predictions.vcf:md5,57c88fc976d6833bcf0584bce9959fec" + ] + ], + "1": [ + [ + "GICAM_INFER", + "gicam", + "v1.13.0" + ] + ], + "vcf": [ + [ + { + "id": "test", + "single_end": false + }, + "justhusky_snv_gicam-predictions.vcf:md5,57c88fc976d6833bcf0584bce9959fec" + ] + ], + "versions_gicam": [ + [ + "GICAM_INFER", + "gicam", + "v1.13.0" + ] + ] + } + ], + "timestamp": "2026-04-27T11:14:56.183945487", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/modules/local/mivmir/main.nf b/modules/local/mivmir/main.nf new file mode 100644 index 000000000..9a98b5109 --- /dev/null +++ b/modules/local/mivmir/main.nf @@ -0,0 +1,43 @@ +process MIVMIR_INFER { + // https://github.com/Clinical-Genomics/rdds/tree/master/src/rdds/variant_rank_score + + tag "${meta.id}" + label 'process_high' + + container "docker.io/clinicalgenomics/rdds_mivmir:v1.13.0" + + beforeScript "mkdir ${task.workDir}/rdds-tmp" + afterScript "rm -r ${task.workDir}/rdds-tmp" + containerOptions {[ + workflow.containerEngine.equals("singularity") ? "--bind ${task.workDir}/rdds-tmp:/rdds/tmp" : "", + workflow.containerEngine.equals("docker") ? "--tmpfs /rdds/tmp": "", + "" + ].minus("").join(" ")} + + input: + tuple val(meta), path(input_vcf) + val run_internal_test + + output: + tuple val(meta), path('*-predictions.vcf'), emit: vcf + tuple val("${task.process}"), val('mivmir'), val('v1.13.0'), topic: versions, emit:versions_mivmir + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + set -e + . /opt/pyenv/bin/activate + export PYTHONPATH=/rdds/src + if [ "$run_internal_test" == "true" ]; then + # Test inference API and numerical reproducibility + python3 -m pytest /rdds/src/tests/variant_rank_score -k test_inference + fi + python3 -m rdds.variant_rank_score predict-on-vcf \ + --cpu_cores ${task.cpus} \ + $args \ + ${input_vcf} + """ +} diff --git a/modules/local/mivmir/meta.yml b/modules/local/mivmir/meta.yml new file mode 100644 index 000000000..1b482d663 --- /dev/null +++ b/modules/local/mivmir/meta.yml @@ -0,0 +1,67 @@ +name: mivmir +description: Machine learning tool to score SNVs for inferring pathogenicity for ranking purposes +keywords: + - score + - ranking + - mivmir +tools: + - mivmir: + description: Refer to the docs/ section for more information. + homepage: https://github.com/clinicalgenomics/rdds + documentation: https://github.com/Clinical-Genomics/rdds/tree/master/src/rdds/variant_rank_score + doi: "" + licence: ["MIT"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input_vcf: + type: file + description: vcf file + pattern: "*.{vcf}" + ontologies: [] + - run_internal_test: + type: value + description: Execute internal tests prior to running inference +output: + vcf: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*-predictions.vcf": + type: file + description: Scored output VCF file + pattern: "*.{vcf}" + ontologies: [] + versions_mivmir: + - - "${task.process}": + type: string + description: The process the versions were collected from + - "mivmir": + type: string + description: The tool name + - "version": + type: string + description: Tool version + +topics: + versions: + - - "${task.process}": + type: string + description: The process the versions were collected from + - "mivmir": + type: string + description: The tool name + - "version": + type: eval + description: Tool version + +authors: + - "@torbjorgen" +maintainers: + - "@torbjorgen" diff --git a/modules/local/mivmir/tests/main.nf.test b/modules/local/mivmir/tests/main.nf.test new file mode 100644 index 000000000..27f835ee3 --- /dev/null +++ b/modules/local/mivmir/tests/main.nf.test @@ -0,0 +1,31 @@ +nextflow_process { + + name "Test Process MIVMIR_INFER" + script "modules/local/mivmir/main.nf" + process "MIVMIR_INFER" + + test("Test MIVMIR inference on annotated VCF") { + + when { + params { + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + // VCF as annotated by rank_variants subworkflow, prior to mivmir inference call + file(params.pipelines_testdata_base_path + 'testdata/justhusky_snv_mivmir.vcf', checkIfExists: true) + ] + input[1] = true + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/modules/local/mivmir/tests/main.nf.test.snap b/modules/local/mivmir/tests/main.nf.test.snap new file mode 100644 index 000000000..0a2eb11be --- /dev/null +++ b/modules/local/mivmir/tests/main.nf.test.snap @@ -0,0 +1,45 @@ +{ + "Test MIVMIR inference on annotated VCF": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "justhusky_snv_mivmir-predictions.vcf:md5,50e4de56396cfbbf97607cddef15c205" + ] + ], + "1": [ + [ + "MIVMIR_INFER", + "mivmir", + "v1.13.0" + ] + ], + "vcf": [ + [ + { + "id": "test", + "single_end": false + }, + "justhusky_snv_mivmir-predictions.vcf:md5,50e4de56396cfbbf97607cddef15c205" + ] + ], + "versions_mivmir": [ + [ + "MIVMIR_INFER", + "mivmir", + "v1.13.0" + ] + ] + } + ], + "timestamp": "2026-04-27T11:19:19.234995243", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 0b8d22571..af5a09e1b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -125,6 +125,9 @@ params { // variant annotation vep_cache_version = 112 + // Enable/ disable variant ranking with MIVMIR, GICAM models + rank_with_mivmir_gicam = false + // sentieon Defaults ml_model = null @@ -467,6 +470,13 @@ manifest { github: 'https://github.com/fevac', contribution: ['contributor'] ], + [ + name: 'Tor Björgen', + affiliation: 'Clinical Genomics Stockholm', + email: 'tor.bjorgen@scilifelab.se', + github: 'https://github.com/torbjorgen', + contribution: ['contributor'] + ], [ name: 'and the nf-core/raredisease team', affiliation: 'nf-core community', diff --git a/nextflow_schema.json b/nextflow_schema.json index ed6164089..e82c7dbf9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -768,6 +768,19 @@ } } }, + "ranking_options": { + "title": "Variant ranking options", + "type": "object", + "fa_icon": "fas fa-star-half-alt", + "description": "Options related to variant ranking.", + "properties": { + "rank_with_mivmir_gicam": { + "type": "boolean", + "description": "Enable ranking by MIVMIR, GICAM models", + "fa_icon": "fas fa-robot" + } + } + }, "mitosalt_options": { "title": "Mitosalt and saltshaker options", "type": "object", @@ -1095,6 +1108,9 @@ { "$ref": "#/$defs/annotation_options" }, + { + "$ref": "#/$defs/ranking_options" + }, { "$ref": "#/$defs/mitosalt_options" }, diff --git a/subworkflows/local/rank_variants/main.nf b/subworkflows/local/rank_variants/main.nf index f2446196d..99c8eb5de 100644 --- a/subworkflows/local/rank_variants/main.nf +++ b/subworkflows/local/rank_variants/main.nf @@ -2,21 +2,28 @@ // A subworkflow to score and rank variants. // -include { BCFTOOLS_SORT } from '../../../modules/nf-core/bcftools/sort/main' -include { GENMOD_ANNOTATE } from '../../../modules/nf-core/genmod/annotate/main' -include { GENMOD_COMPOUND } from '../../../modules/nf-core/genmod/compound/main' -include { GENMOD_MODELS } from '../../../modules/nf-core/genmod/models/main' -include { GENMOD_SCORE } from '../../../modules/nf-core/genmod/score/main' -include { TABIX_BGZIPTABIX } from '../../../modules/nf-core/tabix/bgziptabix/main' +include { GENMOD_ANNOTATE } from '../../../modules/nf-core/genmod/annotate/main' +include { GENMOD_MODELS } from '../../../modules/nf-core/genmod/models/main' +include { GENMOD_SCORE } from '../../../modules/nf-core/genmod/score/main' +include { GENMOD_SCORE as GENMOD_SCORE_FOR_GICAM } from '../../../modules/nf-core/genmod/score/main' +include { GENMOD_COMPOUND } from '../../../modules/nf-core/genmod/compound/main' +include { MIVMIR_INFER } from '../../../modules/local/mivmir/main' +include { GICAM_INFER } from '../../../modules/local/gicam/main' +include { BCFTOOLS_SORT } from '../../../modules/nf-core/bcftools/sort/main' +include { TABIX_BGZIPTABIX } from '../../../modules/nf-core/tabix/bgziptabix/main' +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_GICAM } from '../../../modules/nf-core/tabix/bgziptabix/main' +include { BCFTOOLS_ANNOTATE as BCFTOOLS_APPEND_MIVMIR_GICAM } from '../../../modules/nf-core/bcftools/annotate/main' workflow RANK_VARIANTS { take: - ch_pedfile // channel: [mandatory] [ path(ped) ] - ch_reduced_penetrance // channel: [mandatory] [ path(pentrance) ] - ch_score_config // channel: [mandatory] [ path(ini) ] - ch_vcf // channel: [mandatory] [ val(meta), path(vcf) ] - process_with_sort // Boolean + ch_pedfile // channel: [mandatory] [ path(ped) ] + ch_reduced_penetrance // channel: [mandatory] [ path(pentrance) ] + ch_score_config // channel: [mandatory] [ path(ini) ] + ch_vcf // channel: [mandatory] [ val(meta), path(vcf) ] + process_with_sort // Boolean + rank_with_mivmir_gicam // Boolean + ch_genmod_gicam_score_config // channel: [mandatory if rank_with_mivmir_gicam] [ path(ini) ] main: GENMOD_ANNOTATE(ch_vcf) @@ -29,7 +36,19 @@ workflow RANK_VARIANTS { GENMOD_SCORE(ch_score_in, ch_score_config) - GENMOD_COMPOUND(GENMOD_SCORE.out.vcf) + // Run MIVMIR - GICAM scoring (not supported for MT SNVs and SVs) + if (rank_with_mivmir_gicam) { + // ch_genmod_gicam_score_config is integral to GICAM inference; it cannot be changed without retraining gicam + GENMOD_SCORE_FOR_GICAM(ch_score_in, ch_genmod_gicam_score_config) + + MIVMIR_INFER(GENMOD_SCORE_FOR_GICAM.out.vcf, false) + + GICAM_INFER(MIVMIR_INFER.out.vcf) + + TABIX_BGZIPTABIX_GICAM(GICAM_INFER.out.vcf) + } + + GENMOD_COMPOUND(GENMOD_SCORE.out.vcf) ch_sort_publish = channel.empty() ch_tabix_publish = channel.empty() @@ -47,6 +66,30 @@ workflow RANK_VARIANTS { ch_publish = ch_sort_publish.mix(ch_tabix_publish) + // Merge Genmod and MIVMIR-GICAM scores + if (rank_with_mivmir_gicam) { + ch_vcf + .join(TABIX_BGZIPTABIX_GICAM.out.gz_index, failOnMismatch: true) + .map { meta, vcf_genmod, vcf_gicam, vcf_index_gicam -> + return [meta, vcf_genmod, [], vcf_gicam, vcf_index_gicam, [], [], []] + } + .set { ch_merge_genmod_gicam } + + BCFTOOLS_APPEND_MIVMIR_GICAM(ch_merge_genmod_gicam) + + BCFTOOLS_APPEND_MIVMIR_GICAM.out.vcf + .join(BCFTOOLS_APPEND_MIVMIR_GICAM.out.tbi, failOnMismatch: true) + .set { ch_vcf_annotated_with_mivmir_gicam_gz_tbi } + + ch_vcf_annotated_with_mivmir_gicam_gz_tbi.map { meta, gz, _tbi -> + return [meta, gz] + }.set { ch_vcf } + + ch_vcf_annotated_with_mivmir_gicam_gz_tbi + .map { meta, gz, tbi -> ['rank_and_filter/', [meta, gz, tbi]] } + .set { ch_publish } + } + emit: publish = ch_publish // channel: [ val(destination), val(value) ] vcf = ch_vcf // channel: [ val(meta), path(vcf) ] diff --git a/subworkflows/local/rank_variants/tests/main.nf.test b/subworkflows/local/rank_variants/tests/main.nf.test index 28c9e510f..82bd43cd6 100644 --- a/subworkflows/local/rank_variants/tests/main.nf.test +++ b/subworkflows/local/rank_variants/tests/main.nf.test @@ -29,6 +29,8 @@ nextflow_workflow { file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/vcf/genmod.vcf.gz', checkIfExists: true) ]) input[4] = false + input[5] = false + input[6] = [] """ } } @@ -56,6 +58,8 @@ nextflow_workflow { file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/vcf/genmod.vcf.gz', checkIfExists: true) ]) input[4] = true + input[5] = false + input[6] = [] """ } } @@ -68,4 +72,44 @@ nextflow_workflow { } } + test("RANK_VARIANTS - SNV WITH MIVMIR, GICAM") { + + config "./nextflow.config" + + when { + workflow { + """ + input[0] = channel.value(file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/vcf/ped/justhusky.ped', checkIfExists: true)) + input[1] = channel.value(file(params.pipelines_testdata_base_path + 'reference/reduced_penetrance.tsv', checkIfExists: true)) + // Dummy scoring config, Genmod scoring is not intended test target. + input[2] = channel.value(file(params.pipelines_testdata_base_path + 'reference/rank_model_dummy_mivmir_gicam_unit_test.ini', checkIfExists: true)) + input[3] = channel.of([ + [id:'justhusky', single_end: false], + // Only SNV INDELs supported + file(params.pipelines_testdata_base_path + 'testdata/justhusky_snv_rank_variants_mivmir_gicam.vcf', checkIfExists: true) + ]) + input[4] = false + input[5] = params.rank_with_mivmir_gicam + // ch_genmod_gicam_score_config is integral to GICAM inference; it cannot be changed without retraining gicam + input[6] = channel.fromPath("$projectDir/assets/rank_model_genmod_gicam.ini", checkIfExists: true).collect() + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { + with (path(workflow.out.vcf.get(0).get(1)).vcf.variants) { + for (int i = 0; i < size(); i++) { + assert snapshot(get(i).getAttribute('MivmirScore')).match('MIVMIR_GICAM-' + get(i).getChr() + '-' + get(i).getStart() + '-MivmirScore') + assert snapshot(get(i).getAttribute('MivmirExplanation')).match('MIVMIR_GICAM-' + get(i).getChr() + '-' + get(i).getStart() + '-MivmirExplanation') + assert snapshot(get(i).getAttribute('GicamScore')).match('MIVMIR_GICAM-' + get(i).getChr() + '-' + get(i).getStart() + '-GicamScore') + } + } + } + ) + } + } + } diff --git a/subworkflows/local/rank_variants/tests/main.nf.test.snap b/subworkflows/local/rank_variants/tests/main.nf.test.snap index d1cc348be..e158f8dbe 100644 --- a/subworkflows/local/rank_variants/tests/main.nf.test.snap +++ b/subworkflows/local/rank_variants/tests/main.nf.test.snap @@ -1,5 +1,55 @@ { - "RANK_VARIANTS - SV (process_with_sort = true), stub": { + "MIVMIR_GICAM-21-32863228-MivmirScore": { + "content": [ + "0.00239" + ], + "timestamp": "2026-04-28T16:47:04.194256866", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "MIVMIR_GICAM-21-32863305-MivmirExplanation": { + "content": [ + "[]" + ], + "timestamp": "2026-04-28T16:47:04.228825631", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "MIVMIR_GICAM-21-32863233-GicamScore": { + "content": [ + "0.00239" + ], + "timestamp": "2026-04-28T16:47:04.207766789", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "MIVMIR_GICAM-21-32863244-MivmirExplanation": { + "content": [ + "[]" + ], + "timestamp": "2026-04-28T16:47:04.213435435", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "MIVMIR_GICAM-21-32863253-MivmirScore": { + "content": [ + "0.00239" + ], + "timestamp": "2026-04-28T16:47:04.218931403", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "RANK_VARIANTS - SNV (process_with_sort = false), stub": { "content": [ { "0": [ @@ -10,7 +60,8 @@ "id": "justhusky", "single_end": false }, - "justhusky.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + "justhusky.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "justhusky.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] ], @@ -31,7 +82,8 @@ "id": "justhusky", "single_end": false }, - "justhusky.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + "justhusky.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "justhusky.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] ], @@ -46,13 +98,123 @@ ] } ], - "timestamp": "2026-03-17T13:09:47.357626106", + "timestamp": "2026-03-17T13:09:21.870979671", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" } }, - "RANK_VARIANTS - SNV (process_with_sort = false), stub": { + "MIVMIR_GICAM-21-32863233-MivmirScore": { + "content": [ + "0.00239" + ], + "timestamp": "2026-04-28T16:47:04.202751209", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "MIVMIR_GICAM-21-32863244-MivmirScore": { + "content": [ + "0.00239" + ], + "timestamp": "2026-04-28T16:47:04.210715839", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "MIVMIR_GICAM-21-32863220-GicamScore": { + "content": [ + "0.00239" + ], + "timestamp": "2026-04-28T16:47:04.183010674", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "MIVMIR_GICAM-21-32863228-MivmirExplanation": { + "content": [ + "[]" + ], + "timestamp": "2026-04-28T16:47:04.196990189", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "MIVMIR_GICAM-21-32863253-GicamScore": { + "content": [ + "0.00239" + ], + "timestamp": "2026-04-28T16:47:04.223845885", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "MIVMIR_GICAM-21-32863253-MivmirExplanation": { + "content": [ + "[]" + ], + "timestamp": "2026-04-28T16:47:04.221524987", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "MIVMIR_GICAM-21-32863305-GicamScore": { + "content": [ + "0.00239" + ], + "timestamp": "2026-04-28T16:47:04.231341672", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "MIVMIR_GICAM-21-32863220-MivmirExplanation": { + "content": [ + "[]" + ], + "timestamp": "2026-04-28T16:47:04.174974311", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "MIVMIR_GICAM-21-32863233-MivmirExplanation": { + "content": [ + "[]" + ], + "timestamp": "2026-04-28T16:47:04.205147188", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "MIVMIR_GICAM-21-32863228-GicamScore": { + "content": [ + "0.00239" + ], + "timestamp": "2026-04-28T16:47:04.19987609", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "MIVMIR_GICAM-21-32863244-GicamScore": { + "content": [ + "0.00239" + ], + "timestamp": "2026-04-28T16:47:04.216546153", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "RANK_VARIANTS - SV (process_with_sort = true), stub": { "content": [ { "0": [ @@ -63,8 +225,7 @@ "id": "justhusky", "single_end": false }, - "justhusky.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940", - "justhusky.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + "justhusky.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" ] ] ], @@ -85,8 +246,7 @@ "id": "justhusky", "single_end": false }, - "justhusky.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940", - "justhusky.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + "justhusky.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" ] ] ], @@ -101,10 +261,30 @@ ] } ], - "timestamp": "2026-03-17T13:09:21.870979671", + "timestamp": "2026-03-17T13:09:47.357626106", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" } + }, + "MIVMIR_GICAM-21-32863220-MivmirScore": { + "content": [ + "0.00239" + ], + "timestamp": "2026-04-28T16:47:04.167275806", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "MIVMIR_GICAM-21-32863305-MivmirScore": { + "content": [ + "0.00239" + ], + "timestamp": "2026-04-28T16:47:04.226184722", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } } } \ No newline at end of file diff --git a/subworkflows/local/rank_variants/tests/nextflow.config b/subworkflows/local/rank_variants/tests/nextflow.config new file mode 100644 index 000000000..e3d7c08a8 --- /dev/null +++ b/subworkflows/local/rank_variants/tests/nextflow.config @@ -0,0 +1,50 @@ +// Minimal subset of rank_variants.config SNV set for running MIVMIR, GICAM annotation + +// Feature flag to enable model scoring +params.rank_with_mivmir_gicam = true + +process { + withName: '.*:GENMOD_ANNOTATE' { + ext.prefix = { "${meta.id}_snv_genmod_annotate_${meta.set}" } + ext.args = { [ + '--annotate_regions', + '--genome-build 37', + '--temp_dir ./' + ].join(' ') } + } + + withName: '.*:GENMOD_MODELS' { + ext.prefix = { "${meta.id}_snv_genmod_models_${meta.set}" } + ext.args = "--temp_dir ./" + } + + withName: '.*:GENMOD_SCORE' { + ext.prefix = { "${meta.id}_snv_genmod_score_${meta.set}" } + ext.args = "--rank_results" + } + + withName: '.*:GENMOD_COMPOUND' { + ext.prefix = { "${meta.id}_snv_genmod_compound_${meta.set}" } + ext.args = "--temp_dir ./" + } + + withName: '.*:TABIX_BGZIPTABIX' { + ext.args2 = '-p vcf' + // Avoid BCFTOOLS i/o nameclash by appending genmod_ when ranking with mivmir, gicam + // VCF name is overridden to fit naming convention BCFTOOLS_APPEND_MIVMIR_GICAM + ext.prefix = { "${meta.id}_snv_ranked_${params.rank_with_mivmir_gicam ? 'genmod_': ''}${meta.set}" } + } + + withName: '.*:BCFTOOLS_APPEND_MIVMIR_GICAM' { + ext.args = { [ + '--columns MivmirScore,MivmirExplanation,GicamScore', + '--write-index=tbi', + '--output-type z' + ].join(' ') } + ext.prefix = { "${meta.id}_snv_ranked_${meta.set}" } + } + + withName: '.*:TABIX_BGZIPTABIX_GICAM' { + ext.prefix = { "${meta.id}_snv_ranked_gicam_${meta.set}" } + } +} diff --git a/workflows/raredisease.nf b/workflows/raredisease.nf index 0fc7f9728..6589ac9e4 100644 --- a/workflows/raredisease.nf +++ b/workflows/raredisease.nf @@ -135,6 +135,7 @@ workflow RAREDISEASE { ch_score_config_mt ch_score_config_snv ch_score_config_sv + ch_score_config_genmod_gicam ch_sdf ch_sentieon_pcr_indel_model ch_subdepth @@ -217,6 +218,7 @@ workflow RAREDISEASE { val_multiqc_samples val_outdir val_platform + val_rank_with_mivmir_gicam val_run_mt_for_wes val_run_rtgvcfeval val_run_vcfanno_db_sanity_check @@ -537,7 +539,9 @@ workflow RAREDISEASE { ch_reduced_penetrance, ch_score_config_snv, ch_ranksnv_nuclear_in, - false + false, + val_rank_with_mivmir_gicam, + ch_score_config_genmod_gicam ) ch_rank_snv_publish = RANK_VARIANTS_SNV.out.publish } @@ -608,7 +612,9 @@ workflow RAREDISEASE { ch_reduced_penetrance, ch_score_config_mt, ch_ranksnv_mt_in, - false + false, + false, + [] ) ch_rank_mt_publish = RANK_VARIANTS_MT.out.publish } @@ -734,7 +740,9 @@ workflow RAREDISEASE { ch_reduced_penetrance, ch_score_config_sv, ch_ranksnv_sv_in, - true + true, + false, + [] ) ch_rank_sv_publish = RANK_VARIANTS_SV.out.publish }