diff --git a/CHANGELOG.md b/CHANGELOG.md index f37c1d751..a3f8891a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## 3.1.0 - XXXXX [2026-XX-XX] + +### `Added` + +- FastDup module from nf-core, integrated into the pipeline as an alternative to Picard MarkDuplicates. #XXX + +### Parameters + +| Old parameter | New parameter | +| ------------------- | --------------------------- | +| | duplicates_marker | + ## 3.0.0 - Mario [2026-05-12] ### `Added` diff --git a/conf/modules/align_bwa_bwamem2_bwameme.config b/conf/modules/align_bwa_bwamem2_bwameme.config index da2cd83ff..52e022661 100644 --- a/conf/modules/align_bwa_bwamem2_bwameme.config +++ b/conf/modules/align_bwa_bwamem2_bwameme.config @@ -58,4 +58,9 @@ process { ext.args = "--TMP_DIR ." ext.prefix = { "${meta.id}_sorted_md" } } -} + + withName: '.*ALIGN:ALIGN_BWA_BWAMEM2_BWAMEME:FASTDUP' { + ext.args = "--create-index" + ext.prefix = { "${meta.id}_sorted_md" } + } +} diff --git a/docs/usage.md b/docs/usage.md index e5963d4d2..ce908c2ef 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -10,25 +10,25 @@ Table of contents: - [Run nf-core/raredisease with test data](#run-nf-coreraredisease-with-test-data) - [Updating the pipeline](#updating-the-pipeline) - [Run nf-core/raredisease with your data](#run-nf-coreraredisease-with-your-data) - - [Samplesheet](#samplesheet) - - [Samplesheet for BAM file input](#samplesheet-for-bam-file-input) - - [Reference files and parameters](#reference-files-and-parameters) - - [1. Alignment](#1-alignment) - - [2. QC stats from the alignment files](#2-qc-stats-from-the-alignment-files) - - [3. Repeat expansions](#3-repeat-expansions) - - [4. Variant calling - SNV](#4-variant-calling---snv) - - [5. 
Variant calling - Structural variants](#5-variant-calling---structural-variants) - - [6. Copy number variant calling](#6-copy-number-variant-calling) - - [7. SNV annotation \& Ranking](#7-snv-annotation--ranking) - - [8. SV annotation \& Ranking](#8-sv-annotation--ranking) - - [9. Mitochondrial annotation](#9-mitochondrial-annotation) - - [10. Mobile element calling](#10-mobile-element-calling) - - [11. Mobile element annotation](#11-mobile-element-annotation) - - [12. Variant evaluation](#12-variant-evaluation) - - [13. Prepare data for CNV visualisation in Gens](#13-prepare-data-for-cnv-visualisation-in-gens) - - [Run the pipeline](#run-the-pipeline) - - [Direct input in CLI](#direct-input-in-cli) - - [Import from a config file (recommended)](#import-from-a-config-file-recommended) + - [Samplesheet](#samplesheet) + - [Samplesheet for BAM file input](#samplesheet-for-bam-file-input) + - [Reference files and parameters](#reference-files-and-parameters) + - [1. Alignment](#1-alignment) + - [2. QC stats from the alignment files](#2-qc-stats-from-the-alignment-files) + - [3. Repeat expansions](#3-repeat-expansions) + - [4. Variant calling - SNV](#4-variant-calling---snv) + - [5. Variant calling - Structural variants](#5-variant-calling---structural-variants) + - [6. Copy number variant calling](#6-copy-number-variant-calling) + - [7. SNV annotation \& Ranking](#7-snv-annotation--ranking) + - [8. SV annotation \& Ranking](#8-sv-annotation--ranking) + - [9. Mitochondrial annotation](#9-mitochondrial-annotation) + - [10. Mobile element calling](#10-mobile-element-calling) + - [11. Mobile element annotation](#11-mobile-element-annotation) + - [12. Variant evaluation](#12-variant-evaluation) + - [13. 
Prepare data for CNV visualisation in Gens](#13-prepare-data-for-cnv-visualisation-in-gens) + - [Run the pipeline](#run-the-pipeline) + - [Direct input in CLI](#direct-input-in-cli) + - [Import from a config file (recommended)](#import-from-a-config-file-recommended) - [Best practices](#best-practices) - [Core Nextflow arguments](#core-nextflow-arguments) - [`-profile`](#-profile) @@ -211,6 +211,7 @@ The mandatory and optional parameters for each category are tabulated below. | | extract_alignments | | | restrict_to_contigs7 | | | exclude_alt8 | +| | duplicates_marker9 | 1Default value is bwamem2. Other alternatives are bwa, bwameme and sentieon (requires valid Sentieon license ).
2Analysis set reference genome in fasta format, first 25 contigs need to be chromosome 1-22, X, Y and the mitochondria.
@@ -220,6 +221,7 @@ The mandatory and optional parameters for each category are tabulated below. 6Default value is 40. Used only by fastp.
7Used to limit your analysis to specific contigs. Can be used to remove alignments to unplaced contigs to minimize potential errors. This parameter should be used in conjunction with the `extract_alignments` parameter.
8When set to true, alignments to alt/unplaced contigs are removed after alignment using samtools view, retaining only primary chromosomes (GRCh37: 1-22,X,Y,MT / GRCh38: chr1-chr22,chrX,chrY,chrM). Note that this will affect all downstream variant calling, as variants will only be called on these primary chromosomes.
+9Default value is "markduplicates". The alternative is "fastdup".
##### 2. QC stats from the alignment files diff --git a/main.nf b/main.nf index f847e20e6..683b10da6 100644 --- a/main.nf +++ b/main.nf @@ -52,6 +52,7 @@ workflow NFCORE_RAREDISEASE { val_call_interval val_concatenate_snv_calls val_skip_split_multiallelics + val_duplicates_marker val_exclude_alt val_extract_alignments val_fai @@ -481,6 +482,7 @@ workflow NFCORE_RAREDISEASE { val_cadd_resources, val_concatenate_snv_calls, val_skip_split_multiallelics, + val_duplicates_marker, val_exclude_alt, val_extract_alignments, val_genome, @@ -570,6 +572,7 @@ workflow { params.call_interval, params.concatenate_snv_calls, params.skip_split_multiallelics, + params.duplicates_marker, params.exclude_alt, params.extract_alignments, params.fai, diff --git a/modules.json b/modules.json index 4bb0c67b5..63e72f7f2 100644 --- a/modules.json +++ b/modules.json @@ -146,6 +146,11 @@ "git_sha": "2ad28db4a5a82972c1210dfa7c85f035bb80c4de", "installed_by": ["modules"] }, + "fastdup": { + "branch": "master", + "git_sha": "4aab34a29f9ca1730e2f7d194261f5145f53d56d", + "installed_by": ["modules"] + }, "fastp": { "branch": "master", "git_sha": "a331ecfd1aa48b2b2298aab23bb4516c800e410b", diff --git a/modules/nf-core/fastdup/environment.yml b/modules/nf-core/fastdup/environment.yml new file mode 100644 index 000000000..ab55fff04 --- /dev/null +++ b/modules/nf-core/fastdup/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::fastdup=1.0.0 diff --git a/modules/nf-core/fastdup/main.nf b/modules/nf-core/fastdup/main.nf new file mode 100644 index 000000000..f0cf9b977 --- /dev/null +++ b/modules/nf-core/fastdup/main.nf @@ -0,0 +1,52 @@ +process FASTDUP { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && 
!task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c5/c55070589353b3e1837ca3414c4f182d3674cbf55a64edee07e8bf75370762a9/data': + 'community.wave.seqera.io/library/fastdup:1.0.0--a9b28abff06bb2bb' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.bam"), emit: bam + tuple val(meta), path("*.metrics.txt"), emit: metrics + tuple val(meta), path("*.bai"), emit: bai, optional: true + tuple val(meta), path("*.csi"), emit: csi, optional: true + tuple val("${task.process}"), val('fastdup'), eval("fastdup --version"), topic: versions, emit: versions_fastdup + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ("${reads}" == "${prefix}.bam") { + error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!") + } + """ + fastdup \\ + $args \\ + --input $reads \\ + --metrics ${prefix}.metrics.txt \\ + --output ${prefix}.bam \\ + --num-threads $task.cpus + + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def index_command = args.contains("--index-format CSI") ? "touch ${prefix}.csi" + : args.contains("--create-index") ? "touch ${prefix}.bai" : "" + + """ + + touch ${prefix}.bam + ${index_command} + touch ${prefix}.metrics.txt + + """ +} diff --git a/modules/nf-core/fastdup/meta.yml b/modules/nf-core/fastdup/meta.yml new file mode 100644 index 000000000..cdc4496ca --- /dev/null +++ b/modules/nf-core/fastdup/meta.yml @@ -0,0 +1,101 @@ +name: "fastdup" +description: "FastDup is a tool designed to locate and tag duplicate reads in a coordinate-sorted + SAM or BAM file, using the same core algorithm as Picard MarkDuplicates." 
+keywords: + - duplicate + - BAM + - reads +tools: + - "fastdup": + description: "FastDup is a tool designed to locate and tag duplicate reads in + a coordinate-sorted SAM or BAM file, using the same core algorithm as Picard + MarkDuplicates." + homepage: "https://github.com/zzhofict/FastDup" + documentation: "https://github.com/zzhofict/FastDup" + tool_dev_url: "https://github.com/zzhofict/FastDup" + doi: "10.1093/bioinformatics/btaf633" + licence: + - "MIT" + identifier: "biotools:fastdup" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - reads: + type: file + description: Sequence reads file, can be SAM/BAM format + pattern: "*.{bam,sam}" + ontologies: [] +output: + bam: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - "*.bam": + type: file + description: BAM file with duplicate reads marked/removed. + pattern: "*.{bam}" + ontologies: + - edam: "http://edamontology.org/format_2572" + versions_fastdup: + - - ${task.process}: + type: string + description: The name of the process + - fastdup: + type: string + description: The name of the tool + - fastdup --version: + type: eval + description: The expression to obtain the version of the tool + metrics: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - "*.metrics.txt": + type: file + description: Duplicate metrics file generated by fastdup. + pattern: "*.{metrics.txt}" + ontologies: [] + bai: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - "*.bai": + type: file + description: BAI index file generated by fastdup. + pattern: "*.{bai}" + ontologies: [] + csi: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - "*.csi": + type: file + description: CSI index file generated by fastdup. 
+ pattern: "*.{csi}" + ontologies: [] +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process + - fastdup: + type: string + description: The name of the tool + - fastdup --version: + type: eval + description: The expression to obtain the version of the tool +authors: + - "@emmadizdarevic" +maintainers: + - "@emmadizdarevic" diff --git a/modules/nf-core/fastdup/tests/main.nf.test b/modules/nf-core/fastdup/tests/main.nf.test new file mode 100644 index 000000000..5933c660b --- /dev/null +++ b/modules/nf-core/fastdup/tests/main.nf.test @@ -0,0 +1,138 @@ +nextflow_process { + + name "Test Process FASTDUP" + script "../main.nf" + process "FASTDUP" + + tag "modules" + tag "modules_nfcore" + tag "fastdup" + + test("sarscov2 - bam") { + + when { + process { + """ + + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + ] + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + bam(process.out.bam[0][1]).getReadsMD5(), + file(process.out.metrics[0][1]).readLines()[0..2], + process.out.bai, + process.out.csi, + process.out.findAll { key, val -> key.startsWith("versions") } + + ).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + ] + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + process.out + ).match() } + ) + } + + } + + test("sarscov2 - bam - create bai index") { + + config "./nextflow.config" + + when { + params{ + module_args = "--create-index" + } + process { + """ + + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + ] + """ + } + } + + 
then { + assert process.success + assertAll( + { assert snapshot( + bam(process.out.bam[0][1]).getReadsMD5(), + file(process.out.metrics[0][1]).name, + process.out.bai, + process.out.csi, + process.out.findAll { key, val -> key.startsWith("versions") } + + ).match() } + ) + } + + } + + test("sarscov2 - bam - create csi index") { + + config "./nextflow.config" + + when { + params{ + module_args = "--create-index --index-format CSI" + } + process { + """ + + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + ] + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + bam(process.out.bam[0][1]).getReadsMD5(), + file(process.out.metrics[0][1]).name, + process.out.bai, + process.out.csi, + process.out.findAll { key, val -> key.startsWith("versions") } + + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/fastdup/tests/main.nf.test.snap b/modules/nf-core/fastdup/tests/main.nf.test.snap new file mode 100644 index 000000000..2342099cf --- /dev/null +++ b/modules/nf-core/fastdup/tests/main.nf.test.snap @@ -0,0 +1,163 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + [ + "FASTDUP", + "fastdup", + "1.0.0" + ] + ], + "bai": [ + + ], + "bam": [ + [ + { + "id": "test" + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "csi": [ + + ], + "metrics": [ + [ + { + "id": "test" + }, + "test.metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_fastdup": [ + [ + "FASTDUP", + "fastdup", + "1.0.0" + ] + ] + } + ], + "timestamp": "2026-06-02T13:15:04.742292", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "sarscov2 - bam - create csi index": { + 
"content": [ + "894549ee3ced6b5ca2eed2563a985217", + "test.metrics.txt", + [ + + ], + [ + [ + { + "id": "test" + }, + "test.bam.csi:md5,afdc79c180dd73c1c8dadbf65af0788e" + ] + ], + { + "versions_fastdup": [ + [ + "FASTDUP", + "fastdup", + "1.0.0" + ] + ] + } + ], + "timestamp": "2026-06-03T10:08:41.493463", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "sarscov2 - bam": { + "content": [ + "894549ee3ced6b5ca2eed2563a985217", + [ + "## StringHeader", + "# fastdup --input test.paired_end.sorted.bam --metrics test.metrics.txt --output test.bam --num-threads 2", + "## StringHeader" + ], + [ + + ], + [ + + ], + { + "versions_fastdup": [ + [ + "FASTDUP", + "fastdup", + "1.0.0" + ] + ] + } + ], + "timestamp": "2026-06-03T10:11:58.499113", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "sarscov2 - bam - create bai index": { + "content": [ + "894549ee3ced6b5ca2eed2563a985217", + "test.metrics.txt", + [ + [ + { + "id": "test" + }, + "test.bam.bai:md5,412de50af8da0544bf151011ae739a2d" + ] + ], + [ + + ], + { + "versions_fastdup": [ + [ + "FASTDUP", + "fastdup", + "1.0.0" + ] + ] + } + ], + "timestamp": "2026-06-03T10:08:38.477602", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/fastdup/tests/nextflow.config b/modules/nf-core/fastdup/tests/nextflow.config new file mode 100644 index 000000000..7dafe3ea2 --- /dev/null +++ b/modules/nf-core/fastdup/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: 'FASTDUP' { + ext.args = {params.module_args} + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 4baa3cf23..14e760061 100644 --- a/nextflow.config +++ b/nextflow.config @@ -110,6 +110,7 @@ params { // Alignment aligner = 'bwamem2' mt_aligner = 'bwamem2' + duplicates_marker = 'markduplicates' mbuffer_mem = 3072 samtools_sort_threads = 4 min_trimmed_length = 40 diff --git a/nextflow_schema.json b/nextflow_schema.json 
index a5b135431..7daa5813f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -644,6 +644,13 @@ "fa_icon": "fas fa-align-center", "enum": ["bwa", "bwamem2", "sentieon"] }, + "duplicates_marker": { + "type": "string", + "default": "markduplicates", + "description": "Specifies the tool to use for marking duplicates. The default is 'markduplicates', another option is the faster alternative 'fastdup'.", + "fa_icon": "fas fa-align-center", + "enum": ["markduplicates", "fastdup"] + }, "samtools_sort_threads": { "type": "integer", "default": 4, diff --git a/subworkflows/local/align/main.nf b/subworkflows/local/align/main.nf index 932ace4f8..6547b0741 100644 --- a/subworkflows/local/align/main.nf +++ b/subworkflows/local/align/main.nf @@ -35,6 +35,7 @@ workflow ALIGN { skip_fastp // boolean val_aligner // string: 'bwa', 'bwamem2', 'bwameme', or 'sentieon' val_analysis_type // string: 'wgs', 'wes', or 'mito' + val_duplicates_marker // string: 'markduplicates' or 'fastdup', default: 'markduplicates' val_exclude_alt // boolean val_extract_alignments // boolean val_mbuffer_mem // integer: [mandatory] memory in megabytes @@ -100,6 +101,7 @@ workflow ALIGN { ch_genome_fasta, ch_input_reads, val_aligner, + val_duplicates_marker, val_extract_alignments, val_mbuffer_mem, val_platform, diff --git a/subworkflows/local/align/tests/main.nf.test b/subworkflows/local/align/tests/main.nf.test index 7a8e427ac..478002a04 100644 --- a/subworkflows/local/align/tests/main.nf.test +++ b/subworkflows/local/align/tests/main.nf.test @@ -113,15 +113,16 @@ nextflow_workflow { input[18] = false input[19] = "bwamem2" input[20] = "wgs" - input[21] = true - input[22] = false - input[23] = 3072 - input[24] = "bwamem2" - input[25] = "illumina" - input[26] = false - input[27] = 4 - input[28] = true + input[21] = "markduplicates" + input[22] = true + input[23] = false + input[24] = 3072 + input[25] = "bwamem2" + input[26] = "illumina" + input[27] = false + input[28] = 4 input[29] = true + 
input[30] = true """ } } @@ -220,15 +221,16 @@ nextflow_workflow { input[18] = false input[19] = "bwamem2" input[20] = "wes" - input[21] = false + input[21] = "markduplicates" input[22] = false - input[23] = 3072 - input[24] = "bwamem2" - input[25] = "illumina" - input[26] = false - input[27] = 4 - input[28] = true - input[29] = false + input[23] = false + input[24] = 3072 + input[25] = "bwamem2" + input[26] = "illumina" + input[27] = false + input[28] = 4 + input[29] = true + input[30] = false """ } } @@ -318,15 +320,16 @@ nextflow_workflow { input[18] = true input[19] = "bwameme" input[20] = "wgs" - input[21] = false + input[21] = "markduplicates" input[22] = false - input[23] = 3072 - input[24] = "bwamem2" - input[25] = "illumina" - input[26] = false - input[27] = 4 - input[28] = true - input[29] = false + input[23] = false + input[24] = 3072 + input[25] = "bwamem2" + input[26] = "illumina" + input[27] = false + input[28] = 4 + input[29] = true + input[30] = false """ } } diff --git a/subworkflows/local/align_bwa_bwamem2_bwameme/main.nf b/subworkflows/local/align_bwa_bwamem2_bwameme/main.nf index 6ea0434de..69eaffb45 100644 --- a/subworkflows/local/align_bwa_bwamem2_bwameme/main.nf +++ b/subworkflows/local/align_bwa_bwamem2_bwameme/main.nf @@ -5,6 +5,7 @@ include { BWAMEM2_MEM } from '../../../modules/nf-core/bwamem2/mem/main' include { BWAMEME_MEM } from '../../../modules/nf-core/bwameme/mem/main' include { BWA_MEM as BWA } from '../../../modules/nf-core/bwa/mem/main' +include { FASTDUP } from '../../../modules/nf-core/fastdup/main' include { PICARD_MARKDUPLICATES as MARKDUPLICATES } from '../../../modules/nf-core/picard/markduplicates/main' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_ALIGN } from '../../../modules/nf-core/samtools/index/main' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_EXTRACT } from '../../../modules/nf-core/samtools/index/main' @@ -23,6 +24,7 @@ workflow ALIGN_BWA_BWAMEM2_BWAMEME { ch_genome_fasta // channel: [mandatory] [ val(meta), 
path(fasta) ] ch_input_reads // channel: [mandatory] [ val(meta), path(reads_input) ] val_aligner // string: 'bwa', 'bwamem2', 'bwameme', or 'sentieon' + val_duplicates_marker // string: 'markduplicates' or 'fastdup', default: 'markduplicates' val_extract_alignments // boolean val_mbuffer_mem // integer: [mandatory] default: 3072 val_platform // string: [mandatory] default: illumina @@ -74,19 +76,35 @@ workflow ALIGN_BWA_BWAMEM2_BWAMEME { } // Marking duplicates - MARKDUPLICATES ( prepared_bam , ch_genome_fasta, ch_genome_fai ) - SAMTOOLS_INDEX_MARKDUP ( MARKDUPLICATES.out.bam ) + if (val_duplicates_marker == "markduplicates") { + MARKDUPLICATES ( prepared_bam, ch_genome_fasta, ch_genome_fai ) + SAMTOOLS_INDEX_MARKDUP (MARKDUPLICATES.out.bam) + + ch_marked_bam = MARKDUPLICATES.out.bam + ch_marked_bai = SAMTOOLS_INDEX_MARKDUP.out.bai + ch_marked_csi = SAMTOOLS_INDEX_MARKDUP.out.csi + ch_metrics = MARKDUPLICATES.out.metrics + } else { + FASTDUP ( prepared_bam ) + ch_marked_bam = FASTDUP.out.bam + ch_marked_bai = FASTDUP.out.bai + ch_marked_csi = FASTDUP.out.csi + ch_metrics = FASTDUP.out.metrics + } + + ch_publish = ch_marked_bam + .mix(ch_metrics) + .mix(ch_marked_bai) + .mix(ch_marked_csi) + .map {meta, value -> ['alignment/', [meta, value]] } - ch_publish = MARKDUPLICATES.out.bam - .mix(MARKDUPLICATES.out.metrics) - .mix(SAMTOOLS_INDEX_MARKDUP.out.bai) - .mix(SAMTOOLS_INDEX_MARKDUP.out.csi) - .map { meta, value -> ['alignment/', [meta, value]] } emit: - marked_bai = SAMTOOLS_INDEX_MARKDUP.out.bai // channel: [ val(meta), path(bai) ] - marked_bam = MARKDUPLICATES.out.bam // channel: [ val(meta), path(bam) ] - metrics = MARKDUPLICATES.out.metrics // channel: [ val(meta), path(metrics) ] - stats = SAMTOOLS_STATS.out.stats // channel: [ val(meta), path(stats) ] - publish = ch_publish // channel: [ val(destination), val(value) ] + + marked_bam = ch_marked_bam // = MARKDUPLICATES.out.bam or FASTDUP.out.bam // channel: [ val(meta), path(bam) ] + marked_bai = 
ch_marked_bai // = SAMTOOLS_INDEX_MARKDUP.out.bai or FASTDUP.out.bai // channel: [ val(meta), path(bai) ] + marked_csi = ch_marked_csi // = SAMTOOLS_INDEX_MARKDUP.out.csi or FASTDUP.out.csi // channel: [ val(meta), path(csi) ] + metrics = ch_metrics // = MARKDUPLICATES.out.metrics or FASTDUP.out.metrics // channel: [ val(meta), path(metrics) ] + stats = SAMTOOLS_STATS.out.stats // channel: [ val(meta), path(stats) ] + publish = ch_publish // channel: [ val(destination), val(value) ] } diff --git a/subworkflows/local/align_bwa_bwamem2_bwameme/tests/main.nf.test b/subworkflows/local/align_bwa_bwamem2_bwameme/tests/main.nf.test index 7e64d0355..fcc437987 100644 --- a/subworkflows/local/align_bwa_bwamem2_bwameme/tests/main.nf.test +++ b/subworkflows/local/align_bwa_bwamem2_bwameme/tests/main.nf.test @@ -10,6 +10,7 @@ nextflow_workflow { tag "bwa/mem" tag "bwamem2/mem" tag "bwameme/mem" + tag "fastdup" tag "samtools/index" tag "samtools/stats" tag "samtools/merge" @@ -54,10 +55,12 @@ nextflow_workflow { ] ]) input[6] = "bwamem2" - input[7] = true - input[8] = 3072 - input[9] = "illumina" - input[10] = 4 + input[7] = "markduplicates" + input[8] = true + input[9] = 3072 + input[10] = "illumina" + input[11] = 4 + """ } } @@ -114,10 +117,137 @@ nextflow_workflow { ] ]) input[6] = "bwameme" - input[7] = true - input[8] = 3072 - input[9] = "illumina" - input[10] = 4 + input[7] = "markduplicates" + input[8] = true + input[9] = 3072 + input[10] = "illumina" + input[11] = 4 + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out.publish.flatten() + .findAll { it instanceof String && (it.startsWith('/') || it.contains('.')) } + .collect { new File(it).name } + .sort(), + workflow.out.marked_bam.collect { meta, bamfile -> [ meta, bam(bamfile).getHeaderMD5() ] }, + workflow.out.marked_bam.collect { meta, bamfile -> [ meta, bam(bamfile).getReadsMD5() ] } + ).match() + } + ) + } + } + + + test("align bwameme, FASTDUP with bai index") { 
+ + setup { + run("BWAMEME_INDEX") { + script "modules/nf-core/bwameme/index/main.nf" + process { + """ + input[0] = channel.of([ + [id:'sarscov2'], + file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + } + + when { + params { + sarscov_testdata_base_path= 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' + restrict_to_contigs = "MT192765.1" + fastdup_args = "--create-index" + } + workflow { + """ + input[0] = [[:],[]] + input[1] = [[:],[]] + input[2] = BWAMEME_INDEX.out.index + input[3] = channel.of([[id:'sarscov2'], [file(params.sarscov_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true)]]) + input[4] = channel.of([[id:'sarscov2'], [file(params.sarscov_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)]]) + input[5] = channel.of([ + [ id:'test', sample:'test', single_end:false, num_lanes:1, read_group:"\'@RG\\\\tID:test\\\\tPL:illumina\\\\tSM:test\'" ], // meta map + [ + file(params.sarscov_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.sarscov_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ]) + input[6] = "bwameme" + input[7] = "fastdup" + input[8] = true + input[9] = 3072 + input[10] = "illumina" + input[11] = 4 + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out.publish.flatten() + .findAll { it instanceof String && (it.startsWith('/') || it.contains('.')) } + .collect { new File(it).name } + .sort(), + workflow.out.marked_bam.collect { meta, bamfile -> [ meta, bam(bamfile).getHeaderMD5() ] }, + workflow.out.marked_bam.collect { meta, bamfile -> [ meta, bam(bamfile).getReadsMD5() ] } + ).match() + } + ) + } + } + + + test("align bwameme, FASTDUP with csi index") { + + setup { + run("BWAMEME_INDEX") { + 
script "modules/nf-core/bwameme/index/main.nf" + process { + """ + input[0] = channel.of([ + [id:'sarscov2'], + file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + } + + when { + params { + sarscov_testdata_base_path= 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' + restrict_to_contigs = "MT192765.1" + fastdup_args = "--create-index --index-format CSI" + } + workflow { + """ + input[0] = [[:],[]] + input[1] = [[:],[]] + input[2] = BWAMEME_INDEX.out.index + input[3] = channel.of([[id:'sarscov2'], [file(params.sarscov_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true)]]) + input[4] = channel.of([[id:'sarscov2'], [file(params.sarscov_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)]]) + input[5] = channel.of([ + [ id:'test', sample:'test', single_end:false, num_lanes:1, read_group:"\'@RG\\\\tID:test\\\\tPL:illumina\\\\tSM:test\'" ], // meta map + [ + file(params.sarscov_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.sarscov_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ]) + input[6] = "bwameme" + input[7] = "fastdup" + input[8] = true + input[9] = 3072 + input[10] = "illumina" + input[11] = 4 """ } } diff --git a/subworkflows/local/align_bwa_bwamem2_bwameme/tests/main.nf.test.snap b/subworkflows/local/align_bwa_bwamem2_bwameme/tests/main.nf.test.snap index 7632f79b8..920d49646 100644 --- a/subworkflows/local/align_bwa_bwamem2_bwameme/tests/main.nf.test.snap +++ b/subworkflows/local/align_bwa_bwamem2_bwameme/tests/main.nf.test.snap @@ -43,6 +43,50 @@ "nextflow": "25.10.4" } }, + "align bwameme, FASTDUP with bai index": { + "content": [ + [ + "test_sorted_md.bam", + "test_sorted_md.bam.bai", + "test_sorted_md.metrics.txt" + ], + [ + [ + { + "groupSize": 1, + 
"groupTarget": { + "id": "test", + "sample": "test", + "single_end": false, + "num_lanes": 1, + "read_group": "'@RG\\tID:test\\tPL:illumina\\tSM:test'" + } + }, + "785f016d043339de0df488fa1bb3a16" + ] + ], + [ + [ + { + "groupSize": 1, + "groupTarget": { + "id": "test", + "sample": "test", + "single_end": false, + "num_lanes": 1, + "read_group": "'@RG\\tID:test\\tPL:illumina\\tSM:test'" + } + }, + "af8628d9df18b2d3d4f6fd47ef2bb872" + ] + ] + ], + "timestamp": "2026-06-09T13:46:05.133199", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, "align bwameme": { "content": [ [ @@ -86,5 +130,49 @@ "nf-test": "0.9.4", "nextflow": "25.10.4" } + }, + "align bwameme, FASTDUP with csi index": { + "content": [ + [ + "test_sorted_md.bam", + "test_sorted_md.bam.csi", + "test_sorted_md.metrics.txt" + ], + [ + [ + { + "groupSize": 1, + "groupTarget": { + "id": "test", + "sample": "test", + "single_end": false, + "num_lanes": 1, + "read_group": "'@RG\\tID:test\\tPL:illumina\\tSM:test'" + } + }, + "6fbed72e21c5b2bcb981e1dded632829" + ] + ], + [ + [ + { + "groupSize": 1, + "groupTarget": { + "id": "test", + "sample": "test", + "single_end": false, + "num_lanes": 1, + "read_group": "'@RG\\tID:test\\tPL:illumina\\tSM:test'" + } + }, + "af8628d9df18b2d3d4f6fd47ef2bb872" + ] + ] + ], + "timestamp": "2026-06-09T13:46:33.608928", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } } } \ No newline at end of file diff --git a/subworkflows/local/align_bwa_bwamem2_bwameme/tests/nextflow.config b/subworkflows/local/align_bwa_bwamem2_bwameme/tests/nextflow.config index 4563b2d78..c2ecd4a69 100644 --- a/subworkflows/local/align_bwa_bwamem2_bwameme/tests/nextflow.config +++ b/subworkflows/local/align_bwa_bwamem2_bwameme/tests/nextflow.config @@ -34,4 +34,9 @@ process { ext.prefix = { "${meta.id}_sorted_md" } } + withName: 'FASTDUP' { + ext.args = {params.fastdup_args} + ext.prefix = { "${meta.id}_sorted_md" } + + } } diff --git a/workflows/raredisease.nf 
b/workflows/raredisease.nf index b3b1c1470..5ebd39031 100644 --- a/workflows/raredisease.nf +++ b/workflows/raredisease.nf @@ -183,6 +183,7 @@ workflow RAREDISEASE { val_cadd_resources val_concatenate_snv_calls val_skip_split_multiallelics + val_duplicates_marker val_exclude_alt val_extract_alignments val_genome @@ -310,6 +311,7 @@ workflow RAREDISEASE { skip_fastp, val_aligner, val_analysis_type, + val_duplicates_marker, val_exclude_alt, val_extract_alignments, val_mbuffer_mem,