From f9fec1bbf7ed146432c37dbec4288ecbc2de1214 Mon Sep 17 00:00:00 2001 From: SHY Date: Wed, 13 May 2026 21:12:08 +0800 Subject: [PATCH 1/3] Support user-defined BSgenome references --- .gitignore | 1 + DESCRIPTION | 5 +- NAMESPACE | 3 + R/callMotif.R | 6 +- R/callTrinucleotide.R | 9 ++- R/internalMutFunctions.R | 108 +++++++++++++++++++++++++-- R/readBam.R | 86 ++++++++++++++------- R/readGALP.R | 77 ++++++++++++------- R/read_bam_insert_metrics.R | 22 +++--- README.md | 2 +- man/callMotif.Rd | 5 ++ man/callTrinucleotide.Rd | 6 +- man/readBam.Rd | 22 +++--- man/readGALP.Rd | 21 +++--- man/read_bam_insert_metrics.Rd | 19 ++--- tests/testthat/test_readBam_genome.R | 34 +++++++++ 16 files changed, 312 insertions(+), 114 deletions(-) create mode 100644 tests/testthat/test_readBam_genome.R diff --git a/.gitignore b/.gitignore index 1926064..e5f7da7 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ cfDNAPro.Rproj .rproj .rproj.user .rhistory +.cursor/ doc Meta ..Rcheck diff --git a/DESCRIPTION b/DESCRIPTION index 4ad3eef..cd81586 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: cfDNAPro Type: Package Title: cfDNAPro extracts and Visualises biological features from whole genome sequencing data of cell-free DNA -Version: 1.7.3 +Version: 1.7.4 biocViews: Visualization, Sequencing, WholeGenome Authors@R: c( person("Haichao","Wang", email="hw538@cam.ac.uk", role =c("aut","cre")), @@ -35,6 +35,7 @@ Imports: Rsamtools (>= 2.4.0), Biostrings, rlang (>= 0.4.0), + BSgenome, BSgenome.Hsapiens.UCSC.hg38, BSgenome.Hsapiens.UCSC.hg19, BSgenome.Hsapiens.NCBI.GRCh38, @@ -58,8 +59,8 @@ Suggests: testthat License: GPL-3 Encoding: UTF-8 -RoxygenNote: 7.3.1 VignetteBuilder: knitr URL: https://github.com/hw538/cfDNAPro BugRePORTS: https://github.com/hw538/cfDNAPro/issues BiocType: Software +Config/roxygen2/version: 8.0.0 diff --git a/NAMESPACE b/NAMESPACE index 7f8f3be..665ece9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -96,9 +96,11 @@ importFrom(Rsamtools,pileup) importFrom(Rsamtools,scanBam) importFrom(Rsamtools,scanBamFlag) importFrom(Rsamtools,testPairedEndBam) +importFrom(S4Vectors,"metadata<-") importFrom(S4Vectors,Rle) importFrom(S4Vectors,isSingleStringOrNA) importFrom(S4Vectors,mcols) +importFrom(S4Vectors,metadata) importFrom(dplyr,across) importFrom(dplyr,all_of) importFrom(dplyr,arrange) @@ -176,5 +178,6 @@ importFrom(tidyr,expand_grid) importFrom(tidyr,pivot_longer) importFrom(tidyr,replace_na) importFrom(tidyr,unite) +importFrom(utils,packageName) importFrom(utils,read.table) importFrom(utils,write.table) diff --git a/R/callMotif.R b/R/callMotif.R index b6f3bc4..cccfd11 100644 --- a/R/callMotif.R +++ b/R/callMotif.R @@ -23,6 +23,9 @@ #' fragments/read-pairs that do not overlap indicated mutation loci. #' @param downsample_ref Logical, if TRUE, downsamples reference motifs to match #' the count of mutant motifs. +#' @param genome Optional `BSgenome` passed to `get_genome_reference()`; use when +#' fragments were built with `genome_label = "user_define"` and the object +#' cannot be restored from package metadata, or to override inference. #' @param ... Additional parameters passed to underlying functions. #' #' @return Returns a tibble containing the results of the motif analysis. @@ -37,6 +40,7 @@ callMotif <- function(frag_obj, integrate_mut = FALSE, ref_type = "locus_fragment", downsample_ref = FALSE, + genome = NULL, ...) { motif_length <- as.numeric(motif_length) @@ -44,7 +48,7 @@ callMotif <- function(frag_obj, message("Started to extract ", paste0(motif_type, motif_length), " motif...") # Select the appropriate genome reference - bsgenome_obj <- get_genome_reference(frag_obj) + bsgenome_obj <- get_genome_reference(frag_obj, genome = genome) genome_label <- bsgenome_obj@metadata$genome diff --git a/R/callTrinucleotide.R b/R/callTrinucleotide.R index 9af8b40..c3d28ff 100644 --- a/R/callTrinucleotide.R +++ b/R/callTrinucleotide.R @@ -26,19 +26,22 @@ #' @importFrom magrittr '%>%' #' #' @param frag_obj_mut GRanges object containing fragment and mutation data. +#' @param genome Optional `BSgenome` passed to `get_genome_reference()`; use when +#' fragments were built with `genome_label = "user_define"` and the object +#' cannot be restored from package metadata, or to override inference. #' #' @return dataframe with summarised mutational and trinucleotide data. #' @export #' #' @examples #' trinuc_df <- callTrinucleotide(frag_obj_mut) -callTrinucleotide <- function(frag_obj_mut) { +callTrinucleotide <- function(frag_obj_mut, genome = NULL) { # Check if GRanges has mutational information check_mutation_in_metadata(frag_obj = frag_obj_mut) # Get the appropriate genome reference - genome <- get_genome_reference(frag_obj_mut) + bsgenome_obj <- get_genome_reference(frag_obj_mut, genome = genome) # Convert relevant data to dataframe gr_df <- prepare_data(frag_obj_mut) @@ -68,7 +71,7 @@ callTrinucleotide <- function(frag_obj_mut) { merged_table_df <- update_consensus_mismatch(merged_table_df, gr_df, mapping) # Add the SBS 96 annotation for consensus locus-based mutations - merged_table_df <- get_trinucleotide(merged_table_df, genome) + merged_table_df <- get_trinucleotide(merged_table_df, bsgenome_obj) return(merged_table_df) } diff --git a/R/internalMutFunctions.R b/R/internalMutFunctions.R index 75c1e50..3b64f3d 100644 --- a/R/internalMutFunctions.R +++ b/R/internalMutFunctions.R @@ -1633,25 +1633,119 @@ create_empty_galp <- function() { +#' Resolve package and export name for a BSgenome instance (for RDS recovery) +#' @importFrom utils packageName getNamespace getNamespaceExports +#' @importFrom methods is +#' @noRd +.cfDNAPro_bsgenome_pkg_symbol <- function(genome_obj) { + if (!methods::is(genome_obj, "BSgenome")) { + return(list(pkg = NA_character_, symbol = NA_character_)) + } + pkg <- tryCatch( + utils::packageName(environment(genome_obj)), + error = function(...) NA_character_ + ) + if (is.na(pkg) || identical(pkg, ".GlobalEnv")) { + return(list(pkg = NA_character_, symbol = NA_character_)) + } + ns <- tryCatch(asNamespace(pkg), error = function(...) NULL) + if (is.null(ns)) { + return(list(pkg = NA_character_, symbol = NA_character_)) + } + for (nm in utils::getNamespaceExports(pkg)) { + obj <- tryCatch(get(nm, envir = ns), error = function(...) NULL) + if (!is.null(obj) && methods::is(obj, "BSgenome") && identical(obj, genome_obj)) { + return(list(pkg = pkg, symbol = nm)) + } + } + list(pkg = NA_character_, symbol = NA_character_) +} + +#' Restore user BSgenome from readBam metadata (package + export name) +#' @importFrom methods is +#' @noRd +.cfDNAPro_restore_user_bsgenome <- function(md) { + pkg <- md$cfDNAPro_bsgenome_pkg + sym <- md$cfDNAPro_bsgenome_symbol + if (is.null(pkg) || is.null(sym) || length(pkg) != 1L || length(sym) != 1L) { + return(NULL) + } + if (is.na(pkg) || is.na(sym) || !nzchar(pkg) || !nzchar(sym)) { + return(NULL) + } + if (!requireNamespace(pkg, quietly = TRUE)) { + stop( + "BSgenome package \"", pkg, "\" is not installed; install it or pass ", + "`genome` explicitly to callMotif() / callTrinucleotide().", + call. = FALSE + ) + } + obj <- tryCatch(get(sym, envir = asNamespace(pkg)), error = function(...) NULL) + if (is.null(obj) || !methods::is(obj, "BSgenome")) { + return(NULL) + } + obj +} + +#' Attach cfDNAPro genome lineage metadata to a GRanges from readBam +#' @importFrom S4Vectors metadata metadata<- +#' @noRd +.cfDNAPro_attach_read_bam_genome_metadata <- function(frag, genome_label, genome_obj) { + md <- S4Vectors::metadata(frag) + md$cfDNAPro_genome_label <- genome_label + if (identical(genome_label, "user_define")) { + ps <- .cfDNAPro_bsgenome_pkg_symbol(genome_obj) + md$cfDNAPro_bsgenome_pkg <- ps$pkg + md$cfDNAPro_bsgenome_symbol <- ps$symbol + } else { + md$cfDNAPro_bsgenome_pkg <- NULL + md$cfDNAPro_bsgenome_symbol <- NULL + } + S4Vectors::metadata(frag) <- md + frag +} + #' Function to determine the correct BSgenome based on the seqinfo result #' @importFrom GenomeInfoDb seqinfo +#' @importFrom methods is #' @noRd -get_genome_reference <- function(frag_obj_mut) { +get_genome_reference <- function(frag_obj_mut, genome = NULL) { + if (!is.null(genome)) { + if (!methods::is(genome, "BSgenome")) { + stop("`genome` must be a BSgenome object when provided.", call. = FALSE) + } + return(genome) + } + + md <- S4Vectors::metadata(frag_obj_mut) + if (identical(md$cfDNAPro_genome_label, "user_define")) { + restored <- .cfDNAPro_restore_user_bsgenome(md) + if (!is.null(restored)) { + return(restored) + } + stop( + "This object was built with genome_label=\"user_define\" but the BSgenome ", + "cannot be restored (e.g. ad-hoc object not from an installed package). ", + "Pass `genome` explicitly to callMotif() or callTrinucleotide().", + call. = FALSE + ) + } + # Extract genome sequence from GRanges object's seqinfo genome_seq <- unique( as.character(GenomeInfoDb::seqinfo(frag_obj_mut)@genome)) - + # Define genome versions and corresponding BSgenome data packages genome_versions <- c("GRCh38", "hg38-NCBI", "hg38", "hg19") genome_packages <- c( - "BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38", - "BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38", - "BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38", + "BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38", + "BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38", + "BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38", "BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19") - + # Find the index of the genome sequence in the genome_versions vector index <- match(genome_seq, genome_versions) - + # Return the corresponding genome package; default to hg19 if not found if (!is.na(index)) { return(eval(parse(text = genome_packages[index]))) diff --git a/R/readBam.R b/R/readBam.R index 918321c..34607b0 100644 --- a/R/readBam.R +++ b/R/readBam.R @@ -4,24 +4,22 @@ #' @import GenomicAlignments #' @import S4Vectors #' @importFrom IRanges IRanges -#' @importFrom GenomeInfoDb seqlengths genome Seqinfo keepSeqlevels +#' @importFrom GenomeInfoDb seqlengths genome Seqinfo keepSeqlevels seqinfo #' @importFrom GenomicAlignments readGAlignmentPairs #' @importFrom Rsamtools scanBamFlag ScanBamParam testPairedEndBam #' @importFrom GenomicRanges GRanges seqnames sort granges #' @importFrom stringr str_remove #' @importFrom methods is #' -#' @param genome_label The Genome you used in the alignment. -#' Should be "hg19" or "hg38" or "hg38-NCBI". Default is "hg19". -#' Note: "hg19" will load BSgenome.Hsapiens.UCSC.hg19 package, which is -#' Full genome sequences for Homo sapiens (Human) as provided by -#' UCSC (hg19, based on GRCh37.p13) and stored in Biostrings objects; -#' "hg38" will load BSgenome.Hsapiens.UCSC.hg38 package, which is -#' Full genome sequences for Homo sapiens (Human) as provided by -#' UCSC (hg38, based on GRCh38.p13) and stored in Biostrings objects. -#' "hg38-NCBI" will load BSgenome.Hsapiens.NCBI.GRCh38 package, which is -#' full genome sequences for Homo sapiens (Human) as provided by -#' NCBI (GRCh38, 2013-12-17) and stored in Biostrings objects. +#' @param genome_label The genome you used for alignment: `"hg19"`, +#' `"hg38"`, `"hg38-NCBI"`, or `"user_define"`. Default is `"hg19"`. +#' For the first three values, the matching BSgenome data package is +#' loaded automatically. For `"user_define"`, you must pass a ready-made +#' `BSgenome` object via `genome` (same reference as the BAM). +#' @param genome Used only when `genome_label = "user_define"`: a +#' `BSgenome` object built or loaded beforehand (must match the BAM +#' reference). For built-in labels, if you pass a `BSgenome` here it is +#' ignored with a warning. #' @param bamfile The bam file name. #' @param curate_start_and_end A logical (TRUE or FALSE) parameter #' for curating alignment start and end coordinates. @@ -83,6 +81,7 @@ readBam <- function(bamfile, chromosome_to_keep = paste0("chr", 1:22), strand_mode = 1, genome_label = "hg19", + genome = NULL, outdir = NA, galp_flag = Rsamtools::scanBamFlag( isPaired = TRUE, @@ -107,25 +106,60 @@ readBam <- function(bamfile, stop("Input is not paired-end Bam file.") } - if (!genome_label %in% c("hg19", "hg38", "hg38-NCBI") || is.na(genome_label)) { - stop("Only 'hg19', 'hg38', or 'hg38-NCBI' are accepted as genome labels.") + .valid_genome_labels <- c("hg19", "hg38", "hg38-NCBI", "user_define") + if (!genome_label %in% .valid_genome_labels || length(genome_label) != 1L || + is.na(genome_label)) { + stop( + "genome_label must be one of: ", + paste0("'", .valid_genome_labels, "'", collapse = ", "), + "." + ) } - if (genome_label == "hg19") { + user_genome_arg <- genome - genome <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19 - - } else if (genome_label == "hg38") { - - genome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 - - } else if (genome_label == "hg38-NCBI") { - - genome <- BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38 + if (identical(genome_label, "user_define")) { + if (is.null(user_genome_arg)) { + stop("When genome_label is 'user_define', argument `genome` must be a BSgenome object.") + } + if (!methods::is(user_genome_arg, "BSgenome")) { + stop("When genome_label is 'user_define', `genome` must inherit from BSgenome.") + } + genome <- user_genome_arg + } else { + if (!is.null(user_genome_arg) && methods::is(user_genome_arg, "BSgenome")) { + warning( + "Argument `genome` is ignored unless genome_label is \"user_define\".", + call. = FALSE + ) + } + if (genome_label == "hg19") { + genome <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19 + } else if (genome_label == "hg38") { + genome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 + } else if (genome_label == "hg38-NCBI") { + genome <- BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38 + } + } + if (!isFALSE(chromosome_to_keep)) { + sl_names <- names(GenomeInfoDb::seqlengths(genome)) + unknown_chr <- setdiff(chromosome_to_keep, sl_names) + if (length(unknown_chr) > 0L) { + stop( + "chromosome_to_keep contains seqnames not found in the BSgenome: ", + paste(unknown_chr, collapse = ", ") + ) + } } - genome_name <- seqinfo(genome)@genome %>% unique() + genome_name <- unique(GenomeInfoDb::seqinfo(genome)@genome) + if (length(genome_name) != 1L || is.na(genome_name)) { + stop( + "BSgenome seqinfo must have a single non-NA genome assembly name ", + "(GenomeInfoDb::seqinfo(genome)@genome)." + ) + } ########################################################################### # Read BAM into galp based on user-defined parameters @@ -205,6 +239,8 @@ readBam <- function(bamfile, ) } + frag <- .cfDNAPro_attach_read_bam_genome_metadata(frag, genome_label, genome) + ########################################################################### # Saving RDS file ########################################################################### diff --git a/R/readGALP.R b/R/readGALP.R index 40255dc..17ad825 100644 --- a/R/readGALP.R +++ b/R/readGALP.R @@ -4,21 +4,17 @@ #' @import GenomeInfoDb #' @import GenomicAlignments #' @import S4Vectors +#' @importFrom methods is #' @importFrom Rsamtools scanBamFlag ScanBamParam #' @importFrom GenomicRanges GRanges seqnames #' @importFrom IRanges IRanges #' -#' @param genome_label The Genome you used in the alignment. -#' Should be "hg19" or "hg38" or "hg38-NCBI". Default is "hg19". -#' Note: "hg19" will load BSgenome.Hsapiens.UCSC.hg19 package, which is -#' Full genome sequences for Homo sapiens (Human) as provided by -#' UCSC (hg19, based on GRCh37.p13) and stored in Biostrings objects; -#' "hg38" will load BSgenome.Hsapiens.UCSC.hg38 package, which is -#' Full genome sequences for Homo sapiens (Human) as provided by -#' UCSC (hg38, based on GRCh38.p13) and stored in Biostrings objects. -#' "hg38-NCBI" will load BSgenome.Hsapiens.NCBI.GRCh38 package, which is -#' full genome sequences for Homo sapiens (Human) as provided by -#' NCBI (GRCh38, 2013-12-17) and stored in Biostrings objects. +#' @param genome_label The genome you used for alignment: `"hg19"`, +#' `"hg38"`, `"hg38-NCBI"`, or `"user_define"`. Default is `"hg19"`. +#' For `"user_define"`, pass a matching `BSgenome` via `genome`. +#' @param genome Used only when `genome_label = "user_define"`: a `BSgenome` +#' object. Ignored with a warning for built-in `genome_label` values if +#' a `BSgenome` is supplied. #' @param bamfile The bam file name. #' @param outdir The path for saving rds file. Default is NA, i.e. not saving. #' @param strand_mode Usually the strand_mode = 1 means the First read is @@ -54,6 +50,7 @@ readGALP <- function( chromosome_to_keep = paste("chr", 1:22, sep = ""), strand_mode = 1, genome_label = "hg19", + genome = NULL, outdir = FALSE, galp_flag = Rsamtools::scanBamFlag( isPaired = TRUE, @@ -75,25 +72,49 @@ readGALP <- function( stop("Input is not paired-end Bam file.") } - if (!genome_label %in% c("hg19", "hg38", "hg38-NCBI") | is.na(genome_label)) { - stop("Only accept hg19 or hg38 or hg38-NCBI as the genome_label...") + .valid_genome_labels <- c("hg19", "hg38", "hg38-NCBI", "user_define") + if (!genome_label %in% .valid_genome_labels || length(genome_label) != 1L || + is.na(genome_label)) { + stop( + "genome_label must be one of: ", + paste0("'", .valid_genome_labels, "'", collapse = ", "), + "." + ) } - - if (genome_label == "hg19") { - - genome <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19 - - } else if (genome_label == "hg38") { - - genome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 - - } else if (genome_label == "hg38-NCBI") { - - genome <- BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38 - + + user_genome_arg <- genome + + if (identical(genome_label, "user_define")) { + if (is.null(user_genome_arg)) { + stop("When genome_label is 'user_define', argument `genome` must be a BSgenome object.") + } + if (!methods::is(user_genome_arg, "BSgenome")) { + stop("When genome_label is 'user_define', `genome` must inherit from BSgenome.") + } + genome <- user_genome_arg + } else { + if (!is.null(user_genome_arg) && methods::is(user_genome_arg, "BSgenome")) { + warning( + "Argument `genome` is ignored unless genome_label is \"user_define\".", + call. = FALSE + ) + } + if (genome_label == "hg19") { + genome <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19 + } else if (genome_label == "hg38") { + genome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 + } else if (genome_label == "hg38-NCBI") { + genome <- BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38 + } + } + + genome_name <- unique(GenomeInfoDb::seqinfo(genome)@genome) + if (length(genome_name) != 1L || is.na(genome_name)) { + stop( + "BSgenome seqinfo must have a single non-NA genome assembly name ", + "(GenomeInfoDb::seqinfo(genome)@genome)." + ) } - - genome_name <- seqinfo(genome)@genome %>% unique() ############################################################################ # Read bam into galp diff --git a/R/read_bam_insert_metrics.R b/R/read_bam_insert_metrics.R index 7b7491c..be08bc8 100644 --- a/R/read_bam_insert_metrics.R +++ b/R/read_bam_insert_metrics.R @@ -1,17 +1,13 @@ #' Calculate insert sizes from a curated GRanges object #' #' @param bamfile The bam file name. -#' @param genome_label The Genome you used in the alignment. -#' Should be "hg19" or "hg38" or "hg38-NCBI. Default is "hg19". -#' Note: "hg19" will load BSgenome.Hsapiens.UCSC.hg19 package, which is -#' Full genome sequences for Homo sapiens (Human) as provided by -#' UCSC (hg19, based on GRCh37.p13) and stored in Biostrings objects; -#' "hg38" will load BSgenome.Hsapiens.UCSC.hg38 package, which is -#' Full genome sequences for Homo sapiens (Human) as provided by -#' UCSC (hg38, based on GRCh38.p13) and stored in Biostrings objects. -#' "hg38-NCBI" will load BSgenome.Hsapiens.NCBI.GRCh38 package, which is -#' full genome sequences for Homo sapiens (Human) as provided by -#' NCBI (GRCh38, 2013-12-17) and stored in Biostrings objects. +#' @param genome_label The Genome you used in the alignment. +#' One of `"hg19"`, `"hg38"`, `"hg38-NCBI"`, or `"user_define"` (default `"hg19"`). +#' For built-in labels, the corresponding BSgenome data package is loaded; +#' for `"user_define"`, supply `genome`. +#' @param genome When `genome_label = "user_define"`, a `BSgenome` object +#' matching the BAM reference. Otherwise optional; a `BSgenome` here is +#' ignored with a warning. #' @param outdir The path for saving rds file. Default is FALSE, i.e. not saving. #' @param strand_mode Usually the strand_mode = 1 means the First read is #' aligned to positive strand. Details please see GenomicAlignments docs. @@ -47,6 +43,7 @@ read_bam_insert_metrics <- function(bamfile = NULL, chromosome_to_keep =paste0("chr", 1:22), strand_mode = 1, genome_label = "hg19", + genome = NULL, outdir = FALSE, isize_min = 1L, isize_max = 1000L, @@ -63,10 +60,11 @@ read_bam_insert_metrics <- function(bamfile = NULL, ) if(!is.null(bamfile)) { - frag <- readBam(bamfile = bamfile, + frag <- readBam(bamfile = bamfile, chromosome_to_keep = chromosome_to_keep, strand_mode = strand_mode, genome_label = genome_label, + genome = genome, outdir = outdir) } else if(!is.null(fragment_obj)) { diff --git a/README.md b/README.md index 91cd992..240f491 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Please do not impose any filtering on the bam files; For example, do not filter (5) No supplementary alignment;\ (6) No unmapped reads. -Note: remember to choose the correct `genome_label`, a parameter in `readBam()` function, based on the ref genome you used for alignment. At the moment, it supports three different ref genomes, hg19, hg38 and hg38-NCBI, For details see readBam() R documentation by typing `?readBam` in the R console or see source code:https://github.com/hw538/cfDNAPro/blob/master/R/readBam.R +Note: set `genome_label` in `readBam()` to match the reference you used for alignment. Built-in values are `hg19`, `hg38`, and `hg38-NCBI`. For any other reference packaged as a Bioconductor `BSgenome`, use `genome_label = "user_define"` and pass that object via the `genome` argument. See `?readBam`. ## Output diff --git a/man/callMotif.Rd b/man/callMotif.Rd index 3c71317..12be987 100644 --- a/man/callMotif.Rd +++ b/man/callMotif.Rd @@ -11,6 +11,7 @@ callMotif( integrate_mut = FALSE, ref_type = "locus_fragment", downsample_ref = FALSE, + genome = NULL, ... ) } @@ -31,6 +32,10 @@ fragments/read-pairs that do not overlap indicated mutation loci.} \item{downsample_ref}{Logical, if TRUE, downsamples reference motifs to match the count of mutant motifs.} +\item{genome}{Optional \code{BSgenome} passed to \code{get_genome_reference()}; +use when the fragment object was built with \code{genome_label = "user_define"} +and the BSgenome cannot be restored from metadata, or to override inference.} + \item{...}{Additional parameters passed to underlying functions.} } \value{ diff --git a/man/callTrinucleotide.Rd b/man/callTrinucleotide.Rd index 8f23804..89a00ba 100644 --- a/man/callTrinucleotide.Rd +++ b/man/callTrinucleotide.Rd @@ -5,10 +5,14 @@ \title{Call trinucleotides and summarise the cfDNA information for each target mutation locus.} \usage{ -callTrinucleotide(frag_obj_mut) +callTrinucleotide(frag_obj_mut, genome = NULL) } \arguments{ \item{frag_obj_mut}{GRanges object containing fragment and mutation data.} + +\item{genome}{Optional \code{BSgenome} passed to \code{get_genome_reference()}; +use when fragments were built with \code{genome_label = "user_define"} and the +object cannot be restored from package metadata, or to override inference.} } \value{ dataframe with summarised mutational and trinucleotide data. diff --git a/man/readBam.Rd b/man/readBam.Rd index d829f63..e3c4a98 100644 --- a/man/readBam.Rd +++ b/man/readBam.Rd @@ -11,6 +11,7 @@ readBam( chromosome_to_keep = paste0("chr", 1:22), strand_mode = 1, genome_label = "hg19", + genome = NULL, outdir = NA, galp_flag = Rsamtools::scanBamFlag(isPaired = TRUE, isDuplicate = FALSE, isSecondaryAlignment = FALSE, isUnmappedQuery = FALSE, isSupplementaryAlignment = @@ -42,17 +43,16 @@ Default is paste0("chr", 1:22).} the strand of the pair is strand of its first alignment. For More Details please see GenomicAlignments docs.} -\item{genome_label}{The Genome you used in the alignment. -Should be "hg19" or "hg38" or "hg38-NCBI". Default is "hg19". -Note: "hg19" will load BSgenome.Hsapiens.UCSC.hg19 package, which is -Full genome sequences for Homo sapiens (Human) as provided by -UCSC (hg19, based on GRCh37.p13) and stored in Biostrings objects; -"hg38" will load BSgenome.Hsapiens.UCSC.hg38 package, which is -Full genome sequences for Homo sapiens (Human) as provided by -UCSC (hg38, based on GRCh38.p13) and stored in Biostrings objects. -"hg38-NCBI" will load BSgenome.Hsapiens.NCBI.GRCh38 package, which is -full genome sequences for Homo sapiens (Human) as provided by -NCBI (GRCh38, 2013-12-17) and stored in Biostrings objects.} +\item{genome_label}{The genome you used for alignment: \code{"hg19"}, +\code{"hg38"}, \code{"hg38-NCBI"}, or \code{"user_define"} (default \code{"hg19"}). +For the first three values, the matching BSgenome data package is loaded +automatically. For \code{"user_define"}, pass a ready-made \code{BSgenome} +object via \code{genome} (same reference as the BAM).} + +\item{genome}{Used only when \code{genome_label = "user_define"}: a +\code{BSgenome} object built or loaded beforehand (must match the BAM +reference). For built-in labels, if you pass a \code{BSgenome} here it is +ignored with a warning.} \item{outdir}{The path for saving rds file. Default is NA, i.e. not saving.} diff --git a/man/readGALP.Rd b/man/readGALP.Rd index 72cf830..1b6eefb 100644 --- a/man/readGALP.Rd +++ b/man/readGALP.Rd @@ -10,6 +10,7 @@ readGALP( chromosome_to_keep = paste("chr", 1:22, sep = ""), strand_mode = 1, genome_label = "hg19", + genome = NULL, outdir = FALSE, galp_flag = Rsamtools::scanBamFlag(isPaired = TRUE, isDuplicate = FALSE, isSecondaryAlignment = FALSE, isUnmappedQuery = FALSE, isSupplementaryAlignment = @@ -32,19 +33,15 @@ Default is paste0("chr", 1:22).} \item{strand_mode}{Usually the strand_mode = 1 means the First read is aligned to positive strand. Details please see GenomicAlignments docs.} -\item{genome_label}{The Genome you used in the alignment. -Should be "hg19" or "hg38" or "hg38-NCBI". Default is "hg19". -Note: "hg19" will load BSgenome.Hsapiens.UCSC.hg19 package, which is -Full genome sequences for Homo sapiens (Human) as provided by -UCSC (hg19, based on GRCh37.p13) and stored in Biostrings objects; -"hg38" will load BSgenome.Hsapiens.UCSC.hg38 package, which is -Full genome sequences for Homo sapiens (Human) as provided by -UCSC (hg38, based on GRCh38.p13) and stored in Biostrings objects. -"hg38-NCBI" will load BSgenome.Hsapiens.NCBI.GRCh38 package, which is -full genome sequences for Homo sapiens (Human) as provided by -NCBI (GRCh38, 2013-12-17) and stored in Biostrings objects.} +\item{genome_label}{The genome you used for alignment: \code{"hg19"}, +\code{"hg38"}, \code{"hg38-NCBI"}, or \code{"user_define"} (default \code{"hg19"}). +For \code{"user_define"}, pass a matching \code{BSgenome} via \code{genome}.} -\item{outdir}{The path for saving rds file. Default is NA, i.e. not saving.} +\item{genome}{Used only when \code{genome_label = "user_define"}: a \code{BSgenome} +object. Ignored with a warning for built-in \code{genome_label} values if +a \code{BSgenome} is supplied.} + +\item{outdir}{The path for saving rds file. Default is FALSE, i.e. not saving.} \item{galp_flag}{} diff --git a/man/read_bam_insert_metrics.Rd b/man/read_bam_insert_metrics.Rd index 232b438..376c503 100644 --- a/man/read_bam_insert_metrics.Rd +++ b/man/read_bam_insert_metrics.Rd @@ -10,6 +10,7 @@ read_bam_insert_metrics( chromosome_to_keep = paste0("chr", 1:22), strand_mode = 1, genome_label = "hg19", + genome = NULL, outdir = FALSE, isize_min = 1L, isize_max = 1000L, @@ -26,17 +27,13 @@ Default is paste0("chr", 1:22).} \item{strand_mode}{Usually the strand_mode = 1 means the First read is aligned to positive strand. Details please see GenomicAlignments docs.} -\item{genome_label}{The Genome you used in the alignment. -Should be "hg19" or "hg38" or "hg38-NCBI. Default is "hg19". -Note: "hg19" will load BSgenome.Hsapiens.UCSC.hg19 package, which is -Full genome sequences for Homo sapiens (Human) as provided by -UCSC (hg19, based on GRCh37.p13) and stored in Biostrings objects; -"hg38" will load BSgenome.Hsapiens.UCSC.hg38 package, which is -Full genome sequences for Homo sapiens (Human) as provided by -UCSC (hg38, based on GRCh38.p13) and stored in Biostrings objects. -"hg38-NCBI" will load BSgenome.Hsapiens.NCBI.GRCh38 package, which is -full genome sequences for Homo sapiens (Human) as provided by -NCBI (GRCh38, 2013-12-17) and stored in Biostrings objects.} +\item{genome_label}{One of \code{"hg19"}, \code{"hg38"}, \code{"hg38-NCBI"}, +or \code{"user_define"} (default \code{"hg19"}). For \code{"user_define"}, +supply \code{genome}.} + +\item{genome}{When \code{genome_label = "user_define"}, a \code{BSgenome} object +matching the BAM reference. Otherwise optional; a \code{BSgenome} here is +ignored with a warning.} \item{outdir}{The path for saving rds file. Default is FALSE, i.e. not saving.} diff --git a/tests/testthat/test_readBam_genome.R b/tests/testthat/test_readBam_genome.R new file mode 100644 index 0000000..bacea1c --- /dev/null +++ b/tests/testthat/test_readBam_genome.R @@ -0,0 +1,34 @@ +test_that("get_genome_reference uses explicit genome argument", { + skip_if_not_installed("BSgenome.Hsapiens.UCSC.hg19") + g <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19 + gr <- GenomicRanges::GRanges("chr1", IRanges::IRanges(1, 10)) + out <- cfDNAPro:::get_genome_reference(gr, genome = g) + expect_identical(out, g) +}) + +test_that("get_genome_reference rejects non-BSgenome explicit genome", { + gr <- GenomicRanges::GRanges() + expect_error( + cfDNAPro:::get_genome_reference(gr, genome = "not-a-BSgenome"), + "BSgenome" + ) +}) + +test_that("get_genome_reference restores user_define from package metadata", { + skip_if_not_installed("BSgenome.Hsapiens.UCSC.hg19") + g <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19 + gr <- GenomicRanges::GRanges() + S4Vectors::metadata(gr) <- list( + cfDNAPro_genome_label = "user_define", + cfDNAPro_bsgenome_pkg = "BSgenome.Hsapiens.UCSC.hg19", + cfDNAPro_bsgenome_symbol = "BSgenome.Hsapiens.UCSC.hg19" + ) + out <- cfDNAPro:::get_genome_reference(gr) + expect_identical(out, g) +}) + +test_that("get_genome_reference errors when user_define cannot be restored", { + gr <- GenomicRanges::GRanges() + S4Vectors::metadata(gr) <- list(cfDNAPro_genome_label = "user_define") + expect_error(cfDNAPro:::get_genome_reference(gr), "user_define") +}) From 50aaaf45f6744e0df73345b00f11fd5b64cc8f2b Mon Sep 17 00:00:00 2001 From: SHY Date: Thu, 14 May 2026 10:54:26 +0800 Subject: [PATCH 2/3] use seqinfo(genome = BSgenome) rather than seqinfo(genome = "...") --- R/internalMutFunctions.R | 21 ++++++++++++++++----- R/readBam.R | 16 +++++++++++++++- R/readGALP.R | 8 +++++++- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/R/internalMutFunctions.R b/R/internalMutFunctions.R index 3b64f3d..caaf26f 100644 --- a/R/internalMutFunctions.R +++ b/R/internalMutFunctions.R @@ -1227,7 +1227,11 @@ bam_to_galp_mut <- function(bamfile, message("BAM file has been read into galp.") # add genome information - if (isSingleStringOrNA(genome)) { + if (methods::is(genome, "Seqinfo")) { + if (!isFALSE(chromosome_to_keep)) { + genome <- genome[chromosome_to_keep] + } + } else if (isSingleStringOrNA(genome)) { genome <- Seqinfo(genome = genome)[chromosome_to_keep] } @@ -1512,6 +1516,7 @@ process_bam_file <- function(bamfile, chromosome_to_keep = NULL, strand_mode = 1, genome_name, + galp_merge_seqinfo = NULL, galp_flag, galp_what, galp_tag, @@ -1525,6 +1530,12 @@ process_bam_file <- function(bamfile, mapqFilter = galp_mapqFilter ) + genome_for_galp <- if (!is.null(galp_merge_seqinfo)) { + galp_merge_seqinfo + } else { + genome_name + } + # Check the presence of a mutation file and the mode of operation if (!is.null(mutation_file)) { loci_df <- suppressMessages( @@ -1552,7 +1563,7 @@ process_bam_file <- function(bamfile, use_names = use_names, chromosome_to_keep = chromosome_to_keep, strand_mode = strand_mode, - genome = genome_name, + genome = genome_for_galp, param = galp_param_mut) # Update metadata with content specifics galp@metadata <- list( @@ -1568,7 +1579,7 @@ process_bam_file <- function(bamfile, use_names = use_names, chromosome_to_keep = chromosome_to_keep, strand_mode = strand_mode, - genome = genome_name, + genome = genome_for_galp, param = galp_param) galp_mm <- suppressMessages(bam_to_galp_mut( @@ -1576,7 +1587,7 @@ process_bam_file <- function(bamfile, use_names = use_names, chromosome_to_keep = chromosome_to_keep, strand_mode = strand_mode, - genome = genome_name, + genome = genome_for_galp, param = galp_param_mut)) if (length(galp_mm) == 0) { @@ -1596,7 +1607,7 @@ process_bam_file <- function(bamfile, use_names = use_names, chromosome_to_keep = chromosome_to_keep, strand_mode = strand_mode, - genome = genome_name, + genome = genome_for_galp, param = galp_param) } diff --git a/R/readBam.R b/R/readBam.R index 34607b0..813934a 100644 --- a/R/readBam.R +++ b/R/readBam.R @@ -161,6 +161,15 @@ readBam <- function(bamfile, ) } + # Seqinfo from the BSgenome avoids GenomeInfoDb::Seqinfo(genome = ), + # which queries NCBI FTP (e.g. GCF_000001405.26 for GATK GRCh38) and can fail + # offline or when FTP layout changes. + galp_merge_seqinfo <- if (!isFALSE(chromosome_to_keep)) { + GenomeInfoDb::seqinfo(genome)[chromosome_to_keep] + } else { + GenomeInfoDb::seqinfo(genome) + } + ########################################################################### # Read BAM into galp based on user-defined parameters ########################################################################### @@ -171,6 +180,7 @@ readBam <- function(bamfile, chromosome_to_keep = chromosome_to_keep, strand_mode = strand_mode, genome_name = genome_name, + galp_merge_seqinfo = galp_merge_seqinfo, galp_flag = galp_flag, galp_what = galp_what, galp_tag = galp_tag, @@ -294,7 +304,11 @@ bam_to_galp2 <- function(bamfile, message("BAM file has been read into galp.") # add genome information - if (isSingleStringOrNA(genome)) { + if (methods::is(genome, "Seqinfo")) { + if (!isFALSE(chromosome_to_keep)) { + genome <- genome[chromosome_to_keep] + } + } else if (isSingleStringOrNA(genome)) { genome <- Seqinfo(genome = genome)[chromosome_to_keep] } diff --git a/R/readGALP.R b/R/readGALP.R index 17ad825..099def8 100644 --- a/R/readGALP.R +++ b/R/readGALP.R @@ -115,6 +115,12 @@ readGALP <- function( "(GenomeInfoDb::seqinfo(genome)@genome)." ) } + + galp_merge_seqinfo <- if (!isFALSE(chromosome_to_keep)) { + GenomeInfoDb::seqinfo(genome)[chromosome_to_keep] + } else { + GenomeInfoDb::seqinfo(genome) + } ############################################################################ # Read bam into galp @@ -129,7 +135,7 @@ readGALP <- function( use_names = use_names, chromosome_to_keep = chromosome_to_keep, strand_mode = strand_mode, - genome = genome_name, + genome = galp_merge_seqinfo, param = galp_param ) From d28dbd085f44c022af23947cbd77df6f78805857 Mon Sep 17 00:00:00 2001 From: SHY Date: Thu, 14 May 2026 16:05:02 +0800 Subject: [PATCH 3/3] add genome parameter in writeMutTable --- R/internalMutFunctions.R | 6 ++++-- man/writeMutTable.Rd | 6 +++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/R/internalMutFunctions.R b/R/internalMutFunctions.R index caaf26f..63767e9 100644 --- a/R/internalMutFunctions.R +++ b/R/internalMutFunctions.R @@ -211,6 +211,8 @@ read_mutation_file <- function(mutation_file) { #' #' @param frag_obj GRanges object containing genomic ranges and associated data. #' @param output_file Character string specifying the path/name of the output file. +#' @param genome Optional `BSgenome` passed to [callTrinucleotide()]. Use when +#' `frag_obj` was built with `genome_label = "user_define"` #' #' @return The path to the output file as a character string. #' @@ -219,7 +221,7 @@ read_mutation_file <- function(mutation_file) { #' @examples #' output_path <- writeMutTable(gr, "output_directory", "output_filename") #' print(paste("File written to:", output_path)) -writeMutTable <- function(frag_obj, output_file) { +writeMutTable <- function(frag_obj, output_file, genome = NULL) { # Extract the directory path from the output file path output_dir <- dirname(output_file) @@ -231,7 +233,7 @@ writeMutTable <- function(frag_obj, output_file) { stop("The directory does not exist. Please provide a valid directory path.") } - trinuc_df <- callTrinucleotide(frag_obj = frag_obj) + trinuc_df <- callTrinucleotide(frag_obj = frag_obj, genome = genome) # Adding informative columns for counts and fractions trinuc_df <- trinuc_df %>% diff --git a/man/writeMutTable.Rd b/man/writeMutTable.Rd index 7ae552e..56fd0ea 100644 --- a/man/writeMutTable.Rd +++ b/man/writeMutTable.Rd @@ -4,12 +4,16 @@ \alias{writeMutTable} \title{Function that generates a Mutation Table with summarised cfDNA fragment data} \usage{ -writeMutTable(frag_obj, output_file) +writeMutTable(frag_obj, output_file, genome = NULL) } \arguments{ \item{frag_obj}{GRanges object containing genomic ranges and associated data.} \item{output_file}{Character string specifying the path/name of the output file.} + +\item{genome}{Optional \code{BSgenome} passed to \code{\link{callTrinucleotide}}; use when +\code{frag_obj} was built with \code{genome_label = "user_define"} and the reference +cannot be restored from object metadata. Defaults to \code{NULL} (infer when possible).} } \value{ The path to the output file as a character string.