From f9fec1bbf7ed146432c37dbec4288ecbc2de1214 Mon Sep 17 00:00:00 2001
From: SHY <serine94209792@gmail.com>
Date: Wed, 13 May 2026 21:12:08 +0800
Subject: [PATCH 1/3] Support user-defined BSgenome references

---
 .gitignore                           |   1 +
 DESCRIPTION                          |   5 +-
 NAMESPACE                            |   3 +
 R/callMotif.R                        |   6 +-
 R/callTrinucleotide.R                |   9 ++-
 R/internalMutFunctions.R             | 108 +++++++++++++++++++++++++--
 R/readBam.R                          |  86 ++++++++++++++-------
 R/readGALP.R                         |  77 ++++++++++++-------
 R/read_bam_insert_metrics.R          |  22 +++---
 README.md                            |   2 +-
 man/callMotif.Rd                     |   5 ++
 man/callTrinucleotide.Rd             |   6 +-
 man/readBam.Rd                       |  22 +++---
 man/readGALP.Rd                      |  21 +++---
 man/read_bam_insert_metrics.Rd       |  19 ++---
 tests/testthat/test_readBam_genome.R |  34 +++++++++
 16 files changed, 312 insertions(+), 114 deletions(-)
 create mode 100644 tests/testthat/test_readBam_genome.R

diff --git a/.gitignore b/.gitignore
index 1926064..e5f7da7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,7 @@ cfDNAPro.Rproj
 .rproj
 .rproj.user
 .rhistory
+.cursor/
 doc
 Meta
 ..Rcheck
diff --git a/DESCRIPTION b/DESCRIPTION
index 4ad3eef..cd81586 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: cfDNAPro
 Type: Package
 Title: cfDNAPro extracts and Visualises biological features from whole genome sequencing data of cell-free DNA
-Version: 1.7.3
+Version: 1.7.4
 biocViews: Visualization, Sequencing, WholeGenome
 Authors@R: c(
     person("Haichao","Wang", email="hw538@cam.ac.uk", role =c("aut","cre")),
@@ -35,6 +35,7 @@ Imports:
     Rsamtools (>= 2.4.0),
     Biostrings,
     rlang (>= 0.4.0),
+    BSgenome,
     BSgenome.Hsapiens.UCSC.hg38,
     BSgenome.Hsapiens.UCSC.hg19,
     BSgenome.Hsapiens.NCBI.GRCh38,
@@ -58,8 +59,8 @@ Suggests:
     testthat
 License: GPL-3
 Encoding: UTF-8
-RoxygenNote: 7.3.1
 VignetteBuilder: knitr
 URL: https://github.com/hw538/cfDNAPro
 BugRePORTS: https://github.com/hw538/cfDNAPro/issues
 BiocType: Software
+Config/roxygen2/version: 8.0.0
diff --git a/NAMESPACE b/NAMESPACE
index 7f8f3be..665ece9 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -96,9 +96,11 @@ importFrom(Rsamtools,pileup)
 importFrom(Rsamtools,scanBam)
 importFrom(Rsamtools,scanBamFlag)
 importFrom(Rsamtools,testPairedEndBam)
+importFrom(S4Vectors,"metadata<-")
 importFrom(S4Vectors,Rle)
 importFrom(S4Vectors,isSingleStringOrNA)
 importFrom(S4Vectors,mcols)
+importFrom(S4Vectors,metadata)
 importFrom(dplyr,across)
 importFrom(dplyr,all_of)
 importFrom(dplyr,arrange)
@@ -176,5 +178,6 @@ importFrom(tidyr,expand_grid)
 importFrom(tidyr,pivot_longer)
 importFrom(tidyr,replace_na)
 importFrom(tidyr,unite)
+importFrom(utils,packageName)
 importFrom(utils,read.table)
 importFrom(utils,write.table)
diff --git a/R/callMotif.R b/R/callMotif.R
index b6f3bc4..cccfd11 100644
--- a/R/callMotif.R
+++ b/R/callMotif.R
@@ -23,6 +23,9 @@
 #' fragments/read-pairs that do not overlap indicated mutation loci.
 #' @param downsample_ref Logical, if TRUE, downsamples reference motifs to match
 #'        the count of mutant motifs.
+#' @param genome Optional `BSgenome` passed to `get_genome_reference()`; use when
+#'   fragments were built with `genome_label = "user_define"` and the object
+#'   cannot be restored from package metadata, or to override inference.
 #' @param ... Additional parameters passed to underlying functions.
 #'
 #' @return Returns a tibble containing the results of the motif analysis.
@@ -37,6 +40,7 @@ callMotif <- function(frag_obj,
                       integrate_mut = FALSE,
                       ref_type = "locus_fragment",
                       downsample_ref = FALSE,
+                      genome = NULL,
                       ...) {
 
   motif_length <- as.numeric(motif_length)
@@ -44,7 +48,7 @@ callMotif <- function(frag_obj,
   message("Started to extract ", paste0(motif_type, motif_length), " motif...")
 
   # Select the appropriate genome reference
-  bsgenome_obj <- get_genome_reference(frag_obj)
+  bsgenome_obj <- get_genome_reference(frag_obj, genome = genome)
 
   genome_label <- bsgenome_obj@metadata$genome
 
diff --git a/R/callTrinucleotide.R b/R/callTrinucleotide.R
index 9af8b40..c3d28ff 100644
--- a/R/callTrinucleotide.R
+++ b/R/callTrinucleotide.R
@@ -26,19 +26,22 @@
 #' @importFrom magrittr '%>%'
 #'
 #' @param frag_obj_mut GRanges object containing fragment and mutation data.
+#' @param genome Optional `BSgenome` passed to `get_genome_reference()`; use when
+#'   fragments were built with `genome_label = "user_define"` and the object
+#'   cannot be restored from package metadata, or to override inference.
 #'
 #' @return dataframe with summarised mutational and trinucleotide data.
 #' @export
 #'
 #' @examples
 #' trinuc_df <-  callTrinucleotide(frag_obj_mut)
-callTrinucleotide <- function(frag_obj_mut) {
+callTrinucleotide <- function(frag_obj_mut, genome = NULL) {
 
   # Check if GRanges has mutational information
   check_mutation_in_metadata(frag_obj = frag_obj_mut)
 
   # Get the appropriate genome reference
-  genome <- get_genome_reference(frag_obj_mut)
+  bsgenome_obj <- get_genome_reference(frag_obj_mut, genome = genome)
 
   # Convert relevant data to dataframe
   gr_df <- prepare_data(frag_obj_mut)
@@ -68,7 +71,7 @@ callTrinucleotide <- function(frag_obj_mut) {
   merged_table_df <- update_consensus_mismatch(merged_table_df, gr_df, mapping)
 
   # Add the SBS 96 annotation for consensus locus-based mutations
-  merged_table_df <- get_trinucleotide(merged_table_df, genome)
+  merged_table_df <- get_trinucleotide(merged_table_df, bsgenome_obj)
 
   return(merged_table_df)
 }
diff --git a/R/internalMutFunctions.R b/R/internalMutFunctions.R
index 75c1e50..3b64f3d 100644
--- a/R/internalMutFunctions.R
+++ b/R/internalMutFunctions.R
@@ -1633,25 +1633,119 @@ create_empty_galp <- function() {
 
 
 
+#' Resolve package and export name for a BSgenome instance (for RDS recovery)
+#' @importFrom utils packageName getNamespace getNamespaceExports
+#' @importFrom methods is
+#' @noRd
+.cfDNAPro_bsgenome_pkg_symbol <- function(genome_obj) {
+  if (!methods::is(genome_obj, "BSgenome")) {
+    return(list(pkg = NA_character_, symbol = NA_character_))
+  }
+  pkg <- tryCatch(
+    utils::packageName(environment(genome_obj)),
+    error = function(...) NA_character_
+  )
+  if (is.na(pkg) || identical(pkg, ".GlobalEnv")) {
+    return(list(pkg = NA_character_, symbol = NA_character_))
+  }
+  ns <- tryCatch(asNamespace(pkg), error = function(...) NULL)
+  if (is.null(ns)) {
+    return(list(pkg = NA_character_, symbol = NA_character_))
+  }
+  for (nm in utils::getNamespaceExports(pkg)) {
+    obj <- tryCatch(get(nm, envir = ns), error = function(...) NULL)
+    if (!is.null(obj) && methods::is(obj, "BSgenome") && identical(obj, genome_obj)) {
+      return(list(pkg = pkg, symbol = nm))
+    }
+  }
+  list(pkg = NA_character_, symbol = NA_character_)
+}
+
+#' Restore user BSgenome from readBam metadata (package + export name)
+#' @importFrom methods is
+#' @noRd
+.cfDNAPro_restore_user_bsgenome <- function(md) {
+  pkg <- md$cfDNAPro_bsgenome_pkg
+  sym <- md$cfDNAPro_bsgenome_symbol
+  if (is.null(pkg) || is.null(sym) || length(pkg) != 1L || length(sym) != 1L) {
+    return(NULL)
+  }
+  if (is.na(pkg) || is.na(sym) || !nzchar(pkg) || !nzchar(sym)) {
+    return(NULL)
+  }
+  if (!requireNamespace(pkg, quietly = TRUE)) {
+    stop(
+      "BSgenome package \"", pkg, "\" is not installed; install it or pass ",
+      "`genome` explicitly to callMotif() / callTrinucleotide().",
+      call. = FALSE
+    )
+  }
+  obj <- tryCatch(get(sym, envir = asNamespace(pkg)), error = function(...) NULL)
+  if (is.null(obj) || !methods::is(obj, "BSgenome")) {
+    return(NULL)
+  }
+  obj
+}
+
+#' Attach cfDNAPro genome lineage metadata to a GRanges from readBam
+#' @importFrom S4Vectors metadata metadata<-
+#' @noRd
+.cfDNAPro_attach_read_bam_genome_metadata <- function(frag, genome_label, genome_obj) {
+  md <- S4Vectors::metadata(frag)
+  md$cfDNAPro_genome_label <- genome_label
+  if (identical(genome_label, "user_define")) {
+    ps <- .cfDNAPro_bsgenome_pkg_symbol(genome_obj)
+    md$cfDNAPro_bsgenome_pkg <- ps$pkg
+    md$cfDNAPro_bsgenome_symbol <- ps$symbol
+  } else {
+    md$cfDNAPro_bsgenome_pkg <- NULL
+    md$cfDNAPro_bsgenome_symbol <- NULL
+  }
+  S4Vectors::metadata(frag) <- md
+  frag
+}
+
 #' Function to determine the correct BSgenome based on the seqinfo result
 #' @importFrom GenomeInfoDb seqinfo
+#' @importFrom methods is
 #' @noRd
-get_genome_reference <- function(frag_obj_mut) {
+get_genome_reference <- function(frag_obj_mut, genome = NULL) {
+  if (!is.null(genome)) {
+    if (!methods::is(genome, "BSgenome")) {
+      stop("`genome` must be a BSgenome object when provided.", call. = FALSE)
+    }
+    return(genome)
+  }
+
+  md <- S4Vectors::metadata(frag_obj_mut)
+  if (identical(md$cfDNAPro_genome_label, "user_define")) {
+    restored <- .cfDNAPro_restore_user_bsgenome(md)
+    if (!is.null(restored)) {
+      return(restored)
+    }
+    stop(
+      "This object was built with genome_label=\"user_define\" but the BSgenome ",
+      "cannot be restored (e.g. ad-hoc object not from an installed package). ",
+      "Pass `genome` explicitly to callMotif() or callTrinucleotide().",
+      call. = FALSE
+    )
+  }
+
   # Extract genome sequence from GRanges object's seqinfo
   genome_seq <- unique(
     as.character(GenomeInfoDb::seqinfo(frag_obj_mut)@genome))
-  
+
   # Define genome versions and corresponding BSgenome data packages
   genome_versions <- c("GRCh38", "hg38-NCBI", "hg38", "hg19")
   genome_packages <- c(
-    "BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38", 
-    "BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38", 
-    "BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38", 
+    "BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38",
+    "BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38",
+    "BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38",
     "BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19")
-  
+
   # Find the index of the genome sequence in the genome_versions vector
   index <- match(genome_seq, genome_versions)
-  
+
   # Return the corresponding genome package; default to hg19 if not found
   if (!is.na(index)) {
     return(eval(parse(text = genome_packages[index])))
diff --git a/R/readBam.R b/R/readBam.R
index 918321c..34607b0 100644
--- a/R/readBam.R
+++ b/R/readBam.R
@@ -4,24 +4,22 @@
 #' @import GenomicAlignments
 #' @import S4Vectors
 #' @importFrom IRanges IRanges
-#' @importFrom GenomeInfoDb seqlengths genome Seqinfo keepSeqlevels
+#' @importFrom GenomeInfoDb seqlengths genome Seqinfo keepSeqlevels seqinfo
 #' @importFrom GenomicAlignments readGAlignmentPairs
 #' @importFrom Rsamtools scanBamFlag ScanBamParam testPairedEndBam
 #' @importFrom GenomicRanges GRanges seqnames sort granges
 #' @importFrom stringr str_remove
 #' @importFrom methods is
 #'
-#' @param genome_label The Genome you used in the alignment.
-#'    Should be "hg19" or "hg38" or "hg38-NCBI". Default is "hg19".
-#'    Note: "hg19" will load BSgenome.Hsapiens.UCSC.hg19 package, which is
-#'    Full genome sequences for Homo sapiens (Human) as provided by
-#'    UCSC (hg19, based on GRCh37.p13) and stored in Biostrings objects;
-#'    "hg38" will load BSgenome.Hsapiens.UCSC.hg38 package, which is
-#'    Full genome sequences for Homo sapiens (Human) as provided by
-#'    UCSC (hg38, based on GRCh38.p13) and stored in Biostrings objects.
-#'    "hg38-NCBI" will load BSgenome.Hsapiens.NCBI.GRCh38 package, which is
-#'    full genome sequences for Homo sapiens (Human) as provided by
-#'    NCBI (GRCh38, 2013-12-17) and stored in Biostrings objects.
+#' @param genome_label The genome you used for alignment: `"hg19"`,
+#'   `"hg38"`, `"hg38-NCBI"`, or `"user_define"`. Default is `"hg19"`.
+#'   For the first three values, the matching BSgenome data package is
+#'   loaded automatically. For `"user_define"`, you must pass a ready-made
+#'   `BSgenome` object via `genome` (same reference as the BAM).
+#' @param genome Used only when `genome_label = "user_define"`: a
+#'   `BSgenome` object built or loaded beforehand (must match the BAM
+#'   reference). For built-in labels, if you pass a `BSgenome` here it is
+#'   ignored with a warning.
 #' @param bamfile The bam file name.
 #' @param curate_start_and_end A logical (TRUE or FALSE) parameter
 #'   for curating alignment start and end coordinates.
@@ -83,6 +81,7 @@ readBam <- function(bamfile,
                     chromosome_to_keep = paste0("chr", 1:22),
                     strand_mode = 1,
                     genome_label = "hg19",
+                    genome = NULL,
                     outdir = NA,
                     galp_flag = Rsamtools::scanBamFlag(
                       isPaired = TRUE,
@@ -107,25 +106,60 @@ readBam <- function(bamfile,
     stop("Input is not paired-end Bam file.")
   }
 
-  if (!genome_label %in% c("hg19", "hg38", "hg38-NCBI") || is.na(genome_label)) {
-      stop("Only 'hg19', 'hg38', or 'hg38-NCBI' are accepted as genome labels.")
+  .valid_genome_labels <- c("hg19", "hg38", "hg38-NCBI", "user_define")
+  if (!genome_label %in% .valid_genome_labels || length(genome_label) != 1L ||
+        is.na(genome_label)) {
+    stop(
+      "genome_label must be one of: ",
+      paste0("'", .valid_genome_labels, "'", collapse = ", "),
+      "."
+    )
   }
 
-  if (genome_label == "hg19") {
+  user_genome_arg <- genome
 
-    genome <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19
-
-  } else if (genome_label == "hg38") {
-
-    genome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
-
-  } else if (genome_label == "hg38-NCBI") {
-
-    genome <- BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38
+  if (identical(genome_label, "user_define")) {
+    if (is.null(user_genome_arg)) {
+      stop("When genome_label is 'user_define', argument `genome` must be a BSgenome object.")
+    }
+    if (!methods::is(user_genome_arg, "BSgenome")) {
+      stop("When genome_label is 'user_define', `genome` must inherit from BSgenome.")
+    }
+    genome <- user_genome_arg
+  } else {
+    if (!is.null(user_genome_arg) && methods::is(user_genome_arg, "BSgenome")) {
+      warning(
+        "Argument `genome` is ignored unless genome_label is \"user_define\".",
+        call. = FALSE
+      )
+    }
+    if (genome_label == "hg19") {
+      genome <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19
+    } else if (genome_label == "hg38") {
+      genome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
+    } else if (genome_label == "hg38-NCBI") {
+      genome <- BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38
+    }
+  }
 
+  if (!isFALSE(chromosome_to_keep)) {
+    sl_names <- names(GenomeInfoDb::seqlengths(genome))
+    unknown_chr <- setdiff(chromosome_to_keep, sl_names)
+    if (length(unknown_chr) > 0L) {
+      stop(
+        "chromosome_to_keep contains seqnames not found in the BSgenome: ",
+        paste(unknown_chr, collapse = ", ")
+      )
+    }
   }
 
-  genome_name <- seqinfo(genome)@genome %>% unique()
+  genome_name <- unique(GenomeInfoDb::seqinfo(genome)@genome)
+  if (length(genome_name) != 1L || is.na(genome_name)) {
+    stop(
+      "BSgenome seqinfo must have a single non-NA genome assembly name ",
+      "(GenomeInfoDb::seqinfo(genome)@genome)."
+    )
+  }
 
   ###########################################################################
   # Read BAM into galp based on user-defined parameters
@@ -205,6 +239,8 @@ readBam <- function(bamfile,
       )
   }
 
+  frag <- .cfDNAPro_attach_read_bam_genome_metadata(frag, genome_label, genome)
+
   ###########################################################################
   # Saving RDS file
   ###########################################################################
diff --git a/R/readGALP.R b/R/readGALP.R
index 40255dc..17ad825 100644
--- a/R/readGALP.R
+++ b/R/readGALP.R
@@ -4,21 +4,17 @@
 #' @import GenomeInfoDb
 #' @import GenomicAlignments
 #' @import S4Vectors
+#' @importFrom methods is
 #' @importFrom Rsamtools scanBamFlag ScanBamParam
 #' @importFrom GenomicRanges GRanges seqnames
 #' @importFrom IRanges IRanges
 #'
-#' @param genome_label The Genome you used in the alignment. 
-#'    Should be "hg19" or "hg38" or "hg38-NCBI". Default is "hg19". 
-#'    Note: "hg19" will load BSgenome.Hsapiens.UCSC.hg19 package, which is 
-#'    Full genome sequences for Homo sapiens (Human) as provided by 
-#'    UCSC (hg19, based on GRCh37.p13) and stored in Biostrings objects; 
-#'    "hg38" will load BSgenome.Hsapiens.UCSC.hg38 package, which is 
-#'    Full genome sequences for Homo sapiens (Human) as provided by 
-#'    UCSC (hg38, based on GRCh38.p13) and stored in Biostrings objects.
-#'    "hg38-NCBI" will load BSgenome.Hsapiens.NCBI.GRCh38 package, which is 
-#'    full genome sequences for Homo sapiens (Human) as provided by 
-#'    NCBI (GRCh38, 2013-12-17) and stored in Biostrings objects.
+#' @param genome_label The genome you used for alignment: `"hg19"`,
+#'   `"hg38"`, `"hg38-NCBI"`, or `"user_define"`. Default is `"hg19"`.
+#'   For `"user_define"`, pass a matching `BSgenome` via `genome`.
+#' @param genome Used only when `genome_label = "user_define"`: a `BSgenome`
+#'   object. Ignored with a warning for built-in `genome_label` values if
+#'   a `BSgenome` is supplied.
 #' @param bamfile The bam file name.
 #' @param outdir The path for saving rds file. Default is NA, i.e. not saving.
 #' @param strand_mode Usually the strand_mode  = 1 means the First read is 
@@ -54,6 +50,7 @@ readGALP <- function(
     chromosome_to_keep = paste("chr", 1:22, sep = ""),
     strand_mode = 1,
     genome_label = "hg19",
+    genome = NULL,
     outdir = FALSE,
     galp_flag =  Rsamtools::scanBamFlag(
       isPaired = TRUE,
@@ -75,25 +72,49 @@ readGALP <- function(
     stop("Input is not paired-end Bam file.")
   }
   
-  if (!genome_label %in% c("hg19", "hg38", "hg38-NCBI") | is.na(genome_label)) {
-    stop("Only accept hg19 or hg38 or hg38-NCBI as the genome_label...")
+  .valid_genome_labels <- c("hg19", "hg38", "hg38-NCBI", "user_define")
+  if (!genome_label %in% .valid_genome_labels || length(genome_label) != 1L ||
+        is.na(genome_label)) {
+    stop(
+      "genome_label must be one of: ",
+      paste0("'", .valid_genome_labels, "'", collapse = ", "),
+      "."
+    )
   }
-  
-  if (genome_label == "hg19") {
-    
-    genome <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19
-    
-  } else if (genome_label == "hg38") {
-    
-    genome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
-    
-  } else if (genome_label == "hg38-NCBI") {
-    
-    genome <- BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38
-    
+
+  user_genome_arg <- genome
+
+  if (identical(genome_label, "user_define")) {
+    if (is.null(user_genome_arg)) {
+      stop("When genome_label is 'user_define', argument `genome` must be a BSgenome object.")
+    }
+    if (!methods::is(user_genome_arg, "BSgenome")) {
+      stop("When genome_label is 'user_define', `genome` must inherit from BSgenome.")
+    }
+    genome <- user_genome_arg
+  } else {
+    if (!is.null(user_genome_arg) && methods::is(user_genome_arg, "BSgenome")) {
+      warning(
+        "Argument `genome` is ignored unless genome_label is \"user_define\".",
+        call. = FALSE
+      )
+    }
+    if (genome_label == "hg19") {
+      genome <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19
+    } else if (genome_label == "hg38") {
+      genome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
+    } else if (genome_label == "hg38-NCBI") {
+      genome <- BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38
+    }
+  }
+
+  genome_name <- unique(GenomeInfoDb::seqinfo(genome)@genome)
+  if (length(genome_name) != 1L || is.na(genome_name)) {
+    stop(
+      "BSgenome seqinfo must have a single non-NA genome assembly name ",
+      "(GenomeInfoDb::seqinfo(genome)@genome)."
+    )
   }
-  
-  genome_name <- seqinfo(genome)@genome %>% unique()
   
   ############################################################################
   # Read bam into galp
diff --git a/R/read_bam_insert_metrics.R b/R/read_bam_insert_metrics.R
index 7b7491c..be08bc8 100644
--- a/R/read_bam_insert_metrics.R
+++ b/R/read_bam_insert_metrics.R
@@ -1,17 +1,13 @@
 #' Calculate insert sizes from a curated GRanges object
 #' 
 #' @param bamfile The bam file name.
-#' @param genome_label The Genome you used in the alignment. 
-#'    Should be "hg19" or "hg38" or "hg38-NCBI. Default is "hg19". 
-#'    Note: "hg19" will load BSgenome.Hsapiens.UCSC.hg19 package, which is 
-#'    Full genome sequences for Homo sapiens (Human) as provided by 
-#'    UCSC (hg19, based on GRCh37.p13) and stored in Biostrings objects; 
-#'    "hg38" will load BSgenome.Hsapiens.UCSC.hg38 package, which is 
-#'    Full genome sequences for Homo sapiens (Human) as provided by 
-#'    UCSC (hg38, based on GRCh38.p13) and stored in Biostrings objects.
-#'    "hg38-NCBI" will load BSgenome.Hsapiens.NCBI.GRCh38 package, which is 
-#'    full genome sequences for Homo sapiens (Human) as provided by 
-#'    NCBI (GRCh38, 2013-12-17) and stored in Biostrings objects.
+#' @param genome_label The Genome you used in the alignment.
+#'    One of `"hg19"`, `"hg38"`, `"hg38-NCBI"`, or `"user_define"` (default `"hg19"`).
+#'    For built-in labels, the corresponding BSgenome data package is loaded;
+#'    for `"user_define"`, supply `genome`.
+#' @param genome When `genome_label = "user_define"`, a `BSgenome` object
+#'    matching the BAM reference. Otherwise optional; a `BSgenome` here is
+#'    ignored with a warning.
 #' @param outdir The path for saving rds file. Default is FALSE, i.e. not saving.
 #' @param strand_mode Usually the strand_mode  = 1 means the First read is 
 #'    aligned to positive strand. Details please see GenomicAlignments docs.
@@ -47,6 +43,7 @@ read_bam_insert_metrics <- function(bamfile = NULL,
                                     chromosome_to_keep =paste0("chr", 1:22),
                                     strand_mode = 1,
                                     genome_label = "hg19",
+                                    genome = NULL,
                                     outdir = FALSE,
                                     isize_min = 1L,
                                     isize_max = 1000L,
@@ -63,10 +60,11 @@ read_bam_insert_metrics <- function(bamfile = NULL,
   )
   
   if(!is.null(bamfile)) {
-    frag <- readBam(bamfile = bamfile, 
+    frag <- readBam(bamfile = bamfile,
                   chromosome_to_keep = chromosome_to_keep,
                   strand_mode = strand_mode,
                   genome_label = genome_label,
+                  genome = genome,
                   outdir = outdir)
   } else if(!is.null(fragment_obj)) {
     
diff --git a/README.md b/README.md
index 91cd992..240f491 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ Please do not impose any filtering on the bam files; For example, do not filter
 (5) No supplementary alignment;\
 (6) No unmapped reads.
 
-Note: remember to choose the correct `genome_label`, a parameter in `readBam()` function, based on the ref genome you used for alignment. At the moment, it supports three different ref genomes, hg19, hg38 and hg38-NCBI, For details see readBam() R documentation by typing `?readBam` in the R console or see source code:https://github.com/hw538/cfDNAPro/blob/master/R/readBam.R
+Note: set `genome_label` in `readBam()` to match the reference you used for alignment. Built-in values are `hg19`, `hg38`, and `hg38-NCBI`. For any other reference packaged as a Bioconductor `BSgenome`, use `genome_label = "user_define"` and pass that object via the `genome` argument. See `?readBam`.
 
 ## Output
 
diff --git a/man/callMotif.Rd b/man/callMotif.Rd
index 3c71317..12be987 100644
--- a/man/callMotif.Rd
+++ b/man/callMotif.Rd
@@ -11,6 +11,7 @@ callMotif(
   integrate_mut = FALSE,
   ref_type = "locus_fragment",
   downsample_ref = FALSE,
+  genome = NULL,
   ...
 )
 }
@@ -31,6 +32,10 @@ fragments/read-pairs that do not overlap indicated mutation loci.}
 \item{downsample_ref}{Logical, if TRUE, downsamples reference motifs to match
 the count of mutant motifs.}
 
+\item{genome}{Optional \code{BSgenome} passed to \code{get_genome_reference()};
+use when the fragment object was built with \code{genome_label = "user_define"}
+and the BSgenome cannot be restored from metadata, or to override inference.}
+
 \item{...}{Additional parameters passed to underlying functions.}
 }
 \value{
diff --git a/man/callTrinucleotide.Rd b/man/callTrinucleotide.Rd
index 8f23804..89a00ba 100644
--- a/man/callTrinucleotide.Rd
+++ b/man/callTrinucleotide.Rd
@@ -5,10 +5,14 @@
 \title{Call trinucleotides and summarise the cfDNA information
 for each target mutation locus.}
 \usage{
-callTrinucleotide(frag_obj_mut)
+callTrinucleotide(frag_obj_mut, genome = NULL)
 }
 \arguments{
 \item{frag_obj_mut}{GRanges object containing fragment and mutation data.}
+
+\item{genome}{Optional \code{BSgenome} passed to \code{get_genome_reference()};
+use when fragments were built with \code{genome_label = "user_define"} and the
+object cannot be restored from package metadata, or to override inference.}
 }
 \value{
 dataframe with summarised mutational and trinucleotide data.
diff --git a/man/readBam.Rd b/man/readBam.Rd
index d829f63..e3c4a98 100644
--- a/man/readBam.Rd
+++ b/man/readBam.Rd
@@ -11,6 +11,7 @@ readBam(
   chromosome_to_keep = paste0("chr", 1:22),
   strand_mode = 1,
   genome_label = "hg19",
+  genome = NULL,
   outdir = NA,
   galp_flag = Rsamtools::scanBamFlag(isPaired = TRUE, isDuplicate = FALSE,
     isSecondaryAlignment = FALSE, isUnmappedQuery = FALSE, isSupplementaryAlignment =
@@ -42,17 +43,16 @@ Default is paste0("chr", 1:22).}
 the strand of the pair is strand of its first alignment.
 For More Details please see GenomicAlignments docs.}
 
-\item{genome_label}{The Genome you used in the alignment.
-Should be "hg19" or "hg38" or "hg38-NCBI". Default is "hg19".
-Note: "hg19" will load BSgenome.Hsapiens.UCSC.hg19 package, which is
-Full genome sequences for Homo sapiens (Human) as provided by
-UCSC (hg19, based on GRCh37.p13) and stored in Biostrings objects;
-"hg38" will load BSgenome.Hsapiens.UCSC.hg38 package, which is
-Full genome sequences for Homo sapiens (Human) as provided by
-UCSC (hg38, based on GRCh38.p13) and stored in Biostrings objects.
-"hg38-NCBI" will load BSgenome.Hsapiens.NCBI.GRCh38 package, which is
-full genome sequences for Homo sapiens (Human) as provided by
-NCBI (GRCh38, 2013-12-17) and stored in Biostrings objects.}
+\item{genome_label}{The genome you used for alignment: \code{"hg19"},
+\code{"hg38"}, \code{"hg38-NCBI"}, or \code{"user_define"} (default \code{"hg19"}).
+For the first three values, the matching BSgenome data package is loaded
+automatically. For \code{"user_define"}, pass a ready-made \code{BSgenome}
+object via \code{genome} (same reference as the BAM).}
+
+\item{genome}{Used only when \code{genome_label = "user_define"}: a
+\code{BSgenome} object built or loaded beforehand (must match the BAM
+reference). For built-in labels, if you pass a \code{BSgenome} here it is
+ignored with a warning.}
 
 \item{outdir}{The path for saving rds file. Default is NA, i.e. not saving.}
 
diff --git a/man/readGALP.Rd b/man/readGALP.Rd
index 72cf830..1b6eefb 100644
--- a/man/readGALP.Rd
+++ b/man/readGALP.Rd
@@ -10,6 +10,7 @@ readGALP(
   chromosome_to_keep = paste("chr", 1:22, sep = ""),
   strand_mode = 1,
   genome_label = "hg19",
+  genome = NULL,
   outdir = FALSE,
   galp_flag = Rsamtools::scanBamFlag(isPaired = TRUE, isDuplicate = FALSE,
     isSecondaryAlignment = FALSE, isUnmappedQuery = FALSE, isSupplementaryAlignment =
@@ -32,19 +33,15 @@ Default is paste0("chr", 1:22).}
 \item{strand_mode}{Usually the strand_mode  = 1 means the First read is 
 aligned to positive strand. Details please see GenomicAlignments docs.}
 
-\item{genome_label}{The Genome you used in the alignment. 
-Should be "hg19" or "hg38" or "hg38-NCBI". Default is "hg19". 
-Note: "hg19" will load BSgenome.Hsapiens.UCSC.hg19 package, which is 
-Full genome sequences for Homo sapiens (Human) as provided by 
-UCSC (hg19, based on GRCh37.p13) and stored in Biostrings objects; 
-"hg38" will load BSgenome.Hsapiens.UCSC.hg38 package, which is 
-Full genome sequences for Homo sapiens (Human) as provided by 
-UCSC (hg38, based on GRCh38.p13) and stored in Biostrings objects.
-"hg38-NCBI" will load BSgenome.Hsapiens.NCBI.GRCh38 package, which is 
-full genome sequences for Homo sapiens (Human) as provided by 
-NCBI (GRCh38, 2013-12-17) and stored in Biostrings objects.}
+\item{genome_label}{The genome you used for alignment: \code{"hg19"},
+\code{"hg38"}, \code{"hg38-NCBI"}, or \code{"user_define"} (default \code{"hg19"}).
+For \code{"user_define"}, pass a matching \code{BSgenome} via \code{genome}.}
 
-\item{outdir}{The path for saving rds file. Default is NA, i.e. not saving.}
+\item{genome}{Used only when \code{genome_label = "user_define"}: a \code{BSgenome}
+object. Ignored with a warning for built-in \code{genome_label} values if
+a \code{BSgenome} is supplied.}
+
+\item{outdir}{The path for saving rds file. Default is FALSE, i.e. not saving.}
 
 \item{galp_flag}{}
 
diff --git a/man/read_bam_insert_metrics.Rd b/man/read_bam_insert_metrics.Rd
index 232b438..376c503 100644
--- a/man/read_bam_insert_metrics.Rd
+++ b/man/read_bam_insert_metrics.Rd
@@ -10,6 +10,7 @@ read_bam_insert_metrics(
   chromosome_to_keep = paste0("chr", 1:22),
   strand_mode = 1,
   genome_label = "hg19",
+  genome = NULL,
   outdir = FALSE,
   isize_min = 1L,
   isize_max = 1000L,
@@ -26,17 +27,13 @@ Default is paste0("chr", 1:22).}
 \item{strand_mode}{Usually the strand_mode  = 1 means the First read is 
 aligned to positive strand. Details please see GenomicAlignments docs.}
 
-\item{genome_label}{The Genome you used in the alignment. 
-Should be "hg19" or "hg38" or "hg38-NCBI. Default is "hg19". 
-Note: "hg19" will load BSgenome.Hsapiens.UCSC.hg19 package, which is 
-Full genome sequences for Homo sapiens (Human) as provided by 
-UCSC (hg19, based on GRCh37.p13) and stored in Biostrings objects; 
-"hg38" will load BSgenome.Hsapiens.UCSC.hg38 package, which is 
-Full genome sequences for Homo sapiens (Human) as provided by 
-UCSC (hg38, based on GRCh38.p13) and stored in Biostrings objects.
-"hg38-NCBI" will load BSgenome.Hsapiens.NCBI.GRCh38 package, which is 
-full genome sequences for Homo sapiens (Human) as provided by 
-NCBI (GRCh38, 2013-12-17) and stored in Biostrings objects.}
+\item{genome_label}{One of \code{"hg19"}, \code{"hg38"}, \code{"hg38-NCBI"},
+or \code{"user_define"} (default \code{"hg19"}). For \code{"user_define"},
+supply \code{genome}.}
+
+\item{genome}{When \code{genome_label = "user_define"}, a \code{BSgenome} object
+matching the BAM reference. Otherwise optional; a \code{BSgenome} here is
+ignored with a warning.}
 
 \item{outdir}{The path for saving rds file. Default is FALSE, i.e. not saving.}
 
diff --git a/tests/testthat/test_readBam_genome.R b/tests/testthat/test_readBam_genome.R
new file mode 100644
index 0000000..bacea1c
--- /dev/null
+++ b/tests/testthat/test_readBam_genome.R
@@ -0,0 +1,34 @@
+test_that("get_genome_reference uses explicit genome argument", {
+  skip_if_not_installed("BSgenome.Hsapiens.UCSC.hg19")
+  g <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19
+  gr <- GenomicRanges::GRanges("chr1", IRanges::IRanges(1, 10))
+  out <- cfDNAPro:::get_genome_reference(gr, genome = g)
+  expect_identical(out, g)
+})
+
+test_that("get_genome_reference rejects non-BSgenome explicit genome", {
+  gr <- GenomicRanges::GRanges()
+  expect_error(
+    cfDNAPro:::get_genome_reference(gr, genome = "not-a-BSgenome"),
+    "BSgenome"
+  )
+})
+
+test_that("get_genome_reference restores user_define from package metadata", {
+  skip_if_not_installed("BSgenome.Hsapiens.UCSC.hg19")
+  g <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19
+  gr <- GenomicRanges::GRanges()
+  S4Vectors::metadata(gr) <- list(
+    cfDNAPro_genome_label = "user_define",
+    cfDNAPro_bsgenome_pkg = "BSgenome.Hsapiens.UCSC.hg19",
+    cfDNAPro_bsgenome_symbol = "BSgenome.Hsapiens.UCSC.hg19"
+  )
+  out <- cfDNAPro:::get_genome_reference(gr)
+  expect_identical(out, g)
+})
+
+test_that("get_genome_reference errors when user_define cannot be restored", {
+  gr <- GenomicRanges::GRanges()
+  S4Vectors::metadata(gr) <- list(cfDNAPro_genome_label = "user_define")
+  expect_error(cfDNAPro:::get_genome_reference(gr), "user_define")
+})

From 50aaaf45f6744e0df73345b00f11fd5b64cc8f2b Mon Sep 17 00:00:00 2001
From: SHY <serine94209792@gmail.com>
Date: Thu, 14 May 2026 10:54:26 +0800
Subject: [PATCH 2/3] use seqinfo(genome = BSgenome) rather than seqinfo(genome
 = "...")

---
 R/internalMutFunctions.R | 21 ++++++++++++++++-----
 R/readBam.R              | 16 +++++++++++++++-
 R/readGALP.R             |  8 +++++++-
 3 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/R/internalMutFunctions.R b/R/internalMutFunctions.R
index 3b64f3d..caaf26f 100644
--- a/R/internalMutFunctions.R
+++ b/R/internalMutFunctions.R
@@ -1227,7 +1227,11 @@ bam_to_galp_mut <- function(bamfile,
   message("BAM file has been read into galp.")
 
   # add genome information
-  if (isSingleStringOrNA(genome)) {
+  if (methods::is(genome, "Seqinfo")) {
+    if (!isFALSE(chromosome_to_keep)) {
+      genome <- genome[chromosome_to_keep]
+    }
+  } else if (isSingleStringOrNA(genome)) {
     genome <- Seqinfo(genome = genome)[chromosome_to_keep]
   }
 
@@ -1512,6 +1516,7 @@ process_bam_file <- function(bamfile,
                              chromosome_to_keep = NULL,
                              strand_mode = 1,
                              genome_name,
+                             galp_merge_seqinfo = NULL,
                              galp_flag,
                              galp_what,
                              galp_tag,
@@ -1525,6 +1530,12 @@ process_bam_file <- function(bamfile,
     mapqFilter = galp_mapqFilter
   )
 
+  genome_for_galp <- if (!is.null(galp_merge_seqinfo)) {
+    galp_merge_seqinfo
+  } else {
+    genome_name
+  }
+
   # Check the presence of a mutation file and the mode of operation
   if (!is.null(mutation_file)) {
     loci_df <- suppressMessages(
@@ -1552,7 +1563,7 @@ process_bam_file <- function(bamfile,
                               use_names = use_names,
                               chromosome_to_keep = chromosome_to_keep,
                               strand_mode = strand_mode,
-                              genome = genome_name,
+                              genome = genome_for_galp,
                               param = galp_param_mut)
       # Update metadata with content specifics
       galp@metadata <- list(
@@ -1568,7 +1579,7 @@ process_bam_file <- function(bamfile,
                            use_names = use_names,
                            chromosome_to_keep = chromosome_to_keep,
                            strand_mode = strand_mode,
-                           genome = genome_name,
+                           genome = genome_for_galp,
                            param = galp_param)
 
       galp_mm <- suppressMessages(bam_to_galp_mut(
@@ -1576,7 +1587,7 @@ process_bam_file <- function(bamfile,
                                   use_names = use_names,
                                   chromosome_to_keep = chromosome_to_keep,
                                   strand_mode = strand_mode,
-                                  genome = genome_name,
+                                  genome = genome_for_galp,
                                   param = galp_param_mut))
 
       if (length(galp_mm) == 0) {
@@ -1596,7 +1607,7 @@ process_bam_file <- function(bamfile,
                          use_names = use_names,
                          chromosome_to_keep = chromosome_to_keep,
                          strand_mode = strand_mode,
-                         genome = genome_name,
+                         genome = genome_for_galp,
                          param = galp_param)
   }
 
diff --git a/R/readBam.R b/R/readBam.R
index 34607b0..813934a 100644
--- a/R/readBam.R
+++ b/R/readBam.R
@@ -161,6 +161,15 @@ readBam <- function(bamfile,
     )
   }
 
+  # Seqinfo from the BSgenome avoids GenomeInfoDb::Seqinfo(genome = <RefSeq ID>),
+  # which queries NCBI FTP (e.g. GCF_000001405.26 for GATK GRCh38) and can fail
+  # offline or when FTP layout changes.
+  galp_merge_seqinfo <- if (!isFALSE(chromosome_to_keep)) {
+    GenomeInfoDb::seqinfo(genome)[chromosome_to_keep]
+  } else {
+    GenomeInfoDb::seqinfo(genome)
+  }
+
   ###########################################################################
   # Read BAM into galp based on user-defined parameters
   ###########################################################################
@@ -171,6 +180,7 @@ readBam <- function(bamfile,
                            chromosome_to_keep = chromosome_to_keep,
                            strand_mode = strand_mode,
                            genome_name = genome_name,
+                           galp_merge_seqinfo = galp_merge_seqinfo,
                            galp_flag = galp_flag,
                            galp_what = galp_what,
                            galp_tag = galp_tag,
@@ -294,7 +304,11 @@ bam_to_galp2 <- function(bamfile,
   message("BAM file has been read into galp.")
 
   # add genome information
-  if (isSingleStringOrNA(genome)) {
+  if (methods::is(genome, "Seqinfo")) {
+    if (!isFALSE(chromosome_to_keep)) {
+      genome <- genome[chromosome_to_keep]
+    }
+  } else if (isSingleStringOrNA(genome)) {
     genome <- Seqinfo(genome = genome)[chromosome_to_keep]
   }
 
diff --git a/R/readGALP.R b/R/readGALP.R
index 17ad825..099def8 100644
--- a/R/readGALP.R
+++ b/R/readGALP.R
@@ -115,6 +115,12 @@ readGALP <- function(
       "(GenomeInfoDb::seqinfo(genome)@genome)."
     )
   }
+
+  galp_merge_seqinfo <- if (!isFALSE(chromosome_to_keep)) {
+    GenomeInfoDb::seqinfo(genome)[chromosome_to_keep]
+  } else {
+    GenomeInfoDb::seqinfo(genome)
+  }
   
   ############################################################################
   # Read bam into galp
@@ -129,7 +135,7 @@ readGALP <- function(
                        use_names = use_names,
                        chromosome_to_keep = chromosome_to_keep,
                        strand_mode = strand_mode, 
-                       genome = genome_name,
+                       genome = galp_merge_seqinfo,
                        param = galp_param
   ) 
   

From d28dbd085f44c022af23947cbd77df6f78805857 Mon Sep 17 00:00:00 2001
From: SHY <serine94209792@gmail.com>
Date: Thu, 14 May 2026 16:05:02 +0800
Subject: [PATCH 3/3] add genome parameter in writeMutTable

---
 R/internalMutFunctions.R | 6 ++++--
 man/writeMutTable.Rd     | 6 +++++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/R/internalMutFunctions.R b/R/internalMutFunctions.R
index caaf26f..63767e9 100644
--- a/R/internalMutFunctions.R
+++ b/R/internalMutFunctions.R
@@ -211,6 +211,8 @@ read_mutation_file <- function(mutation_file) {
 #'
 #' @param frag_obj GRanges object containing genomic ranges and associated data.
 #' @param output_file Character string specifying the path/name of the output file.
+#' @param genome Optional `BSgenome` passed to [callTrinucleotide()]. Use when
+#'   `frag_obj` was built with `genome_label = "user_define"`
 #'
 #' @return The path to the output file as a character string.
 #'
@@ -219,7 +221,7 @@ read_mutation_file <- function(mutation_file) {
 #' @examples
 #' output_path <- writeMutTable(gr, "output_directory", "output_filename")
 #' print(paste("File written to:", output_path))
-writeMutTable <- function(frag_obj, output_file) {
+writeMutTable <- function(frag_obj, output_file, genome = NULL) {
 
   # Extract the directory path from the output file path
   output_dir <- dirname(output_file)
@@ -231,7 +233,7 @@ writeMutTable <- function(frag_obj, output_file) {
     stop("The directory does not exist. Please provide a valid directory path.")
   }
 
-  trinuc_df <- callTrinucleotide(frag_obj = frag_obj)
+  trinuc_df <- callTrinucleotide(frag_obj = frag_obj, genome = genome)
 
   # Adding informative columns for counts and fractions
   trinuc_df <- trinuc_df %>%
diff --git a/man/writeMutTable.Rd b/man/writeMutTable.Rd
index 7ae552e..56fd0ea 100644
--- a/man/writeMutTable.Rd
+++ b/man/writeMutTable.Rd
@@ -4,12 +4,16 @@
 \alias{writeMutTable}
 \title{Function that generates a Mutation Table with summarised cfDNA fragment data}
 \usage{
-writeMutTable(frag_obj, output_file)
+writeMutTable(frag_obj, output_file, genome = NULL)
 }
 \arguments{
 \item{frag_obj}{GRanges object containing genomic ranges and associated data.}
 
 \item{output_file}{Character string specifying the path/name of the output file.}
+
+\item{genome}{Optional \code{BSgenome} passed to \code{\link{callTrinucleotide}}; use when
+\code{frag_obj} was built with \code{genome_label = "user_define"} and the reference
+cannot be restored from object metadata. Defaults to \code{NULL} (infer when possible).}
 }
 \value{
 The path to the output file as a character string.