epifluidlab · ravibandaru-lab · May 26, 2026 · May 18, 2026 · May 26, 2026 · May 26, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,43 @@ The format is based on
 and this project adheres to
 [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased]
+
+### Added
+- CRAM support threaded through all `frag` modules: `coverage`, `frag_length`,
+  `frag_length_bins`, `frag_length_intervals`, `wps`, `multi_wps`,
+  `cleavage_profile`, and `multi_cleavage_profile` now accept a
+  `reference_file` parameter (FASTA only) required for decoding CRAM input.
+- `--reference-file` CLI flag added to `frag-length-bins`,
+  `frag-length-intervals`, and `wps` subcommands.
+- `--reference-file` CLI flag added to `cleavage-profile` subcommand (no `-r`
+  short form, as `-r` was already taken by `--right`).
+- `ReferenceWrapper` auto-creates a missing `.fai` index (via `pysam.faidx`)
+  when a FASTA file is opened without a pre-built index.
+- `.fna` and `.fna.gz` extensions recognised as FASTA by `ReferenceWrapper`.
+- Integration test suite `tests/test_cram.py` verifying CRAM produces
+  identical results to BAM for `single_coverage`, `frag_length_bins`, `wps`,
+  and `delfi`.
+- Smoke tests in `tests/test_cram.py` verifying CRAM input runs without error
+  for `coverage`, `frag_length`, `frag_length_intervals`, `multi_wps`,
+  `cleavage_profile`, and `multi_cleavage_profile`.
+
+### Changed
+- CLI help text for `delfi`, `end-motifs`, `interval-end-motifs`,
+  `breakpoint-motifs`, and `interval-breakpoint-motifs` updated: the
+  `reference_file` argument description now reads "A .2bit or FASTA
+  (.fa, .fasta, .fna) file" instead of "The .2bit file".
+- `multi_wps` now correctly forwards `fraction_low` and `fraction_high` to
+  worker processes (these were silently dropped before).
+
+### Fixed
+- CRAM files can now be used with all analysis subcommands; previously
+  `frag_generator` was never passed the reference file needed for CRAM
+  decoding.
+- `delfi` docstring clarifies that when `input_file` is a CRAM file,
+  `reference_file` must be a FASTA (not .2bit), as htslib requires FASTA for
+  CRAM decoding.
+
 ## [0.11.1] - 2026-04-21
 
 ### Added

diff --git a/docs/documentation/user_guide/inputdata.rst b/docs/documentation/user_guide/inputdata.rst
@@ -21,12 +21,22 @@ CRAM
 
 A compressed read alignment map file is another binary file standard. It
 is a binary file that is smaller than a BAM file, but still contains all of the
-same information. 
+same information.
 
 **FinaleToolkit** requires that CRAM files be CRAI indexed. Therefore, you
 should have an associated ``.cram.crai`` file in the same directory of your
 input data.
 
+.. note::
+
+   CRAM files require a reference genome to decode reads. When supplying a
+   CRAM file as input, you must also provide a FASTA reference file (e.g.
+   ``--reference-file /path/to/reference.fa``) to any **FinaleToolkit**
+   subcommand. The FASTA file must be the same reference used during alignment
+   and must be accompanied by a ``.fai`` index (created automatically by
+   **FinaleToolkit** if not present). A ``.2bit`` file cannot be used as the
+   reference when reading CRAM files.
+
 Fragment File
 ^^^^^^^^^^^^^^^^
 

diff --git a/src/finaletoolkit/cli/main_cli.py b/src/finaletoolkit/cli/main_cli.py
@@ -49,7 +49,7 @@ def main_cli_parser():
     cli_coverage.add_argument(
         "-r",
         "--reference-file",
-        help="A FASTA or 2-bit file, which is used to support CRAM reading.",
+        help="Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required for CRAM input.",
         required=False
     )
     cli_coverage.add_argument(
@@ -127,6 +127,11 @@ def main_cli_parser():
     cli_frag_length_bins.add_argument(
         'input_file',
         help='Path to a BAM/CRAM/Fragment file containing fragment data.')
+    cli_frag_length_bins.add_argument(
+        '-r',
+        '--reference-file',
+        help='Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required for CRAM input.',
+        required=False)
     cli_frag_length_bins.add_argument(
         '-c',
         '--contig',
@@ -225,6 +230,11 @@ def main_cli_parser():
         'input_file',
         help='Path to a BAM/CRAM/Fragment file containing fragment '
         'data.')
+    cli_frag_length_intervals.add_argument(
+        '-r',
+        '--reference-file',
+        help='Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required for CRAM input.',
+        required=False)
     cli_frag_length_intervals.add_argument(
         'interval_file',
         help='Path to a BED file containing intervals to retrieve '
@@ -295,6 +305,10 @@ def main_cli_parser():
     cli_cleavage_profile.add_argument(
         'input_file',
         help='Path to a BAM/CRAM/Fragment file containing fragment data.')
+    cli_cleavage_profile.add_argument(
+        '--reference-file',
+        help='Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required for CRAM input.',
+        required=False)
     cli_cleavage_profile.add_argument(
         'interval_file',
         help='Path to a BED file containing intervals to calculates cleavage '
@@ -382,6 +396,11 @@ def main_cli_parser():
         'input_file',
         help='Path to a BAM/CRAM/Fragment file containing fragment '
         'data.')
+    cli_wps.add_argument(
+        '-r',
+        '--reference-file',
+        help='Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required for CRAM input.',
+        required=False)
     cli_wps.add_argument(
         'site_bed',
         help='Path to a BED file containing sites to calculate WPS '
@@ -562,8 +581,8 @@ def main_cli_parser():
         "You want to replicate the original scripts.")
     cli_delfi.add_argument(
         'reference_file',
-        help="The .2bit file for the associate reference genome sequence used "
-        "during alignment.")
+        help="A .2bit or FASTA (.fa, .fasta, .fna) file for the reference "
+        "genome sequence used during alignment.")
     cli_delfi.add_argument(
         'bins_file',
         help="A BED file containing bins over which to calculate DELFI. To "
@@ -677,8 +696,8 @@ def main_cli_parser():
         help='Path to a BAM/CRAM/Fragment file containing fragment data.')
     cli_motifs.add_argument(
         'refseq_file',
-        help='The .2bit file for the associate reference genome sequence used '
-        'during alignment.')
+        help='A .2bit or FASTA (.fa, .fasta, .fna) file for the reference '
+        'genome sequence used during alignment.')
     cli_motifs.add_argument(
         '-k',
         default=4,
@@ -748,8 +767,8 @@ def main_cli_parser():
         help='Path to a BAM/CRAM/Fragment file containing fragment data.')
     cli_interval_motifs.add_argument(
         'refseq_file',
-        help='The .2bit file for the associate reference genome sequence used '
-        'during alignment.')
+        help='A .2bit or FASTA (.fa, .fasta, .fna) file for the reference '
+        'genome sequence used during alignment.')
     cli_interval_motifs.add_argument(
         'intervals',
         help='Path to a BED file containing intervals to retrieve end motif '
@@ -836,8 +855,8 @@ def main_cli_parser():
         help='Path to a BAM/CRAM/Fragment file containing fragment data.')
     cli_breakpoint.add_argument(
         'refseq_file',
-        help='The .2bit file for the associate reference genome sequence used '
-        'during alignment.')
+        help='A .2bit or FASTA (.fa, .fasta, .fna) file for the reference '
+        'genome sequence used during alignment.')
     cli_breakpoint.add_argument(
         '-k',
         default=6,
@@ -907,8 +926,8 @@ def main_cli_parser():
         help='Path to a BAM/CRAM/Fragment file containing fragment data.')
     cli_interval_breakpoint.add_argument(
         'refseq_file',
-        help='The .2bit file for the associate reference genome sequence used '
-        'during alignment.')
+        help='A .2bit or FASTA (.fa, .fasta, .fna) file for the reference '
+        'genome sequence used during alignment.')
     cli_interval_breakpoint.add_argument(
         'intervals',
         help='Path to a BED file containing intervals to retrieve breakpoint motif '

diff --git a/src/finaletoolkit/frag/_cleavage_profile.py b/src/finaletoolkit/frag/_cleavage_profile.py
@@ -41,6 +41,7 @@ def cleavage_profile(
     verbose: Union[bool, int]=0,
     fraction_low: int|None=None,
     fraction_high: int|None=None,
+    reference_file: str | Path | None = None,
 ) -> np.ndarray:
     """
     Cleavage profile calculated over a single interval.
@@ -73,6 +74,9 @@ def cleavage_profile(
         Deprecated alias for min_length
     fraction_high : int, optional
         Deprecated alias for max_length
+    reference_file : str or Path, optional
+        Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required
+        when `input_file` is a CRAM file; ignored for BAM/frag files.
 
     Return
     ------
@@ -131,7 +135,8 @@ def cleavage_profile(
         stop=adj_stop,
         min_length=min_length,
         max_length=max_length,
-        intersect_policy="any"
+        intersect_policy="any",
+        reference_file=reference_file,
     )
 
     positions = np.arange(adj_start, adj_stop)
@@ -192,6 +197,7 @@ def multi_cleavage_profile(
     verbose: Union[bool, int]=0,
     fraction_low: int|None=None,
     fraction_high: int|None=None,
+    reference_file: str | Path | None = None,
 ):
     """
     Multithreaded cleavage profile implementation over intervals in a
@@ -225,7 +231,10 @@ def multi_cleavage_profile(
         Deprecated alias for min_length
     fraction_high : int, optional
         Deprecated alias for max_length
-
+    reference_file : str or Path, optional
+        Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required
+        when `input_file` is a CRAM file; ignored for BAM/frag files.
+
     Returns
     -------
     output_file: str
@@ -356,7 +365,10 @@ def multi_cleavage_profile(
         count*[min_length],
         count*[max_length],
         count*[quality_threshold],
-        count*[max(verbose-1, 0)]
+        count*[max(verbose-1, 0)],
+        count*[fraction_low],
+        count*[fraction_high],
+        count*[reference_file],
     )
 
     if (verbose):

diff --git a/src/finaletoolkit/frag/_coverage.py b/src/finaletoolkit/frag/_coverage.py
@@ -25,7 +25,8 @@ def single_coverage(
     max_length: int | None = None,
     intersect_policy: str = "midpoint",
     quality_threshold: int = 30,
-    verbose: bool | int = False
+    verbose: bool | int = False,
+    reference_file: str | Path | None = None,
 ) -> tuple[str | None, int | None, int | None, str, float]:
     """
     Return estimated fragment coverage over specified `contig` and
@@ -59,6 +60,10 @@ def single_coverage(
         Minimum MAPQ to filter for. Default is 30.
     verbose : bool, optional
         Prints messages to stderr. Default is false.
+    reference_file : str or Path, optional
+        Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required
+        when `input_file` is a CRAM file; ignored for BAM/frag files.
+
     Returns
     -------
     (contig, start, stop, name, coverage) : tuple[str, int, int, str, float]
@@ -92,7 +97,9 @@ def single_coverage(
         stop=stop,
         min_length=min_length,
         max_length=max_length,
-        intersect_policy=intersect_policy)
+        intersect_policy=intersect_policy,
+        reference_file=reference_file,
+    )
 
     # Iterating on each frag in file in
     # specified contig/chromosome
@@ -126,7 +133,8 @@ def coverage(
         intersect_policy: str="midpoint",
         quality_threshold: int=30,
         workers: int=1,
-        verbose: Union[bool, int]=False
+        verbose: Union[bool, int]=False,
+        reference_file: str | Path | None=None,
     ) -> list[tuple[str, int, int, str, float]]:
     """
     Return estimated fragment coverage over intervals specified in
@@ -167,6 +175,9 @@ def coverage(
         Number of subprocesses to spawn. Increases speed at the expense
         of memory.
     verbose : int or bool, optional
+    reference_file : str or Path, optional
+        Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required
+        when `input_file` is a CRAM file; ignored for BAM/frag files.
 
     Returns
     -------
@@ -211,7 +222,8 @@ def coverage(
                  "max_length": max_length,
                  "intersect_policy": intersect_policy,
                  "quality_threshold": quality_threshold,
-                 "verbose": verbose}
+                 "verbose": verbose,
+                 "reference_file": reference_file}
             )
 
         intervals = get_intervals(
@@ -223,7 +235,8 @@ def coverage(
         partial_single_coverage = partial(
             single_coverage, input_file=input_file, min_length=min_length,
             max_length=max_length, intersect_policy=intersect_policy,
-            quality_threshold=quality_threshold, verbose=max(0, verbose-1))
+            quality_threshold=quality_threshold, verbose=max(0, verbose-1),
+            reference_file=reference_file)
         coverages = pool.imap(
             partial(_single_coverage_star, partial_single_coverage), intervals,
             chunksize=max(len(intervals)//workers, 1))

diff --git a/src/finaletoolkit/frag/_delfi.py b/src/finaletoolkit/frag/_delfi.py
@@ -69,7 +69,10 @@ def delfi(input_file: str,
         Path string to a BED file containing 100kb bins for reference
         genome of choice.
     reference_file: str
-        Path string to reference genome file.
+        Path string to reference genome file (.2bit or FASTA). Used for
+        GC-content correction. When `input_file` is a CRAM file, this
+        must be a FASTA file (not .2bit) because htslib requires FASTA
+        for CRAM decoding.
     gap_file: str or GenomeGaps
         Specifies locations of telomeres and centromeres for reference 
         genome. There are three options:
@@ -391,7 +394,9 @@ def _delfi_single_window(
         window_start,
         window_stop,
         min_length=100,
-        max_length=220):
+        max_length=220,
+        reference_file=reference_file,
+    ):
 
         frag_length = frag_stop - frag_start