diff --git a/CHANGELOG.md b/CHANGELOG.md index 0404a25c..e94f09b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,43 @@ The format is based on and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Added +- CRAM support threaded through all `frag` modules: `coverage`, `frag_length`, + `frag_length_bins`, `frag_length_intervals`, `wps`, `multi_wps`, + `cleavage_profile`, and `multi_cleavage_profile` now accept a + `reference_file` parameter (FASTA only) required for decoding CRAM input. +- `--reference-file` CLI flag added to `frag-length-bins`, + `frag-length-intervals`, and `wps` subcommands. +- `--reference-file` CLI flag added to `cleavage-profile` subcommand (no `-r` + short form, as `-r` was already taken by `--right`). +- `ReferenceWrapper` auto-creates a missing `.fai` index (via `pysam.faidx`) + when a FASTA file is opened without a pre-built index. +- `.fna` and `.fna.gz` extensions recognised as FASTA by `ReferenceWrapper`. +- Integration test suite `tests/test_cram.py` verifying CRAM produces + identical results to BAM for `single_coverage`, `frag_length_bins`, `wps`, + and `delfi`. +- Smoke tests in `tests/test_cram.py` verifying CRAM input runs without error + for `coverage`, `frag_length`, `frag_length_intervals`, `multi_wps`, + `cleavage_profile`, and `multi_cleavage_profile`. + +### Changed +- CLI help text for `delfi`, `end-motifs`, `interval-end-motifs`, + `breakpoint-motifs`, and `interval-breakpoint-motifs` updated: the + `reference_file` argument description now reads "A .2bit or FASTA + (.fa, .fasta, .fna) file" instead of "The .2bit file". +- `multi_wps` now correctly forwards `fraction_low` and `fraction_high` to + worker processes (these were silently dropped before). + +### Fixed +- CRAM files can now be used with all analysis subcommands; previously + `frag_generator` was never passed the reference file needed for CRAM + decoding. +- `delfi` docstring clarifies that when `input_file` is a CRAM file, + `reference_file` must be a FASTA (not .2bit), as htslib requires FASTA for + CRAM decoding. + ## [0.11.1] - 2026-04-21 ### Added diff --git a/docs/documentation/user_guide/inputdata.rst b/docs/documentation/user_guide/inputdata.rst index c7135245..fe8f690a 100644 --- a/docs/documentation/user_guide/inputdata.rst +++ b/docs/documentation/user_guide/inputdata.rst @@ -21,12 +21,22 @@ CRAM A compressed read alignment map file is another binary file standard. It is a binary file that is smaller than a BAM file, but still contains all of the -same information. +same information. **FinaleToolkit** requires that CRAM files be CRAI indexed. Therefore, you should have an associated ``.cram.crai`` file in the same directory of your input data. +.. note:: + + CRAM files require a reference genome to decode reads. When supplying a + CRAM file as input, you must also provide a FASTA reference file (e.g. + ``--reference-file /path/to/reference.fa``) to any **FinaleToolkit** + subcommand. The FASTA file must be the same reference used during alignment + and must be accompanied by a ``.fai`` index (created automatically by + **FinaleToolkit** if not present). A ``.2bit`` file cannot be used as the + reference when reading CRAM files. + Fragment File ^^^^^^^^^^^^^^^^ diff --git a/src/finaletoolkit/cli/main_cli.py b/src/finaletoolkit/cli/main_cli.py index d2446986..a4938c62 100644 --- a/src/finaletoolkit/cli/main_cli.py +++ b/src/finaletoolkit/cli/main_cli.py @@ -49,7 +49,7 @@ def main_cli_parser(): cli_coverage.add_argument( "-r", "--reference-file", - help="A FASTA or 2-bit file, which is used to support CRAM reading.", + help="Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required for CRAM input.", required=False ) cli_coverage.add_argument( @@ -127,6 +127,11 @@ def main_cli_parser(): cli_frag_length_bins.add_argument( 'input_file', help='Path to a BAM/CRAM/Fragment file containing fragment data.') + cli_frag_length_bins.add_argument( + '-r', + '--reference-file', + help='Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required for CRAM input.', + required=False) cli_frag_length_bins.add_argument( '-c', '--contig', @@ -225,6 +230,11 @@ def main_cli_parser(): 'input_file', help='Path to a BAM/CRAM/Fragment file containing fragment ' 'data.') + cli_frag_length_intervals.add_argument( + '-r', + '--reference-file', + help='Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required for CRAM input.', + required=False) cli_frag_length_intervals.add_argument( 'interval_file', help='Path to a BED file containing intervals to retrieve ' @@ -295,6 +305,10 @@ def main_cli_parser(): cli_cleavage_profile.add_argument( 'input_file', help='Path to a BAM/CRAM/Fragment file containing fragment data.') + cli_cleavage_profile.add_argument( + '--reference-file', + help='Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required for CRAM input.', + required=False) cli_cleavage_profile.add_argument( 'interval_file', help='Path to a BED file containing intervals to calculates cleavage ' @@ -382,6 +396,11 @@ def main_cli_parser(): 'input_file', help='Path to a BAM/CRAM/Fragment file containing fragment ' 'data.') + cli_wps.add_argument( + '-r', + '--reference-file', + help='Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required for CRAM input.', + required=False) cli_wps.add_argument( 'site_bed', help='Path to a BED file containing sites to calculate WPS ' @@ -562,8 +581,8 @@ def main_cli_parser(): "You want to replicate the original scripts.") cli_delfi.add_argument( 'reference_file', - help="The .2bit file for the associate reference genome sequence used " - "during alignment.") + help="A .2bit or FASTA (.fa, .fasta, .fna) file for the reference " + "genome sequence used during alignment.") cli_delfi.add_argument( 'bins_file', help="A BED file containing bins over which to calculate DELFI. To " @@ -677,8 +696,8 @@ def main_cli_parser(): help='Path to a BAM/CRAM/Fragment file containing fragment data.') cli_motifs.add_argument( 'refseq_file', - help='The .2bit file for the associate reference genome sequence used ' - 'during alignment.') + help='A .2bit or FASTA (.fa, .fasta, .fna) file for the reference ' + 'genome sequence used during alignment.') cli_motifs.add_argument( '-k', default=4, @@ -748,8 +767,8 @@ def main_cli_parser(): help='Path to a BAM/CRAM/Fragment file containing fragment data.') cli_interval_motifs.add_argument( 'refseq_file', - help='The .2bit file for the associate reference genome sequence used ' - 'during alignment.') + help='A .2bit or FASTA (.fa, .fasta, .fna) file for the reference ' + 'genome sequence used during alignment.') cli_interval_motifs.add_argument( 'intervals', help='Path to a BED file containing intervals to retrieve end motif ' @@ -836,8 +855,8 @@ def main_cli_parser(): help='Path to a BAM/CRAM/Fragment file containing fragment data.') cli_breakpoint.add_argument( 'refseq_file', - help='The .2bit file for the associate reference genome sequence used ' - 'during alignment.') + help='A .2bit or FASTA (.fa, .fasta, .fna) file for the reference ' + 'genome sequence used during alignment.') cli_breakpoint.add_argument( '-k', default=6, @@ -907,8 +926,8 @@ def main_cli_parser(): help='Path to a BAM/CRAM/Fragment file containing fragment data.') cli_interval_breakpoint.add_argument( 'refseq_file', - help='The .2bit file for the associate reference genome sequence used ' - 'during alignment.') + help='A .2bit or FASTA (.fa, .fasta, .fna) file for the reference ' + 'genome sequence used during alignment.') cli_interval_breakpoint.add_argument( 'intervals', help='Path to a BED file containing intervals to retrieve breakpoint motif ' diff --git a/src/finaletoolkit/frag/_cleavage_profile.py b/src/finaletoolkit/frag/_cleavage_profile.py index 41b0fd7a..cf44bb64 100644 --- a/src/finaletoolkit/frag/_cleavage_profile.py +++ b/src/finaletoolkit/frag/_cleavage_profile.py @@ -41,6 +41,7 @@ def cleavage_profile( verbose: Union[bool, int]=0, fraction_low: int|None=None, fraction_high: int|None=None, + reference_file: str | Path | None = None, ) -> np.ndarray: """ Cleavage profile calculated over a single interval. @@ -73,6 +74,9 @@ def cleavage_profile( Deprecated alias for min_length fraction_high : int, optional Deprecated alias for max_length + reference_file : str or Path, optional + Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required + when `input_file` is a CRAM file; ignored for BAM/frag files. Return ------ @@ -131,7 +135,8 @@ def cleavage_profile( stop=adj_stop, min_length=min_length, max_length=max_length, - intersect_policy="any" + intersect_policy="any", + reference_file=reference_file, ) positions = np.arange(adj_start, adj_stop) @@ -192,6 +197,7 @@ def multi_cleavage_profile( verbose: Union[bool, int]=0, fraction_low: int|None=None, fraction_high: int|None=None, + reference_file: str | Path | None = None, ): """ Multithreaded cleavage profile implementation over intervals in a @@ -225,7 +231,10 @@ def multi_cleavage_profile( Deprecated alias for min_length fraction_high : int, optional Deprecated alias for max_length - + reference_file : str or Path, optional + Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required + when `input_file` is a CRAM file; ignored for BAM/frag files. + Returns ------- output_file: str @@ -356,7 +365,10 @@ def multi_cleavage_profile( count*[min_length], count*[max_length], count*[quality_threshold], - count*[max(verbose-1, 0)] + count*[max(verbose-1, 0)], + count*[fraction_low], + count*[fraction_high], + count*[reference_file], ) if (verbose): diff --git a/src/finaletoolkit/frag/_coverage.py b/src/finaletoolkit/frag/_coverage.py index 36e67789..c2c6fc65 100644 --- a/src/finaletoolkit/frag/_coverage.py +++ b/src/finaletoolkit/frag/_coverage.py @@ -25,7 +25,8 @@ def single_coverage( max_length: int | None = None, intersect_policy: str = "midpoint", quality_threshold: int = 30, - verbose: bool | int = False + verbose: bool | int = False, + reference_file: str | Path | None = None, ) -> tuple[str | None, int | None, int | None, str, float]: """ Return estimated fragment coverage over specified `contig` and @@ -59,6 +60,10 @@ def single_coverage( Minimum MAPQ to filter for. Default is 30. verbose : bool, optional Prints messages to stderr. Default is false. + reference_file : str or Path, optional + Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required + when `input_file` is a CRAM file; ignored for BAM/frag files. + Returns ------- (contig, start, stop, name, coverage) : tuple[str, int, int, str, float] @@ -92,7 +97,9 @@ def single_coverage( stop=stop, min_length=min_length, max_length=max_length, - intersect_policy=intersect_policy) + intersect_policy=intersect_policy, + reference_file=reference_file, + ) # Iterating on each frag in file in # specified contig/chromosome @@ -126,7 +133,8 @@ def coverage( intersect_policy: str="midpoint", quality_threshold: int=30, workers: int=1, - verbose: Union[bool, int]=False + verbose: Union[bool, int]=False, + reference_file: str | Path | None=None, ) -> list[tuple[str, int, int, str, float]]: """ Return estimated fragment coverage over intervals specified in @@ -167,6 +175,9 @@ def coverage( Number of subprocesses to spawn. Increases speed at the expense of memory. verbose : int or bool, optional + reference_file : str or Path, optional + Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required + when `input_file` is a CRAM file; ignored for BAM/frag files. Returns ------- @@ -211,7 +222,8 @@ def coverage( "max_length": max_length, "intersect_policy": intersect_policy, "quality_threshold": quality_threshold, - "verbose": verbose} + "verbose": verbose, + "reference_file": reference_file} ) intervals = get_intervals( @@ -223,7 +235,8 @@ def coverage( partial_single_coverage = partial( single_coverage, input_file=input_file, min_length=min_length, max_length=max_length, intersect_policy=intersect_policy, - quality_threshold=quality_threshold, verbose=max(0, verbose-1)) + quality_threshold=quality_threshold, verbose=max(0, verbose-1), + reference_file=reference_file) coverages = pool.imap( partial(_single_coverage_star, partial_single_coverage), intervals, chunksize=max(len(intervals)//workers, 1)) diff --git a/src/finaletoolkit/frag/_delfi.py b/src/finaletoolkit/frag/_delfi.py index 30e33cbc..74c7b924 100644 --- a/src/finaletoolkit/frag/_delfi.py +++ b/src/finaletoolkit/frag/_delfi.py @@ -69,7 +69,10 @@ def delfi(input_file: str, Path string to a BED file containing 100kb bins for reference genome of choice. reference_file: str - Path string to reference genome file. + Path string to reference genome file (.2bit or FASTA). Used for + GC-content correction. When `input_file` is a CRAM file, this + must be a FASTA file (not .2bit) because htslib requires FASTA + for CRAM decoding. gap_file: str or GenomeGaps Specifies locations of telomeres and centromeres for reference genome. There are three options: @@ -391,7 +394,9 @@ def _delfi_single_window( window_start, window_stop, min_length=100, - max_length=220): + max_length=220, + reference_file=reference_file, + ): frag_length = frag_stop - frag_start diff --git a/src/finaletoolkit/frag/_frag_length.py b/src/finaletoolkit/frag/_frag_length.py index 471974c0..46429ad8 100644 --- a/src/finaletoolkit/frag/_frag_length.py +++ b/src/finaletoolkit/frag/_frag_length.py @@ -2,6 +2,7 @@ import time import warnings from typing import Union +from pathlib import Path from sys import stdout, stderr from multiprocessing import Pool import gzip @@ -110,14 +111,15 @@ def _frag_length_stats( short_reads: int, intersect_policy: str, quality_threshold: int, - verbose: Union[bool, int] + verbose: Union[bool, int], + reference_file: str | Path | None = None, ): """ Generates stats for a given interval. """ frag_gen = frag_generator(input_file, contig, quality_threshold, start, stop, min_length, max_length, intersect_policy, - verbose) + verbose, reference_file=reference_file) frag_len_dict = _distribution_from_gen(frag_gen) total_count = sum(frag_len_dict.values()) @@ -158,7 +160,8 @@ def frag_length( intersect_policy: str = "midpoint", output_file: str | None = None, quality_threshold: int = 30, - verbose: bool = False + verbose: bool = False, + reference_file: str | Path | None = None, ) -> np.ndarray: """ Return `np.ndarray` containing lengths of fragments in `input_file` @@ -185,6 +188,9 @@ def frag_length( quality_threshold: int, optional Minimum MAPQ to accept for a fragment to be counted. verbose : bool, optional + reference_file : str or Path, optional + Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required + when `input_file` is a CRAM file; ignored for BAM/frag files. Returns ------- @@ -208,6 +214,7 @@ def frag_length( max_length=1000000000, #TODO: allow to have None intersect_policy=intersect_policy, verbose=verbose, + reference_file=reference_file, ) for contig, frag_start, frag_stop, _, _ in frag_gen: @@ -264,6 +271,7 @@ def frag_length_bins( short_fraction: int | None = None, histogram_path: str | None = None, verbose: Union[bool, int] = False, + reference_file: str | Path | None = None, ) -> tuple[np.ndarray, np.ndarray]: """ Takes input_file, computes frag lengths of fragments and returns @@ -312,6 +320,9 @@ def frag_length_bins( workers : int, optional Number of worker processes. verbose : bool, optional + reference_file : str or Path, optional + Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required + when `input_file` is a CRAM file; ignored for BAM/frag files. Returns ------- @@ -342,8 +353,8 @@ def frag_length_bins( frag_gen = frag_generator( input_file, contig, quality_threshold, start, stop, min_length, - max_length, intersect_policy, verbose) - + max_length, intersect_policy, verbose, reference_file=reference_file) + frag_len_dict = _distribution_from_gen(frag_gen) total_count = sum(frag_len_dict.values()) @@ -445,7 +456,8 @@ def frag_length_intervals( short_reads: int = 150, workers: int = 1, verbose: Union[bool, int] = False, - )->list[tuple[str, int, int, str, float, float, int, int]]: + reference_file: str | Path | None = None, +) -> list[tuple[str, int, int, str, float, float, int, int]]: """ Takes fragments from BAM file and calculates fragment length statistics for each interval in a BED file. If output specified, @@ -479,6 +491,9 @@ def frag_length_intervals( workers : int, optional Number of worker processes. verbose : bool, optional + reference_file : str or Path, optional + Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required + when `input_file` is a CRAM file; ignored for BAM/frag files. Returns ------- @@ -508,9 +523,11 @@ def frag_length_intervals( intervals = get_intervals(interval_file) partial_frag_stat = partial( - _frag_length_stats, input_file=input_file,min_length=min_length, - max_length=max_length, short_reads=short_reads, intersect_policy=intersect_policy, - quality_threshold=quality_threshold, verbose=verbose) + _frag_length_stats, input_file=input_file, min_length=min_length, + max_length=max_length, short_reads=short_reads, + intersect_policy=intersect_policy, + quality_threshold=quality_threshold, verbose=verbose, + reference_file=reference_file) results = pool.map(partial(_frag_length_stats_star, partial_frag_stat), intervals, chunksize=max(len(intervals)//workers, diff --git a/src/finaletoolkit/frag/_multi_wps.py b/src/finaletoolkit/frag/_multi_wps.py index 6f3c9f14..22947d1b 100644 --- a/src/finaletoolkit/frag/_multi_wps.py +++ b/src/finaletoolkit/frag/_multi_wps.py @@ -1,6 +1,7 @@ from __future__ import annotations import gzip from os import PathLike +from pathlib import Path import time from multiprocessing.pool import Pool from typing import Union @@ -35,6 +36,7 @@ def multi_wps( verbose: Union[bool, int]=0, fraction_low: int | None = None, fraction_high: int | None = None, + reference_file: str | Path | None = None, ): """ Function that aggregates WPS over sites in BED file according to the @@ -77,7 +79,10 @@ def multi_wps( Deprecated alias for min_length fraction_high : int, optional Deprecated alias for max_length - + reference_file : str or Path, optional + Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required + when `input_file` is a CRAM file; ignored for BAM/frag files. + Returns ------- output_file: str @@ -240,7 +245,10 @@ def multi_wps( count*[min_length], count*[max_length], count*[quality_threshold], - count*[verbose-2 if verbose>2 else 0] + count*[verbose-2 if verbose>2 else 0], + count*[fraction_low], + count*[fraction_high], + count*[reference_file], ) if (verbose): diff --git a/src/finaletoolkit/frag/_wps.py b/src/finaletoolkit/frag/_wps.py index 145e16ed..c73931b4 100644 --- a/src/finaletoolkit/frag/_wps.py +++ b/src/finaletoolkit/frag/_wps.py @@ -2,6 +2,7 @@ import gzip import time from typing import Union +from pathlib import Path from sys import stdout, stderr import warnings @@ -49,6 +50,7 @@ def wps(input_file: Union[str, pysam.AlignmentFile], verbose: bool | int = 0, fraction_low: int | None = None, fraction_high: int | None = None, + reference_file: str | Path | None = None, ) -> np.ndarray: """ Return (raw) Windowed Protection Scores as specified in Snyder et al @@ -81,6 +83,9 @@ def wps(input_file: Union[str, pysam.AlignmentFile], Deprecated alias for `min_length` fraction_high : int, optional Deprecated alias for `max_length` + reference_file : str or Path, optional + Path to a FASTA (.fa, .fasta, .fna) reference genome file. Required + when `input_file` is a CRAM file; ignored for BAM/frag files. Returns ------- @@ -151,7 +156,8 @@ def wps(input_file: Union[str, pysam.AlignmentFile], stop=maximum, min_length=min_length, max_length=max_length, - verbose=(verbose>=2)) + verbose=(verbose>=2), + reference_file=reference_file) if (verbose): stderr.write("Done reading fragments, preparing for WPS calculation.\n") diff --git a/src/finaletoolkit/io/reference.py b/src/finaletoolkit/io/reference.py index 750a1130..001566ac 100644 --- a/src/finaletoolkit/io/reference.py +++ b/src/finaletoolkit/io/reference.py @@ -51,7 +51,7 @@ def __init__(self, reference_path: str | Path, use_lock: bool = True): if self.reference_path.endswith((".2bit", ".tb2")): self._is_2bit = True self._open_2bit() - elif self.reference_path.endswith((".fa", ".fasta", ".fa.gz", ".fasta.gz")): + elif self.reference_path.endswith((".fa", ".fasta", ".fna", ".fa.gz", ".fasta.gz", ".fna.gz")): self._is_fasta = True self._open_fasta() else: @@ -69,10 +69,15 @@ def _open_2bit(self): raise def _open_fasta(self): + fai_path = self.reference_path + ".fai" + if not os.path.exists(fai_path): + logger.info( + f"FASTA index not found for {self.reference_path}. " + "Creating index with pysam.faidx()..." + ) + pysam.faidx(self.reference_path) try: - # pysam.FastaFile is generally efficient for random access if indexed (.fai) self._handle = pysam.FastaFile(self.reference_path) - # pysam.FastaFile.references returns list of contigs, lengths returns list of lengths self._chroms = dict(zip(self._handle.references, self._handle.lengths)) except Exception as e: logger.error(f"Failed to open FASTA file {self.reference_path}: {e}") diff --git a/src/finaletoolkit/utils/utils.py b/src/finaletoolkit/utils/utils.py index d69911a3..6d919d39 100644 --- a/src/finaletoolkit/utils/utils.py +++ b/src/finaletoolkit/utils/utils.py @@ -174,7 +174,8 @@ def frag_array( min_length: int | None = None, max_length: int | None = None, intersect_policy: str="midpoint", - verbose: bool=False + verbose: bool=False, + reference_file: str | Path | None = None, ) -> NDArray: """ Reads from BAM, CRAM, or fragment file and returns a three column matrix @@ -226,7 +227,8 @@ def frag_array( min_length, max_length, intersect_policy, - verbose + verbose, + reference_file=reference_file, ) ] if verbose: diff --git a/tests/data/delfi/hg19.chr1.10Mb.fa.fai b/tests/data/delfi/hg19.chr1.10Mb.fa.fai new file mode 100644 index 00000000..1428d30b --- /dev/null +++ b/tests/data/delfi/hg19.chr1.10Mb.fa.fai @@ -0,0 +1 @@ +chr1 9999999 6 60 61 diff --git a/tests/data/delfi/hg19.chr1.6Mb.cram b/tests/data/delfi/hg19.chr1.6Mb.cram new file mode 100644 index 00000000..447af418 Binary files /dev/null and b/tests/data/delfi/hg19.chr1.6Mb.cram differ diff --git a/tests/data/delfi/hg19.chr1.6Mb.cram.crai b/tests/data/delfi/hg19.chr1.6Mb.cram.crai new file mode 100644 index 00000000..f95b315f Binary files /dev/null and b/tests/data/delfi/hg19.chr1.6Mb.cram.crai differ diff --git a/tests/test_cram.py b/tests/test_cram.py new file mode 100644 index 00000000..536354b4 --- /dev/null +++ b/tests/test_cram.py @@ -0,0 +1,194 @@ +""" +Integration tests verifying that CRAM files produce identical results to +BAM files when reference_file is supplied. Tests use the existing DELFI +test data (hg19.chr1.6Mb.bam + hg19.chr1.10Mb.fa). + +All tests are skipped automatically when samtools is not on PATH. +""" + +from __future__ import annotations +import shutil +import subprocess +from pathlib import Path + +import numpy as np +import pytest + +from finaletoolkit.frag import ( + single_coverage, coverage, + frag_length, frag_length_bins, frag_length_intervals, + wps, multi_wps, + cleavage_profile, delfi, +) +from finaletoolkit.frag._cleavage_profile import multi_cleavage_profile +from finaletoolkit.genome.gaps import GenomeGaps + + +pytestmark = pytest.mark.skipif( + shutil.which("samtools") is None, + reason="samtools not on PATH" +) + +DATA = Path(__file__).parent / "data" / "delfi" +BAM = DATA / "hg19.chr1.6Mb.bam" +FASTA = DATA / "hg19.chr1.10Mb.fa" +CRAM = DATA / "hg19.chr1.6Mb.cram" + + +@pytest.fixture(scope="session", autouse=True) +def cram_file(): + """Create CRAM + index from the DELFI BAM and FASTA once per session.""" + if not CRAM.exists(): + subprocess.run( + ["samtools", "view", "-C", "-T", str(FASTA), str(BAM), "-o", str(CRAM)], + check=True, + ) + subprocess.run(["samtools", "index", str(CRAM)], check=True) + yield CRAM + # leave the CRAM in place for reuse across runs + + +class TestSingleCoverageCRAM: + def test_cram_matches_bam(self, cram_file): + bam_result = single_coverage(BAM, "chr1", 0, 5000000, quality_threshold=0) + cram_result = single_coverage( + cram_file, "chr1", 0, 5000000, + quality_threshold=0, + reference_file=FASTA, + ) + assert bam_result[4] == cram_result[4], ( + f"Coverage differs: BAM={bam_result[4]}, CRAM={cram_result[4]}" + ) + + +class TestFragLengthBinsCRAM: + def test_cram_matches_bam(self, cram_file): + bam_bins, bam_counts = frag_length_bins( + BAM, contig="chr1", start=0, stop=5000000, quality_threshold=0 + ) + cram_bins, cram_counts = frag_length_bins( + cram_file, contig="chr1", start=0, stop=5000000, + quality_threshold=0, + reference_file=FASTA, + ) + np.testing.assert_array_equal(bam_bins, cram_bins) + np.testing.assert_array_equal(bam_counts, cram_counts) + + +class TestWpsCRAM: + def test_cram_matches_bam(self, cram_file): + chrom_size = 249250621 # hg19 chr1 + bam_scores = wps( + BAM, "chr1", 1000000, 1001000, chrom_size, + quality_threshold=0 + ) + cram_scores = wps( + cram_file, "chr1", 1000000, 1001000, chrom_size, + quality_threshold=0, + reference_file=FASTA, + ) + np.testing.assert_array_equal(bam_scores, cram_scores) + + +class TestDelfiCRAM: + def test_cram_matches_bam(self, cram_file): + import pandas as pd + + autosomes = DATA / "human.hg19.chr1.6Mb.genome" + bins_file = DATA / "hg19.hic.chr1.6Mb.txt" + blacklist = DATA / "hg19_darkregion.bed" + gaps = GenomeGaps.ucsc_hg19() + fasta = str(FASTA) + + # Both use FASTA as reference_file (4th positional). BAM doesn't + # need it for reading, but CRAM does — ensuring identical GC inputs. + bam_result = delfi(BAM, autosomes, bins_file, fasta, blacklist, gaps) + cram_result = delfi(cram_file, autosomes, bins_file, fasta, blacklist, gaps) + + pd.testing.assert_frame_equal(bam_result, cram_result) + + +# --------------------------------------------------------------------------- +# Smoke tests — verify each function runs without error when given CRAM input +# --------------------------------------------------------------------------- + +CHROM_SIZES = DATA / "human.hg19.chr1.6Mb.genome" +REGION = ("chr1", 1_000_000, 2_000_000) + + +class TestCoverageCRAMRuns: + def test_cram_runs(self, cram_file, tmp_path): + intervals = tmp_path / "intervals.bed" + intervals.write_text("chr1\t1000000\t2000000\t.\n") + output = tmp_path / "coverage.bed" + results = coverage( + cram_file, str(intervals), str(output), + quality_threshold=0, + reference_file=FASTA, + ) + assert len(results) > 0 + + +class TestFragLengthCRAMRuns: + def test_cram_runs(self, cram_file): + contig, start, stop = REGION + lengths = frag_length( + cram_file, contig=contig, start=start, stop=stop, + quality_threshold=0, + reference_file=FASTA, + ) + assert lengths is not None + + +class TestFragLengthIntervalsCRAMRuns: + def test_cram_runs(self, cram_file, tmp_path): + intervals = tmp_path / "intervals.bed" + intervals.write_text("chr1\t1000000\t2000000\t.\n") + results = frag_length_intervals( + cram_file, str(intervals), + quality_threshold=0, + reference_file=FASTA, + ) + assert results is not None + + +class TestMultiWpsCRAMRuns: + def test_cram_runs(self, cram_file, tmp_path): + site_bed = tmp_path / "sites.bed" + site_bed.write_text("chr1\t1000000\t1005000\n") + output = tmp_path / "wps.bed.gz" + multi_wps( + cram_file, str(site_bed), + chrom_sizes=CHROM_SIZES, + output_file=str(output), + quality_threshold=0, + reference_file=FASTA, + ) + assert output.exists() + + +class TestCleavageProfileCRAMRuns: + def test_cram_runs(self, cram_file): + contig, start, stop = REGION + chrom_size = 6_000_000 + results = cleavage_profile( + cram_file, chrom_size, contig, start, stop, + quality_threshold=0, + reference_file=FASTA, + ) + assert results is not None + + +class TestMultiCleavageProfileCRAMRuns: + def test_cram_runs(self, cram_file, tmp_path): + intervals = tmp_path / "intervals.bed" + intervals.write_text("chr1\t1000000\t1005000\n") + output = tmp_path / "cleavage.bed.gz" + multi_cleavage_profile( + cram_file, str(intervals), + chrom_sizes=str(CHROM_SIZES), + output_file=str(output), + quality_threshold=0, + reference_file=FASTA, + ) + assert output.exists()