diff --git a/CHANGELOG.md b/CHANGELOG.md index e94f09b..623e22b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,6 +43,12 @@ and this project adheres to - `delfi` docstring clarifies that when `input_file` is a CRAM file, `reference_file` must be a FASTA (not .2bit), as htslib requires FASTA for CRAM decoding. +- `multi_wps` (and therefore `wps` CLI) no longer silently drops chromosomes + from BigWig output when the input BED file is sorted in a different + chromosome order than the BAM header (e.g. alphabetical vs. numeric). The + interval list is now sorted by BAM-header chromosome order before writing, + preventing the `RuntimeError` that previously caused silent data loss for + chr2–chr9, chrX, chrY and similar chromosomes. ## [0.11.1] - 2026-04-21 diff --git a/src/finaletoolkit/frag/_multi_wps.py b/src/finaletoolkit/frag/_multi_wps.py index 22947d1..a25e45a 100644 --- a/src/finaletoolkit/frag/_multi_wps.py +++ b/src/finaletoolkit/frag/_multi_wps.py @@ -220,6 +220,19 @@ def multi_wps( if site_bed != '-': bed.close() + # BigWig requires entries in the same chromosome order as the header. + # BED files sorted alphabetically (chr1, chr10...chr2) produce out-of-order + # writes that pyBigWig silently drops via the RuntimeError handler below. + if header and contigs: + _chrom_order = {chrom: idx for idx, (chrom, _) in enumerate(header)} + _sort_indices = sorted( + range(len(contigs)), + key=lambda i: (_chrom_order.get(contigs[i], len(header)), starts[i]) + ) + contigs = [contigs[i] for i in _sort_indices] + starts = [starts[i] for i in _sort_indices] + stops = [stops[i] for i in _sort_indices] + try: chrom_sizes_intervals = [chrom_sizes_dict[contig] for contig in contigs] except KeyError: diff --git a/tests/test_wps.py b/tests/test_wps.py index c2c209e..63f3076 100644 --- a/tests/test_wps.py +++ b/tests/test_wps.py @@ -9,6 +9,8 @@ import pytest import numpy as np +import pysam +import pyBigWig as pbw from finaletoolkit.frag import wps, multi_wps @@ -21,4 +23,113 @@ def test_lwps(self, request): assert np.all( results['start'] == np.arange(34444145, 34444155)), str(results) assert np.all( - results['wps'] == [-1,-1,-1,-1,-1,1,1,1,1,1]), str(results) \ No newline at end of file + results['wps'] == [-1,-1,-1,-1,-1,1,1,1,1,1]), str(results) + + +def _make_two_chrom_bam(path: str) -> str: + """ + Create a coordinate-sorted, indexed BAM with one proper fragment on each + of two chromosomes. Header order is ["2", "10"] (numeric), so an + alphabetically-sorted BED file lists "10" before "2" — the ordering that + previously caused silent chromosome dropout in multi_wps. + + Fragment layout (read_len=150, frag_len=160, window_size=120): + chr start stop WPS non-zero range + 2 1_000_000 1_000_160 [1_000_060, 1_000_100) + 10 1_000_000 1_000_160 [1_000_060, 1_000_100) + """ + chroms = [("2", 100_000_000), ("10", 100_000_000)] + hdr = pysam.AlignmentHeader.from_dict({ + "HD": {"VN": "1.6", "SO": "coordinate"}, + "SQ": [{"SN": c, "LN": l} for c, l in chroms], + }) + read_len = 150 + frag_len = 160 # within default min_length=120, max_length=180 + chrom_idx = {c: i for i, (c, _) in enumerate(chroms)} + + unsorted = path + ".unsorted.bam" + with pysam.AlignmentFile(unsorted, "wb", header=hdr) as bam: + for fid, (chrom, pos) in enumerate([("2", 1_000_000), ("10", 1_000_000)]): + rid = chrom_idx[chrom] + r2_pos = pos + frag_len - read_len # = pos + 10 + + r1 = pysam.AlignedSegment(hdr) + r1.query_name = f"f{fid}" + r1.reference_id = rid + r1.reference_start = pos + r1.cigarstring = f"{read_len}M" + r1.flag = 99 # paired, proper pair, mate reverse strand, read1 + r1.mapping_quality = 60 + r1.query_sequence = "A" * read_len + r1.query_qualities = pysam.qualitystring_to_array("I" * read_len) + r1.next_reference_id = rid + r1.next_reference_start = r2_pos + r1.template_length = frag_len + bam.write(r1) + + r2 = pysam.AlignedSegment(hdr) + r2.query_name = f"f{fid}" + r2.reference_id = rid + r2.reference_start = r2_pos + r2.cigarstring = f"{read_len}M" + r2.flag = 147 # paired, proper pair, read reverse strand, read2 + r2.mapping_quality = 60 + r2.query_sequence = "A" * read_len + r2.query_qualities = pysam.qualitystring_to_array("I" * read_len) + r2.next_reference_id = rid + r2.next_reference_start = pos + r2.template_length = -frag_len + bam.write(r2) + + pysam.sort("-o", path, unsorted) + pysam.index(path) + os.unlink(unsorted) + return path + + +class TestMultiWpsBedOrder: + """ + Regression test for silent chromosome dropout in multi_wps. + + When the input BED is sorted alphabetically ("10" before "2") but the BAM + header is in numeric order ("2" before "10"), pyBigWig raises RuntimeError + on out-of-order writes. The RuntimeError handler previously swallowed the + error with `continue`, silently dropping all data for the affected + chromosome. The fix sorts intervals by header chromosome order before + passing them to the pool. + """ + + @pytest.fixture(scope="class") + def two_chrom_bam(self, tmp_path_factory): + tmp = tmp_path_factory.mktemp("bam_order") + return _make_two_chrom_bam(str(tmp / "test.bam")) + + def test_all_chroms_present_when_bed_alphabetical(self, two_chrom_bam, tmp_path): + # BED in alphabetical order: "10" sorts before "2" as a string. + # Without the interval-sort fix, chromosome "2" entries would be + # silently dropped because "2" precedes "10" in the BAM header. + bed = tmp_path / "sites.bed" + bed.write_text("10\t999500\t1000500\n2\t999500\t1000500\n") + output = str(tmp_path / "out.bw") + + multi_wps( + two_chrom_bam, + str(bed), + output_file=output, + interval_size=1000, + min_length=120, + max_length=180, + quality_threshold=0, + ) + + with pbw.open(output, "r") as bw: + chr2_max = bw.stats("2", 999_000, 1_001_000, type="max") + chr10_max = bw.stats("10", 999_000, 1_001_000, type="max") + + assert chr2_max[0] is not None, ( + "Chromosome '2' has no BigWig entries — likely dropped due to " + "out-of-order BED/header mismatch" + ) + assert chr10_max[0] is not None, ( + "Chromosome '10' has no BigWig entries" + ) \ No newline at end of file