Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions tests/test_vcf_readcount_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from filecmp import cmp
import io
import logging
import vcfpy
from testfixtures import LogCapture, StringComparison as S

class VcfExpressionEncoderTests(unittest.TestCase):
Expand Down Expand Up @@ -231,3 +232,110 @@ def test_mnp(self):

self.assertTrue(cmp(os.path.join(self.test_data_dir, 'mnp.readcount.vcf.gz'), os.path.join(temp_path.name, 'input.readcount.vcf.gz')))
temp_path.cleanup()

def test_extra_field_avg_mapping_quality(self):
temp_path = tempfile.TemporaryDirectory()
output_vcf = os.path.join(temp_path.name, 'output.vcf')
command = [
os.path.join(self.test_data_dir, 'input.vcf'),
os.path.join(self.test_data_dir, 'snvs.bam_readcount'),
'DNA',
'-o', output_vcf,
'--avg-mapping-quality',
]
vcf_readcount_annotator.main(command)
vcf_reader = vcfpy.Reader.from_path(output_vcf)
self.assertIn('VAMQ', vcf_reader.header.format_ids())
for entry in vcf_reader:
sample = vcf_reader.header.samples.names[0]
self.assertEqual(entry.call_for_sample[sample].data['VAMQ'], 60.0)
temp_path.cleanup()



def test_extra_field_all_fields_dna_mode(self):
temp_path = tempfile.TemporaryDirectory()
output_vcf = os.path.join(temp_path.name, 'output.vcf')
command = [
os.path.join(self.test_data_dir, 'input.vcf'),
os.path.join(self.test_data_dir, 'snvs.bam_readcount'),
'DNA',
'-o', output_vcf,
'--all-fields',
]
vcf_readcount_annotator.main(command)
vcf_reader = vcfpy.Reader.from_path(output_vcf)
format_ids = vcf_reader.header.format_ids()
for tag in ['VAMQ', 'VABQ', 'VASEMQ', 'VAPF', 'VAMF', 'VAMQS', 'VAQ2', 'VAQD', 'VACL', 'VA3P']:
self.assertIn(tag, format_ids)
for entry in vcf_reader:
sample = vcf_reader.header.samples.names[0]
data = entry.call_for_sample[sample].data
self.assertEqual(data['VAMQ'], 60.0)
self.assertEqual(data['VABQ'], 35.0)
self.assertEqual(data['VASEMQ'], 0.0)
self.assertEqual(data['VAPF'], 0.68)
self.assertEqual(data['VAMF'], 0.01)
self.assertEqual(data['VAMQS'], 36.4)
self.assertEqual(data['VAQ2'], 3)
self.assertEqual(data['VAQD'], 0.52)
self.assertEqual(data['VACL'], 148.8)
self.assertEqual(data['VA3P'], 0.52)
temp_path.cleanup()

def test_extra_field_all_short_flags(self):
temp_path = tempfile.TemporaryDirectory()
output_vcf = os.path.join(temp_path.name, 'output.vcf')
logging.disable(logging.NOTSET)
with LogCapture() as l:
command = [
os.path.join(self.test_data_dir, 'input.vcf'),
os.path.join(self.test_data_dir, 'snvs.bam_readcount'),
'DNA',
'-o', output_vcf,
'-q', '-b', '-e', '-r', '-f', '-m', '-k', '-2', '-d', '-c', '-3',
]
vcf_readcount_annotator.main(command)
self.assertTrue(any('--strand-counts (-r)' in ''.join(rec) for rec in l.actual()))
vcf_reader = vcfpy.Reader.from_path(output_vcf)
format_ids = vcf_reader.header.format_ids()
for tag in ['VAMQ', 'VABQ', 'VASEMQ', 'VAPF', 'VAMF', 'VAMQS', 'VAQ2', 'VAQD', 'VACL', 'VA3P']:
self.assertIn(tag, format_ids)
for entry in vcf_reader:
sample = vcf_reader.header.samples.names[0]
data = entry.call_for_sample[sample].data
self.assertEqual(data['VAMQ'], 60.0)
self.assertEqual(data['VABQ'], 35.0)
self.assertEqual(data['VASEMQ'], 0.0)
self.assertEqual(data['VAPF'], 0.68)
self.assertEqual(data['VAMF'], 0.01)
self.assertEqual(data['VAMQS'], 36.4)
self.assertEqual(data['VAQ2'], 3)
self.assertEqual(data['VAQD'], 0.52)
self.assertEqual(data['VACL'], 148.8)
self.assertEqual(data['VA3P'], 0.52)
vcf_reader.close()
temp_path.cleanup()

def test_extra_field_strand_counts_rna_mode(self):
temp_path = tempfile.TemporaryDirectory()
output_vcf = os.path.join(temp_path.name, 'output.vcf')
command = [
os.path.join(self.test_data_dir, 'input.vcf'),
os.path.join(self.test_data_dir, 'snvs.bam_readcount'),
'RNA',
'-o', output_vcf,
'--strand-counts',
]
vcf_readcount_annotator.main(command)
vcf_reader = vcfpy.Reader.from_path(output_vcf)
format_ids = vcf_reader.header.format_ids()
self.assertIn('ADF', format_ids)
self.assertIn('ADR', format_ids)
for entry in vcf_reader:
sample = vcf_reader.header.samples.names[0]
data = entry.call_for_sample[sample].data
self.assertEqual(data['ADF'], [43, 3])
self.assertEqual(data['ADR'], [59, 2])
vcf_reader.close()
temp_path.cleanup()
166 changes: 153 additions & 13 deletions vatools/vcf_readcount_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
import vcfpy
import tempfile
import csv
from collections import OrderedDict
from collections import OrderedDict, namedtuple
import logging

# parse the input params
def define_parser():
parser = argparse.ArgumentParser(
'vcf-readcount-annotator',
Expand Down Expand Up @@ -43,19 +44,109 @@ def define_parser():
choices=['snv', 'indel', 'all'],
default='all'
)
extra = parser.add_argument_group('extra bam-readcount fields')
extra.add_argument(
'-a', '--all-fields', action='store_true', default=False,
help='Append all extra bam-readcount fields to the output.'
)
extra.add_argument(
'-q', '--avg-mapping-quality', action='store_true', default=False,
help='Append avg mapping quality of variant-supporting reads (FORMAT tag: VAMQ).'
)
extra.add_argument(
'-b', '--avg-basequality', action='store_true', default=False,
help='Append avg base quality of variant-supporting reads (FORMAT tag: VABQ).'
)
extra.add_argument(
'-e', '--avg-se-mapping-quality', action='store_true', default=False,
help='Append avg SE mapping quality of variant-supporting reads (FORMAT tag: VASEMQ).'
)
extra.add_argument(
'-r', '--strand-counts', action='store_true', default=False,
help='Append ref and var forward/reverse strand read counts (FORMAT tags: ADF, ADR). '
'In DNA mode ADF/ADR are already written; this flag is a no-op with a warning.'
)
extra.add_argument(
'-f', '--avg-pos-fraction', action='store_true', default=False,
help='Append avg position of variant reads as fraction of read length (FORMAT tag: VAPF).'
)
extra.add_argument(
'-m', '--avg-mismatches', action='store_true', default=False,
help='Append avg mismatches per variant-supporting read as fraction (FORMAT tag: VAMF).'
)
extra.add_argument(
'-k', '--sum-mismatch-qual', action='store_true', default=False,
help='Append avg sum of mismatch base qualities for variant reads (FORMAT tag: VAMQS).'
)
extra.add_argument(
'-2', '--num-q2-reads', action='store_true', default=False,
help='Append number of variant-supporting reads containing a Q2 base (FORMAT tag: VAQ2).'
)
extra.add_argument(
'-d', '--avg-q2-distance', action='store_true', default=False,
help='Append avg distance to Q2 start in Q2-containing reads (FORMAT tag: VAQD).'
)
extra.add_argument(
'-c', '--avg-clipped-length', action='store_true', default=False,
help='Append avg clipped read length for variant-supporting reads (FORMAT tag: VACL).'
)
extra.add_argument(
'-3', '--avg-3p-distance', action='store_true', default=False,
help="Append avg distance to effective 3' end for variant reads (FORMAT tag: VA3P)."
)
return parser

# create an object to hold all of the accessory information about the bam-readcount columns:
# brct_col: column name in bam-readcount per-base output (None for composite fields)
# arg_name: argument name
# tag: VCF FORMAT tag, or 'strand_counts' for the ADF/ADR pair
# vcf_type, number, desc: VCF header values (empty strings for the strand_counts)
ExtraField = namedtuple('ExtraField', ['brct_col', 'arg_name', 'tag', 'vcf_type', 'number', 'desc'])

EXTRA_FIELDS = [
ExtraField('avg_mapping_quality', 'avg_mapping_quality', 'VAMQ', 'Float', '1', 'Avg mapping quality for variant-supporting reads'),
ExtraField('avg_basequality', 'avg_basequality', 'VABQ', 'Float', '1', 'Avg base quality for variant-supporting reads'),
ExtraField('avg_se_mapping_quality', 'avg_se_mapping_quality', 'VASEMQ', 'Float', '1', 'Avg SE mapping quality for variant-supporting reads'),
ExtraField(None, 'strand_counts', 'strand_counts','', '', ''),
ExtraField('avg_pos_as_fraction', 'avg_pos_fraction', 'VAPF', 'Float', '1', 'Avg position of variant reads as fraction of read length'),
ExtraField('avg_num_mismatches_as_fraction', 'avg_mismatches', 'VAMF', 'Float', '1', 'Avg mismatches per variant-supporting read as fraction'),
ExtraField('avg_sum_mismatch_qualities', 'sum_mismatch_qual', 'VAMQS', 'Float', '1', 'Avg sum of mismatch base qualities for variant reads'),
ExtraField('num_q2_containing_reads', 'num_q2_reads', 'VAQ2', 'Integer', '1', 'Number of variant reads containing a Q2 base'),
ExtraField('avg_distance_to_q2_start_in_q2_reads','avg_q2_distance', 'VAQD', 'Float', '1', 'Avg distance to Q2 start in Q2-containing reads'),
ExtraField('avg_clipped_length', 'avg_clipped_length', 'VACL', 'Float', '1', 'Avg clipped read length for variant-supporting reads'),
ExtraField('avg_distance_to_effective_3p_end', 'avg_3p_distance', 'VA3P', 'Float', '1', "Avg distance to effective 3' end for variant reads"),
]

def get_requested_extra_fields(args):
if getattr(args, 'all_fields', False):
return list(EXTRA_FIELDS)
return [f for f in EXTRA_FIELDS if getattr(args, f.arg_name, False)]

# parse the fields out for the detailed metrics
def parse_brct_field(brcts):
# Column names match bam-readcount per-base output order
quality_cols = (
'avg_mapping_quality', 'avg_basequality', 'avg_se_mapping_quality',
'num_plus_strand', 'num_minus_strand',
'avg_pos_as_fraction', 'avg_num_mismatches_as_fraction',
'avg_sum_mismatch_qualities', 'num_q2_containing_reads',
'avg_distance_to_q2_start_in_q2_reads',
'avg_clipped_length', 'avg_distance_to_effective_3p_end',
)
counts = {}
forward_counts = {}
reverse_counts = {}
qualities = {}
for brct in brcts:
(base, count, avg_mapping_quality, avg_basequality, avg_se_mapping_quality, num_plus_strand, num_minus_strand, rest) = brct.split(':', 7)
counts[base.upper()] = count
forward_counts[base.upper()] = num_plus_strand
reverse_counts[base.upper()] = num_minus_strand
return counts, forward_counts, reverse_counts

parts = brct.split(':')
base = parts[0].upper()
counts[base] = parts[1]
qualities[base] = dict(zip(quality_cols, parts[2:]))
forward_counts[base] = qualities[base].get('num_plus_strand', '0')
reverse_counts[base] = qualities[base].get('num_minus_strand', '0')
return counts, forward_counts, reverse_counts, qualities

# read the bam readcount file
def parse_bam_readcount_file(args):
coverage = {}
with open(args.bam_readcount_file, 'r') as reader:
Expand All @@ -66,11 +157,12 @@ def parse_bam_readcount_file(args):
reference_base = row[2].upper()
depth = row[3]
brct = row[4:]
counts, forward_counts, reverse_counts = parse_brct_field(brct)
counts, forward_counts, reverse_counts, qualities = parse_brct_field(brct)
parsed_brct = {
'counts': counts,
'forward_counts': forward_counts,
'reverse_counts': reverse_counts,
'qualities': qualities,
'depth': depth
}
if (chromosome, position, reference_base) in coverage and parsed_brct != coverage[(chromosome,position,reference_base)]:
Expand Down Expand Up @@ -167,7 +259,9 @@ def create_vcf_reader(args):
sample_name = vcf_reader.header.samples.names[0]
return vcf_reader, sample_name

def create_vcf_writer(args, vcf_reader):
def create_vcf_writer(args, vcf_reader, extra_fields=None):
if extra_fields is None:
extra_fields = []
if args.output_vcf:
output_file = args.output_vcf
else:
Expand All @@ -192,9 +286,22 @@ def create_vcf_writer(args, vcf_reader):
new_header.add_format_line(OrderedDict([('ID', 'RADF'), ('Number', 'R'), ('Type', 'Integer'), ('Description', 'RNA Allelic depths on the forward strand (high-quality bases)')]))
new_header.add_format_line(OrderedDict([('ID', 'RADR'), ('Number', 'R'), ('Type', 'Integer'), ('Description', 'RNA Allelic depths on the reverse strand (high-quality bases)')]))
new_header.add_format_line(OrderedDict([('ID', 'RAF'), ('Number', 'A'), ('Type', 'Float'), ('Description', 'RNA Variant-allele frequency for the alt alleles')]))
for f in extra_fields:
if f.tag == 'strand_counts':
if args.data_type == 'DNA':
logging.warning(
'--strand-counts (-r): ADF/ADR are already written in DNA mode; skipping.'
)
else:
new_header.add_format_line(OrderedDict([('ID', 'ADF'), ('Number', 'R'), ('Type', 'Integer'), ('Description', 'Allelic depths on the forward strand')]))
new_header.add_format_line(OrderedDict([('ID', 'ADR'), ('Number', 'R'), ('Type', 'Integer'), ('Description', 'Allelic depths on the reverse strand')]))
else:
new_header.add_format_line(OrderedDict([
('ID', f.tag), ('Number', f.number), ('Type', f.vcf_type), ('Description', f.desc),
]))
return vcfpy.Writer.from_path(output_file, new_header)

def write_depth(entry, sample_name, field, value):
def add_format_value(entry, sample_name, field, value):
if field not in entry.FORMAT:
entry.FORMAT += [field]
entry.call_for_sample[sample_name].data[field] = value
Expand All @@ -204,8 +311,9 @@ def main(args_input = sys.argv[1:]):
args = parser.parse_args(args_input)

read_counts = parse_bam_readcount_file(args)
extra_fields = get_requested_extra_fields(args)
(vcf_reader, sample_name) = create_vcf_reader(args)
vcf_writer = create_vcf_writer(args, vcf_reader)
vcf_writer = create_vcf_writer(args, vcf_reader, extra_fields)

if args.data_type == 'DNA':
depth_field = 'DP'
Expand Down Expand Up @@ -259,7 +367,7 @@ def main(args_input = sys.argv[1:]):
(bam_readcount_position, ref_base, var_base) = parse_to_bam_readcount(start, reference, alts[0].serialize(), entry.POS)
brct = read_counts.get((chromosome,bam_readcount_position,ref_base), None)
if brct is None:
write_depth(entry, sample_name, depth_field, 0)
add_format_value(entry, sample_name, depth_field, 0)
if frequency_field not in entry.FORMAT:
entry.FORMAT += [frequency_field]
vafs = [0] * len(alts)
Expand All @@ -281,7 +389,7 @@ def main(args_input = sys.argv[1:]):

#DP - read depth
depth = brct['depth']
write_depth(entry, sample_name, depth_field, depth)
add_format_value(entry, sample_name, depth_field, depth)

#If `depth` is the only key in this hash, then this must have
#been a duplicate bam-readcount entry where only the depths matched.
Expand All @@ -290,6 +398,10 @@ def main(args_input = sys.argv[1:]):
vcf_writer.write_record(entry)
continue

primary_brct = brct
primary_ref_base = ref_base
primary_var_base = var_base

#AF - variant allele frequencies
if frequency_field not in entry.FORMAT:
entry.FORMAT += [frequency_field]
Expand Down Expand Up @@ -330,6 +442,34 @@ def main(args_input = sys.argv[1:]):
ads.append(0)
entry.call_for_sample[sample_name].data[field_name] = ads

# Skip the whole block if no extra fields were requested, or if the bam-readcount entry doesn't have per-base quality data
if extra_fields and 'qualities' in primary_brct:
for f in extra_fields:
# strand_counts maps to two VCF tags (ADF/ADR) instead of one, so it can't follow the same path
# as the other extra fields. It's also skipped in DNA mode because ADF/ADR are already written
# earlier in the function as standard fields. In RNA mode they aren't written by default, so
# this is where they get added.
if f.tag == 'strand_counts':
if args.data_type != 'DNA':
ref_q = primary_brct['qualities'].get(primary_ref_base, {})
var_q = primary_brct['qualities'].get(primary_var_base, {})
add_format_value(entry, sample_name, 'ADF', [
int(float(ref_q.get('num_plus_strand', '0'))),
int(float(var_q.get('num_plus_strand', '0'))),
])
add_format_value(entry, sample_name, 'ADR', [
int(float(ref_q.get('num_minus_strand', '0'))),
int(float(var_q.get('num_minus_strand', '0'))),
])
else:
var_q = primary_brct['qualities'].get(primary_var_base, {})
raw = var_q.get(f.brct_col, None)
if raw is not None:
val = int(float(raw)) if f.vcf_type == 'Integer' else float(raw)
else:
val = None
add_format_value(entry, sample_name, f.tag, val)

vcf_writer.write_record(entry)

vcf_writer.close()
Expand Down
Loading