From 40562829d7a14f8217dd3a45b8f4405adf203490 Mon Sep 17 00:00:00 2001 From: Jon Kerry Date: Thu, 10 May 2018 11:51:23 +0100 Subject: [PATCH 01/15] Added absolute paths for scripts so that pipeline can be run in a docker container --- src/master.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/master.py b/src/master.py index 11d2b88..85b0d29 100755 --- a/src/master.py +++ b/src/master.py @@ -32,6 +32,7 @@ g.add("-GG", "--gnomad-genome-sites-vcf", help="gnomAD genome sites vcf file. If specified, a clinvar table with extra gnomAD genome info fields will also be created.") g.add("--output-prefix", default="../output/", help="Final output files will have this prefix") g.add("--tmp-dir", default="./output_tmp", help="Temporary output files will have this prefix") +g.add("--script-dir", help="Directory where the ClinVar source codes are stored") g = p.add_mutually_exclusive_group() g.add("--single-only", dest="single_or_multi", action="store_const", const="single", help="Only generate the single-variant tables") g.add("--multi-only", dest="single_or_multi", action="store_const", const="multi", help="Only generate the multi-variant tables") @@ -51,6 +52,7 @@ clinvar_variant_summary_table = args.clinvar_variant_summary_table output_prefix = args.output_prefix +script_dir = args.script_dir tmp_dir = args.tmp_dir os.system("mkdir -p " + tmp_dir) @@ -127,7 +129,8 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path): # normalize (convert to minimal representation and left-align) # the normalization code is in a different repo (useful for more than just clinvar) so here I just wget it: -job.add("wget -N https://raw.githubusercontent.com/ericminikel/minimal_representation/master/normalize.py") +job.add("wget -N https://raw.githubusercontent.com/ericminikel/minimal_representation/master/normalize.py " + "-O OUT:%(output_prefix)s/normalize.py" % locals()) for genome_build in ('b37', 'b38'): # extract the GRCh37 coordinates, mutant allele, MeasureSet ID and PubMed IDs from it. This currently takes about 20 minutes. @@ -137,7 +140,7 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path): print("Skippping steps to generate %s tables since reference genome not given." % genome_build) continue - job.add(("python -u IN:parse_clinvar_xml.py " + job.add(("python -u IN:%(script_dir)s/parse_clinvar_xml.py " "-x IN:%(clinvar_xml)s " "-g %(genome_build_id)s " "-o OUT:%(tmp_dir)s/clinvar_table_raw.single.%(genome_build)s.tsv " @@ -150,11 +153,11 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path): continue fsuffix = "%(single_or_multi)s.%(genome_build)s" % locals() # file suffix - output_dir = '%(output_prefix)s%(genome_build)s/%(single_or_multi)s' % locals() + output_dir = '%(output_prefix)s/%(genome_build)s/%(single_or_multi)s' % locals() os.system('mkdir -p ' + output_dir) # normalize variants (use grep -v '^$' to remove empty rows) - job.add("python -u normalize.py -R IN:%(reference_genome)s < IN:%(tmp_dir)s/clinvar_table_raw.%(fsuffix)s.tsv | grep -v ^$ | bgzip -c > OUT:%(tmp_dir)s/clinvar_table_normalized.%(fsuffix)s.tsv.gz" % locals()) + job.add("python -u %(output_prefix)s/normalize.py -R IN:%(reference_genome)s < IN:%(tmp_dir)s/clinvar_table_raw.%(fsuffix)s.tsv | grep -v ^$ | bgzip -c > OUT:%(tmp_dir)s/clinvar_table_normalized.%(fsuffix)s.tsv.gz" % locals()) # sort job.add(("cat " + @@ -171,10 +174,10 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path): ]) # group by allele, since clinvar_allele_trait_pairs.*.tsv will have more than 1 record for some alleles - job.add("python -u IN:group_by_allele.py -i IN:%(tmp_dir)s/clinvar_allele_trait_pairs.%(fsuffix)s.tsv.gz | bgzip -c > OUT:%(tmp_dir)s/clinvar_alleles_grouped.%(fsuffix)s.tsv.gz" % locals()) + job.add("python -u IN:%(script_dir)s/group_by_allele.py -i IN:%(tmp_dir)s/clinvar_allele_trait_pairs.%(fsuffix)s.tsv.gz | bgzip -c > OUT:%(tmp_dir)s/clinvar_alleles_grouped.%(fsuffix)s.tsv.gz" % locals()) # join information from the tab-delimited summary to the normalized genomic coordinates - job.add("python IN:join_variant_summary_with_clinvar_alleles.py " + job.add("python IN:%(script_dir)s/join_variant_summary_with_clinvar_alleles.py " "IN:%(variant_summary_table)s " "IN:%(tmp_dir)s/clinvar_alleles_grouped.%(fsuffix)s.tsv.gz " "OUT:%(tmp_dir)s/clinvar_alleles_combined.%(fsuffix)s.tsv.gz " @@ -196,7 +199,7 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path): ]) # create vcf - job.add(("python -u IN:clinvar_table_to_vcf.py IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz IN:%(reference_genome)s | bgzip -c > OUT:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz") % locals()) # create compressed version + job.add(("python -u IN:%(script_dir)s/clinvar_table_to_vcf.py IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz IN:%(reference_genome)s | bgzip -c > OUT:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz") % locals()) # create compressed version job.add("tabix IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz" % locals(), output_filenames=["%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz.tbi" % locals()]) job.add("cp IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz.tbi %(output_dir)s/" % locals(), output_filenames=[ "%(output_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz" % locals(), @@ -218,7 +221,7 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path): "vt normalize -r IN:%(reference_genome)s - | " "bgzip -c > OUT:%(tmp_dir)s/%(normalized_vcf)s") % locals()) job.add("tabix IN:%(tmp_dir)s/%(normalized_vcf)s" % locals(), output_filenames=["%(tmp_dir)s/%(normalized_vcf)s.tbi" % locals()]) - job.add(("python -u IN:%(script_name)s -i IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz %(vcf_arg)s IN:%(tmp_dir)s/%(normalized_vcf)s | " + job.add(("python -u IN:%(script_dir)s/%(script_name)s -i IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz %(vcf_arg)s IN:%(tmp_dir)s/%(normalized_vcf)s | " "bgzip -c > OUT:%(tmp_dir)s/clinvar_alleles_with_%(label)s.%(fsuffix)s.tsv.gz") % locals()) job.add("tabix -S 1 -s 1 -b 2 -e 2 IN:%(tmp_dir)s/clinvar_alleles_with_%(label)s.%(fsuffix)s.tsv.gz" % locals(), output_filenames=["%(tmp_dir)s/clinvar_alleles_with_%(label)s.%(fsuffix)s.tsv.gz.tbi" % locals()]) job.add("cp IN:%(tmp_dir)s/clinvar_alleles_with_%(label)s.%(fsuffix)s.tsv.gz IN:%(tmp_dir)s/clinvar_alleles_with_%(label)s.%(fsuffix)s.tsv.gz.tbi %(output_dir)s/" % locals(), output_filenames=[ @@ -228,18 +231,18 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path): job.add("gunzip -c IN:%(tmp_dir)s/clinvar_alleles_with_%(label)s.%(fsuffix)s.tsv.gz | head -n 750 > OUT:%(output_dir)s/clinvar_alleles_with_%(label)s_example_750_rows.%(fsuffix)s.tsv" % locals()) job.add( - "python clinvar_alleles_stats.py " + "python %(script_dir)s/clinvar_alleles_stats.py " "IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz " "> OUT:%(tmp_dir)s/clinvar_alleles_stats.%(fsuffix)s.txt" % locals(), input_filenames=[ "%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz" % locals(), - "clinvar_alleles_stats.py"]) + "%(script_dir)s/clinvar_alleles_stats.py" % locals()]) job.add("cp IN:%(tmp_dir)s/clinvar_alleles_stats.%(fsuffix)s.txt OUT:%(output_dir)s/clinvar_alleles_stats.%(fsuffix)s.txt" % locals()) # run basic checks - job.add("python IN:check_allele_table.py IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz" % locals()) + job.add("python IN:%(script_dir)s/check_allele_table.py IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz" % locals()) # run the above commands jr.run(job) From 35f692a951821d90a6823d597c60f301376a04e9 Mon Sep 17 00:00:00 2001 From: Jon Kerry Date: Thu, 24 May 2018 17:11:37 +0100 Subject: [PATCH 02/15] Changes to column names and introduction of dbsnp by parsing of rsID from XML file. SAP-4037 --- src/check_allele_table.py | 10 ++-- src/clinvar_alleles_stats.py | 6 +-- src/clinvar_table_to_vcf.py | 8 +-- ...in_variant_summary_with_clinvar_alleles.py | 10 ++-- src/parse_clinvar_xml.py | 52 +++++++++++-------- 5 files changed, 46 insertions(+), 40 deletions(-) diff --git a/src/check_allele_table.py b/src/check_allele_table.py index 58efdd8..3cccbbd 100755 --- a/src/check_allele_table.py +++ b/src/check_allele_table.py @@ -34,13 +34,13 @@ assert int(record['pos']) > 0 and int(record['pos']) < 3*10**8, 'Unexpected "pos" column value: ' + record['pos'] assert all(b in 'ACGTN' for b in record['ref']), 'Unexpected "ref" column value: ' + record['ref'] assert all(b in 'ACGTN' for b in record['alt']), 'Unexpected "alt" column value: ' + record['alt'] # there's one clinvar record with ALT = "NTGT". Not sure how to handle it. - assert record['variation_type'] in ["Variant", "Haplotype", "CompoundHeterozygote", "Phase unknown", "Distinct chromosomes", "CompoundHeterozygote;Haplotype", "Variant;gene-variant"], \ - 'Unexpected "variation_type" column value: ' + record['variation_type'] # there's one clinvar record with ALT = "NTGT". Not sure how to handle it. + assert record['measureset_type'] in ["Variant", "Haplotype", "CompoundHeterozygote", "Phase unknown", "Distinct chromosomes", "CompoundHeterozygote;Haplotype", "Variant;gene-variant"], \ + 'Unexpected "measureset_type" column value: ' + record['measureset_type'] # there's one clinvar record with ALT = "NTGT". Not sure how to handle it. - assert len(map(int, record['variation_id'].split(';'))) > 0, 'Unexpected "variation_id" column value: ' + record['variation_id'] - assert len(map(lambda rcv: int(rcv.strip('RCV')), record['rcv'].split(';'))) > 0, 'Unexpected "rcv" column value: ' + record['rcv'] + assert len(map(int, record['measureset_id'].split(';'))) > 0, 'Unexpected "measureset_id" column value: ' + record['measureset_id'] + assert len(map(lambda rcv: int(rcv.strip('RCV')), record['clnacc'].split(';'))) > 0, 'Unexpected "clnacc" column value: ' + record['clnacc'] assert int(record['allele_id']) > 0, 'Unexpected "rcv" column value: ' + record['allele_id'] - assert len(record['hgvs_c']) == 0 or "c." in record['hgvs_c'], 'Unexpected "hgvs_c" column value: ' + record['hgvs_c'] + assert len(record['clnhgvs']) == 0 or "c." in record['clnhgvs'], 'Unexpected "hgvs_c" column value: ' + record['clnhgvs'] assert len(record['hgvs_p']) == 0 or "p." in record['hgvs_p'], 'Unexpected "hgvs_p" column value: ' + record['hgvs_p'] #assert record['molecular_consequence'], 'Unexpected "molecular_consequence" column value: ' + record['molecular_consequence'] diff --git a/src/clinvar_alleles_stats.py b/src/clinvar_alleles_stats.py index e9e4eda..76d0fba 100644 --- a/src/clinvar_alleles_stats.py +++ b/src/clinvar_alleles_stats.py @@ -10,10 +10,10 @@ alleles_name = sys.argv[1] columns_to_summarize = [ - 'variation_type', 'clinical_significance', - 'review_status', 'gold_stars', 'all_submitters', + 'measureset_type', 'original_clnsig', + 'clnrevstat', 'gold_stars', 'all_submitters', 'inheritance_modes', 'age_of_onset', 'prevalence', 'disease_mechanism', - 'origin'] + 'clnorigin'] df = pd.read_csv(alleles_name, sep="\t", compression='gzip', index_col=False) diff --git a/src/clinvar_table_to_vcf.py b/src/clinvar_table_to_vcf.py index 9a742fb..c9451b9 100644 --- a/src/clinvar_table_to_vcf.py +++ b/src/clinvar_table_to_vcf.py @@ -6,7 +6,7 @@ import pandas as pd import sys -from parse_clinvar_xml import HEADER +from join_variant_summary_with_clinvar_alleles import FINAL_HEADER def gzopen(path, mode='r', verbose=True): @@ -37,7 +37,7 @@ def table_to_vcf(input_table_path, input_reference_genome): descriptions = { 'gold_stars': "Number of gold stars as shown on clinvar web pages to summarize review status. Lookup table described at http://www.ncbi.nlm.nih.gov/clinvar/docs/details/ was used to map the REVIEW_STATUS value to this number.", } - for key in HEADER: + for key in FINAL_HEADER: print("""##INFO=""" .format(key.upper(), descriptions.get(key, key.upper()))) with open(input_reference_genome_fai) as in_fai: @@ -52,7 +52,7 @@ def table_to_vcf(input_table_path, input_reference_genome): vcf_row = [] vcf_row.append(table_row["chrom"]) vcf_row.append(table_row["pos"]) - vcf_row.append('.') # ID + vcf_row.append(table_row["rs"]) # ID = rsID vcf_row.append(table_row["ref"]) vcf_row.append(table_row["alt"]) vcf_row.append('.') # QUAL @@ -65,7 +65,7 @@ def table_to_vcf(input_table_path, input_reference_genome): # permitted only as delimiters for lists of values) INFO fields are encoded as a semicolon-separated series of short # keys with optional values in the format: =[,data]. loc_column = ['chrom', 'pos', 'ref', 'alt'] - for key in HEADER: + for key in FINAL_HEADER: if key not in loc_column: if pd.isnull(table_row[key]): continue diff --git a/src/join_variant_summary_with_clinvar_alleles.py b/src/join_variant_summary_with_clinvar_alleles.py index 1a8334e..e478a4e 100644 --- a/src/join_variant_summary_with_clinvar_alleles.py +++ b/src/join_variant_summary_with_clinvar_alleles.py @@ -33,8 +33,8 @@ def join_variant_summary_with_clinvar_alleles( variant_summary = variant_summary[ ['allele_id', 'clinicalsignificance', 'reviewstatus','lastevaluated']] variant_summary = variant_summary.rename( - columns={'clinicalsignificance': 'clinical_significance', - 'reviewstatus': 'review_status', + columns={'clinicalsignificance': 'original_clnsig', + 'reviewstatus': 'clnrevstat', 'lastevaluated':'last_evaluated'}) # remove the duplicated records in variant summary due to alternative loci such as PAR but would be problematic for rare cases like translocation @@ -43,7 +43,7 @@ def join_variant_summary_with_clinvar_alleles( # remove clinical_significance and review_status from clinvar_alleles: clinvar_alleles = clinvar_alleles.drop( - ['clinical_significance', 'review_status','last_evaluated'], axis=1) + ['original_clnsig', 'clnrevstat', 'last_evaluated'], axis=1) # pandas is sensitive to some rows having allele_id joined on ;, causing # an object dtype, with some entries being ints and others strs @@ -69,11 +69,11 @@ def join_variant_summary_with_clinvar_alleles( 'practice guideline': 4, '-':'-' } - df['gold_stars'] = df.review_status.map(gold_star_map) + df['gold_stars'] = df.clnrevstat.map(gold_star_map) # The use of expressions on clinical significance on ClinVar aggregate records (RCV) https://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/#conflicts # conflicted = 1 if using "conflicting" - df['conflicted'] = df['clinical_significance'].str.contains( + df['conflicted'] = df['original_clnsig'].str.contains( "onflicting", case=False).astype(int) # reorder columns just in case diff --git a/src/parse_clinvar_xml.py b/src/parse_clinvar_xml.py index 697a64d..51d90ec 100755 --- a/src/parse_clinvar_xml.py +++ b/src/parse_clinvar_xml.py @@ -14,15 +14,15 @@ mentions_pubmed_regex = '(?:PubMed|PMID)(.*)' # group(1) will be all the text after the word PubMed or PMID extract_pubmed_id_regex = '[^0-9]+([0-9]+)[^0-9](.*)' # group(1) will be the first PubMed ID, group(2) will be all remaining text -HEADER = ['chrom', 'pos', 'ref', 'alt', 'start', 'stop', 'strand', 'variation_type', 'variation_id', 'rcv', 'scv', +HEADER = ['chrom', 'pos', 'ref', 'alt', 'start', 'stop', 'strand', 'measureset_type', 'measureset_id', 'clnacc', 'scv', 'allele_id', 'symbol', - 'hgvs_c', 'hgvs_p', 'molecular_consequence', - 'clinical_significance', 'clinical_significance_ordered', 'pathogenic', 'likely_pathogenic', + 'clnhgvs', 'hgvs_p', 'molecular_consequence', + 'original_clnsig', 'clinical_significance_ordered', 'pathogenic', 'likely_pathogenic', 'uncertain_significance', - 'likely_benign', 'benign', 'review_status', 'review_status_ordered', - 'last_evaluated', 'all_submitters', 'submitters_ordered', 'all_traits', + 'likely_benign', 'benign', 'clnrevstat', 'review_status_ordered', + 'last_evaluated', 'all_submitters', 'submitters_ordered', 'clndbn', 'all_pmids', 'inheritance_modes', 'age_of_onset', 'prevalence', - 'disease_mechanism', 'origin', 'xrefs', 'dates_ordered'] + 'disease_mechanism', 'clnorigin', 'xrefs', 'dates_ordered', 'rs'] def replace_semicolons(s, replace_with=":"): @@ -59,9 +59,9 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome # initialize all the fields current_row = {} - current_row['rcv'] = '' - current_row['variation_type'] = '' - current_row['variation_id'] = '' + current_row['clnacc'] = '' + current_row['measureset_type'] = '' + current_row['measureset_id'] = '' current_row['allele_id'] = '' rcv = elem.find('./ReferenceClinVarAssertion/ClinVarAccession') @@ -69,7 +69,7 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome print("Error, not RCV record") break else: - current_row['rcv'] = rcv.attrib.get('Acc') + current_row['clnacc'] = rcv.attrib.get('Acc') ReferenceClinVarAssertion = elem.findall(".//ReferenceClinVarAssertion") measureset = ReferenceClinVarAssertion[0].findall(".//MeasureSet") @@ -88,8 +88,8 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome measure = measureset.findall('.//Measure') - current_row['variation_id'] = measureset.attrib.get('ID') - current_row['variation_type'] = measureset.get('Type') + current_row['measureset_id'] = measureset.attrib.get('ID') + current_row['measureset_type'] = measureset.get('Type') # find all scv accession number scv_number = [] @@ -132,14 +132,14 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome current_row['all_submitters'] = ";".join(set(submitters_ordered)) # find the clincial significance and review status reported in RCV(aggregated from SCV) - current_row['clinical_significance'] = [] - current_row['review_status'] = [] + current_row['original_clnsig'] = [] + current_row['clnrevstat'] = [] clinical_significance = elem.find('.//ReferenceClinVarAssertion/ClinicalSignificance') if clinical_significance.find('.//ReviewStatus') is not None: - current_row['review_status'] = clinical_significance.find('.//ReviewStatus').text; + current_row['clnrevstat'] = clinical_significance.find('.//ReviewStatus').text; if clinical_significance.find('.//Description') is not None: - current_row['clinical_significance'] = clinical_significance.find('.//Description').text + current_row['original_clnsig'] = clinical_significance.find('.//Description').text current_row['last_evaluated'] = '0000-00-00' if clinical_significance.attrib.get('DateLastEvaluated') is not None: @@ -173,14 +173,14 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome current_row[list_column] = set() # now find the disease(s) this variant is associated with - current_row['all_traits'] = [] + current_row['clndbn'] = [] for traitset in elem.findall('.//TraitSet'): disease_name_nodes = traitset.findall('.//Name/ElementValue') trait_values = [] for disease_name_node in disease_name_nodes: if disease_name_node.attrib is not None and disease_name_node.attrib.get('Type') == 'Preferred': trait_values.append(disease_name_node.text) - current_row['all_traits'] += trait_values + current_row['clndbn'] += trait_values for attribute_node in traitset.findall('.//AttributeSet/Attribute'): attribute_type = attribute_node.attrib.get('Type') @@ -197,12 +197,18 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome xref_id = xref_node.attrib.get('ID') current_row['xrefs'].add("%s:%s" % (xref_db, xref_id)) - current_row['origin'] = set() + # parse rsID to dbSNP column + current_row['rs'] = 'BLANK' + for xref_node in measureset.findall('.//Measure/XRef'): + if xref_node.attrib.get('Type') == 'rs': + current_row['rs'] = xref_node.attrib.get('ID') + + current_row['clnorigin'] = set() for origin in elem.findall('.//ReferenceClinVarAssertion/ObservedIn/Sample/Origin'): - current_row['origin'].add(origin.text) + current_row['clnorigin'].add(origin.text) for column_name in ( - 'all_traits', 'inheritance_modes', 'age_of_onset', 'prevalence', 'disease_mechanism', 'origin', + 'clndbn', 'inheritance_modes', 'age_of_onset', 'prevalence', 'disease_mechanism', 'clnorigin', 'xrefs'): column_value = current_row[column_name] if type(current_row[column_name]) == list else sorted( current_row[column_name]) # sort columns of type 'set' to get deterministic order @@ -259,7 +265,7 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome break current_row['molecular_consequence'] = set() - current_row['hgvs_c'] = '' + current_row['clnhgvs'] = '' current_row['hgvs_p'] = '' attributeset = measure[i].findall('./AttributeSet') @@ -269,7 +275,7 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome # find hgvs_c if (attribute_type == 'HGVS, coding, RefSeq' and "c." in attribute_value): - current_row['hgvs_c'] = attribute_value + current_row['clnhgvs'] = attribute_value # find hgvs_p if (attribute_type == 'HGVS, protein, RefSeq' and "p." in attribute_value): From a29a74b34437bca86876a91ebd89aef4a7248724 Mon Sep 17 00:00:00 2001 From: Jon Kerry Date: Tue, 29 May 2018 10:22:41 +0100 Subject: [PATCH 03/15] Added "type" in merge and to the final list of headers. SAP-4037 --- src/join_variant_summary_with_clinvar_alleles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/join_variant_summary_with_clinvar_alleles.py b/src/join_variant_summary_with_clinvar_alleles.py index e478a4e..cca827f 100644 --- a/src/join_variant_summary_with_clinvar_alleles.py +++ b/src/join_variant_summary_with_clinvar_alleles.py @@ -4,7 +4,7 @@ from parse_clinvar_xml import HEADER -FINAL_HEADER = HEADER + ['gold_stars', 'conflicted'] +FINAL_HEADER = HEADER + ['type', 'gold_stars', 'conflicted'] def join_variant_summary_with_clinvar_alleles( @@ -31,7 +31,7 @@ def join_variant_summary_with_clinvar_alleles( variant_summary = variant_summary[ variant_summary['assembly'] == genome_build_id] variant_summary = variant_summary[ - ['allele_id', 'clinicalsignificance', 'reviewstatus','lastevaluated']] + ['allele_id', 'clinicalsignificance', 'reviewstatus', 'lastevaluated', 'type']] variant_summary = variant_summary.rename( columns={'clinicalsignificance': 'original_clnsig', 'reviewstatus': 'clnrevstat', From e031fe336af235249dfe93e633e5bd7281372604 Mon Sep 17 00:00:00 2001 From: Jon Kerry Date: Tue, 29 May 2018 12:26:54 +0100 Subject: [PATCH 04/15] Added function for translating clinical_significance terms to those required for Sapientia. Stored in data-frame as clnsig. SAP-4037 --- ...in_variant_summary_with_clinvar_alleles.py | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/join_variant_summary_with_clinvar_alleles.py b/src/join_variant_summary_with_clinvar_alleles.py index cca827f..4b3f809 100644 --- a/src/join_variant_summary_with_clinvar_alleles.py +++ b/src/join_variant_summary_with_clinvar_alleles.py @@ -1,10 +1,31 @@ #!/usr/bin/env python +from collections import OrderedDict import sys import pandas as pd from parse_clinvar_xml import HEADER -FINAL_HEADER = HEADER + ['type', 'gold_stars', 'conflicted'] +FINAL_HEADER = HEADER + ['clnsig', 'type', 'gold_stars', 'conflicted'] + + +def convert_terms_to_clnsig(terms): + clnsig_list = [term.lower() for term in terms.split(',')] + classifications = OrderedDict([ + ('Pathogenic', ('pathogenic/likely pathogenic', 'pathogenic')), + ('Likely pathogenic', ('likely pathogenic',)), + ('Uncertain significance', ('uncertain significance',)), + ('Likely benign', ('benign/likely benign', 'likely benign')), + ('Benign', ('benign',)), + ('', ('association not found', 'affects', 'drug response', 'confers sensitivity', + 'risk factor', 'other', 'association', 'protective', 'not provided', + 'conflicting data from submitters', 'conflicting interpretations of ' + 'pathogenicity', 'no interpretation for the single variant')) + ]) + for clnsig, original in classifications.items(): + if not set(clnsig_list).isdisjoint(original): + return clnsig + + raise ValueError('Unrecognised clinical significance term in {}'.format(clnsig_list)) def join_variant_summary_with_clinvar_alleles( @@ -57,6 +78,9 @@ def join_variant_summary_with_clinvar_alleles( on='allele_id', how='inner') print "merged raw", df.shape + # map clinical_significance to clnsig + df['clnsig'] = df.original_clnsig.map(convert_terms_to_clnsig) + # map review_status to gold starts: gold_star_map = { 'no assertion provided': 0, From 15e0d3acbb1aa122ee407cca3af545e2bfef06c3 Mon Sep 17 00:00:00 2001 From: Jon Kerry Date: Thu, 31 May 2018 09:46:08 +0100 Subject: [PATCH 05/15] Altered VCF output to not include a bunch of columns that we are not using in our ClinVar VCFs. Set ID column to dbSNP/rsID --- src/clinvar_table_to_vcf.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/clinvar_table_to_vcf.py b/src/clinvar_table_to_vcf.py index c9451b9..7d491e4 100644 --- a/src/clinvar_table_to_vcf.py +++ b/src/clinvar_table_to_vcf.py @@ -37,9 +37,17 @@ def table_to_vcf(input_table_path, input_reference_genome): descriptions = { 'gold_stars': "Number of gold stars as shown on clinvar web pages to summarize review status. Lookup table described at http://www.ncbi.nlm.nih.gov/clinvar/docs/details/ was used to map the REVIEW_STATUS value to this number.", } + not_required_in_header_or_info = ['chrom', 'pos', 'ref', 'alt', 'start', 'stop', 'strand', + 'clinical_significance_ordered', 'review_status_ordered', + 'dates_ordered', 'last_evaluated', 'submitters_ordered', 'likely_pathogenic', + 'uncertain_significance', 'likely_benign', 'scv', 'type', 'age_of_onset', + 'prevalence', 'disease_mechanism'] + single_value_fields = ['symbol', 'pathogenic', 'benign', 'clnsig', 'gold_stars', 'conflicted'] for key in FINAL_HEADER: - print("""##INFO=""" - .format(key.upper(), descriptions.get(key, key.upper()))) + if key not in not_required_in_header_or_info: + number = '1' if key in single_value_fields else '.' + print("""##INFO=""".format( + key.upper(), number, descriptions.get(key, key.upper()))) with open(input_reference_genome_fai) as in_fai: for line in in_fai: chrom, length, _ = line.split("\t", 2) @@ -64,14 +72,13 @@ def table_to_vcf(input_table_path, input_reference_genome): # INFO - additional information: (String, no white-space, semi-colons, or equals-signs permitted; commas are # permitted only as delimiters for lists of values) INFO fields are encoded as a semicolon-separated series of short # keys with optional values in the format: =[,data]. - loc_column = ['chrom', 'pos', 'ref', 'alt'] for key in FINAL_HEADER: - if key not in loc_column: + if key not in not_required_in_header_or_info: if pd.isnull(table_row[key]): continue value = str(table_row[key]) value = re.sub('\s*[,]\s*', '..', value) # replace , with .. - value = re.sub('\s*[;]\s*', '|', value) # replace ; with | + value = re.sub('\s*[;]\s*', ',', value) # replace ; with | value = value.replace("=", " eq ").replace(" ", "_") info_field[key.upper()] = value From 1108bdb8d0946477c009e6bd73e7ea8e779ac457 Mon Sep 17 00:00:00 2001 From: Jon Kerry Date: Mon, 4 Feb 2019 16:37:39 +0000 Subject: [PATCH 06/15] SAP-7990 : Modify XML parser to include variants with non-standard VCF field names --- src/parse_clinvar_xml.py | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/src/parse_clinvar_xml.py b/src/parse_clinvar_xml.py index 51d90ec..c963d79 100755 --- a/src/parse_clinvar_xml.py +++ b/src/parse_clinvar_xml.py @@ -33,6 +33,17 @@ def remove_newlines_and_tabs(s): return re.sub("[\t\n\r]", " ", s) +def get_genomic_location(measure_instance, required_fields, genome_build): + """Loops through all SequenceLocation XML tags associated with this Measure instance (ClinVar record) and returns + the first tag it finds that has the four required VCF fields""" + + for sequence_location in measure_instance.findall(".//SequenceLocation"): + if sequence_location.attrib.get('Assembly') == genome_build: + if all(vcf_field in sequence_location.attrib for vcf_field in required_fields): + return sequence_location + return None + + def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome_build='GRCh37'): """Parse clinvar XML Args: @@ -234,26 +245,27 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome # find the allele ID (//Measure/@ID) current_row['allele_id'] = measure[i].attrib.get('ID') - # find the GRCh37 or GRCh38 VCF representation - genomic_location = None - - for sequence_location in measure[i].findall(".//SequenceLocation"): - if sequence_location.attrib.get('Assembly') == genome_build: - if all(sequence_location.attrib.get(key) is not None for key in - ('Chr', 'start', 'referenceAllele', 'alternateAllele')): - genomic_location = sequence_location - break - # break after finding the first non-empty GRCh37 or GRCh38 location - if genomic_location is None: + # find the GRCh37 or GRCh38 VCF representation, first trying standard field names then left-align specific + vcf_field_groups = ( + ('Chr', 'start', 'referenceAllele', 'alternateAllele'), + ('Chr', 'positionVCF', 'referenceAlleleVCF', 'alternateAlleleVCF') + ) + + for vcf_fields in vcf_field_groups: + genomic_location = get_genomic_location(measure[i], vcf_fields, genome_build) + if genomic_location is not None: + pos_key, ref_key, alt_key = vcf_fields[1:] + break + else: skipped_counter['missing SequenceLocation'] += 1 elem.clear() continue # don't bother with variants that don't have a VCF location current_row['chrom'] = genomic_location.attrib['Chr'] - current_row['pos'] = genomic_location.attrib['start'] - current_row['ref'] = genomic_location.attrib['referenceAllele'] - current_row['alt'] = genomic_location.attrib['alternateAllele'] + current_row['pos'] = genomic_location.attrib[pos_key] + current_row['ref'] = genomic_location.attrib[ref_key] + current_row['alt'] = genomic_location.attrib[alt_key] current_row['start'] = genomic_location.attrib['start'] current_row['stop'] = genomic_location.attrib['stop'] current_row['strand'] = '' From 5d8f875959aca54abbe464ed8ebf1661bd6caf13 Mon Sep 17 00:00:00 2001 From: Jon Kerry Date: Thu, 14 Feb 2019 15:37:43 +0000 Subject: [PATCH 07/15] SAP-6265 : Use most pathogenic submission for conflicted variants --- src/clinvar_table_to_vcf.py | 8 +++---- ...in_variant_summary_with_clinvar_alleles.py | 22 +++++++++++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/clinvar_table_to_vcf.py b/src/clinvar_table_to_vcf.py index 7d491e4..e122d9f 100644 --- a/src/clinvar_table_to_vcf.py +++ b/src/clinvar_table_to_vcf.py @@ -39,10 +39,10 @@ def table_to_vcf(input_table_path, input_reference_genome): } not_required_in_header_or_info = ['chrom', 'pos', 'ref', 'alt', 'start', 'stop', 'strand', 'clinical_significance_ordered', 'review_status_ordered', - 'dates_ordered', 'last_evaluated', 'submitters_ordered', 'likely_pathogenic', - 'uncertain_significance', 'likely_benign', 'scv', 'type', 'age_of_onset', - 'prevalence', 'disease_mechanism'] - single_value_fields = ['symbol', 'pathogenic', 'benign', 'clnsig', 'gold_stars', 'conflicted'] + 'dates_ordered', 'last_evaluated', 'submitters_ordered', 'scv', 'type', + 'age_of_onset', 'prevalence', 'disease_mechanism'] + single_value_fields = ['symbol', 'pathogenic', 'likely_pathogenic', 'uncertain_significance', 'likely_benign', + 'benign', 'clnsig', 'gold_stars', 'conflicted'] for key in FINAL_HEADER: if key not in not_required_in_header_or_info: number = '1' if key in single_value_fields else '.' diff --git a/src/join_variant_summary_with_clinvar_alleles.py b/src/join_variant_summary_with_clinvar_alleles.py index 4b3f809..64c2f7c 100644 --- a/src/join_variant_summary_with_clinvar_alleles.py +++ b/src/join_variant_summary_with_clinvar_alleles.py @@ -28,6 +28,25 @@ def convert_terms_to_clnsig(terms): raise ValueError('Unrecognised clinical significance term in {}'.format(clnsig_list)) +def determine_most_pathogenic_submission(row): + if row['pathogenic'] > 0: + return 'Pathogenic' + elif row['likely_pathogenic'] > 0: + return 'Likely pathogenic' + elif row['uncertain_significance'] > 0: + return 'Uncertain significance' + elif row['likely_benign'] > 0: + return 'Likely benign' + elif row['benign'] > 0: + return 'Benign' + else: + # may need to revisit this code to determine what to with these instances, really they shouldn't be classified + # as conflicted but the earlier join can be ambiguous due to a variant in variant_summary.txt.gz appearing in + # both the multi and single files with different numbers of submissions. For now, retain the current + # functionality of leaving the field blank for the conflicted variant + return '' + + def join_variant_summary_with_clinvar_alleles( variant_summary_table, clinvar_alleles_table, genome_build_id="GRCh37"): @@ -100,6 +119,9 @@ def join_variant_summary_with_clinvar_alleles( df['conflicted'] = df['original_clnsig'].str.contains( "onflicting", case=False).astype(int) + conflicted_variants = df['conflicted'] == 1 + df.loc[conflicted_variants, 'clnsig'] = df[conflicted_variants].apply(determine_most_pathogenic_submission, axis=1) + # reorder columns just in case df = df.ix[:, FINAL_HEADER] print "merged final", df.shape From 679b9f2b32bc91e2179690e7b255ed64ace7894e Mon Sep 17 00:00:00 2001 From: Jon Kerry Date: Wed, 20 Feb 2019 10:39:23 +0000 Subject: [PATCH 08/15] SAP-8224 : Output expected variant numbers in ClinVar VCF header --- src/clinvar_table_to_vcf.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/clinvar_table_to_vcf.py b/src/clinvar_table_to_vcf.py index e122d9f..b4f5e1f 100644 --- a/src/clinvar_table_to_vcf.py +++ b/src/clinvar_table_to_vcf.py @@ -48,6 +48,13 @@ def table_to_vcf(input_table_path, input_reference_genome): number = '1' if key in single_value_fields else '.' print("""##INFO=""".format( key.upper(), number, descriptions.get(key, key.upper()))) + + print('##INFO= Date: Mon, 25 Feb 2019 09:59:40 +0000 Subject: [PATCH 09/15] SAP-8219 : Add "rs" the front of the rsID --- src/parse_clinvar_xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parse_clinvar_xml.py b/src/parse_clinvar_xml.py index c963d79..673d442 100755 --- a/src/parse_clinvar_xml.py +++ b/src/parse_clinvar_xml.py @@ -212,7 +212,7 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome current_row['rs'] = 'BLANK' for xref_node in measureset.findall('.//Measure/XRef'): if xref_node.attrib.get('Type') == 'rs': - current_row['rs'] = xref_node.attrib.get('ID') + current_row['rs'] = 'rs' + xref_node.attrib.get('ID') current_row['clnorigin'] = set() for origin in elem.findall('.//ReferenceClinVarAssertion/ObservedIn/Sample/Origin'): From 7eb8576f5291c095a7d88403486c10e3b54ef31e Mon Sep 17 00:00:00 2001 From: Jon Kerry Date: Tue, 26 Feb 2019 11:14:37 +0000 Subject: [PATCH 10/15] SAP-8222 : Add "Haplotype;Phase unknown" to expected measureset types --- src/check_allele_table.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/check_allele_table.py b/src/check_allele_table.py index 3cccbbd..e099270 100755 --- a/src/check_allele_table.py +++ b/src/check_allele_table.py @@ -34,8 +34,10 @@ assert int(record['pos']) > 0 and int(record['pos']) < 3*10**8, 'Unexpected "pos" column value: ' + record['pos'] assert all(b in 'ACGTN' for b in record['ref']), 'Unexpected "ref" column value: ' + record['ref'] assert all(b in 'ACGTN' for b in record['alt']), 'Unexpected "alt" column value: ' + record['alt'] # there's one clinvar record with ALT = "NTGT". Not sure how to handle it. - assert record['measureset_type'] in ["Variant", "Haplotype", "CompoundHeterozygote", "Phase unknown", "Distinct chromosomes", "CompoundHeterozygote;Haplotype", "Variant;gene-variant"], \ - 'Unexpected "measureset_type" column value: ' + record['measureset_type'] # there's one clinvar record with ALT = "NTGT". Not sure how to handle it. + assert record['measureset_type'] in [ + "Variant", "Haplotype", "CompoundHeterozygote", "Phase unknown", "Distinct chromosomes", + "CompoundHeterozygote;Haplotype", "Variant;gene-variant", "Haplotype;Phase unknown" + ], 'Unexpected "measureset_type" column value: ' + record['measureset_type'] assert len(map(int, record['measureset_id'].split(';'))) > 0, 'Unexpected "measureset_id" column value: ' + record['measureset_id'] assert len(map(lambda rcv: int(rcv.strip('RCV')), record['clnacc'].split(';'))) > 0, 'Unexpected "clnacc" column value: ' + record['clnacc'] From ac819fc09f72d401c23e59057b27d8bd44b71ca6 Mon Sep 17 00:00:00 2001 From: Jon Kerry Date: Tue, 5 Mar 2019 12:08:47 +0000 Subject: [PATCH 11/15] SAP-8433 : Add pathogenicity mapping for old terms --- src/parse_clinvar_xml.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/parse_clinvar_xml.py b/src/parse_clinvar_xml.py index 673d442..f424de4 100755 --- a/src/parse_clinvar_xml.py +++ b/src/parse_clinvar_xml.py @@ -24,6 +24,29 @@ 'all_pmids', 'inheritance_modes', 'age_of_onset', 'prevalence', 'disease_mechanism', 'clnorigin', 'xrefs', 'dates_ordered', 'rs'] +pathogenicity_mapping = {} +pathogenicity_mapping.update(dict.fromkeys( + ['benign', 'no known pathogenicity', 'non-pathogenic', 'poly'], + 'benign' +)) +pathogenicity_mapping.update(dict.fromkeys( + ['vsb', 'likely benign', 'probable-non-pathogenic', 'probably not pathogenic', 'suspected benign'], + 'likely benign' +)) +pathogenicity_mapping.update(dict.fromkeys( + ['uncertain', 'uncertain significance', 'unknown', 'unknown significance', 'variant of unknown significance'], + 'uncertain significance' +)) +pathogenicity_mapping.update(dict.fromkeys( + ['likely pathogenic - adrenal bilateral pheochromocy', 'likely pathogenic - adrenal pheochromocytoma', + 'likely pathogenic', 'probable-pathogenic', 'probably pathogenic', 'suspected pathogenic'], + 'likely pathogenic' +)) +pathogenicity_mapping.update(dict.fromkeys( + ['moderate', 'pathogenic', 'pathogenic variant for bardet-biedl syndrome', 'mut', 'pathologic'], + 'pathogenic' +)) + def replace_semicolons(s, replace_with=":"): return s.replace(";", replace_with) @@ -162,7 +185,9 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome ]) list_significance= [ - x.text.lower() for x in elem.findall('.//ClinVarAssertion/ClinicalSignificance/Description') if x is not None + pathogenicity_mapping[x.text.lower()] for x in + elem.findall('.//ClinVarAssertion/ClinicalSignificance/Description') if x is not None + and x.text.lower() in pathogenicity_mapping ] current_row['pathogenic'] = str(list_significance.count("pathogenic")) From c895f74fd09e8a00d5fb825223f58f73e719e6dd Mon Sep 17 00:00:00 2001 From: Jon Kerry Date: Wed, 6 Mar 2019 11:31:27 +0000 Subject: [PATCH 12/15] SAP-8478 : Add human-readable descriptions for INFO fields in VCF header --- src/clinvar_table_to_vcf.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/clinvar_table_to_vcf.py b/src/clinvar_table_to_vcf.py index b4f5e1f..d357ed1 100644 --- a/src/clinvar_table_to_vcf.py +++ b/src/clinvar_table_to_vcf.py @@ -35,12 +35,36 @@ def table_to_vcf(input_table_path, input_reference_genome): print("""##fileformat=VCFv4.1\n##source=clinvar""") descriptions = { - 'gold_stars': "Number of gold stars as shown on clinvar web pages to summarize review status. Lookup table described at http://www.ncbi.nlm.nih.gov/clinvar/docs/details/ was used to map the REVIEW_STATUS value to this number.", + 'gold_stars': "Number of Gold Stars (numeric representation of review status)", + 'clnsig': "Most Severe ClinVar Pathogenicity", + 'original_clnsig': "Clinical Significance", + 'pathogenic': "Number of 'Pathogenic' Submissions", + 'likely_pathogenic': "Number of 'Likely pathogenic' Submissions", + 'uncertain_significance': "Number of 'Uncertain significance' Submissions", + 'likely_benign': "Number of 'Likely benign' Submissions", + 'benign': "Number of 'Benign' Submissions", + 'conflicted': "Conflicting Pathogenicities (0 = False; 1 = True)", + 'clnhgvs': "HGVSc", + 'clnrevstat': "Review Status", + 'clndbn': "Phenotypes", + 'clnorigin': "Allele Origin", + 'rs': "rsID", + 'all_pmids': "Pubmed IDs Documenting Evidence of Phenotypes", + 'clnacc': "RCV Accession Number", + 'measureset_type': "Measureset Type", + 'measureset_id': "Measureset ID", + 'allele_id': "Allele ID", + 'symbol': "Gene Symbol", + 'molecular_consequence': "Molecular Consequence", + 'hgvs_p': "HGVSp", + 'all_submitters': "Submitters of Variant Phenotype", + 'inheritance_modes': "Modes of Inheritance", + 'xrefs': "Cross-References to other Data Sources" } not_required_in_header_or_info = ['chrom', 'pos', 'ref', 'alt', 'start', 'stop', 'strand', 'clinical_significance_ordered', 'review_status_ordered', 'dates_ordered', 'last_evaluated', 'submitters_ordered', 'scv', 'type', - 'age_of_onset', 'prevalence', 'disease_mechanism'] + 'inheritance_modes', 'age_of_onset', 'prevalence', 'disease_mechanism'] single_value_fields = ['symbol', 'pathogenic', 'likely_pathogenic', 'uncertain_significance', 'likely_benign', 'benign', 'clnsig', 'gold_stars', 'conflicted'] for key in FINAL_HEADER: From 85b9b1cbdba2879941c5fc2d19b15b240c1a7cf6 Mon Sep 17 00:00:00 2001 From: Jon Kerry Date: Thu, 7 Mar 2019 14:15:57 +0000 Subject: [PATCH 13/15] SAP-8360 : Pass ClinVar version to VCF writer to add version and date to VCF header --- src/clinvar_table_to_vcf.py | 25 ++++++++++++++++++------- src/master.py | 4 +++- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/clinvar_table_to_vcf.py b/src/clinvar_table_to_vcf.py index d357ed1..19f7ae6 100644 --- a/src/clinvar_table_to_vcf.py +++ b/src/clinvar_table_to_vcf.py @@ -1,4 +1,5 @@ import argparse +import calendar import collections import gzip import os @@ -16,7 +17,7 @@ def gzopen(path, mode='r', verbose=True): return open(path, mode) -def table_to_vcf(input_table_path, input_reference_genome): +def table_to_vcf(input_table_path, input_reference_genome, version): # validate args input_reference_genome_fai = input_reference_genome + ".fai" if not os.path.isfile(input_table_path): @@ -73,11 +74,20 @@ def table_to_vcf(input_table_path, input_reference_genome): print("""##INFO=""".format( key.upper(), number, descriptions.get(key, key.upper()))) - print('##INFO=\d{4})-(?P\d{2})', version) + try: + month_string = calendar.month_name[int(version_date.group('month'))] + except IndexError: + raise ValueError('Cannot convert version month value "{}" to a known month (must be between 01-12)'.format( + version_date.group('month') + )) + + print('##CONGENICA_VCF_VALIDATION='.format(version)) + print('##CURATED_VARIANT_LIST_INFO='.format( + version_date.group('year'), month_string)) with open(input_reference_genome_fai) as in_fai: for line in in_fai: @@ -124,6 +134,7 @@ def table_to_vcf(input_table_path, input_reference_genome): parser = argparse.ArgumentParser() parser.add_argument('input_table_path', help="Tab-delimited input table") parser.add_argument('input_reference_genome', help="Reference FASTA used. The associated .fai file, e.g. b38.fa.fai, is necessary for the VCF header generation") + parser.add_argument('clinvar_version', help="Version of ClinVar e.g. 2019-02") args = parser.parse_args() - table_to_vcf(args.input_table_path, args.input_reference_genome) + table_to_vcf(args.input_table_path, args.input_reference_genome, args.clinvar_version) diff --git a/src/master.py b/src/master.py index 85b0d29..5febd39 100755 --- a/src/master.py +++ b/src/master.py @@ -25,6 +25,7 @@ g = p.add_argument_group('main args') g.add("--b37-genome", help="b37 .fa genome reference file", default=None, required=False) g.add("--b38-genome", help="b38 .fa genome reference file. NOTE: chromosome names must be like '1', '2'.. 'X', 'Y', 'MT'.", default=None, required=False) +g.add("--clinvar-version", help="Version of ClinVar source data e.g. 2019-02", default=None, required=True) g.add("-X", "--clinvar-xml", help="The local filename of the ClinVarFullRelase.xml.gz file. If not set, grab the latest from NCBI.") g.add("-S", "--clinvar-variant-summary-table", help="The local filename of the variant_summary.txt.gz file. If not set, grab the latest from NCBI.") g.add("-E", "--exac-sites-vcf", help="ExAC sites vcf file. If specified, a clinvar table with extra ExAC fields will also be created.") @@ -51,6 +52,7 @@ gnomad_genome_sites_vcf = args.gnomad_genome_sites_vcf clinvar_variant_summary_table = args.clinvar_variant_summary_table output_prefix = args.output_prefix +clinvar_version = args.clinvar_version script_dir = args.script_dir tmp_dir = args.tmp_dir @@ -199,7 +201,7 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path): ]) # create vcf - job.add(("python -u IN:%(script_dir)s/clinvar_table_to_vcf.py IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz IN:%(reference_genome)s | bgzip -c > OUT:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz") % locals()) # create compressed version + job.add(("python -u IN:%(script_dir)s/clinvar_table_to_vcf.py IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz IN:%(reference_genome)s %(clinvar_version)s | bgzip -c > OUT:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz") % locals()) # create compressed version job.add("tabix IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz" % locals(), output_filenames=["%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz.tbi" % locals()]) job.add("cp IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz.tbi %(output_dir)s/" % locals(), output_filenames=[ "%(output_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz" % locals(), From 89d3d28d9bd0e147072bf3c302cf6f6ceddf15b5 Mon Sep 17 00:00:00 2001 From: Kevin Dunnicliffe Date: Mon, 28 Oct 2019 09:22:55 +0000 Subject: [PATCH 14/15] SAP-11534 : set CLNGHVS field to var_name as fallback --- src/parse_clinvar_xml.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/parse_clinvar_xml.py b/src/parse_clinvar_xml.py index f424de4..f58ba46 100755 --- a/src/parse_clinvar_xml.py +++ b/src/parse_clinvar_xml.py @@ -313,6 +313,8 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome # find hgvs_c if (attribute_type == 'HGVS, coding, RefSeq' and "c." in attribute_value): current_row['clnhgvs'] = attribute_value + else: + current_row['clnhgvs'] = var_name # find hgvs_p if (attribute_type == 'HGVS, protein, RefSeq' and "p." in attribute_value): From 60b0e7faca22cdf8222ea3213f9753431985e464 Mon Sep 17 00:00:00 2001 From: Kevin Dunnicliffe Date: Fri, 1 Nov 2019 09:50:44 +0000 Subject: [PATCH 15/15] SAP-11534 : improve clnghvs handling by fallback to MeasureSet.name --- src/parse_clinvar_xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parse_clinvar_xml.py b/src/parse_clinvar_xml.py index f58ba46..89f3d49 100755 --- a/src/parse_clinvar_xml.py +++ b/src/parse_clinvar_xml.py @@ -313,7 +313,7 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome # find hgvs_c if (attribute_type == 'HGVS, coding, RefSeq' and "c." in attribute_value): current_row['clnhgvs'] = attribute_value - else: + elif "c." in var_name: current_row['clnhgvs'] = var_name # find hgvs_p