From 40562829d7a14f8217dd3a45b8f4405adf203490 Mon Sep 17 00:00:00 2001
From: Jon Kerry <jonathan.kerry@congenica.com>
Date: Thu, 10 May 2018 11:51:23 +0100
Subject: [PATCH 01/15] Added absolute paths for scripts so that pipeline can
 be run in a docker container

---
 src/master.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/master.py b/src/master.py
index 11d2b88..85b0d29 100755
--- a/src/master.py
+++ b/src/master.py
@@ -32,6 +32,7 @@
 g.add("-GG", "--gnomad-genome-sites-vcf",  help="gnomAD genome sites vcf file. If specified, a clinvar table with extra gnomAD genome info fields will also be created.")
 g.add("--output-prefix", default="../output/", help="Final output files will have this prefix")
 g.add("--tmp-dir", default="./output_tmp", help="Temporary output files will have this prefix")
+g.add("--script-dir", help="Directory where the ClinVar source codes are stored")
 g = p.add_mutually_exclusive_group()
 g.add("--single-only", dest="single_or_multi", action="store_const", const="single", help="Only generate the single-variant tables")
 g.add("--multi-only", dest="single_or_multi", action="store_const", const="multi", help="Only generate the multi-variant tables")
@@ -51,6 +52,7 @@
 clinvar_variant_summary_table = args.clinvar_variant_summary_table
 output_prefix = args.output_prefix
 
+script_dir = args.script_dir
 tmp_dir = args.tmp_dir
 os.system("mkdir -p " + tmp_dir)
 
@@ -127,7 +129,8 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path):
 
 # normalize (convert to minimal representation and left-align)
 # the normalization code is in a different repo (useful for more than just clinvar) so here I just wget it:
-job.add("wget -N https://raw.githubusercontent.com/ericminikel/minimal_representation/master/normalize.py")
+job.add("wget -N https://raw.githubusercontent.com/ericminikel/minimal_representation/master/normalize.py "
+        "-O OUT:%(output_prefix)s/normalize.py" % locals())
 
 for genome_build in ('b37', 'b38'):
     # extract the GRCh37 coordinates, mutant allele, MeasureSet ID and PubMed IDs from it. This currently takes about 20 minutes.
@@ -137,7 +140,7 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path):
         print("Skippping steps to generate %s tables since reference genome not given." % genome_build)
         continue
 
-    job.add(("python -u IN:parse_clinvar_xml.py "
+    job.add(("python -u IN:%(script_dir)s/parse_clinvar_xml.py "
             "-x IN:%(clinvar_xml)s "
             "-g %(genome_build_id)s "
             "-o OUT:%(tmp_dir)s/clinvar_table_raw.single.%(genome_build)s.tsv "
@@ -150,11 +153,11 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path):
             continue
             
         fsuffix = "%(single_or_multi)s.%(genome_build)s" % locals() # file suffix
-        output_dir = '%(output_prefix)s%(genome_build)s/%(single_or_multi)s' % locals()
+        output_dir = '%(output_prefix)s/%(genome_build)s/%(single_or_multi)s' % locals()
         os.system('mkdir -p ' + output_dir)
 
         # normalize variants  (use grep -v '^$' to remove empty rows)
-        job.add("python -u normalize.py -R IN:%(reference_genome)s < IN:%(tmp_dir)s/clinvar_table_raw.%(fsuffix)s.tsv | grep -v ^$ | bgzip -c > OUT:%(tmp_dir)s/clinvar_table_normalized.%(fsuffix)s.tsv.gz" % locals())
+        job.add("python -u %(output_prefix)s/normalize.py -R IN:%(reference_genome)s < IN:%(tmp_dir)s/clinvar_table_raw.%(fsuffix)s.tsv | grep -v ^$ | bgzip -c > OUT:%(tmp_dir)s/clinvar_table_normalized.%(fsuffix)s.tsv.gz" % locals())
 
         # sort
         job.add(("cat " +
@@ -171,10 +174,10 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path):
             ])
 
         # group by allele, since clinvar_allele_trait_pairs.*.tsv will have more than 1 record for some alleles
-        job.add("python -u IN:group_by_allele.py -i IN:%(tmp_dir)s/clinvar_allele_trait_pairs.%(fsuffix)s.tsv.gz | bgzip -c > OUT:%(tmp_dir)s/clinvar_alleles_grouped.%(fsuffix)s.tsv.gz" % locals())
+        job.add("python -u IN:%(script_dir)s/group_by_allele.py -i IN:%(tmp_dir)s/clinvar_allele_trait_pairs.%(fsuffix)s.tsv.gz | bgzip -c > OUT:%(tmp_dir)s/clinvar_alleles_grouped.%(fsuffix)s.tsv.gz" % locals())
 
         # join information from the tab-delimited summary to the normalized genomic coordinates
-        job.add("python IN:join_variant_summary_with_clinvar_alleles.py "
+        job.add("python IN:%(script_dir)s/join_variant_summary_with_clinvar_alleles.py "
                 "IN:%(variant_summary_table)s "
                 "IN:%(tmp_dir)s/clinvar_alleles_grouped.%(fsuffix)s.tsv.gz "
                 "OUT:%(tmp_dir)s/clinvar_alleles_combined.%(fsuffix)s.tsv.gz "
@@ -196,7 +199,7 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path):
                 ])
 
         # create vcf
-        job.add(("python -u IN:clinvar_table_to_vcf.py IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz IN:%(reference_genome)s | bgzip -c > OUT:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz") % locals())  # create compressed version
+        job.add(("python -u IN:%(script_dir)s/clinvar_table_to_vcf.py IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz IN:%(reference_genome)s | bgzip -c > OUT:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz") % locals())  # create compressed version
         job.add("tabix IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz" % locals(), output_filenames=["%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz.tbi" % locals()])
         job.add("cp IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz.tbi %(output_dir)s/" % locals(), output_filenames=[
             "%(output_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz" % locals(),
@@ -218,7 +221,7 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path):
                          "vt normalize -r IN:%(reference_genome)s - | "
                          "bgzip -c > OUT:%(tmp_dir)s/%(normalized_vcf)s") % locals())
                 job.add("tabix IN:%(tmp_dir)s/%(normalized_vcf)s" % locals(), output_filenames=["%(tmp_dir)s/%(normalized_vcf)s.tbi" % locals()])
-                job.add(("python -u IN:%(script_name)s -i IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz %(vcf_arg)s IN:%(tmp_dir)s/%(normalized_vcf)s | "
+                job.add(("python -u IN:%(script_dir)s/%(script_name)s -i IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz %(vcf_arg)s IN:%(tmp_dir)s/%(normalized_vcf)s | "
                          "bgzip -c > OUT:%(tmp_dir)s/clinvar_alleles_with_%(label)s.%(fsuffix)s.tsv.gz") % locals())
                 job.add("tabix -S 1 -s 1 -b 2 -e 2 IN:%(tmp_dir)s/clinvar_alleles_with_%(label)s.%(fsuffix)s.tsv.gz" % locals(), output_filenames=["%(tmp_dir)s/clinvar_alleles_with_%(label)s.%(fsuffix)s.tsv.gz.tbi" % locals()])
                 job.add("cp IN:%(tmp_dir)s/clinvar_alleles_with_%(label)s.%(fsuffix)s.tsv.gz IN:%(tmp_dir)s/clinvar_alleles_with_%(label)s.%(fsuffix)s.tsv.gz.tbi %(output_dir)s/" % locals(), output_filenames=[
@@ -228,18 +231,18 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path):
                 job.add("gunzip -c IN:%(tmp_dir)s/clinvar_alleles_with_%(label)s.%(fsuffix)s.tsv.gz | head -n 750 > OUT:%(output_dir)s/clinvar_alleles_with_%(label)s_example_750_rows.%(fsuffix)s.tsv" % locals())
 
         job.add(
-            "python clinvar_alleles_stats.py "
+            "python %(script_dir)s/clinvar_alleles_stats.py "
             "IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz "
             "> OUT:%(tmp_dir)s/clinvar_alleles_stats.%(fsuffix)s.txt" %
             locals(),
             input_filenames=[
                 "%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz" % locals(),
-                "clinvar_alleles_stats.py"])
+                "%(script_dir)s/clinvar_alleles_stats.py" % locals()])
 
         job.add("cp IN:%(tmp_dir)s/clinvar_alleles_stats.%(fsuffix)s.txt OUT:%(output_dir)s/clinvar_alleles_stats.%(fsuffix)s.txt" % locals())
 
         # run basic checks
-        job.add("python IN:check_allele_table.py IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz" % locals())
+        job.add("python IN:%(script_dir)s/check_allele_table.py IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz" % locals())
 
 # run the above commands
 jr.run(job)

From 35f692a951821d90a6823d597c60f301376a04e9 Mon Sep 17 00:00:00 2001
From: Jon Kerry <jonathan.kerry@congenica.com>
Date: Thu, 24 May 2018 17:11:37 +0100
Subject: [PATCH 02/15] Changes to column names and introduction of dbsnp by
 parsing of rsID from XML file. SAP-4037

---
 src/check_allele_table.py                     | 10 ++--
 src/clinvar_alleles_stats.py                  |  6 +--
 src/clinvar_table_to_vcf.py                   |  8 +--
 ...in_variant_summary_with_clinvar_alleles.py | 10 ++--
 src/parse_clinvar_xml.py                      | 52 +++++++++++--------
 5 files changed, 46 insertions(+), 40 deletions(-)

diff --git a/src/check_allele_table.py b/src/check_allele_table.py
index 58efdd8..3cccbbd 100755
--- a/src/check_allele_table.py
+++ b/src/check_allele_table.py
@@ -34,13 +34,13 @@
         assert int(record['pos']) > 0 and int(record['pos']) < 3*10**8, 'Unexpected "pos" column value: ' + record['pos']
         assert all(b in 'ACGTN' for b in record['ref']), 'Unexpected "ref" column value: ' + record['ref']
         assert all(b in 'ACGTN' for b in record['alt']), 'Unexpected "alt" column value: ' + record['alt']  # there's one clinvar record with ALT = "NTGT". Not sure how to handle it.
-        assert record['variation_type'] in ["Variant", "Haplotype", "CompoundHeterozygote", "Phase unknown", "Distinct chromosomes", "CompoundHeterozygote;Haplotype", "Variant;gene-variant"], \
-            'Unexpected "variation_type" column value: ' + record['variation_type']  # there's one clinvar record with ALT = "NTGT". Not sure how to handle it.
+        assert record['measureset_type'] in ["Variant", "Haplotype", "CompoundHeterozygote", "Phase unknown", "Distinct chromosomes", "CompoundHeterozygote;Haplotype", "Variant;gene-variant"], \
+            'Unexpected "measureset_type" column value: ' + record['measureset_type']  # there's one clinvar record with ALT = "NTGT". Not sure how to handle it.
 
-        assert len(map(int, record['variation_id'].split(';'))) > 0, 'Unexpected "variation_id" column value: ' + record['variation_id']
-        assert len(map(lambda rcv: int(rcv.strip('RCV')), record['rcv'].split(';'))) > 0, 'Unexpected "rcv" column value: ' + record['rcv']
+        assert len(map(int, record['measureset_id'].split(';'))) > 0, 'Unexpected "measureset_id" column value: ' + record['measureset_id']
+        assert len(map(lambda rcv: int(rcv.strip('RCV')), record['clnacc'].split(';'))) > 0, 'Unexpected "clnacc" column value: ' + record['clnacc']
         assert int(record['allele_id']) > 0, 'Unexpected "rcv" column value: ' + record['allele_id']
-        assert len(record['hgvs_c']) == 0 or "c." in record['hgvs_c'], 'Unexpected "hgvs_c" column value: ' + record['hgvs_c']
+        assert len(record['clnhgvs']) == 0 or "c." in record['clnhgvs'], 'Unexpected "hgvs_c" column value: ' + record['clnhgvs']
         assert len(record['hgvs_p']) == 0 or "p." in record['hgvs_p'], 'Unexpected "hgvs_p" column value: ' + record['hgvs_p']
         #assert record['molecular_consequence'], 'Unexpected "molecular_consequence" column value: ' + record['molecular_consequence']
 
diff --git a/src/clinvar_alleles_stats.py b/src/clinvar_alleles_stats.py
index e9e4eda..76d0fba 100644
--- a/src/clinvar_alleles_stats.py
+++ b/src/clinvar_alleles_stats.py
@@ -10,10 +10,10 @@
 alleles_name = sys.argv[1]
 
 columns_to_summarize = [
-    'variation_type', 'clinical_significance',
-    'review_status', 'gold_stars', 'all_submitters',
+    'measureset_type', 'original_clnsig',
+    'clnrevstat', 'gold_stars', 'all_submitters',
     'inheritance_modes', 'age_of_onset', 'prevalence', 'disease_mechanism',
-    'origin']
+    'clnorigin']
 
 df = pd.read_csv(alleles_name, sep="\t", compression='gzip', index_col=False)
 
diff --git a/src/clinvar_table_to_vcf.py b/src/clinvar_table_to_vcf.py
index 9a742fb..c9451b9 100644
--- a/src/clinvar_table_to_vcf.py
+++ b/src/clinvar_table_to_vcf.py
@@ -6,7 +6,7 @@
 import pandas as pd
 import sys
 
-from parse_clinvar_xml import HEADER
+from join_variant_summary_with_clinvar_alleles import FINAL_HEADER
 
 
 def gzopen(path, mode='r', verbose=True):
@@ -37,7 +37,7 @@ def table_to_vcf(input_table_path, input_reference_genome):
     descriptions = {
         'gold_stars': "Number of gold stars as shown on clinvar web pages to summarize review status. Lookup table described at http://www.ncbi.nlm.nih.gov/clinvar/docs/details/ was used to map the REVIEW_STATUS value to this number.",
     }
-    for key in HEADER:
+    for key in FINAL_HEADER:
         print("""##INFO=<ID={},Number=1,Type=String,Description="{}">"""
               .format(key.upper(), descriptions.get(key, key.upper())))
     with open(input_reference_genome_fai) as in_fai:
@@ -52,7 +52,7 @@ def table_to_vcf(input_table_path, input_reference_genome):
         vcf_row = []
         vcf_row.append(table_row["chrom"])
         vcf_row.append(table_row["pos"])
-        vcf_row.append('.')  # ID
+        vcf_row.append(table_row["rs"])  # ID = rsID
         vcf_row.append(table_row["ref"])
         vcf_row.append(table_row["alt"])
         vcf_row.append('.')  # QUAL
@@ -65,7 +65,7 @@ def table_to_vcf(input_table_path, input_reference_genome):
         #    permitted only as delimiters for lists of values) INFO fields are encoded as a semicolon-separated series of short
         #    keys with optional values in the format: <key>=<data>[,data].
         loc_column = ['chrom', 'pos', 'ref', 'alt']
-        for key in HEADER:
+        for key in FINAL_HEADER:
             if key not in loc_column:
                 if pd.isnull(table_row[key]):
                     continue
diff --git a/src/join_variant_summary_with_clinvar_alleles.py b/src/join_variant_summary_with_clinvar_alleles.py
index 1a8334e..e478a4e 100644
--- a/src/join_variant_summary_with_clinvar_alleles.py
+++ b/src/join_variant_summary_with_clinvar_alleles.py
@@ -33,8 +33,8 @@ def join_variant_summary_with_clinvar_alleles(
     variant_summary = variant_summary[
         ['allele_id', 'clinicalsignificance', 'reviewstatus','lastevaluated']]
     variant_summary = variant_summary.rename(
-        columns={'clinicalsignificance': 'clinical_significance',
-                 'reviewstatus': 'review_status',
+        columns={'clinicalsignificance': 'original_clnsig',
+                 'reviewstatus': 'clnrevstat',
                  'lastevaluated':'last_evaluated'})
 
     # remove the duplicated records in variant summary due to alternative loci such as PAR but would be problematic for rare cases like translocation
@@ -43,7 +43,7 @@ def join_variant_summary_with_clinvar_alleles(
 
     # remove clinical_significance and review_status from clinvar_alleles:
     clinvar_alleles = clinvar_alleles.drop(
-        ['clinical_significance', 'review_status','last_evaluated'], axis=1)
+        ['original_clnsig', 'clnrevstat', 'last_evaluated'], axis=1)
 
     # pandas is sensitive to some rows having allele_id joined on ;, causing
     # an object dtype, with some entries being ints and others strs
@@ -69,11 +69,11 @@ def join_variant_summary_with_clinvar_alleles(
         'practice guideline': 4,
         '-':'-'
     }
-    df['gold_stars'] = df.review_status.map(gold_star_map)
+    df['gold_stars'] = df.clnrevstat.map(gold_star_map)
 
     # The use of expressions on clinical significance on ClinVar aggregate records (RCV) https://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/#conflicts
     # conflicted = 1 if using "conflicting"
-    df['conflicted'] = df['clinical_significance'].str.contains(
+    df['conflicted'] = df['original_clnsig'].str.contains(
         "onflicting", case=False).astype(int)
 
     # reorder columns just in case
diff --git a/src/parse_clinvar_xml.py b/src/parse_clinvar_xml.py
index 697a64d..51d90ec 100755
--- a/src/parse_clinvar_xml.py
+++ b/src/parse_clinvar_xml.py
@@ -14,15 +14,15 @@
 mentions_pubmed_regex = '(?:PubMed|PMID)(.*)'  # group(1) will be all the text after the word PubMed or PMID
 extract_pubmed_id_regex = '[^0-9]+([0-9]+)[^0-9](.*)'  # group(1) will be the first PubMed ID, group(2) will be all remaining text
 
-HEADER = ['chrom', 'pos', 'ref', 'alt', 'start', 'stop', 'strand', 'variation_type', 'variation_id', 'rcv', 'scv',
+HEADER = ['chrom', 'pos', 'ref', 'alt', 'start', 'stop', 'strand', 'measureset_type', 'measureset_id', 'clnacc', 'scv',
           'allele_id', 'symbol',
-          'hgvs_c', 'hgvs_p', 'molecular_consequence',
-          'clinical_significance', 'clinical_significance_ordered', 'pathogenic', 'likely_pathogenic',
+          'clnhgvs', 'hgvs_p', 'molecular_consequence',
+          'original_clnsig', 'clinical_significance_ordered', 'pathogenic', 'likely_pathogenic',
           'uncertain_significance',
-          'likely_benign', 'benign', 'review_status', 'review_status_ordered',
-          'last_evaluated', 'all_submitters', 'submitters_ordered', 'all_traits',
+          'likely_benign', 'benign', 'clnrevstat', 'review_status_ordered',
+          'last_evaluated', 'all_submitters', 'submitters_ordered', 'clndbn',
           'all_pmids', 'inheritance_modes', 'age_of_onset', 'prevalence',
-          'disease_mechanism', 'origin', 'xrefs', 'dates_ordered']
+          'disease_mechanism', 'clnorigin', 'xrefs', 'dates_ordered', 'rs']
 
 
 def replace_semicolons(s, replace_with=":"):
@@ -59,9 +59,9 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome
 
         # initialize all the fields
         current_row = {}
-        current_row['rcv'] = ''
-        current_row['variation_type'] = ''
-        current_row['variation_id'] = ''
+        current_row['clnacc'] = ''
+        current_row['measureset_type'] = ''
+        current_row['measureset_id'] = ''
         current_row['allele_id'] = ''
 
         rcv = elem.find('./ReferenceClinVarAssertion/ClinVarAccession')
@@ -69,7 +69,7 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome
             print("Error, not RCV record")
             break
         else:
-            current_row['rcv'] = rcv.attrib.get('Acc')
+            current_row['clnacc'] = rcv.attrib.get('Acc')
 
         ReferenceClinVarAssertion = elem.findall(".//ReferenceClinVarAssertion")
         measureset = ReferenceClinVarAssertion[0].findall(".//MeasureSet")
@@ -88,8 +88,8 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome
 
         measure = measureset.findall('.//Measure')
 
-        current_row['variation_id'] = measureset.attrib.get('ID')
-        current_row['variation_type'] = measureset.get('Type')
+        current_row['measureset_id'] = measureset.attrib.get('ID')
+        current_row['measureset_type'] = measureset.get('Type')
 
         # find all scv accession number
         scv_number = []
@@ -132,14 +132,14 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome
         current_row['all_submitters'] = ";".join(set(submitters_ordered))
 
         # find the clincial significance and review status reported in RCV(aggregated from SCV)
-        current_row['clinical_significance'] = []
-        current_row['review_status'] = []
+        current_row['original_clnsig'] = []
+        current_row['clnrevstat'] = []
 
         clinical_significance = elem.find('.//ReferenceClinVarAssertion/ClinicalSignificance')
         if clinical_significance.find('.//ReviewStatus') is not None:
-            current_row['review_status'] = clinical_significance.find('.//ReviewStatus').text;
+            current_row['clnrevstat'] = clinical_significance.find('.//ReviewStatus').text;
         if clinical_significance.find('.//Description') is not None:
-            current_row['clinical_significance'] = clinical_significance.find('.//Description').text
+            current_row['original_clnsig'] = clinical_significance.find('.//Description').text
 
         current_row['last_evaluated'] = '0000-00-00'
         if clinical_significance.attrib.get('DateLastEvaluated') is not None:
@@ -173,14 +173,14 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome
             current_row[list_column] = set()
 
         # now find the disease(s) this variant is associated with
-        current_row['all_traits'] = []
+        current_row['clndbn'] = []
         for traitset in elem.findall('.//TraitSet'):
             disease_name_nodes = traitset.findall('.//Name/ElementValue')
             trait_values = []
             for disease_name_node in disease_name_nodes:
                 if disease_name_node.attrib is not None and disease_name_node.attrib.get('Type') == 'Preferred':
                     trait_values.append(disease_name_node.text)
-            current_row['all_traits'] += trait_values
+            current_row['clndbn'] += trait_values
 
             for attribute_node in traitset.findall('.//AttributeSet/Attribute'):
                 attribute_type = attribute_node.attrib.get('Type')
@@ -197,12 +197,18 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome
                 xref_id = xref_node.attrib.get('ID')
                 current_row['xrefs'].add("%s:%s" % (xref_db, xref_id))
 
-        current_row['origin'] = set()
+        # parse rsID to dbSNP column
+        current_row['rs'] = 'BLANK'
+        for xref_node in measureset.findall('.//Measure/XRef'):
+            if xref_node.attrib.get('Type') == 'rs':
+                current_row['rs'] = xref_node.attrib.get('ID')
+
+        current_row['clnorigin'] = set()
         for origin in elem.findall('.//ReferenceClinVarAssertion/ObservedIn/Sample/Origin'):
-            current_row['origin'].add(origin.text)
+            current_row['clnorigin'].add(origin.text)
 
         for column_name in (
-                'all_traits', 'inheritance_modes', 'age_of_onset', 'prevalence', 'disease_mechanism', 'origin',
+                'clndbn', 'inheritance_modes', 'age_of_onset', 'prevalence', 'disease_mechanism', 'clnorigin',
                 'xrefs'):
             column_value = current_row[column_name] if type(current_row[column_name]) == list else sorted(
                 current_row[column_name])  # sort columns of type 'set' to get deterministic order
@@ -259,7 +265,7 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome
                             break
 
             current_row['molecular_consequence'] = set()
-            current_row['hgvs_c'] = ''
+            current_row['clnhgvs'] = ''
             current_row['hgvs_p'] = ''
 
             attributeset = measure[i].findall('./AttributeSet')
@@ -269,7 +275,7 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome
 
                 # find hgvs_c
                 if (attribute_type == 'HGVS, coding, RefSeq' and "c." in attribute_value):
-                    current_row['hgvs_c'] = attribute_value
+                    current_row['clnhgvs'] = attribute_value
 
                 # find hgvs_p
                 if (attribute_type == 'HGVS, protein, RefSeq' and "p." in attribute_value):

From a29a74b34437bca86876a91ebd89aef4a7248724 Mon Sep 17 00:00:00 2001
From: Jon Kerry <jonathan.kerry@congenica.com>
Date: Tue, 29 May 2018 10:22:41 +0100
Subject: [PATCH 03/15] Added "type" in merge and to the final list of headers.
 SAP-4037

---
 src/join_variant_summary_with_clinvar_alleles.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/join_variant_summary_with_clinvar_alleles.py b/src/join_variant_summary_with_clinvar_alleles.py
index e478a4e..cca827f 100644
--- a/src/join_variant_summary_with_clinvar_alleles.py
+++ b/src/join_variant_summary_with_clinvar_alleles.py
@@ -4,7 +4,7 @@
 
 from parse_clinvar_xml import HEADER
 
-FINAL_HEADER = HEADER + ['gold_stars', 'conflicted']
+FINAL_HEADER = HEADER + ['type', 'gold_stars', 'conflicted']
 
 
 def join_variant_summary_with_clinvar_alleles(
@@ -31,7 +31,7 @@ def join_variant_summary_with_clinvar_alleles(
     variant_summary = variant_summary[
         variant_summary['assembly'] == genome_build_id]
     variant_summary = variant_summary[
-        ['allele_id', 'clinicalsignificance', 'reviewstatus','lastevaluated']]
+        ['allele_id', 'clinicalsignificance', 'reviewstatus', 'lastevaluated', 'type']]
     variant_summary = variant_summary.rename(
         columns={'clinicalsignificance': 'original_clnsig',
                  'reviewstatus': 'clnrevstat',

From e031fe336af235249dfe93e633e5bd7281372604 Mon Sep 17 00:00:00 2001
From: Jon Kerry <jonathan.kerry@congenica.com>
Date: Tue, 29 May 2018 12:26:54 +0100
Subject: [PATCH 04/15] Added function for translating clinical_significance
 terms to those required for Sapientia. Stored in data-frame as clnsig.
 SAP-4037

---
 ...in_variant_summary_with_clinvar_alleles.py | 26 ++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/join_variant_summary_with_clinvar_alleles.py b/src/join_variant_summary_with_clinvar_alleles.py
index cca827f..4b3f809 100644
--- a/src/join_variant_summary_with_clinvar_alleles.py
+++ b/src/join_variant_summary_with_clinvar_alleles.py
@@ -1,10 +1,31 @@
 #!/usr/bin/env python
+from collections import OrderedDict
 import sys
 import pandas as pd
 
 from parse_clinvar_xml import HEADER
 
-FINAL_HEADER = HEADER + ['type', 'gold_stars', 'conflicted']
+FINAL_HEADER = HEADER + ['clnsig', 'type', 'gold_stars', 'conflicted']
+
+
+def convert_terms_to_clnsig(terms):
+    clnsig_list = [term.lower() for term in terms.split(',')]
+    classifications = OrderedDict([
+        ('Pathogenic', ('pathogenic/likely pathogenic', 'pathogenic')),
+        ('Likely pathogenic', ('likely pathogenic',)),
+        ('Uncertain significance', ('uncertain significance',)),
+        ('Likely benign', ('benign/likely benign', 'likely benign')),
+        ('Benign', ('benign',)),
+        ('', ('association not found', 'affects', 'drug response', 'confers sensitivity',
+              'risk factor', 'other', 'association', 'protective', 'not provided',
+              'conflicting data from submitters', 'conflicting interpretations of '
+              'pathogenicity', 'no interpretation for the single variant'))
+    ])
+    for clnsig, original in classifications.items():
+        if not set(clnsig_list).isdisjoint(original):
+            return clnsig
+
+    raise ValueError('Unrecognised clinical significance term in {}'.format(clnsig_list))
 
 
 def join_variant_summary_with_clinvar_alleles(
@@ -57,6 +78,9 @@ def join_variant_summary_with_clinvar_alleles(
                   on='allele_id', how='inner')
     print "merged raw", df.shape
 
+    # map clinical_significance to clnsig
+    df['clnsig'] = df.original_clnsig.map(convert_terms_to_clnsig)
+
     # map review_status to gold starts:
     gold_star_map = {
         'no assertion provided': 0,

From 15e0d3acbb1aa122ee407cca3af545e2bfef06c3 Mon Sep 17 00:00:00 2001
From: Jon Kerry <jonathan.kerry@congenica.com>
Date: Thu, 31 May 2018 09:46:08 +0100
Subject: [PATCH 05/15] Altered VCF output to not include a bunch of columns
 that we are not using in our ClinVar VCFs. Set ID column to dbSNP/rsID

---
 src/clinvar_table_to_vcf.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/clinvar_table_to_vcf.py b/src/clinvar_table_to_vcf.py
index c9451b9..7d491e4 100644
--- a/src/clinvar_table_to_vcf.py
+++ b/src/clinvar_table_to_vcf.py
@@ -37,9 +37,17 @@ def table_to_vcf(input_table_path, input_reference_genome):
     descriptions = {
         'gold_stars': "Number of gold stars as shown on clinvar web pages to summarize review status. Lookup table described at http://www.ncbi.nlm.nih.gov/clinvar/docs/details/ was used to map the REVIEW_STATUS value to this number.",
     }
+    not_required_in_header_or_info = ['chrom', 'pos', 'ref', 'alt', 'start', 'stop', 'strand',
+                                      'clinical_significance_ordered', 'review_status_ordered',
+                                      'dates_ordered', 'last_evaluated', 'submitters_ordered', 'likely_pathogenic',
+                                      'uncertain_significance', 'likely_benign', 'scv', 'type', 'age_of_onset',
+                                      'prevalence', 'disease_mechanism']
+    single_value_fields = ['symbol', 'pathogenic', 'benign', 'clnsig', 'gold_stars', 'conflicted']
     for key in FINAL_HEADER:
-        print("""##INFO=<ID={},Number=1,Type=String,Description="{}">"""
-              .format(key.upper(), descriptions.get(key, key.upper())))
+        if key not in not_required_in_header_or_info:
+            number = '1' if key in single_value_fields else '.'
+            print("""##INFO=<ID={},Number={},Type=String,Description="{}">""".format(
+                key.upper(), number, descriptions.get(key, key.upper())))
     with open(input_reference_genome_fai) as in_fai:
         for line in in_fai:
             chrom, length, _ = line.split("\t", 2)
@@ -64,14 +72,13 @@ def table_to_vcf(input_table_path, input_reference_genome):
         #    INFO - additional information: (String, no white-space, semi-colons, or equals-signs permitted; commas are
         #    permitted only as delimiters for lists of values) INFO fields are encoded as a semicolon-separated series of short
         #    keys with optional values in the format: <key>=<data>[,data].
-        loc_column = ['chrom', 'pos', 'ref', 'alt']
         for key in FINAL_HEADER:
-            if key not in loc_column:
+            if key not in not_required_in_header_or_info:
                 if pd.isnull(table_row[key]):
                     continue
                 value = str(table_row[key])
                 value = re.sub('\s*[,]\s*', '..', value)  # replace , with ..
-                value = re.sub('\s*[;]\s*', '|', value)  # replace ; with |
+                value = re.sub('\s*[;]\s*', ',', value)  # replace ; with |
                 value = value.replace("=", " eq ").replace(" ", "_")
                 
                 info_field[key.upper()] = value

From 1108bdb8d0946477c009e6bd73e7ea8e779ac457 Mon Sep 17 00:00:00 2001
From: Jon Kerry <jonathan.kerry@congenica.com>
Date: Mon, 4 Feb 2019 16:37:39 +0000
Subject: [PATCH 06/15] SAP-7990 : Modify XML parser to include variants with
 non-standard VCF field names

---
 src/parse_clinvar_xml.py | 40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/src/parse_clinvar_xml.py b/src/parse_clinvar_xml.py
index 51d90ec..c963d79 100755
--- a/src/parse_clinvar_xml.py
+++ b/src/parse_clinvar_xml.py
@@ -33,6 +33,17 @@ def remove_newlines_and_tabs(s):
     return re.sub("[\t\n\r]", " ", s)
 
 
+def get_genomic_location(measure_instance, required_fields, genome_build):
+    """Loops through all SequenceLocation XML tags associated with this Measure instance (ClinVar record) and returns
+    the first tag it finds that has the four required VCF fields"""
+
+    for sequence_location in measure_instance.findall(".//SequenceLocation"):
+        if sequence_location.attrib.get('Assembly') == genome_build:
+            if all(vcf_field in sequence_location.attrib for vcf_field in required_fields):
+                return sequence_location
+    return None
+
+
 def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome_build='GRCh37'):
     """Parse clinvar XML
     Args:
@@ -234,26 +245,27 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome
 
             # find the allele ID (//Measure/@ID)
             current_row['allele_id'] = measure[i].attrib.get('ID')
-            # find the GRCh37 or GRCh38 VCF representation
-            genomic_location = None
-
-            for sequence_location in measure[i].findall(".//SequenceLocation"):
-                if sequence_location.attrib.get('Assembly') == genome_build:
-                    if all(sequence_location.attrib.get(key) is not None for key in
-                           ('Chr', 'start', 'referenceAllele', 'alternateAllele')):
-                        genomic_location = sequence_location
-                        break
-            # break after finding the first non-empty GRCh37 or GRCh38 location
 
-            if genomic_location is None:
+            # find the GRCh37 or GRCh38 VCF representation, first trying standard field names then left-align specific
+            vcf_field_groups = (
+                ('Chr', 'start', 'referenceAllele', 'alternateAllele'),
+                ('Chr', 'positionVCF', 'referenceAlleleVCF', 'alternateAlleleVCF')
+            )
+
+            for vcf_fields in vcf_field_groups:
+                genomic_location = get_genomic_location(measure[i], vcf_fields, genome_build)
+                if genomic_location is not None:
+                    pos_key, ref_key, alt_key = vcf_fields[1:]
+                    break
+            else:
                 skipped_counter['missing SequenceLocation'] += 1
                 elem.clear()
                 continue  # don't bother with variants that don't have a VCF location
 
             current_row['chrom'] = genomic_location.attrib['Chr']
-            current_row['pos'] = genomic_location.attrib['start']
-            current_row['ref'] = genomic_location.attrib['referenceAllele']
-            current_row['alt'] = genomic_location.attrib['alternateAllele']
+            current_row['pos'] = genomic_location.attrib[pos_key]
+            current_row['ref'] = genomic_location.attrib[ref_key]
+            current_row['alt'] = genomic_location.attrib[alt_key]
             current_row['start'] = genomic_location.attrib['start']
             current_row['stop'] = genomic_location.attrib['stop']
             current_row['strand'] = ''

From 5d8f875959aca54abbe464ed8ebf1661bd6caf13 Mon Sep 17 00:00:00 2001
From: Jon Kerry <jonathan.kerry@congenica.com>
Date: Thu, 14 Feb 2019 15:37:43 +0000
Subject: [PATCH 07/15] SAP-6265 : Use most pathogenic submission for
 conflicted variants

---
 src/clinvar_table_to_vcf.py                   |  8 +++----
 ...in_variant_summary_with_clinvar_alleles.py | 22 +++++++++++++++++++
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/src/clinvar_table_to_vcf.py b/src/clinvar_table_to_vcf.py
index 7d491e4..e122d9f 100644
--- a/src/clinvar_table_to_vcf.py
+++ b/src/clinvar_table_to_vcf.py
@@ -39,10 +39,10 @@ def table_to_vcf(input_table_path, input_reference_genome):
     }
     not_required_in_header_or_info = ['chrom', 'pos', 'ref', 'alt', 'start', 'stop', 'strand',
                                       'clinical_significance_ordered', 'review_status_ordered',
-                                      'dates_ordered', 'last_evaluated', 'submitters_ordered', 'likely_pathogenic',
-                                      'uncertain_significance', 'likely_benign', 'scv', 'type', 'age_of_onset',
-                                      'prevalence', 'disease_mechanism']
-    single_value_fields = ['symbol', 'pathogenic', 'benign', 'clnsig', 'gold_stars', 'conflicted']
+                                      'dates_ordered', 'last_evaluated', 'submitters_ordered', 'scv', 'type',
+                                      'age_of_onset', 'prevalence', 'disease_mechanism']
+    single_value_fields = ['symbol', 'pathogenic', 'likely_pathogenic', 'uncertain_significance', 'likely_benign',
+                           'benign', 'clnsig', 'gold_stars', 'conflicted']
     for key in FINAL_HEADER:
         if key not in not_required_in_header_or_info:
             number = '1' if key in single_value_fields else '.'
diff --git a/src/join_variant_summary_with_clinvar_alleles.py b/src/join_variant_summary_with_clinvar_alleles.py
index 4b3f809..64c2f7c 100644
--- a/src/join_variant_summary_with_clinvar_alleles.py
+++ b/src/join_variant_summary_with_clinvar_alleles.py
@@ -28,6 +28,25 @@ def convert_terms_to_clnsig(terms):
     raise ValueError('Unrecognised clinical significance term in {}'.format(clnsig_list))
 
 
+def determine_most_pathogenic_submission(row):
+    if row['pathogenic'] > 0:
+        return 'Pathogenic'
+    elif row['likely_pathogenic'] > 0:
+        return 'Likely pathogenic'
+    elif row['uncertain_significance'] > 0:
+        return 'Uncertain significance'
+    elif row['likely_benign'] > 0:
+        return 'Likely benign'
+    elif row['benign'] > 0:
+        return 'Benign'
+    else:
+        # may need to revisit this code to determine what to with these instances, really they shouldn't be classified
+        # as conflicted but the earlier join can be ambiguous due to a variant in variant_summary.txt.gz appearing in
+        # both the multi and single files with different numbers of submissions. For now, retain the current
+        # functionality of leaving the field blank for the conflicted variant
+        return ''
+
+
 def join_variant_summary_with_clinvar_alleles(
         variant_summary_table, clinvar_alleles_table,
         genome_build_id="GRCh37"):
@@ -100,6 +119,9 @@ def join_variant_summary_with_clinvar_alleles(
     df['conflicted'] = df['original_clnsig'].str.contains(
         "onflicting", case=False).astype(int)
 
+    conflicted_variants = df['conflicted'] == 1
+    df.loc[conflicted_variants, 'clnsig'] = df[conflicted_variants].apply(determine_most_pathogenic_submission, axis=1)
+
     # reorder columns just in case
     df = df.ix[:, FINAL_HEADER]
     print "merged final", df.shape

From 679b9f2b32bc91e2179690e7b255ed64ace7894e Mon Sep 17 00:00:00 2001
From: Jon Kerry <jonathan.kerry@congenica.com>
Date: Wed, 20 Feb 2019 10:39:23 +0000
Subject: [PATCH 08/15] SAP-8224 : Output expected variant numbers in ClinVar
 VCF header

---
 src/clinvar_table_to_vcf.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/clinvar_table_to_vcf.py b/src/clinvar_table_to_vcf.py
index e122d9f..b4f5e1f 100644
--- a/src/clinvar_table_to_vcf.py
+++ b/src/clinvar_table_to_vcf.py
@@ -48,6 +48,13 @@ def table_to_vcf(input_table_path, input_reference_genome):
             number = '1' if key in single_value_fields else '.'
             print("""##INFO=<ID={},Number={},Type=String,Description="{}">""".format(
                 key.upper(), number, descriptions.get(key, key.upper())))
+
+    print('##INFO=<ID=TOTAL_VARIANTS,Number=1,Type=String,Description="{}"'.format(t.shape[0]))
+    for pathogenicity in ('Pathogenic', 'Likely pathogenic', 'Uncertain significance', 'Likely benign', 'Benign'):
+        pathogenicity_count = (t['clnsig'] == pathogenicity).sum()
+        print('##INFO=<ID={}_VARIANTS,Number=1,Type=String,Description="{}"'.format(
+            pathogenicity.upper().replace(' ', '_'), pathogenicity_count))
+
     with open(input_reference_genome_fai) as in_fai:
         for line in in_fai:
             chrom, length, _ = line.split("\t", 2)

From ca4bc1601c004fbe2ec46c248102b93e82e0350d Mon Sep 17 00:00:00 2001
From: Jon Kerry <jonathan.kerry@congenica.com>
Date: Mon, 25 Feb 2019 09:59:40 +0000
Subject: [PATCH 09/15] SAP-8219 : Add "rs" the front of the rsID

---
 src/parse_clinvar_xml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/parse_clinvar_xml.py b/src/parse_clinvar_xml.py
index c963d79..673d442 100755
--- a/src/parse_clinvar_xml.py
+++ b/src/parse_clinvar_xml.py
@@ -212,7 +212,7 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome
         current_row['rs'] = 'BLANK'
         for xref_node in measureset.findall('.//Measure/XRef'):
             if xref_node.attrib.get('Type') == 'rs':
-                current_row['rs'] = xref_node.attrib.get('ID')
+                current_row['rs'] = 'rs' + xref_node.attrib.get('ID')
 
         current_row['clnorigin'] = set()
         for origin in elem.findall('.//ReferenceClinVarAssertion/ObservedIn/Sample/Origin'):

From 7eb8576f5291c095a7d88403486c10e3b54ef31e Mon Sep 17 00:00:00 2001
From: Jon Kerry <jonathan.kerry@congenica.com>
Date: Tue, 26 Feb 2019 11:14:37 +0000
Subject: [PATCH 10/15] SAP-8222 : Add "Haplotype;Phase unknown" to expected
 measureset types

---
 src/check_allele_table.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/check_allele_table.py b/src/check_allele_table.py
index 3cccbbd..e099270 100755
--- a/src/check_allele_table.py
+++ b/src/check_allele_table.py
@@ -34,8 +34,10 @@
         assert int(record['pos']) > 0 and int(record['pos']) < 3*10**8, 'Unexpected "pos" column value: ' + record['pos']
         assert all(b in 'ACGTN' for b in record['ref']), 'Unexpected "ref" column value: ' + record['ref']
         assert all(b in 'ACGTN' for b in record['alt']), 'Unexpected "alt" column value: ' + record['alt']  # there's one clinvar record with ALT = "NTGT". Not sure how to handle it.
-        assert record['measureset_type'] in ["Variant", "Haplotype", "CompoundHeterozygote", "Phase unknown", "Distinct chromosomes", "CompoundHeterozygote;Haplotype", "Variant;gene-variant"], \
-            'Unexpected "measureset_type" column value: ' + record['measureset_type']  # there's one clinvar record with ALT = "NTGT". Not sure how to handle it.
+        assert record['measureset_type'] in [
+            "Variant", "Haplotype", "CompoundHeterozygote", "Phase unknown", "Distinct chromosomes",
+            "CompoundHeterozygote;Haplotype", "Variant;gene-variant", "Haplotype;Phase unknown"
+        ], 'Unexpected "measureset_type" column value: ' + record['measureset_type']
 
         assert len(map(int, record['measureset_id'].split(';'))) > 0, 'Unexpected "measureset_id" column value: ' + record['measureset_id']
         assert len(map(lambda rcv: int(rcv.strip('RCV')), record['clnacc'].split(';'))) > 0, 'Unexpected "clnacc" column value: ' + record['clnacc']

From ac819fc09f72d401c23e59057b27d8bd44b71ca6 Mon Sep 17 00:00:00 2001
From: Jon Kerry <jonathan.kerry@congenica.com>
Date: Tue, 5 Mar 2019 12:08:47 +0000
Subject: [PATCH 11/15] SAP-8433 : Add pathogenicity mapping for old terms

---
 src/parse_clinvar_xml.py | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/parse_clinvar_xml.py b/src/parse_clinvar_xml.py
index 673d442..f424de4 100755
--- a/src/parse_clinvar_xml.py
+++ b/src/parse_clinvar_xml.py
@@ -24,6 +24,29 @@
           'all_pmids', 'inheritance_modes', 'age_of_onset', 'prevalence',
           'disease_mechanism', 'clnorigin', 'xrefs', 'dates_ordered', 'rs']
 
+pathogenicity_mapping = {}
+pathogenicity_mapping.update(dict.fromkeys(
+    ['benign', 'no known pathogenicity', 'non-pathogenic', 'poly'],
+    'benign'
+))
+pathogenicity_mapping.update(dict.fromkeys(
+    ['vsb', 'likely benign', 'probable-non-pathogenic', 'probably not pathogenic', 'suspected benign'],
+    'likely benign'
+))
+pathogenicity_mapping.update(dict.fromkeys(
+    ['uncertain', 'uncertain significance', 'unknown', 'unknown significance', 'variant of unknown significance'],
+    'uncertain significance'
+))
+pathogenicity_mapping.update(dict.fromkeys(
+    ['likely pathogenic - adrenal bilateral pheochromocy', 'likely pathogenic - adrenal pheochromocytoma',
+     'likely pathogenic', 'probable-pathogenic', 'probably pathogenic', 'suspected pathogenic'],
+    'likely pathogenic'
+))
+pathogenicity_mapping.update(dict.fromkeys(
+    ['moderate', 'pathogenic', 'pathogenic variant for bardet-biedl syndrome', 'mut', 'pathologic'],
+    'pathogenic'
+))
+
 
 def replace_semicolons(s, replace_with=":"):
     return s.replace(";", replace_with)
@@ -162,7 +185,9 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome
         ])
 
         list_significance= [
-            x.text.lower() for x in elem.findall('.//ClinVarAssertion/ClinicalSignificance/Description') if x is not None
+            pathogenicity_mapping[x.text.lower()] for x in
+            elem.findall('.//ClinVarAssertion/ClinicalSignificance/Description') if x is not None
+            and x.text.lower() in pathogenicity_mapping
         ]
 
         current_row['pathogenic'] = str(list_significance.count("pathogenic"))

From c895f74fd09e8a00d5fb825223f58f73e719e6dd Mon Sep 17 00:00:00 2001
From: Jon Kerry <jonathan.kerry@congenica.com>
Date: Wed, 6 Mar 2019 11:31:27 +0000
Subject: [PATCH 12/15] SAP-8478 : Add human-readable descriptions for INFO
 fields in VCF header

---
 src/clinvar_table_to_vcf.py | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/clinvar_table_to_vcf.py b/src/clinvar_table_to_vcf.py
index b4f5e1f..d357ed1 100644
--- a/src/clinvar_table_to_vcf.py
+++ b/src/clinvar_table_to_vcf.py
@@ -35,12 +35,36 @@ def table_to_vcf(input_table_path, input_reference_genome):
     print("""##fileformat=VCFv4.1\n##source=clinvar""")
 
     descriptions = {
-        'gold_stars': "Number of gold stars as shown on clinvar web pages to summarize review status. Lookup table described at http://www.ncbi.nlm.nih.gov/clinvar/docs/details/ was used to map the REVIEW_STATUS value to this number.",
+        'gold_stars': "Number of Gold Stars (numeric representation of review status)",
+        'clnsig': "Most Severe ClinVar Pathogenicity",
+        'original_clnsig': "Clinical Significance",
+        'pathogenic': "Number of 'Pathogenic' Submissions",
+        'likely_pathogenic': "Number of 'Likely pathogenic' Submissions",
+        'uncertain_significance': "Number of 'Uncertain significance' Submissions",
+        'likely_benign': "Number of 'Likely benign' Submissions",
+        'benign': "Number of 'Benign' Submissions",
+        'conflicted': "Conflicting Pathogenicities (0 = False; 1 = True)",
+        'clnhgvs': "HGVSc",
+        'clnrevstat': "Review Status",
+        'clndbn': "Phenotypes",
+        'clnorigin': "Allele Origin",
+        'rs': "rsID",
+        'all_pmids': "Pubmed IDs Documenting Evidence of Phenotypes",
+        'clnacc': "RCV Accession Number",
+        'measureset_type': "Measureset Type",
+        'measureset_id': "Measureset ID",
+        'allele_id': "Allele ID",
+        'symbol': "Gene Symbol",
+        'molecular_consequence': "Molecular Consequence",
+        'hgvs_p': "HGVSp",
+        'all_submitters': "Submitters of Variant Phenotype",
+        'inheritance_modes': "Modes of Inheritance",
+        'xrefs': "Cross-References to other Data Sources"
     }
     not_required_in_header_or_info = ['chrom', 'pos', 'ref', 'alt', 'start', 'stop', 'strand',
                                       'clinical_significance_ordered', 'review_status_ordered',
                                       'dates_ordered', 'last_evaluated', 'submitters_ordered', 'scv', 'type',
-                                      'age_of_onset', 'prevalence', 'disease_mechanism']
+                                      'inheritance_modes', 'age_of_onset', 'prevalence', 'disease_mechanism']
     single_value_fields = ['symbol', 'pathogenic', 'likely_pathogenic', 'uncertain_significance', 'likely_benign',
                            'benign', 'clnsig', 'gold_stars', 'conflicted']
     for key in FINAL_HEADER:

From 85b9b1cbdba2879941c5fc2d19b15b240c1a7cf6 Mon Sep 17 00:00:00 2001
From: Jon Kerry <jonathan.kerry@congenica.com>
Date: Thu, 7 Mar 2019 14:15:57 +0000
Subject: [PATCH 13/15] SAP-8360 : Pass ClinVar version to VCF writer to add
 version and date to VCF header

---
 src/clinvar_table_to_vcf.py | 25 ++++++++++++++++++-------
 src/master.py               |  4 +++-
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/src/clinvar_table_to_vcf.py b/src/clinvar_table_to_vcf.py
index d357ed1..19f7ae6 100644
--- a/src/clinvar_table_to_vcf.py
+++ b/src/clinvar_table_to_vcf.py
@@ -1,4 +1,5 @@
 import argparse
+import calendar
 import collections
 import gzip
 import os
@@ -16,7 +17,7 @@ def gzopen(path, mode='r', verbose=True):
         return open(path, mode)
 
 
-def table_to_vcf(input_table_path, input_reference_genome):
+def table_to_vcf(input_table_path, input_reference_genome, version):
     # validate args
     input_reference_genome_fai = input_reference_genome + ".fai"
     if not os.path.isfile(input_table_path):
@@ -73,11 +74,20 @@ def table_to_vcf(input_table_path, input_reference_genome):
             print("""##INFO=<ID={},Number={},Type=String,Description="{}">""".format(
                 key.upper(), number, descriptions.get(key, key.upper())))
 
-    print('##INFO=<ID=TOTAL_VARIANTS,Number=1,Type=String,Description="{}"'.format(t.shape[0]))
-    for pathogenicity in ('Pathogenic', 'Likely pathogenic', 'Uncertain significance', 'Likely benign', 'Benign'):
-        pathogenicity_count = (t['clnsig'] == pathogenicity).sum()
-        print('##INFO=<ID={}_VARIANTS,Number=1,Type=String,Description="{}"'.format(
-            pathogenicity.upper().replace(' ', '_'), pathogenicity_count))
+    version_date = re.search(r'(?P<year>\d{4})-(?P<month>\d{2})', version)
+    try:
+        month_string = calendar.month_name[int(version_date.group('month'))]
+    except IndexError:
+        raise ValueError('Cannot convert version month value "{}" to a known month (must be between 01-12)'.format(
+            version_date.group('month')
+        ))
+
+    print('##CONGENICA_VCF_VALIDATION=<ID=CLINVAR_MASTER_NUM_TOTAL_VARIANTS,VALUE={},DESCRIPTION="Number of variants '
+          'in VCF"'.format(t.shape[0]))
+    print('##CURATED_VARIANT_LIST_INFO=<ID=VERSION,VALUE="{}",DESCRIPTION="Version of source data">'.format(version))
+    print('##CURATED_VARIANT_LIST_INFO=<ID=DESCRIPTION,VALUE="ClinVar variants (SNVs and indels) in the {}-{} '
+          'version of the ClinVar dataset",DESCRIPTION="Description for the Curated Variant List">'.format(
+              version_date.group('year'), month_string))
 
     with open(input_reference_genome_fai) as in_fai:
         for line in in_fai:
@@ -124,6 +134,7 @@ def table_to_vcf(input_table_path, input_reference_genome):
     parser = argparse.ArgumentParser()
     parser.add_argument('input_table_path', help="Tab-delimited input table")
     parser.add_argument('input_reference_genome', help="Reference FASTA used. The associated .fai file, e.g. b38.fa.fai, is necessary for the VCF header generation")
+    parser.add_argument('clinvar_version', help="Version of ClinVar e.g. 2019-02")
     args = parser.parse_args()
 
-    table_to_vcf(args.input_table_path, args.input_reference_genome)
+    table_to_vcf(args.input_table_path, args.input_reference_genome, args.clinvar_version)
diff --git a/src/master.py b/src/master.py
index 85b0d29..5febd39 100755
--- a/src/master.py
+++ b/src/master.py
@@ -25,6 +25,7 @@
 g = p.add_argument_group('main args')
 g.add("--b37-genome", help="b37 .fa genome reference file", default=None, required=False)
 g.add("--b38-genome", help="b38 .fa genome reference file. NOTE: chromosome names must be like '1', '2'.. 'X', 'Y', 'MT'.", default=None, required=False)
+g.add("--clinvar-version", help="Version of ClinVar source data e.g. 2019-02", default=None, required=True)
 g.add("-X", "--clinvar-xml", help="The local filename of the ClinVarFullRelase.xml.gz file. If not set, grab the latest from NCBI.")
 g.add("-S", "--clinvar-variant-summary-table", help="The local filename of the variant_summary.txt.gz file. If not set, grab the latest from NCBI.")
 g.add("-E", "--exac-sites-vcf",  help="ExAC sites vcf file. If specified, a clinvar table with extra ExAC fields will also be created.")
@@ -51,6 +52,7 @@
 gnomad_genome_sites_vcf = args.gnomad_genome_sites_vcf
 clinvar_variant_summary_table = args.clinvar_variant_summary_table
 output_prefix = args.output_prefix
+clinvar_version = args.clinvar_version
 
 script_dir = args.script_dir
 tmp_dir = args.tmp_dir
@@ -199,7 +201,7 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path):
                 ])
 
         # create vcf
-        job.add(("python -u IN:%(script_dir)s/clinvar_table_to_vcf.py IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz IN:%(reference_genome)s | bgzip -c > OUT:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz") % locals())  # create compressed version
+        job.add(("python -u IN:%(script_dir)s/clinvar_table_to_vcf.py IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.tsv.gz IN:%(reference_genome)s %(clinvar_version)s | bgzip -c > OUT:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz") % locals())  # create compressed version
         job.add("tabix IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz" % locals(), output_filenames=["%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz.tbi" % locals()])
         job.add("cp IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz IN:%(tmp_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz.tbi %(output_dir)s/" % locals(), output_filenames=[
             "%(output_dir)s/clinvar_alleles.%(fsuffix)s.vcf.gz" % locals(),

From 89d3d28d9bd0e147072bf3c302cf6f6ceddf15b5 Mon Sep 17 00:00:00 2001
From: Kevin Dunnicliffe <kevin.dunnicliffe@congenica.com>
Date: Mon, 28 Oct 2019 09:22:55 +0000
Subject: [PATCH 14/15] SAP-11534 : set CLNGHVS field to var_name as fallback

---
 src/parse_clinvar_xml.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/parse_clinvar_xml.py b/src/parse_clinvar_xml.py
index f424de4..f58ba46 100755
--- a/src/parse_clinvar_xml.py
+++ b/src/parse_clinvar_xml.py
@@ -313,6 +313,8 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome
                 # find hgvs_c
                 if (attribute_type == 'HGVS, coding, RefSeq' and "c." in attribute_value):
                     current_row['clnhgvs'] = attribute_value
+                else:
+                    current_row['clnhgvs'] = var_name
 
                 # find hgvs_p
                 if (attribute_type == 'HGVS, protein, RefSeq' and "p." in attribute_value):

From 60b0e7faca22cdf8222ea3213f9753431985e464 Mon Sep 17 00:00:00 2001
From: Kevin Dunnicliffe <kevin.dunnicliffe@congenica.com>
Date: Fri, 1 Nov 2019 09:50:44 +0000
Subject: [PATCH 15/15] SAP-11534 : improve clnghvs handling by fallback to
 MeasureSet.name

---
 src/parse_clinvar_xml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/parse_clinvar_xml.py b/src/parse_clinvar_xml.py
index f58ba46..89f3d49 100755
--- a/src/parse_clinvar_xml.py
+++ b/src/parse_clinvar_xml.py
@@ -313,7 +313,7 @@ def parse_clinvar_tree(handle, dest=sys.stdout, multi=None, verbose=True, genome
                 # find hgvs_c
                 if (attribute_type == 'HGVS, coding, RefSeq' and "c." in attribute_value):
                     current_row['clnhgvs'] = attribute_value
-                else:
+                elif "c." in var_name:
                     current_row['clnhgvs'] = var_name
 
                 # find hgvs_p