From cd8e3062fe9719aa656d02f51803fa4eaba10ddf Mon Sep 17 00:00:00 2001 From: Christopher Holt Date: Mon, 16 Jan 2023 12:25:03 -0500 Subject: [PATCH 1/3] added lines to keep introns from Augustus in the gff3 file --- lib/biocode/gff.py | 23 +++++++++++++++++++ lib/biocode/things.py | 51 +++++++++++++++++++++++++------------------ 2 files changed, 53 insertions(+), 21 deletions(-) diff --git a/lib/biocode/gff.py b/lib/biocode/gff.py index d540366..c431d85 100644 --- a/lib/biocode/gff.py +++ b/lib/biocode/gff.py @@ -344,6 +344,12 @@ def get_gff3_features(gff3_file, assemblies=None): parent_feat.add_exon(exon) features[feat_id] = exon + elif cols[2] == 'intron': + intron = biocode.things.Intron(id=feat_id, parent=parent_feat) + intron.locate_on(target=current_assembly, fmin=rfmin, fmin_partial=fmin_partial, fmax=rfmax, fmax_partial=fmax_partial, strand=rstrand) + parent_feat.add_intron(intron) + features[feat_id] = intron + elif cols[2] == 'CDS': if phase == '.': phase = 0 @@ -729,6 +735,23 @@ def print_biogene( gene=None, fh=None, source=None, on=None ): columns[8] = build_column_9( id=exon.id, parent=RNA.id, other=exon_annot_atts ) fh.write( "\t".join(columns) + "\n" ) + ## handle introns for this RNA + for intron in sorted(RNA.introns( on )): + intron_loc = intron.location_on( on ) + + if intron_loc is None: + raise Exception("ERROR: Expected intron {0} to be located on {1} but it wasn't".format(intron.id, on.id)) + + intron_partiality_string = _partiality_string(intron_loc) + intron_annot_atts = dict() + if intron_partiality_string is not None: + intron_annot_atts['Partial'] = intron_partiality_string + + columns[2] = 'intron' + columns[3:5] = [str(intron_loc.fmin + 1), str(intron_loc.fmax)] + columns[8] = build_column_9( id=intron.id, parent=RNA.id, other=intron_annot_atts ) + fh.write( "\t".join(columns) + "\n" ) + # are there polypeptides? 
for polypeptide in sorted(RNA.polypeptides()): if len(polypeptide.locations) == 0: diff --git a/lib/biocode/things.py b/lib/biocode/things.py index 37270ce..8f2bff7 100644 --- a/lib/biocode/things.py +++ b/lib/biocode/things.py @@ -842,9 +842,10 @@ class Intron( LocatableThing ): removed from within the transcript by splicing together the sequences (exons) on either side of it." ''' - def __init__( self, id=None, locations=None, length=None ): + def __init__( self, id=None, locations=None, parent=None, length=None ): super().__init__(locations) self.id = id + self.parent = parent self.length = length @@ -1029,6 +1030,7 @@ def __init__( self, id=None, locations=None, parent=None, locus_tag=None, childr ## initialize any types needed self.children = _initialize_type_list(self.children, 'exon') + self.children = _initialize_type_list(self.children, 'intron') self.children = _initialize_type_list(self.children, 'CDS') self.children = _initialize_type_list(self.children, 'polypeptide') self.children = _initialize_type_list(self.children, 'UTR') @@ -1044,6 +1046,10 @@ def add_exon(self, exon): exon.parent = self self.children['exon'].append(exon) + def add_intron(self, intron): + intron.parent = self + self.children['intron'].append(intron) + def add_five_prime_UTR(self, utr): utr.parent = self self.children['UTR'].append(utr) @@ -1207,32 +1213,35 @@ def has_introns( self ): return False def introns(self, on=None): - ''' - Dynamically generates Intron objects in order for the current RNA. The coordinates of the - generated introns depend on the object passed via the 'on' argument - ''' - if on is None: - raise Exception("ERROR: the introns() method requires a passed molecule using the 'on' argument") + + return self.children["intron"] ## This ports port all the introns straight from Augustus + + # ''' + # Dynamically generates Intron objects in order for the current RNA. 
The coordinates of the + # generated introns depend on the object passed via the 'on' argument + # ''' + # if on is None: + # raise Exception("ERROR: the introns() method requires a passed molecule using the 'on' argument") - mol_on = on + # mol_on = on - intron_objs = list() - last_exon = None - last_exon_loc = None + # intron_objs = list() + # last_exon = None + # last_exon_loc = None - for exon in sorted(self.exons()): - exon_loc = exon.location_on( mol_on ) + # for exon in sorted(self.exons()): + # exon_loc = exon.location_on( mol_on ) - if last_exon is not None: - intron_id = uuid.uuid4() - intron = Intron( id=intron_id ) - intron.locate_on( target=mol_on, fmin=last_exon_loc.fmax, fmax=exon_loc.fmin, strand=exon_loc.strand ) - intron_objs.append( intron ) + # if last_exon is not None: + # intron_id = uuid.uuid4() ## ## This is generating random id, need format such as "g1.t1.intron1" + # intron = Intron( id=intron_id ) + # intron.locate_on( target=mol_on, fmin=last_exon_loc.fmax, fmax=exon_loc.fmin, strand=exon_loc.strand ) + # intron_objs.append( intron ) - last_exon = exon - last_exon_loc = exon_loc + # last_exon = exon + # last_exon_loc = exon_loc - return intron_objs + # return intron_objs def polypeptides(self): return self.children['polypeptide'] From b2020b07b68cf5b11debb67fb5f80bf7fb7eeef1 Mon Sep 17 00:00:00 2001 From: Christopher Holt Date: Mon, 16 Jan 2023 13:18:52 -0500 Subject: [PATCH 2/3] Changed function to keep introns that are annotated in augustus.hints.gff3 file, instead of predicting introns --- lib/biocode/things.py | 44 +++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/lib/biocode/things.py b/lib/biocode/things.py index 8f2bff7..5435f73 100644 --- a/lib/biocode/things.py +++ b/lib/biocode/things.py @@ -1213,35 +1213,35 @@ def has_introns( self ): return False def introns(self, on=None): - - return self.children["intron"] ## This ports port all the introns straight from Augustus - # ''' 
- # Dynamically generates Intron objects in order for the current RNA. The coordinates of the - # generated introns depend on the object passed via the 'on' argument - # ''' - # if on is None: - # raise Exception("ERROR: the introns() method requires a passed molecule using the 'on' argument") + return self.children["intron"] ## This ports port all the introns straight from Augustus + + # ''' + # Dynamically generates Intron objects in order for the current RNA. The coordinates of the + # generated introns depend on the object passed via the 'on' argument + # ''' + # if on is None: + # raise Exception("ERROR: the introns() method requires a passed molecule using the 'on' argument") - # mol_on = on + # mol_on = on - # intron_objs = list() - # last_exon = None - # last_exon_loc = None + # intron_objs = list() + # last_exon = None + # last_exon_loc = None - # for exon in sorted(self.exons()): - # exon_loc = exon.location_on( mol_on ) + # for exon in sorted(self.exons()): + # exon_loc = exon.location_on( mol_on ) - # if last_exon is not None: - # intron_id = uuid.uuid4() ## ## This is generating random id, need format such as "g1.t1.intron1" - # intron = Intron( id=intron_id ) - # intron.locate_on( target=mol_on, fmin=last_exon_loc.fmax, fmax=exon_loc.fmin, strand=exon_loc.strand ) - # intron_objs.append( intron ) + # if last_exon is not None: + # intron_id = uuid.uuid4() ## ## This is generating random id, need format such as "g1.t1.intron1" + # intron = Intron( id=intron_id ) + # intron.locate_on( target=mol_on, fmin=last_exon_loc.fmax, fmax=exon_loc.fmin, strand=exon_loc.strand ) + # intron_objs.append( intron ) - # last_exon = exon - # last_exon_loc = exon_loc + # last_exon = exon + # last_exon_loc = exon_loc - # return intron_objs + # return intron_objs def polypeptides(self): return self.children['polypeptide'] From b88e02b3bfc1e2005c88c7b1dd58159ca1d85bd6 Mon Sep 17 00:00:00 2001 From: Christopher Holt Date: Sun, 22 Jan 2023 00:14:29 -0500 Subject: [PATCH 3/3] 
Kept dynamic intron generation, but added hardcoded intron id based on exon id --- lib/biocode/things.py | 60 ++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/lib/biocode/things.py b/lib/biocode/things.py index 5435f73..b8912ab 100644 --- a/lib/biocode/things.py +++ b/lib/biocode/things.py @@ -1,6 +1,7 @@ import itertools import sys +import re import uuid #from biocode import utils, gff, tbl @@ -1213,35 +1214,36 @@ def has_introns( self ): return False def introns(self, on=None): - - return self.children["intron"] ## This ports port all the introns straight from Augustus - - # ''' - # Dynamically generates Intron objects in order for the current RNA. The coordinates of the - # generated introns depend on the object passed via the 'on' argument - # ''' - # if on is None: - # raise Exception("ERROR: the introns() method requires a passed molecule using the 'on' argument") - - # mol_on = on - - # intron_objs = list() - # last_exon = None - # last_exon_loc = None - - # for exon in sorted(self.exons()): - # exon_loc = exon.location_on( mol_on ) - - # if last_exon is not None: - # intron_id = uuid.uuid4() ## ## This is generating random id, need format such as "g1.t1.intron1" - # intron = Intron( id=intron_id ) - # intron.locate_on( target=mol_on, fmin=last_exon_loc.fmax, fmax=exon_loc.fmin, strand=exon_loc.strand ) - # intron_objs.append( intron ) - - # last_exon = exon - # last_exon_loc = exon_loc - - # return intron_objs + + ''' + Dynamically generates Intron objects in order for the current RNA. 
The coordinates of the + generated introns depend on the object passed via the 'on' argument + ''' + if on is None: + raise Exception("ERROR: the introns() method requires a passed molecule using the 'on' argument") + + mol_on = on + + intron_objs = list() + last_exon = None + last_exon_loc = None + + intron_count = 0 + for exon in sorted(self.exons()): + exon_loc = exon.location_on( mol_on ) + + if last_exon is not None: + intron_count = intron_count + 1 + intron_count_id = str(intron_count) + intron_id = str(re.sub('exon[0-9]', 'intron' + intron_count_id, exon.id)) ## ## This is generating random id, need format such as "g1.t1.intron1" + intron = Intron( id=intron_id ) + intron.locate_on( target=mol_on, fmin=last_exon_loc.fmax, fmax=exon_loc.fmin, strand=exon_loc.strand ) + intron_objs.append( intron ) + + last_exon = exon + last_exon_loc = exon_loc + + return intron_objs def polypeptides(self): return self.children['polypeptide']