diff --git a/bakta/features/annotation.py b/bakta/features/annotation.py index fd99761a..c139d25c 100644 --- a/bakta/features/annotation.py +++ b/bakta/features/annotation.py @@ -285,21 +285,39 @@ def detect_feature_overlaps(genome: dict): ) # user-provided CDS overlaps for cds_user_provided in contig_cdss_user_provided[contig['id']]: - if(cds['stop'] < cds_user_provided['start'] or cds['start'] > cds_user_provided['stop']): - continue - else: # overlap -> remove cds - overlap = min(cds['stop'], cds_user_provided['stop']) - max(cds['start'], cds_user_provided['start']) + 1 - if(overlap > bc.CDS_MAX_OVERLAPS): - overlap = f"[{max(cds['start'], cds_user_provided['start'])},{min(cds['stop'], cds_user_provided['stop'])}]" - cds['discarded'] = { - 'type': bc.DISCARD_TYPE_OVERLAP, - 'feature_type': bc.FEATURE_CDS, - 'description': f'overlaps {bc.FEATURE_CDS} at {overlap}' - } - log.info( - "overlap: de novo CDS (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, contig=%s", - cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, cds['contig'] - ) + overlap = 0 + if(not cds_user_provided.get('edge', False) and not cds.get('edge', False)): # both CDS not edge features + if(cds['stop'] < cds_user_provided['start'] or cds['start'] > cds_user_provided['stop']): + continue + else: + overlap = min(cds['stop'], cds_user_provided['stop']) - max(cds['start'], cds_user_provided['start']) + 1 + elif(cds_user_provided.get('edge', False) and not cds.get('edge', False)): # only user-provided CDS edge feature + if(cds['stop'] > cds_user_provided['start']): # overlapping de-novo CDS at sequence end + overlap = cds['stop'] - max(cds['start'], cds_user_provided['start']) + 1 + elif(cds['start'] < cds_user_provided['stop']): # overlapping de-novo CDS at sequence start + overlap = min(cds['stop'], cds_user_provided['stop']) - cds['start'] + 1 + else: + continue + elif(not cds_user_provided.get('edge', False) and cds.get('edge', False)): # only de-novo CDS edge feature + if(cds_user_provided['stop'] > cds['start']): # overlapping user-provided CDS at sequence end + overlap = cds_user_provided['stop'] - max(cds['start'], cds_user_provided['start']) + 1 + elif(cds_user_provided['start'] < cds['stop']): # overlapping user-provided CDS at sequence start + overlap = min(cds['stop'], cds_user_provided['stop']) - cds_user_provided['start'] + 1 + else: + continue + elif(cds_user_provided.get('edge', False) and cds.get('edge', False)): # both CDS edge features + overlap = (contig['length'] - max(cds['start'], cds_user_provided['start']) + 1) + min(cds['stop'], cds_user_provided['stop']) + if(overlap > bc.CDS_MAX_OVERLAPS): + overlap = f"[{max(cds['start'], cds_user_provided['start'])},{min(cds['stop'], cds_user_provided['stop'])}]" + cds['discarded'] = { + 'type': bc.DISCARD_TYPE_OVERLAP, + 'feature_type': bc.FEATURE_CDS, + 'description': f'overlaps user-provided {bc.FEATURE_CDS} at {overlap}' + } + log.info( + "overlap: de-novo CDS (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, contig=%s", + cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, cds['contig'] + ) # remove sORF overlapping with tRNAs, tmRNAs, rRNAs, CRISPRs, inframe CDSs, shorter inframe sORFs for sorf in contig_sorfs[contig['id']]: diff --git a/bakta/features/cds.py b/bakta/features/cds.py index 19e555b1..c092e139 100644 --- a/bakta/features/cds.py +++ b/bakta/features/cds.py @@ -14,6 +14,7 @@ from Bio import SeqIO from Bio.Seq import Seq +from Bio import SeqFeature from Bio.SeqUtils.ProtParam import ProteinAnalysis from xopen import xopen @@ -86,7 +87,7 @@ def predict(genome: dict): return cdss -def create_cds(contig: dict, start: int, stop: int, strand: str, nt: str, aa: str): +def create_cds(contig: dict, start: int, stop: int, strand: str, edge:bool, nt: str, aa: str): cds = OrderedDict() cds['type'] = bc.FEATURE_CDS cds['contig'] = contig['id'] @@ -100,6 +101,8 @@ def create_cds(contig: dict, start: int, stop: int, strand: str, nt: str, aa: st cds['nt'] = nt cds['aa'] = aa cds['aa_digest'], cds['aa_hexdigest'] = bu.calc_aa_hash(aa) + if(edge): + cds['edge'] = True return cds @@ -108,7 +111,7 @@ def create_cdss(genes, contig): cdss_per_sequence = [] for gene in genes: strand = bc.STRAND_FORWARD if gene.strand == 1 else bc.STRAND_REVERSE - cds = create_cds(contig, gene.begin, gene.end, strand, '', '') + cds = create_cds(contig, gene.begin, gene.end, strand, False, '', '') cds['start_type'] = gene.start_type cds['rbs_motif'] = gene.rbs_motif if gene.partial_begin: @@ -219,7 +222,14 @@ def import_user_cdss(genome: dict, import_path: Path): if(contig is None): log.error('user-provided CDS: No contig found for id=%s', contig_id) raise Exception(f'user-provided CDS: No contig found for id={contig_id}') - user_cds = create_cds(contig, int(start), int(stop), strand, '', '') + edge = False + start = int(start) + stop = int(stop) + if(stop > contig['length']): # check for features spanning sequence edges + stop = stop - contig['length'] + edge = True + + user_cds = create_cds(contig, start, stop, strand, edge, '', '') user_cds['source'] = bc.CDS_SOURCE_USER if('pseudo=' in attributes or bc.INSDC_FEATURE_PSEUDOGENE in attributes): # skip pseudo genes log.debug( @@ -259,7 +269,13 @@ def import_user_cdss(genome: dict, import_path: Path): if(contig is None): log.error('user-provided CDS: No contig found for id=%s', record.id) raise Exception(f'user-provided CDS: No contig found for id={record.id}') - strand = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE + if(feature.location.strand is None): # weird mixed-stranded compound locations + strand = bc.STRAND_UNKNOWN + else: + strand = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE + start = feature.location.start + 1 + end = feature.location.end + edge = False if('<' in str(feature.location.start) or '>' in str(feature.location.end)): log.debug( 'skip user-provided CDS: reason=partial, contig=%s, start=%s, stop=%s, strand=%s', @@ -278,7 +294,20 @@ def import_user_cdss(genome: dict, import_path: Path): contig['id'], feature.location.start, feature.location.end, strand ) continue - user_cds = create_cds(contig, feature.location.start + 1, feature.location.end, strand, '', '') + elif(isinstance(feature.location, SeqFeature.CompoundLocation) and len(feature.location.parts) == 2): + strand = feature.location.strand + if(strand != bc.STRAND_UNKNOWN): # only accept equal strands -> edge feature + strand = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE + edge = True + edge_left, edge_right = feature.location.parts + if(strand == bc.STRAND_FORWARD): + start = edge_left.start + 1 + end = edge_right.end + else: + start = edge_right.start + 1 + end = edge_left.end + + user_cds = create_cds(contig, start, end, strand, edge, '', '') user_cds['source'] = bc.CDS_SOURCE_USER try: nt = bu.extract_feature_sequence(user_cds, contig)