Skip to content

Commit

Permalink
Fix import of user-provided CDS spanning sequence edges (#332)
Browse files Browse the repository at this point in the history
* fix user-provided edge cds #324
* adopt overlap filter for user-provided edge cds #324
* refactor edge cds creation
  • Loading branch information
oschwengers authored Oct 3, 2024
1 parent a7b3cfd commit 0f14902
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 20 deletions.
48 changes: 33 additions & 15 deletions bakta/features/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,21 +285,39 @@ def detect_feature_overlaps(genome: dict):
)
# user-provided CDS overlaps
for cds_user_provided in contig_cdss_user_provided[contig['id']]:
if(cds['stop'] < cds_user_provided['start'] or cds['start'] > cds_user_provided['stop']):
continue
else: # overlap -> remove cds
overlap = min(cds['stop'], cds_user_provided['stop']) - max(cds['start'], cds_user_provided['start']) + 1
if(overlap > bc.CDS_MAX_OVERLAPS):
overlap = f"[{max(cds['start'], cds_user_provided['start'])},{min(cds['stop'], cds_user_provided['stop'])}]"
cds['discarded'] = {
'type': bc.DISCARD_TYPE_OVERLAP,
'feature_type': bc.FEATURE_CDS,
'description': f'overlaps {bc.FEATURE_CDS} at {overlap}'
}
log.info(
"overlap: de novo CDS (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, contig=%s",
cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, cds['contig']
)
overlap = 0
if(not cds_user_provided.get('edge', False) and not cds.get('edge', False)): # both CDS not edge features
if(cds['stop'] < cds_user_provided['start'] or cds['start'] > cds_user_provided['stop']):
continue
else:
overlap = min(cds['stop'], cds_user_provided['stop']) - max(cds['start'], cds_user_provided['start']) + 1
elif(cds_user_provided.get('edge', False) and not cds.get('edge', False)): # only user-provided CDS edge feature
if(cds['stop'] > cds_user_provided['start']): # overlapping de-novo CDS at sequence end
overlap = cds['stop'] - max(cds['start'], cds_user_provided['start']) + 1
elif(cds['start'] < cds_user_provided['stop']): # overlapping de-novo CDS at sequence start
overlap = min(cds['stop'], cds_user_provided['stop']) - cds['start'] + 1
else:
continue
elif(not cds_user_provided.get('edge', False) and cds.get('edge', False)): # only de-novo CDS edge feature
if(cds_user_provided['stop'] > cds['start']): # overlapping user-provided CDS at sequence end
overlap = cds_user_provided['stop'] - max(cds['start'], cds_user_provided['start']) + 1
elif(cds_user_provided['start'] < cds['stop']): # overlapping user-provided CDS at sequence start
overlap = min(cds['stop'], cds_user_provided['stop']) - cds_user_provided['start'] + 1
else:
continue
elif(cds_user_provided.get('edge', False) and cds.get('edge', False)): # both CDS edge features
overlap = (contig['length'] - max(cds['start'], cds_user_provided['start']) + 1) + min(cds['stop'], cds_user_provided['stop'])
if(overlap > bc.CDS_MAX_OVERLAPS):
overlap = f"[{max(cds['start'], cds_user_provided['start'])},{min(cds['stop'], cds_user_provided['stop'])}]"
cds['discarded'] = {
'type': bc.DISCARD_TYPE_OVERLAP,
'feature_type': bc.FEATURE_CDS,
'description': f'overlaps user-provided {bc.FEATURE_CDS} at {overlap}'
}
log.info(
"overlap: de-novo CDS (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, contig=%s",
cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, cds['contig']
)

# remove sORF overlapping with tRNAs, tmRNAs, rRNAs, CRISPRs, inframe CDSs, shorter inframe sORFs
for sorf in contig_sorfs[contig['id']]:
Expand Down
39 changes: 34 additions & 5 deletions bakta/features/cds.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from Bio import SeqIO
from Bio.Seq import Seq
from Bio import SeqFeature
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from xopen import xopen

Expand Down Expand Up @@ -86,7 +87,7 @@ def predict(genome: dict):
return cdss


def create_cds(contig: dict, start: int, stop: int, strand: str, nt: str, aa: str):
def create_cds(contig: dict, start: int, stop: int, strand: str, edge:bool, nt: str, aa: str):
cds = OrderedDict()
cds['type'] = bc.FEATURE_CDS
cds['contig'] = contig['id']
Expand All @@ -100,6 +101,8 @@ def create_cds(contig: dict, start: int, stop: int, strand: str, nt: str, aa: st
cds['nt'] = nt
cds['aa'] = aa
cds['aa_digest'], cds['aa_hexdigest'] = bu.calc_aa_hash(aa)
if(edge):
cds['edge'] = True
return cds


Expand All @@ -108,7 +111,7 @@ def create_cdss(genes, contig):
cdss_per_sequence = []
for gene in genes:
strand = bc.STRAND_FORWARD if gene.strand == 1 else bc.STRAND_REVERSE
cds = create_cds(contig, gene.begin, gene.end, strand, '', '')
cds = create_cds(contig, gene.begin, gene.end, strand, False, '', '')
cds['start_type'] = gene.start_type
cds['rbs_motif'] = gene.rbs_motif
if gene.partial_begin:
Expand Down Expand Up @@ -219,7 +222,14 @@ def import_user_cdss(genome: dict, import_path: Path):
if(contig is None):
log.error('user-provided CDS: No contig found for id=%s', contig_id)
raise Exception(f'user-provided CDS: No contig found for id={contig_id}')
user_cds = create_cds(contig, int(start), int(stop), strand, '', '')
edge = False
start = int(start)
stop = int(stop)
if(stop > contig['length']): # check for features spanning sequence edges
stop = stop - contig['length']
edge = True

user_cds = create_cds(contig, start, stop, strand, edge, '', '')
user_cds['source'] = bc.CDS_SOURCE_USER
if('pseudo=' in attributes or bc.INSDC_FEATURE_PSEUDOGENE in attributes): # skip pseudo genes
log.debug(
Expand Down Expand Up @@ -259,7 +269,13 @@ def import_user_cdss(genome: dict, import_path: Path):
if(contig is None):
log.error('user-provided CDS: No contig found for id=%s', record.id)
raise Exception(f'user-provided CDS: No contig found for id={record.id}')
strand = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE
if(feature.location.strand is None): # weird mixed-stranded compound locations
strand = bc.STRAND_UNKNOWN
else:
strand = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE
start = feature.location.start + 1
end = feature.location.end
edge = False
if('<' in str(feature.location.start) or '>' in str(feature.location.end)):
log.debug(
'skip user-provided CDS: reason=partial, contig=%s, start=%s, stop=%s, strand=%s',
Expand All @@ -278,7 +294,20 @@ def import_user_cdss(genome: dict, import_path: Path):
contig['id'], feature.location.start, feature.location.end, strand
)
continue
user_cds = create_cds(contig, feature.location.start + 1, feature.location.end, strand, '', '')
elif(isinstance(feature.location, SeqFeature.CompoundLocation) and len(feature.location.parts) == 2):
strand = feature.location.strand
if(strand != bc.STRAND_UNKNOWN): # only accept equal strands -> edge feature
strand = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE
edge = True
edge_left, edge_right = feature.location.parts
if(strand == bc.STRAND_FORWARD):
start = edge_left.start + 1
end = edge_right.end
else:
start = edge_right.start + 1
end = edge_left.end

user_cds = create_cds(contig, start, end, strand, edge, '', '')
user_cds['source'] = bc.CDS_SOURCE_USER
try:
nt = bu.extract_feature_sequence(user_cds, contig)
Expand Down

0 comments on commit 0f14902

Please sign in to comment.