Skip to content

Commit

Permalink
revise truncated dnaA/repA genes in rotated seqs
Browse files Browse the repository at this point in the history
  • Loading branch information
oschwengers committed Sep 22, 2021
1 parent 3e7a4c6 commit 0c4ad10
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 0 deletions.
48 changes: 48 additions & 0 deletions bakta/features/cds.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,3 +326,51 @@ def analyze_proteins(cdss):
)
seq_stats['isoelectric_point'] = None
cds['seq_stats'] = seq_stats


def revise_special_cases(cdss):
"""
Revise rare but known special cases as for istance supposedly truncated dnaA genes on rotated chromosome starts
which often appear on re-annotated genomes.
"""

# look for supposedly truncated dnaA genes on rotated chromosome starts: start=1, strand=+
dnaA = None
for cds in cdss:
if(
cds['start'] == 1 and
cds['strand'] == bc.STRAND_FORWARD and
cds['start_type'] == 'Edge' and
cds['rbs_motif'] is None and
('dnaa' in cds['product'].lower().split() or cds['gene'] == 'dnaA')):
dnaA = cds
break
if(dnaA is not None):
dnaA.pop('truncated')
gene = dnaA.get('gene', None)
gene = gene if gene is not None else '-'
log.info(
'revise supposedly truncated dnaA gene on rotated chromosome start: contig=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, nt=[%s..%s], aa=[%s..%s]',
dnaA['contig'], dnaA['start'], dnaA['stop'], dnaA['strand'], gene, dnaA['product'], dnaA['nt'][:10], dnaA['nt'][-10:], dnaA['aa'][:10], dnaA['aa'][-10:]
)

# look for supposedly truncated repA genes on rotated plasmid starts: start=1, strand=+
repA = None
for cds in cdss:
if(
cds['start'] == 1 and
cds['strand'] == bc.STRAND_FORWARD and
cds['start_type'] == 'Edge' and
cds['rbs_motif'] is None and
('repa' in cds['product'].lower().split() or cds['gene'] == 'repA')):
repA = cds
break
if(repA is not None):
repA.pop('truncated')
gene = repA.get('gene', None)
gene = gene if gene is not None else '-'
log.info(
'revise supposedly truncated repA gene on rotated plasmid start: contig=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, nt=[%s..%s], aa=[%s..%s]',
repA['contig'], repA['start'], repA['stop'], repA['strand'], gene, repA['product'], repA['nt'][:10], repA['nt'][-10:], repA['aa'][:10], repA['aa'][-10:]
)

3 changes: 3 additions & 0 deletions bakta/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,9 @@ def main():
print(f"\tdetected Pfam hits: {len(pfam_hits)} ")
feat_cds.analyze_proteins(hypotheticals)
print('\tcalculated proteins statistics')

print('\trevise special cases...')
feat_cds.revise_special_cases(cdss)

genome['features'][bc.FEATURE_CDS] = cdss

Expand Down

0 comments on commit 0c4ad10

Please sign in to comment.