Skip to content

Commit

Permalink
add check for duplicated sequence IDs #79
Browse files (browse the repository at this point in the history)
  • Loading branch information
oschwengers committed Oct 12, 2021
1 parent dcc5c92 commit 888bb9b
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion bakta/utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -352,6 +352,7 @@ def qc_contigs(contigs, replicons):

complete_genome = True
plasmid_number = 1
contig_ids = set()
for contig in contigs:
if(contig['length'] >= cfg.min_contig_length):
contig_id_generated = f'{contig_prefix}_{contig_counter}'
Expand All @@ -373,7 +374,13 @@ def qc_contigs(contigs, replicons):
contig['type'] = bc.REPLICON_PLASMID
log.debug('qc: detected plasmid replicon type via description: id=%s, description=%s', contig['id'], contig['description'])

if(not cfg.keep_contig_headers):
if(cfg.keep_contig_headers):
if(contig['id'] in contig_ids):
log.error('Fasta import: duplicated contig id! contig-id=%s', contig['id'])
sys.exit(f"ERROR: Detected duplicated contig id! Contig ID ({contig['id']}) occures multiple times!")
else:
contig_ids.add(contig['id'])
else:
contig['orig_id'] = contig['id']
contig['id'] = contig_id_generated
contig['orig_description'] = contig['description']
Expand Down

0 comments on commit 888bb9b

Please sign in to comment.