From 888bb9b06bda62f88d8a93e0dfbd85eb7cf50e3e Mon Sep 17 00:00:00 2001 From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de> Date: Tue, 12 Oct 2021 16:30:19 +0200 Subject: [PATCH] add check for duplicated sequence IDs #79 --- bakta/utils.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bakta/utils.py b/bakta/utils.py index 2bdd8167..1bcc8f70 100644 --- a/bakta/utils.py +++ b/bakta/utils.py @@ -352,6 +352,7 @@ def qc_contigs(contigs, replicons): complete_genome = True plasmid_number = 1 + contig_ids = set() for contig in contigs: if(contig['length'] >= cfg.min_contig_length): contig_id_generated = f'{contig_prefix}_{contig_counter}' @@ -373,7 +374,13 @@ def qc_contigs(contigs, replicons): contig['type'] = bc.REPLICON_PLASMID log.debug('qc: detected plasmid replicon type via description: id=%s, description=%s', contig['id'], contig['description']) - if(not cfg.keep_contig_headers): + if(cfg.keep_contig_headers): + if(contig['id'] in contig_ids): + log.error('Fasta import: duplicated contig id! contig-id=%s', contig['id']) + sys.exit(f"ERROR: Detected duplicated contig id! Contig ID ({contig['id']}) occures multiple times!") + else: + contig_ids.add(contig['id']) + else: contig['orig_id'] = contig['id'] contig['id'] = contig_id_generated contig['orig_description'] = contig['description']