From 888bb9b06bda62f88d8a93e0dfbd85eb7cf50e3e Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Tue, 12 Oct 2021 16:30:19 +0200
Subject: [PATCH] add check for duplicated sequence IDs #79

---
 bakta/utils.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/bakta/utils.py b/bakta/utils.py
index 2bdd8167..1bcc8f70 100644
--- a/bakta/utils.py
+++ b/bakta/utils.py
@@ -352,6 +352,7 @@ def qc_contigs(contigs, replicons):
 
     complete_genome = True
     plasmid_number = 1
+    contig_ids = set()
     for contig in contigs:
         if(contig['length'] >= cfg.min_contig_length):
             contig_id_generated = f'{contig_prefix}_{contig_counter}'
@@ -373,7 +374,13 @@ def qc_contigs(contigs, replicons):
                 contig['type'] = bc.REPLICON_PLASMID
                 log.debug('qc: detected plasmid replicon type via description: id=%s, description=%s', contig['id'], contig['description'])
 
-            if(not cfg.keep_contig_headers):
+            if(cfg.keep_contig_headers):
+                if(contig['id'] in contig_ids):
+                    log.error('Fasta import: duplicated contig id! contig-id=%s', contig['id'])
+                    sys.exit(f"ERROR: Detected duplicated contig id! Contig ID ({contig['id']}) occures multiple times!")
+                else:
+                    contig_ids.add(contig['id'])
+            else:
                 contig['orig_id'] = contig['id']
                 contig['id'] = contig_id_generated
                 contig['orig_description'] = contig['description']