Introduce globally unique feature IDs

oschwengers · Sep 22, 2021 · df32038
1 parent f77e308
commit df32038
Show file tree

Hide file tree

Showing 3 changed files with 13 additions and 16 deletions.
diff --git a/bakta/io/gff.py b/bakta/io/gff.py
@@ -25,7 +25,6 @@ def write_gff3(genome, features_by_contig, gff3_path):
         fh.write(f'# annotated with Bakta (v{bakta.__version__}): https://github.com/oschwengers/bakta\n')
         fh.write(f"# database (v{cfg.db_info['major']}.{cfg.db_info['minor']}): https://doi.org/10.5281/zenodo.4247252\n")
 
-        feature_id_counter = 1
         for contig in genome['contigs']:  # write features
             fh.write(f"##sequence-region {contig['id']} 1 {contig['length']}\n")  # sequence region
 
@@ -156,24 +155,22 @@ def write_gff3(genome, features_by_contig, gff3_path):
                     fh.write(f"{feat['contig']}\tInfernal\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] is bc.FEATURE_NC_RNA_REGION):
                     annotations = {
-                        'ID': feature_id_counter,
+                        'ID': feat['id'],
                         'Name': feat['product'],
                         'product': feat['product'],
                         'Dbxref': feat['db_xrefs']
                     }
-                    feature_id_counter += 1
                     if(cfg.compliant):
                         annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                         annotations[bc.INSDC_FEATURE_REGULATORY_CLASS] = insdc.select_regulatory_class(feat)
                     annotations = encode_annotations(annotations)
                     fh.write(f"{feat['contig']}\tInfernal\t{so.SO_REGULATORY_REGION.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_CRISPR):
                     annotations = {
-                        'ID': feature_id_counter,
+                        'ID': feat['id'],
                         'Name': feat['product'],
                         'product': feat['product']
                     }
-                    feature_id_counter += 1
                     feat_type = so.SO_CRISPR.name
                     if(cfg.compliant):
                         feat_type = bc.INSDC_FEATURE_REPEAT_REGION
@@ -243,40 +240,36 @@ def write_gff3(genome, features_by_contig, gff3_path):
                     fh.write(f"{feat['contig']}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
                 elif(feat['type'] is bc.FEATURE_GAP):
                     annotations = {
-                        'ID': feature_id_counter,
+                        'ID': feat['id'],
                         'Name': f"gap ({feat['length']} bp)",
                         'product': f"gap ({feat['length']} bp)"
                     }
-                    feature_id_counter += 1
                     annotations = encode_annotations(annotations)
                     fh.write(f"{feat['contig']}\tBakta\t{so.SO_GAP.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_ORIC):
                     annotations = {
-                        'ID': feature_id_counter,
+                        'ID': feat['id'],
                         'Name': 'oriC',
                         'product': 'oriC'
                     }
-                    feature_id_counter += 1
                     annotations = encode_annotations(annotations)
                     feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
                     fh.write(f"{feat['contig']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_ORIV):
                     annotations = {
-                        'ID': feature_id_counter,
+                        'ID': feat['id'],
                         'Name': 'oriV',
                         'product': 'oriV'
                     }
-                    feature_id_counter += 1
                     annotations = encode_annotations(annotations)
                     feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
                     fh.write(f"{feat['contig']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_ORIT):
                     annotations = {
-                        'ID': feature_id_counter,
+                        'ID': feat['id'],
                         'Name': 'oriT',
                         'product': 'oriT'
                     }
-                    feature_id_counter += 1
                     annotations = encode_annotations(annotations)
                     feat_type = bc.INSDC_FEATURE_ORIGIN_TRANSFER if cfg.compliant else so.SO_ORIT.name
                     fh.write(f"{feat['contig']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")

diff --git a/bakta/main.py b/bakta/main.py
@@ -399,6 +399,8 @@ def main():
     print('select features and create locus tags...')
     log.debug('start feature selection and creation of locus tags')
     features_by_contig = {k['id']: [] for k in genome['contigs']}
+    feature_id = 1
+    feature_id_prefix = bu.create_locus_tag_prefix(contigs, length=10)
     for feature_type in [
             bc.FEATURE_T_RNA,
             bc.FEATURE_TM_RNA,
@@ -416,6 +418,8 @@ def main():
         feature_list = genome['features'].get(feature_type, [])
         for feature in feature_list:
             if('discarded' not in feature):
+                feature['id'] = f'{feature_id_prefix}_{feature_id}'
+                feature_id += 1
                 contig_features = features_by_contig.get(feature['contig'])
                 contig_features.append(feature)
     features = []

diff --git a/bakta/utils.py b/bakta/utils.py
@@ -215,22 +215,22 @@ def test_dependencies():
         test_dependency(DEPENDENCY_BLASTN)
 
 
-def create_locus_tag_prefix(contigs):
+def create_locus_tag_prefix(contigs, length=6):
     """Create either genus/species or sequence MD5 hex based locus tag prefix."""
     hash = hashlib.md5()
     for contig in contigs:
         hash.update(str.encode(contig['sequence']))
     hexdigest = hash.hexdigest().upper()
     locus_prefix_chars = []
     i = 0
-    while i < 6:
+    while i < length:
         c = hexdigest[i]
         if(c >= '0' and c <= '9'):
             c = chr(ord('F') + int(c) + 1)
         locus_prefix_chars.append(c)
         i += 1
     locus_prefix = ''.join(locus_prefix_chars)
-    log.info('generated locus-tag: prefix=%s, MD5=%s', locus_prefix, hexdigest)
+    log.info('generated sequence tag prefix: prefix=%s, length=%i, MD5=%s', locus_prefix, length, hexdigest)
     return locus_prefix