Skip to content

Commit

Permalink
introduce global unique feature ID
Browse files Browse the repository at this point in the history
  • Loading branch information
oschwengers committed Sep 22, 2021
1 parent f77e308 commit df32038
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 16 deletions.
19 changes: 6 additions & 13 deletions bakta/io/gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ def write_gff3(genome, features_by_contig, gff3_path):
fh.write(f'# annotated with Bakta (v{bakta.__version__}): https://github.com/oschwengers/bakta\n')
fh.write(f"# database (v{cfg.db_info['major']}.{cfg.db_info['minor']}): https://doi.org/10.5281/zenodo.4247252\n")

feature_id_counter = 1
for contig in genome['contigs']: # write features
fh.write(f"##sequence-region {contig['id']} 1 {contig['length']}\n") # sequence region

Expand Down Expand Up @@ -156,24 +155,22 @@ def write_gff3(genome, features_by_contig, gff3_path):
fh.write(f"{feat['contig']}\tInfernal\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] is bc.FEATURE_NC_RNA_REGION):
annotations = {
'ID': feature_id_counter,
'ID': feat['id'],
'Name': feat['product'],
'product': feat['product'],
'Dbxref': feat['db_xrefs']
}
feature_id_counter += 1
if(cfg.compliant):
annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs']) # remove INSDC invalid DbXrefs
annotations[bc.INSDC_FEATURE_REGULATORY_CLASS] = insdc.select_regulatory_class(feat)
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\tInfernal\t{so.SO_REGULATORY_REGION.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] == bc.FEATURE_CRISPR):
annotations = {
'ID': feature_id_counter,
'ID': feat['id'],
'Name': feat['product'],
'product': feat['product']
}
feature_id_counter += 1
feat_type = so.SO_CRISPR.name
if(cfg.compliant):
feat_type = bc.INSDC_FEATURE_REPEAT_REGION
Expand Down Expand Up @@ -243,40 +240,36 @@ def write_gff3(genome, features_by_contig, gff3_path):
fh.write(f"{feat['contig']}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
elif(feat['type'] is bc.FEATURE_GAP):
annotations = {
'ID': feature_id_counter,
'ID': feat['id'],
'Name': f"gap ({feat['length']} bp)",
'product': f"gap ({feat['length']} bp)"
}
feature_id_counter += 1
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\tBakta\t{so.SO_GAP.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] == bc.FEATURE_ORIC):
annotations = {
'ID': feature_id_counter,
'ID': feat['id'],
'Name': 'oriC',
'product': 'oriC'
}
feature_id_counter += 1
annotations = encode_annotations(annotations)
feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
fh.write(f"{feat['contig']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] == bc.FEATURE_ORIV):
annotations = {
'ID': feature_id_counter,
'ID': feat['id'],
'Name': 'oriV',
'product': 'oriV'
}
feature_id_counter += 1
annotations = encode_annotations(annotations)
feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
fh.write(f"{feat['contig']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] == bc.FEATURE_ORIT):
annotations = {
'ID': feature_id_counter,
'ID': feat['id'],
'Name': 'oriT',
'product': 'oriT'
}
feature_id_counter += 1
annotations = encode_annotations(annotations)
feat_type = bc.INSDC_FEATURE_ORIGIN_TRANSFER if cfg.compliant else so.SO_ORIT.name
fh.write(f"{feat['contig']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
Expand Down
4 changes: 4 additions & 0 deletions bakta/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,8 @@ def main():
print('select features and create locus tags...')
log.debug('start feature selection and creation of locus tags')
features_by_contig = {k['id']: [] for k in genome['contigs']}
feature_id = 1
feature_id_prefix = bu.create_locus_tag_prefix(contigs, length=10)
for feature_type in [
bc.FEATURE_T_RNA,
bc.FEATURE_TM_RNA,
Expand All @@ -416,6 +418,8 @@ def main():
feature_list = genome['features'].get(feature_type, [])
for feature in feature_list:
if('discarded' not in feature):
feature['id'] = f'{feature_id_prefix}_{feature_id}'
feature_id += 1
contig_features = features_by_contig.get(feature['contig'])
contig_features.append(feature)
features = []
Expand Down
6 changes: 3 additions & 3 deletions bakta/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,22 +215,22 @@ def test_dependencies():
test_dependency(DEPENDENCY_BLASTN)


def create_locus_tag_prefix(contigs, length=6):
    """Create a sequence-derived, letters-only tag prefix.

    All contig sequences are hashed together with MD5 (in the given order).
    The first ``length`` characters of the upper-case hex digest form the
    prefix, with every decimal digit replaced by a letter (0-9 -> G-P), so
    the result only contains the letters A-P.

    Args:
        contigs: iterable of dicts, each holding its sequence as a string
            under the key ``'sequence'``.
        length: number of prefix characters to generate. It cannot exceed
            the size of an MD5 hex digest (32 characters).

    Returns:
        The prefix as a string of ``length`` upper-case letters.

    Raises:
        ValueError: if ``length`` is negative or larger than the digest.
    """
    digest = hashlib.md5()
    for contig in contigs:
        digest.update(contig['sequence'].encode())
    hexdigest = digest.hexdigest().upper()

    # The prefix is cut from the digest, so it can never be longer than the
    # digest itself; fail with a clear message instead of an IndexError.
    if not 0 <= length <= len(hexdigest):
        raise ValueError(
            f'prefix length must be between 0 and {len(hexdigest)}, got {length}'
        )

    # Shift digits past the hex letters (0 -> G, ..., 9 -> P) to obtain a
    # purely alphabetic prefix.
    digit_to_letter = str.maketrans('0123456789', 'GHIJKLMNOP')
    locus_prefix = hexdigest[:length].translate(digit_to_letter)
    log.info('generated sequence tag prefix: prefix=%s, length=%i, MD5=%s', locus_prefix, length, hexdigest)
    return locus_prefix


Expand Down

0 comments on commit df32038

Please sign in to comment.