Skip to content

Commit

Permalink
Introduce auxiliary scripts (#251)
Browse files Browse the repository at this point in the history
* add region extraction script
* refactor region extraction script
* fix feature type comparison
* fix SO named tuple after JSON import
* add annotation stats aggregation script
* review extract region script
* add script section to readme
* try link to scripts in readme [skip ci]
* fix typo [skip ci]
  • Loading branch information
oschwengers authored Oct 25, 2023
1 parent 53df2bf commit 65b8caa
Show file tree
Hide file tree
Showing 5 changed files with 225 additions and 28 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Bakta is a tool for the rapid & standardized annotation of bacterial genomes and
- [Genome Submission](#genome-submission)
- [Protein bulk annotation](#protein-bulk-annotation)
- [Genome plots](#genome-plots)
- [Auxiliary scripts](#auxiliary-scripts)
- [Web version](#web-version)
- [Citation](#citation)
- [FAQ](#faq)
Expand Down Expand Up @@ -713,6 +714,15 @@ In the `cog` mode, all protein-coding genes (CDS) are colored due to assigned CO

In addition, both plot types share two innermost GC content and GC skew rings. The first ring represents the GC content per sliding window over the entire sequence(s) in green (`#33a02c`) and red `#e31a1c` representing GC above and below average, respectively. The 2nd ring represents the GC skew in orange (`#fdbf6f`) and blue (`#1f78b4`). The GC skew gives hints on a replicon's replication bubble and hence, on the completeness of the assembly. On a complete & circular bacterial chromosome, you normally see two inflection points at the origin of replication and at its opposite region -> [Wikipedia](https://en.wikipedia.org/wiki/GC_skew)
## Auxiliary scripts
Often, the usage of Bakta is a necessary upfront task followed by deeper analyses implemented in custom scripts. In [scripts](scripts) we'd like to collect & offer a pool of scripts addressing common tasks:

- `collect-annotation-stats.py`: Collect annotation stats for a cohort of genomes and print a condensed `TSV`.
- `extract-region.py`: Extract genome features within a given genomic range and export them as `GFF3`, `Embl`, `Genbank`, `FAA` and `FFN`

Of course, pull requests are welcome ;-)

## Web version

For further convenience, we developed an accompanying web application available at https://bakta.computational.bio.
Expand Down
16 changes: 8 additions & 8 deletions bakta/io/gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
if('edge' in feat):
stop += contig['length']

if(feat['type'] is bc.FEATURE_T_RNA):
if(feat['type'] == bc.FEATURE_T_RNA):
annotations = {
'ID': feat['locus'],
'Name': feat['product'],
Expand Down Expand Up @@ -84,7 +84,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
fh.write(f"{feat['contig']}\ttRNAscan-SE\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\ttRNAscan-SE\t{so.SO_TRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] is bc.FEATURE_TM_RNA):
elif(feat['type'] == bc.FEATURE_TM_RNA):
annotations = {
'ID': feat['locus'],
'Name': feat['product'],
Expand All @@ -107,7 +107,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
fh.write(f"{feat['contig']}\tAragorn\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\tAragorn\t{so.SO_TMRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] is bc.FEATURE_R_RNA):
elif(feat['type'] == bc.FEATURE_R_RNA):
annotations = {
'ID': feat['locus'],
'Name': feat['product'],
Expand All @@ -131,7 +131,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
fh.write(f"{feat['contig']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\tInfernal\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] is bc.FEATURE_NC_RNA):
elif(feat['type'] == bc.FEATURE_NC_RNA):
annotations = {
'ID': feat['locus'],
'Name': feat['product'],
Expand All @@ -156,7 +156,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
fh.write(f"{feat['contig']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\tInfernal\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] is bc.FEATURE_NC_RNA_REGION):
elif(feat['type'] == bc.FEATURE_NC_RNA_REGION):
annotations = {
'ID': feat['id'],
'Name': feat['product'],
Expand Down Expand Up @@ -209,7 +209,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
}
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] is bc.FEATURE_CDS):
elif(feat['type'] == bc.FEATURE_CDS):
annotations = {
'ID': feat['locus'],
'Name': feat['product'],
Expand Down Expand Up @@ -254,7 +254,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
fh.write(f"{feat['contig']}\tProdigal\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
write_signal_peptide(fh, feat)
elif(feat['type'] is bc.FEATURE_SORF):
elif(feat['type'] == bc.FEATURE_SORF):
annotations = {
'ID': feat['locus'],
'Name': feat['product'],
Expand Down Expand Up @@ -284,7 +284,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
fh.write(f"{feat['contig']}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
write_signal_peptide(fh, feat)
elif(feat['type'] is bc.FEATURE_GAP):
elif(feat['type'] == bc.FEATURE_GAP):
annotations = {
'ID': feat['id'],
'Name': f"gap ({feat['length']} bp)",
Expand Down
52 changes: 32 additions & 20 deletions bakta/io/insdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,33 +270,45 @@ def write_insdc(genome: dict, features: Sequence[dict], genbank_output_path: Pat


def select_ncrna_class(feature: dict) -> str:
if(feature['class'] is None):
feature_class = feature['class']
if(feature_class is None):
return bc.INSDC_FEATURE_NC_RNA_CLASS_OTHER
elif(feature['class'].id == so.SO_NCRNA_GENE_ANTISENSE.id):
return bc.INSDC_FEATURE_NC_RNA_CLASS_ANTISENSE
elif(feature['class'].id == so.SO_NCRNA_GENE_RIBOZYME.id):
return bc.INSDC_FEATURE_NC_RNA_CLASS_RIBOZYME
elif(feature['class'].id == so.SO_NCRNA_GENE_RNASEP.id):
return bc.INSDC_FEATURE_NC_RNA_CLASS_RNASEP
else:
return bc.INSDC_FEATURE_NC_RNA_CLASS_OTHER
if(isinstance(feature_class, list)): # workaround for JSON-imported features
feature_class = so.SO(feature_class[0], feature_class[1])
feature['class'] = feature_class

if(feature_class.id == so.SO_NCRNA_GENE_ANTISENSE.id):
return bc.INSDC_FEATURE_NC_RNA_CLASS_ANTISENSE
elif(feature_class.id == so.SO_NCRNA_GENE_RIBOZYME.id):
return bc.INSDC_FEATURE_NC_RNA_CLASS_RIBOZYME
elif(feature_class.id == so.SO_NCRNA_GENE_RNASEP.id):
return bc.INSDC_FEATURE_NC_RNA_CLASS_RNASEP
else:
return bc.INSDC_FEATURE_NC_RNA_CLASS_OTHER


def select_regulatory_class(feature: dict) -> str:
if(feature['class'] is None):
feature_class = feature['class']
if(feature_class is None):
return bc.INSDC_FEATURE_REGULATORY_CLASS_OTHER
elif(feature['class'].id == so.SO_CIS_REG_ATTENUATOR.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_ATTENUATOR
elif(feature['class'].id == so.SO_CIS_REG_RIBOSWITCH.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_RIBOSWITCH
elif(feature['class'].id == so.SO_CIS_REG_THERMOMETER.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_RESPONSE_ELEMENT
elif(feature['class'].id == so.SO_CIS_REG_RECODING_STIMULATION_REGION.id or feature['class'].id == so.SO_CIS_REG_FRAMESHIFT.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_RECODING_STIMULATORY_REGION
elif(feature['class'].id == so.SO_CIS_REG_RIBOSOME_BINDING_SITE.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_RIBOSOME_BINDING_SITE
else:
return bc.INSDC_FEATURE_REGULATORY_CLASS_OTHER
if(isinstance(feature_class, list)): # workaround for JSON-imported features
feature_class = so.SO(feature_class[0], feature_class[1])
feature['class'] = feature_class

if(feature_class.id == so.SO_CIS_REG_ATTENUATOR.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_ATTENUATOR
elif(feature_class.id == so.SO_CIS_REG_RIBOSWITCH.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_RIBOSWITCH
elif(feature_class.id == so.SO_CIS_REG_THERMOMETER.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_RESPONSE_ELEMENT
elif(feature_class.id == so.SO_CIS_REG_RECODING_STIMULATION_REGION.id or feature_class.id == so.SO_CIS_REG_FRAMESHIFT.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_RECODING_STIMULATORY_REGION
elif(feature_class.id == so.SO_CIS_REG_RIBOSOME_BINDING_SITE.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_RIBOSOME_BINDING_SITE
else:
return bc.INSDC_FEATURE_REGULATORY_CLASS_OTHER


def revise_product_insdc(product: str):
Expand Down
94 changes: 94 additions & 0 deletions scripts/collect-annotation-stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/python3

import argparse
import json
import os

from pathlib import Path

import bakta
import bakta.constants as bc


parser = argparse.ArgumentParser(
prog=f'collect-annotation-stats',
description='Collect annotation statistics and export as TSV',
epilog=f'Version: {bakta.__version__}\nDOI: {bc.BAKTA_DOI}\nURL: github.com/oschwengers/bakta\n\nCitation:\n{bc.BAKTA_CITATION}',
formatter_class=argparse.RawDescriptionHelpFormatter,
add_help=False
)
parser.add_argument('genomes', metavar='<genomes>', nargs='+', help='Bakta genome annotation files in JSON format')
parser.add_argument('--prefix', '-p', action='store', default='annotation-stats', help='Prefix for output file')
parser.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)')
args = parser.parse_args()


prefix = args.prefix
output_path = Path(args.output).resolve().joinpath(f'{prefix}.tsv')
with output_path.open('w') as fh_out:
fh_out.write(
'\t'.join(
[
'Genome',
'Taxon',
'Complete',
'Translation tale',
'# Sequences',
'Size',
'GC',
'N ratio',
'Coding ratio',
'tRNA',
'tmRNA',
'rRNA',
'ncRNA',
'ncRNA region',
'CRISPR',
'CDS',
'CDS hypothetical',
'CDS pseudogene',
'sORF',
'GAP',
'oriC',
'oriV',
'oriT'
]
)
)
fh_out.write('\n')
for genome in args.genomes:
genome_path = Path(genome).resolve()
try:
with genome_path.open() as fh_in:
genome = json.load(fh_in)
stats = [
genome_path.stem,
f"{' '.join([t for t in [genome['genome'].get('genus', None), genome['genome'].get('species', None), genome['genome'].get('strain', None)] if t is not None])}",
'y' if genome['genome']['complete'] else 'n',
f"{genome['genome']['translation_table']}",
f"{genome['stats']['no_sequences']}",
f"{genome['stats']['size']}",
f"{100 * genome['stats']['gc']:.1f}",
f"{100 * genome['stats']['n_ratio']:.1f}",
f"{genome['stats']['n50']}",
f"{100 * genome['stats']['coding_ratio']:.1f}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_T_RNA])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_TM_RNA])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_R_RNA])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_NC_RNA])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_NC_RNA_REGION])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CRISPR])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS and 'hypothetical' in f])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS and 'pseudogene' in f])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_SORF])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_GAP])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIC])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIV])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIT])}",
]
output_line = '\t'.join(stats)
print(output_line)
fh_out.write(f'{output_line}\n')
except:
print(f"Error reading genome {genome_path.stem}")
81 changes: 81 additions & 0 deletions scripts/extract-region.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/python3

import argparse
import json
import os

from pathlib import Path

import bakta
import bakta.constants as bc
import bakta.io.gff as gff
import bakta.io.insdc as insdc
import bakta.io.fasta as fasta
import bakta.config as cfg


parser = argparse.ArgumentParser(
prog=f'extract region',
description='Extract genomic region with a given range and exports selected features as GFF3, FAA, FFN, EMBL and Genbank',
epilog=f'Version: {bakta.__version__}\nDOI: {bc.BAKTA_DOI}\nURL: github.com/oschwengers/bakta\n\nCitation:\n{bc.BAKTA_CITATION}',
formatter_class=argparse.RawDescriptionHelpFormatter,
add_help=False
)
parser.add_argument('genome', metavar='<genome>', help='Bakta genome annotation in JSON format')
parser.add_argument('--prefix', '-p', action='store', default=None, help='Prefix for output files')
parser.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)')
parser.add_argument('--sequence', '-s', action='store', default=None, help='Sequence/Contig (default = first)')
parser.add_argument('--min', '-m', action='store', type=int, default=0, help='Left region border including (default = 0)')
parser.add_argument('--max', '-x', action='store', type=int, default=100_000_000, help='Right region border including (default = 100,000,000)')
args = parser.parse_args()


print('Load annotated genome...')
genome_path = Path(args.genome).resolve()
with genome_path.open() as fh:
genome = json.load(fh)

contig_id = args.sequence
if(contig_id is None): # take first sequence as default
contig_id = genome['sequences'][0]['id']

prefix = args.prefix
if(prefix is None): # use input file prefix as default
prefix = genome_path.stem


print('Extract features within selected region...')
features_selected = []
for feat in genome['features']:
if(feat['contig'] == contig_id):
if(feat['start'] >= args.min and feat['stop'] <= args.max):
features_selected.append(feat)
features_by_contig = {contig_id: features_selected} # needed for GFF3 export
print(f'\t...selected features: {len(features_selected)}')

genome['features'] = features_selected
genome['contigs'] = [sequence for sequence in genome['sequences'] if sequence['id'] == contig_id]
genome['genus'] = genome['genome']['genus']
genome['species'] = genome['genome']['species']
genome['strain'] = genome['genome']['strain']
genome['taxon'] = f"{genome['genome']['genus']} {genome['genome']['species']} {genome['genome']['strain']}"
cfg.db_info = {
'major': genome['version']['db']['version'].split('.')[0],
'minor': genome['version']['db']['version'].split('.')[1],
'type': genome['version']['db']['type']
}

print('Write selected features...')
output_path = Path(args.output).resolve()
gff3_path = output_path.joinpath(f'{prefix}.gff3')
gff.write_gff3(genome, features_by_contig, gff3_path)
print('\t...INSDC GenBank & EMBL')
genbank_path = output_path.joinpath(f'{prefix}.gbff')
embl_path = output_path.joinpath(f'{prefix}.embl')
insdc.write_insdc(genome, features_selected, genbank_path, embl_path)
print('\t...feature nucleotide sequences')
ffn_path = output_path.joinpath(f'{prefix}.ffn')
fasta.write_ffn(features_selected, ffn_path)
print('\t...translated CDS sequences')
faa_path = output_path.joinpath(f'{prefix}.faa')
fasta.write_faa(features_selected, faa_path)

0 comments on commit 65b8caa

Please sign in to comment.