Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce auxiliary scripts #251

Merged
merged 9 commits into from
Oct 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Bakta is a tool for the rapid & standardized annotation of bacterial genomes and
- [Genome Submission](#genome-submission)
- [Protein bulk annotation](#protein-bulk-annotation)
- [Genome plots](#genome-plots)
- [Auxiliary scripts](#auxiliary-scripts)
- [Web version](#web-version)
- [Citation](#citation)
- [FAQ](#faq)
Expand Down Expand Up @@ -713,6 +714,15 @@ In the `cog` mode, all protein-coding genes (CDS) are colored due to assigned CO

In addition, both plot types share two innermost GC content and GC skew rings. The first ring represents the GC content per sliding window over the entire sequence(s) in green (`#33a02c`) and red `#e31a1c` representing GC above and below average, respectively. The 2nd ring represents the GC skew in orange (`#fdbf6f`) and blue (`#1f78b4`). The GC skew gives hints on a replicon's replication bubble and hence, on the completeness of the assembly. On a complete & circular bacterial chromosome, you normally see two inflection points at the origin of replication and at its opposite region -> [Wikipedia](https://en.wikipedia.org/wiki/GC_skew)

## Auxiliary scripts

Often, the usage of Bakta is a necessary upfront task followed by deeper analyses implemented in custom scripts. In [scripts](scripts) we'd like to collect & offer a pool of scripts addressing common tasks:

- `collect-annotation-stats.py`: Collect annotation stats for a cohort of genomes and print a condensed `TSV`.
- `extract-region.py`: Extract genome features within a given genomic range and export them as `GFF3`, `Embl`, `Genbank`, `FAA` and `FFN`

Of course, pull requests are welcome ;-)

## Web version

For further convenience, we developed an accompanying web application available at https://bakta.computational.bio.
Expand Down
16 changes: 8 additions & 8 deletions bakta/io/gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
if('edge' in feat):
stop += contig['length']

if(feat['type'] is bc.FEATURE_T_RNA):
if(feat['type'] == bc.FEATURE_T_RNA):
annotations = {
'ID': feat['locus'],
'Name': feat['product'],
Expand Down Expand Up @@ -84,7 +84,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
fh.write(f"{feat['contig']}\ttRNAscan-SE\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\ttRNAscan-SE\t{so.SO_TRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] is bc.FEATURE_TM_RNA):
elif(feat['type'] == bc.FEATURE_TM_RNA):
annotations = {
'ID': feat['locus'],
'Name': feat['product'],
Expand All @@ -107,7 +107,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
fh.write(f"{feat['contig']}\tAragorn\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\tAragorn\t{so.SO_TMRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] is bc.FEATURE_R_RNA):
elif(feat['type'] == bc.FEATURE_R_RNA):
annotations = {
'ID': feat['locus'],
'Name': feat['product'],
Expand All @@ -131,7 +131,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
fh.write(f"{feat['contig']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\tInfernal\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] is bc.FEATURE_NC_RNA):
elif(feat['type'] == bc.FEATURE_NC_RNA):
annotations = {
'ID': feat['locus'],
'Name': feat['product'],
Expand All @@ -156,7 +156,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
fh.write(f"{feat['contig']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\tInfernal\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] is bc.FEATURE_NC_RNA_REGION):
elif(feat['type'] == bc.FEATURE_NC_RNA_REGION):
annotations = {
'ID': feat['id'],
'Name': feat['product'],
Expand Down Expand Up @@ -209,7 +209,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
}
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] is bc.FEATURE_CDS):
elif(feat['type'] == bc.FEATURE_CDS):
annotations = {
'ID': feat['locus'],
'Name': feat['product'],
Expand Down Expand Up @@ -254,7 +254,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
fh.write(f"{feat['contig']}\tProdigal\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
write_signal_peptide(fh, feat)
elif(feat['type'] is bc.FEATURE_SORF):
elif(feat['type'] == bc.FEATURE_SORF):
annotations = {
'ID': feat['locus'],
'Name': feat['product'],
Expand Down Expand Up @@ -284,7 +284,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
fh.write(f"{feat['contig']}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
write_signal_peptide(fh, feat)
elif(feat['type'] is bc.FEATURE_GAP):
elif(feat['type'] == bc.FEATURE_GAP):
annotations = {
'ID': feat['id'],
'Name': f"gap ({feat['length']} bp)",
Expand Down
52 changes: 32 additions & 20 deletions bakta/io/insdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,33 +270,45 @@ def write_insdc(genome: dict, features: Sequence[dict], genbank_output_path: Pat


def select_ncrna_class(feature: dict) -> str:
if(feature['class'] is None):
feature_class = feature['class']
if(feature_class is None):
return bc.INSDC_FEATURE_NC_RNA_CLASS_OTHER
elif(feature['class'].id == so.SO_NCRNA_GENE_ANTISENSE.id):
return bc.INSDC_FEATURE_NC_RNA_CLASS_ANTISENSE
elif(feature['class'].id == so.SO_NCRNA_GENE_RIBOZYME.id):
return bc.INSDC_FEATURE_NC_RNA_CLASS_RIBOZYME
elif(feature['class'].id == so.SO_NCRNA_GENE_RNASEP.id):
return bc.INSDC_FEATURE_NC_RNA_CLASS_RNASEP
else:
return bc.INSDC_FEATURE_NC_RNA_CLASS_OTHER
if(isinstance(feature_class, list)): # workaround for JSON-imported features
feature_class = so.SO(feature_class[0], feature_class[1])
feature['class'] = feature_class

if(feature_class.id == so.SO_NCRNA_GENE_ANTISENSE.id):
return bc.INSDC_FEATURE_NC_RNA_CLASS_ANTISENSE
elif(feature_class.id == so.SO_NCRNA_GENE_RIBOZYME.id):
return bc.INSDC_FEATURE_NC_RNA_CLASS_RIBOZYME
elif(feature_class.id == so.SO_NCRNA_GENE_RNASEP.id):
return bc.INSDC_FEATURE_NC_RNA_CLASS_RNASEP
else:
return bc.INSDC_FEATURE_NC_RNA_CLASS_OTHER


def select_regulatory_class(feature: dict) -> str:
if(feature['class'] is None):
feature_class = feature['class']
if(feature_class is None):
return bc.INSDC_FEATURE_REGULATORY_CLASS_OTHER
elif(feature['class'].id == so.SO_CIS_REG_ATTENUATOR.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_ATTENUATOR
elif(feature['class'].id == so.SO_CIS_REG_RIBOSWITCH.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_RIBOSWITCH
elif(feature['class'].id == so.SO_CIS_REG_THERMOMETER.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_RESPONSE_ELEMENT
elif(feature['class'].id == so.SO_CIS_REG_RECODING_STIMULATION_REGION.id or feature['class'].id == so.SO_CIS_REG_FRAMESHIFT.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_RECODING_STIMULATORY_REGION
elif(feature['class'].id == so.SO_CIS_REG_RIBOSOME_BINDING_SITE.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_RIBOSOME_BINDING_SITE
else:
return bc.INSDC_FEATURE_REGULATORY_CLASS_OTHER
if(isinstance(feature_class, list)): # workaround for JSON-imported features
feature_class = so.SO(feature_class[0], feature_class[1])
feature['class'] = feature_class

if(feature_class.id == so.SO_CIS_REG_ATTENUATOR.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_ATTENUATOR
elif(feature_class.id == so.SO_CIS_REG_RIBOSWITCH.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_RIBOSWITCH
elif(feature_class.id == so.SO_CIS_REG_THERMOMETER.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_RESPONSE_ELEMENT
elif(feature_class.id == so.SO_CIS_REG_RECODING_STIMULATION_REGION.id or feature_class.id == so.SO_CIS_REG_FRAMESHIFT.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_RECODING_STIMULATORY_REGION
elif(feature_class.id == so.SO_CIS_REG_RIBOSOME_BINDING_SITE.id):
return bc.INSDC_FEATURE_REGULATORY_CLASS_RIBOSOME_BINDING_SITE
else:
return bc.INSDC_FEATURE_REGULATORY_CLASS_OTHER


def revise_product_insdc(product: str):
Expand Down
94 changes: 94 additions & 0 deletions scripts/collect-annotation-stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/python3

import argparse
import json
import os

from pathlib import Path

import bakta
import bakta.constants as bc


parser = argparse.ArgumentParser(
prog=f'collect-annotation-stats',
description='Collect annotation statistics and export as TSV',
epilog=f'Version: {bakta.__version__}\nDOI: {bc.BAKTA_DOI}\nURL: github.com/oschwengers/bakta\n\nCitation:\n{bc.BAKTA_CITATION}',
formatter_class=argparse.RawDescriptionHelpFormatter,
add_help=False
)
parser.add_argument('genomes', metavar='<genomes>', nargs='+', help='Bakta genome annotation files in JSON format')
parser.add_argument('--prefix', '-p', action='store', default='annotation-stats', help='Prefix for output file')
parser.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)')
args = parser.parse_args()


prefix = args.prefix
output_path = Path(args.output).resolve().joinpath(f'{prefix}.tsv')
with output_path.open('w') as fh_out:
fh_out.write(
'\t'.join(
[
'Genome',
'Taxon',
'Complete',
'Translation tale',
'# Sequences',
'Size',
'GC',
'N ratio',
'Coding ratio',
'tRNA',
'tmRNA',
'rRNA',
'ncRNA',
'ncRNA region',
'CRISPR',
'CDS',
'CDS hypothetical',
'CDS pseudogene',
'sORF',
'GAP',
'oriC',
'oriV',
'oriT'
]
)
)
fh_out.write('\n')
for genome in args.genomes:
genome_path = Path(genome).resolve()
try:
with genome_path.open() as fh_in:
genome = json.load(fh_in)
stats = [
genome_path.stem,
f"{' '.join([t for t in [genome['genome'].get('genus', None), genome['genome'].get('species', None), genome['genome'].get('strain', None)] if t is not None])}",
'y' if genome['genome']['complete'] else 'n',
f"{genome['genome']['translation_table']}",
f"{genome['stats']['no_sequences']}",
f"{genome['stats']['size']}",
f"{100 * genome['stats']['gc']:.1f}",
f"{100 * genome['stats']['n_ratio']:.1f}",
f"{genome['stats']['n50']}",
f"{100 * genome['stats']['coding_ratio']:.1f}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_T_RNA])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_TM_RNA])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_R_RNA])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_NC_RNA])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_NC_RNA_REGION])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CRISPR])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS and 'hypothetical' in f])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS and 'pseudogene' in f])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_SORF])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_GAP])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIC])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIV])}",
f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIT])}",
]
output_line = '\t'.join(stats)
print(output_line)
fh_out.write(f'{output_line}\n')
except:
print(f"Error reading genome {genome_path.stem}")
81 changes: 81 additions & 0 deletions scripts/extract-region.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/python3

import argparse
import json
import os

from pathlib import Path

import bakta
import bakta.constants as bc
import bakta.io.gff as gff
import bakta.io.insdc as insdc
import bakta.io.fasta as fasta
import bakta.config as cfg


parser = argparse.ArgumentParser(
prog=f'extract region',
description='Extract genomic region with a given range and exports selected features as GFF3, FAA, FFN, EMBL and Genbank',
epilog=f'Version: {bakta.__version__}\nDOI: {bc.BAKTA_DOI}\nURL: github.com/oschwengers/bakta\n\nCitation:\n{bc.BAKTA_CITATION}',
formatter_class=argparse.RawDescriptionHelpFormatter,
add_help=False
)
parser.add_argument('genome', metavar='<genome>', help='Bakta genome annotation in JSON format')
parser.add_argument('--prefix', '-p', action='store', default=None, help='Prefix for output files')
parser.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)')
parser.add_argument('--sequence', '-s', action='store', default=None, help='Sequence/Contig (default = first)')
parser.add_argument('--min', '-m', action='store', type=int, default=0, help='Left region border including (default = 0)')
parser.add_argument('--max', '-x', action='store', type=int, default=100_000_000, help='Right region border including (default = 100,000,000)')
args = parser.parse_args()


print('Load annotated genome...')
genome_path = Path(args.genome).resolve()
with genome_path.open() as fh:
genome = json.load(fh)

contig_id = args.sequence
if(contig_id is None): # take first sequence as default
contig_id = genome['sequences'][0]['id']

prefix = args.prefix
if(prefix is None): # use input file prefix as default
prefix = genome_path.stem


print('Extract features within selected region...')
features_selected = []
for feat in genome['features']:
if(feat['contig'] == contig_id):
if(feat['start'] >= args.min and feat['stop'] <= args.max):
features_selected.append(feat)
features_by_contig = {contig_id: features_selected} # needed for GFF3 export
print(f'\t...selected features: {len(features_selected)}')

genome['features'] = features_selected
genome['contigs'] = [sequence for sequence in genome['sequences'] if sequence['id'] == contig_id]
genome['genus'] = genome['genome']['genus']
genome['species'] = genome['genome']['species']
genome['strain'] = genome['genome']['strain']
genome['taxon'] = f"{genome['genome']['genus']} {genome['genome']['species']} {genome['genome']['strain']}"
cfg.db_info = {
'major': genome['version']['db']['version'].split('.')[0],
'minor': genome['version']['db']['version'].split('.')[1],
'type': genome['version']['db']['type']
}

print('Write selected features...')
output_path = Path(args.output).resolve()
gff3_path = output_path.joinpath(f'{prefix}.gff3')
gff.write_gff3(genome, features_by_contig, gff3_path)
print('\t...INSDC GenBank & EMBL')
genbank_path = output_path.joinpath(f'{prefix}.gbff')
embl_path = output_path.joinpath(f'{prefix}.embl')
insdc.write_insdc(genome, features_selected, genbank_path, embl_path)
print('\t...feature nucleotide sequences')
ffn_path = output_path.joinpath(f'{prefix}.ffn')
fasta.write_ffn(features_selected, ffn_path)
print('\t...translated CDS sequences')
faa_path = output_path.joinpath(f'{prefix}.faa')
fasta.write_faa(features_selected, faa_path)