From 65b8caab64e3f4ff9f7ea4dc75f96bc1e12941c3 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Wed, 25 Oct 2023 16:46:33 +0200
Subject: [PATCH] Introduce auxiliary scripts (#251)

* add region extraction script
* refactor region extraction script
* fix feature type comparison
* fix SO named tuple after JSON import
* add annotation stats aggregation script
* review extract region script
* add script section to readme
* try link to scripts in readme [skip ci]
* fix typo [skip ci]
---
 README.md                           | 10 +++
 bakta/io/gff.py                     | 16 ++---
 bakta/io/insdc.py                   | 52 ++++++++++------
 scripts/collect-annotation-stats.py | 94 +++++++++++++++++++++++++++++
 scripts/extract-region.py           | 81 +++++++++++++++++++++++++
 5 files changed, 225 insertions(+), 28 deletions(-)
 create mode 100755 scripts/collect-annotation-stats.py
 create mode 100755 scripts/extract-region.py

diff --git a/README.md b/README.md
index 4a6c1266..458a3f21 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,7 @@ Bakta is a tool for the rapid & standardized annotation of bacterial genomes and
 - [Genome Submission](#genome-submission)
 - [Protein bulk annotation](#protein-bulk-annotation)
 - [Genome plots](#genome-plots)
+- [Auxiliary scripts](#auxiliary-scripts)
 - [Web version](#web-version)
 - [Citation](#citation)
 - [FAQ](#faq)
@@ -713,6 +714,15 @@ In the `cog` mode, all protein-coding genes (CDS) are colored due to assigned CO
 
 In addition, both plot types share two innermost GC content and GC skew rings. The first ring represents the GC content per sliding window over the entire sequence(s) in green (`#33a02c`) and red `#e31a1c` representing GC above and below average, respectively. The 2nd ring represents the GC skew in orange (`#fdbf6f`) and blue (`#1f78b4`). The GC skew gives hints on a replicon's replication bubble and hence, on the completeness of the assembly. On a complete & circular bacterial chromosome, you normally see two inflection points at the origin of replication and at its opposite region -> [Wikipedia](https://en.wikipedia.org/wiki/GC_skew)
 
+## Auxiliary scripts
+
+Often, the usage of Bakta is a necessary upfront task followed by deeper analyses implemented in custom scripts. In [scripts](scripts) we'd like to collect & offer a pool of scripts addressing common tasks:
+
+- `collect-annotation-stats.py`: Collect annotation stats for a cohort of genomes and print a condensed `TSV`.
+- `extract-region.py`: Extract genome features within a given genomic range and export them as `GFF3`, `Embl`, `Genbank`, `FAA` and `FFN`
+
+Of course, pull requests are welcome ;-)
+
 ## Web version
 
 For further convenience, we developed an accompanying web application available at https://bakta.computational.bio.
diff --git a/bakta/io/gff.py b/bakta/io/gff.py
index 91d0445b..25349eb7 100644
--- a/bakta/io/gff.py
+++ b/bakta/io/gff.py
@@ -50,7 +50,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
                 if('edge' in feat):
                     stop += contig['length']
 
-                if(feat['type'] is bc.FEATURE_T_RNA):
+                if(feat['type'] == bc.FEATURE_T_RNA):
                     annotations = {
                         'ID': feat['locus'],
                         'Name': feat['product'],
@@ -84,7 +84,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
                         fh.write(f"{feat['contig']}\ttRNAscan-SE\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     annotations = encode_annotations(annotations)
                     fh.write(f"{feat['contig']}\ttRNAscan-SE\t{so.SO_TRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
-                elif(feat['type'] is bc.FEATURE_TM_RNA):
+                elif(feat['type'] == bc.FEATURE_TM_RNA):
                     annotations = {
                         'ID': feat['locus'],
                         'Name': feat['product'],
@@ -107,7 +107,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
                         fh.write(f"{feat['contig']}\tAragorn\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     annotations = encode_annotations(annotations)
                     fh.write(f"{feat['contig']}\tAragorn\t{so.SO_TMRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
-                elif(feat['type'] is bc.FEATURE_R_RNA):
+                elif(feat['type'] == bc.FEATURE_R_RNA):
                     annotations = {
                         'ID': feat['locus'],
                         'Name': feat['product'],
@@ -131,7 +131,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
                         fh.write(f"{feat['contig']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     annotations = encode_annotations(annotations)
                     fh.write(f"{feat['contig']}\tInfernal\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
-                elif(feat['type'] is bc.FEATURE_NC_RNA):
+                elif(feat['type'] == bc.FEATURE_NC_RNA):
                     annotations = {
                         'ID': feat['locus'],
                         'Name': feat['product'],
@@ -156,7 +156,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
                         fh.write(f"{feat['contig']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     annotations = encode_annotations(annotations)
                     fh.write(f"{feat['contig']}\tInfernal\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
-                elif(feat['type'] is bc.FEATURE_NC_RNA_REGION):
+                elif(feat['type'] == bc.FEATURE_NC_RNA_REGION):
                     annotations = {
                         'ID': feat['id'],
                         'Name': feat['product'],
@@ -209,7 +209,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
                         }
                         annotations = encode_annotations(annotations)
                         fh.write(f"{feat['contig']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
-                elif(feat['type'] is bc.FEATURE_CDS):
+                elif(feat['type'] == bc.FEATURE_CDS):
                     annotations = {
                         'ID': feat['locus'],
                         'Name': feat['product'],
@@ -254,7 +254,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
                     fh.write(f"{feat['contig']}\tProdigal\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
                     if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
                         write_signal_peptide(fh, feat)
-                elif(feat['type'] is bc.FEATURE_SORF):
+                elif(feat['type'] == bc.FEATURE_SORF):
                     annotations = {
                         'ID': feat['locus'],
                         'Name': feat['product'],
@@ -284,7 +284,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
                     fh.write(f"{feat['contig']}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
                     if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
                         write_signal_peptide(fh, feat)
-                elif(feat['type'] is bc.FEATURE_GAP):
+                elif(feat['type'] == bc.FEATURE_GAP):
                     annotations = {
                         'ID': feat['id'],
                         'Name': f"gap ({feat['length']} bp)",
diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py
index 73280aa6..3b287afd 100644
--- a/bakta/io/insdc.py
+++ b/bakta/io/insdc.py
@@ -270,33 +270,45 @@ def write_insdc(genome: dict, features: Sequence[dict], genbank_output_path: Pat
 
 
 def select_ncrna_class(feature: dict) -> str:
-    if(feature['class'] is None):
+    feature_class = feature['class']
+    if(feature_class is None):
         return bc.INSDC_FEATURE_NC_RNA_CLASS_OTHER
-    elif(feature['class'].id == so.SO_NCRNA_GENE_ANTISENSE.id):
-        return bc.INSDC_FEATURE_NC_RNA_CLASS_ANTISENSE
-    elif(feature['class'].id == so.SO_NCRNA_GENE_RIBOZYME.id):
-        return bc.INSDC_FEATURE_NC_RNA_CLASS_RIBOZYME
-    elif(feature['class'].id == so.SO_NCRNA_GENE_RNASEP.id):
-        return bc.INSDC_FEATURE_NC_RNA_CLASS_RNASEP
     else:
-        return bc.INSDC_FEATURE_NC_RNA_CLASS_OTHER
+        if(isinstance(feature_class, list)):  #  workaround for JSON-imported features
+            feature_class = so.SO(feature_class[0], feature_class[1])
+            feature['class'] = feature_class
+
+        if(feature_class.id == so.SO_NCRNA_GENE_ANTISENSE.id):
+            return bc.INSDC_FEATURE_NC_RNA_CLASS_ANTISENSE
+        elif(feature_class.id == so.SO_NCRNA_GENE_RIBOZYME.id):
+            return bc.INSDC_FEATURE_NC_RNA_CLASS_RIBOZYME
+        elif(feature_class.id == so.SO_NCRNA_GENE_RNASEP.id):
+            return bc.INSDC_FEATURE_NC_RNA_CLASS_RNASEP
+        else:
+            return bc.INSDC_FEATURE_NC_RNA_CLASS_OTHER
 
 
 def select_regulatory_class(feature: dict) -> str:
-    if(feature['class'] is None):
+    feature_class = feature['class']
+    if(feature_class is None):
         return bc.INSDC_FEATURE_REGULATORY_CLASS_OTHER
-    elif(feature['class'].id == so.SO_CIS_REG_ATTENUATOR.id):
-        return bc.INSDC_FEATURE_REGULATORY_CLASS_ATTENUATOR
-    elif(feature['class'].id == so.SO_CIS_REG_RIBOSWITCH.id):
-        return bc.INSDC_FEATURE_REGULATORY_CLASS_RIBOSWITCH
-    elif(feature['class'].id == so.SO_CIS_REG_THERMOMETER.id):
-        return bc.INSDC_FEATURE_REGULATORY_CLASS_RESPONSE_ELEMENT
-    elif(feature['class'].id == so.SO_CIS_REG_RECODING_STIMULATION_REGION.id or feature['class'].id == so.SO_CIS_REG_FRAMESHIFT.id):
-        return bc.INSDC_FEATURE_REGULATORY_CLASS_RECODING_STIMULATORY_REGION
-    elif(feature['class'].id == so.SO_CIS_REG_RIBOSOME_BINDING_SITE.id):
-        return bc.INSDC_FEATURE_REGULATORY_CLASS_RIBOSOME_BINDING_SITE
     else:
-        return bc.INSDC_FEATURE_REGULATORY_CLASS_OTHER
+        if(isinstance(feature_class, list)):  #  workaround for JSON-imported features
+            feature_class = so.SO(feature_class[0], feature_class[1])
+            feature['class'] = feature_class
+
+        if(feature_class.id == so.SO_CIS_REG_ATTENUATOR.id):
+            return bc.INSDC_FEATURE_REGULATORY_CLASS_ATTENUATOR
+        elif(feature_class.id == so.SO_CIS_REG_RIBOSWITCH.id):
+            return bc.INSDC_FEATURE_REGULATORY_CLASS_RIBOSWITCH
+        elif(feature_class.id == so.SO_CIS_REG_THERMOMETER.id):
+            return bc.INSDC_FEATURE_REGULATORY_CLASS_RESPONSE_ELEMENT
+        elif(feature_class.id == so.SO_CIS_REG_RECODING_STIMULATION_REGION.id or feature_class.id == so.SO_CIS_REG_FRAMESHIFT.id):
+            return bc.INSDC_FEATURE_REGULATORY_CLASS_RECODING_STIMULATORY_REGION
+        elif(feature_class.id == so.SO_CIS_REG_RIBOSOME_BINDING_SITE.id):
+            return bc.INSDC_FEATURE_REGULATORY_CLASS_RIBOSOME_BINDING_SITE
+        else:
+            return bc.INSDC_FEATURE_REGULATORY_CLASS_OTHER
 
 
 def revise_product_insdc(product: str):
diff --git a/scripts/collect-annotation-stats.py b/scripts/collect-annotation-stats.py
new file mode 100755
index 00000000..cffc0b1b
--- /dev/null
+++ b/scripts/collect-annotation-stats.py
@@ -0,0 +1,94 @@
+#!/usr/bin/python3
+
+import argparse
+import json
+import os
+
+from pathlib import Path
+
+import bakta
+import bakta.constants as bc
+
+
+parser = argparse.ArgumentParser(
+    prog=f'collect-annotation-stats',
+    description='Collect annotation statistics and export as TSV',
+    epilog=f'Version: {bakta.__version__}\nDOI: {bc.BAKTA_DOI}\nURL: github.com/oschwengers/bakta\n\nCitation:\n{bc.BAKTA_CITATION}',
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    add_help=False
+)
+parser.add_argument('genomes', metavar='<genomes>', nargs='+', help='Bakta genome annotation files in JSON format')
+parser.add_argument('--prefix', '-p', action='store', default='annotation-stats', help='Prefix for output file')
+parser.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)')
+args = parser.parse_args()
+
+
+prefix = args.prefix
+output_path = Path(args.output).resolve().joinpath(f'{prefix}.tsv')
+with output_path.open('w') as fh_out:
+    fh_out.write(
+        '\t'.join(
+            [
+                'Genome',
+                'Taxon',
+                'Complete',
+                'Translation tale',
+                '# Sequences',
+                'Size',
+                'GC',
+                'N ratio',
+                'Coding ratio',
+                'tRNA',
+                'tmRNA',
+                'rRNA',
+                'ncRNA',
+                'ncRNA region',
+                'CRISPR',
+                'CDS',
+                'CDS hypothetical',
+                'CDS pseudogene',
+                'sORF',
+                'GAP',
+                'oriC',
+                'oriV',
+                'oriT'
+            ]
+        )
+    )
+    fh_out.write('\n')
+    for genome in args.genomes:
+        genome_path = Path(genome).resolve()
+        try:
+            with genome_path.open() as fh_in:
+                genome = json.load(fh_in)
+            stats = [
+                genome_path.stem,
+                f"{' '.join([t for t in [genome['genome'].get('genus', None), genome['genome'].get('species', None), genome['genome'].get('strain', None)] if t is not None])}",
+                'y' if genome['genome']['complete'] else 'n',
+                f"{genome['genome']['translation_table']}",
+                f"{genome['stats']['no_sequences']}",
+                f"{genome['stats']['size']}",
+                f"{100 * genome['stats']['gc']:.1f}",
+                f"{100 * genome['stats']['n_ratio']:.1f}",
+                f"{genome['stats']['n50']}",
+                f"{100 * genome['stats']['coding_ratio']:.1f}",
+                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_T_RNA])}",
+                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_TM_RNA])}",
+                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_R_RNA])}",
+                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_NC_RNA])}",
+                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_NC_RNA_REGION])}",
+                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CRISPR])}",
+                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS])}",
+                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS and 'hypothetical' in f])}",
+                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS and 'pseudogene' in f])}",
+                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_SORF])}",
+                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_GAP])}",
+                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIC])}",
+                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIV])}",
+                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIT])}",
+            ]
+            output_line = '\t'.join(stats)
+            print(output_line)
+            fh_out.write(f'{output_line}\n')
+        except:
+            print(f"Error reading genome {genome_path.stem}")
diff --git a/scripts/extract-region.py b/scripts/extract-region.py
new file mode 100755
index 00000000..6ddb90b1
--- /dev/null
+++ b/scripts/extract-region.py
@@ -0,0 +1,81 @@
+#!/usr/bin/python3
+
+import argparse
+import json
+import os
+
+from pathlib import Path
+
+import bakta
+import bakta.constants as bc
+import bakta.io.gff as gff
+import bakta.io.insdc as insdc
+import bakta.io.fasta as fasta
+import bakta.config as cfg
+
+
+parser = argparse.ArgumentParser(
+    prog=f'extract region',
+    description='Extract genomic region with a given range and exports selected features as GFF3, FAA, FFN, EMBL and Genbank',
+    epilog=f'Version: {bakta.__version__}\nDOI: {bc.BAKTA_DOI}\nURL: github.com/oschwengers/bakta\n\nCitation:\n{bc.BAKTA_CITATION}',
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    add_help=False
+)
+parser.add_argument('genome', metavar='<genome>', help='Bakta genome annotation in JSON format')
+parser.add_argument('--prefix', '-p', action='store', default=None, help='Prefix for output files')
+parser.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)')
+parser.add_argument('--sequence', '-s', action='store', default=None, help='Sequence/Contig (default = first)')
+parser.add_argument('--min', '-m', action='store', type=int, default=0, help='Left region border including (default = 0)')
+parser.add_argument('--max', '-x', action='store', type=int, default=100_000_000, help='Right region border including (default = 100,000,000)')
+args = parser.parse_args()
+
+
+print('Load annotated genome...')
+genome_path = Path(args.genome).resolve()
+with genome_path.open() as fh:
+    genome = json.load(fh)
+
+contig_id = args.sequence
+if(contig_id is None):  # take first sequence as default
+    contig_id = genome['sequences'][0]['id']
+
+prefix = args.prefix
+if(prefix is None):  # use input file prefix as default
+    prefix = genome_path.stem
+
+
+print('Extract features within selected region...')
+features_selected = []
+for feat in genome['features']:
+    if(feat['contig'] == contig_id):
+        if(feat['start'] >= args.min  and  feat['stop'] <= args.max):
+            features_selected.append(feat)
+features_by_contig = {contig_id: features_selected}  # needed for GFF3 export
+print(f'\t...selected features: {len(features_selected)}')
+
+genome['features'] = features_selected
+genome['contigs'] = [sequence for sequence in genome['sequences'] if sequence['id'] == contig_id]
+genome['genus'] = genome['genome']['genus']
+genome['species'] = genome['genome']['species']
+genome['strain'] = genome['genome']['strain']
+genome['taxon'] = f"{genome['genome']['genus']} {genome['genome']['species']} {genome['genome']['strain']}"
+cfg.db_info = {
+    'major': genome['version']['db']['version'].split('.')[0],
+    'minor': genome['version']['db']['version'].split('.')[1],
+    'type': genome['version']['db']['type']
+}
+
+print('Write selected features...')
+output_path = Path(args.output).resolve()
+gff3_path = output_path.joinpath(f'{prefix}.gff3')
+gff.write_gff3(genome, features_by_contig, gff3_path)
+print('\t...INSDC GenBank & EMBL')
+genbank_path = output_path.joinpath(f'{prefix}.gbff')
+embl_path = output_path.joinpath(f'{prefix}.embl')
+insdc.write_insdc(genome, features_selected, genbank_path, embl_path)
+print('\t...feature nucleotide sequences')
+ffn_path = output_path.joinpath(f'{prefix}.ffn')
+fasta.write_ffn(features_selected, ffn_path)
+print('\t...translated CDS sequences')
+faa_path = output_path.joinpath(f'{prefix}.faa')
+fasta.write_faa(features_selected, faa_path)
\ No newline at end of file