From 3c4a55e44451ebb8cd16e09f28f891194cca3de9 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Fri, 18 Feb 2022 09:40:50 +0100 Subject: [PATCH 1/9] add region extraction script --- scripts/extract-region.py | 72 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 scripts/extract-region.py diff --git a/scripts/extract-region.py b/scripts/extract-region.py new file mode 100644 index 00000000..5c2ce1d0 --- /dev/null +++ b/scripts/extract-region.py @@ -0,0 +1,72 @@ +import json +import os + +from pathlib import Path + +from matplotlib.font_manager import json_load + +import bakta.utils as bu +import bakta.io.gff as gff +import bakta.io.insdc as insdc +import bakta.io.fasta as fasta +import bakta.config as cfg + +parser = bu.init_parser() +parser.add_argument('genome', metavar='', help='Bakta genome annotation in JSON format') +parser.add_argument('--prefix', '-p', action='store', default=None, help='Prefix for output files') +parser.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)') +parser.add_argument('--contig', '-c', action='store', default=None, help='Sequence/Contig (default = first)') +parser.add_argument('--min', '-m', action='store', type=int, default=0, help='Left region border including (default = 0)') +parser.add_argument('--max', '-x', action='store', type=int, default=0, help='Right region border including (default = genome size)') +args = parser.parse_args() + + +genome_path = Path(args.genome).resolve() +genome = json_load(genome_path) + + +contig = args.contig +if(contig is None): + contig = genome['sequences'][0]['id'] + +prefix = args.prefix +if(prefix is None): + prefix = genome_path.stem + + +features_by_contig = {k['id']: [] for k in genome['sequences']} +features_selected = [] +for feat in genome['features']: + contig_features = features_by_contig.get(feat['contig']) + contig_features.append(feat) + # if(feat['start'] >= args.min and feat['stop'] <= args.max): + if(feat['contig'] == contig and feat['start'] >= args.min and feat['stop'] <= args.max): + # print(f"select feature: {feat}") + features_selected.append(feat) + +genome['features'] = features_selected +genome['contigs'] = genome['sequences'] + +genome['genus'] = genome['genome']['genus'] +genome['species'] = genome['genome']['species'] +genome['strain'] = genome['genome']['strain'] +genome['taxon'] = f"{genome['genome']['genus']} {genome['genome']['species']} {genome['genome']['strain']}" + +cfg.db_info = { + 'major': genome['version']['db'].split('.')[0], + 'minor': genome['version']['db'].split('.')[1] +} + +output_path = Path(args.output).resolve() +gff3_path = output_path.joinpath(f'{prefix}.gff3') +gff.write_gff3(genome, features_by_contig, gff3_path) +print('INSDC GenBank & EMBL...') +genbank_path = output_path.joinpath(f'{prefix}.gbff') +embl_path = output_path.joinpath(f'{prefix}.embl') +insdc.write_insdc(genome, features_selected, genbank_path, embl_path) +print('feature nucleotide sequences...') +ffn_path = output_path.joinpath(f'{prefix}.ffn') +fasta.write_ffn(features_selected, ffn_path) +print('translated CDS sequences...') +faa_path = output_path.joinpath(f'{prefix}.faa') +fasta.write_faa(features_selected, faa_path) \ No newline at end of file From e9cbea255b2476259ce41cc1eb6856c9e4954940 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Wed, 25 Oct 2023 14:32:39 +0200 Subject: [PATCH 2/9] refactor region extraction script --- scripts/extract-region.py | 50 ++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 24 deletions(-) mode change 100644 => 100755 scripts/extract-region.py diff --git a/scripts/extract-region.py b/scripts/extract-region.py old mode 100644 new mode 100755 index 5c2ce1d0..ff8308cc --- a/scripts/extract-region.py +++ b/scripts/extract-region.py @@ -1,10 +1,10 @@ +#!/usr/bin/python3 + import json import os from pathlib import Path -from matplotlib.font_manager import json_load - import bakta.utils as bu import bakta.io.gff as gff import bakta.io.insdc as insdc @@ -15,58 +15,60 @@ parser.add_argument('genome', metavar='', help='Bakta genome annotation in JSON format') parser.add_argument('--prefix', '-p', action='store', default=None, help='Prefix for output files') parser.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)') -parser.add_argument('--contig', '-c', action='store', default=None, help='Sequence/Contig (default = first)') +parser.add_argument('--sequence', '-s', action='store', default=None, help='Sequence/Contig (default = first)') parser.add_argument('--min', '-m', action='store', type=int, default=0, help='Left region border including (default = 0)') -parser.add_argument('--max', '-x', action='store', type=int, default=0, help='Right region border including (default = genome size)') +parser.add_argument('--max', '-x', action='store', type=int, default=100_000_000, help='Right region border including (default = 100,000,000)') args = parser.parse_args() +print('Load annotated genome...') genome_path = Path(args.genome).resolve() -genome = json_load(genome_path) +with genome_path.open() as fh: + genome = json.load(fh) -contig = args.contig -if(contig is None): - contig = genome['sequences'][0]['id'] +contig_id = args.sequence +if(contig_id is None): # take first sequence as default + contig_id = genome['sequences'][0]['id'] prefix = args.prefix -if(prefix is None): +if(prefix is None): # use input file prefix as default prefix = genome_path.stem -features_by_contig = {k['id']: [] for k in genome['sequences']} +print('Extract features within selected region...') features_selected = [] for feat in genome['features']: - contig_features = features_by_contig.get(feat['contig']) - contig_features.append(feat) - # if(feat['start'] >= args.min and feat['stop'] <= args.max): - if(feat['contig'] == contig and feat['start'] >= args.min and feat['stop'] <= args.max): - # print(f"select feature: {feat}") - features_selected.append(feat) + if(feat['contig'] == contig_id): + if(feat['start'] >= args.min and feat['stop'] <= args.max): + features_selected.append(feat) +features_by_contig = {contig_id: features_selected} # needed for GFF3 export +print(f'\t...selected features: {len(features_selected)}') -genome['features'] = features_selected -genome['contigs'] = genome['sequences'] +genome['features'] = features_selected +genome['contigs'] = [sequence for sequence in genome['sequences'] if sequence['id'] == contig_id] genome['genus'] = genome['genome']['genus'] genome['species'] = genome['genome']['species'] genome['strain'] = genome['genome']['strain'] genome['taxon'] = f"{genome['genome']['genus']} {genome['genome']['species']} {genome['genome']['strain']}" - cfg.db_info = { - 'major': genome['version']['db'].split('.')[0], - 'minor': genome['version']['db'].split('.')[1] + 'major': genome['version']['db']['version'].split('.')[0], + 'minor': genome['version']['db']['version'].split('.')[1], + 'type': genome['version']['db']['type'] } +print('Write selected features...') output_path = Path(args.output).resolve() gff3_path = output_path.joinpath(f'{prefix}.gff3') gff.write_gff3(genome, features_by_contig, gff3_path) -print('INSDC GenBank & EMBL...') +print('\t...INSDC GenBank & EMBL') genbank_path = output_path.joinpath(f'{prefix}.gbff') embl_path = output_path.joinpath(f'{prefix}.embl') insdc.write_insdc(genome, features_selected, genbank_path, embl_path) -print('feature nucleotide sequences...') +print('\t...feature nucleotide sequences') ffn_path = output_path.joinpath(f'{prefix}.ffn') fasta.write_ffn(features_selected, ffn_path) -print('translated CDS sequences...') +print('\t...translated CDS sequences') faa_path = output_path.joinpath(f'{prefix}.faa') fasta.write_faa(features_selected, faa_path) \ No newline at end of file From 009843670b1d18851c37295fb2b32e09d3904caa Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Wed, 25 Oct 2023 14:32:58 +0200 Subject: [PATCH 3/9] fix feature type comparison --- bakta/io/gff.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/bakta/io/gff.py b/bakta/io/gff.py index 91d0445b..25349eb7 100644 --- a/bakta/io/gff.py +++ b/bakta/io/gff.py @@ -50,7 +50,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat if('edge' in feat): stop += contig['length'] - if(feat['type'] is bc.FEATURE_T_RNA): + if(feat['type'] == bc.FEATURE_T_RNA): annotations = { 'ID': feat['locus'], 'Name': feat['product'], @@ -84,7 +84,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat fh.write(f"{feat['contig']}\ttRNAscan-SE\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") annotations = encode_annotations(annotations) fh.write(f"{feat['contig']}\ttRNAscan-SE\t{so.SO_TRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") - elif(feat['type'] is bc.FEATURE_TM_RNA): + elif(feat['type'] == bc.FEATURE_TM_RNA): annotations = { 'ID': feat['locus'], 'Name': feat['product'], @@ -107,7 +107,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat fh.write(f"{feat['contig']}\tAragorn\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") annotations = encode_annotations(annotations) fh.write(f"{feat['contig']}\tAragorn\t{so.SO_TMRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") - elif(feat['type'] is bc.FEATURE_R_RNA): + elif(feat['type'] == bc.FEATURE_R_RNA): annotations = { 'ID': feat['locus'], 'Name': feat['product'], @@ -131,7 +131,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat fh.write(f"{feat['contig']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") annotations = encode_annotations(annotations) fh.write(f"{feat['contig']}\tInfernal\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n") - elif(feat['type'] is bc.FEATURE_NC_RNA): + elif(feat['type'] == bc.FEATURE_NC_RNA): annotations = { 'ID': feat['locus'], 'Name': feat['product'], @@ -156,7 +156,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat fh.write(f"{feat['contig']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") annotations = encode_annotations(annotations) fh.write(f"{feat['contig']}\tInfernal\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n") - elif(feat['type'] is bc.FEATURE_NC_RNA_REGION): + elif(feat['type'] == bc.FEATURE_NC_RNA_REGION): annotations = { 'ID': feat['id'], 'Name': feat['product'], @@ -209,7 +209,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat } annotations = encode_annotations(annotations) fh.write(f"{feat['contig']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") - elif(feat['type'] is bc.FEATURE_CDS): + elif(feat['type'] == bc.FEATURE_CDS): annotations = { 'ID': feat['locus'], 'Name': feat['product'], @@ -254,7 +254,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat fh.write(f"{feat['contig']}\tProdigal\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n") if(bc.FEATURE_SIGNAL_PEPTIDE in feat): write_signal_peptide(fh, feat) - elif(feat['type'] is bc.FEATURE_SORF): + elif(feat['type'] == bc.FEATURE_SORF): annotations = { 'ID': feat['locus'], 'Name': feat['product'], @@ -284,7 +284,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat fh.write(f"{feat['contig']}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n") if(bc.FEATURE_SIGNAL_PEPTIDE in feat): write_signal_peptide(fh, feat) - elif(feat['type'] is bc.FEATURE_GAP): + elif(feat['type'] == bc.FEATURE_GAP): annotations = { 'ID': feat['id'], 'Name': f"gap ({feat['length']} bp)", From 149cb6e317597ef30ad675be330213f2c9ce6ec2 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Wed, 25 Oct 2023 14:33:27 +0200 Subject: [PATCH 4/9] fix SO named tuple after JSON import --- bakta/io/insdc.py | 52 +++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py index 73280aa6..3b287afd 100644 --- a/bakta/io/insdc.py +++ b/bakta/io/insdc.py @@ -270,33 +270,45 @@ def write_insdc(genome: dict, features: Sequence[dict], genbank_output_path: Pat def select_ncrna_class(feature: dict) -> str: - if(feature['class'] is None): + feature_class = feature['class'] + if(feature_class is None): return bc.INSDC_FEATURE_NC_RNA_CLASS_OTHER - elif(feature['class'].id == so.SO_NCRNA_GENE_ANTISENSE.id): - return bc.INSDC_FEATURE_NC_RNA_CLASS_ANTISENSE - elif(feature['class'].id == so.SO_NCRNA_GENE_RIBOZYME.id): - return bc.INSDC_FEATURE_NC_RNA_CLASS_RIBOZYME - elif(feature['class'].id == so.SO_NCRNA_GENE_RNASEP.id): - return bc.INSDC_FEATURE_NC_RNA_CLASS_RNASEP else: - return bc.INSDC_FEATURE_NC_RNA_CLASS_OTHER + if(isinstance(feature_class, list)): # workaround for JSON-imported features + feature_class = so.SO(feature_class[0], feature_class[1]) + feature['class'] = feature_class + + if(feature_class.id == so.SO_NCRNA_GENE_ANTISENSE.id): + return bc.INSDC_FEATURE_NC_RNA_CLASS_ANTISENSE + elif(feature_class.id == so.SO_NCRNA_GENE_RIBOZYME.id): + return bc.INSDC_FEATURE_NC_RNA_CLASS_RIBOZYME + elif(feature_class.id == so.SO_NCRNA_GENE_RNASEP.id): + return bc.INSDC_FEATURE_NC_RNA_CLASS_RNASEP + else: + return bc.INSDC_FEATURE_NC_RNA_CLASS_OTHER def select_regulatory_class(feature: dict) -> str: - if(feature['class'] is None): + feature_class = feature['class'] + if(feature_class is None): return bc.INSDC_FEATURE_REGULATORY_CLASS_OTHER - elif(feature['class'].id == so.SO_CIS_REG_ATTENUATOR.id): - return bc.INSDC_FEATURE_REGULATORY_CLASS_ATTENUATOR - elif(feature['class'].id == so.SO_CIS_REG_RIBOSWITCH.id): - return bc.INSDC_FEATURE_REGULATORY_CLASS_RIBOSWITCH - elif(feature['class'].id == so.SO_CIS_REG_THERMOMETER.id): - return bc.INSDC_FEATURE_REGULATORY_CLASS_RESPONSE_ELEMENT - elif(feature['class'].id == so.SO_CIS_REG_RECODING_STIMULATION_REGION.id or feature['class'].id == so.SO_CIS_REG_FRAMESHIFT.id): - return bc.INSDC_FEATURE_REGULATORY_CLASS_RECODING_STIMULATORY_REGION - elif(feature['class'].id == so.SO_CIS_REG_RIBOSOME_BINDING_SITE.id): - return bc.INSDC_FEATURE_REGULATORY_CLASS_RIBOSOME_BINDING_SITE else: - return bc.INSDC_FEATURE_REGULATORY_CLASS_OTHER + if(isinstance(feature_class, list)): # workaround for JSON-imported features + feature_class = so.SO(feature_class[0], feature_class[1]) + feature['class'] = feature_class + + if(feature_class.id == so.SO_CIS_REG_ATTENUATOR.id): + return bc.INSDC_FEATURE_REGULATORY_CLASS_ATTENUATOR + elif(feature_class.id == so.SO_CIS_REG_RIBOSWITCH.id): + return bc.INSDC_FEATURE_REGULATORY_CLASS_RIBOSWITCH + elif(feature_class.id == so.SO_CIS_REG_THERMOMETER.id): + return bc.INSDC_FEATURE_REGULATORY_CLASS_RESPONSE_ELEMENT + elif(feature_class.id == so.SO_CIS_REG_RECODING_STIMULATION_REGION.id or feature_class.id == so.SO_CIS_REG_FRAMESHIFT.id): + return bc.INSDC_FEATURE_REGULATORY_CLASS_RECODING_STIMULATORY_REGION + elif(feature_class.id == so.SO_CIS_REG_RIBOSOME_BINDING_SITE.id): + return bc.INSDC_FEATURE_REGULATORY_CLASS_RIBOSOME_BINDING_SITE + else: + return bc.INSDC_FEATURE_REGULATORY_CLASS_OTHER def revise_product_insdc(product: str): From 635353be2884fb3cdebf65578af7af71f47f82b5 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Wed, 25 Oct 2023 16:15:16 +0200 Subject: [PATCH 5/9] add annotation stats aggregation script --- scripts/collect-annotation-stats.py | 94 +++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100755 scripts/collect-annotation-stats.py diff --git a/scripts/collect-annotation-stats.py b/scripts/collect-annotation-stats.py new file mode 100755 index 00000000..cffc0b1b --- /dev/null +++ b/scripts/collect-annotation-stats.py @@ -0,0 +1,94 @@ +#!/usr/bin/python3 + +import argparse +import json +import os + +from pathlib import Path + +import bakta +import bakta.constants as bc + + +parser = argparse.ArgumentParser( + prog=f'collect-annotation-stats', + description='Collect annotation statistics and export as TSV', + epilog=f'Version: {bakta.__version__}\nDOI: {bc.BAKTA_DOI}\nURL: github.com/oschwengers/bakta\n\nCitation:\n{bc.BAKTA_CITATION}', + formatter_class=argparse.RawDescriptionHelpFormatter, + add_help=False +) +parser.add_argument('genomes', metavar='', nargs='+', help='Bakta genome annotation files in JSON format') +parser.add_argument('--prefix', '-p', action='store', default='annotation-stats', help='Prefix for output file') +parser.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)') +args = parser.parse_args() + + +prefix = args.prefix +output_path = Path(args.output).resolve().joinpath(f'{prefix}.tsv') +with output_path.open('w') as fh_out: + fh_out.write( + '\t'.join( + [ + 'Genome', + 'Taxon', + 'Complete', + 'Translation tale', + '# Sequences', + 'Size', + 'GC', + 'N ratio', + 'Coding ratio', + 'tRNA', + 'tmRNA', + 'rRNA', + 'ncRNA', + 'ncRNA region', + 'CRISPR', + 'CDS', + 'CDS hypothetical', + 'CDS pseudogene', + 'sORF', + 'GAP', + 'oriC', + 'oriV', + 'oriT' + ] + ) + ) + fh_out.write('\n') + for genome in args.genomes: + genome_path = Path(genome).resolve() + try: + with genome_path.open() as fh_in: + genome = json.load(fh_in) + stats = [ + genome_path.stem, + f"{' '.join([t for t in [genome['genome'].get('genus', None), genome['genome'].get('species', None), genome['genome'].get('strain', None)] if t is not None])}", + 'y' if genome['genome']['complete'] else 'n', + f"{genome['genome']['translation_table']}", + f"{genome['stats']['no_sequences']}", + f"{genome['stats']['size']}", + f"{100 * genome['stats']['gc']:.1f}", + f"{100 * genome['stats']['n_ratio']:.1f}", + f"{genome['stats']['n50']}", + f"{100 * genome['stats']['coding_ratio']:.1f}", + f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_T_RNA])}", + f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_TM_RNA])}", + f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_R_RNA])}", + f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_NC_RNA])}", + f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_NC_RNA_REGION])}", + f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CRISPR])}", + f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS])}", + f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS and 'hypothetical' in f])}", + f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS and 'pseudogene' in f])}", + f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_SORF])}", + f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_GAP])}", + f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIC])}", + f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIV])}", + f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIT])}", + ] + output_line = '\t'.join(stats) + print(output_line) + fh_out.write(f'{output_line}\n') + except: + print(f"Error reading genome {genome_path.stem}") From 019dde4555f9d4b7cfccf8f9fd8e0ebd94a5f34f Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Wed, 25 Oct 2023 16:20:11 +0200 Subject: [PATCH 6/9] review extract region script --- scripts/extract-region.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/scripts/extract-region.py b/scripts/extract-region.py index ff8308cc..6ddb90b1 100755 --- a/scripts/extract-region.py +++ b/scripts/extract-region.py @@ -1,17 +1,26 @@ #!/usr/bin/python3 +import argparse import json import os from pathlib import Path -import bakta.utils as bu +import bakta +import bakta.constants as bc import bakta.io.gff as gff import bakta.io.insdc as insdc import bakta.io.fasta as fasta import bakta.config as cfg -parser = bu.init_parser() + +parser = argparse.ArgumentParser( + prog=f'extract region', + description='Extract genomic region with a given range and exports selected features as GFF3, FAA, FFN, EMBL and Genbank', + epilog=f'Version: {bakta.__version__}\nDOI: {bc.BAKTA_DOI}\nURL: github.com/oschwengers/bakta\n\nCitation:\n{bc.BAKTA_CITATION}', + formatter_class=argparse.RawDescriptionHelpFormatter, + add_help=False +) parser.add_argument('genome', metavar='', help='Bakta genome annotation in JSON format') parser.add_argument('--prefix', '-p', action='store', default=None, help='Prefix for output files') parser.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)') @@ -26,7 +35,6 @@ with genome_path.open() as fh: genome = json.load(fh) - contig_id = args.sequence if(contig_id is None): # take first sequence as default contig_id = genome['sequences'][0]['id'] @@ -45,7 +53,6 @@ features_by_contig = {contig_id: features_selected} # needed for GFF3 export print(f'\t...selected features: {len(features_selected)}') - genome['features'] = features_selected genome['contigs'] = [sequence for sequence in genome['sequences'] if sequence['id'] == contig_id] genome['genus'] = genome['genome']['genus'] From 0dec0c3e7542f310081651d2229067c00fbaf40f Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Wed, 25 Oct 2023 16:37:37 +0200 Subject: [PATCH 7/9] add script section to readme --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 4a6c1266..3e88161c 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ Bakta is a tool for the rapid & standardized annotation of bacterial genomes and - [Genome Submission](#genome-submission) - [Protein bulk annotation](#protein-bulk-annotation) - [Genome plots](#genome-plots) +- [Auxiliary scripts](#auxiliary-scripts) - [Web version](#web-version) - [Citation](#citation) - [FAQ](#faq) @@ -713,6 +714,15 @@ In the `cog` mode, all protein-coding genes (CDS) are colored due to assigned CO In addition, both plot types share two innermost GC content and GC skew rings. The first ring represents the GC content per sliding window over the entire sequence(s) in green (`#33a02c`) and red `#e31a1c` representing GC above and below average, respectively. The 2nd ring represents the GC skew in orange (`#fdbf6f`) and blue (`#1f78b4`). The GC skew gives hints on a replicon's replication bubble and hence, on the completeness of the assembly. On a complete & circular bacterial chromosome, you normally see two inflection points at the origin of replication and at its opposite region -> [Wikipedia](https://en.wikipedia.org/wiki/GC_skew) +## Auxiliary scripts + +Often, the usage of Bakta is a necessary upfront task followd by deeper analyses implemented in custom scripts. In (scripts) we'd like to collect & offer a pool of scripts addressing common tasks: + +- `collect-annotation-stats.py`: Collect annotation stats for a cohort of genomes and print a condensed `TSV`. +- `extract-region.py`: Extract genome features within a given genomic range and export them as `GFF3`, `Embl`, `Genbank`, `FAA` and `FFN` + +Of course, pull requests are welcome ;-) + ## Web version For further convenience, we developed an accompanying web application available at https://bakta.computational.bio. From d8710da85c2aa42caff582c73f01fb76bc3e32c6 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Wed, 25 Oct 2023 16:39:21 +0200 Subject: [PATCH 8/9] try link to scripts in readme [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3e88161c..76f303cd 100644 --- a/README.md +++ b/README.md @@ -716,7 +716,7 @@ In addition, both plot types share two innermost GC content and GC skew rings. T ## Auxiliary scripts -Often, the usage of Bakta is a necessary upfront task followd by deeper analyses implemented in custom scripts. In (scripts) we'd like to collect & offer a pool of scripts addressing common tasks: +Often, the usage of Bakta is a necessary upfront task followd by deeper analyses implemented in custom scripts. In [scripts](scripts) we'd like to collect & offer a pool of scripts addressing common tasks: - `collect-annotation-stats.py`: Collect annotation stats for a cohort of genomes and print a condensed `TSV`. - `extract-region.py`: Extract genome features within a given genomic range and export them as `GFF3`, `Embl`, `Genbank`, `FAA` and `FFN` From c969d61ba199043dd8f168c38a384ee51950c7c1 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Wed, 25 Oct 2023 16:40:40 +0200 Subject: [PATCH 9/9] fix typo [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 76f303cd..458a3f21 100644 --- a/README.md +++ b/README.md @@ -716,7 +716,7 @@ In addition, both plot types share two innermost GC content and GC skew rings. T ## Auxiliary scripts -Often, the usage of Bakta is a necessary upfront task followd by deeper analyses implemented in custom scripts. In [scripts](scripts) we'd like to collect & offer a pool of scripts addressing common tasks: +Often, the usage of Bakta is a necessary upfront task followed by deeper analyses implemented in custom scripts. In [scripts](scripts) we'd like to collect & offer a pool of scripts addressing common tasks: - `collect-annotation-stats.py`: Collect annotation stats for a cohort of genomes and print a condensed `TSV`. - `extract-region.py`: Extract genome features within a given genomic range and export them as `GFF3`, `Embl`, `Genbank`, `FAA` and `FFN`