-
Notifications
You must be signed in to change notification settings - Fork 58
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add region extraction script * refactor region extraction script * fix feature type comparison * fix SO named tuple after JSON import * add annotation stats aggregation script * review extract region script * add script section to readme * try link to scripts in readme [skip ci] * fix typo [skip ci]
- Loading branch information
1 parent
53df2bf
commit 65b8caa
Showing
5 changed files
with
225 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
#!/usr/bin/python3 | ||
|
||
import argparse | ||
import json | ||
import os | ||
|
||
from pathlib import Path | ||
|
||
import bakta | ||
import bakta.constants as bc | ||
|
||
|
||
# Column titles of the TSV report. The order must match the row built in
# collect_stats(): one title per value, including 'N50'.
COLUMNS = [
    'Genome',
    'Taxon',
    'Complete',
    'Translation table',
    '# Sequences',
    'Size',
    'GC',
    'N ratio',
    'N50',
    'Coding ratio',
    'tRNA',
    'tmRNA',
    'rRNA',
    'ncRNA',
    'ncRNA region',
    'CRISPR',
    'CDS',
    'CDS hypothetical',
    'CDS pseudogene',
    'sORF',
    'GAP',
    'oriC',
    'oriV',
    'oriT',
]


def format_taxon(genome_info):
    """Return genus, species and strain joined by spaces.

    Entries that are missing from *genome_info* or set to None are skipped,
    so an annotation without any taxonomy yields an empty string.
    """
    candidates = (genome_info.get(key) for key in ('genus', 'species', 'strain'))
    return ' '.join(name for name in candidates if name is not None)


def count_features(features, feature_type, flag=None):
    """Count the features whose 'type' equals *feature_type*.

    If *flag* is given, only features that also carry a key named *flag*
    (e.g. 'hypothetical' or 'pseudogene') are counted.
    """
    return sum(
        1
        for feat in features
        if feat['type'] == feature_type and (flag is None or flag in feat)
    )


def collect_stats(name, genome):
    """Build one TSV row (a list of strings) for a parsed annotation.

    *name* is used as the value of the 'Genome' column; *genome* is the
    dictionary loaded from an annotation JSON file. The returned values are
    ordered like COLUMNS.

    Raises KeyError if an expected entry is missing from *genome*.
    """
    info = genome['genome']
    stats = genome['stats']
    features = genome['features']

    row = [
        name,
        format_taxon(info),
        'y' if info['complete'] else 'n',
        f"{info['translation_table']}",
        f"{stats['no_sequences']}",
        f"{stats['size']}",
        # gc, n_ratio and coding_ratio are scaled by 100 and printed with one decimal
        f"{100 * stats['gc']:.1f}",
        f"{100 * stats['n_ratio']:.1f}",
        f"{stats['n50']}",
        f"{100 * stats['coding_ratio']:.1f}",
    ]
    feature_counts = [
        count_features(features, bc.FEATURE_T_RNA),
        count_features(features, bc.FEATURE_TM_RNA),
        count_features(features, bc.FEATURE_R_RNA),
        count_features(features, bc.FEATURE_NC_RNA),
        count_features(features, bc.FEATURE_NC_RNA_REGION),
        count_features(features, bc.FEATURE_CRISPR),
        count_features(features, bc.FEATURE_CDS),
        count_features(features, bc.FEATURE_CDS, 'hypothetical'),
        count_features(features, bc.FEATURE_CDS, 'pseudogene'),
        count_features(features, bc.FEATURE_SORF),
        count_features(features, bc.FEATURE_GAP),
        count_features(features, bc.FEATURE_ORIC),
        count_features(features, bc.FEATURE_ORIV),
        count_features(features, bc.FEATURE_ORIT),
    ]
    row.extend(str(count) for count in feature_counts)
    return row


def main():
    """Aggregate statistics of several annotation JSON files into one TSV file.

    Writes '<prefix>.tsv' into the output directory and echoes every data row
    to stdout. A genome that cannot be read or lacks expected entries is
    reported and skipped, so one broken file does not abort the whole run.
    """
    # NOTE(review): add_help=False and no explicit help option is registered
    # here, so the description/epilog below are never displayed - confirm
    # whether that is intended.
    parser = argparse.ArgumentParser(
        prog='collect-annotation-stats',
        description='Collect annotation statistics and export as TSV',
        epilog=f'Version: {bakta.__version__}\nDOI: {bc.BAKTA_DOI}\nURL: github.com/oschwengers/bakta\n\nCitation:\n{bc.BAKTA_CITATION}',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        add_help=False
    )
    parser.add_argument('genomes', metavar='<genomes>', nargs='+', help='Bakta genome annotation files in JSON format')
    parser.add_argument('--prefix', '-p', action='store', default='annotation-stats', help='Prefix for output file')
    parser.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)')
    args = parser.parse_args()

    output_path = Path(args.output).resolve().joinpath(f'{args.prefix}.tsv')
    with output_path.open('w') as fh_out:
        fh_out.write('\t'.join(COLUMNS))
        fh_out.write('\n')
        for genome_file in args.genomes:
            genome_path = Path(genome_file).resolve()
            try:
                with genome_path.open() as fh_in:
                    genome = json.load(fh_in)
                output_line = '\t'.join(collect_stats(genome_path.stem, genome))
            except Exception as e:
                # Best effort: report the reason and continue with the next
                # genome. Unlike a bare except, this lets Ctrl-C through.
                print(f"Error reading genome {genome_path.stem}: {type(e).__name__}: {e}")
                continue
            print(output_line)
            fh_out.write(f'{output_line}\n')


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
#!/usr/bin/python3 | ||
|
||
import argparse | ||
import json | ||
import os | ||
|
||
from pathlib import Path | ||
|
||
import bakta | ||
import bakta.constants as bc | ||
import bakta.io.gff as gff | ||
import bakta.io.insdc as insdc | ||
import bakta.io.fasta as fasta | ||
import bakta.config as cfg | ||
|
||
|
||
def format_taxon(genome_info):
    """Return genus, species and strain joined by spaces.

    Entries that are missing from *genome_info* or set to None are skipped
    instead of being rendered as the literal text 'None'.
    """
    candidates = (genome_info.get(key) for key in ('genus', 'species', 'strain'))
    return ' '.join(name for name in candidates if name is not None)


def select_features(features, contig_id, min_pos, max_pos):
    """Return the features of one sequence lying completely within a region.

    A feature is kept when its 'contig' equals *contig_id*, its 'start' is
    not smaller than *min_pos* and its 'stop' is not larger than *max_pos*.
    The input order is preserved.
    """
    return [
        feat
        for feat in features
        if feat['contig'] == contig_id
        and feat['start'] >= min_pos
        and feat['stop'] <= max_pos
    ]


def main():
    """Extract the features of a genomic region and export them.

    Loads an annotation JSON file, keeps the features of the selected
    sequence that lie within [--min, --max] and writes them as GFF3,
    GenBank, EMBL, FFN and FAA files into the output directory.

    Exits with an error message if the annotation contains no sequences or
    the requested sequence ID does not exist.
    """
    # NOTE(review): add_help=False and no explicit help option is registered
    # here, so the description/epilog below are never displayed - confirm
    # whether that is intended.
    parser = argparse.ArgumentParser(
        prog='extract region',
        description='Extract genomic region with a given range and exports selected features as GFF3, FAA, FFN, EMBL and Genbank',
        epilog=f'Version: {bakta.__version__}\nDOI: {bc.BAKTA_DOI}\nURL: github.com/oschwengers/bakta\n\nCitation:\n{bc.BAKTA_CITATION}',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        add_help=False
    )
    parser.add_argument('genome', metavar='<genome>', help='Bakta genome annotation in JSON format')
    parser.add_argument('--prefix', '-p', action='store', default=None, help='Prefix for output files')
    parser.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)')
    parser.add_argument('--sequence', '-s', action='store', default=None, help='Sequence/Contig (default = first)')
    parser.add_argument('--min', '-m', action='store', type=int, default=0, help='Left region border including (default = 0)')
    parser.add_argument('--max', '-x', action='store', type=int, default=100_000_000, help='Right region border including (default = 100,000,000)')
    args = parser.parse_args()

    print('Load annotated genome...')
    genome_path = Path(args.genome).resolve()
    with genome_path.open() as fh:
        genome = json.load(fh)

    sequences = genome['sequences']
    contig_id = args.sequence
    if contig_id is None:  # take first sequence as default
        if not sequences:
            raise SystemExit(f'Error: no sequences found in {genome_path.name}')
        contig_id = sequences[0]['id']

    # Fail early on an unknown sequence ID instead of writing empty exports.
    contigs = [sequence for sequence in sequences if sequence['id'] == contig_id]
    if not contigs:
        raise SystemExit(f"Error: sequence '{contig_id}' not found in {genome_path.name}")

    prefix = args.prefix
    if prefix is None:  # use input file prefix as default
        prefix = genome_path.stem

    print('Extract features within selected region...')
    features_selected = select_features(genome['features'], contig_id, args.min, args.max)
    features_by_contig = {contig_id: features_selected}  # needed for GFF3 export
    print(f'\t...selected features: {len(features_selected)}')

    # Reshape the loaded annotation before handing it to the writers: restrict
    # it to the selection and copy the taxonomy to the top level.
    genome_info = genome['genome']
    genome['features'] = features_selected
    genome['contigs'] = contigs
    genome['genus'] = genome_info.get('genus')
    genome['species'] = genome_info.get('species')
    genome['strain'] = genome_info.get('strain')
    genome['taxon'] = format_taxon(genome_info)

    # Publish the database version of the annotation via the shared config
    # module (presumably read by the writers below - TODO confirm).
    db_info = genome['version']['db']
    db_version = db_info['version'].split('.')
    cfg.db_info = {
        'major': db_version[0],
        'minor': db_version[1],
        'type': db_info['type']
    }

    print('Write selected features...')
    output_path = Path(args.output).resolve()
    gff3_path = output_path.joinpath(f'{prefix}.gff3')
    gff.write_gff3(genome, features_by_contig, gff3_path)
    print('\t...INSDC GenBank & EMBL')
    genbank_path = output_path.joinpath(f'{prefix}.gbff')
    embl_path = output_path.joinpath(f'{prefix}.embl')
    insdc.write_insdc(genome, features_selected, genbank_path, embl_path)
    print('\t...feature nucleotide sequences')
    ffn_path = output_path.joinpath(f'{prefix}.ffn')
    fasta.write_ffn(features_selected, ffn_path)
    print('\t...translated CDS sequences')
    faa_path = output_path.joinpath(f'{prefix}.faa')
    fasta.write_faa(features_selected, faa_path)


if __name__ == '__main__':
    main()