-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
123 changed files
with
1,814 additions
and
2,152 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,241 +1,7 @@ | ||
#!/usr/bin/env python3 | ||
import sys | ||
from pathlib import Path | ||
from rich import print | ||
from rich.console import Console | ||
from rich.table import Table | ||
import rich_click as click | ||
#!/usr/bin/env bash | ||
pasty_dir=$(dirname $0) | ||
|
||
DB_FASTA = str(Path( __file__ ).parent.absolute()).replace("bin", "db/OSAdb.fasta") | ||
VERSION = '1.0.3' | ||
BLASTN_COLS = [ | ||
"qseqid", "sseqid", "pident", "qlen", "slen", "length", "nident", "positive", "mismatch", "gapopen", "gaps", | ||
"qstart", "qend", "sstart", "send", "evalue", "bitscore", "qseq", "sseq" | ||
] | ||
SEROGROUPS = ["O1","O2", "O3", "O4", "O5", "O6", "O7", "O9", "O10", "O11", "O12", "O13", "WyzB"] | ||
|
||
|
||
def check_dependencies(): | ||
""" | ||
Check if all dependencies are installed. | ||
""" | ||
from shutil import which | ||
exit_code = 0 | ||
print(f'Checking dependencies...', file=sys.stderr) | ||
for program in ['blastn']: | ||
which_path = which(program) | ||
if which_path: | ||
print(f'Found {program} at {which_path}', file=sys.stderr) | ||
else: | ||
print(f'{program} not found', file=sys.stderr) | ||
exit_code = 1 | ||
|
||
if exit_code == 1: | ||
print(f'Missing dependencies, please check.', file=sys.stderr) | ||
else: | ||
print(f'You are all set!', file=sys.stderr) | ||
sys.exit(exit_code) | ||
|
||
|
||
def check_file_exists(filename: str) -> str: | ||
""" | ||
Check if file exists and return the absolute path of file, otherwise raise an error. | ||
Args: | ||
`filename`: Name of file to check. | ||
Returns: | ||
Absolute path of file. | ||
""" | ||
if not Path(filename).exists(): | ||
raise click.BadParameter(f'{filename} does not exist') | ||
return str(Path(filename).absolute()) | ||
|
||
|
||
def run_blastn(query: str, db: str, min_pident: float) -> dict: | ||
""" | ||
Run BLASTN and save the output to a file. | ||
Args: | ||
`query`: Path to query file. | ||
`db`: Path to database file. | ||
`min_pident`: Minimum percent identity. | ||
Returns: | ||
Dictionary of BLASTN output. | ||
""" | ||
from executor import ExternalCommand, ExternalCommandFailed | ||
|
||
outfmt = ' '.join(BLASTN_COLS) | ||
cat_type = 'zcat' if query.endswith('.gz') else 'cat' | ||
try: | ||
command = ExternalCommand( | ||
f"{cat_type} {query} | blastn -query - -subject {db} -outfmt '6 {outfmt}'", | ||
capture=True, | ||
capture_stderr=True | ||
) | ||
command.start() | ||
|
||
# Convert results to list of dictionaries | ||
results = {} | ||
raw_results = [] | ||
for line in str(command.decoded_stdout).split("\n"): | ||
if line: | ||
raw_results.append(line) | ||
result = dict(zip(BLASTN_COLS, line.split('\t'))) | ||
|
||
if result['sseqid'] not in results: | ||
results[result['sseqid']] = {'hits': [], 'slen': result['slen'], 'hit_length': 0, 'comment': ''} | ||
|
||
if float(result['pident']) >= min_pident: | ||
# Add hit to list of hits | ||
results[result['sseqid']]['hits'].append(result) | ||
# Add length of hit to total length (subtracting gaps) | ||
results[result['sseqid']]['hit_length'] += int(result['nident']) + int(result['mismatch']) - int(result['gaps']) | ||
|
||
if results[result['sseqid']]['hit_length'] > int(result['slen']): | ||
# There are overlapping hits | ||
results[result['sseqid']]['hit_length'] = int(result['slen']) | ||
results[result['sseqid']]['comment'] = 'One or more fragments overlapped' | ||
elif len(results[result['sseqid']]['hits']) > 1: | ||
results[result['sseqid']]['comment'] = 'Coverage calculated from multiple blast hits' | ||
|
||
return [results, raw_results, command.decoded_stderr] | ||
except ExternalCommandFailed as e: | ||
print(e, file=sys.stderr) | ||
sys.exit(e.returncode) | ||
|
||
|
||
def predict_serogroup(hits: dict, min_coverage: float) -> dict: | ||
""" | ||
Determine serotype from BLASTN hits. | ||
Args: | ||
`hits`: Dictionary of BLASTN hits. | ||
Returns: | ||
Dictionary of serotype results. | ||
""" | ||
|
||
# Check is WzyB is present | ||
serogroups = {} | ||
for serogroup in SEROGROUPS: | ||
serogroups[serogroup] = {'cov': 0, 'fragments': 0} | ||
|
||
wzyb = False | ||
if 'WzyB' in hits: | ||
wyzb_cov = 100 * (int(hits['WzyB']['hit_length']) / float(hits['WzyB']['slen'])) | ||
wzyb = True if wyzb_cov >= min_coverage else False | ||
serogroups['WzyB'] = {'cov': f"{wyzb_cov:.2f}", 'hits': len(hits['WzyB']['hits'])} | ||
|
||
|
||
highest_coverage = 0 | ||
predicted_antigen = None | ||
comment = '' | ||
for antigen, hit in hits.items(): | ||
if antigen == 'WzyB': | ||
continue | ||
|
||
# Check if antigen is present | ||
cov = 100 * (int(hit['hit_length']) / float(hit['slen'])) | ||
serogroups[antigen] = {'cov': f"{cov:.2f}", 'fragments': len(hit['hits'])} | ||
if cov >= min_coverage: | ||
if cov > highest_coverage: | ||
highest_coverage = cov | ||
predicted_antigen = antigen | ||
comment = hit['comment'] | ||
|
||
# Predict serotype | ||
predicted_serogroup = {'serogroup': 'NT', 'cov': highest_coverage, 'fragments': 0, 'comment': comment} | ||
if predicted_antigen: | ||
if predicted_antigen == 'O2' and wzyb: | ||
predicted_serogroup['serogroup'] = 'O2' | ||
elif predicted_antigen == 'O2': | ||
predicted_serogroup['serogroup'] = 'O2' | ||
else: | ||
predicted_serogroup['serogroup'] = predicted_antigen | ||
predicted_serogroup['cov'] = serogroups[predicted_antigen]['cov'] | ||
predicted_serogroup['fragments'] = len(hits[predicted_antigen]['hits']) | ||
|
||
if predicted_serogroup['serogroup'] == 'NT': | ||
predicted_serogroup['comment'] = 'No match exceeded percent identity and/or coverage thresholds' | ||
|
||
# If multiple hits, return the one with the longest length | ||
return [serogroups, predicted_serogroup] | ||
|
||
|
||
@click.command() | ||
@click.version_option(VERSION) | ||
@click.option('--assembly', required=True, help='Input assembly in FASTA format (gzip is OK)') | ||
@click.option('--db', default=DB_FASTA, show_default=True, help='Input database in uncompressed FASTA format') | ||
@click.option('--prefix', default="basename of input", show_default=True, help='Prefix to use for output files') | ||
@click.option('--outdir', default="./", show_default=True, help='Directory to save output files') | ||
@click.option('--min_pident', default=95, show_default=True, help='Minimum percent identity to count a hit') | ||
@click.option('--min_coverage', default=95, show_default=True, help='Minimum percent coverage to count a hit') | ||
@click.option('--check', is_flag=True, help='Check dependencies are installed, then exit') | ||
def pasty(assembly, db, prefix, outdir, min_pident, min_coverage, check): | ||
"""A tool easily taken advantage of for in silico serogrouping of Pseudomonas aeruginosa isolates""" | ||
assembly = check_file_exists(assembly) | ||
db = check_file_exists(db) | ||
prefix = str(Path(assembly).stem) if prefix == "basename of input" else prefix | ||
|
||
console = Console() | ||
print("[italic]Running [deep_sky_blue1]pasty[/deep_sky_blue1] with following parameters:[/italic]") | ||
print(f"[italic] --assembly {assembly}[/italic]") | ||
print(f"[italic] --db {db}[/italic]") | ||
print(f"[italic] --prefix {prefix}[/italic]") | ||
print(f"[italic] --outdir {outdir}[/italic]") | ||
print(f"[italic] --min_pident {min_pident}[/italic]") | ||
print(f"[italic] --min_coverage {min_coverage}[/italic]\n") | ||
print("[italic]Running BLASTN...[/italic]") | ||
blastn_parsed, blastn_stdout, blastn_stderr = run_blastn(assembly, db, min_pident) | ||
serogroups, predicted_serogroup = predict_serogroup(blastn_parsed, min_coverage) | ||
|
||
# Write outputs | ||
if outdir != "./": | ||
Path(outdir).mkdir(parents=True, exist_ok=True) | ||
outfile = f"{outdir}/{prefix}".replace("//", "/") | ||
print("[italic]Writing outputs...[/italic]") | ||
print(f"[italic]BLASTN results written to [deep_sky_blue1]{outfile}.blastn.tsv[/deep_sky_blue1][/italic]\n") | ||
with open(f"{outfile}.blastn.tsv", 'wt') as fh_out: | ||
fh_out.write("\t".join(BLASTN_COLS) + "\n") | ||
for result in blastn_stdout: | ||
fh_out.write(f"{result}\n") | ||
|
||
# Serogroup Results | ||
serogroup_table = Table(title=f"Serogroup Results") | ||
serogroup_table.add_column("sample", style="white") | ||
serogroup_table.add_column("serogroup", style="cyan") | ||
serogroup_table.add_column("coverage", style="green") | ||
serogroup_table.add_column("fragments", style="green") | ||
with open(f"{outfile}.details.tsv", 'wt') as fh_out: | ||
fh_out.write("sample\tserogroup\tcoverage\tfragments\n") | ||
for serogroup in SEROGROUPS: | ||
detail = serogroups[serogroup] | ||
serogroup_table.add_row(prefix, serogroup, str(detail['cov']), str(detail['fragments'])) | ||
fh_out.write(f"{prefix}\t{serogroup}\t{detail['cov']}\t{detail['fragments']}\n") | ||
console.print(serogroup_table) | ||
print(f"[italic]Serogroup Results written to [deep_sky_blue1]{outfile}.details.tsv[/deep_sky_blue1][/italic]\n") | ||
|
||
# Predicted Serogroup | ||
prediction_table = Table(title=f"Predicted Serogroup") | ||
prediction_table.add_column("sample", style="white") | ||
prediction_table.add_column("serogroup", style="cyan") | ||
prediction_table.add_column("coverage", style="green") | ||
prediction_table.add_column("fragments", style="green") | ||
prediction_table.add_column("comment", style="green") | ||
with open(f"{outfile}.tsv", 'wt') as fh_out: | ||
prediction_table.add_row(prefix, predicted_serogroup['serogroup'], str(predicted_serogroup['cov']), str(predicted_serogroup['fragments']), predicted_serogroup['comment']) | ||
fh_out.write("sample\tserogroup\tcoverage\tfragments\tcomment\n") | ||
fh_out.write(f"{prefix}\t{predicted_serogroup['serogroup']}\t{predicted_serogroup['cov']}\t{predicted_serogroup['fragments']}\t{predicted_serogroup['comment']}\n") | ||
console.print(prediction_table) | ||
print(f"[italic]Predicted serogroup result written to [deep_sky_blue1]{outfile}.tsv[/deep_sky_blue1][/italic]\n") | ||
|
||
|
||
if __name__ == '__main__': | ||
if len(sys.argv) == 1: | ||
pasty.main(['--help']) | ||
elif "--check" in sys.argv: | ||
check_dependencies() | ||
else: | ||
pasty() | ||
CAML_YAML="${pasty_dir}/../data/pa-osa.yaml" \ | ||
CAML_TARGETS="${pasty_dir}/../data/pa-osa.fasta" \ | ||
camlhmp-blast-region \ | ||
"${@:1}" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
#!/usr/bin/env bash | ||
pasty_dir=$(dirname $0) | ||
|
||
CAML_YAML="${pasty_dir}/../share/pasty/pa-osa.yaml" \ | ||
CAML_TARGETS="${pasty_dir}/../share/pasty/pa-osa.fasta" \ | ||
camlhmp-blast-region \ | ||
"${@:1}" |
Oops, something went wrong.