Skip to content

Commit

Permalink
update pasty to use camlhmp
Browse files Browse the repository at this point in the history
  • Loading branch information
rpetit3 committed Jul 23, 2024
1 parent b5978f5 commit b1a1c39
Show file tree
Hide file tree
Showing 123 changed files with 1,814 additions and 2,152 deletions.
2 changes: 1 addition & 1 deletion .gitpod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ tasks:
pip install graphviz
. /opt/conda/etc/profile.d/conda.sh
conda activate base
mamba create -n pasty-dev -y -q -c conda-forge -c bioconda pasty
mamba create -n pasty-dev -y -q -c conda-forge -c bioconda camlhmp
conda activate pasty-dev
git checkout main
vscode:
Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Releases

## v2.0.0 rpetit3/pasty "PACS2" - 2024/07/22

- `pasty` now uses the `camlhmp` framework for classification
- fixed O2 and O5 serogroup classification

## v1.0.3 rpetit3/pasty "PAC1" - 2023/08/04

- fixed bug when int concatenated to str during blast
Expand Down
173 changes: 113 additions & 60 deletions README.md

Large diffs are not rendered by default.

246 changes: 6 additions & 240 deletions bin/pasty
Original file line number Diff line number Diff line change
@@ -1,241 +1,7 @@
#!/usr/bin/env python3
import sys
from pathlib import Path
from rich import print
from rich.console import Console
from rich.table import Table
import rich_click as click
#!/usr/bin/env bash
pasty_dir=$(dirname $0)

DB_FASTA = str(Path( __file__ ).parent.absolute()).replace("bin", "db/OSAdb.fasta")
VERSION = '1.0.3'
BLASTN_COLS = [
"qseqid", "sseqid", "pident", "qlen", "slen", "length", "nident", "positive", "mismatch", "gapopen", "gaps",
"qstart", "qend", "sstart", "send", "evalue", "bitscore", "qseq", "sseq"
]
SEROGROUPS = ["O1","O2", "O3", "O4", "O5", "O6", "O7", "O9", "O10", "O11", "O12", "O13", "WyzB"]


def check_dependencies():
"""
Check if all dependencies are installed.
"""
from shutil import which
exit_code = 0
print(f'Checking dependencies...', file=sys.stderr)
for program in ['blastn']:
which_path = which(program)
if which_path:
print(f'Found {program} at {which_path}', file=sys.stderr)
else:
print(f'{program} not found', file=sys.stderr)
exit_code = 1

if exit_code == 1:
print(f'Missing dependencies, please check.', file=sys.stderr)
else:
print(f'You are all set!', file=sys.stderr)
sys.exit(exit_code)


def check_file_exists(filename: str) -> str:
"""
Check if file exists and return the absolute path of file, otherwise raise an error.
Args:
`filename`: Name of file to check.
Returns:
Absolute path of file.
"""
if not Path(filename).exists():
raise click.BadParameter(f'{filename} does not exist')
return str(Path(filename).absolute())


def run_blastn(query: str, db: str, min_pident: float) -> dict:
"""
Run BLASTN and save the output to a file.
Args:
`query`: Path to query file.
`db`: Path to database file.
`min_pident`: Minimum percent identity.
Returns:
Dictionary of BLASTN output.
"""
from executor import ExternalCommand, ExternalCommandFailed

outfmt = ' '.join(BLASTN_COLS)
cat_type = 'zcat' if query.endswith('.gz') else 'cat'
try:
command = ExternalCommand(
f"{cat_type} {query} | blastn -query - -subject {db} -outfmt '6 {outfmt}'",
capture=True,
capture_stderr=True
)
command.start()

# Convert results to list of dictionaries
results = {}
raw_results = []
for line in str(command.decoded_stdout).split("\n"):
if line:
raw_results.append(line)
result = dict(zip(BLASTN_COLS, line.split('\t')))

if result['sseqid'] not in results:
results[result['sseqid']] = {'hits': [], 'slen': result['slen'], 'hit_length': 0, 'comment': ''}

if float(result['pident']) >= min_pident:
# Add hit to list of hits
results[result['sseqid']]['hits'].append(result)
# Add length of hit to total length (subtracting gaps)
results[result['sseqid']]['hit_length'] += int(result['nident']) + int(result['mismatch']) - int(result['gaps'])

if results[result['sseqid']]['hit_length'] > int(result['slen']):
# There are overlapping hits
results[result['sseqid']]['hit_length'] = int(result['slen'])
results[result['sseqid']]['comment'] = 'One or more fragments overlapped'
elif len(results[result['sseqid']]['hits']) > 1:
results[result['sseqid']]['comment'] = 'Coverage calculated from multiple blast hits'

return [results, raw_results, command.decoded_stderr]
except ExternalCommandFailed as e:
print(e, file=sys.stderr)
sys.exit(e.returncode)


def predict_serogroup(hits: dict, min_coverage: float) -> dict:
"""
Determine serotype from BLASTN hits.
Args:
`hits`: Dictionary of BLASTN hits.
Returns:
Dictionary of serotype results.
"""

# Check is WzyB is present
serogroups = {}
for serogroup in SEROGROUPS:
serogroups[serogroup] = {'cov': 0, 'fragments': 0}

wzyb = False
if 'WzyB' in hits:
wyzb_cov = 100 * (int(hits['WzyB']['hit_length']) / float(hits['WzyB']['slen']))
wzyb = True if wyzb_cov >= min_coverage else False
serogroups['WzyB'] = {'cov': f"{wyzb_cov:.2f}", 'hits': len(hits['WzyB']['hits'])}


highest_coverage = 0
predicted_antigen = None
comment = ''
for antigen, hit in hits.items():
if antigen == 'WzyB':
continue

# Check if antigen is present
cov = 100 * (int(hit['hit_length']) / float(hit['slen']))
serogroups[antigen] = {'cov': f"{cov:.2f}", 'fragments': len(hit['hits'])}
if cov >= min_coverage:
if cov > highest_coverage:
highest_coverage = cov
predicted_antigen = antigen
comment = hit['comment']

# Predict serotype
predicted_serogroup = {'serogroup': 'NT', 'cov': highest_coverage, 'fragments': 0, 'comment': comment}
if predicted_antigen:
if predicted_antigen == 'O2' and wzyb:
predicted_serogroup['serogroup'] = 'O2'
elif predicted_antigen == 'O2':
predicted_serogroup['serogroup'] = 'O2'
else:
predicted_serogroup['serogroup'] = predicted_antigen
predicted_serogroup['cov'] = serogroups[predicted_antigen]['cov']
predicted_serogroup['fragments'] = len(hits[predicted_antigen]['hits'])

if predicted_serogroup['serogroup'] == 'NT':
predicted_serogroup['comment'] = 'No match exceeded percent identity and/or coverage thresholds'

# If multiple hits, return the one with the longest length
return [serogroups, predicted_serogroup]


@click.command()
@click.version_option(VERSION)
@click.option('--assembly', required=True, help='Input assembly in FASTA format (gzip is OK)')
@click.option('--db', default=DB_FASTA, show_default=True, help='Input database in uncompressed FASTA format')
@click.option('--prefix', default="basename of input", show_default=True, help='Prefix to use for output files')
@click.option('--outdir', default="./", show_default=True, help='Directory to save output files')
@click.option('--min_pident', default=95, show_default=True, help='Minimum percent identity to count a hit')
@click.option('--min_coverage', default=95, show_default=True, help='Minimum percent coverage to count a hit')
@click.option('--check', is_flag=True, help='Check dependencies are installed, then exit')
def pasty(assembly, db, prefix, outdir, min_pident, min_coverage, check):
"""A tool easily taken advantage of for in silico serogrouping of Pseudomonas aeruginosa isolates"""
assembly = check_file_exists(assembly)
db = check_file_exists(db)
prefix = str(Path(assembly).stem) if prefix == "basename of input" else prefix

console = Console()
print("[italic]Running [deep_sky_blue1]pasty[/deep_sky_blue1] with following parameters:[/italic]")
print(f"[italic] --assembly {assembly}[/italic]")
print(f"[italic] --db {db}[/italic]")
print(f"[italic] --prefix {prefix}[/italic]")
print(f"[italic] --outdir {outdir}[/italic]")
print(f"[italic] --min_pident {min_pident}[/italic]")
print(f"[italic] --min_coverage {min_coverage}[/italic]\n")
print("[italic]Running BLASTN...[/italic]")
blastn_parsed, blastn_stdout, blastn_stderr = run_blastn(assembly, db, min_pident)
serogroups, predicted_serogroup = predict_serogroup(blastn_parsed, min_coverage)

# Write outputs
if outdir != "./":
Path(outdir).mkdir(parents=True, exist_ok=True)
outfile = f"{outdir}/{prefix}".replace("//", "/")
print("[italic]Writing outputs...[/italic]")
print(f"[italic]BLASTN results written to [deep_sky_blue1]{outfile}.blastn.tsv[/deep_sky_blue1][/italic]\n")
with open(f"{outfile}.blastn.tsv", 'wt') as fh_out:
fh_out.write("\t".join(BLASTN_COLS) + "\n")
for result in blastn_stdout:
fh_out.write(f"{result}\n")

# Serogroup Results
serogroup_table = Table(title=f"Serogroup Results")
serogroup_table.add_column("sample", style="white")
serogroup_table.add_column("serogroup", style="cyan")
serogroup_table.add_column("coverage", style="green")
serogroup_table.add_column("fragments", style="green")
with open(f"{outfile}.details.tsv", 'wt') as fh_out:
fh_out.write("sample\tserogroup\tcoverage\tfragments\n")
for serogroup in SEROGROUPS:
detail = serogroups[serogroup]
serogroup_table.add_row(prefix, serogroup, str(detail['cov']), str(detail['fragments']))
fh_out.write(f"{prefix}\t{serogroup}\t{detail['cov']}\t{detail['fragments']}\n")
console.print(serogroup_table)
print(f"[italic]Serogroup Results written to [deep_sky_blue1]{outfile}.details.tsv[/deep_sky_blue1][/italic]\n")

# Predicted Serogroup
prediction_table = Table(title=f"Predicted Serogroup")
prediction_table.add_column("sample", style="white")
prediction_table.add_column("serogroup", style="cyan")
prediction_table.add_column("coverage", style="green")
prediction_table.add_column("fragments", style="green")
prediction_table.add_column("comment", style="green")
with open(f"{outfile}.tsv", 'wt') as fh_out:
prediction_table.add_row(prefix, predicted_serogroup['serogroup'], str(predicted_serogroup['cov']), str(predicted_serogroup['fragments']), predicted_serogroup['comment'])
fh_out.write("sample\tserogroup\tcoverage\tfragments\tcomment\n")
fh_out.write(f"{prefix}\t{predicted_serogroup['serogroup']}\t{predicted_serogroup['cov']}\t{predicted_serogroup['fragments']}\t{predicted_serogroup['comment']}\n")
console.print(prediction_table)
print(f"[italic]Predicted serogroup result written to [deep_sky_blue1]{outfile}.tsv[/deep_sky_blue1][/italic]\n")


if __name__ == '__main__':
if len(sys.argv) == 1:
pasty.main(['--help'])
elif "--check" in sys.argv:
check_dependencies()
else:
pasty()
CAML_YAML="${pasty_dir}/../data/pa-osa.yaml" \
CAML_TARGETS="${pasty_dir}/../data/pa-osa.fasta" \
camlhmp-blast-region \
"${@:1}"
7 changes: 7 additions & 0 deletions bin/pasty-bioconda
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env bash
pasty_dir=$(dirname $0)

CAML_YAML="${pasty_dir}/../share/pasty/pa-osa.yaml" \
CAML_TARGETS="${pasty_dir}/../share/pasty/pa-osa.fasta" \
camlhmp-blast-region \
"${@:1}"
Loading

0 comments on commit b1a1c39

Please sign in to comment.