update pasty to use camlhmp

rpetit3 · Jul 23, 2024 · b1a1c39 · b1a1c39
1 parent b5978f5
commit b1a1c39
Show file tree

Hide file tree

Showing 123 changed files with 1,814 additions and 2,152 deletions.
diff --git a/.gitpod.yml b/.gitpod.yml
@@ -6,7 +6,7 @@ tasks:
       pip install graphviz
       . /opt/conda/etc/profile.d/conda.sh
       conda activate base
-      mamba create -n pasty-dev -y -q -c conda-forge -c bioconda pasty
+      mamba create -n pasty-dev -y -q -c conda-forge -c bioconda camlhmp
       conda activate pasty-dev
       git checkout main
 vscode:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Releases
 
+## v2.0.0 rpetit3/pasty "PACS2" - 2024/07/22
+
+- `pasty` now uses the `camlhmp` framework for classification
+- fixed O2 and O5 serogroup classification
+
 ## v1.0.3 rpetit3/pasty "PAC1" - 2023/08/04
 
 - fixed bug when int concatenated to str during blast

diff --git a/README.md b/README.md
diff --git a/bin/pasty b/bin/pasty
@@ -1,241 +1,7 @@
-#!/usr/bin/env python3
-import sys
-from pathlib import Path
-from rich import print
-from rich.console import Console
-from rich.table import Table
-import rich_click as click
+#!/usr/bin/env bash
+pasty_dir=$(dirname $0)
 
-DB_FASTA = str(Path( __file__ ).parent.absolute()).replace("bin", "db/OSAdb.fasta")
-VERSION = '1.0.3'
-BLASTN_COLS = [
-    "qseqid", "sseqid", "pident", "qlen", "slen", "length", "nident", "positive", "mismatch", "gapopen", "gaps",
-    "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qseq", "sseq"
-]
-SEROGROUPS = ["O1","O2", "O3", "O4", "O5", "O6", "O7", "O9", "O10", "O11", "O12", "O13", "WyzB"]
-
-
-def check_dependencies():
-    """
-    Check if all dependencies are installed.
-    """
-    from shutil import which
-    exit_code = 0
-    print(f'Checking dependencies...', file=sys.stderr)
-    for program in ['blastn']:
-        which_path = which(program)
-        if which_path:
-            print(f'Found {program} at {which_path}', file=sys.stderr)
-        else:
-            print(f'{program} not found', file=sys.stderr)
-            exit_code = 1
-
-    if exit_code == 1:
-        print(f'Missing dependencies, please check.', file=sys.stderr)
-    else:
-        print(f'You are all set!', file=sys.stderr)
-    sys.exit(exit_code)
-
-
-def check_file_exists(filename: str) -> str:
-    """
-    Check if file exists and return the absolute path of file, otherwise raise an error.
-
-    Args:
-        `filename`: Name of file to check.
-
-    Returns:
-        Absolute path of file.
-    """
-    if not Path(filename).exists():
-        raise click.BadParameter(f'{filename} does not exist')
-    return str(Path(filename).absolute())
-
-
-def run_blastn(query: str, db: str, min_pident: float) -> dict:
-    """
-    Run BLASTN and save the output to a file.
-
-    Args:
-        `query`: Path to query file.
-        `db`: Path to database file.
-        `min_pident`: Minimum percent identity.
-
-    Returns:
-        Dictionary of BLASTN output.
-    """
-    from executor import ExternalCommand, ExternalCommandFailed
-
-    outfmt = ' '.join(BLASTN_COLS)
-    cat_type = 'zcat' if query.endswith('.gz') else 'cat'
-    try:
-        command = ExternalCommand(
-            f"{cat_type} {query} | blastn -query - -subject {db} -outfmt '6 {outfmt}'",
-            capture=True, 
-            capture_stderr=True
-        )
-        command.start()
-
-        # Convert results to list of dictionaries
-        results = {}
-        raw_results = []
-        for line in str(command.decoded_stdout).split("\n"):
-            if line:
-                raw_results.append(line)
-                result = dict(zip(BLASTN_COLS, line.split('\t')))
-
-                if result['sseqid'] not in results:
-                    results[result['sseqid']] = {'hits': [], 'slen': result['slen'], 'hit_length': 0, 'comment': ''}
-
-                if float(result['pident']) >= min_pident:
-                    # Add hit to list of hits
-                    results[result['sseqid']]['hits'].append(result)
-                    # Add length of hit to total length (subtracting gaps)
-                    results[result['sseqid']]['hit_length'] += int(result['nident']) + int(result['mismatch']) - int(result['gaps'])
-
-                    if results[result['sseqid']]['hit_length'] > int(result['slen']):
-                        # There are overlapping hits
-                        results[result['sseqid']]['hit_length'] = int(result['slen'])
-                        results[result['sseqid']]['comment'] = 'One or more fragments overlapped'
-                    elif len(results[result['sseqid']]['hits']) > 1:
-                        results[result['sseqid']]['comment'] = 'Coverage calculated from multiple blast hits'
-
-        return [results, raw_results, command.decoded_stderr]
-    except ExternalCommandFailed as e:
-        print(e, file=sys.stderr)
-        sys.exit(e.returncode)
-
-
-def predict_serogroup(hits: dict, min_coverage: float) -> dict:
-    """
-    Determine serotype from BLASTN hits.
-
-    Args:
-        `hits`: Dictionary of BLASTN hits.
-
-    Returns:
-        Dictionary of serotype results.
-    """
-
-    # Check is WzyB is present
-    serogroups = {}
-    for serogroup in SEROGROUPS:
-        serogroups[serogroup] = {'cov': 0, 'fragments': 0}
-
-    wzyb = False
-    if 'WzyB' in hits:
-        wyzb_cov = 100 * (int(hits['WzyB']['hit_length']) / float(hits['WzyB']['slen']))
-        wzyb = True if wyzb_cov >= min_coverage else False
-        serogroups['WzyB'] = {'cov': f"{wyzb_cov:.2f}", 'hits': len(hits['WzyB']['hits'])}
-
-
-    highest_coverage = 0
-    predicted_antigen = None
-    comment = ''
-    for antigen, hit in hits.items():
-        if antigen == 'WzyB':
-            continue
-
-        # Check if antigen is present
-        cov = 100 * (int(hit['hit_length']) / float(hit['slen']))
-        serogroups[antigen] = {'cov': f"{cov:.2f}", 'fragments': len(hit['hits'])}
-        if cov >= min_coverage:
-            if cov > highest_coverage:
-                highest_coverage = cov
-                predicted_antigen = antigen
-                comment = hit['comment']
-
-    # Predict serotype
-    predicted_serogroup = {'serogroup': 'NT', 'cov': highest_coverage, 'fragments': 0, 'comment': comment}
-    if predicted_antigen:
-        if predicted_antigen == 'O2' and wzyb:
-            predicted_serogroup['serogroup'] = 'O2'
-        elif predicted_antigen == 'O2':
-            predicted_serogroup['serogroup'] = 'O2'
-        else:
-            predicted_serogroup['serogroup'] = predicted_antigen
-        predicted_serogroup['cov'] = serogroups[predicted_antigen]['cov']
-        predicted_serogroup['fragments'] = len(hits[predicted_antigen]['hits'])
-
-    if predicted_serogroup['serogroup'] == 'NT':
-        predicted_serogroup['comment'] = 'No match exceeded percent identity and/or coverage thresholds'
-
-    # If multiple hits, return the one with the longest length
-    return [serogroups, predicted_serogroup]
-
-
-@click.command()
-@click.version_option(VERSION)
-@click.option('--assembly', required=True, help='Input assembly in FASTA format (gzip is OK)')
-@click.option('--db', default=DB_FASTA, show_default=True, help='Input database in uncompressed FASTA format')
-@click.option('--prefix', default="basename of input", show_default=True, help='Prefix to use for output files')
-@click.option('--outdir', default="./", show_default=True, help='Directory to save output files')
-@click.option('--min_pident', default=95, show_default=True, help='Minimum percent identity to count a hit')
-@click.option('--min_coverage', default=95, show_default=True, help='Minimum percent coverage to count a hit')
-@click.option('--check', is_flag=True, help='Check dependencies are installed, then exit')
-def pasty(assembly, db, prefix, outdir, min_pident, min_coverage, check):
-    """A tool easily taken advantage of for in silico serogrouping of Pseudomonas aeruginosa isolates"""
-    assembly = check_file_exists(assembly)
-    db = check_file_exists(db)
-    prefix = str(Path(assembly).stem) if prefix == "basename of input" else prefix
-
-    console = Console()
-    print("[italic]Running [deep_sky_blue1]pasty[/deep_sky_blue1] with following parameters:[/italic]")
-    print(f"[italic]    --assembly {assembly}[/italic]")
-    print(f"[italic]    --db {db}[/italic]")
-    print(f"[italic]    --prefix {prefix}[/italic]")
-    print(f"[italic]    --outdir {outdir}[/italic]")
-    print(f"[italic]    --min_pident {min_pident}[/italic]")
-    print(f"[italic]    --min_coverage {min_coverage}[/italic]\n")
-    print("[italic]Running BLASTN...[/italic]")
-    blastn_parsed, blastn_stdout, blastn_stderr = run_blastn(assembly, db, min_pident)
-    serogroups, predicted_serogroup = predict_serogroup(blastn_parsed, min_coverage)
-
-    # Write outputs
-    if outdir != "./":
-        Path(outdir).mkdir(parents=True, exist_ok=True)
-    outfile = f"{outdir}/{prefix}".replace("//", "/")
-    print("[italic]Writing outputs...[/italic]")
-    print(f"[italic]BLASTN results written to [deep_sky_blue1]{outfile}.blastn.tsv[/deep_sky_blue1][/italic]\n")
-    with open(f"{outfile}.blastn.tsv", 'wt') as fh_out:
-        fh_out.write("\t".join(BLASTN_COLS) + "\n")
-        for result in blastn_stdout:
-            fh_out.write(f"{result}\n")
-
-    # Serogroup Results
-    serogroup_table = Table(title=f"Serogroup Results")
-    serogroup_table.add_column("sample", style="white")
-    serogroup_table.add_column("serogroup", style="cyan")
-    serogroup_table.add_column("coverage", style="green")
-    serogroup_table.add_column("fragments", style="green")
-    with open(f"{outfile}.details.tsv", 'wt') as fh_out:
-        fh_out.write("sample\tserogroup\tcoverage\tfragments\n")
-        for serogroup in SEROGROUPS:
-            detail = serogroups[serogroup]
-            serogroup_table.add_row(prefix, serogroup, str(detail['cov']), str(detail['fragments']))
-            fh_out.write(f"{prefix}\t{serogroup}\t{detail['cov']}\t{detail['fragments']}\n")
-    console.print(serogroup_table)
-    print(f"[italic]Serogroup Results written to [deep_sky_blue1]{outfile}.details.tsv[/deep_sky_blue1][/italic]\n")
-
-    # Predicted Serogroup
-    prediction_table = Table(title=f"Predicted Serogroup")
-    prediction_table.add_column("sample", style="white")
-    prediction_table.add_column("serogroup", style="cyan")
-    prediction_table.add_column("coverage", style="green")
-    prediction_table.add_column("fragments", style="green")
-    prediction_table.add_column("comment", style="green")
-    with open(f"{outfile}.tsv", 'wt') as fh_out:
-        prediction_table.add_row(prefix, predicted_serogroup['serogroup'], str(predicted_serogroup['cov']), str(predicted_serogroup['fragments']), predicted_serogroup['comment'])
-        fh_out.write("sample\tserogroup\tcoverage\tfragments\tcomment\n")
-        fh_out.write(f"{prefix}\t{predicted_serogroup['serogroup']}\t{predicted_serogroup['cov']}\t{predicted_serogroup['fragments']}\t{predicted_serogroup['comment']}\n")
-    console.print(prediction_table)
-    print(f"[italic]Predicted serogroup result written to [deep_sky_blue1]{outfile}.tsv[/deep_sky_blue1][/italic]\n")
-
-
-if __name__ == '__main__':
-    if len(sys.argv) == 1:
-        pasty.main(['--help'])
-    elif "--check" in sys.argv:
-        check_dependencies()
-    else:
-        pasty()
+CAML_YAML="${pasty_dir}/../data/pa-osa.yaml" \
+CAML_TARGETS="${pasty_dir}/../data/pa-osa.fasta" \
+    camlhmp-blast-region \
+    "${@:1}"
diff --git a/bin/pasty-bioconda b/bin/pasty-bioconda
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+pasty_dir=$(dirname $0)
+
+CAML_YAML="${pasty_dir}/../share/pasty/pa-osa.yaml" \
+CAML_TARGETS="${pasty_dir}/../share/pasty/pa-osa.fasta" \
+    camlhmp-blast-region \
+    "${@:1}"