From b498f9ea914965e5574f3b4ee174773817c5698a Mon Sep 17 00:00:00 2001
From: Dennis Schmitz <48991783+DennisSchmitz@users.noreply.github.com>
Date: Tue, 13 Feb 2024 14:38:39 +0100
Subject: [PATCH] initial commit
Cleaned up the private repo and committed it to a fresh public repo. The original public repo, versions <2.0, can be found here: https://github.com/DennisSchmitz/Jovian_archive
---
Jovian/Jovian.py | 393 ++++++
Jovian/__init__.py | 6 +
Jovian/functions.py | 200 +++
Jovian/runconfigs.py | 221 ++++
Jovian/samplesheet.py | 29 +
Jovian/update.py | 91 ++
Jovian/workflow/Snakefile | 1104 +++++++++++++++++
Jovian/workflow/directories.py | 49 +
.../envs/ancillary_files/Jovian_report.ipynb | 726 +++++++++++
.../envs/ancillary_files/acknowledgements.md | 44 +
.../workflow/envs/ancillary_files/authors.md | 15 +
.../envs/ancillary_files/default-site.conf | 47 +
.../workflow/envs/ancillary_files/edit.json | 16 +
.../envs/ancillary_files/notebook.json | 31 +
.../envs/ancillary_files/overrides.css | 23 +
.../workflow/envs/ancillary_files/tree.json | 5 +
Jovian/workflow/envs/assembly.def | 19 +
Jovian/workflow/envs/assembly.yaml | 12 +
Jovian/workflow/envs/data_wrangling.def | 19 +
Jovian/workflow/envs/data_wrangling.yaml | 12 +
Jovian/workflow/envs/heatmaps.def | 19 +
Jovian/workflow/envs/heatmaps.yaml | 13 +
Jovian/workflow/envs/jovian_report.def | 66 +
Jovian/workflow/envs/jovian_report.yaml | 22 +
Jovian/workflow/envs/krona.def | 19 +
Jovian/workflow/envs/krona.yaml | 12 +
Jovian/workflow/envs/mgkit_lca.def | 19 +
Jovian/workflow/envs/mgkit_lca.yaml | 12 +
Jovian/workflow/envs/qc_and_clean.def | 19 +
Jovian/workflow/envs/qc_and_clean.yaml | 24 +
.../workflow/envs/scaffold_classification.def | 19 +
.../envs/scaffold_classification.yaml | 9 +
Jovian/workflow/envs/sequence_analysis.def | 19 +
Jovian/workflow/envs/sequence_analysis.yaml | 21 +
Jovian/workflow/files/Jovian_logo.svg | 43 +
Jovian/workflow/files/html/1_header.html | 70 ++
.../files/html/3_tab_explanation.html | 9 +
Jovian/workflow/files/html/5_js_begin.html | 10 +
Jovian/workflow/files/html/7_js_end.html | 76 ++
Jovian/workflow/files/launch_report.sh | 40 +
Jovian/workflow/files/multiqc_config.yaml | 20 +
Jovian/workflow/files/nexteraPE_adapters.fa | 12 +
.../scripts/average_logEvalue_no_lca.py | 61 +
.../workflow/scripts/concat_filtered_vcf.py | 83 ++
.../scripts/concatenate_mapped_read_counts.py | 131 ++
Jovian/workflow/scripts/count_mapped_reads.sh | 32 +
Jovian/workflow/scripts/draw_heatmaps.py | 910 ++++++++++++++
Jovian/workflow/scripts/fqc.sh | 47 +
.../workflow/scripts/html/igvjs_write_divs.sh | 14 +
.../html/igvjs_write_flex_js_middle.sh | 71 ++
.../workflow/scripts/html/igvjs_write_tabs.sh | 12 +
Jovian/workflow/scripts/krona_magnitudes.py | 43 +
Jovian/workflow/scripts/merge_data.py | 204 +++
Jovian/workflow/scripts/quantify_profiles.py | 906 ++++++++++++++
.../workflow/scripts/slurm-cluster-status.py | 71 ++
.../typingtool_EV_XML_to_csv_parser.py | 106 ++
.../typingtool_Flavi_XML_to_csv_parser.py | 94 ++
.../typingtool_HAV_XML_to_csv_parser.py | 87 ++
.../typingtool_HEV_XML_to_csv_parser.py | 88 ++
.../typingtool_NoV_XML_to_csv_parser.py | 116 ++
.../typingtool_PV_XML_to_csv_parser.py | 83 ++
.../typingtool_RVA_XML_to_csv_parser.py | 83 ++
Jovian/workflow/scripts/virus_typing.sh | 277 +++++
LICENSE | 661 ++++++++++
README.md | 303 +++++
env.yaml | 16 +
mamba-env.yml | 16 +
setup.py | 71 ++
68 files changed, 8221 insertions(+)
create mode 100644 Jovian/Jovian.py
create mode 100644 Jovian/__init__.py
create mode 100644 Jovian/functions.py
create mode 100644 Jovian/runconfigs.py
create mode 100644 Jovian/samplesheet.py
create mode 100644 Jovian/update.py
create mode 100644 Jovian/workflow/Snakefile
create mode 100644 Jovian/workflow/directories.py
create mode 100644 Jovian/workflow/envs/ancillary_files/Jovian_report.ipynb
create mode 100644 Jovian/workflow/envs/ancillary_files/acknowledgements.md
create mode 100644 Jovian/workflow/envs/ancillary_files/authors.md
create mode 100644 Jovian/workflow/envs/ancillary_files/default-site.conf
create mode 100644 Jovian/workflow/envs/ancillary_files/edit.json
create mode 100644 Jovian/workflow/envs/ancillary_files/notebook.json
create mode 100644 Jovian/workflow/envs/ancillary_files/overrides.css
create mode 100644 Jovian/workflow/envs/ancillary_files/tree.json
create mode 100644 Jovian/workflow/envs/assembly.def
create mode 100644 Jovian/workflow/envs/assembly.yaml
create mode 100644 Jovian/workflow/envs/data_wrangling.def
create mode 100644 Jovian/workflow/envs/data_wrangling.yaml
create mode 100644 Jovian/workflow/envs/heatmaps.def
create mode 100644 Jovian/workflow/envs/heatmaps.yaml
create mode 100644 Jovian/workflow/envs/jovian_report.def
create mode 100644 Jovian/workflow/envs/jovian_report.yaml
create mode 100644 Jovian/workflow/envs/krona.def
create mode 100644 Jovian/workflow/envs/krona.yaml
create mode 100644 Jovian/workflow/envs/mgkit_lca.def
create mode 100644 Jovian/workflow/envs/mgkit_lca.yaml
create mode 100644 Jovian/workflow/envs/qc_and_clean.def
create mode 100644 Jovian/workflow/envs/qc_and_clean.yaml
create mode 100644 Jovian/workflow/envs/scaffold_classification.def
create mode 100644 Jovian/workflow/envs/scaffold_classification.yaml
create mode 100644 Jovian/workflow/envs/sequence_analysis.def
create mode 100644 Jovian/workflow/envs/sequence_analysis.yaml
create mode 100644 Jovian/workflow/files/Jovian_logo.svg
create mode 100644 Jovian/workflow/files/html/1_header.html
create mode 100644 Jovian/workflow/files/html/3_tab_explanation.html
create mode 100644 Jovian/workflow/files/html/5_js_begin.html
create mode 100644 Jovian/workflow/files/html/7_js_end.html
create mode 100644 Jovian/workflow/files/launch_report.sh
create mode 100644 Jovian/workflow/files/multiqc_config.yaml
create mode 100644 Jovian/workflow/files/nexteraPE_adapters.fa
create mode 100644 Jovian/workflow/scripts/average_logEvalue_no_lca.py
create mode 100644 Jovian/workflow/scripts/concat_filtered_vcf.py
create mode 100644 Jovian/workflow/scripts/concatenate_mapped_read_counts.py
create mode 100644 Jovian/workflow/scripts/count_mapped_reads.sh
create mode 100644 Jovian/workflow/scripts/draw_heatmaps.py
create mode 100644 Jovian/workflow/scripts/fqc.sh
create mode 100644 Jovian/workflow/scripts/html/igvjs_write_divs.sh
create mode 100644 Jovian/workflow/scripts/html/igvjs_write_flex_js_middle.sh
create mode 100644 Jovian/workflow/scripts/html/igvjs_write_tabs.sh
create mode 100644 Jovian/workflow/scripts/krona_magnitudes.py
create mode 100644 Jovian/workflow/scripts/merge_data.py
create mode 100644 Jovian/workflow/scripts/quantify_profiles.py
create mode 100644 Jovian/workflow/scripts/slurm-cluster-status.py
create mode 100644 Jovian/workflow/scripts/typingtool_EV_XML_to_csv_parser.py
create mode 100644 Jovian/workflow/scripts/typingtool_Flavi_XML_to_csv_parser.py
create mode 100644 Jovian/workflow/scripts/typingtool_HAV_XML_to_csv_parser.py
create mode 100644 Jovian/workflow/scripts/typingtool_HEV_XML_to_csv_parser.py
create mode 100644 Jovian/workflow/scripts/typingtool_NoV_XML_to_csv_parser.py
create mode 100644 Jovian/workflow/scripts/typingtool_PV_XML_to_csv_parser.py
create mode 100644 Jovian/workflow/scripts/typingtool_RVA_XML_to_csv_parser.py
create mode 100644 Jovian/workflow/scripts/virus_typing.sh
create mode 100644 LICENSE
create mode 100644 README.md
create mode 100644 env.yaml
create mode 100644 mamba-env.yml
create mode 100644 setup.py
diff --git a/Jovian/Jovian.py b/Jovian/Jovian.py
new file mode 100644
index 0000000..593bd1b
--- /dev/null
+++ b/Jovian/Jovian.py
@@ -0,0 +1,393 @@
+"""
+Starting point of the Jovian metagenomic pipeline and wrapper
+Authors:
+ Dennis Schmitz, Florian Zwagemaker, Sam Nooij, Robert Verhagen,
+ Karim Hajji, Jeroen Cremer, Thierry Janssens, Mark Kroon, Erwin
+ van Wieringen, Harry Vennema, Miranda de Graaf, Annelies Kroneman,
+ Jeroen Laros, Marion Koopmans
+Organization:
+ Rijksinstituut voor Volksgezondheid en Milieu (RIVM)
+ Dutch Public Health institute (https://www.rivm.nl/en)
+ Erasmus Medical Center (EMC) Rotterdam (https://www.erasmusmc.nl/en)
+Departments:
+ RIVM Virology, RIVM Bioinformatics, EMC Viroscience
+Date and license:
+ 2018 - present, AGPL3 (https://www.gnu.org/licenses/agpl-3.0.en.html)
+Homepage containing documentation, examples and changelog:
+ https://github.com/DennisSchmitz/jovian
+Funding:
+ This project/research has received funding from the European Union's
+ Horizon 2020 research and innovation programme under grant agreement
+ No. 643476.
+Automation:
+ iRODS automatically executes this workflow for the "vir-ngs" project
+"""
+
+import argparse
+import multiprocessing
+import os
+import pathlib
+import sys
+
+import snakemake
+import yaml
+
+from Jovian import __home_env_configuration__, __package_name__, __version__
+from Jovian.functions import MyHelpFormatter, color
+from Jovian.runconfigs import WriteConfigs
+from Jovian.samplesheet import WriteSampleSheet
+from Jovian.update import update
+
+yaml.warnings({"YAMLLoadWarning": False})
+
+
+def get_args(givenargs):
+ """
+ Parse the command line arguments
+ """
+
+ def dir_path(arginput):
+ if os.path.isdir(arginput):
+ return arginput
+ print(f'"{arginput}" is not a directory. Exiting...')
+ sys.exit(1)
+
+ def currentpath():
+ return os.getcwd()
+
+ arg = argparse.ArgumentParser(
+ prog=__package_name__,
+ usage="%(prog)s [required arguments] [optional arguments]",
+ description="%(prog)s: a metagenomic analysis workflow for public health and clinics with interactive reports in your web-browser\n\n"
+ + "NB default database paths are hardcoded for RIVM users, otherwise, specify your own database paths using the optional arguments.\n"
+ + "On subsequent invocations of %(prog)s, the database paths will be read from the file located at: "
+ + __home_env_configuration__
+ + " and you will not have to provide them again.\n"
+ + "Similarly, the default RIVM queue is provided as a default value for the '--queuename' flag, but you can override this value if you want to use a different queue.\n",
+ formatter_class=MyHelpFormatter,
+ add_help=False,
+ )
+
+ required_args = arg.add_argument_group("Required arguments")
+ optional_args = arg.add_argument_group("Optional arguments")
+
+ required_args.add_argument(
+ "--input",
+ "-i",
+ type=dir_path,
+ metavar="DIR",
+ help="The input directory containing the raw fastq(.gz) files",
+ required=True,
+ )
+
+ required_args.add_argument(
+ "--output",
+ "-o",
+ metavar="DIR",
+ type=str,
+ default=currentpath(),
+ help="Output directory",
+ required=True,
+ )
+
+ optional_args.add_argument(
+ "--reset-db-paths",
+ action="store_true",
+ help="Reset the database paths to the default values",
+ )
+
+ optional_args.add_argument(
+ "--background",
+ type=str,
+ metavar="File",
+ help="Override the default human genome background path",
+ required=False,
+ )
+
+ optional_args.add_argument(
+ "--blast-db",
+ type=str,
+ metavar="Path",
+ help="Override the default BLAST NT database path",
+ required=False,
+ )
+
+ optional_args.add_argument(
+ "--blast-taxdb",
+ type=str,
+ metavar="Path",
+ help="Override the default BLAST taxonomy database path",
+ required=False,
+ )
+
+ optional_args.add_argument(
+ "--mgkit-db",
+ type=str,
+ metavar="Path",
+ help="Override the default MGKit database path",
+ required=False,
+ )
+
+ optional_args.add_argument(
+ "--krona-db",
+ type=str,
+ metavar="Path",
+ help="Override the default Krona database path",
+ required=False,
+ )
+
+ optional_args.add_argument(
+ "--virus-host-db",
+ type=str,
+ metavar="File",
+ help="Override the default virus-host database path (https://www.genome.jp/virushostdb/)",
+ required=False,
+ )
+
+ optional_args.add_argument(
+ "--new-taxdump-db",
+ type=str,
+ metavar="Path",
+ help="Override the default new taxdump database path",
+ required=False,
+ )
+
+ optional_args.add_argument(
+ "--version",
+ "-v",
+ version=__version__,
+ action="version",
+ help="Show %(prog)s version and exit",
+ )
+
+ optional_args.add_argument(
+ "--help",
+ "-h",
+ action="help",
+ default=argparse.SUPPRESS,
+ help="Show this help message and exit",
+ )
+
+ optional_args.add_argument("--skip-updates", action="store_true", help="Skip the update check")
+
+ optional_args.add_argument(
+ "--local",
+ action="store_true",
+ help="Use %(prog)s locally instead of in a grid-computing configuration",
+ )
+
+ optional_args.add_argument(
+ "--slurm",
+ action="store_true",
+ help="Use SLURM instead of the default DRMAA for grid execution (default: DRMAA)",
+ )
+
+ optional_args.add_argument(
+ "--queuename",
+ default="bio",
+ type=str,
+ metavar="NAME",
+ help="Name of the queue to use for grid execution (default: bio)",
+ )
+
+ optional_args.add_argument(
+ "--conda",
+ action="store_true",
+ help="Use conda environments instead of the default singularity images (default: False)",
+ )
+
+ optional_args.add_argument(
+ "--dryrun",
+ action="store_true",
+ help="Run the %(prog)s workflow without actually doing anything to confirm that the workflow will run as expected",
+ )
+
+ optional_args.add_argument(
+ "--threads",
+ default=min(multiprocessing.cpu_count(), 128),
+ type=int,
+ metavar="N",
+ help=f"Number of local threads that are available to use.\nDefault is the number of available threads in your system ({min(multiprocessing.cpu_count(), 128)})",
+ )
+
+ optional_args.add_argument(
+ "--minphredscore",
+ default=20,
+ type=int,
+ metavar="N",
+ help="Minimum phred score to be used for QC trimming (default: 20)",
+ )
+
+ optional_args.add_argument(
+ "--minreadlength",
+ default=50,
+ type=int,
+ metavar="N",
+ help="Minimum read length to used for QC trimming (default: 50)",
+ )
+
+ optional_args.add_argument(
+ "--mincontiglength",
+ default=250,
+ type=int,
+ metavar="N",
+ help="Minimum contig length to be analysed and included in the final output (default: 250)",
+ )
+
+ if len(givenargs) < 1:
+ print(f"{arg.prog} was called but no arguments were given, please try again \n\tUse '{arg.prog} -h' to see the help document")
+ sys.exit(1)
+ else:
+ flags = arg.parse_args(givenargs)
+ return flags
+
+
+def CheckInputFiles(indir):
+ """
+ Check if the input files are valid fastq files
+ """
+ allowedextensions = [".fastq", ".fq", ".fastq.gz", ".fq.gz"]
+ foundfiles = []
+
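+    # NB: pathlib suffixes are joined, so e.g. "sample.fastq.gz" yields ".fastq.gz"; the check below
+    # passes as soon as at least one file in the input directory carries an allowed extension.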
+ for filenames in os.listdir(indir):
+ extensions = "".join(pathlib.Path(filenames).suffixes)
+ foundfiles.append(extensions)
+
+ return any((i in allowedextensions for i in foundfiles))
+
+
+def main():
+ """
+ Jovian starting point
+ """
+
+ flags = get_args(sys.argv[1:])
+
+ if flags.reset_db_paths:
+ os.remove(__home_env_configuration__)
+ sys.exit(f'Removed "{__home_env_configuration__}", database paths are now reset. Exiting...')
+
+ if not flags.skip_updates:
+ update(sys.argv)
+
+ inpath = os.path.abspath(flags.input)
+ outpath = os.path.abspath(flags.output)
+ exec_folder = os.path.abspath(os.path.dirname(__file__))
+
+ Snakefile = os.path.join(exec_folder, "workflow", "Snakefile")
+
+ if CheckInputFiles(inpath) is False:
+ print(
+ f"""
+{color.RED + color.BOLD}"{inpath}" does not contain any valid FastQ files.{color.END}
+Please check the input directory and try again. Exiting...
+ """
+ )
+ sys.exit(1)
+ else:
+ print(
+ f"""
+{color.GREEN}Valid input files were found in the input directory{color.END} ('{inpath}')
+ """
+ )
+ if not os.path.exists(outpath):
+ os.makedirs(outpath)
+
+ if os.getcwd() != outpath:
+ os.chdir(outpath)
+ workdir = outpath
+
+ samplesheet = WriteSampleSheet(inpath)
+
+ paramfile, _conffile, _paramdict, confdict = WriteConfigs(
+ samplesheet,
+ flags.threads,
+ flags.queuename,
+ os.getcwd(),
+ flags.local,
+ flags.minphredscore,
+ flags.minreadlength,
+ flags.mincontiglength,
+ flags.conda,
+ flags.background,
+ flags.blast_db,
+ flags.blast_taxdb,
+ flags.mgkit_db,
+ flags.krona_db,
+ flags.virus_host_db,
+ flags.new_taxdump_db,
+ flags.dryrun,
+ inpath,
+ )
+
+ # Snakemake command and params for "local" execution
+ if flags.local is True:
+ status = snakemake.snakemake(
+ Snakefile,
+ workdir=workdir,
+ conda_frontend="conda", # TODO had to change frontend from `mamba` to `conda`, for some reason the installation of `Sequence_analysis.yaml` is incompatible with the `mamba` frontend... Works fine in `singularity` though...
+ cores=confdict["cores"],
+ use_conda=confdict["use-conda"],
+ use_singularity=confdict["use-singularity"],
+ singularity_args=confdict["singularity-args"],
+ jobname=confdict["jobname"],
+ latency_wait=confdict["latency-wait"],
+ dryrun=confdict["dryrun"],
+ printshellcmds=confdict["printshellcmds"],
+ printreason=confdict["printreason"],
+ configfiles=[paramfile],
+ restart_times=3,
+ )
+ # Snakemake command and params for "grid" execution
+ if flags.local is False and flags.slurm is False:
+ status = snakemake.snakemake(
+ Snakefile,
+ workdir=workdir,
+ conda_frontend="conda", # TODO had to change frontend from `mamba` to `conda`, for some reason the installation of `Sequence_analysis.yaml` is incompatible with the `mamba` frontend... Works fine in `singularity` though...
+ cores=confdict["cores"],
+ nodes=confdict["cores"],
+ use_conda=confdict["use-conda"],
+ use_singularity=confdict["use-singularity"],
+ singularity_args=confdict["singularity-args"],
+ jobname=confdict["jobname"],
+ latency_wait=confdict["latency-wait"],
+ drmaa=confdict["drmaa"],
+ drmaa_log_dir=confdict["drmaa-log-dir"],
+ dryrun=confdict["dryrun"],
+ printshellcmds=confdict["printshellcmds"],
+ printreason=confdict["printreason"],
+ configfiles=[paramfile],
+ restart_times=3,
+ )
+
+ # Snakemake command and params for "grid" execution but using SLURM instead of DRMAA
+ if flags.local is False and flags.slurm is True:
+ status = snakemake.snakemake(
+ Snakefile,
+ workdir=workdir,
+ conda_frontend="conda", # TODO had to change frontend from `mamba` to `conda`, for some reason the installation of `Sequence_analysis.yaml` is incompatible with the `mamba` frontend... Works fine in `singularity` though...
+ cores=confdict["cores"],
+ nodes=confdict["cores"],
+ use_conda=confdict["use-conda"],
+ use_singularity=confdict["use-singularity"],
+ singularity_args=confdict["singularity-args"],
+ jobname=confdict["jobname"],
+ latency_wait=confdict["latency-wait"],
+ cluster=confdict["cluster"],
+ cluster_status=confdict["cluster-status"],
+ dryrun=confdict["dryrun"],
+ printshellcmds=confdict["printshellcmds"],
+ printreason=confdict["printreason"],
+ configfiles=[paramfile],
+ restart_times=3,
+ )
+
+ # Snakemake command for making the snakemake report only
+ if confdict["dryrun"] is False and status is True:
+ snakemake.snakemake(
+ Snakefile,
+ workdir=workdir,
+ report="results/snakemake_report.html",
+ configfiles=[paramfile],
+ quiet=True,
+ )
diff --git a/Jovian/__init__.py b/Jovian/__init__.py
new file mode 100644
index 0000000..dc1e52b
--- /dev/null
+++ b/Jovian/__init__.py
@@ -0,0 +1,6 @@
+import os
+
+__version__ = "0.1.0"
+__package_name__ = "Jovian"
+__package_dir__ = os.path.dirname(os.path.abspath(__file__))
+__home_env_configuration__ = os.path.join(os.path.expanduser("~"), ".jovian_env.yaml")
diff --git a/Jovian/functions.py b/Jovian/functions.py
new file mode 100644
index 0000000..735a003
--- /dev/null
+++ b/Jovian/functions.py
@@ -0,0 +1,200 @@
+# pylint: disable=C0103,C0301
+
+"""
+Basic functions for various uses throughout Jovian
+"""
+
+import argparse
+import glob
+import os
+import readline
+import shutil
+
+from Jovian import __package_dir__
+
+
+class MyHelpFormatter(argparse.RawTextHelpFormatter):
+ """
+ This is a custom formatter class for argparse. It allows for some custom formatting,
+ in particular for the help texts with multiple options (like bridging mode and verbosity level).
+ http://stackoverflow.com/questions/3853722
+ """
+
+ def __init__(self, prog):
+ terminal_width = shutil.get_terminal_size().columns
+ os.environ["COLUMNS"] = str(terminal_width)
+ max_help_position = min(max(24, terminal_width // 2), 80)
+ super().__init__(prog, max_help_position=max_help_position)
+
+ def _get_help_string(self, action):
+ help_text = action.help
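+        # Append "(default: ...)" only when the help text does not already mention a default, so it is never printed twice.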
+ if action.default != argparse.SUPPRESS and "default" not in help_text.lower() and action.default is not None:
+ help_text += f" (default: {str(action.default)})"
+ return help_text
+
+
+class color:
+ """
+ define basic colors to use in the terminal
+ """
+
+ PURPLE = "\033[95m"
+ CYAN = "\033[96m"
+ DARKCYAN = "\033[36m"
+ BLUE = "\033[94m"
+ GREEN = "\033[92m"
+ YELLOW = "\033[93m"
+ RED = "\033[91m"
+ BOLD = "\033[1m"
+ UNDERLINE = "\033[4m"
+ END = "\033[0m"
+
+
+# tabCompleter Class taken from https://gist.github.com/iamatypeofwalrus/5637895
+## this was intended for Python 2's raw_input() function, which no longer exists in Python 3
+## it also seems to work for the new input() function, so that is fine
+#! use the gnureadline module instead of readline module
+##! the 'normal' readline module causes a bug with memory pointers
+##! --> https://stackoverflow.com/questions/43013060/python-3-6-1-crashed-after-i-installed-readline-module
+class tabCompleter:
+ """
+ A tab completer that can either complete from
+ the filesystem or from a list.
+
+ Partially taken from:
+ http://stackoverflow.com/questions/5637124/tab-completion-in-pythons-raw-input
+ """
+
+ def pathCompleter(self, text, state):
+ """
+ This is the tab completer for systems paths.
+ Only tested on *nix systems
+ """
+ line = readline.get_line_buffer().split()
+
+ # replace ~ with the user's home dir. See https://docs.python.org/2/library/os.path.html
+ if "~" in text:
+ text = os.path.expanduser("~")
+
+        # autocomplete directories by appending a trailing slash
+ if os.path.isdir(text):
+ text += "/"
+
+ return list(glob.glob(f"{text}*"))[state]
+
+ def createListCompleter(self, ll):
+ """
+ This is a closure that creates a method that autocompletes from
+ the given list.
+
+ Since the autocomplete function can't be given a list to complete from
+ a closure is used to create the listCompleter function with a list to complete
+ from.
+ """
+
+ def listCompleter(text, state):
+ line = readline.get_line_buffer()
+
+ if not line:
+ return [f"{c} " for c in ll][state]
+
+ return [f"{c} " for c in ll if c.startswith(line)][state]
+
+ self.listCompleter = listCompleter
+
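+# Minimal usage sketch (comments only; assumes an interactive *nix terminal), mirroring how
+# update.AskPrompts() wires the list completer:
+#   t = tabCompleter()
+#   t.createListCompleter(["yes", "no"])
+#   readline.set_completer_delims("\t")
+#   readline.parse_and_bind("tab: complete")
+#   readline.set_completer(t.listCompleter)  # or t.pathCompleter to complete filesystem paths
+#   reply = input("Choose: ")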
+
+def get_max_local_mem():
+ mem = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")
+ return int(round(mem / (1024.0**2) - 2000, -3))
+
+
+class DefaultConfig:
+ """
+ This class is the wrapper class for the default config dictionary
+ """
+
+ config = {
+ "local": {
+ "cores": 12,
+ "latency-wait": 60,
+ "use-conda": False,
+ "use-singularity": True,
+ "singularity-args": "",
+ "dryrun": False,
+ "printshellcmds": False, # ? For debugging only
+ "printreason": False, # ? For debugging only
+ "jobname": "Jovian_{name}.{jobid}",
+ },
+ "grid": {
+ "cores": 300,
+ "latency-wait": 60,
+ "use-conda": False,
+ "use-singularity": True,
+ "singularity-args": "",
+ "dryrun": False,
+ "printshellcmds": False, # ? For debugging only
+ "printreason": False, # ? For debugging only
+ "jobname": "Jovian_{name}.{jobid}",
+ "drmaa": ' -q PLACEHOLDER -n {threads} -R "span[hosts=1]" -M {resources.mem_mb}', # ? PLACEHOLDER will be replaced by queuename supplied in CLI; default = "bio"
+ "drmaa-log-dir": "logs/drmaa",
+ "cluster": "sbatch -p PLACEHOLDER --parsable -N1 -n1 -c{threads} --mem={resources.mem_mb} -D . -o logs/SLURM/Jovian_{name}-{jobid}.out -e logs/SLURM/Jovian_{name}-{jobid}.err", # ? PLACEHOLDER will be replaced by queuename supplied in CLI; default = "bio"
+ "cluster-status": f"{__package_dir__}/workflow/scripts/slurm-cluster-status.py",
+ },
+ }
+ params = {
+ "sample_sheet": "samplesheet.yaml",
+ "threads": {
+ "Alignments": 12,
+ "Filter": 6,
+ "Assemble": 14,
+ "MultiQC": 1,
+ "align_to_scaffolds_RmDup_FragLength": 4,
+ "SNP_calling": 12,
+ "ORF_analysis": 1,
+ "Contig_metrics": 1,
+ "GC_content": 1,
+ "Scaffold_classification": 12,
+ "mgkit_lca": 1,
+ "data_wrangling": 1,
+ "krona": 1,
+ },
+ "computing_execution": "grid",
+ "use_singularity_or_conda": "use_singularity",
+ "max_local_mem": get_max_local_mem(),
+ "QC": {
+ "min_phred_score": 20, # ? this is overwritten by the value supplied in the wrapper CLI
+ "window_size": 5,
+ "min_read_length": 50, # ? this is overwritten by the value supplied in the wrapper CLI
+ },
+ "Assembly": {
+ "min_contig_len": 250, # ? this is overwritten by the value supplied in the wrapper CLI
+ "kmersizes": "21,33,55,77",
+ },
+ "db": { # ? These are set either by the defaults listed below or the user-specified path, see WriteConfigs()
+ "background": "",
+ "blast_nt": "",
+ "blast_taxdb": "",
+ "mgkit_db": "",
+ "krona_db": "",
+ "virus_host_db": "",
+ "new_taxdump_db": "",
+ },
+ "db_defaults_local": {
+ "background": "/mnt/db/Jov2/HuGo_GRCh38_NoDecoy_NoEBV/genome.fa",
+ "blast_nt": "/mnt/db/Jov2/NT_database/nt",
+ "blast_taxdb": "/mnt/db/Jov2/taxdb/",
+ "mgkit_db": "/mnt/db/Jov2/mgkit_db/",
+ "krona_db": "/mnt/db/Jov2/krona_db/",
+ "virus_host_db": "/mnt/db/Jov2/virus_host_db/virushostdb.tsv",
+ "new_taxdump_db": "/mnt/db/Jov2/new_taxdump/",
+ },
+ "db_defaults_grid": {
+ "background": "/data/BioGrid/schmitzd/20220428_databases_jov2/HuGo_GRCh38_NoDecoy_NoEBV/genome.fa",
+ "blast_nt": "/data/BioGrid/schmitzd/20220428_databases_jov2/NT_database/nt",
+ "blast_taxdb": "/data/BioGrid/schmitzd/20220428_databases_jov2/taxdb/",
+ "mgkit_db": "/data/BioGrid/schmitzd/20220428_databases_jov2/mgkit_db/",
+ "krona_db": "/data/BioGrid/schmitzd/20220428_databases_jov2/krona_db/",
+ "virus_host_db": "/data/BioGrid/schmitzd/20220428_databases_jov2/virus_host_db/virushostdb.tsv",
+ "new_taxdump_db": "/data/BioGrid/schmitzd/20220428_databases_jov2/new_taxdump/",
+ },
+ }
diff --git a/Jovian/runconfigs.py b/Jovian/runconfigs.py
new file mode 100644
index 0000000..fb886a2
--- /dev/null
+++ b/Jovian/runconfigs.py
@@ -0,0 +1,221 @@
+"""
+Construct and write configuration files for Jovian.
+"""
+
+import multiprocessing
+import os
+import sys
+
+import yaml
+
+from Jovian import __home_env_configuration__
+
+from .functions import DefaultConfig
+
+
+def set_cores(cores: int) -> int:
+ """
+    Cap the requested number of cores at the machine's total minus two, reserving two threads for overhead
+ """
+ max_available = multiprocessing.cpu_count()
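+    # e.g. on a 16-core machine: set_cores(300) -> 14, while set_cores(8) -> 8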
+ return max_available - 2 if cores >= max_available else cores
+
+
+def user_supplied_db_paths(background, blast_nt, blast_taxdb, mgkit_db, krona_db, virus_host_db, new_taxdump_db) -> None:
+ """
+ If user-supplied database paths are given, set them in DefaultConfig.params["db"]KEY
+
+ Inputs of this function are the database paths as provided to the wrapper by the user.
+ """
+ #! When adding/removing database flags/path also update check_validity_db_paths()
+ if background is not None:
+ DefaultConfig.params["db"]["background"] = background
+ if blast_nt is not None:
+ DefaultConfig.params["db"]["blast_nt"] = blast_nt
+ if blast_taxdb is not None:
+ DefaultConfig.params["db"]["blast_taxdb"] = blast_taxdb
+ if mgkit_db is not None:
+ DefaultConfig.params["db"]["mgkit_db"] = mgkit_db
+ if krona_db is not None:
+ DefaultConfig.params["db"]["krona_db"] = krona_db
+ if virus_host_db is not None:
+ DefaultConfig.params["db"]["virus_host_db"] = virus_host_db
+ if new_taxdump_db is not None:
+ DefaultConfig.params["db"]["new_taxdump_db"] = new_taxdump_db
+
+
+def write_user_supplied_db_paths_to_home_dir_env() -> None:
+ """
+ DefaultConfig.params["db"] is non-empty if the user supplied database paths, so, store non-empty
+ values permanently in a hidden configuration YAML file in the user's HOME so they do not have to
+ supply it each time they run the wrapper.
+ """
+ db_paths = {key: value for key, value in DefaultConfig.params["db"].items() if value}
+ with open(__home_env_configuration__, "w", encoding="utf-8") as yaml_file:
+ yaml.dump(db_paths, yaml_file, default_flow_style=False)
+
+
+def set_db_paths(local: bool) -> None:
+ """
+    If no user-supplied database paths are given (user_supplied_db_paths()) nor have they been
+ stored in the config file in the user's home-dir (write_user_supplied_db_paths_to_home_dir_env()),
+ set default paths for local or grid-computation from DefaultConfig.params["db_defaults_local"]
+ and DefaultConfig.params["db_defaults_grid"], respectively. NB defaults are configured for usage
+ with default RIVM paths.
+
+ local = boolean, was `--local` flag invoked or not, i.e. local or grid execution
+ """
+ if os.path.exists(__home_env_configuration__):
+ with open(__home_env_configuration__, "r", encoding="utf-8") as yaml_file:
+ previously_stored_db_paths = yaml.safe_load(yaml_file)
+
+        # ? If db-paths were previously stored, update the DefaultConfig.params["db"] dictionary with these paths; since they are then non-empty, they are not overwritten with the RIVM's default paths downstream
+ if previously_stored_db_paths and isinstance(previously_stored_db_paths, dict):
+ for key, value in previously_stored_db_paths.items():
+ if key in DefaultConfig.params["db"]:
+ DefaultConfig.params["db"][key] = value
+
+ for key in DefaultConfig.params["db"].keys():
+ if not DefaultConfig.params["db"][key]:
+ if local: # ? Set local compute default paths if empty, i.e. no user-supplied path
+ DefaultConfig.params["db"][key] = DefaultConfig.params["db_defaults_local"][key]
+ else: # ? Set grid compute default paths if empty, i.e. no user-supplied path
+ DefaultConfig.params["db"][key] = DefaultConfig.params["db_defaults_grid"][key]
+
+
+def check_validity_db_paths() -> None:
+ """
+    Check that all database files and root folders exist. Exit if any are missing.
+ """
+ #! When adding/removing database flags/path also update user_supplied_db_paths()
+ try:
+ for filename in [DefaultConfig.params["db"]["background"], DefaultConfig.params["db"]["virus_host_db"]]:
+ if not os.path.exists(filename):
+ raise FileNotFoundError(filename)
+ for filepath in [
+ DefaultConfig.params["db"]["blast_nt"],
+ DefaultConfig.params["db"]["blast_taxdb"],
+ DefaultConfig.params["db"]["krona_db"],
+ DefaultConfig.params["db"]["new_taxdump_db"],
+ DefaultConfig.params["db"]["mgkit_db"],
+ ]:
+ if not os.path.exists(os.path.dirname(filepath)):
+ raise FileNotFoundError(filepath)
+ except FileNotFoundError as error_message:
+ sys.exit(f"This file or folder is not available or accessible: {error_message}\nExiting...")
+
+
+def WriteConfigs(
+ samplesheet,
+ cores,
+ queuename,
+ cwd,
+ local,
+ minphredscore,
+ minreadlength,
+ mincontiglength,
+ conda,
+ background,
+ blast_nt,
+ blast_taxdb,
+ mgkit_db,
+ krona_db,
+ virus_host_db,
+ new_taxdump_db,
+ dryrun,
+ inpath,
+):
+ """
+    Write the config files needed for proper functionality. Database paths,
+    singularity binds, --local mode core-counts, etc. are all set here.
+ """
+
+ def use_conda(configuration: dict, parameter: dict) -> None:
+ "Use conda instead of Singularity; not recommended, only for debug purposes"
+ configuration["use-conda"] = True
+ configuration["use-singularity"] = False
+ parameter["use_singularity_or_conda"] = "use_conda"
+
+ def update_queuename(configuration: dict, queuename: str) -> None:
+ "Update the queuename in the configuration file with the user-supplied queuename"
+
+ def update_queuename(key: str) -> None:
+ value = configuration[key]
+ value = value.replace("PLACEHOLDER", queuename)
+ configuration[key] = value
+
+ update_queuename("drmaa")
+ update_queuename("cluster")
+
+ def set_no_threads_local_mode(configuration: dict, parameter: dict) -> None:
+ "Update the threads used per rule based on system specs. NB this is only needed for --local mode"
+ configuration["cores"] = set_cores(cores)
+ parameter["computing_execution"] = "local"
+
+ def setup_singularity_mountpoints() -> str:
+ "Setup the singularity mount-points and returns this as a _single_ big string used to update config file"
+ # ? Bind the necessary folders to the singularity containers. Including Jovian's scripts/ and files/ folders, but also the input directory and reference basepath supplied by the user through the wrapper.
+        # ? The line below makes an anchor; below this location you'll find ./workflow/scripts/ and ./workflow/files/ as installed by pip. Anchor to: `**/conda/envs/[PACKAGE_NAME]/lib/python3.7/site-packages/[PACKAGE_NAME]/`
+ exec_folder = os.path.abspath(os.path.dirname(__file__))
+ scripts_folder = os.path.join(exec_folder, "workflow/", "scripts/")
+ files_folder = os.path.join(exec_folder, "workflow/", "files/")
+
+ # ! This is a single big string split over multiple lines, not a list
+ singularity_mount_points = (
+ f"--bind {scripts_folder}:/Jovian/scripts --bind {files_folder}:/Jovian/files --bind {inpath}:{inpath} "
+ f"--bind {os.path.dirname(DefaultConfig.params['db']['background'])}:{os.path.dirname(DefaultConfig.params['db']['background'])} "
+ f"--bind {os.path.dirname(DefaultConfig.params['db']['blast_nt'])}:{os.path.dirname(DefaultConfig.params['db']['blast_nt'])} "
+ f"--bind {os.path.dirname(DefaultConfig.params['db']['blast_taxdb'])}:{os.path.dirname(DefaultConfig.params['db']['blast_taxdb'])} "
+ f"--bind {os.path.dirname(DefaultConfig.params['db']['mgkit_db'])}:{os.path.dirname(DefaultConfig.params['db']['mgkit_db'])} "
+ f"--bind {os.path.dirname(DefaultConfig.params['db']['krona_db'])}:{os.path.dirname(DefaultConfig.params['db']['krona_db'])} "
+ f"--bind {os.path.dirname(DefaultConfig.params['db']['virus_host_db'])}:{os.path.dirname(DefaultConfig.params['db']['virus_host_db'])} "
+ f"--bind {os.path.dirname(DefaultConfig.params['db']['new_taxdump_db'])}:{os.path.dirname(DefaultConfig.params['db']['new_taxdump_db'])}"
+ )
+ return singularity_mount_points
+
+ # ? Make folder to store config and param files
+ if not os.path.exists(f"{cwd}/config"):
+ os.makedirs(f"{cwd}/config")
+
+ # ! Below, update the parameters. I.e. values that are used by the Snakefile/rules itself
+ parameter_dict = DefaultConfig.params # ? Load default params, will be updated downstream
+ parameter_dict["QC"]["min_phred_score"] = minphredscore # ? Based on user supplied value
+ parameter_dict["QC"]["min_read_length"] = minreadlength # ? Based on user supplied value
+ parameter_dict["Assembly"]["min_contig_len"] = mincontiglength # ? Based on user supplied value
+ # ? set proper database paths, if none are given by the user, set default paths based on grid or local compute mode
+ user_supplied_db_paths(background, blast_nt, blast_taxdb, mgkit_db, krona_db, virus_host_db, new_taxdump_db)
+ write_user_supplied_db_paths_to_home_dir_env()
+ set_db_paths(local)
+ check_validity_db_paths()
+
+ # ! Below, update the configurations. I.e. values that are used by the Snakemake engine/wrapper itself
+ singularity_mount_points = setup_singularity_mountpoints() # ? setup singularity mountpoints based on installation location of this package
+
+ # ! Load the configs specific to either local execution or grid execution
+ if local is True:
+ configuration_dict = DefaultConfig.config["local"]
+ # ? The ` --bind /run/shm:/run/shm` addition is to make the Py multiprocessing of `lofreq` work, it requires write permissions to /run/shm. See similar issue here https://github.com/nipreps/fmriprep/issues/780
+ DefaultConfig.config["local"]["singularity-args"] = f"{singularity_mount_points} --bind /run/shm:/run/shm"
+ set_no_threads_local_mode(configuration_dict, parameter_dict)
+ else: # ! I.e. it will run in "grid" mode
+ configuration_dict = DefaultConfig.config["grid"]
+ update_queuename(configuration_dict, queuename)
+ DefaultConfig.config["grid"]["singularity-args"] = singularity_mount_points
+
+    # ! Set configuration values that are independent of whether it's local or grid execution
+ parameter_dict["sample_sheet"] = samplesheet # ? Load sample_sheet
+ if dryrun is True:
+ configuration_dict["dryrun"] = True
+ if conda is True:
+ use_conda(configuration_dict, parameter_dict)
+
+ # ! Write final config and params yaml files for audit-trail
+ parameter_file_path = f"{cwd}/config/params.yaml"
+ configuration_file_path = f"{cwd}/config/config.yaml"
+ with open(parameter_file_path, "w", encoding="utf-8") as outfile:
+ yaml.dump(parameter_dict, outfile, default_flow_style=False)
+ with open(configuration_file_path, "w", encoding="utf-8") as outfile:
+ yaml.dump(configuration_dict, outfile, default_flow_style=False)
+
+ return parameter_file_path, configuration_file_path, parameter_dict, configuration_dict
diff --git a/Jovian/samplesheet.py b/Jovian/samplesheet.py
new file mode 100644
index 0000000..f3b5d32
--- /dev/null
+++ b/Jovian/samplesheet.py
@@ -0,0 +1,29 @@
+"""
+Write the samplesheets
+"""
+
+import os
+import re
+
+import yaml
+
+
+def illumina_sheet(inputdir, sheet):
+ illuminapattern = re.compile(r"(.*)(_|\.)R?(1|2)(?:_.*\.|\..*\.|\.)f(ast)?q(\.gz)?")
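+    # e.g. "sampleA_R1.fastq.gz" / "sampleA_R2.fastq.gz" match with group(1)="sampleA" and group(3)="1"/"2",
+    # yielding an entry like {sampleA: {R1: <path>, R2: <path>}} in the samplesheet YAML below.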
+ samples = {}
+ for dirname, subdir, filename in os.walk(inputdir):
+ for files in filename:
+ fullpath = os.path.join(dirname, files)
+ match = illuminapattern.fullmatch(files)
+ if match:
+ sample = samples.setdefault(match.group(1), {})
+ sample["R{}".format(match.group(3))] = str(fullpath)
+ with open(sheet, "w") as samplesheet:
+ yaml.dump(samples, samplesheet, default_flow_style=False)
+
+
+def WriteSampleSheet(inputdir):
+ illumina_sheet(inputdir, "samplesheet.yaml")
+ samplesheet = os.getcwd() + "/samplesheet.yaml"
+ return samplesheet
diff --git a/Jovian/update.py b/Jovian/update.py
new file mode 100644
index 0000000..32d51a2
--- /dev/null
+++ b/Jovian/update.py
@@ -0,0 +1,91 @@
+import json
+import readline
+import subprocess
+import sys
+from distutils.version import LooseVersion
+from urllib import request
+
+from Jovian import __version__
+
+from .functions import color, tabCompleter
+
+
+def AskPrompts(intro, prompt, options, fixedchoices=False):
+ if fixedchoices is True:
+ completer = tabCompleter()
+ completer.createListCompleter(options)
+
+ readline.set_completer_delims("\t")
+ readline.parse_and_bind("tab: complete")
+ readline.set_completer(completer.listCompleter)
+
+ subprocess.call("/bin/clear", shell=False)
+ print(intro)
+ while "the answer is invalid":
+ if fixedchoices is True:
+ reply = input(prompt).lower().strip()
+ if reply in options:
+ return reply
+ if reply == "quit":
+ print("Quitting...")
+ sys.exit(-1)
+ else:
+ print("The given answer was invalid. Please choose one of the available options\n")
+ if fixedchoices is False:
+ reply = input(prompt).strip()
+ if reply == "quit":
+ sys.exit(-1)
+ else:
+ return reply
+
+
+def update(sysargs):
+
+ try:
+ latest_release = request.urlopen("https://api.github.com/repos/DennisSchmitz/jovian/releases/latest")
+ except Exception as e:
+ sys.stderr.write("Unable to connect to GitHub API\n" f"{e}")
+ return
+
+ latest_release = json.loads(latest_release.read().decode("utf-8"))
+
+ latest_release_tag = latest_release["tag_name"]
+ latest_release_tag_tidied = LooseVersion(latest_release["tag_name"].lstrip("v").strip())
+
+ localversion = LooseVersion(__version__)
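+    # e.g. LooseVersion("0.1.0") < LooseVersion("2.0.1") evaluates to True, which triggers the update prompt below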
+
+ if localversion < latest_release_tag_tidied:
+ if (
+ AskPrompts(
+ f"""
+There's a new version of Jovian available.
+Current version: {color.RED + color.BOLD}{'v' + __version__}{color.END}
+Latest version: {color.GREEN + color.BOLD}{latest_release_tag}{color.END}\n""",
+ f"""Do you want to update? [yes/no] """,
+ ["yes", "no"],
+ fixedchoices=True,
+ )
+ == "yes"
+ ):
+ subprocess.run(
+ [
+ sys.executable,
+ "-m",
+ "pip",
+ "install",
+ "--upgrade",
+ f"git+https://github.com/DennisSchmitz/jovian@{latest_release_tag}",
+ ],
+ check=True,
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ )
+
+ print(f"Jovian updated to {color.YELLOW + color.BOLD}{latest_release_tag}{color.END}")
+
+ subprocess.run(sysargs)
+ sys.exit(0)
+ print(f"Skipping update to version {latest_release_tag}")
+ print(f"Continuing...")
+ return
+ return
diff --git a/Jovian/workflow/Snakefile b/Jovian/workflow/Snakefile
new file mode 100644
index 0000000..73a798b
--- /dev/null
+++ b/Jovian/workflow/Snakefile
@@ -0,0 +1,1104 @@
+"""
+Starting point of the Jovian metagenomic pipeline and wrapper
+Authors:
+ Dennis Schmitz, Florian Zwagemaker, Sam Nooij, Robert Verhagen,
+ Karim Hajji, Jeroen Cremer, Thierry Janssens, Mark Kroon, Erwin
+ van Wieringen, Harry Vennema, Miranda de Graaf, Annelies Kroneman,
+ Jeroen Laros, Marion Koopmans
+Organization:
+ Rijksinstituut voor Volksgezondheid en Milieu (RIVM)
+ Dutch Public Health institute (https://www.rivm.nl/en)
+ Erasmus Medical Center (EMC) Rotterdam (https://www.erasmusmc.nl/en)
+Departments:
+ RIVM Virology, RIVM Bioinformatics, EMC Viroscience
+Date and license:
+ 2018 - present, AGPL3 (https://www.gnu.org/licenses/agpl-3.0.en.html)
+Homepage containing documentation, examples and changelog:
+ https://github.com/DennisSchmitz/jovian
+Funding:
+ This project/research has received funding from the European Union's
+ Horizon 2020 research and innovation programme under grant agreement
+ No. 643476.
+Automation:
+ iRODS automatically executes this workflow for the "vir-ngs" project
+"""
+
+import pprint
+import yaml
+import os
+import sys
+import json
+from directories import *
+import snakemake
+
+snakemake.utils.min_version("6.0")
+
+yaml.warnings({'YAMLLoadWarning': False})
+shell.executable('/bin/bash')
+
+SAMPLES = {}
+
+with open("samplesheet.yaml") as sheetfile:
+ SAMPLES = yaml.safe_load(sheetfile)
+
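+# Each rule's mem_mb scales with the Snakemake retry attempt: attempt * threads * <GB per thread> * 1000 MB,
+# with roughly 1/2/4/7 GB per thread for low/medium/high/very-high memory jobs; on local execution the value
+# is additionally capped at the machine's usable RAM (config['max_local_mem']).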
+def low_memory_job(wildcards, threads, attempt):
+ if config['computing_execution'] == 'local':
+ return min(attempt * threads * 1 * 1000, config['max_local_mem'])
+ return attempt * threads * 1 * 1000
+
+def medium_memory_job(wildcards, threads, attempt):
+ if config['computing_execution'] == 'local':
+ return min(attempt * threads * 2 * 1000, config['max_local_mem'])
+ return attempt * threads * 2 * 1000
+
+def high_memory_job(wildcards, threads, attempt):
+ if config['computing_execution'] == 'local':
+ return min(attempt * threads * 4 * 1000, config['max_local_mem'])
+ return attempt * threads * 4 * 1000
+
+def very_high_memory_job(wildcards, threads, attempt):
+ if config['computing_execution'] == 'local':
+ return min(attempt * threads * 4 * 1.75 * 1000, config['max_local_mem'])
+ return attempt * threads * 4 * 1.75 * 1000
+
+
+# low_runtime_min = 60 # ? Scheduler sends jobs with <= 1h runtime to the 6 additional nodes
+# high_runtime_min = 3000 # ? Little over two days
+
+
+localrules:
+ all,
+ Copy_scaffolds,
+ concat_files,
+ concat_filtered_SNPs,
+ HTML_IGVjs_variable_parts,
+ HTML_IGVjs_final
+
+
+rule all:
+ input:
+ f"{res}multiqc.html",
+ expand("{p}{sample}_scaffolds.fasta", p = f"{res+scf}", sample = SAMPLES),
+ expand("{p}{sample}_{ext}", p = f"{datadir + asm + filt}", sample = SAMPLES,
+ ext = ["sorted.bam", "sorted.bam.bai", "sorted_MarkDup-metrics.txt", "insert_size_metrics.txt", "insert_size_histogram.pdf"]),
+ expand("{p}{sample}_{ext}", p = f"{datadir + asm + filt}", sample = SAMPLES,
+ ext = [f"scaffolds_filtered-ge{config['Assembly']['min_contig_len']}.fasta.fai", "scaffolds_raw.vcf",
+ "scaffolds_AF5pct-filt.vcf", "scaffolds_AF5pct-filt.vcf.gz", "scaffolds_AF5pct-filt.vcf.gz.tbi"]),
+ expand("{p}{sample}_{ext}", p = f"{datadir + asm + filt}", sample = SAMPLES,
+ ext = ["ORF_AA.fa", "ORF_NT.fa", "annotation.gff", "annotation.gff.gz", "annotation.gff.gz.tbi", "contig_ORF_count_list.txt"]),
+ expand("{p}{sample}_{ext}", p = f"{datadir + asm + filt}", sample = SAMPLES,
+ ext = ["MinLenFiltSummary.stats", "perMinLenFiltScaffold.stats", "perORFcoverage.stats"]),
+ expand("{p}{sample}_GC.bedgraph", p = f"{datadir + asm + filt}", sample = SAMPLES),
+ f"{res}" + "igv.html",
+ expand("{p}{sample}.blastn", p = f"{datadir + scf_classified}", sample = SAMPLES),
+ expand("{p}{sample}{ext}", p = f"{datadir + scf_classified}", sample = SAMPLES,
+ ext = ["_lca_raw.gff", "_lca_tax.gff", "_lca_taxfilt.gff", "_lca_filt.gff", "_nolca_filt.gff", ".taxtab", ".taxMagtab"]),
+ f"{res}" + "krona.html",
+ expand("{p}Mapped_read_counts-{sample}.tsv", p = f"{res + cnt}", sample = SAMPLES),
+ f"{res + cnt}" + "Mapped_read_counts.tsv",
+ expand("{p}all_{ext}.tsv", p = f"{res}", ext = ["taxClassified", "taxUnclassified", "virusHost", "filtered_SNPs", "noLCA"]),
+ expand("{p}{file}", p = f"{res}", file = ["profile_read_counts.csv", "profile_read_percentages.csv", "Sample_composition_graph.html", "Superkingdoms_quantities_per_sample.csv"]),
+ expand("{p}{file}", p = f"{res}", file = ["Taxonomic_rank_statistics.tsv", "Virus_rank_statistics.tsv", "Phage_rank_statistics.tsv", "Bacteria_rank_statistics.tsv"]),
+ expand("{p}{file}", p = f"{res + hmap}", file = ["Superkingdoms_heatmap.html", "Virus_heatmap.html", "Phage_heatmap.html", "Bacteria_heatmap.html"])
+
+
+onstart:
+ try:
+ print("Checking if all specified files are accessible...")
+ for filename in [
+ config['db']['background'],
+ config['db']['virus_host_db']
+ ]:
+ if not os.path.exists(filename):
+ raise FileNotFoundError(filename)
+ for filepath in [
+ config['db']['blast_nt'],
+ config['db']['blast_taxdb'],
+ config['db']['krona_db'],
+ config['db']['new_taxdump_db'],
+ config['db']['mgkit_db']
+ ]:
+ if not os.path.exists(os.path.dirname(filepath)):
+ raise FileNotFoundError(filepath)
+ except FileNotFoundError as e:
+ print("This file is not available or accessible: %s" % e)
+ sys.exit(1)
+ else:
+ print("\tAll specified files are present!")
+ shell("""
+ mkdir -p results
+ echo -e "\nLogging pipeline settings..."
+ echo -e "\tGenerating methodological hash (fingerprint)..."
+ echo -e "This is the link to the code used for this analysis:\thttps://github.com/DennisSchmitz/jovian/tree/$(git log -n 1 --pretty=format:"%H")" > results/log_git.txt
+ echo -e "This code with unique fingerprint $(git log -n1 --pretty=format:"%H") was committed by $(git log -n1 --pretty=format:"%an <%ae>") at $(git log -n1 --pretty=format:"%ad")" >> results/log_git.txt
+ echo -e "\tGenerating full software list of current Conda environment..."
+ conda list > results/log_conda.txt
+ echo -e "\tGenerating used databases log..."
+ echo -e "==> User-specified background reference (default: Homo Sapiens NCBI GRch38 NO DECOY genome): <==\n$(ls -lah {config[db][background]}*)\n" > results/log_db.txt
+ echo -e "\n==> NCBI BLAST database: <==\n$(ls -lah {config[db][blast_nt]}*)\n" >> results/log_db.txt
+ echo -e "\n==> NCBI taxonomy database: <==\n$(ls -lah {config[db][blast_taxdb]}*)\n" >> results/log_db.txt
+ echo -e "\n==> Virus-Host Interaction Database: <==\n$(ls -lah {config[db][virus_host_db]}*)\n" >> results/log_db.txt
+ echo -e "\n==> Krona Taxonomy Database: <==\n$(ls -lah {config[db][krona_db]}*)\n" >> results/log_db.txt
+ echo -e "\n==> NCBI new_taxdump Database: <==\n$(ls -lah {config[db][new_taxdump_db]}*)\n" >> results/log_db.txt
+ echo -e "\n==> mgkit database: <==\n$(ls -lah {config[db][mgkit_db]}*)\n" >> results/log_db.txt
+
+ echo -e "\tGenerating config file log..."
+ rm -f results/log_config.txt
+ for file in config/*.yaml
+ do
+ echo -e "\n==> Contents of file \"${{file}}\": <==" >> results/log_config.txt
+ cat ${{file}} >> results/log_config.txt
+ echo -e "\n\n" >> results/log_config.txt
+ done
+
+ echo -e "\tCopying samplesheet.yaml to results/ folder." #? To easily mount it in singularity since the results folder is mounted anyway
+ cp samplesheet.yaml results/samplesheet.yaml
+ """)
+
+
+rule QC_raw:
+ input: lambda wildcards: SAMPLES[wildcards.sample][wildcards.read]
+ output:
+ html = f"{datadir + qc_pre}" + "{sample}_{read}_fastqc.html",
+ zip = f"{datadir + qc_pre}" + "{sample}_{read}_fastqc.zip"
+ conda:
+ f"{conda_envs}qc_and_clean.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/qc_and_clean:2.0.0"
+ log:
+ f"{logdir}" + "QC_raw_{sample}_{read}.log"
+ benchmark:
+ f"{logdir + bench}" + "QC_raw_{sample}_{read}.txt"
+ threads: config['threads']['Filter']
+ resources:
+ mem_mb = low_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ output_dir = f"{datadir + qc_pre}",
+ script = "/Jovian/scripts/fqc.sh" if config['use_singularity_or_conda'] == "use_singularity" else srcdir("scripts/fqc.sh"),
+ shell:
+ """
+bash {params.script} {input} {params.output_dir} {output.html} {output.zip} {log} {threads}
+ """
+
+
+rule QC_filter:
+ input: lambda wildcards: (SAMPLES[wildcards.sample][i] for i in ("R1", "R2"))
+ output:
+ r1 = f"{datadir + cln + qcfilt}" + "{sample}_pR1.fq",
+ r2 = f"{datadir + cln + qcfilt}" + "{sample}_pR2.fq",
+ r1_unpaired = f"{datadir + cln + qcfilt}" + "{sample}_uR1.fq",
+ r2_unpaired = f"{datadir + cln + qcfilt}" + "{sample}_uR2.fq"
+ conda:
+ f"{conda_envs}qc_and_clean.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/qc_and_clean:2.0.0"
+ log:
+ f"{logdir}" + "QC_filter_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "QC_filter_{sample}.txt"
+ threads: config['threads']['Filter']
+ resources:
+ mem_mb = medium_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ adapter_removal_config = "ILLUMINACLIP:/Jovian/files/nexteraPE_adapters.fa:2:30:10:8:true" if config['use_singularity_or_conda'] == "use_singularity" else "ILLUMINACLIP:" + srcdir("files/nexteraPE_adapters.fa") + ":2:30:10:8:true",
+ quality_trimming_config = f"SLIDINGWINDOW:{config['QC']['window_size']}:{config['QC']['min_phred_score']}",
+ minimum_length_config = f"MINLEN:{config['QC']['min_read_length']}"
+ shell:
+ """
+trimmomatic PE -threads {threads} {input[0]:q} {input[1]:q} {output.r1} {output.r1_unpaired} {output.r2} {output.r2_unpaired} \
+{params.adapter_removal_config} {params.quality_trimming_config} {params.minimum_length_config} > {log} 2>&1
+touch -r {output.r1} {output.r1_unpaired}
+touch -r {output.r2} {output.r2_unpaired}
+ """
+
+
+rule QC_clean:
+ input:
+ f"{datadir + cln + qcfilt}" + "{sample}_{read}.fq" #? don't use the `rules.QC_filter.output` syntax since you need the {sample} and {read} capturegroups to have a properly functioning DAG
+ output:
+ html = f"{datadir + qc_post}" + "{sample}_{read}_fastqc.html",
+ zip = f"{datadir + qc_post}" + "{sample}_{read}_fastqc.zip"
+ conda:
+ f"{conda_envs}qc_and_clean.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/qc_and_clean:2.0.0"
+ log:
+ f"{logdir}" + "QC_clean_{sample}_{read}.log"
+ benchmark:
+ f"{logdir + bench}" + "QC_clean_{sample}_{read}.txt"
+ threads: config['threads']['Filter']
+ resources:
+ mem_mb = low_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ output_dir = f"{datadir + qc_post}"
+ shell:
+ """
+if [ -s "{input}" ]; then
+ fastqc -t {threads} --quiet --outdir {params.output_dir} {input} > {log} 2>&1
+else
+ touch {output.html}
+ touch {output.zip}
+ echo "touched things because input was empty" > {log} 2>&1
+fi
+ """
+
+
+rule Remove_BG_p1:
+ input:
+ bg = config['db']['background'],
+ r1 = rules.QC_filter.output.r1,
+ r2 = rules.QC_filter.output.r2,
+ r1_unpaired = rules.QC_filter.output.r1_unpaired,
+ r2_unpaired = rules.QC_filter.output.r2_unpaired
+ output:
+ bam = f"{datadir + cln + aln}" + "{sample}_raw-alignment.bam",
+ bai = f"{datadir + cln + aln}" + "{sample}_raw-alignment.bam.bai"
+ conda:
+ f"{conda_envs}qc_and_clean.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/qc_and_clean:2.0.0"
+ log:
+ f"{logdir}" + "Remove_BG_p1_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "Remove_BG_p1_{sample}.txt"
+ threads: config['threads']['Alignments']
+ resources:
+ mem_mb = high_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ aln_type = '--local'
+ shell:
+ """
+bowtie2 --time --threads {threads} {params.aln_type} -x {input.bg} -1 {input.r1} -2 {input.r2} -U {input.r1_unpaired} -U {input.r2_unpaired} 2> {log} |\
+samtools view -@ {threads} -uS - 2>> {log} |\
+samtools sort -@ {threads} - -o {output.bam} >> {log} 2>&1
+samtools index -@ {threads} {output.bam} >> {log} 2>&1
+ """
+
+
+rule Remove_BG_p2:
+ input:
+ bam = rules.Remove_BG_p1.output.bam,
+ bai = rules.Remove_BG_p1.output.bai
+ output:
+ r1 = f"{datadir + cln + filt}" + "{sample}_pR1.fq",
+ r2 = f"{datadir + cln + filt}" + "{sample}_pR2.fq",
+ conda:
+ f"{conda_envs}qc_and_clean.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/qc_and_clean:2.0.0"
+ log:
+ f"{logdir}" + "Remove_BG_p2_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "Remove_BG_p2_{sample}.txt"
+ threads: config['threads']['Filter']
+ resources:
+ mem_mb = medium_memory_job,
+ # runtime_min = low_runtime_min
+ shell:
+ """
+samtools view -@ {threads} -b -f 1 -f 8 {input.bam} 2> {log} |\
+samtools sort -@ {threads} -n - 2>> {log} |\
+bedtools bamtofastq -i - -fq {output.r1} -fq2 {output.r2} >> {log} 2>&1
+ """
+
+
+rule Remove_BG_p3:
+ input:
+ bam = rules.Remove_BG_p1.output.bam,
+ bai = rules.Remove_BG_p1.output.bai
+ output:
+ tbam = temp(f"{datadir + cln + aln}" + "{sample}_temp_unpaired.bam"),
+ un = f"{datadir + cln + filt}" + "{sample}_unpaired.fq",
+ conda:
+ f"{conda_envs}qc_and_clean.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/qc_and_clean:2.0.0"
+ log:
+ f"{logdir}" + "Remove_BG_p3_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "Remove_BG_p3_{sample}.txt"
+ threads: config['threads']['Filter']
+ resources:
+ mem_mb = medium_memory_job,
+ # runtime_min = low_runtime_min
+ shell:
+ """
+samtools view -@ {threads} -b -F 1 -f 4 {input.bam} 2> {log} |\
+samtools sort -@ {threads} -n -o {output.tbam} 2>> {log}
+bedtools bamtofastq -i {output.tbam} -fq {output.un} >> {log} 2>&1
+ """
+
+
+rule Assemble:
+ input:
+ r1 = rules.Remove_BG_p2.output.r1,
+ r2 = rules.Remove_BG_p2.output.r2,
+ un = rules.Remove_BG_p3.output.un
+ output:
+ scaffolds = f"{datadir + asm + raw}" + "{sample}/scaffolds.fasta",
+ scaff_filt = f"{datadir + asm + filt}" + "{sample}" + f"_scaffolds_filtered-ge{config['Assembly']['min_contig_len']}.fasta",
+ conda:
+ f"{conda_envs}assembly.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/assembly:2.0.0"
+ log:
+ f"{logdir}" + "Assemble_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "Assemble_{sample}.txt"
+ threads: config['threads']['Assemble']
+ resources:
+ mem_mb = very_high_memory_job,
+ # runtime_min = high_runtime_min
+ params:
+ min_contig_len = config['Assembly']['min_contig_len'],
+ kmersizes = config['Assembly']['kmersizes'],
+ outdir = f"{datadir + asm + raw}" + "{sample}/"
+ shell:
+ """
+spades.py --only-assembler --meta -1 {input.r1} -2 {input.r2} -s {input.un} -t {threads} -m $(({resources.mem_mb} / 1000)) -k {params.kmersizes} -o {params.outdir} > {log} 2>&1
+seqtk seq {output.scaffolds} 2>> {log} |\
+gawk -F "_" '/^>/ {{if ($4 >= {params.min_contig_len}) {{print $0; getline; print $0}};}}' 2>> {log} 1> {output.scaff_filt}
+ """
+
+
+rule align_to_scaffolds_RmDup_FragLength:
+ input:
+ fasta = rules.Assemble.output.scaff_filt,
+ R1 = rules.Remove_BG_p2.output.r1,
+ R2 = rules.Remove_BG_p2.output.r2
+ output:
+ bam = f"{datadir + asm + filt}" + "{sample}_sorted.bam",
+ bam_bai = f"{datadir + asm + filt}" + "{sample}_sorted.bam.bai",
+ dup_metrics = f"{datadir + asm + filt}" + "{sample}_sorted_MarkDup-metrics.txt",
+ frag_metrics = f"{datadir + asm + filt}" + "{sample}_insert_size_metrics.txt",
+ frag_pdf = f"{datadir + asm + filt}" + "{sample}_insert_size_histogram.pdf"
+ conda:
+ f"{conda_envs}sequence_analysis.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/sequence_analysis:2.0.0"
+ log:
+ f"{logdir}" + "align_to_scaffolds_RmDup_FragLength_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "align_to_scaffolds_RmDup_FragLength_{sample}.txt"
+ threads: config['threads']['align_to_scaffolds_RmDup_FragLength']
+ resources:
+ mem_mb = high_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ remove_dups = "", #? To turn this on, e.g. for metagenomics data replace it with a "-r" [NB, without quotes]. To turn this off, e.g. for amplicon experiments such as ARTIC, replace this with "" #! The `-r` will HARD remove the duplicates instead of only marking them, N.B. this is REQUIRED for the downstream bbtools' pileup.sh to work --> it ignores the DUP marker and counts the reads in its coverage metrics. Thus giving a false sense of confidence.
+ markdup_mode = "t",
+ max_read_length = "300" #? This is the default value and also the max read length of Illumina in-house sequencing.
+ shell:
+ """
+bwa index {input.fasta} > {log} 2>&1
+bwa mem -t {threads} {input.fasta} {input.R1} {input.R2} 2>> {log} |\
+samtools view -@ {threads} -uS - 2>> {log} |\
+samtools collate -@ {threads} -O - 2>> {log} |\
+samtools fixmate -@ {threads} -m - - 2>> {log} |\
+samtools sort -@ {threads} - -o - 2>> {log} |\
+samtools markdup -@ {threads} -l {params.max_read_length} -m {params.markdup_mode} {params.remove_dups} -f {output.dup_metrics} - {output.bam} >> {log} 2>&1
+samtools index -@ {threads} {output.bam} >> {log} 2>&1
+
+picard -Dpicard.useLegacyParser=false CollectInsertSizeMetrics -I {output.bam} -O {output.frag_metrics} -H {output.frag_pdf} >> {log} 2>&1
+ """
+
+
+rule SNP_calling:
+ input:
+ fasta = rules.Assemble.output.scaff_filt,
+ bam = rules.align_to_scaffolds_RmDup_FragLength.output.bam,
+ bam_bai = rules.align_to_scaffolds_RmDup_FragLength.output.bam_bai
+ output:
+ fasta_fai = f"{datadir + asm + filt}" + "{sample}" + f"_scaffolds_filtered-ge{config['Assembly']['min_contig_len']}.fasta.fai",
+ unfilt_vcf = f"{datadir + asm + filt}" + "{sample}" + f"_scaffolds_raw.vcf",
+ filt_vcf = f"{datadir + asm + filt}" + "{sample}" + f"_scaffolds_AF5pct-filt.vcf",
+ zipped_filt_vcf = f"{datadir + asm + filt}" + "{sample}" + f"_scaffolds_AF5pct-filt.vcf.gz",
+ zipped_filt_vcf_index = f"{datadir + asm + filt}" + "{sample}" + f"_scaffolds_AF5pct-filt.vcf.gz.tbi"
+ conda:
+ f"{conda_envs}sequence_analysis.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/sequence_analysis:2.0.0"
+ log:
+ f"{logdir}" + "SNP_calling_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "SNP_calling_{sample}.txt"
+ threads: config['threads']['SNP_calling']
+ resources:
+ mem_mb = high_memory_job,
+ # runtime_min = high_runtime_min # ? rarely it takes >1h to run this rule
+ params:
+ max_cov = 20000, #? Maximum coverage used for SNP calling.
+        min_AF = 0.05 #? This is the minimum allele frequency (=AF) for which a SNP is reported, default is 5%.
+ shell:
+ """
+samtools faidx -o {output.fasta_fai} {input.fasta} > {log} 2>&1
+lofreq call-parallel -d {params.max_cov} --no-default-filter --pp-threads {threads} -f {input.fasta} -o {output.unfilt_vcf} {input.bam} >> {log} 2>&1
+lofreq filter -a {params.min_AF} -i {output.unfilt_vcf} -o {output.filt_vcf} >> {log} 2>&1
+bgzip -c {output.filt_vcf} 2>> {log} 1> {output.zipped_filt_vcf}
+tabix -p vcf {output.zipped_filt_vcf} >> {log} 2>&1
+ """
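+#? Note: `lofreq filter -a params.min_AF` keeps only variants whose allele frequency (the AF value LoFreq
+#? writes to the VCF INFO field) is at or above min_AF; bgzip and tabix then produce the indexed VCF
+#? that the IGVjs report rules use downstream.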
+
+
+rule ORF_analysis:
+ input:
+ rules.Assemble.output.scaff_filt
+ output:
+ ORF_AA_fasta = f"{datadir + asm + filt}" + "{sample}_ORF_AA.fa",
+ ORF_NT_fasta = f"{datadir + asm + filt}" + "{sample}_ORF_NT.fa",
+ ORF_annotation_gff = f"{datadir + asm + filt}" + "{sample}_annotation.gff",
+ zipped_gff3 = f"{datadir + asm + filt}" + "{sample}_annotation.gff.gz",
+ index_zipped_gff3 = f"{datadir + asm + filt}" + "{sample}_annotation.gff.gz.tbi",
+ contig_ORF_count_list = f"{datadir + asm + filt}" + "{sample}_contig_ORF_count_list.txt"
+ conda:
+ f"{conda_envs}sequence_analysis.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/sequence_analysis:2.0.0"
+ log:
+ f"{logdir}" + "ORF_analysis_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "ORF_analysis_{sample}.txt"
+ threads: config['threads']['ORF_analysis']
+ resources:
+ mem_mb = medium_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ procedure = "meta",
+ output_format = "gff"
+ shell:
+ """
+prodigal -q -i {input} -a {output.ORF_AA_fasta} -d {output.ORF_NT_fasta} -o {output.ORF_annotation_gff} -p {params.procedure} -f {params.output_format} > {log} 2>&1
+bgzip -c {output.ORF_annotation_gff} 2>> {log} 1> {output.zipped_gff3}
+tabix -p gff {output.zipped_gff3} >> {log} 2>&1
+egrep "^>" {output.ORF_NT_fasta} | sed 's/_/ /6' | tr -d ">" | cut -f 1 -d " " | uniq -c > {output.contig_ORF_count_list}
+ """
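+#? Illustration of the last one-liner (assuming SPAdes-style contig names with five underscores):
+#? a Prodigal NT header looks like ">NODE_1_length_500_cov_3.2_1 # 2 # 181 # 1 # ID=...", so
+#? `sed 's/_/ /6'` splits off the ORF index at the 6th underscore, `cut -f 1 -d " "` keeps the bare
+#? contig name, and `uniq -c` yields the number of predicted ORFs per contig.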
+
+
+rule Contig_metrics:
+ input:
+ bam = rules.align_to_scaffolds_RmDup_FragLength.output.bam,
+ fasta = rules.Assemble.output.scaff_filt,
+ ORF_NT_fasta = rules.ORF_analysis.output.ORF_NT_fasta
+ output:
+ summary = f"{datadir + asm + filt}" + "{sample}_MinLenFiltSummary.stats",
+ perScaffold = f"{datadir + asm + filt}" + "{sample}_perMinLenFiltScaffold.stats",
+ perORFcoverage = f"{datadir + asm + filt}" + "{sample}_perORFcoverage.stats"
+ conda:
+ f"{conda_envs}sequence_analysis.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/sequence_analysis:2.0.0"
+ log:
+ f"{logdir}" + "Contig_metrics_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "Contig_metrics_{sample}.txt"
+ threads: config['threads']['Contig_metrics']
+ resources:
+ mem_mb = medium_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ shell: #! bbtools' pileup.sh counts every read, even those marked as duplicate upstream. Hence, for accurate counts, make sure the `remove_dups` param in rule `align_to_scaffolds_RmDup_FragLength` is set to `-r`.
+ """
+pileup.sh in={input.bam} ref={input.fasta} fastaorf={input.ORF_NT_fasta} outorf={output.perORFcoverage} out={output.perScaffold} secondary=f samstreamer=t 2> {output.summary} 1> {log}
+ """
+
+
+rule GC_content:
+ input:
+ fasta = rules.Assemble.output.scaff_filt,
+ fasta_fai = rules.SNP_calling.output.fasta_fai
+ output:
+ fasta_sizes = f"{datadir + asm + filt}" + "{sample}" + f"_scaffolds_filtered-ge{config['Assembly']['min_contig_len']}.fasta.sizes",
+ bed_windows = f"{datadir + asm + filt}" + "{sample}.windows",
+ GC_bed = f"{datadir + asm + filt}" + "{sample}_GC.bedgraph"
+ conda:
+ f"{conda_envs}sequence_analysis.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/sequence_analysis:2.0.0"
+ log:
+ f"{logdir}" + "GC_content_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "GC_content_{sample}.txt"
+ threads: config['threads']['GC_content']
+ resources:
+ mem_mb = medium_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ window_size = 50
+ shell:
+ """
+cut -f 1,2 {input.fasta_fai} 2> {log} 1> {output.fasta_sizes}
+bedtools makewindows -g {output.fasta_sizes} -w {params.window_size} 2>> {log} 1> {output.bed_windows}
+bedtools nuc -fi {input.fasta} -bed {output.bed_windows} 2>> {log} | cut -f 1-3,5 2>>{log} 1> {output.GC_bed}
+ """
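+#? Note: `bedtools nuc` appends the AT fraction (column 4) and GC fraction (column 5) to the three BED
+#? columns of each window, so `cut -f 1-3,5` yields a 4-column bedgraph (contig, start, end, GC fraction)
+#? per `window_size` bp window for the IGVjs GC-content track.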
+
+
+rule HTML_IGVjs_variable_parts:
+ input:
+ fasta = rules.Assemble.output.scaff_filt,
+ ref_GC_bedgraph = rules.GC_content.output.GC_bed,
+ ref_zipped_ORF_gff = rules.ORF_analysis.output.zipped_gff3,
+ basepath_zipped_SNP_vcf = rules.SNP_calling.output.zipped_filt_vcf,
+ basepath_sorted_bam = rules.align_to_scaffolds_RmDup_FragLength.output.bam
+ output:
+ tab_output = f"{datadir + html}" + "2_tab_{sample}",
+ div_output = f"{datadir + html}" + "4_html_divs_{sample}",
+ js_flex_output = f"{datadir + html}" + "6_js_flex_{sample}"
+ conda:
+ f"{conda_envs}data_wrangling.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/data_wrangling:2.0.0"
+ log:
+ f"{logdir}" + "HTML_IGVjs_variable_parts_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "HTML_IGVjs_variable_parts_{sample}.txt"
+ threads: config['threads']['data_wrangling']
+ resources:
+ mem_mb = low_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ script_html_path = "/Jovian/scripts/html/" if config['use_singularity_or_conda'] == "use_singularity" else srcdir("scripts/html/"),
+ nginx_ip = "http://127.0.0.1",
+ nginx_port = "8079"
+ shell:
+ """
+bash {params.script_html_path}igvjs_write_tabs.sh {wildcards.sample} {output.tab_output} > {log} 2>&1
+bash {params.script_html_path}igvjs_write_divs.sh {wildcards.sample} {output.div_output} >> {log} 2>&1
+bash {params.script_html_path}igvjs_write_flex_js_middle.sh {wildcards.sample} {output.js_flex_output} {input.fasta} {input.ref_GC_bedgraph} {input.ref_zipped_ORF_gff} {input.basepath_zipped_SNP_vcf} {input.basepath_sorted_bam} {params.nginx_ip} {params.nginx_port} >> {log} 2>&1
+ """
+
+
+rule HTML_IGVjs_final:
+ input:
+ expand("{p}{chunk_name}_{sample}", p = f"{datadir + html}", chunk_name = ["2_tab", "4_html_divs", "6_js_flex"], sample = SAMPLES)
+ output:
+ f"{res}" + "igv.html"
+ conda:
+ f"{conda_envs}data_wrangling.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/data_wrangling:2.0.0"
+ log:
+ f"{logdir}" + "HTML_IGVjs_final.log"
+ benchmark:
+ f"{logdir + bench}" + "HTML_IGVjs_final.txt"
+ threads: config['threads']['data_wrangling']
+ resources:
+ mem_mb = low_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ tab_basename = f"{datadir + html}" + "2_tab_",
+ div_basename = f"{datadir + html}" + "4_html_divs_",
+ js_flex_basename = f"{datadir + html}" + "6_js_flex_",
+ files_path = "/Jovian/files/html/" if config['use_singularity_or_conda'] == "use_singularity" else srcdir("files/html/"),
+ shell:
+ """
+cat {params.files_path}1_header.html > {output} 2> {log}
+cat {params.tab_basename}* >> {output} 2>> {log}
+cat {params.files_path}3_tab_explanation.html >> {output} 2>> {log}
+cat {params.div_basename}* >> {output} 2>> {log}
+cat {params.files_path}5_js_begin.html >> {output} 2>> {log}
+cat {params.js_flex_basename}* >> {output} 2>> {log}
+cat {params.files_path}7_js_end.html >> {output} 2>> {log}
+ """
+
+
+rule Scaffold_classification:
+ input:
+ rules.Assemble.output.scaff_filt
+ output:
+ f"{datadir + scf_classified}" + "{sample}.blastn"
+ conda:
+ f"{conda_envs}scaffold_classification.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/scaffold_classification:2.0.0"
+ log:
+ f"{logdir}" + "Scaffold_classification_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "Scaffold_classification_{sample}.txt"
+ threads: config['threads']['Scaffold_classification']
+ resources:
+ mem_mb = very_high_memory_job,
+ # runtime_min = high_runtime_min
+ params:
+ nt_db_path = config['db']['blast_nt'],
+ taxdb_db_path = config['db']['blast_taxdb'],
+ outfmt = "6 std qseqid sseqid staxids sscinames stitle",
+ evalue = "0.05", #? E-value threshold for saving hits
+        qcov_hsp_perc = "50", #? Minimum percentage of the query (i.e. scaffold) length that must be covered by the HSP; hits covering less than this are not reported.
+ max_target_seqs = "250",
+ max_hsps = "1"
+ shell:
+ """
+export BLASTDB="{params.taxdb_db_path}"
+blastn -task megablast -outfmt "{params.outfmt}" -query {input} -evalue {params.evalue} -qcov_hsp_perc {params.qcov_hsp_perc} -max_target_seqs {params.max_target_seqs} -max_hsps {params.max_hsps} -db {params.nt_db_path} -num_threads {threads} -out {output} > {log} 2>&1
+ """
+
+
+#? Reformat blast tsv output to gff
+#? Remove all construct/synthetic entries (because filtering on their specific taxids, as performed in rule `taxfilter_gff`, is not adequate for these)
+#? Remove any entry with a bitscore lower than the user-specified bitscore_threshold (i.e. filter out short alignments, since every matching base adds +2 to the bitscore)
+rule make_gff:
+ input:
+ rules.Scaffold_classification.output
+ output:
+ f"{datadir + scf_classified}" + "{sample}_lca_raw.gff" #? This is a temp file, removed in the onSuccess//onError clause.
+ conda:
+ f"{conda_envs}mgkit_lca.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/mgkit_lca:2.0.0"
+ log:
+ f"{logdir}" + "make_gff_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "make_gff_{sample}.txt"
+ threads: config['threads']['mgkit_lca']
+ resources:
+ mem_mb = medium_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ bitscore_threshold = "100",
+ filt_keywords = "/construct\|synthetic/Id"
+ shell: #? sed does a case-insensitive search and in-place deletion of any record containing the filt_keywords
+ """
+sed -i "{params.filt_keywords}" {input}
+blast2gff blastdb -v -b {params.bitscore_threshold} -n {input} {output} > {log} 2>&1
+ """
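+#? Note on the sed expression: with GNU sed the address "/construct\|synthetic/I" matches records
+#? containing either keyword case-insensitively (trailing "I" flag) and "d" deletes them; because of
+#? "-i" this edits the upstream blastn output in place before blast2gff converts the remainder to gff.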
+
+
+#? Reformat gff, augment accession id of the blasthit with the taxid
+rule addtaxa_gff:
+ input:
+ rules.make_gff.output
+ output:
+ f"{datadir + scf_classified}" + "{sample}_lca_tax.gff"
+ conda:
+ f"{conda_envs}mgkit_lca.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/mgkit_lca:2.0.0"
+ log:
+ f"{logdir}" + "addtaxa_gff_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "addtaxa_gff_{sample}.txt"
+ threads: config['threads']['mgkit_lca']
+ resources:
+ mem_mb = high_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ mgkit_tax_db = config['db']['mgkit_db']
+ shell:
+ """
+add-gff-info addtaxa -v -t {params.mgkit_tax_db}nucl_gb.accession2taxid_sliced.tsv -e {input} {output} > {log} 2>&1
+ """
+
+
+#? Filter taxid 81077 (https://www.ncbi.nlm.nih.gov/taxonomy/?term=81077 --> artificial sequences) and 12908 (https://www.ncbi.nlm.nih.gov/taxonomy/?term=12908 --> unclassified sequences)
+rule taxfilter_gff:
+ input:
+ rules.addtaxa_gff.output
+ output:
+ f"{datadir + scf_classified}" + "{sample}_lca_taxfilt.gff"
+ conda:
+ f"{conda_envs}mgkit_lca.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/mgkit_lca:2.0.0"
+ log:
+ f"{logdir}" + "taxfilter_gff_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "taxfilter_gff_{sample}.txt"
+ threads: config['threads']['mgkit_lca']
+ resources:
+ mem_mb = medium_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ mgkit_tax_db = config['db']['mgkit_db']
+ shell:
+ """
+taxon-utils filter -v -e 81077 -e 12908 -t {params.mgkit_tax_db}taxonomy.pickle {input} {output} > {log} 2>&1
+ """
+
+
+#? Filter the gff on the user-specified bitscore-quantile settings.
+#? Filters on bitscore: it sorts the assignments from high to low and keeps only the top 3% of records for the LCA (the .97 quantile threshold with 'ge' = greater than or equal to; NB it's an order-based analysis, not a value-based one).
+rule qfilter_gff:
+ input:
+ rules.taxfilter_gff.output
+ output:
+ f"{datadir + scf_classified}" + "{sample}_lca_filt.gff"
+ conda:
+ f"{conda_envs}mgkit_lca.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/mgkit_lca:2.0.0"
+ log:
+ f"{logdir}" + "qfilter_gff_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "qfilter_gff_{sample}.txt"
+ threads: config['threads']['mgkit_lca']
+ resources:
+ mem_mb = medium_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ quantile_threshold = ".97"
+ shell:
+ """
+filter-gff sequence -v -t -a bitscore -f quantile -l {params.quantile_threshold} -c ge {input} {output} > {log} 2>&1
+ """
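+#? Worked example (approximate): if a scaffold has 100 BLAST hits, only the ~3 highest-bitscore hits
+#? (those at or above the 0.97 quantile, ties included) are passed on to the LCA step; a scaffold with
+#? very few hits always keeps at least its single best hit.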
+
+
+#? Perform the LCA analysis.
+##? In `taxon-utils lca` the `-n {output.no_lca}` flag is important because scaffolds without an LCA result are reported in the visualisation report for manual inspection.
+##? In `taxon-utils lca` the `-b {params.bitscore_threshold}` filter is redundant because this filtering is already done in rule `make_gff`; still, it is left in for clarity.
+##? The `sed` command adds a header line to the output file.
+##? Touch {output.no_lca} if it hasn't been generated yet, otherwise you get an error downstream.
+##? NB mgkit log10-transforms e-values, i.e. these are not the default e-values as reported by BLAST.
+##? `scripts/average_logEvalue_no_lca.py` adds `taxid=1` and `evalue=1` for all entries without an LCA result (taxid=1 is "Root"), averages the e-values of the LCA constituents, and sets average log10-transformed e-values of 0 to -450 due to underflow.
+##? `scripts/krona_magnitudes.py` adds magnitude information for the Krona plot (same as the default Krona method).
+rule lca_mgkit:
+ input:
+ filtgff = rules.qfilter_gff.output,
+ stats = rules.Contig_metrics.output.perScaffold
+ output:
+ no_lca = f"{datadir + scf_classified}" + "{sample}_nolca_filt.gff",
+ taxtab = f"{datadir + scf_classified}" + "{sample}.taxtab",
+ taxMagtab = f"{datadir + scf_classified}" + "{sample}.taxMagtab"
+ conda:
+ f"{conda_envs}mgkit_lca.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/mgkit_lca:2.0.0"
+ log:
+ f"{logdir}" + "lca_mgkit_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "lca_mgkit_{sample}.txt"
+ threads: config['threads']['mgkit_lca']
+ resources:
+ mem_mb = medium_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ mgkit_tax_db = config['db']['mgkit_db'],
+ bitscore_threshold = "100",
+ script_path = "/Jovian/scripts/" if config['use_singularity_or_conda'] == "use_singularity" else srcdir("scripts/"),
+ shell:
+ """
+taxon-utils lca -v -b {params.bitscore_threshold} -s -p -n {output.no_lca} -t {params.mgkit_tax_db}taxonomy.pickle {input.filtgff} {output.taxtab} > {log} 2>&1;
+sed -i '1i #queryID\ttaxID' {output.taxtab} >> {log} 2>&1;
+if [[ ! -e {output.no_lca} ]]; then
+ touch {output.no_lca}
+fi
+python {params.script_path}average_logEvalue_no_lca.py {output.taxtab} {output.no_lca} {input.filtgff} {output.taxtab} >> {log} 2>&1;
+python {params.script_path}krona_magnitudes.py {output.taxtab} {input.stats} {output.taxMagtab} >> {log} 2>&1
+ """
+
+
+rule Krona:
+ input:
+ sorted(expand(rules.lca_mgkit.output.taxMagtab, sample = set(SAMPLES)))
+ output:
+ f"{res}" + "krona.html"
+ conda:
+ f"{conda_envs}krona.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/krona:2.0.0"
+ log:
+ f"{logdir}" + "krona.log"
+ benchmark:
+ f"{logdir + bench}" + "krona.txt"
+ threads: config['threads']['krona']
+ resources:
+ mem_mb = low_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ krona_db_path = config['db']['krona_db']
+ shell:
+ """
+ktImportTaxonomy {input} -tax {params.krona_db_path} -i -k -m 4 -o {output} > {log} 2>&1
+ """
+
+
+rule count_mapped_reads:
+ input:
+ rules.align_to_scaffolds_RmDup_FragLength.output.bam
+ output:
+ f"{res + cnt}" + "Mapped_read_counts-{sample}.tsv"
+ conda:
+ f"{conda_envs}sequence_analysis.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/sequence_analysis:2.0.0"
+ log:
+ f"{logdir}" + "count_mapped_reads-{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "count_mapped_reads-{sample}.txt"
+ threads: config['threads']['data_wrangling']
+ resources:
+ mem_mb = low_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ script = "/Jovian/scripts/count_mapped_reads.sh" if config['use_singularity_or_conda'] == "use_singularity" else srcdir("scripts/count_mapped_reads.sh")
+ shell:
+ """
+bash {params.script} {input} > {output} 2> {log}
+ """
+
+
+rule concatenate_read_counts:
+ input:
+ expand(rules.count_mapped_reads.output, sample = SAMPLES)
+ output:
+ f"{res + cnt}" + "Mapped_read_counts.tsv"
+ conda:
+ f"{conda_envs}data_wrangling.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/data_wrangling:2.0.0"
+ log:
+ f"{logdir}" + "concatenate_read_counts.log"
+ benchmark:
+ f"{logdir + bench}" + "concatenate_read_counts.txt"
+ threads: config['threads']['data_wrangling']
+ resources:
+ mem_mb = low_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ script = "/Jovian/scripts/concatenate_mapped_read_counts.py" if config['use_singularity_or_conda'] == "use_singularity" else srcdir("scripts/concatenate_mapped_read_counts.py")
+ shell:
+ """
+python {params.script} -i {input} -o {output} > {log} 2>&1
+ """
+
+
+rule merge_all_metrics_into_single_tsv:
+ input:
+ bbtoolsFile = rules.Contig_metrics.output.perScaffold,
+ kronaFile = rules.lca_mgkit.output.taxtab,
+ minLenFiltScaffolds = rules.Assemble.output.scaff_filt,
+ scaffoldORFcounts = rules.ORF_analysis.output.contig_ORF_count_list,
+ virusHostDB = config['db']['virus_host_db'],
+ new_taxdump_rankedlineage = f"{config['db']['new_taxdump_db']}" + "rankedlineage.dmp.delim",
+ new_taxdump_host = f"{config['db']['new_taxdump_db']}" + "host.dmp.delim"
+ output:
+ taxClassifiedTable = f"{datadir + tbl}" + "{sample}_taxClassified.tsv",
+ taxUnclassifiedTable = f"{datadir + tbl}" + "{sample}_taxUnclassified.tsv",
+ virusHostTable = f"{datadir + tbl}" + "{sample}_virusHost.tsv",
+ conda:
+ f"{conda_envs}data_wrangling.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/data_wrangling:2.0.0"
+ log:
+ f"{logdir}" + "merge_all_metrics_into_single_tsv_{sample}.log"
+ benchmark:
+ f"{logdir + bench}" + "merge_all_metrics_into_single_tsv_{sample}.txt"
+ threads: config['threads']['data_wrangling']
+ resources:
+ mem_mb = medium_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ script = "/Jovian/scripts/merge_data.py" if config['use_singularity_or_conda'] == "use_singularity" else srcdir("scripts/merge_data.py")
+ shell:
+ """
+python {params.script} {wildcards.sample} {input.bbtoolsFile} {input.kronaFile} {input.minLenFiltScaffolds} {input.scaffoldORFcounts} {input.virusHostDB} {input.new_taxdump_rankedlineage} {input.new_taxdump_host} {output.taxClassifiedTable} {output.taxUnclassifiedTable} {output.virusHostTable} > {log} 2>&1
+ """
+
+
+rule concat_files:
+ input:
+ expand(rules.merge_all_metrics_into_single_tsv.output, sample = SAMPLES, extension = ["taxClassified.tsv", "taxUnclassified.tsv", "virusHost.tsv"]),
+ expand(rules.lca_mgkit.output.no_lca, sample = SAMPLES)
+ output:
+ taxClassified = f"{res}" + "all_taxClassified.tsv",
+ taxUnclassified = f"{res}" + "all_taxUnclassified.tsv",
+ virusHost = f"{res}" + "all_virusHost.tsv",
+ noLCA = f"{res}" + "all_noLCA.tsv"
+ conda:
+ f"{conda_envs}data_wrangling.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/data_wrangling:2.0.0"
+ log:
+ f"{logdir}" + "concat_files.log"
+ benchmark:
+ f"{logdir + bench}" + "concat_files.txt"
+ threads: config['threads']['data_wrangling']
+ resources:
+ mem_mb = low_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ search_folder = f"{datadir + tbl}",
+ classified_glob = "*_taxClassified.tsv",
+ unclassified_glob = "*_taxUnclassified.tsv",
+ virusHost_glob = "*_virusHost.tsv",
+ search_folder_noLCA = f"{datadir + scf_classified}",
+ noLCA_glob = "*_nolca_filt.gff"
+ shell:
+ """
+find {params.search_folder} -type f -name "{params.classified_glob}" -exec gawk 'NR==1 || FNR!=1' {{}} + | (read header; echo "$header"; sort -t$'\t' -k 1,1 -k 15,15nr) 2> {log} 1> {output.taxClassified}
+find {params.search_folder} -type f -name "{params.unclassified_glob}" -exec gawk 'NR==1 || FNR!=1' {{}} + | (read header; echo "$header"; sort -t$'\t' -k 1,1 -k 4,4nr) 2>> {log} 1> {output.taxUnclassified}
+find {params.search_folder} -type f -name "{params.virusHost_glob}" -exec gawk 'NR==1 || FNR!=1' {{}} + | (read header; echo "$header"; sort -t$'\t' -k 1,1 -k 2,2) 2>> {log} 1> {output.virusHost}
+
+find {params.search_folder_noLCA} -type f -name "{params.noLCA_glob}" -exec gawk 'BEGIN {{OFS="\t"; print "Sample_name", "Scaffold_name", "Constituent_taxIDs", "Constituent_tax_names"}} {{split(FILENAME, list_A, "_nolca_filt.gff"); split(list_A[1], list_B, "{params.search_folder_noLCA}"); print list_B[2], $0}}' {{}} + 2>> {log} 1> {output.noLCA}
+        """ #? Last one-liner: find all files matching {params.noLCA_glob}, print a header, extract the sample name by stripping the extension ("_nolca_filt.gff") and the root path (stored in {params.search_folder_noLCA}), then print the sample name in the first column followed by the original record; this concatenates the records of all samples in the analysis.
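+#? Note on the first three one-liners above: `gawk 'NR==1 || FNR!=1'` keeps the header of the first file
+#? only and drops the repeated headers of the other per-sample tables, while the subshell
+#? `(read header; echo "$header"; sort ...)` sorts the concatenated body but leaves that header on top.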
+
+
+rule concat_filtered_SNPs:
+ input:
+ expand(rules.SNP_calling.output.filt_vcf, sample = SAMPLES)
+ output:
+ final = f"{res}" + "all_filtered_SNPs.tsv",
+ temp = temp(f"{res}" + "all_filtered_SNPs.temp")
+ conda:
+ f"{conda_envs}data_wrangling.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/data_wrangling:2.0.0"
+ log:
+ f"{logdir}" + "concat_filtered_SNPs.log"
+ benchmark:
+ f"{logdir + bench}" + "concat_filtered_SNPs.txt"
+ threads: config['threads']['data_wrangling']
+ resources:
+ mem_mb = low_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ vcf_folder_glob = f"{datadir + asm + filt}/\*-filt.vcf",
+ script = "/Jovian/scripts/concat_filtered_vcf.py" if config['use_singularity_or_conda'] == "use_singularity" else srcdir("scripts/concat_filtered_vcf.py")
+ shell:
+ """
+python {params.script} {params.vcf_folder_glob} {output.temp} > {log} 2>&1
+cat {output.temp} | (read header; echo "$header"; sort -t$'\t' -k1,1 -k2,2 -k3,3n) 1> {output.final} 2>> {log}
+ """
+
+
+rule MultiQC:
+ input:
+ expand(rules.QC_raw.output.zip, sample = SAMPLES, read = ['R1', 'R2']),
+ expand(rules.QC_clean.output.zip, sample = SAMPLES, read = ['pR1', 'pR2', 'uR1', 'uR2']),
+ expand(rules.align_to_scaffolds_RmDup_FragLength.output.frag_metrics, sample = SAMPLES),
+ expand(rules.Remove_BG_p1.log, sample = SAMPLES),
+ expand(rules.QC_filter.log, sample = SAMPLES)
+ output:
+ f"{res}multiqc.html",
+ expand("{p}multiqc_{program}.txt", p = f"{res+mqc_data}", program = ['fastqc', 'trimmomatic', 'bowtie2']),
+ conda:
+ f"{conda_envs}qc_and_clean.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/qc_and_clean:2.0.0"
+ log:
+ f"{logdir}" + "MultiQC.log"
+ benchmark:
+ f"{logdir + bench}" + "MultiQC.txt"
+ threads: config['threads']['MultiQC']
+ resources:
+ mem_mb = medium_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ conf = "/Jovian/files/multiqc_config.yaml" if config['use_singularity_or_conda'] == "use_singularity" else srcdir("files/multiqc_config.yaml"),
+ outdir = f"{res}"
+ shell:
+ """
+multiqc --force --config {params.conf} -o {params.outdir} -n multiqc.html {input} > {log} 2>&1
+ """
+
+
+rule quantify_output:
+ input:
+ classified = rules.concat_files.output.taxClassified,
+ unclassified = rules.concat_files.output.taxUnclassified,
+ mapped_reads = rules.concatenate_read_counts.output,
+ fastqc = f"{res + mqc_data}multiqc_fastqc.txt",
+ trimmomatic = f"{res + mqc_data}multiqc_trimmomatic.txt",
+ hugo = expand("{p}{sample}_{suffix}.fq", p = f"{datadir + cln + filt}", sample = set(SAMPLES), suffix = ["pR1", "pR2", "unpaired"])
+ output:
+ read_count = f"{res}profile_read_counts.csv",
+ percentages = f"{res}profile_read_percentages.csv",
+ graph = f"{res}Sample_composition_graph.html"
+ conda:
+ f"{conda_envs}heatmaps.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/heatmaps:2.0.0"
+ log:
+ f"{logdir}" + "quantify_output.log"
+ benchmark:
+ f"{logdir + bench}" + "quantify_output.txt"
+ threads: config['threads']['data_wrangling']
+ resources:
+ mem_mb = medium_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ script = "/Jovian/scripts/quantify_profiles.py" if config['use_singularity_or_conda'] == "use_singularity" else srcdir("scripts/quantify_profiles.py")
+ shell:
+ """
+python {params.script} -f {input.fastqc} -t {input.trimmomatic} -hg {input.hugo} -c {input.classified} -u {input.unclassified} -m {input.mapped_reads} -co {output.read_count} -p {output.percentages} -g {output.graph} -cpu {threads} -l {log}
+ """
+
+
+rule draw_heatmaps:
+ input:
+ classified = rules.concat_files.output.taxClassified,
+ numbers = f"{res + mqc_data}multiqc_trimmomatic.txt"
+ output:
+ super_quantities = f"{res}Superkingdoms_quantities_per_sample.csv",
+ super = f"{res + hmap}Superkingdoms_heatmap.html",
+ virus = f"{res + hmap}Virus_heatmap.html",
+ phage = f"{res + hmap}Phage_heatmap.html",
+ bact = f"{res + hmap}Bacteria_heatmap.html",
+ stats = f"{res}Taxonomic_rank_statistics.tsv",
+ vir_stats = f"{res}Virus_rank_statistics.tsv",
+ phage_stats = f"{res}Phage_rank_statistics.tsv",
+ bact_stats = f"{res}Bacteria_rank_statistics.tsv"
+ conda:
+ f"{conda_envs}heatmaps.yaml"
+ container:
+ "library://ds_bioinformatics/jovian/heatmaps:2.0.0"
+ log:
+ f"{logdir}" + "draw_heatmaps.log"
+ benchmark:
+ f"{logdir + bench}" + "draw_heatmaps.txt"
+ threads: config['threads']['data_wrangling']
+ resources:
+ mem_mb = medium_memory_job,
+ # runtime_min = low_runtime_min
+ params:
+ script = "/Jovian/scripts/draw_heatmaps.py" if config['use_singularity_or_conda'] == "use_singularity" else srcdir("scripts/draw_heatmaps.py")
+ shell:
+ """
+python {params.script} -c {input.classified} -n {input.numbers} -sq {output.super_quantities} -st {output.stats} -vs {output.vir_stats} -ps {output.phage_stats} -bs {output.bact_stats} -s {output.super} -v {output.virus} -p {output.phage} -b {output.bact} > {log} 2>&1
+ """
+
+
+rule Copy_scaffolds:
+ input: rules.Assemble.output.scaff_filt
+ output: f"{res+scf}" + "{sample}_scaffolds.fasta"
+ threads: 1
+ resources:
+ mem_mb = low_memory_job,
+ # runtime_min = low_runtime_min
+ shell:
+ """
+cp {input} {output}
+ """
+
+
+vt_script_path = srcdir("scripts/virus_typing.sh") #? you can add a `--force` flag to the script to force it to overwrite previous results
+launch_report_script = srcdir("files/launch_report.sh") #? should be launched via iRODS, but leaving this for --local users and/or debugging
+onsuccess:
+ shell("""
+ echo -e "\nStarting virus typing, this may take a while...\n"
+ bash {vt_script_path} all
+
+ echo -e "Virus typing finished."
+
+ echo -e "Generating HTML index of log files..."
+ tree -hD --dirsfirst -H "../logs" -L 2 -T "Logs overview" --noreport --charset utf-8 -P "*" -o {res}logfiles_index.html {logdir}
+
+ echo -e "Copying \`launch_report.sh\` script to output folder."
+ cp {launch_report_script} ./
+ echo -e "\tLaunch the Jovian-report via command \`bash launch_report.sh ./\` from the user-specified output directory.\n\tNB, this requires singularity to be installed on your system."
+
+ echo -e "Jovian is finished with processing all the files in the given input directory."
+
+ echo -e "Shutting down..."
+ """)
+ return True
+
+
+onerror:
+ print("""
+ An error occurred and Jovian had to shut down.
+    Please check the input and logfiles for any abnormalities and try again.
+ """)
+ return False
diff --git a/Jovian/workflow/directories.py b/Jovian/workflow/directories.py
new file mode 100644
index 0000000..b880557
--- /dev/null
+++ b/Jovian/workflow/directories.py
@@ -0,0 +1,49 @@
+logdir = "logs/"
+bench = "benchmark/"
+
+conda_envs = "envs/"
+
+res = "results/"
+
+datadir = "data/"
+cln = "cleaned_fastq/"
+qcfilt = "QC_filter/"
+scf_classified = "scaffolds_classified/"
+
+asm = "assembly/"
+scf = "scaffolds/"
+
+html = "html/"
+json = "json/"
+
+qc_pre = "FastQC_pretrim/"
+qc_post = "FastQC_posttrim/"
+
+aln = "alignment/"
+bf = "bam-files/"
+vf = "vcf-files/"
+features = "features/"
+ann = "annotations/"
+muts = "mutations/"
+
+seqs = "sequences/"
+amino = "aminoacids/"
+com = "combined/"
+indiv = "individual/"
+
+tbl = "tables/"
+
+hmap = "heatmaps/"
+cnt = "counts/"
+mqc_data = "multiqc_data/"
+fls = "files/"
+
+raw = "raw/"
+filt = "filt/"
+
+cons = "consensus/"
+covs = "coverages/"
+insr = "inserts/"
+boc = "BoC/"
+chunks = "html_chunks/"
+trims = "trimmed/"
diff --git a/Jovian/workflow/envs/ancillary_files/Jovian_report.ipynb b/Jovian/workflow/envs/ancillary_files/Jovian_report.ipynb
new file mode 100644
index 0000000..511c406
--- /dev/null
+++ b/Jovian/workflow/envs/ancillary_files/Jovian_report.ipynb
@@ -0,0 +1,726 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "___\n",
+ "# Jovian analysis report\n",
+ "___"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Please visualize the report by pressing `Cell` in the toolbar and then selecting `Run All`. This can take a couple of minutes (depending on the size of your dataset).** \n",
+    "<br>\n",
+    "*N.B. The sum total of reads in this report will not add up to the total number of reads that were supplied as input. This is because (1) human reads are removed and (2) PCR-duplicates might be removed depending on the chosen configuration (by default, PCR-duplicates are not removed).* \n",
+    "<br>"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+    "deletable": false,
+ "init_cell": true,
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "%%html\n",
+ "
+