refactor!: update juno library to v2

RIVM-bioinformatics · Feb 21, 2024 · 347971f
1 parent 396a8c9
commit 347971f
Show file tree

Hide file tree

Showing 5 changed files with 437 additions and 390 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,8 @@ __pycache__/
 output/
 .parallel/
 .pytest_cache
+.cache
+.vscode
 
 # Files to ignore
 *.sif

diff --git a/Snakefile b/Snakefile
@@ -10,87 +10,105 @@ Date: 07-03-2022
 #################################################################################
 
 from yaml import safe_load
-import pathlib
+from pathlib import Path
 from shutil import copyfile
 
 #################################################################################
 #####                   Load samplesheet and config params                  #####
 #################################################################################
 
-# Loading sample sheet as dictionary 
+# Loading sample sheet as dictionary
 # ("R1" and "R2" keys for fastq)
 sample_sheet = config["sample_sheet"]
 SAMPLES = {}
 with open(sample_sheet) as sample_sheet_file:
-    SAMPLES = safe_load(sample_sheet_file) 
+    SAMPLES = safe_load(sample_sheet_file)
 
-GIVEN_REF=config['ref']
+GIVEN_REF = config["reference"]
 
-#@################################################################################
-#@####                         Expected output                               #####
-#@################################################################################
-output_dir = pathlib.Path(config["out"])
-log_dir = output_dir.joinpath('log')
-db_dir = pathlib.Path(config["db_dir"])
-mash_db = db_dir.joinpath('bacteria-refseq', 'db.msh')
-referenceseeker_md5 = str(db_dir.joinpath('bacteria-refseq', 'downloaded_db.txt'))
+#################################################################################
+#####                         Expected output                               #####
+#################################################################################
+output_dir = Path(config["output_dir"])
+log_dir = output_dir.joinpath("log")
+db_dir = Path(config["db_dir"])
+mash_db = db_dir.joinpath("bacteria-refseq", "db.msh")
+referenceseeker_md5 = str(db_dir.joinpath("bacteria-refseq", "downloaded_db.txt"))
 
-if config['dryrun'] is True and GIVEN_REF is not None:
+if (config["dryrun"] is True) and (GIVEN_REF != "None"):
     ref_genome = GIVEN_REF
 else:
-    ref_genome = output_dir.joinpath('ref_genomes_used', 'cluster_1', 'ref_genome.fasta')
+    ref_genome = output_dir.joinpath(
+        "ref_genomes_used", "cluster_1", "ref_genome.fasta"
+    )
 
-if GIVEN_REF is not None and not ref_genome.exists():
+# GIVEN_REF is converted to str
+if (GIVEN_REF != "None") and (not ref_genome.exists()):
+    print(f"Copying reference genome {GIVEN_REF} to {ref_genome}")
     output_dir.mkdir(exist_ok=True, parents=True)
     ref_dir = ref_genome.parent
     ref_dir.mkdir(exist_ok=True, parents=True)
     copyfile(GIVEN_REF, ref_genome)
 
+
 def get_output_per_cluster(cluster):
     with open(checkpoints.preclustering.get(**cluster).output[0]) as file:
         SAMPLE_CLUSTERS = yaml.safe_load(file)
-    CLUSTERS = set([ cluster for sample, cluster in SAMPLE_CLUSTERS.items() ])
-    output_files = expand(output_dir.joinpath('tree/cluster_{cluster}/{file}'),
-                    cluster=CLUSTERS,
-                    file=['newick_tree.txt', 'snp_matrix.csv'])
-    output_iqtree = expand(output_dir.joinpath('ml_tree', 'cluster_{cluster}'),
-                    cluster=CLUSTERS)
+    CLUSTERS = set([cluster for sample, cluster in SAMPLE_CLUSTERS.items()])
+    output_files = expand(
+        output_dir.joinpath("tree/cluster_{cluster}/{file}"),
+        cluster=CLUSTERS,
+        file=["newick_tree.txt", "snp_matrix.csv"],
+    )
+    output_iqtree = expand(
+        output_dir.joinpath("ml_tree", "cluster_{cluster}"), cluster=CLUSTERS
+    )
     return output_files + output_iqtree
 
 
-#@################################################################################
-#@####                              Processes                                #####
-#@################################################################################
+#################################################################################
+#####                              Processes                                #####
+#################################################################################
+
+if GIVEN_REF != "None":
 
-if GIVEN_REF is not None:
     include: "bin/rules/mock_cluster.smk"
+
 else:
+
     include: "bin/rules/pre_cluster.smk"
     include: "bin/rules/find_reference.smk"
 
+
 include: "bin/rules/snp_analysis.smk"
 include: "bin/rules/dm_n_viz.smk"
 
-#@################################################################################
-#@####              Finalize pipeline (error/success)                        #####
-#@################################################################################
+
+#################################################################################
+#####              Finalize pipeline (error/success)                        #####
+#################################################################################
+
 
 onerror:
-    shell("""
+    shell(
+        """
 rm -f tmp*npy
 rm -f tmp*_fastme_stat.txt
 rm -f tmp*_fastme_tree.nwk
 rm -f tmp*dist.list
 echo -e "Something went wrong with Juno-SNP pipeline. Please check the logging files in {output_dir}/log/"
-    """)
+    """
+    )
 
 
 #################################################################################
 #####                       Specify final output                            #####
 #################################################################################
 
+
 localrules:
-    all
+    all,
+
 
 rule all:
     input:

diff --git a/envs/juno_snp.yaml b/envs/juno_snp.yaml
@@ -0,0 +1,19 @@
+name: juno_snp
+channels:
+  - bioconda
+  - conda-forge
+  - nodefaults
+dependencies:
+  - git=2.40.*
+  - mamba==1.3.*
+  - pip>=21.2
+  - pandas==1.3.2
+  - python>3.7.6
+  - snakemake==7.32.0
+  - pytest
+  - biopython
+  - pip:
+    - numpy
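+    - "--editable=git+https://github.com/RIVM-bioinformatics/[email protected]#egg=juno_library"  # NOTE(review): "[email protected]" is email-obfuscation residue from the page capture; the original is a "<repo>@<v2 tag>" git ref - restore it from the repository before use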
+
+
diff --git a/envs/master_env.yaml b/envs/master_env.yaml
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,6 +6,8 @@ __pycache__/ @@
     output/
     .parallel/
     .pytest_cache
+    .cache
+    .vscode
     # Files to ignore
     *.sif
@@ Expand Down @@