Merge pull request #201 from Irallia/TEST/benchmarks/dataset_comparis…

…on_igenvar_only [BENCHMARK] dataset comparison igenvar only
seqan · Apr 22, 2022 · fe3f74c · fe3f74c
2 parents e02d86f + 9de92e5
commit fe3f74c
Show file tree

Hide file tree

Showing 10 changed files with 682 additions and 0 deletions.
diff --git a/doc/plots/iGenVar_only-results.DUP_as_INS.all.png b/doc/plots/iGenVar_only-results.DUP_as_INS.all.png
diff --git a/doc/plots/iGenVar_only-results.all.png b/doc/plots/iGenVar_only-results.all.png
diff --git a/test/benchmark/caller_comparison_iGenVar_only/Snakefile b/test/benchmark/caller_comparison_iGenVar_only/Snakefile
@@ -0,0 +1,13 @@
+configfile: "Repos/iGenVar/test/benchmark/config/caller_comparison_config.yaml"
+
+include: "workflow/rules/callers.smk"
+include: "workflow/rules/eval.smk"
+include: "workflow/rules/eval.DUP_as_INS.smk"
+include: "workflow/rules/plots.smk"
+
+##### Target rules #####
+
+rule all:
+    input:
+        "results/caller_comparison_iGenVar_only/eval/results.all.png",
+        "results/caller_comparison_iGenVar_only/eval/results.DUP_as_INS.all.png"
diff --git a/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/callers.smk b/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/callers.smk
@@ -0,0 +1,98 @@
+sample = config["parameters"]["sample"],
+min_var_length = config["parameters"]["min_var_length"],
+max_var_length = config["parameters"]["max_var_length"]
+
+rule run_iGenVar:
+    output:
+        vcf = "results/caller_comparison_iGenVar_only/{input_combination}/variants.vcf"
+    log:
+        "logs/caller_comparison_iGenVar_only/{input_combination}_output.log"
+    threads: 2
+    run:
+        if wildcards.input_combination == 'S1': # Illumina Paired End
+            short_bam = config["short_read_bam"]["s1"]
+            shell("""
+            /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} \
+                --output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
+                --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
+            """)
+        elif wildcards.input_combination == 'S2': # Illumina Mate Pair
+            short_bam = config["short_read_bam"]["s2"]
+            shell("""
+            /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} \
+                --output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
+                --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
+            """)
+        elif wildcards.input_combination == 'S1L1': # Illumina Paired End & MtSinai PacBio
+            short_bam = config["short_read_bam"]["s1"],
+            long_bam = config["long_read_bam"]["l1"]
+            shell("""
+            /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \
+                --output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
+                --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
+            """)
+        elif wildcards.input_combination == 'S2L1': # Illumina Mate Pair & MtSinai PacBio
+            short_bam = config["short_read_bam"]["s2"],
+            long_bam = config["long_read_bam"]["l1"]
+            shell("""
+            /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \
+                --output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
+                --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
+            """)
+        elif wildcards.input_combination == 'S1L2': # Illumina Paired End & PacBio CCS
+            short_bam = config["short_read_bam"]["s1"],
+            long_bam = config["long_read_bam"]["l2"]
+            shell("""
+            /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \
+                --output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
+                --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
+            """)
+        elif wildcards.input_combination == 'S2L2': # Illumina Mate Pair & PacBio CCS
+            short_bam = config["short_read_bam"]["s2"],
+            long_bam = config["long_read_bam"]["l2"]
+            shell("""
+            /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \
+                --output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
+                --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
+            """)
+        elif wildcards.input_combination == 'S1L3': # Illumina Paired End & 10X Genomics
+            short_bam = config["short_read_bam"]["s1"],
+            long_bam = config["long_read_bam"]["l3"]
+            shell("""
+            /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \
+                --output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
+                --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
+            """)
+        elif wildcards.input_combination == 'S2L3': # Illumina Mate Pair & 10X Genomics
+            short_bam = config["short_read_bam"]["s2"],
+            long_bam = config["long_read_bam"]["l3"]
+            shell("""
+            /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \
+                --output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
+                --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
+            """)
+        elif wildcards.input_combination == 'L1': # MtSinai PacBio
+            long_bam = config["long_read_bam"]["l1"]
+            shell("""
+            /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_long_reads {long_bam} \
+                 --output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
+                --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
+            """)
+        elif wildcards.input_combination == 'L2': # PacBio CCS
+            long_bam = config["long_read_bam"]["l2"]
+            shell("""
+            /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_long_reads {long_bam} \
+                 --output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
+                --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
+            """)
+        else: # wildcards.input_combination == 'L3': # 10X Genomics
+            long_bam = config["long_read_bam"]["l3"]
+            shell("""
+            /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_long_reads {long_bam} \
+                 --output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
+                --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
+            """)
+        # Defaults:
+        # --method cigar_string --method split_read --method read_pairs --method read_depth
+        # --clustering_methods hierarchical_clustering --refinement_methods no_refinement
+        # --max_tol_inserted_length 50 --max_overlap 10 --hierarchical_clustering_cutoff 0.5
diff --git a/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.DUP_as_INS.smk b/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.DUP_as_INS.smk
@@ -0,0 +1,131 @@
+rule DUP_as_INS:
+    input:
+        vcf = "results/caller_comparison_iGenVar_only/{input_combination}/variants.vcf"
+    output:
+        vcf = "results/caller_comparison_iGenVar_only/{input_combination}/variants.DUP_as_INS.vcf"
+    log:
+        "logs/caller_comparison_iGenVar_only/DUP_as_INS.{input_combination}.log"
+    shell:
+        "sed -e 's/<DUP:TANDEM>/<INS>/g' {input.vcf} | sed -e 's/SVTYPE=DUP/SVTYPE=INS/g' > {output.vcf}"
+
+rule DUP_as_INS_filter_vcf:
+    input:
+        vcf = "results/caller_comparison_iGenVar_only/{input_combination}/variants.DUP_as_INS.vcf"
+    output:
+        vcf = "results/caller_comparison_iGenVar_only/{input_combination}/variants.DUP_as_INS.min_qual_{min_qual}.vcf"
+    log:
+        "logs/caller_comparison_iGenVar_only/filter_vcf_output.{input_combination}.DUP_as_INS.{min_qual}.log",
+    conda:
+        "../../../envs/bcftools.yaml"
+    shell:
+        "bcftools view -i 'QUAL>={wildcards.min_qual}.00' {input.vcf} > {output.vcf} 2> {log}"
+
+rule DUP_as_INS_truvari:
+    input:
+        vcf = "results/caller_comparison_iGenVar_only/{input_combination}/variants.DUP_as_INS.min_qual_{min_qual}.vcf.gz",
+        index = "results/caller_comparison_iGenVar_only/{input_combination}/variants.DUP_as_INS.min_qual_{min_qual}.vcf.gz.tbi"
+    output:
+        summary = "results/caller_comparison_iGenVar_only/eval/{input_combination}/DUP_as_INS.min_qual_{min_qual}/summary.txt"
+    log:
+        "logs/truvari/truvari_output.{input_combination}.{min_qual}.log"
+    params:
+        output_dir = "results/caller_comparison_iGenVar_only/eval/{input_combination}/DUP_as_INS.min_qual_{min_qual}",
+        truth_set_gz = config["truth_set_renamed_chr"]["gz"],
+        truth_set_bed = config["truth_set_renamed_chr"]["bed"]
+    shell:
+        """
+        rm -rf {params.output_dir} && truvari bench -b {params.truth_set_gz} -c {input.vcf} -o {params.output_dir} -p 0 \
+            --passonly --includebed {params.truth_set_bed} &>> {log}
+        """
+        # -f data/reference/hs37d5.fa
+
+rule DUP_as_INS_reformat_truvari_results:
+    input:
+        "results/caller_comparison_iGenVar_only/eval/{input_combination}/DUP_as_INS.min_qual_{min_qual}/summary.txt"
+    output:
+        "results/caller_comparison_iGenVar_only/eval/{input_combination}/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt"
+    threads: 1
+    shell:
+        """
+        cat {input} | grep '\<precision\>\|\<recall\>' | tr -d ',' |sed 's/^[ \t]*//' | tr -d '\"' | tr -d ' ' \
+            | tr ':' '\t' | awk 'OFS=\"\\t\" {{ print \"{wildcards.input_combination}\", \"{wildcards.min_qual}\", $1, $2 }}' \
+            > {output}
+        """
+
+rule DUP_as_INS_cat_truvari_results_all:
+    input:
+        S1   = expand("results/caller_comparison_iGenVar_only/eval/S1/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
+                              min_qual=list(range(config["quality_ranges"]["from"],
+                                                  config["quality_ranges"]["to"],
+                                                  config["quality_ranges"]["step"]))),
+        S2   = expand("results/caller_comparison_iGenVar_only/eval/S2/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
+                              min_qual=list(range(config["quality_ranges"]["from"],
+                                                  config["quality_ranges"]["to"],
+                                                  config["quality_ranges"]["step"]))),
+        S1L1 = expand("results/caller_comparison_iGenVar_only/eval/S1L1/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
+                              min_qual=list(range(config["quality_ranges"]["from"],
+                                                  config["quality_ranges"]["to"],
+                                                  config["quality_ranges"]["step"]))),
+        S2L1 = expand("results/caller_comparison_iGenVar_only/eval/S2L1/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
+                              min_qual=list(range(config["quality_ranges"]["from"],
+                                                  config["quality_ranges"]["to"],
+                                                  config["quality_ranges"]["step"]))),
+        S1L2 = expand("results/caller_comparison_iGenVar_only/eval/S1L2/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
+                              min_qual=list(range(config["quality_ranges"]["from"],
+                                                  config["quality_ranges"]["to"],
+                                                  config["quality_ranges"]["step"]))),
+        S2L2 = expand("results/caller_comparison_iGenVar_only/eval/S2L2/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
+                              min_qual=list(range(config["quality_ranges"]["from"],
+                                                  config["quality_ranges"]["to"],
+                                                  config["quality_ranges"]["step"]))),
+        S1L3 = expand("results/caller_comparison_iGenVar_only/eval/S1L3/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
+                              min_qual=list(range(config["quality_ranges"]["from"],
+                                                  config["quality_ranges"]["to"],
+                                                  config["quality_ranges"]["step"]))),
+        S2L3 = expand("results/caller_comparison_iGenVar_only/eval/S2L3/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
+                              min_qual=list(range(config["quality_ranges"]["from"],
+                                                  config["quality_ranges"]["to"],
+                                                  config["quality_ranges"]["step"]))),
+        L1   = expand("results/caller_comparison_iGenVar_only/eval/L1/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
+                              min_qual=list(range(config["quality_ranges"]["from"],
+                                                  config["quality_ranges"]["to"],
+                                                  config["quality_ranges"]["step"]))),
+        L2   = expand("results/caller_comparison_iGenVar_only/eval/L2/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
+                              min_qual=list(range(config["quality_ranges"]["from"],
+                                                  config["quality_ranges"]["to"],
+                                                  config["quality_ranges"]["step"]))),
+        L3   = expand("results/caller_comparison_iGenVar_only/eval/L3/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
+                              min_qual=list(range(config["quality_ranges"]["from"],
+                                                  config["quality_ranges"]["to"],
+                                                  config["quality_ranges"]["step"])))
+    output:
+        S1   = temp("results/caller_comparison_iGenVar_only/eval/S1.DUP_as_INS.all_results.txt"),
+        S2   = temp("results/caller_comparison_iGenVar_only/eval/S2.DUP_as_INS.all_results.txt"),
+        S1L1 = temp("results/caller_comparison_iGenVar_only/eval/S1L1.DUP_as_INS.all_results.txt"),
+        S2L1 = temp("results/caller_comparison_iGenVar_only/eval/S2L1.DUP_as_INS.all_results.txt"),
+        S1L2 = temp("results/caller_comparison_iGenVar_only/eval/S1L2.DUP_as_INS.all_results.txt"),
+        S2L2 = temp("results/caller_comparison_iGenVar_only/eval/S2L2.DUP_as_INS.all_results.txt"),
+        S1L3 = temp("results/caller_comparison_iGenVar_only/eval/S1L3.DUP_as_INS.all_results.txt"),
+        S2L3 = temp("results/caller_comparison_iGenVar_only/eval/S2L3.DUP_as_INS.all_results.txt"),
+        L1   = temp("results/caller_comparison_iGenVar_only/eval/L1.DUP_as_INS.all_results.txt"),
+        L2   = temp("results/caller_comparison_iGenVar_only/eval/L2.DUP_as_INS.all_results.txt"),
+        L3   = temp("results/caller_comparison_iGenVar_only/eval/L3.DUP_as_INS.all_results.txt"),
+        all = "results/caller_comparison_iGenVar_only/eval/DUP_as_INS.all_results.txt"
+    threads: 1
+    run:
+        shell("cat {input.S1} > {output.S1}")
+        shell("cat {input.S2} > {output.S2}")
+        shell("cat {input.S1L1} > {output.S1L1}")
+        shell("cat {input.S2L1} > {output.S2L1}")
+        shell("cat {input.S1L2} > {output.S1L2}")
+        shell("cat {input.S2L2} > {output.S2L2}")
+        shell("cat {input.S1L3} > {output.S1L3}")
+        shell("cat {input.S2L3} > {output.S2L3}")
+        shell("cat {input.L1} > {output.L1}")
+        shell("cat {input.L2} > {output.L2}")
+        shell("cat {input.L3} > {output.L3}")
+        shell("""
+            cat {output.S1} {output.S2} {output.S1L1} {output.S2L1} \
+                {output.S1L2} {output.S2L2} {output.S1L3} {output.S2L3} \
+                {output.L1} {output.L2} {output.L3} > {output.all}
+        """)