Skip to content

Commit

Permalink
Merge pull request #201 from Irallia/TEST/benchmarks/dataset_comparis…
Browse files Browse the repository at this point in the history
…on_igenvar_only

[BENCHMARK] dataset comparison igenvar only
  • Loading branch information
Irallia authored Apr 22, 2022
2 parents e02d86f + 9de92e5 commit fe3f74c
Show file tree
Hide file tree
Showing 10 changed files with 682 additions and 0 deletions.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/plots/iGenVar_only-results.all.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
13 changes: 13 additions & 0 deletions test/benchmark/caller_comparison_iGenVar_only/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
configfile: "Repos/iGenVar/test/benchmark/config/caller_comparison_config.yaml"

include: "workflow/rules/callers.smk"
include: "workflow/rules/eval.smk"
include: "workflow/rules/eval.DUP_as_INS.smk"
include: "workflow/rules/plots.smk"

##### Target rules #####

rule all:
input:
"results/caller_comparison_iGenVar_only/eval/results.all.png",
"results/caller_comparison_iGenVar_only/eval/results.DUP_as_INS.all.png"
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
sample = config["parameters"]["sample"],
min_var_length = config["parameters"]["min_var_length"],
max_var_length = config["parameters"]["max_var_length"]

rule run_iGenVar:
output:
vcf = "results/caller_comparison_iGenVar_only/{input_combination}/variants.vcf"
log:
"logs/caller_comparison_iGenVar_only/{input_combination}_output.log"
threads: 2
run:
if wildcards.input_combination == 'S1': # Illumina Paired End
short_bam = config["short_read_bam"]["s1"]
shell("""
/usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} \
--output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
--min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
""")
elif wildcards.input_combination == 'S2': # Illumina Mate Pair
short_bam = config["short_read_bam"]["s2"]
shell("""
/usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} \
--output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
--min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
""")
elif wildcards.input_combination == 'S1L1': # Illumina Paired End & MtSinai PacBio
short_bam = config["short_read_bam"]["s1"],
long_bam = config["long_read_bam"]["l1"]
shell("""
/usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \
--output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
--min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
""")
elif wildcards.input_combination == 'S2L1': # Illumina Mate Pair & MtSinai PacBio
short_bam = config["short_read_bam"]["s2"],
long_bam = config["long_read_bam"]["l1"]
shell("""
/usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \
--output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
--min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
""")
elif wildcards.input_combination == 'S1L2': # Illumina Paired End & PacBio CCS
short_bam = config["short_read_bam"]["s1"],
long_bam = config["long_read_bam"]["l2"]
shell("""
/usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \
--output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
--min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
""")
elif wildcards.input_combination == 'S2L2': # Illumina Mate Pair & PacBio CCS
short_bam = config["short_read_bam"]["s2"],
long_bam = config["long_read_bam"]["l2"]
shell("""
/usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \
--output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
--min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
""")
elif wildcards.input_combination == 'S1L3': # Illumina Paired End & 10X Genomics
short_bam = config["short_read_bam"]["s1"],
long_bam = config["long_read_bam"]["l3"]
shell("""
/usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \
--output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
--min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
""")
elif wildcards.input_combination == 'S2L3': # Illumina Mate Pair & 10X Genomics
short_bam = config["short_read_bam"]["s2"],
long_bam = config["long_read_bam"]["l3"]
shell("""
/usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \
--output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
--min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
""")
elif wildcards.input_combination == 'L1': # MtSinai PacBio
long_bam = config["long_read_bam"]["l1"]
shell("""
/usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_long_reads {long_bam} \
--output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
--min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
""")
elif wildcards.input_combination == 'L2': # PacBio CCS
long_bam = config["long_read_bam"]["l2"]
shell("""
/usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_long_reads {long_bam} \
--output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
--min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
""")
else: # wildcards.input_combination == 'L3': # 10X Genomics
long_bam = config["long_read_bam"]["l3"]
shell("""
/usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_long_reads {long_bam} \
--output {output.vcf} --vcf_sample_name {sample} --threads {threads} --verbose \
--min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log}
""")
# Defaults:
# --method cigar_string --method split_read --method read_pairs --method read_depth
# --clustering_methods hierarchical_clustering --refinement_methods no_refinement
# --max_tol_inserted_length 50 --max_overlap 10 --hierarchical_clustering_cutoff 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
rule DUP_as_INS:
input:
vcf = "results/caller_comparison_iGenVar_only/{input_combination}/variants.vcf"
output:
vcf = "results/caller_comparison_iGenVar_only/{input_combination}/variants.DUP_as_INS.vcf"
log:
"logs/caller_comparison_iGenVar_only/DUP_as_INS.{input_combination}.log"
shell:
"sed -e 's/<DUP:TANDEM>/<INS>/g' {input.vcf} | sed -e 's/SVTYPE=DUP/SVTYPE=INS/g' > {output.vcf}"

rule DUP_as_INS_filter_vcf:
input:
vcf = "results/caller_comparison_iGenVar_only/{input_combination}/variants.DUP_as_INS.vcf"
output:
vcf = "results/caller_comparison_iGenVar_only/{input_combination}/variants.DUP_as_INS.min_qual_{min_qual}.vcf"
log:
"logs/caller_comparison_iGenVar_only/filter_vcf_output.{input_combination}.DUP_as_INS.{min_qual}.log",
conda:
"../../../envs/bcftools.yaml"
shell:
"bcftools view -i 'QUAL>={wildcards.min_qual}.00' {input.vcf} > {output.vcf} 2> {log}"

rule DUP_as_INS_truvari:
input:
vcf = "results/caller_comparison_iGenVar_only/{input_combination}/variants.DUP_as_INS.min_qual_{min_qual}.vcf.gz",
index = "results/caller_comparison_iGenVar_only/{input_combination}/variants.DUP_as_INS.min_qual_{min_qual}.vcf.gz.tbi"
output:
summary = "results/caller_comparison_iGenVar_only/eval/{input_combination}/DUP_as_INS.min_qual_{min_qual}/summary.txt"
log:
"logs/truvari/truvari_output.{input_combination}.{min_qual}.log"
params:
output_dir = "results/caller_comparison_iGenVar_only/eval/{input_combination}/DUP_as_INS.min_qual_{min_qual}",
truth_set_gz = config["truth_set_renamed_chr"]["gz"],
truth_set_bed = config["truth_set_renamed_chr"]["bed"]
shell:
"""
rm -rf {params.output_dir} && truvari bench -b {params.truth_set_gz} -c {input.vcf} -o {params.output_dir} -p 0 \
--passonly --includebed {params.truth_set_bed} &>> {log}
"""
# -f data/reference/hs37d5.fa

rule DUP_as_INS_reformat_truvari_results:
input:
"results/caller_comparison_iGenVar_only/eval/{input_combination}/DUP_as_INS.min_qual_{min_qual}/summary.txt"
output:
"results/caller_comparison_iGenVar_only/eval/{input_combination}/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt"
threads: 1
shell:
"""
cat {input} | grep '\<precision\>\|\<recall\>' | tr -d ',' |sed 's/^[ \t]*//' | tr -d '\"' | tr -d ' ' \
| tr ':' '\t' | awk 'OFS=\"\\t\" {{ print \"{wildcards.input_combination}\", \"{wildcards.min_qual}\", $1, $2 }}' \
> {output}
"""

rule DUP_as_INS_cat_truvari_results_all:
input:
S1 = expand("results/caller_comparison_iGenVar_only/eval/S1/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["from"],
config["quality_ranges"]["to"],
config["quality_ranges"]["step"]))),
S2 = expand("results/caller_comparison_iGenVar_only/eval/S2/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["from"],
config["quality_ranges"]["to"],
config["quality_ranges"]["step"]))),
S1L1 = expand("results/caller_comparison_iGenVar_only/eval/S1L1/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["from"],
config["quality_ranges"]["to"],
config["quality_ranges"]["step"]))),
S2L1 = expand("results/caller_comparison_iGenVar_only/eval/S2L1/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["from"],
config["quality_ranges"]["to"],
config["quality_ranges"]["step"]))),
S1L2 = expand("results/caller_comparison_iGenVar_only/eval/S1L2/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["from"],
config["quality_ranges"]["to"],
config["quality_ranges"]["step"]))),
S2L2 = expand("results/caller_comparison_iGenVar_only/eval/S2L2/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["from"],
config["quality_ranges"]["to"],
config["quality_ranges"]["step"]))),
S1L3 = expand("results/caller_comparison_iGenVar_only/eval/S1L3/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["from"],
config["quality_ranges"]["to"],
config["quality_ranges"]["step"]))),
S2L3 = expand("results/caller_comparison_iGenVar_only/eval/S2L3/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["from"],
config["quality_ranges"]["to"],
config["quality_ranges"]["step"]))),
L1 = expand("results/caller_comparison_iGenVar_only/eval/L1/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["from"],
config["quality_ranges"]["to"],
config["quality_ranges"]["step"]))),
L2 = expand("results/caller_comparison_iGenVar_only/eval/L2/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["from"],
config["quality_ranges"]["to"],
config["quality_ranges"]["step"]))),
L3 = expand("results/caller_comparison_iGenVar_only/eval/L3/DUP_as_INS.min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["from"],
config["quality_ranges"]["to"],
config["quality_ranges"]["step"])))
output:
S1 = temp("results/caller_comparison_iGenVar_only/eval/S1.DUP_as_INS.all_results.txt"),
S2 = temp("results/caller_comparison_iGenVar_only/eval/S2.DUP_as_INS.all_results.txt"),
S1L1 = temp("results/caller_comparison_iGenVar_only/eval/S1L1.DUP_as_INS.all_results.txt"),
S2L1 = temp("results/caller_comparison_iGenVar_only/eval/S2L1.DUP_as_INS.all_results.txt"),
S1L2 = temp("results/caller_comparison_iGenVar_only/eval/S1L2.DUP_as_INS.all_results.txt"),
S2L2 = temp("results/caller_comparison_iGenVar_only/eval/S2L2.DUP_as_INS.all_results.txt"),
S1L3 = temp("results/caller_comparison_iGenVar_only/eval/S1L3.DUP_as_INS.all_results.txt"),
S2L3 = temp("results/caller_comparison_iGenVar_only/eval/S2L3.DUP_as_INS.all_results.txt"),
L1 = temp("results/caller_comparison_iGenVar_only/eval/L1.DUP_as_INS.all_results.txt"),
L2 = temp("results/caller_comparison_iGenVar_only/eval/L2.DUP_as_INS.all_results.txt"),
L3 = temp("results/caller_comparison_iGenVar_only/eval/L3.DUP_as_INS.all_results.txt"),
all = "results/caller_comparison_iGenVar_only/eval/DUP_as_INS.all_results.txt"
threads: 1
run:
shell("cat {input.S1} > {output.S1}")
shell("cat {input.S2} > {output.S2}")
shell("cat {input.S1L1} > {output.S1L1}")
shell("cat {input.S2L1} > {output.S2L1}")
shell("cat {input.S1L2} > {output.S1L2}")
shell("cat {input.S2L2} > {output.S2L2}")
shell("cat {input.S1L3} > {output.S1L3}")
shell("cat {input.S2L3} > {output.S2L3}")
shell("cat {input.L1} > {output.L1}")
shell("cat {input.L2} > {output.L2}")
shell("cat {input.L3} > {output.L3}")
shell("""
cat {output.S1} {output.S2} {output.S1L1} {output.S2L1} \
{output.S1L2} {output.S2L2} {output.S1L3} {output.S2L3} \
{output.L1} {output.L2} {output.L3} > {output.all}
""")
Loading

0 comments on commit fe3f74c

Please sign in to comment.