Merge pull request #4 from kubranarci/dev

dev changes
nf-core · Feb 22, 2024 · c6ba2ad · c6ba2ad
2 parents d596b8a + eb514e8
commit c6ba2ad
Show file tree

Hide file tree

Showing 45 changed files with 565 additions and 663 deletions.
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -1,5 +1,5 @@
 report_comment: >
-  This report has been generated by the <a href="https://github.com/nf-core/variantbenchmarking/releases/tag/dev" target="_blank">nf-core/variantbenchmarking</a>
+  This report has been generated by the <a href="https://github.com/nf-core/variantbenchmarking/tree/dev" target="_blank">nf-core/variantbenchmarking</a>
   analysis pipeline. For information about how to interpret these results, please see the
   <a href="https://nf-co.re/variantbenchmarking/dev/docs/output" target="_blank">documentation</a>.
 report_section_order:

diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
@@ -1,3 +1,3 @@
-sample,test_vcf,truth_vcf,caller,type
-HG002,"/Users/w620-admin/Desktop/nf-core/dataset/hg37/Broad_svaba_05052017/HG002.svaba.germline.sv.convBNDtoDEL.vcf","/Users/w620-admin/Desktop/nf-core/dataset/hg37/GIAB_Assemblytics_structural_variants_only_160618/hg002.Assemblytics_structural_variants.sorted.vcf.gz",svaba,sv
-HG003,"/Users/w620-admin/Desktop/nf-core/dataset/hg37/Broad_svaba_05052017/HG003.svaba.germline.sv.convBNDtoDEL.vcf","/Users/w620-admin/Desktop/nf-core/dataset/hg37/GIAB_Assemblytics_structural_variants_only_160618/hg003.Assemblytics_structural_variants.sorted.vcf.gz",svaba,sv
+test_vcf,caller,vartype
+"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.vcf.gz",mutect,sv
+"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/sv_query.vcf.gz",unknown,sv
diff --git a/assets/samplesheet_HG002.csv b/assets/samplesheet_HG002.csv
@@ -1,4 +1,5 @@
-test_vcf,caller
-"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_delly_SV_hg19.vcf.gz",delly
-"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_lumpy_SV_hg19.vcf.gz",lumpy
-"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_manta_SV_hg19_genotype.vcf",manta
+test_vcf,caller,vartype
+"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_delly_SV_hg19.vcf.gz",delly,sv
+"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_lumpy_SV_hg19.sorted.vcf.gz",lumpy,sv
+"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_manta_SV_hg19_genotype2.vcf",manta,sv
+"/Users/w620-admin/Desktop/nf-core/dataset/hg37/Broad_svaba_05052017/full.svaba.germline.sv.vcf",svaba,sv
diff --git a/assets/samplesheet_HG002_hg19.csv b/assets/samplesheet_HG002_hg19.csv
diff --git a/assets/samplesheet_HG002_hg38.csv b/assets/samplesheet_HG002_hg38.csv
@@ -1,5 +1,7 @@
-test_vcf,caller
-"/Users/w620-admin/Desktop/nf-core/dataset/hg38/GIAB_GRCh38_SVs_06252018/ajtrio.lumpy.svtyper.HG002.md.sorted.recal.vcf.gz",lumpy
-"/Users/w620-admin/Desktop/nf-core/dataset/hg38/GIAB_GRCh38_SVs_06252018/manta.HG002.vcf.gz",manta
-"/Users/w620-admin/Desktop/nf-core/dataset/hg37/Ashkenazim_unnanotated/Ashkenazim_HG002.filtered.sv.vcf.gz",merged
+test_vcf,caller,vartype
+"/Users/w620-admin/Desktop/nf-core/dataset/hg38/GIAB_GRCh38_SVs_06252018/ajtrio.lumpy.svtyper.HG002.md.sorted.recal.vcf.gz",lumpy,sv
+"/Users/w620-admin/Desktop/nf-core/dataset/hg38/GIAB_GRCh38_SVs_06252018/manta.HG002.vcf.gz",manta,sv
+"/Users/w620-admin/Desktop/nf-core/dataset/hg38/Ashkenazim_unnanotated/Ashkenazim_HG002.filtered.sv.vcf.gz",merged,sv
+"/Users/w620-admin/Desktop/nf-core/dataset/hg38/HG002_DRAGEN_SV_hg19.vcf.gz",dragen,sv
+
 
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -16,8 +16,14 @@
         "type": "string",
         "pattern": "^\\S+$",
         "errorMessage": "Name of the variant caller used to generate test file"
+      },
+      "vartype": {
+        "type": "string",
+        "pattern": "^\\S+$",
+        "errorMessage": "Variant type to benchmark"
       }
+
     },
-    "required": ["test_vcf","caller"]
+    "required": ["test_vcf","caller","vartype"]
   }
 }
diff --git a/bin/GangSTRConvertTruvariCompatible.py b/bin/GangSTRConvertTruvariCompatible.py
@@ -0,0 +1,166 @@
+import argparse
+import re
+from Bio import SeqIO
+
+header = """##FILTER=<ID=HOMREF,Description="Homozygous reference call">
+##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
+##INFO=<ID=SVLEN,Number=A,Type=Integer,Description="Difference in length between REF and ALT alleles">
+##INFO=<ID=EVENTTYPE,Number=A,Type=String,Description="Type of associated event">
+##EVENTTYPE=<ID=STR,Description="Short tandem repeat">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the longest variant described in this record">
+##INFO=<ID=REFRUC,Number=1,Type=Float,Description="Reference copy number of the tandem repeat">
+##INFO=<ID=CN,Number=A,Type=Float,Description="Copy number of allele">
+##INFO=<ID=RUS,Number=1,Type=String,Description="Repeat unit sequence of the corresponding repeat sequence">
+##INFO=<ID=RUL,Number=1,Type=Integer,Description="Repeat unit length of the corresponding repeat sequence">
+##INFO=<ID=RUC,Number=A,Type=Float,Description="Repeat unit count of the corresponding repeat sequence">
+##INFO=<ID=RUCCHANGE,Number=A,Type=Float,Description="Change in repeat unit count of the corresponding repeat sequence">
+##INFO=<ID=CNVTRLEN,Number=A,Type=Float,Description="Change in total length of the corresponding repeat sequence">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=CN,Number=1,Type=Float,Description="Sum of copy numbers">
+##ALT=<ID=CNV:TR,Description="Tandem repeat determined based on DNA abundance">
+"""
+
+
+def translateLine(line: str, ref: dict) -> str:
+    lineSplit = line.strip("\n").split("\t")
+
+    contigsAndAlleles = lineSplit[:-3]
+    contigsAndAlleles[3] = ref[contigsAndAlleles[0]].seq[
+        int(contigsAndAlleles[1]) - 1
+    ]
+    contigsAndAlleles[4] = re.sub("[a-z]+", "<CNV:TR>", contigsAndAlleles[4])
+
+    infoField = lineSplit[-3]
+    infoDict = re.findall("([A-Z]+)=([\+\-A-Za-z\.\d_:]+);", infoField)
+    assert len(infoDict) != 0
+    infoDict = dict(infoDict)
+
+    contigsAndAlleles[2] = (
+        "tr_"
+        + contigsAndAlleles[0]
+        + "_"
+        + contigsAndAlleles[1]
+        + "_"
+        + infoDict["END"]
+    )
+
+    formatKeys = lineSplit[-2].split(":")
+    if lineSplit[-1] == ".":
+        formatValues = ["."] * len(formatKeys)
+    else:
+        formatValues = lineSplit[-1].split(":")
+    formatDict = dict(zip(formatKeys, formatValues))
+
+    rul = len(infoDict["RU"])
+    svlen = int(infoDict["END"]) - int(contigsAndAlleles[1])
+    refruc = float(svlen) / rul
+    svlen = str(svlen)
+    repcn = formatDict["REPCN"].split(",")
+
+    if formatDict["GT"] in ["./.", "."]:
+        cn = "."
+        ruc = "."
+        rucchange = "."
+        cnvtrlen = "."
+        cnformat = "."
+    else:
+        if formatDict["GT"] == "1/2":
+            svlen = [svlen] * 2
+            ruc = [float(x) for x in repcn]
+            cn = [x / refruc for x in ruc]
+            rucchange = [x - refruc for x in ruc]
+            cnvtrlen = [x * rul for x in rucchange]
+        else:
+            svlen = [svlen]
+            ruc = (
+                float(repcn[0])
+                if formatDict["GT"] == "1/0"
+                else float(repcn[-1])
+            )
+            cn = [ruc / refruc]
+            rucchange = [ruc - refruc]
+            cnvtrlen = [rucchange[0] * rul]
+            ruc = [ruc]
+        cnformat = f"{sum(cn):.6f}"
+        svlen = ",".join(svlen)
+        cn = ",".join([f"{x:.2f}" for x in cn])
+        ruc = ",".join([f"{x:.2f}" for x in ruc])
+        rucchange = ",".join([f"{x:.6f}" for x in rucchange])
+        cnvtrlen = ",".join([f"{x:.6f}" for x in cnvtrlen])
+
+    info = ["SVTYPE=CNV", "EVENTTYPE=STR"]
+
+    info.append(f"SVLEN={svlen}")
+    info.append(f"END={infoDict['END']}")
+    info.append(f"REFRUC={refruc:.2f}")
+    info.append(f"RUS={infoDict['RU'].upper()}")
+    # RUL Gives issues with Wittyer (RulMismatch) not sure why
+    # info.append(
+    #    f"RUL={len(infoDict['RU'])}"
+    # )
+    info.append(f"CN={cn}")
+    info.append(f"RUC={ruc}")
+    info.append(f"RUCCHANGE={rucchange}")
+    info.append(f"CNVTRLEN={cnvtrlen}")
+
+    info = ";".join(info)
+
+    formatFields = ":".join(["GT", "CN"])
+
+    gt = formatDict["GT"]
+    gt = "0/1" if gt == "1/0" else gt
+
+    format = ":".join([gt, cnformat])
+
+    """
+    HOMREF calls have to be "masked" when using Witty.er, otherwise a HOMREF call that is not
+    present in the truth will be counted as a query FP, decreasing precision for no reason.
+    It's sufficient to change the FILTER from PASS or . to something else (like HOMREF)
+    """
+    contigsAndAlleles[6] = "HOMREF" if gt in ["0/0", "0"] else "PASS"
+
+    contigsAndAlleles = "\t".join(contigsAndAlleles)
+    return "\t".join([contigsAndAlleles, info, formatFields, format]) + "\n"
+
+
+def convertVcf(inputFile: str, outputFile: str, reference: str) -> None:
+    inputHandle = open(inputFile, "r")
+    outputHandle = open(outputFile, "w")
+
+    referenceDict = SeqIO.to_dict(SeqIO.parse(reference, "fasta"))
+
+    for line in inputHandle:
+        if re.search("^#[^#]", line):
+            outputHandle.write(header)
+            outputHandle.write(line)
+            continue
+
+        if re.search("^##(INFO)|(FORMAT)|(ALT)", line):
+            continue
+
+        if re.search("^##", line):
+            outputHandle.write(line)
+            continue
+
+        outputHandle.write(translateLine(line, referenceDict))
+
+    inputHandle.close()
+    outputHandle.close()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Convert the repeat genotyping module VCF output in a VCFv4.4 compliant format (similar to the VNTR module output)"
+    )
+    parser.add_argument(
+        "-i", "--input", help="input VCF", type=str, required=True
+    )
+    parser.add_argument(
+        "-r", "--reference", help="Genome FASTA", type=str, required=False
+    )
+    parser.add_argument(
+        "-o", "--output", help="output VCF", type=str, required=True
+    )
+    args = parser.parse_args()
+
+    convertVcf(args.input, args.output, args.reference)
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -36,9 +36,11 @@ def print_error(error, context="Line", context_str=""):
 def check_samplesheet(file_in, file_out):
     """
     This function checks that the samplesheet follows the following structure:
-    test_vcf,caller
-    test1.vcf,manta
-    test2.vcf,svaba
+    test_vcf,caller,vartype
+    test1.vcf,manta,sv
+    test2.vcf,svaba,small
+    test3.vcf,cnvkit,cnv
+
     For an example see:
     https://github.com/ghga-de/nf-benchmark/assets/samplesheet.csv
     """
@@ -47,8 +49,8 @@ def check_samplesheet(file_in, file_out):
     with open(file_in, "r", encoding='utf-8-sig') as fin:
 
         ## Check header
-        MIN_COLS = 2
-        HEADER = ["test_vcf","caller"]
+        MIN_COLS = 3
+        HEADER = ["test_vcf","caller","vartype"]
         header = [x.strip('"') for x in fin.readline().strip().split(",")]
         if header[: len(HEADER)] != HEADER:
             print(
@@ -78,7 +80,7 @@ def check_samplesheet(file_in, file_out):
                     )
 
                 ## Check caller name entries
-                test_vcf, caller = lspl[: len(HEADER)]
+                test_vcf, caller, vartype = lspl[: len(HEADER)]
                 if caller.find(" ") != -1:
                     print(
                         f"WARNING: Spaces have been replaced by underscores for caller: {caller}"
@@ -87,11 +89,11 @@ def check_samplesheet(file_in, file_out):
                 if not caller:
                     print_error("Caller entry has not been specified!", "Line", line)
 
-                sample_info = []  ## [test_vcf, caller ]
+                sample_info = []  ## [test_vcf, caller, vartype ]
 
-                sample_info = [test_vcf, caller]
+                sample_info = [test_vcf, caller, vartype]
 
-                ## Create caller mapping dictionary = {caller: [[test_vcf, caller ]]}
+                ## Create caller mapping dictionary = {caller: [[test_vcf, caller, vartype ]]}
                 if caller not in sample_mapping_dict:
                     sample_mapping_dict[caller] = [sample_info]
                 else:
@@ -106,7 +108,7 @@ def check_samplesheet(file_in, file_out):
         make_dir(out_dir)
         with open(file_out, "w") as fout:
             fout.write(
-                ",".join(["test_vcf","caller"])
+                ",".join(["test_vcf","caller", "vartype"])
                 + "\n"
             )
             for caller in sorted(sample_mapping_dict.keys()):