Skip to content

Commit

Permalink
Merge pull request #4 from kubranarci/dev
Browse files Browse the repository at this point in the history
dev changes
  • Loading branch information
kubranarci authored Feb 22, 2024
2 parents d596b8a + eb514e8 commit c6ba2ad
Show file tree
Hide file tree
Showing 45 changed files with 565 additions and 663 deletions.
2 changes: 1 addition & 1 deletion assets/multiqc_config.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
report_comment: >
This report has been generated by the <a href="https://github.com/nf-core/variantbenchmarking/releases/tag/dev" target="_blank">nf-core/variantbenchmarking</a>
This report has been generated by the <a href="https://github.com/nf-core/variantbenchmarking/tree/dev" target="_blank">nf-core/variantbenchmarking</a>
analysis pipeline. For information about how to interpret these results, please see the
<a href="https://nf-co.re/variantbenchmarking/dev/docs/output" target="_blank">documentation</a>.
report_section_order:
Expand Down
6 changes: 3 additions & 3 deletions assets/samplesheet.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
sample,test_vcf,truth_vcf,caller,type
HG002,"/Users/w620-admin/Desktop/nf-core/dataset/hg37/Broad_svaba_05052017/HG002.svaba.germline.sv.convBNDtoDEL.vcf","/Users/w620-admin/Desktop/nf-core/dataset/hg37/GIAB_Assemblytics_structural_variants_only_160618/hg002.Assemblytics_structural_variants.sorted.vcf.gz",svaba,sv
HG003,"/Users/w620-admin/Desktop/nf-core/dataset/hg37/Broad_svaba_05052017/HG003.svaba.germline.sv.convBNDtoDEL.vcf","/Users/w620-admin/Desktop/nf-core/dataset/hg37/GIAB_Assemblytics_structural_variants_only_160618/hg003.Assemblytics_structural_variants.sorted.vcf.gz",svaba,sv
test_vcf,caller,vartype
"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.vcf.gz",mutect,sv
"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/sv_query.vcf.gz",unknown,sv
9 changes: 5 additions & 4 deletions assets/samplesheet_HG002.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
test_vcf,caller
"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_delly_SV_hg19.vcf.gz",delly
"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_lumpy_SV_hg19.vcf.gz",lumpy
"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_manta_SV_hg19_genotype.vcf",manta
test_vcf,caller,vartype
"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_delly_SV_hg19.vcf.gz",delly,sv
"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_lumpy_SV_hg19.sorted.vcf.gz",lumpy,sv
"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_manta_SV_hg19_genotype2.vcf",manta,sv
"/Users/w620-admin/Desktop/nf-core/dataset/hg37/Broad_svaba_05052017/full.svaba.germline.sv.vcf",svaba,sv
5 changes: 0 additions & 5 deletions assets/samplesheet_HG002_hg19.csv

This file was deleted.

10 changes: 6 additions & 4 deletions assets/samplesheet_HG002_hg38.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
test_vcf,caller
"/Users/w620-admin/Desktop/nf-core/dataset/hg38/GIAB_GRCh38_SVs_06252018/ajtrio.lumpy.svtyper.HG002.md.sorted.recal.vcf.gz",lumpy
"/Users/w620-admin/Desktop/nf-core/dataset/hg38/GIAB_GRCh38_SVs_06252018/manta.HG002.vcf.gz",manta
"/Users/w620-admin/Desktop/nf-core/dataset/hg37/Ashkenazim_unnanotated/Ashkenazim_HG002.filtered.sv.vcf.gz",merged
test_vcf,caller,vartype
"/Users/w620-admin/Desktop/nf-core/dataset/hg38/GIAB_GRCh38_SVs_06252018/ajtrio.lumpy.svtyper.HG002.md.sorted.recal.vcf.gz",lumpy,sv
"/Users/w620-admin/Desktop/nf-core/dataset/hg38/GIAB_GRCh38_SVs_06252018/manta.HG002.vcf.gz",manta,sv
"/Users/w620-admin/Desktop/nf-core/dataset/hg38/Ashkenazim_unnanotated/Ashkenazim_HG002.filtered.sv.vcf.gz",merged,sv
"/Users/w620-admin/Desktop/nf-core/dataset/hg38/HG002_DRAGEN_SV_hg19.vcf.gz",dragen,sv


8 changes: 7 additions & 1 deletion assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,14 @@
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Name of the variant caller used to generate test file"
},
"vartype": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Variant type to benchmark"
}

},
"required": ["test_vcf","caller"]
"required": ["test_vcf","caller","vartype"]
}
}
166 changes: 166 additions & 0 deletions bin/GangSTRConvertTruvariCompatible.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import argparse
import re
from Bio import SeqIO

header = """##FILTER=<ID=HOMREF,Description="Homozygous reference call">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##INFO=<ID=SVLEN,Number=A,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=EVENTTYPE,Number=A,Type=String,Description="Type of associated event">
##EVENTTYPE=<ID=STR,Description="Short tandem repeat">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the longest variant described in this record">
##INFO=<ID=REFRUC,Number=1,Type=Float,Description="Reference copy number of the tandem repeat">
##INFO=<ID=CN,Number=A,Type=Float,Description="Copy number of allele">
##INFO=<ID=RUS,Number=1,Type=String,Description="Repeat unit sequence of the corresponding repeat sequence">
##INFO=<ID=RUL,Number=1,Type=Integer,Description="Repeat unit length of the corresponding repeat sequence">
##INFO=<ID=RUC,Number=A,Type=Float,Description="Repeat unit count of the corresponding repeat sequence">
##INFO=<ID=RUCCHANGE,Number=A,Type=Float,Description="Change in repeat unit count of the corresponding repeat sequence">
##INFO=<ID=CNVTRLEN,Number=A,Type=Float,Description="Change in total length of the corresponding repeat sequence">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=CN,Number=1,Type=Float,Description="Sum of copy numbers">
##ALT=<ID=CNV:TR,Description="Tandem repeat determined based on DNA abundance">
"""


def translateLine(line: str, ref: dict) -> str:
lineSplit = line.strip("\n").split("\t")

contigsAndAlleles = lineSplit[:-3]
contigsAndAlleles[3] = ref[contigsAndAlleles[0]].seq[
int(contigsAndAlleles[1]) - 1
]
contigsAndAlleles[4] = re.sub("[a-z]+", "<CNV:TR>", contigsAndAlleles[4])

infoField = lineSplit[-3]
infoDict = re.findall("([A-Z]+)=([\+\-A-Za-z\.\d_:]+);", infoField)
assert len(infoDict) != 0
infoDict = dict(infoDict)

contigsAndAlleles[2] = (
"tr_"
+ contigsAndAlleles[0]
+ "_"
+ contigsAndAlleles[1]
+ "_"
+ infoDict["END"]
)

formatKeys = lineSplit[-2].split(":")
if lineSplit[-1] == ".":
formatValues = ["."] * len(formatKeys)
else:
formatValues = lineSplit[-1].split(":")
formatDict = dict(zip(formatKeys, formatValues))

rul = len(infoDict["RU"])
svlen = int(infoDict["END"]) - int(contigsAndAlleles[1])
refruc = float(svlen) / rul
svlen = str(svlen)
repcn = formatDict["REPCN"].split(",")

if formatDict["GT"] in ["./.", "."]:
cn = "."
ruc = "."
rucchange = "."
cnvtrlen = "."
cnformat = "."
else:
if formatDict["GT"] == "1/2":
svlen = [svlen] * 2
ruc = [float(x) for x in repcn]
cn = [x / refruc for x in ruc]
rucchange = [x - refruc for x in ruc]
cnvtrlen = [x * rul for x in rucchange]
else:
svlen = [svlen]
ruc = (
float(repcn[0])
if formatDict["GT"] == "1/0"
else float(repcn[-1])
)
cn = [ruc / refruc]
rucchange = [ruc - refruc]
cnvtrlen = [rucchange[0] * rul]
ruc = [ruc]
cnformat = f"{sum(cn):.6f}"
svlen = ",".join(svlen)
cn = ",".join([f"{x:.2f}" for x in cn])
ruc = ",".join([f"{x:.2f}" for x in ruc])
rucchange = ",".join([f"{x:.6f}" for x in rucchange])
cnvtrlen = ",".join([f"{x:.6f}" for x in cnvtrlen])

info = ["SVTYPE=CNV", "EVENTTYPE=STR"]

info.append(f"SVLEN={svlen}")
info.append(f"END={infoDict['END']}")
info.append(f"REFRUC={refruc:.2f}")
info.append(f"RUS={infoDict['RU'].upper()}")
# RUL Gives issues with Wittyer (RulMismatch) not sure why
# info.append(
# f"RUL={len(infoDict['RU'])}"
# )
info.append(f"CN={cn}")
info.append(f"RUC={ruc}")
info.append(f"RUCCHANGE={rucchange}")
info.append(f"CNVTRLEN={cnvtrlen}")

info = ";".join(info)

formatFields = ":".join(["GT", "CN"])

gt = formatDict["GT"]
gt = "0/1" if gt == "1/0" else gt

format = ":".join([gt, cnformat])

"""
HOMREF calls have to be "masked" when using Witty.er, otherwise a HOMREF call that is not
present in the truth will be counted as a query FP, decreasing precision for no reason.
It's sufficient to change the FILTER from PASS or . to something else (like HOMREF)
"""
contigsAndAlleles[6] = "HOMREF" if gt in ["0/0", "0"] else "PASS"

contigsAndAlleles = "\t".join(contigsAndAlleles)
return "\t".join([contigsAndAlleles, info, formatFields, format]) + "\n"


def convertVcf(inputFile: str, outputFile: str, reference: str) -> None:
inputHandle = open(inputFile, "r")
outputHandle = open(outputFile, "w")

referenceDict = SeqIO.to_dict(SeqIO.parse(reference, "fasta"))

for line in inputHandle:
if re.search("^#[^#]", line):
outputHandle.write(header)
outputHandle.write(line)
continue

if re.search("^##(INFO)|(FORMAT)|(ALT)", line):
continue

if re.search("^##", line):
outputHandle.write(line)
continue

outputHandle.write(translateLine(line, referenceDict))

inputHandle.close()
outputHandle.close()


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Convert the repeat genotyping module VCF output in a VCFv4.4 compliant format (similar to the VNTR module output)"
)
parser.add_argument(
"-i", "--input", help="input VCF", type=str, required=True
)
parser.add_argument(
"-r", "--reference", help="Genome FASTA", type=str, required=False
)
parser.add_argument(
"-o", "--output", help="output VCF", type=str, required=True
)
args = parser.parse_args()

convertVcf(args.input, args.output, args.reference)
22 changes: 12 additions & 10 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,11 @@ def print_error(error, context="Line", context_str=""):
def check_samplesheet(file_in, file_out):
"""
This function checks that the samplesheet follows the following structure:
test_vcf,caller
test1.vcf,manta
test2.vcf,svaba
test_vcf,caller,vartype
test1.vcf,manta,sv
test2.vcf,svaba,small
test3.vcf,cnvkit,cnv
For an example see:
https://github.com/ghga-de/nf-benchmark/assets/samplesheet.csv
"""
Expand All @@ -47,8 +49,8 @@ def check_samplesheet(file_in, file_out):
with open(file_in, "r", encoding='utf-8-sig') as fin:

## Check header
MIN_COLS = 2
HEADER = ["test_vcf","caller"]
MIN_COLS = 3
HEADER = ["test_vcf","caller","vartype"]
header = [x.strip('"') for x in fin.readline().strip().split(",")]
if header[: len(HEADER)] != HEADER:
print(
Expand Down Expand Up @@ -78,7 +80,7 @@ def check_samplesheet(file_in, file_out):
)

## Check caller name entries
test_vcf, caller = lspl[: len(HEADER)]
test_vcf, caller, vartype = lspl[: len(HEADER)]
if caller.find(" ") != -1:
print(
f"WARNING: Spaces have been replaced by underscores for caller: {caller}"
Expand All @@ -87,11 +89,11 @@ def check_samplesheet(file_in, file_out):
if not caller:
print_error("Caller entry has not been specified!", "Line", line)

sample_info = [] ## [test_vcf, caller ]
sample_info = [] ## [test_vcf, caller, vartype ]

sample_info = [test_vcf, caller]
sample_info = [test_vcf, caller, vartype]

## Create caller mapping dictionary = {caller: [[test_vcf, caller ]]}
## Create caller mapping dictionary = {caller: [[test_vcf, caller, vartype ]]}
if caller not in sample_mapping_dict:
sample_mapping_dict[caller] = [sample_info]
else:
Expand All @@ -106,7 +108,7 @@ def check_samplesheet(file_in, file_out):
make_dir(out_dir)
with open(file_out, "w") as fout:
fout.write(
",".join(["test_vcf","caller"])
",".join(["test_vcf","caller", "vartype"])
+ "\n"
)
for caller in sorted(sample_mapping_dict.keys()):
Expand Down
Loading

0 comments on commit c6ba2ad

Please sign in to comment.