Address comments
mwalker174 committed Sep 23, 2022
1 parent 65ee3a7 commit dc2234e
Showing 3 changed files with 95 additions and 60 deletions.
@@ -16,4 +16,4 @@

"SVConcordance.contig_list": {{ reference_resources.primary_contigs_list | tojson }},
"SVConcordance.reference_dict": {{ reference_resources.reference_dict | tojson }}
}
}
151 changes: 93 additions & 58 deletions src/sv-pipeline/scripts/format_svtk_vcf_for_gatk.py
@@ -1,15 +1,16 @@
#!/bin/env python

import argparse
import contextlib
import pysam
import sys
import gzip
from typing import List, Text, Set, Dict, Optional
from typing import Any, List, Text, Set, Dict, Optional

_gt_sum_map = dict()


def __parse_bnd_ends(vcf_path: Text) -> Dict[Text, int]:
def _parse_bnd_ends(vcf_path: Text) -> Dict[Text, int]:
"""
Since pysam automatically changes invalid END fields (i.e. when less than the start position), they must
be parsed manually.
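The function body itself is collapsed in this view. As a rough sketch only, not the committed implementation, the manual parsing the docstring describes could look like the following, assuming a plain or bgzipped VCF whose BND records carry an END= INFO entry; the helper name is hypothetical.

import gzip
from typing import Dict, Text

def parse_bnd_ends_sketch(vcf_path: Text) -> Dict[Text, int]:
    # Read the raw VCF text so pysam cannot rewrite END, and collect END
    # for BND records keyed by variant ID (hypothetical sketch).
    bnd_end_dict = dict()
    open_fn = gzip.open if vcf_path.endswith('.gz') else open
    with open_fn(vcf_path, 'rt') as f:
        for line in f:
            if line.startswith('#'):
                continue
            columns = line.split('\t', 8)
            vid, info = columns[2], columns[7]
            if 'SVTYPE=BND' not in info:
                continue
            end_values = [entry[len('END='):] for entry in info.split(';') if entry.startswith('END=')]
            if end_values:
                bnd_end_dict[vid] = int(end_values[0])
    return bnd_end_dict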
@@ -45,7 +46,7 @@ def __parse_bnd_ends(vcf_path: Text) -> Dict[Text, int]:
return bnd_end_dict


def __parse_ploidy_table(path: Text) -> Dict[Text, Dict[Text, int]]:
def _parse_ploidy_table(path: Text) -> Dict[Text, Dict[Text, int]]:
"""
Parses tsv of sample ploidy values.
@@ -155,8 +156,7 @@ def convert(record: pysam.VariantRecord,
else:
alleles = record.alleles
contig = record.contig
new_record = vcf_out.new_record(contig=contig, start=record.start, stop=record.stop, alleles=alleles)
new_record.id = record.id
new_record = vcf_out.new_record(id=record.id, contig=contig, start=record.start, stop=record.stop, alleles=alleles)
# copy INFO fields
for key in record.info:
if key not in default_remove_infos and key not in remove_infos:
@@ -232,24 +232,37 @@ def convert(record: pysam.VariantRecord,


def _cache_gt_sum(gt):
if gt in _gt_sum_map:
return _gt_sum_map[gt]
s = sum([1 for a in gt if a is not None and a > 0])
_gt_sum_map[gt] = s
s = _gt_sum_map.get(gt, None)
if s is None:
s = sum([1 for a in gt if a is not None and a > 0])
_gt_sum_map[gt] = s
return s
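A quick illustration of the value this helper memoizes, i.e. the number of non-missing, non-reference alleles in a genotype tuple, following the logic shown above:

# Illustrative only: genotype tuples as pysam exposes them via record.samples[s]['GT']
assert _cache_gt_sum((0, 0)) == 0          # hom-ref
assert _cache_gt_sum((0, 1)) == 1          # het
assert _cache_gt_sum((1, 1)) == 2          # hom-alt
assert _cache_gt_sum((None, None)) == 0    # no-call alleles are ignored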


def add_cn_ecn(record: pysam.VariantRecord,
vcf_out: pysam.VariantFile,
ploidy_dict: Dict[Text, Dict[Text, int]]) -> pysam.VariantRecord:
""""
Only modifies records by adding CN and ECN INFO fields, e.g. for 'fixed' VCFs that just need
this metadata for certain GATK tools such as SVCluster and SVConcordance
Parameters
----------
record: pysam.VariantRecord
input record
vcf_out: pysam.VariantFile
new vcf, to which the converted record will be written
ploidy_dict: Dict[Text, Dict[Text, int]]
map from sample to contig to ploidy
Returns
-------
new_record: pysam.VariantRecord
record with CN and ECN fields added"""
svtype = record.info['SVTYPE']
contig = record.contig
new_record = vcf_out.new_record(contig=contig, start=record.start, stop=record.stop, alleles=record.alleles)
new_record.id = record.id

# copy INFO fields
for key in record.info:
new_record.info[key] = record.info[key]
new_record = vcf_out.new_record(id=record.id, info=record.info, samples=record.samples,
contig=contig, start=record.start, stop=record.stop, alleles=record.alleles)

# copy FORMAT fields
for sample in record.samples:
@@ -277,14 +290,62 @@ def filter_unsupported_type(record: pysam.VariantRecord) -> bool:
return svtype == 'CPX' or svtype == 'CTX'


def __parse_arg_list(arg: Text) -> List[Text]:
def _parse_arg_list(arg: Text) -> List[Text]:
if arg is None:
return set()
else:
return arg.split(',')
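An informal illustration of the behavior as written; note that the None case returns an empty set rather than a list, which is harmless here since callers wrap the result in set():

# Illustrative only
assert _parse_arg_list(None) == set()
assert _parse_arg_list('AC,AF,AN') == ['AC', 'AF', 'AN']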


def __parse_arguments(argv: List[Text]) -> argparse.Namespace:
def _process(vcf_in: pysam.VariantFile,
vcf_out: pysam.VariantFile,
arguments: Dict[Text, Any],
vcf_filter: Optional[pysam.VariantFile] = None) -> None:
""""
Master function for processing the given input vcf and writing output
Parameters
----------
vcf_in: pysam.VariantFile
input vcf
vcf_out: pysam.VariantFile
output vcf
arguments: Dict[Text, Any]
commandline arguments
vcf_filter: Optional[pysam.VariantFile]
if provided, write filtered records to this vcf
Returns
-------
None"""
remove_formats = set(_parse_arg_list(arguments.remove_formats))
remove_infos = set(_parse_arg_list(arguments.remove_infos))
if not arguments.only_add_cn_fields and not arguments.use_end2:
bnd_end_dict = _parse_bnd_ends(arguments.vcf)
else:
bnd_end_dict = None
ploidy_dict = _parse_ploidy_table(arguments.ploidy_table)
for record in vcf_in:
if arguments.filter_unsupported_types and filter_unsupported_type(record):
if vcf_filter is not None:
vcf_filter.write(record)
else:
if arguments.only_add_cn_fields:
out = add_cn_ecn(record=record, vcf_out=vcf_out, ploidy_dict=ploidy_dict)
else:
out = convert(
record=record,
vcf_out=vcf_out,
remove_infos=remove_infos,
remove_formats=remove_formats,
bnd_end_dict=bnd_end_dict,
ploidy_dict=ploidy_dict
)
vcf_out.write(out)
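A minimal, hypothetical sketch of driving _process directly (e.g. from a test), assembled the same way main() below does; the attribute names mirror those referenced in this diff, but the paths, values, and the type of replace_ev_format are assumptions.

# Hypothetical driver sketch; file names and values are made up.
import argparse
import pysam

args = argparse.Namespace(
    vcf='in.vcf.gz', out='out.vcf.gz', ploidy_table='ploidy.tsv',
    remove_formats=None, remove_infos=None, replace_ev_format=False,
    only_add_cn_fields=False, use_end2=False,
    filter_unsupported_types=True, filter_out=None)

with pysam.VariantFile(args.vcf) as vcf_in:
    header = create_header(
        header_in=vcf_in.header,
        replace_ev_format=args.replace_ev_format,
        remove_infos=set(_parse_arg_list(args.remove_infos)),
        remove_formats=set(_parse_arg_list(args.remove_formats))
    )
    with pysam.VariantFile(args.out, mode='w', header=header) as vcf_out:
        _process(vcf_in, vcf_out, args)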


def _parse_arguments(argv: List[Text]) -> argparse.Namespace:
# noinspection PyTypeChecker
parser = argparse.ArgumentParser(
description="Convert a GATK-style SV VCF to SVTK-style",
@@ -323,50 +384,24 @@ def __parse_arguments(argv: List[Text]) -> argparse.Namespace:
def main(argv: Optional[List[Text]] = None):
if argv is None:
argv = sys.argv
arguments = __parse_arguments(argv)
remove_formats = set(__parse_arg_list(arguments.remove_formats))
remove_infos = set(__parse_arg_list(arguments.remove_infos))
if not arguments.only_add_cn_fields and not arguments.use_end2:
bnd_end_dict = __parse_bnd_ends(arguments.vcf)
else:
bnd_end_dict = None
ploidy_dict = __parse_ploidy_table(arguments.ploidy_table)
arguments = _parse_arguments(argv)
remove_formats = set(_parse_arg_list(arguments.remove_formats))
remove_infos = set(_parse_arg_list(arguments.remove_infos))

# convert vcf header and records
vcf_in = pysam.VariantFile(arguments.vcf)
header = create_header(
header_in=vcf_in.header,
replace_ev_format=arguments.replace_ev_format,
remove_infos=remove_infos,
remove_formats=remove_formats
)
vcf_out = pysam.VariantFile(arguments.out, mode='w', header=header)
if arguments.filter_out is not None:
vcf_filter = pysam.VariantFile(arguments.filter_out, mode='w', header=vcf_in.header)
else:
vcf_filter = None

for record in vcf_in:
if arguments.filter_unsupported_types and filter_unsupported_type(record):
with pysam.VariantFile(arguments.vcf) as vcf_in:
header = create_header(
header_in=vcf_in.header,
replace_ev_format=arguments.replace_ev_format,
remove_infos=remove_infos,
remove_formats=remove_formats
)
with pysam.VariantFile(arguments.out, mode='w', header=header) as vcf_out:
vcf_filter = pysam.VariantFile(arguments.filter_out, mode='w', header=vcf_in.header) if \
arguments.filter_out is not None else None
_process(vcf_in, vcf_out, arguments, vcf_filter=vcf_filter)
if vcf_filter is not None:
vcf_filter.write(record)
else:
if arguments.only_add_cn_fields:
out = add_cn_ecn(record=record, vcf_out=vcf_out, ploidy_dict=ploidy_dict)
else:
out = convert(
record=record,
vcf_out=vcf_out,
remove_infos=remove_infos,
remove_formats=remove_formats,
bnd_end_dict=bnd_end_dict,
ploidy_dict=ploidy_dict
)
vcf_out.write(out)
vcf_in.close()
vcf_out.close()
if vcf_filter is not None:
vcf_filter.close()
vcf_filter.close()


if __name__ == "__main__":
2 changes: 1 addition & 1 deletion wdl/SVConcordance.wdl
@@ -277,4 +277,4 @@ task SVConcordance {
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
}
}
}
