From dc2234e80dbd24edaa0d5c51415c6d3cb65e697c Mon Sep 17 00:00:00 2001 From: Mark Walker Date: Fri, 23 Sep 2022 15:23:43 -0400 Subject: [PATCH] Address comments --- .../SVConcordance/SVConcordance.json.tmpl | 2 +- .../scripts/format_svtk_vcf_for_gatk.py | 151 +++++++++++------- wdl/SVConcordance.wdl | 2 +- 3 files changed, 95 insertions(+), 60 deletions(-) diff --git a/inputs/templates/test/SVConcordance/SVConcordance.json.tmpl b/inputs/templates/test/SVConcordance/SVConcordance.json.tmpl index f6a2b8edc..bbadc4dc4 100644 --- a/inputs/templates/test/SVConcordance/SVConcordance.json.tmpl +++ b/inputs/templates/test/SVConcordance/SVConcordance.json.tmpl @@ -16,4 +16,4 @@ "SVConcordance.contig_list": {{ reference_resources.primary_contigs_list | tojson }}, "SVConcordance.reference_dict": {{ reference_resources.reference_dict | tojson }} -} \ No newline at end of file +} diff --git a/src/sv-pipeline/scripts/format_svtk_vcf_for_gatk.py b/src/sv-pipeline/scripts/format_svtk_vcf_for_gatk.py index 87c7f28a2..704a024f8 100644 --- a/src/sv-pipeline/scripts/format_svtk_vcf_for_gatk.py +++ b/src/sv-pipeline/scripts/format_svtk_vcf_for_gatk.py @@ -1,15 +1,16 @@ #!/bin/env python import argparse +import contextlib import pysam import sys import gzip -from typing import List, Text, Set, Dict, Optional +from typing import Any, List, Text, Set, Dict, Optional _gt_sum_map = dict() -def __parse_bnd_ends(vcf_path: Text) -> Dict[Text, int]: +def _parse_bnd_ends(vcf_path: Text) -> Dict[Text, int]: """ Since pysam automatically changes invalid END fields (i.e. when less than the start position), they must be parsed manually. @@ -45,7 +46,7 @@ def __parse_bnd_ends(vcf_path: Text) -> Dict[Text, int]: return bnd_end_dict -def __parse_ploidy_table(path: Text) -> Dict[Text, Dict[Text, int]]: +def _parse_ploidy_table(path: Text) -> Dict[Text, Dict[Text, int]]: """ Parses tsv of sample ploidy values. @@ -155,8 +156,7 @@ def convert(record: pysam.VariantRecord, else: alleles = record.alleles contig = record.contig - new_record = vcf_out.new_record(contig=contig, start=record.start, stop=record.stop, alleles=alleles) - new_record.id = record.id + new_record = vcf_out.new_record(id=record.id, contig=contig, start=record.start, stop=record.stop, alleles=alleles) # copy INFO fields for key in record.info: if key not in default_remove_infos and key not in remove_infos: @@ -232,24 +232,37 @@ def convert(record: pysam.VariantRecord, def _cache_gt_sum(gt): - if gt in _gt_sum_map: - return _gt_sum_map[gt] - s = sum([1 for a in gt if a is not None and a > 0]) - _gt_sum_map[gt] = s + s = _gt_sum_map.get(gt, None) + if s is None: + s = sum([1 for a in gt if a is not None and a > 0]) + _gt_sum_map[gt] = s return s def add_cn_ecn(record: pysam.VariantRecord, vcf_out: pysam.VariantFile, ploidy_dict: Dict[Text, Dict[Text, int]]) -> pysam.VariantRecord: + """" + Only modifies records by adding CN and ECN INFO fields, e.g. for 'fixed' VCFs that just need + this metadata for certain GATK tools such as SVCluster and SVConcordance + + Parameters + ---------- + record: pysam.VariantRecord + input record + vcf_out: pysam.VariantFile + new vcf, to which the converted record will be written + ploidy_dict: Dict[Text, Dict[Text, int]] + map from sample to contig to ploidy + + Returns + ------- + header: pysam.VariantRecord + record with CN and ECN fields added""" svtype = record.info['SVTYPE'] contig = record.contig - new_record = vcf_out.new_record(contig=contig, start=record.start, stop=record.stop, alleles=record.alleles) - new_record.id = record.id - - # copy INFO fields - for key in record.info: - new_record.info[key] = record.info[key] + new_record = vcf_out.new_record(id=record.id, info=record.info, samples=record.samples, + contig=contig, start=record.start, stop=record.stop, alleles=record.alleles) # copy FORMAT fields for sample in record.samples: @@ -277,14 +290,62 @@ def filter_unsupported_type(record: pysam.VariantRecord) -> bool: return svtype == 'CPX' or svtype == 'CTX' -def __parse_arg_list(arg: Text) -> List[Text]: +def _parse_arg_list(arg: Text) -> List[Text]: if arg is None: return set() else: return arg.split(',') -def __parse_arguments(argv: List[Text]) -> argparse.Namespace: +def _process(vcf_in: pysam.VariantFile, + vcf_out: pysam.VariantFile, + arguments: Dict[Text, Any], + vcf_filter: Optional[pysam.VariantFile] = None) -> None: + """" + Master function for processing the given input vcf and writing output + + Parameters + ---------- + vcf_in: pysam.VariantFile + input vcf + vcf_out: pysam.VariantFile + output vcf + arguments: Dict[Text, Any] + commandline arguments + vcf_filter: Optional[pysam.VariantFile] + if provided, write filtered records to this vcf + + Returns + ------- + header: pysam.VariantRecord + record with CN and ECN fields added""" + remove_formats = set(_parse_arg_list(arguments.remove_formats)) + remove_infos = set(_parse_arg_list(arguments.remove_infos)) + if not arguments.only_add_cn_fields and not arguments.use_end2: + bnd_end_dict = _parse_bnd_ends(arguments.vcf) + else: + bnd_end_dict = None + ploidy_dict = _parse_ploidy_table(arguments.ploidy_table) + for record in vcf_in: + if arguments.filter_unsupported_types and filter_unsupported_type(record): + if vcf_filter is not None: + vcf_filter.write(record) + else: + if arguments.only_add_cn_fields: + out = add_cn_ecn(record=record, vcf_out=vcf_out, ploidy_dict=ploidy_dict) + else: + out = convert( + record=record, + vcf_out=vcf_out, + remove_infos=remove_infos, + remove_formats=remove_formats, + bnd_end_dict=bnd_end_dict, + ploidy_dict=ploidy_dict + ) + vcf_out.write(out) + + +def _parse_arguments(argv: List[Text]) -> argparse.Namespace: # noinspection PyTypeChecker parser = argparse.ArgumentParser( description="Convert a GATK-style SV VCF to SVTK-style", @@ -323,50 +384,24 @@ def __parse_arguments(argv: List[Text]) -> argparse.Namespace: def main(argv: Optional[List[Text]] = None): if argv is None: argv = sys.argv - arguments = __parse_arguments(argv) - remove_formats = set(__parse_arg_list(arguments.remove_formats)) - remove_infos = set(__parse_arg_list(arguments.remove_infos)) - if not arguments.only_add_cn_fields and not arguments.use_end2: - bnd_end_dict = __parse_bnd_ends(arguments.vcf) - else: - bnd_end_dict = None - ploidy_dict = __parse_ploidy_table(arguments.ploidy_table) + arguments = _parse_arguments(argv) + remove_formats = set(_parse_arg_list(arguments.remove_formats)) + remove_infos = set(_parse_arg_list(arguments.remove_infos)) # convert vcf header and records - vcf_in = pysam.VariantFile(arguments.vcf) - header = create_header( - header_in=vcf_in.header, - replace_ev_format=arguments.replace_ev_format, - remove_infos=remove_infos, - remove_formats=remove_formats - ) - vcf_out = pysam.VariantFile(arguments.out, mode='w', header=header) - if arguments.filter_out is not None: - vcf_filter = pysam.VariantFile(arguments.filter_out, mode='w', header=vcf_in.header) - else: - vcf_filter = None - - for record in vcf_in: - if arguments.filter_unsupported_types and filter_unsupported_type(record): + with pysam.VariantFile(arguments.vcf) as vcf_in: + header = create_header( + header_in=vcf_in.header, + replace_ev_format=arguments.replace_ev_format, + remove_infos=remove_infos, + remove_formats=remove_formats + ) + with pysam.VariantFile(arguments.out, mode='w', header=header) as vcf_out: + vcf_filter = pysam.VariantFile(arguments.filter_out, mode='w', header=vcf_in.header) if \ + arguments.filter_out is not None else None + _process(vcf_in, vcf_out, arguments, vcf_filter=vcf_filter) if vcf_filter is not None: - vcf_filter.write(record) - else: - if arguments.only_add_cn_fields: - out = add_cn_ecn(record=record, vcf_out=vcf_out, ploidy_dict=ploidy_dict) - else: - out = convert( - record=record, - vcf_out=vcf_out, - remove_infos=remove_infos, - remove_formats=remove_formats, - bnd_end_dict=bnd_end_dict, - ploidy_dict=ploidy_dict - ) - vcf_out.write(out) - vcf_in.close() - vcf_out.close() - if vcf_filter is not None: - vcf_filter.close() + vcf_filter.close() if __name__ == "__main__": diff --git a/wdl/SVConcordance.wdl b/wdl/SVConcordance.wdl index e43ec5ecd..005255fb9 100644 --- a/wdl/SVConcordance.wdl +++ b/wdl/SVConcordance.wdl @@ -277,4 +277,4 @@ task SVConcordance { preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) } -} \ No newline at end of file +}