diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CombineBatches.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CombineBatches.json.tmpl index 954af662d..efcf50447 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CombineBatches.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CombineBatches.json.tmpl @@ -1,10 +1,15 @@ { "CombineBatches.contig_list": "${workspace.primary_contigs_fai}", - "CombineBatches.pe_exclude_list": "${workspace.pesr_exclude_list}", - "CombineBatches.depth_exclude_list": "${workspace.depth_exclude_list}", - "CombineBatches.empty_file" : "${workspace.empty_file}", + + "CombineBatches.clustering_config_part1" : "${workspace.clustering_config_part1}", + "CombineBatches.stratification_config_part1" : "${workspace.stratification_config_part1}", + "CombineBatches.clustering_config_part2" : "${workspace.clustering_config_part2}", + "CombineBatches.stratification_config_part2" : "${workspace.stratification_config_part2}", + "CombineBatches.track_bed_files": {{ reference_resources.clustering_tracks | tojson }}, + "CombineBatches.track_names": {{ reference_resources.clustering_track_names | tojson }}, "CombineBatches.min_sr_background_fail_batches": 0.5, + "CombineBatches.gatk_docker": "${workspace.gatk_docker}", "CombineBatches.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", "CombineBatches.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", @@ -13,6 +18,10 @@ "CombineBatches.pesr_vcfs": "${this.sample_sets.genotyped_pesr_vcf}", "CombineBatches.depth_vcfs": "${this.regenotyped_depth_vcfs}", "CombineBatches.raw_sr_bothside_pass_files": "${this.sample_sets.sr_bothside_pass}", - "CombineBatches.raw_sr_background_fail_files": "${this.sample_sets.sr_background_fail}" + "CombineBatches.raw_sr_background_fail_files": "${this.sample_sets.sr_background_fail}", + "CombineBatches.ped_file": "${workspace.cohort_ped_file}", + "CombineBatches.reference_dict" : "${workspace.reference_dict}", + "CombineBatches.reference_fasta" : "${workspace.reference_fasta}", + "CombineBatches.reference_fasta_fai" : "${workspace.reference_index}" } diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl deleted file mode 100644 index f3fbfb018..000000000 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl +++ /dev/null @@ -1,52 +0,0 @@ -{ - "MakeCohortVcf.bin_exclude": "${workspace.bin_exclude}", - "MakeCohortVcf.contig_list": "${workspace.primary_contigs_fai}", - "MakeCohortVcf.allosome_fai": "${workspace.allosome_file}", - "MakeCohortVcf.cytobands": "${workspace.cytobands}", - "MakeCohortVcf.mei_bed": "${workspace.mei_bed}", - "MakeCohortVcf.pe_exclude_list": "${workspace.pesr_exclude_list}", - "MakeCohortVcf.depth_exclude_list": "${workspace.depth_exclude_list}", - "MakeCohortVcf.empty_file" : "${workspace.empty_file}", - "MakeCohortVcf.ref_dict": "${workspace.reference_dict}", - "MakeCohortVcf.HERVK_reference": "${workspace.HERVK_reference}", - "MakeCohortVcf.LINE1_reference": "${workspace.LINE1_reference}", - - "MakeCohortVcf.site_level_comparison_datasets": [ - {{ reference_resources.ccdg_abel_site_level_benchmarking_dataset | tojson }}, - {{ reference_resources.gnomad_v2_collins_site_level_benchmarking_dataset | tojson }}, - {{ 
reference_resources.hgsv_byrska_bishop_site_level_benchmarking_dataset | tojson }}, - {{ reference_resources.thousand_genomes_site_level_benchmarking_dataset | tojson }} - ], - - "MakeCohortVcf.min_sr_background_fail_batches": 0.5, - "MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 200, - "MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, - "MakeCohortVcf.clean_vcf1b_records_per_shard": 10000, - "MakeCohortVcf.clean_vcf5_records_per_shard": 5000, - "MakeCohortVcf.samples_per_clean_vcf_step2_shard": 100, - "MakeCohortVcf.random_seed": 0, - "MakeCohortVcf.max_shard_size_resolve": 500, - - "MakeCohortVcf.linux_docker": "${workspace.linux_docker}", - "MakeCohortVcf.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", - "MakeCohortVcf.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", - "MakeCohortVcf.sv_pipeline_qc_docker": "${workspace.sv_pipeline_qc_docker}", - - "MakeCohortVcf.primary_contigs_list": "${workspace.primary_contigs_list}", - - "MakeCohortVcf.chr_x": "${workspace.chr_x}", - "MakeCohortVcf.chr_y": "${workspace.chr_y}", - - "MakeCohortVcf.cohort_name": "${this.sample_set_set_id}", - "MakeCohortVcf.batches": "${this.sample_sets.sample_set_id}", - "MakeCohortVcf.ped_file": "${workspace.cohort_ped_file}", - "MakeCohortVcf.disc_files": "${this.sample_sets.merged_PE}", - "MakeCohortVcf.bincov_files": "${this.sample_sets.merged_bincov}", - "MakeCohortVcf.median_coverage_files": "${this.sample_sets.median_cov}", - "MakeCohortVcf.rf_cutoff_files": "${this.sample_sets.cutoffs}", - "MakeCohortVcf.pesr_vcfs": "${this.sample_sets.genotyped_pesr_vcf}", - "MakeCohortVcf.depth_vcfs": "${this.regenotyped_depth_vcfs}", - "MakeCohortVcf.depth_gt_rd_sep_files": "${this.sample_sets.trained_genotype_depth_depth_sepcutoff}", - "MakeCohortVcf.raw_sr_bothside_pass_files": "${this.sample_sets.sr_bothside_pass}", - "MakeCohortVcf.raw_sr_background_fail_files": "${this.sample_sets.sr_background_fail}" -} diff --git a/inputs/templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl index 6ee7a177d..f2f5fb774 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl @@ -1,4 +1,6 @@ workspace:cloud_sdk_docker {{ dockers.cloud_sdk_docker }} +clustering_config_part1 {{ reference_resources.clustering_config_part1 }} +clustering_config_part2 {{ reference_resources.clustering_config_part2 }} cnmops_docker {{ dockers.cnmops_docker }} condense_counts_docker {{ dockers.condense_counts_docker }} gatk_docker {{ dockers.gatk_docker }} @@ -57,6 +59,8 @@ reference_version {{ reference_resources.reference_version }} rmsk {{ reference_resources.rmsk }} segdups {{ reference_resources.segdups }} seed_cutoffs {{ reference_resources.seed_cutoffs }} +stratification_config_part1 {{ reference_resources.stratification_config_part1 }} +stratification_config_part2 {{ reference_resources.stratification_config_part2 }} wgd_scoring_mask {{ reference_resources.wgd_scoring_mask }} wham_include_list_bed_file {{ reference_resources.wham_include_list_bed_file }} chr_x {{ reference_resources.chr_x }} diff --git a/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.json.tmpl b/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.json.tmpl index 88c735b2d..70bae9352 100644 --- a/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.json.tmpl +++ 
b/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.json.tmpl @@ -30,6 +30,7 @@ "GATKSVPipelineSingleSample.linux_docker" : "${workspace.linux_docker}", "GATKSVPipelineSingleSample.manta_docker": "${workspace.manta_docker}", + "GATKSVPipelineSingleSample.scramble_docker": "${workspace.scramble_docker}", "GATKSVPipelineSingleSample.sv_base_docker": "${workspace.sv_base_docker}", "GATKSVPipelineSingleSample.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", "GATKSVPipelineSingleSample.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", @@ -42,7 +43,6 @@ "GATKSVPipelineSingleSample.autosome_file" : "${workspace.reference_autosome_file}", "GATKSVPipelineSingleSample.primary_contigs_list" : "${workspace.reference_primary_contigs_list}", "GATKSVPipelineSingleSample.primary_contigs_fai" : "${workspace.reference_primary_contigs_fai}", - "GATKSVPipelineSingleSample.empty_file" : "${workspace.reference_empty_file}", "GATKSVPipelineSingleSample.genome_file" : "${workspace.reference_genome_file}", "GATKSVPipelineSingleSample.max_ref_panel_carrier_freq": 0.03, "GATKSVPipelineSingleSample.manta_region_bed" : "${workspace.reference_manta_region_bed}", @@ -86,6 +86,14 @@ "GATKSVPipelineSingleSample.depth_interval_overlap": "0.8", "GATKSVPipelineSingleSample.depth_clustering_algorithm": "SINGLE_LINKAGE", + "GATKSVPipelineSingleSample.clustering_config_part1" : "${workspace.clustering_config_part1}", + "GATKSVPipelineSingleSample.stratification_config_part1" : "${workspace.stratification_config_part1}", + "GATKSVPipelineSingleSample.clustering_config_part2" : "${workspace.clustering_config_part2}", + "GATKSVPipelineSingleSample.stratification_config_part2" : "${workspace.stratification_config_part2}", + + "GATKSVPipelineSingleSample.clustering_track_names" : {{ reference_resources.clustering_track_names | tojson }}, + "GATKSVPipelineSingleSample.clustering_track_bed_files" : {{ reference_resources.clustering_tracks | tojson }}, + "GATKSVPipelineSingleSample.ref_copy_number_autosomal_contigs" : 2, "GATKSVPipelineSingleSample.clean_vcf_min_sr_background_fail_batches": 0.5, "GATKSVPipelineSingleSample.max_shard_size_resolve" : 500, diff --git a/inputs/templates/terra_workspaces/single_sample/workspace.tsv.tmpl b/inputs/templates/terra_workspaces/single_sample/workspace.tsv.tmpl index 2bcef0ad2..51db39b6f 100644 --- a/inputs/templates/terra_workspaces/single_sample/workspace.tsv.tmpl +++ b/inputs/templates/terra_workspaces/single_sample/workspace.tsv.tmpl @@ -1,4 +1,6 @@ workspace:cloud_sdk_docker {{ dockers.cloud_sdk_docker }} +clustering_config_part1 {{ reference_resources.clustering_config_part1 }} +clustering_config_part2 {{ reference_resources.clustering_config_part2 }} cnmops_docker {{ dockers.cnmops_docker }} condense_counts_docker {{ dockers.condense_counts_docker }} gatk_docker {{ dockers.gatk_docker }} @@ -12,6 +14,7 @@ sv_base_mini_docker {{ dockers.sv_base_mini_docker }} sv_pipeline_docker {{ dockers.sv_pipeline_docker }} sv_pipeline_qc_docker {{ dockers.sv_pipeline_qc_docker }} wham_docker {{ dockers.wham_docker }} +scramble_docker {{ dockers.scramble_docker }} ref_panel_name {{ ref_panel.name }} ref_panel_bincov_matrix {{ ref_panel.merged_coverage_file }} ref_panel_contig_ploidy_model_tar {{ ref_panel.contig_ploidy_model_tar }} @@ -77,3 +80,5 @@ reference_segdups {{ reference_resources.segdups }} reference_seed_cutoffs {{ reference_resources.seed_cutoffs }} reference_wgd_scoring_mask {{ reference_resources.wgd_scoring_mask }} reference_wham_include_list_bed_file {{ 
reference_resources.wham_include_list_bed_file }} +stratification_config_part1 {{ reference_resources.stratification_config_part1 }} +stratification_config_part2 {{ reference_resources.stratification_config_part2 }} diff --git a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.FromSampleEvidence.json.tmpl b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.FromSampleEvidence.json.tmpl index 2bfbc4f1b..ecf7fc764 100644 --- a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.FromSampleEvidence.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.FromSampleEvidence.json.tmpl @@ -123,7 +123,6 @@ "GATKSVPipelineBatch.RegenotypeCNVs.n_per_split": "5000", "GATKSVPipelineBatch.MakeCohortVcf.bin_exclude": {{ reference_resources.bin_exclude | tojson }}, - "GATKSVPipelineBatch.MakeCohortVcf.empty_file" : {{ reference_resources.empty_file | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.site_level_comparison_datasets": [ {{ reference_resources.ccdg_abel_site_level_benchmarking_dataset | tojson }}, {{ reference_resources.gnomad_v2_collins_site_level_benchmarking_dataset | tojson }}, @@ -133,7 +132,6 @@ "GATKSVPipelineBatch.MakeCohortVcf.cytobands": {{ reference_resources.cytobands | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.mei_bed": {{ reference_resources.mei_bed | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.pe_exclude_list": {{ reference_resources.pesr_exclude_list | tojson }}, - "GATKSVPipelineBatch.MakeCohortVcf.depth_exclude_list": {{ reference_resources.depth_exclude_list | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.min_sr_background_fail_batches": 0.5, @@ -143,5 +141,11 @@ "GATKSVPipelineBatch.MakeCohortVcf.samples_per_clean_vcf_step2_shard": 100, "GATKSVPipelineBatch.MakeCohortVcf.clean_vcf5_records_per_shard": 5000, "GATKSVPipelineBatch.MakeCohortVcf.random_seed": 0, - "GATKSVPipelineBatch.MakeCohortVcf.max_shard_size_resolve": 500 + "GATKSVPipelineBatch.MakeCohortVcf.max_shard_size_resolve": 500, + "GATKSVPipelineBatch.MakeCohortVcf.clustering_config_part1" : {{ reference_resources.clustering_config_part1 | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.stratification_config_part1" : {{ reference_resources.stratification_config_part1 | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.clustering_config_part2" : {{ reference_resources.clustering_config_part2 | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.stratification_config_part2" : {{ reference_resources.stratification_config_part2 | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.track_bed_files": {{ reference_resources.clustering_tracks | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.track_names": {{ reference_resources.clustering_track_names | tojson }} } diff --git a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl index 9ba04d229..e0bf9102d 100644 --- a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl @@ -116,7 +116,6 @@ "GATKSVPipelineBatch.RegenotypeCNVs.n_per_split": "5000", "GATKSVPipelineBatch.MakeCohortVcf.bin_exclude": {{ reference_resources.bin_exclude | tojson }}, - "GATKSVPipelineBatch.MakeCohortVcf.empty_file" : {{ reference_resources.empty_file | tojson 
}}, "GATKSVPipelineBatch.MakeCohortVcf.site_level_comparison_datasets": [ {{ reference_resources.ccdg_abel_site_level_benchmarking_dataset | tojson }}, {{ reference_resources.gnomad_v2_collins_site_level_benchmarking_dataset | tojson }}, @@ -126,7 +125,6 @@ "GATKSVPipelineBatch.MakeCohortVcf.cytobands": {{ reference_resources.cytobands | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.mei_bed": {{ reference_resources.mei_bed | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.pe_exclude_list": {{ reference_resources.pesr_exclude_list | tojson }}, - "GATKSVPipelineBatch.MakeCohortVcf.depth_exclude_list": {{ reference_resources.depth_exclude_list | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.min_sr_background_fail_batches": 0.5, @@ -137,6 +135,12 @@ "GATKSVPipelineBatch.MakeCohortVcf.clean_vcf5_records_per_shard": 5000, "GATKSVPipelineBatch.MakeCohortVcf.random_seed": 0, "GATKSVPipelineBatch.MakeCohortVcf.max_shard_size_resolve": 500, + "GATKSVPipelineBatch.MakeCohortVcf.clustering_config_part1" : {{ reference_resources.clustering_config_part1 | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.stratification_config_part1" : {{ reference_resources.stratification_config_part1 | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.clustering_config_part2" : {{ reference_resources.clustering_config_part2 | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.stratification_config_part2" : {{ reference_resources.stratification_config_part2 | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.track_bed_files": {{ reference_resources.clustering_tracks | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.track_names": {{ reference_resources.clustering_track_names | tojson }}, "GATKSVPipelineBatch.GATKSVPipelinePhase1.runtime_attr_postprocess": { "cpu_cores": 1, diff --git a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl index 870b5b82f..b4b33bed3 100644 --- a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl @@ -31,7 +31,6 @@ "GATKSVPipelineSingleSample.linux_docker" : {{ dockers.linux_docker | tojson }}, "GATKSVPipelineSingleSample.manta_docker": {{ dockers.manta_docker | tojson }}, - "GATKSVPipelineSingleSample.melt_docker": {{ dockers.melt_docker | tojson }}, "GATKSVPipelineSingleSample.scramble_docker": {{ dockers.scramble_docker | tojson }}, "GATKSVPipelineSingleSample.sv_base_docker": {{ dockers.sv_base_docker | tojson }}, "GATKSVPipelineSingleSample.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }}, @@ -46,7 +45,6 @@ "GATKSVPipelineSingleSample.autosome_file" : {{ reference_resources.autosome_file | tojson }}, "GATKSVPipelineSingleSample.primary_contigs_list" : {{ reference_resources.primary_contigs_list | tojson }}, "GATKSVPipelineSingleSample.primary_contigs_fai" : {{ reference_resources.primary_contigs_fai | tojson }}, - "GATKSVPipelineSingleSample.empty_file" : {{ reference_resources.empty_file | tojson }}, "GATKSVPipelineSingleSample.genome_file" : {{ reference_resources.genome_file | tojson }}, "GATKSVPipelineSingleSample.max_ref_panel_carrier_freq": 0.03, "GATKSVPipelineSingleSample.manta_region_bed" : {{ 
reference_resources.manta_region_bed | tojson }}, @@ -90,6 +88,13 @@ "GATKSVPipelineSingleSample.depth_interval_overlap": "0.8", "GATKSVPipelineSingleSample.depth_clustering_algorithm": "SINGLE_LINKAGE", + "GATKSVPipelineSingleSample.clustering_config_part1" : {{ reference_resources.clustering_config_part1 | tojson }}, + "GATKSVPipelineSingleSample.stratification_config_part1" : {{ reference_resources.stratification_config_part1 | tojson }}, + "GATKSVPipelineSingleSample.clustering_config_part2" : {{ reference_resources.clustering_config_part2 | tojson }}, + "GATKSVPipelineSingleSample.stratification_config_part2" : {{ reference_resources.stratification_config_part2 | tojson }}, + "GATKSVPipelineSingleSample.clustering_track_bed_files": {{ reference_resources.clustering_tracks | tojson }}, + "GATKSVPipelineSingleSample.clustering_track_names": {{ reference_resources.clustering_track_names | tojson }}, + "GATKSVPipelineSingleSample.ref_copy_number_autosomal_contigs" : {{ reference_resources.copy_number_autosomal_contigs | tojson }}, "GATKSVPipelineSingleSample.clean_vcf_min_sr_background_fail_batches": 0.5, "GATKSVPipelineSingleSample.max_shard_size_resolve" : 500, diff --git a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.melt.json.tmpl b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.melt.json.tmpl index 8352403df..919d3b468 100644 --- a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.melt.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.melt.json.tmpl @@ -45,7 +45,6 @@ "GATKSVPipelineSingleSample.autosome_file" : {{ reference_resources.autosome_file | tojson }}, "GATKSVPipelineSingleSample.primary_contigs_list" : {{ reference_resources.primary_contigs_list | tojson }}, "GATKSVPipelineSingleSample.primary_contigs_fai" : {{ reference_resources.primary_contigs_fai | tojson }}, - "GATKSVPipelineSingleSample.empty_file" : {{ reference_resources.empty_file | tojson }}, "GATKSVPipelineSingleSample.genome_file" : {{ reference_resources.genome_file | tojson }}, "GATKSVPipelineSingleSample.max_ref_panel_carrier_freq": 0.03, "GATKSVPipelineSingleSample.manta_region_bed" : {{ reference_resources.manta_region_bed | tojson }}, @@ -89,6 +88,13 @@ "GATKSVPipelineSingleSample.depth_interval_overlap": "0.8", "GATKSVPipelineSingleSample.depth_clustering_algorithm": "SINGLE_LINKAGE", + "GATKSVPipelineSingleSample.clustering_config_part1" : {{ reference_resources.clustering_config_part1 | tojson }}, + "GATKSVPipelineSingleSample.stratification_config_part1" : {{ reference_resources.stratification_config_part1 | tojson }}, + "GATKSVPipelineSingleSample.clustering_config_part2" : {{ reference_resources.clustering_config_part2 | tojson }}, + "GATKSVPipelineSingleSample.stratification_config_part2" : {{ reference_resources.stratification_config_part2 | tojson }}, + "GATKSVPipelineSingleSample.clustering_track_bed_files": {{ reference_resources.clustering_tracks | tojson }}, + "GATKSVPipelineSingleSample.clustering_track_names": {{ reference_resources.clustering_track_names | tojson }}, + "GATKSVPipelineSingleSample.ref_copy_number_autosomal_contigs" : 2, "GATKSVPipelineSingleSample.MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, diff --git a/inputs/templates/test/MakeCohortVcf/CombineBatches.json.tmpl b/inputs/templates/test/MakeCohortVcf/CombineBatches.json.tmpl index 21508edce..c8ed66625 100644 --- 
a/inputs/templates/test/MakeCohortVcf/CombineBatches.json.tmpl +++ b/inputs/templates/test/MakeCohortVcf/CombineBatches.json.tmpl @@ -1,14 +1,24 @@ { "CombineBatches.contig_list": {{ reference_resources.primary_contigs_fai | tojson }}, - "CombineBatches.pe_exclude_list": {{ reference_resources.pesr_exclude_list | tojson }}, - "CombineBatches.depth_exclude_list": {{ reference_resources.depth_exclude_list | tojson }}, - "CombineBatches.empty_file" : {{ reference_resources.empty_file | tojson }}, + + "CombineBatches.clustering_config_part1" : {{ reference_resources.clustering_config_part1 | tojson }}, + "CombineBatches.stratification_config_part1" : {{ reference_resources.stratification_config_part1 | tojson }}, + "CombineBatches.clustering_config_part2" : {{ reference_resources.clustering_config_part2 | tojson }}, + "CombineBatches.stratification_config_part2" : {{ reference_resources.stratification_config_part2 | tojson }}, + "CombineBatches.track_bed_files": {{ reference_resources.clustering_tracks | tojson }}, + "CombineBatches.track_names": {{ reference_resources.clustering_track_names | tojson }}, + + "CombineBatches.reference_fasta": {{ reference_resources.reference_fasta | tojson }}, + "CombineBatches.reference_dict": {{ reference_resources.reference_dict | tojson }}, + "CombineBatches.reference_fasta_fai": {{ reference_resources.reference_index | tojson }}, "CombineBatches.min_sr_background_fail_batches": 0.5, + "CombineBatches.gatk_docker": {{ dockers.gatk_docker | tojson }}, "CombineBatches.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, "CombineBatches.sv_base_mini_docker":{{ dockers.sv_base_mini_docker | tojson }}, "CombineBatches.cohort_name": {{ test_batch.name | tojson }}, + "CombineBatches.ped_file": {{ test_batch.ped_file | tojson }}, "CombineBatches.batches": [ {{ test_batch.name | tojson }} ], diff --git a/inputs/templates/test/MakeCohortVcf/MakeCohortVcf.json.tmpl b/inputs/templates/test/MakeCohortVcf/MakeCohortVcf.json.tmpl deleted file mode 100644 index cad2a9e03..000000000 --- a/inputs/templates/test/MakeCohortVcf/MakeCohortVcf.json.tmpl +++ /dev/null @@ -1,72 +0,0 @@ -{ - "MakeCohortVcf.bin_exclude": {{ reference_resources.bin_exclude | tojson }}, - "MakeCohortVcf.contig_list": {{ reference_resources.primary_contigs_fai | tojson }}, - "MakeCohortVcf.allosome_fai": {{ reference_resources.allosome_file | tojson }}, - "MakeCohortVcf.cytobands": {{ reference_resources.cytobands | tojson }}, - "MakeCohortVcf.mei_bed": {{ reference_resources.mei_bed | tojson }}, - "MakeCohortVcf.pe_exclude_list": {{ reference_resources.pesr_exclude_list | tojson }}, - "MakeCohortVcf.depth_exclude_list": {{ reference_resources.depth_exclude_list | tojson }}, - "MakeCohortVcf.empty_file" : {{ reference_resources.empty_file | tojson }}, - "MakeCohortVcf.ref_dict": {{ reference_resources.reference_dict | tojson }}, - "MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, - "MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, - - "MakeCohortVcf.site_level_comparison_datasets": [ - {{ reference_resources.ccdg_abel_site_level_benchmarking_dataset | tojson }}, - {{ reference_resources.gnomad_v2_collins_site_level_benchmarking_dataset | tojson }}, - {{ reference_resources.hgsv_byrska_bishop_site_level_benchmarking_dataset | tojson }}, - {{ reference_resources.thousand_genomes_site_level_benchmarking_dataset | tojson }} - ], - - "MakeCohortVcf.chr_x": {{ reference_resources.chr_x | tojson }}, - "MakeCohortVcf.chr_y": {{ 
reference_resources.chr_y | tojson }}, - - "MakeCohortVcf.min_sr_background_fail_batches": 0.5, - "MakeCohortVcf.max_shard_size_resolve" : 500, - "MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 200, - "MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, - "MakeCohortVcf.clean_vcf1b_records_per_shard": 10000, - "MakeCohortVcf.samples_per_clean_vcf_step2_shard": 100, - "MakeCohortVcf.clean_vcf5_records_per_shard": 5000, - "MakeCohortVcf.random_seed": 0, - - "MakeCohortVcf.primary_contigs_list": {{ reference_resources.primary_contigs_list | tojson }}, - - "MakeCohortVcf.linux_docker": {{ dockers.linux_docker | tojson }}, - "MakeCohortVcf.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, - "MakeCohortVcf.sv_base_mini_docker":{{ dockers.sv_base_mini_docker | tojson }}, - "MakeCohortVcf.sv_pipeline_qc_docker": {{ dockers.sv_pipeline_qc_docker | tojson }}, - - "MakeCohortVcf.cohort_name": {{ test_batch.name | tojson }}, - "MakeCohortVcf.batches": [ - {{ test_batch.name | tojson }} - ], - "MakeCohortVcf.ped_file": {{ test_batch.ped_file | tojson }}, - "MakeCohortVcf.disc_files": [ - {{ test_batch.merged_disc_file | tojson }} - ], - "MakeCohortVcf.bincov_files": [ - {{ test_batch.merged_coverage_file | tojson }} - ], - "MakeCohortVcf.median_coverage_files": [ - {{ test_batch.medianfile | tojson }} - ], - "MakeCohortVcf.rf_cutoff_files": [ - {{ test_batch.cutoffs | tojson }} - ], - "MakeCohortVcf.pesr_vcfs": [ - {{ test_batch.genotyped_pesr_vcf| tojson }} - ], - "MakeCohortVcf.depth_vcfs": [ - {{ test_batch.regenotyped_depth_vcf | tojson }} - ], - "MakeCohortVcf.depth_gt_rd_sep_files": [ - {{ test_batch.depth_gt_rd_sep_file | tojson }} - ], - "MakeCohortVcf.raw_sr_bothside_pass_files": [ - {{ test_batch.raw_sr_bothside_pass_file | tojson }} - ], - "MakeCohortVcf.raw_sr_background_fail_files": [ - {{ test_batch.raw_sr_background_fail_file | tojson }} - ] -} diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index b68084b91..5bf9f5f1e 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -2,7 +2,7 @@ "name": "dockers", "cnmops_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/cnmops:2024-11-08-v1.0-62adb329", "condense_counts_docker": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432", - "gatk_docker": "us.gcr.io/broad-dsde-methods/eph/gatk:2024-07-02-4.6.0.0-1-g4af2b49e9-NIGHTLY-SNAPSHOT", + "gatk_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/gatk:2024-12-05-4.6.1.0-6-gfc248dfc1-NIGHTLY-SNAPSHOT", "gatk_docker_pesr_override": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432", "genomes_in_the_cloud_docker": "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135", "linux_docker": "marketplace.gcr.io/google/ubuntu1804", diff --git a/inputs/values/resources_hg38.json b/inputs/values/resources_hg38.json index 545d4b2b0..de2335e42 100644 --- a/inputs/values/resources_hg38.json +++ b/inputs/values/resources_hg38.json @@ -7,6 +7,14 @@ "autosome_file" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/autosome.fai", "bin_exclude" : "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/bin_exclude.hg38.gatkcov.bed.gz", "bin_exclude_index" : "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/bin_exclude.hg38.gatkcov.bed.gz.tbi", + "clustering_config_part1" : "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/clustering_config.part_one.tsv", + "clustering_config_part2" : 
"gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/clustering_config.part_two.tsv", + "clustering_tracks" : [ + "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/hg38.SimpRep.sorted.pad_100.merged.bed", + "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/hg38.SegDup.sorted.merged.bed", + "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/hg38.RM.sorted.merged.bed" + ], + "clustering_track_names" : ["SR", "SD", "RM"], "cnmops_exclude_list" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/GRCh38_Nmask.bed", "contig_ploidy_priors" : "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/hg38.contig_ploidy_priors_homo_sapiens.tsv", "copy_number_autosomal_contigs" : 2, @@ -53,6 +61,8 @@ "segdups_index" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/hg38.SD_gaps_Cen_Tel_Heter_Satellite_lumpy.blacklist.sorted.merged.bed.gz.tbi", "seed_cutoffs" : "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/seed_cutoff.txt", "single_sample_qc_definitions": "gs://gatk-sv-resources-public/hg38/v0/sv-resources/ref-panel/1KG/v2/single_sample.qc_definitions.tsv", + "stratification_config_part1": "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/stratify_config.part_one.tsv", + "stratification_config_part2": "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/stratify_config.part_two.tsv", "wgd_scoring_mask" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/wgd_scoring_mask.hg38.gnomad_v3.bed", "wham_include_list_bed_file" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/wham_whitelist.bed", diff --git a/scripts/test/terra_validation.py b/scripts/test/terra_validation.py index 5a4893bf7..aa9079d19 100644 --- a/scripts/test/terra_validation.py +++ b/scripts/test/terra_validation.py @@ -113,7 +113,7 @@ def main(): parser.add_argument("-j", "--womtool-jar", help="Path to womtool jar", required=True) parser.add_argument("-n", "--num-input-jsons", help="Number of Terra input JSONs expected", - required=False, default=26, type=int) + required=False, default=25, type=int) parser.add_argument("--log-level", help="Specify level of logging information, ie. 
info, warning, error (not case-sensitive)", required=False, default="INFO") diff --git a/src/sv-pipeline/04_variant_resolution/scripts/add_genotypes.py b/src/sv-pipeline/04_variant_resolution/scripts/add_genotypes.py index 4cceab746..15bb89d3d 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/add_genotypes.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/add_genotypes.py @@ -12,25 +12,6 @@ import pysam -def make_multiallelic_alts(record, max_CN, is_bca=False): - """ - Add alts for CN states up to half of max observed total CN - """ - - max_haplo_CN = int(np.ceil(max_CN / 2)) - - if is_bca: - alts = tuple(['<CN0>'] + - ['<CN{0}>'.format(i) for i in range(2, max_haplo_CN + 1)]) - else: - alts = tuple(['<CN0>'] + - ['<CN{0}>'.format(i) for i in range(2, max_haplo_CN + 1)]) - - stop = record.stop - record.alts = alts - record.stop = stop - - def make_evidence_int(ev): ev = ev.split(',') @@ -59,16 +40,14 @@ def add_genotypes(record, genotypes, varGQ): del record.format[fmt] max_GT = genotypes['GT'].max() - is_bca = record.info['SVTYPE'] not in 'DEL DUP'.split() - - if is_bca: - max_CN = max_GT - else: - max_CN = max_GT + 2 if max_GT > 2: - record.info['MULTIALLELIC'] = True - make_multiallelic_alts(record, max_CN, is_bca) + record.alts = ('<CNV>',) + if record.info['SVTYPE'] != 'DUP': + msg = 'Invalid SVTYPE {0} for multiallelic record {1}' + msg = msg.format(record.info['SVTYPE'], record.id) + raise Exception(msg) + record.info['SVTYPE'] = 'CNV' cols = 'name sample GT GQ RD_CN RD_GQ PE_GT PE_GQ SR_GT SR_GQ EV'.split() gt_matrix = genotypes.reset_index()[cols].to_numpy() @@ -85,27 +64,7 @@ def add_genotypes(record, genotypes, varGQ): raise Exception(msg) if max_GT > 2: - if record.info['SVTYPE'] == 'DEL': - msg = 'Invalid SVTYPE {0} for multiallelic genotype in record {1}' - msg = msg.format(record.info['SVTYPE'], record.id) - raise Exception(msg) - - if is_bca: - idx1 = int(np.floor(data[2] / 2)) - idx2 = int(np.ceil(data[2] / 2)) - else: - # split copy state roughly evenly between haplotypes - idx1 = int(np.floor((data[2] + 2) / 2)) - idx2 = int(np.ceil((data[2] + 2) / 2)) - - # if copy state is 1, assign reference genotype - if idx1 == 1: - idx1 = 0 - if idx2 == 1: - idx2 = 0 - - record.samples[sample]['GT'] = (idx1, idx2) - + record.samples[sample]['GT'] = (None, None) elif data[2] == 0: record.samples[sample]['GT'] = (0, 0) elif data[2] == 1: diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py index b7da153cb..ed341bed2 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py @@ -34,7 +34,7 @@ def __init__(self): self.sample_list = [] def _update_rd_cn(self, variant, sample_indices): - self.rd_cn[variant.id] = {s: variant.samples[s][VariantFormatTypes.RD_CN] for s in sample_indices} + self.rd_cn[variant.id] = {s: variant.samples[s].get(VariantFormatTypes.RD_CN) for s in sample_indices} @staticmethod def get_wider(f): diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py index e63b890cd..0da863af8 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py @@ -43,13 +43,13 @@ def modify_variants(dict_file_gz, vcf, multi_cnvs): for sample_id in 
geno_normal_revise_dict[variant.id]: o = variant.samples[sample_id] o.update({"GT": (0, 1)}) - o.update({"GQ": o["RD_GQ"]}) + o.update({"GQ": o.get("RD_GQ")}) if variant.stop - variant.start >= 1000: if variant.info[SVTYPE] in [SVType.DEL, SVType.DUP]: is_del = variant.info[SVTYPE] == SVType.DEL for k, v in variant.samples.items(): - rd_cn = v[VariantFormatTypes.RD_CN] + rd_cn = v.get(VariantFormatTypes.RD_CN) if rd_cn is None: continue if (is_del and rd_cn > 3) or \ diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py index a5bf42982..f38506177 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py @@ -93,19 +93,19 @@ def main(): record = revised_lines_by_id[record.id] if record.info.get('SVTYPE', None) == 'DEL': if abs(record.stop - record.pos) >= 1000: - sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples} + sample_cn_map = {s: record.samples[s].get('RD_CN') for s in non_outlier_samples} if len([s for s in sample_cn_map if (sample_cn_map[s] is not None and sample_cn_map[s] > 3)]) > vf_1: multi_del = True - gts = [record.samples[s]['GT'] for s in non_outlier_samples] + gts = [record.samples[s].get('GT') for s in non_outlier_samples] if any(gt not in biallelic_gts for gt in gts): gt5kb_del = True - if abs(record.stop - record.pos) >= 5000: + if abs(record.stop - record.pos) >= 5000 or '<CN0>' in record.alts: if not multi_del: gt5kb_del = True if record.info.get('SVTYPE', None) == 'DUP': if abs(record.stop - record.pos) >= 1000: - sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples} + sample_cn_map = {s: record.samples[s].get('RD_CN') for s in non_outlier_samples} if sum(1 for s in sample_cn_map if sample_cn_map[s] is not None and sample_cn_map[s] > 4) > vf_1: multi_dup = True if sum(1 for x in Counter(sample_cn_map.values()) if x is not None and (x < 1 or x > 4)) > 4: @@ -113,10 +113,10 @@ if sum(1 for s in sample_cn_map if sample_cn_map[s] is not None and (sample_cn_map[s] < 1 or sample_cn_map[s] > 4) and gt4_copystate) > vf_1: multi_dup = True - gts = [record.samples[s]['GT'] for s in non_outlier_samples] + gts = [record.samples[s].get('GT') for s in non_outlier_samples] if any(gt not in biallelic_gts for gt in gts): gt5kb_dup = True - if abs(record.stop - record.pos) >= 5000: + if abs(record.stop - record.pos) >= 5000 or '<CN0>' in record.alts: if not multi_dup: gt5kb_dup = True @@ -125,27 +125,30 @@ # Leave no-calls if sample_obj['GT'] == (None, None): continue - if not sample_obj['GQ'] is None and \ - (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] >= 2): + if not sample_obj.get('GQ') is None and \ + (sample_obj.get('RD_CN') is not None and sample_obj.get('RD_CN') >= 2): sample_obj['GT'] = (0, 0) - elif not sample_obj['GQ'] is None and \ - (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] == 1): + elif not sample_obj.get('GQ') is None and \ + (sample_obj.get('RD_CN') is not None and sample_obj.get('RD_CN') == 1): sample_obj['GT'] = (0, 1) - elif not sample_obj['GQ'] is None: + elif not sample_obj.get('GQ') is None: sample_obj['GT'] = (1, 1) # RD_CN 0 DEL if gt5kb_dup: + # Convert to bi-allelic + if '<CN0>' in record.alts: + record.alts = ('<DUP>',) for sample_obj in record.samples.itervalues(): # Leave no-calls - also causes bug that skips multiallelic genotypes for a 
biallelic variant if sample_obj['GT'] == (None, None): continue - if not sample_obj['GQ'] is None and \ - (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] <= 2): + if not sample_obj.get('GQ') is None and \ + (sample_obj.get('RD_CN') is not None and sample_obj.get('RD_CN') <= 2): sample_obj['GT'] = (0, 0) - elif not sample_obj['GQ'] is None and \ - (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] == 3): + elif not sample_obj.get('GQ') is None and \ + (sample_obj.get('RD_CN') is not None and sample_obj.get('RD_CN') == 3): sample_obj['GT'] = (0, 1) - elif not sample_obj['GQ'] is None: + elif not sample_obj.get('GQ') is None: sample_obj['GT'] = (1, 1) # RD_CN > 3 DUP if record.id in multi_geno_ids: @@ -153,23 +156,23 @@ if multi_del or multi_dup: record.filter.add('MULTIALLELIC') - for j, sample in enumerate(record.samples): + for sample in record.samples: record.samples[sample]['GT'] = None record.samples[sample]['GQ'] = None - record.samples[sample]['CN'] = record.samples[sample]['RD_CN'] - record.samples[sample]['CNQ'] = record.samples[sample]['RD_GQ'] + record.samples[sample]['CN'] = record.samples[sample].get('RD_CN') + record.samples[sample]['CNQ'] = record.samples[sample].get('RD_GQ') if len(record.filter) > 1 and 'PASS' in record.filter: del record.filter['PASS'] - if 'MULTIALLELIC' in record.filter and ('<DUP>' in record.alts or '<DEL>' in record.alts): + if 'MULTIALLELIC' in record.filter and ('<DUP>' in record.alts or '<DEL>' in record.alts or '<CN0>' in record.alts): record.alts = ('<CNV>',) record.info['SVTYPE'] = 'CNV' if record.id in sexchr_revise: for sample in record.samples: if sample in male_samples: - cn = record.samples[sample]['RD_CN'] + cn = record.samples[sample].get('RD_CN') if cn is not None and int(cn) > 0: cn = int(cn) record.samples[sample]['RD_CN'] = cn - 1 diff --git a/src/sv-pipeline/04_variant_resolution/scripts/overlap_breakpoint_filter.py b/src/sv-pipeline/04_variant_resolution/scripts/overlap_breakpoint_filter.py index bba7b6103..3a650b4ad 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/overlap_breakpoint_filter.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/overlap_breakpoint_filter.py @@ -53,7 +53,7 @@ def __init__(self, record): else: raise ValueError("Uninterpretable evidence: {}".format(ev)) if record.id in bothside_pass: - self.both_end_support = bothside_pass[record.id] + self.both_end_support = 1 else: self.both_end_support = 0 self.sr_fail = record.id in background_fail @@ -112,7 +112,7 @@ def __str__(self): # Read bothside-pass/background-fail records sys.stderr.write("Reading SR files...\n") with open(BOTHSIDE_PASS_PATH) as f: - bothside_pass = {line.strip().split('\t')[-1]: float(line.strip().split('\t')[0]) for line in f} + bothside_pass = set([line.strip().split('\t')[-1] for line in f]) with open(BACKGROUND_FAIL_PATH) as f: background_fail = set([line.strip().split('\t')[-1] for line in f]) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/reformat_genotyped_vcf.py b/src/sv-pipeline/04_variant_resolution/scripts/reformat_genotyped_vcf.py new file mode 100644 index 000000000..fe7034ae3 --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/reformat_genotyped_vcf.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# + +""" +Replaces EV field numeric codes with strings and fixes alleles for multi-allelic CNVs. Optionally drops records with +invalid coordinates. 
+""" + +import argparse +import logging +import pysam +import sys + +EVIDENCE_LIST = [ + '', # 0 + 'RD', # 1 + 'PE', # 2 + 'RD,PE', # 3 + 'SR', # 4 + 'RD,SR', # 5 + 'PE,SR', # 6 + 'RD,PE,SR' # 7 +] + +EV_FORMAT_HEADER = '##FORMAT=' + + +def get_new_header(header): + header_list = str(header).split('\n') + new_header_lines = list() + for line in header_list: + if line.startswith('##FORMAT=',) + record.info['SVTYPE'] = 'CNV' + # Remove MULTIALLELIC field (legacy) + record.info.pop('MULTIALLELIC') + # Since pysam messes with some of the formatting (i.e. END limitation) we parse the string and replace + max_ev = len(EVIDENCE_LIST) - 1 + record_tokens = str(record).strip().split('\t') + if drop_invalid_coords: + if not valid_record_coordinates(record_tokens=record_tokens, ref_contig_length_dict=ref_contig_length_dict): + logging.warning(f"Dropping record {record_tokens[2]}") + return None + format_keys = record_tokens[8].split(':') + gt_index = format_keys.index('GT') + ev_index = format_keys.index('EV') + new_record_tokens = record_tokens[:9] + for format in record_tokens[9:]: + format_tokens = format.split(':') + # Reset multiallelic CNV genotypes to ./. + if record.info['SVTYPE'] == 'CNV': + gt_tokens = format_tokens[gt_index].split('/') + format_tokens[gt_index] = "/".join(["." for _ in gt_tokens]) + # Convert evidence as string list representation + ev = format.split(':')[ev_index] + if ev == '.' or not ev: + new_ev = '.' + else: + ev_int = int(ev) + if ev_int < 0 or ev_int > max_ev: + raise ValueError(f"Invalid EV value {ev_int} in record {record.id}") + new_ev = EVIDENCE_LIST[ev_int] + format_tokens[ev_index] = new_ev + new_record_tokens.append(':'.join(format_tokens)) + return '\t'.join(new_record_tokens) + + +def parse_reference_fai(path): + if not path: + raise ValueError("Reference index required if using --drop-invalid-coords") + with open(path) as f: + result = dict() + for line in f: + tokens = line.strip().split('\t') + chrom = tokens[0] + length = int(tokens[1]) + result[chrom] = length + return result + + +def main(): + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("--vcf", type=str, required=True, + help="Path to input VCF from GATK-SV GenotypeBatch module") + parser.add_argument("--drop-invalid-coords", action='store_true', + help="Drop records with invalid coordinates, i.e. 
POS/END/END2 greater than chromosome length") + parser.add_argument("--reference-fai", type=str, required=False, + help="Reference fasta index (.fai), required only if using --drop-invalid-coords") + args = parser.parse_args() + logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) + + if args.drop_invalid_coords: + ref_contig_length_dict = parse_reference_fai(args.reference_fai) + else: + ref_contig_length_dict = None + + with pysam.VariantFile(args.vcf) as vcf: + new_header = get_new_header(vcf.header) + sys.stdout.write(str(new_header)) + for record in vcf: + new_record = process_record(record=record, + drop_invalid_coords=args.drop_invalid_coords, + ref_contig_length_dict=ref_contig_length_dict) + if new_record is not None: + sys.stdout.write(new_record + "\n") + + +if __name__ == '__main__': + main() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/rename.py b/src/sv-pipeline/04_variant_resolution/scripts/rename.py index e9906ae3b..b890e0199 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/rename.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/rename.py @@ -30,21 +30,7 @@ def rename(vcf, fout, chrom=None, prefix='SV_pipeline'): record.id = fmt.format(**locals()) # Clean metadata - end = record.stop record.ref = 'N' - # if not record.alts[0].startswith('<'): - # record.alts = ('<{0}>'.format(record.info['SVTYPE']), ) - record.stop = end - - # for key in record.format.keys(): - # if key != 'GT': - # for sample in record.samples.keys(): - # record.samples[sample].pop(key) - # del record.format[key] - - for info in 'CIPOS CIEND RMSSTD MULTIALLELIC_TYPE'.split(): - if info in record.info.keys(): - record.info.pop(info) # Clean metadata for CNVs sucked into complex resolution # and not appropriately cleaned diff --git a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java index 5637154bb..7ba3b1713 100644 --- a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java +++ b/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java @@ -116,10 +116,10 @@ public static void main( final String[] args ) { } } - // move the SVTYPE to the ALT field (except for MEs) + // move the SVTYPE to the ALT field (except for MEs and multi-allelic DUPs) final InfoField info = record.getInfo(); final ByteSequence svType = info.get(SVTYPE_KEY); - if ( !record.getAlt().contains(ME_VALUE) ) { + if ( !record.getAlt().contains(ME_VALUE) && !DUP_VALUE.equals(svType) ) { if ( svType != null ) { record.setAlt(new ByteSequence(LT_VALUE, svType, GT_VALUE)); } diff --git a/src/sv-pipeline/scripts/format_gatk_vcf_for_svtk.py b/src/sv-pipeline/scripts/format_gatk_vcf_for_svtk.py index f096c0267..43755c74b 100644 --- a/src/sv-pipeline/scripts/format_gatk_vcf_for_svtk.py +++ b/src/sv-pipeline/scripts/format_gatk_vcf_for_svtk.py @@ -5,16 +5,36 @@ import sys from typing import Optional, List, Text, Set -_gt_sum_map = dict() +_gt_map = dict() +_cnv_gt_map = dict() -def _cache_gt_sum(gt): +def _cache_gt(gt): if gt is None: - return 0 - s = _gt_sum_map.get(gt, None) + return 0, 0 + s = _gt_map.get(gt, None) if s is None: - s = sum([1 for a in gt if a is not None and a > 0]) - _gt_sum_map[gt] = s + x = sum([1 for a in gt if a is not None and a > 0]) + if x == 0: + s = (0, 0) + elif x == 1: + s = (0, 1) + else: + s = (1, 1) + _gt_map[gt] = s + return s + + +def _cache_cnv_gt(cn): + if cn is None: + return 0, 0 + s = _cnv_gt_map.get(cn, None) + if s is None: + # Split 
copies evenly between alleles, giving one more copy to the second allele if odd + x = max(0, cn - 2) + alt1 = min(x // 2, 4) + alt2 = min(x - alt1, 4) + s = _cnv_gt_map[cn] = (alt1, alt2) return s @@ -71,7 +91,8 @@ def create_header(header_in: pysam.VariantHeader, def convert(record: pysam.VariantRecord, vcf_out: pysam.VariantFile, remove_infos: Set[Text], - remove_formats: Set[Text]) -> pysam.VariantRecord: + remove_formats: Set[Text], + set_pass: bool) -> pysam.VariantRecord: """ Converts a record from gatk to svtk style. This includes updating all GT fields to diploid and reverting END tag values. @@ -86,6 +107,8 @@ info fields to remove remove_formats: Set[Text] format fields to remove + set_pass: bool + set empty FILTER statuses to PASS Returns ------- @@ -100,15 +123,26 @@ if svtype == 'BND': # Ensure we aren't using breakend notation here, since it isn't supported in some modules alleles = ('N', '<BND>') + elif svtype == 'CNV': + # Prior to CleanVcf, all CNVs have <CN0> alleles + alleles = ('N', '<CN0>', '<CN2>', '<CN3>', '<CN4>') else: alleles = ('N', alleles[1]) contig = record.contig - new_record = vcf_out.new_record(contig=contig, start=record.start, stop=record.stop, alleles=alleles) + # Change filter to PASS if requested + if set_pass and len(record.filter) == 0: + filter = ('PASS',) + else: + filter = record.filter + new_record = vcf_out.new_record(contig=contig, start=record.start, stop=record.stop, alleles=alleles, filter=filter) new_record.id = record.id # copy INFO fields for key in record.info: if key not in remove_infos: new_record.info[key] = record.info[key] + if svtype == 'CNV': + # Prior to CleanVcf, all mCNVs are DUP type + new_record.info['SVTYPE'] = 'DUP' # svtk generally expects all records to have CHR2 assigned chr2 = record.info.get('CHR2', None) if chr2 is None: @@ -117,6 +151,10 @@ if svtype == 'INS': new_record.info['SVLEN'] = record.info.get('SVLEN', -1) new_record.info['STRANDS'] = '+-' + # END information is lost when setting POS=END, so we need to set it to SR2POS if it's available + # Note that the END position is only important following SR breakpoint refinement in FilterBatch + if record.info.get('SR2POS') is not None: + new_record.stop = record.info['SR2POS'] elif svtype == 'BND' or svtype == 'CTX': new_record.stop = record.info['END2'] new_record.info['SVLEN'] = -1 @@ -124,7 +162,7 @@ new_record.info['SVLEN'] = record.info.get('SVLEN', -1) elif svtype == 'DEL': new_record.info['STRANDS'] = '+-' - elif svtype == 'DUP': + elif svtype == 'DUP' or svtype == 'CNV': new_record.info['STRANDS'] = '-+' elif svtype == 'INV': new_record.info['STRANDS'] = record.info.get('STRANDS', None) @@ -137,10 +175,10 @@ if key not in remove_formats: new_genotype[key] = genotype[key] # fix GT, always assuming diploid - if _cache_gt_sum(genotype.get('GT', None)) > 0: - new_genotype['GT'] = (0, 1) + if svtype == 'CNV': + new_genotype['GT'] = _cache_cnv_gt(genotype.get('CN', genotype.get('RD_CN'))) else: - new_genotype['GT'] = (0, 0) + new_genotype['GT'] = _cache_gt(genotype.get('GT', None)) return new_record @@ -159,7 +197,7 @@ def __parse_arg_list(arg: Text) -> List[Text]: def __parse_arguments(argv: List[Text]) -> argparse.Namespace: # noinspection PyTypeChecker parser = argparse.ArgumentParser( - description="Convert a GATK-style SV VCF from ClusterBatch for consumption by GenerateBatchMetrics.", + 
description="Convert a GATK-style SV VCF for consumption by svtk. Not to be used after CleanVcf.", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument("--vcf", type=str, required=True, @@ -174,6 +212,8 @@ def __parse_arguments(argv: List[Text]) -> argparse.Namespace: help="Comma-delimited list of FORMAT fields to remove") parser.add_argument("--remove-infos", type=str, help="Comma-delimited list of INFO fields to remove") + parser.add_argument("--set-pass", default=False, action='store_true', + help="Set empty FILTER fields (\".\") to PASS") if len(argv) <= 1: parser.parse_args(["--help"]) sys.exit(0) @@ -207,7 +247,8 @@ def main(argv: Optional[List[Text]] = None): record=record, vcf_out=vcf_out, remove_infos=remove_infos, - remove_formats=remove_formats + remove_formats=remove_formats, + set_pass=arguments.set_pass )) diff --git a/src/sv-pipeline/scripts/format_svtk_vcf_for_gatk.py b/src/sv-pipeline/scripts/format_svtk_vcf_for_gatk.py index 9b5ee31b3..91896b675 100644 --- a/src/sv-pipeline/scripts/format_svtk_vcf_for_gatk.py +++ b/src/sv-pipeline/scripts/format_svtk_vcf_for_gatk.py @@ -5,7 +5,7 @@ import sys import gzip from math import floor -from typing import Any, List, Text, Dict, Optional +from typing import Any, List, Text, Dict, Optional, Set GQ_FIELDS = ["GQ", "PE_GQ", "SR_GQ", "RD_GQ"] @@ -70,7 +70,8 @@ def _parse_ploidy_table(path: Text) -> Dict[Text, Dict[Text, int]]: return ploidy_dict -def update_header(header: pysam.VariantHeader) -> None: +def update_header(header: pysam.VariantHeader, + add_sr_pos: bool) -> None: """ Ingests the given header, removes specified fields, and adds necessary fields. @@ -78,11 +79,18 @@ def update_header(header: pysam.VariantHeader) -> None: ---------- header: pysam.VariantHeader input header + add_sr_pos: bool + add SR1POS and SR2POS INFO fields """ header.add_line('##FORMAT=') # Add these just in case (no effect if they exist) header.add_line('##INFO=') header.add_line('##INFO=') + header.add_line('##INFO=') + header.add_line('##INFO=') + if add_sr_pos: + header.add_line('##INFO=') + header.add_line('##INFO=') def rescale_gq(record): @@ -95,7 +103,10 @@ def rescale_gq(record): def convert(record: pysam.VariantRecord, bnd_end_dict: Optional[Dict[Text, int]], ploidy_dict: Dict[Text, Dict[Text, int]], - scale_down_gq: bool) -> pysam.VariantRecord: + scale_down_gq: bool, + bothside_pass_vid_set: Set[Text], + background_fail_vid_set: Set[Text], + add_sr_pos: bool) -> pysam.VariantRecord: """ Converts a record from svtk to gatk style. This includes updating END/END2 and adding necessary fields such as ECN. 
@@ -110,6 +121,12 @@ def convert(record: pysam.VariantRecord, map from sample to contig to ploidy scale_down_gq: bool scale GQs to 0-99 range + bothside_pass_vid_set: Set[Text] + set of variant IDs with bothside SR pass flag + background_fail_vid_set: Set[Text] + set of variant IDs with background SR fail flag + add_sr_pos: bool + add SR1POS and SR2POS INFO fields Returns ------- @@ -130,6 +147,13 @@ def is_null(val): if svtype == 'BND' or svtype == 'CTX': record.info['END2'] = bnd_end_dict[record.id] if bnd_end_dict is not None \ else record.info.get('END2', record.stop) + # Add SR1POS/SR2POS, CPX not supported + if add_sr_pos and svtype != 'CPX': + record.info['SR1POS'] = record.pos + if svtype == 'BND' or svtype == 'CTX': + record.info['SR2POS'] = record.info['END2'] + else: + record.info['SR2POS'] = record.stop # Fix this weird edge case (may be from CPX review workflow) if svtype == 'INV' and '<CPX>' in record.alleles[1]: svtype = 'CPX' @@ -148,6 +172,11 @@ val = record.info[k] if is_null(val) or (isinstance(val, tuple) and len(val) == 1 and is_null(val[0])): del record.info[k] + # Add SR flag to INFO field + if record.id in bothside_pass_vid_set: + record.info['BOTHSIDES_SUPPORT'] = True + if record.id in background_fail_vid_set: + record.info['HIGH_SR_BACKGROUND'] = True # copy FORMAT fields for sample, genotype in record.samples.items(): genotype['ECN'] = ploidy_dict[sample][contig] @@ -156,6 +185,13 @@ return record +def parse_last_column(path): + if path is None: + return set() + with open(path) as f: + return set([line.strip().split('\t')[-1] for line in f]) + + def _process(vcf_in: pysam.VariantFile, vcf_out: pysam.VariantFile, arguments: Dict[Text, Any]) -> None: @@ -181,9 +217,15 @@ bnd_end_dict = None ploidy_dict = _parse_ploidy_table(arguments.ploidy_table) + bothside_pass_vid_set = parse_last_column(arguments.bothside_pass_list) + background_fail_vid_set = parse_last_column(arguments.background_fail_list) + for record in vcf_in: out = convert(record=record, bnd_end_dict=bnd_end_dict, - ploidy_dict=ploidy_dict, scale_down_gq=arguments.scale_down_gq) + ploidy_dict=ploidy_dict, scale_down_gq=arguments.scale_down_gq, + bothside_pass_vid_set=bothside_pass_vid_set, + background_fail_vid_set=background_fail_vid_set, + add_sr_pos=arguments.add_sr_pos) vcf_out.write(out) @@ -206,6 +248,12 @@ help="Fix END tags and assign END2 to END") parser.add_argument("--scale-down-gq", action='store_true', help="Scales all GQs down from [0-999] to [0-99]") + parser.add_argument("--bothside-pass-list", type=str, + help="Path to bothside SR pass flag variant list") + parser.add_argument("--background-fail-list", type=str, + help="Path to background SR fail flag variant list") + parser.add_argument("--add-sr-pos", action='store_true', + help="Add SR1POS and SR2POS INFO fields") if len(argv) <= 1: parser.parse_args(["--help"]) sys.exit(0) @@ -221,7 +269,8 @@ def main(argv: Optional[List[Text]] = None): # convert vcf header and records with pysam.VariantFile(arguments.vcf) as vcf_in: update_header( - header=vcf_in.header + header=vcf_in.header, + add_sr_pos=arguments.add_sr_pos ) with pysam.VariantFile(arguments.out, mode='w', header=vcf_in.header) as vcf_out: _process(vcf_in, vcf_out, arguments) diff --git a/src/sv-pipeline/scripts/single_sample/convert_cnvs_without_depth_support_to_bnds.py 
b/src/sv-pipeline/scripts/single_sample/convert_cnvs_without_depth_support_to_bnds.py index 290d2f786..f99daa6ec 100755 --- a/src/sv-pipeline/scripts/single_sample/convert_cnvs_without_depth_support_to_bnds.py +++ b/src/sv-pipeline/scripts/single_sample/convert_cnvs_without_depth_support_to_bnds.py @@ -14,28 +14,28 @@ def has_depth_support_autosome(record, sample): - return record.samples[sample]['RD_CN'] is not None and \ - ((record.info['SVTYPE'] == 'DUP' and record.samples[sample]['RD_CN'] > 2) or - (record.info['SVTYPE'] == 'DEL' and record.samples[sample]['RD_CN'] < 2)) + return record.samples[sample].get('RD_CN') is not None and \ + ((record.info['SVTYPE'] == 'DUP' and record.samples[sample].get('RD_CN') > 2) or + (record.info['SVTYPE'] == 'DEL' and record.samples[sample].get('RD_CN') < 2)) def has_sr_or_pe_support(record, sample): - return (record.samples[sample]['PE_GT'] is not None and record.samples[sample]['PE_GT'] > 0) \ - or (record.samples[sample]['SR_GT'] is not None and record.samples[sample]['SR_GT'] > 0) + return (record.samples[sample].get('PE_GT') is not None and record.samples[sample].get('PE_GT') > 0) \ + or (record.samples[sample].get('SR_GT') is not None and record.samples[sample].get('SR_GT') > 0) def has_depth_support_allosome(record, sample, samples_with_same_sex): - if record.samples[sample]['RD_CN'] is None: + if record.samples[sample].get('RD_CN') is None: return False - cns = [record.samples[s]['RD_CN'] for s in samples_with_same_sex] + cns = [record.samples[s].get('RD_CN') for s in samples_with_same_sex] cns = [cn for cn in cns if cn is not None] if len(cns) == 0: return False cns.sort() median_cn = cns[int((len(cns) + 1) / 2)] - return record.samples[sample]['RD_CN'] is not None and \ - ((record.info['SVTYPE'] == 'DUP' and record.samples[sample]['RD_CN'] > median_cn) or - (record.info['SVTYPE'] == 'DEL' and record.samples[sample]['RD_CN'] < median_cn)) + return record.samples[sample].get('RD_CN') is not None and \ + ((record.info['SVTYPE'] == 'DUP' and record.samples[sample].get('RD_CN') > median_cn) or + (record.info['SVTYPE'] == 'DEL' and record.samples[sample].get('RD_CN') < median_cn)) def read_contigs_list(contigs_list): @@ -92,7 +92,7 @@ def main(): svtype = record.info['SVTYPE'] if (svtype == 'DEL' or svtype == 'DUP') and record.info['SVLEN'] >= min_size: pesr_support = has_sr_or_pe_support(record, case_sample) - if record.samples[case_sample]['RD_CN'] is None: + if record.samples[case_sample].get('RD_CN') is None: if not pesr_support: sys.stderr.write("Record {} has a missing depth genotype and no PE/SR support; dropping\n".format(record.id)) continue diff --git a/src/svtk/svtk/utils/genotype_merging.py b/src/svtk/svtk/utils/genotype_merging.py index 470cf6acc..b34c80ee9 100644 --- a/src/svtk/svtk/utils/genotype_merging.py +++ b/src/svtk/svtk/utils/genotype_merging.py @@ -50,43 +50,17 @@ def check_multiallelic(records): records : list of pysam.VariantRecord """ for record in records: - if record.alts[0] == '<CN0>': + if record.alts[0] in ['<CN0>', '<CNV>']: return True - # for sample in record.samples: - # GT = record.samples[sample]['GT'] - # if GT[0] > 1 or GT[1] > 1: - # return True - return False def make_multiallelic_alts(records): """ - Make list of alts corresponding to record with highest observed copy number - - Parameters - ---------- - records : list of pysam.VariantRecord - - Returns - ------- - alts : tuple of str + Sets simple symbolic alt for multi-allelic records """ - - max_CN = 2 - - is_bca = records[0].info['SVTYPE'] not in 'DEL DUP'.split() - 
diff --git a/src/svtk/svtk/utils/utils.py b/src/svtk/svtk/utils/utils.py
index 8ff807513..7921b1a9c 100644
--- a/src/svtk/svtk/utils/utils.py
+++ b/src/svtk/svtk/utils/utils.py
@@ -116,8 +116,10 @@ def get_called_samples(record, include_null=False):
             samples.append(sample)

     if record.info.get('SVTYPE', None) == 'CNV':
-        for sample in record.samples.keys():
-            if record.samples[sample]['CN'] != 2:
+        for sample, gt in record.samples.items():
+            # Get CN, or RD_CN if it doesn't exist
+            cn = gt.get('CN', gt.get('RD_CN', None))
+            if cn is not None and cn != 2:
                 samples.append(sample)

     return sorted(samples)
diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl
index 5800be909..900e1c0b6 100644
--- a/wdl/CleanVcfChromosome.wdl
+++ b/wdl/CleanVcfChromosome.wdl
@@ -33,6 +33,7 @@ workflow CleanVcfChromosome {
     String chr_y

     File? svtk_to_gatk_script # For debugging
+    File? make_clean_gq_script
     Boolean use_hail
     String?
gcs_project @@ -241,6 +242,7 @@ workflow CleanVcfChromosome { prefix="~{prefix}.clean_vcf_5", records_per_shard=clean_vcf5_records_per_shard, threads_per_task=clean_vcf5_threads_per_task, + make_clean_gq_script=make_clean_gq_script, sv_pipeline_docker=sv_pipeline_docker, sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override_scatter=runtime_override_clean_vcf_5_scatter, @@ -584,18 +586,18 @@ task CleanVcf4 { for sid in record.samples: s = record.samples[sid] # Pick best GT - if s['PE_GT'] is None: + if s.get('PE_GT') is None: continue - elif s['SR_GT'] is None: - gt = s['PE_GT'] - elif s['PE_GT'] > 0 and s['SR_GT'] == 0: - gt = s['PE_GT'] - elif s['PE_GT'] == 0: - gt = s['SR_GT'] - elif s['PE_GQ'] >= s['SR_GQ']: - gt = s['PE_GT'] + elif s.get('SR_GT') is None: + gt = s.get('PE_GT') + elif s.get('PE_GT') > 0 and s.get('SR_GT') == 0: + gt = s.get('PE_GT') + elif s.get('PE_GT') == 0: + gt = s.get('SR_GT') + elif s.get('PE_GQ') >= s.get('SR_GQ'): + gt = s.get('PE_GT') else: - gt = s['SR_GT'] + gt = s.get('SR_GT') if gt > 2: num_gt_over_2 += 1 if num_gt_over_2 > max_vf: diff --git a/wdl/ClusterSingleChromosome.wdl b/wdl/ClusterSingleChromosome.wdl deleted file mode 100644 index 9e67f9d9c..000000000 --- a/wdl/ClusterSingleChromosome.wdl +++ /dev/null @@ -1,110 +0,0 @@ -version 1.0 - -# Author: Ryan Collins - -import "TasksMakeCohortVcf.wdl" as MiniTasks -import "ShardedCluster.wdl" as ShardedCluster -import "HailMerge.wdl" as HailMerge - -# Workflow to perform sharding & clustering of a vcf for a single chromosome -workflow ClusterSingleChrom { - input { - File vcf - File vcf_index - Int num_samples - String contig - String cohort_name - String evidence_type - String prefix - Int dist - Float frac - Float sample_overlap - File? exclude_list - Int sv_size - Array[String] sv_types - File empty_file - - Boolean use_hail - String? gcs_project - - String sv_pipeline_docker - String sv_base_mini_docker - - # overrides for MiniTasks - RuntimeAttr? runtime_override_subset_sv_type - RuntimeAttr? runtime_override_cat_vid_lists_chrom - - # overrides for ShardedCluster - RuntimeAttr? runtime_override_shard_clusters - RuntimeAttr? runtime_override_shard_vids - RuntimeAttr? runtime_override_pull_vcf_shard - RuntimeAttr? runtime_override_svtk_vcf_cluster - RuntimeAttr? runtime_override_get_vcf_header_with_members_info_line - RuntimeAttr? runtime_override_concat_svtypes - RuntimeAttr? runtime_override_concat_sharded_cluster - RuntimeAttr? runtime_override_cat_vid_lists_sharded - RuntimeAttr? runtime_override_make_sites_only - RuntimeAttr? runtime_override_sort_merged_vcf - - RuntimeAttr? runtime_override_preconcat_sharded_cluster - RuntimeAttr? runtime_override_hail_merge_sharded_cluster - RuntimeAttr? 
runtime_override_fix_header_sharded_cluster - } - - #Scatter over svtypes - scatter ( sv_type in sv_types ) { - #Subset vcf to only contain records for that svtype - - call MiniTasks.FilterVcf as SubsetSvType { - input: - vcf=vcf, - vcf_index=vcf_index, - records_filter='INFO/SVTYPE="~{sv_type}"', - outfile_prefix="~{prefix}.~{sv_type}", - use_ssd=true, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_subset_sv_type - } - - #For each svtype, intelligently shard VCF for clustering - call ShardedCluster.ShardedCluster { - input: - vcf=SubsetSvType.filtered_vcf, - num_samples=num_samples, - dist=dist, - frac=frac, - prefix="~{prefix}.~{sv_type}", - cohort_name=cohort_name, - contig=contig, - evidence_type=evidence_type, - sv_type=sv_type, - sample_overlap=sample_overlap, - exclude_list=exclude_list, - sv_size=sv_size, - sv_types=sv_types, - empty_file=empty_file, - use_hail=use_hail, - gcs_project=gcs_project, - sv_pipeline_docker=sv_pipeline_docker, - sv_base_mini_docker=sv_base_mini_docker, - runtime_override_shard_clusters=runtime_override_shard_clusters, - runtime_override_shard_vids=runtime_override_shard_vids, - runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, - runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, - runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, - runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, - runtime_override_cat_vid_lists_sharded=runtime_override_cat_vid_lists_sharded, - runtime_override_make_sites_only=runtime_override_make_sites_only, - runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf, - runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, - runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, - runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster - } - } - - #Output clustered vcf - output { - Array[File] clustered_vcfs = ShardedCluster.clustered_vcf - Array[File] clustered_vcf_indexes = ShardedCluster.clustered_vcf_idx - } -} diff --git a/wdl/CombineBatches.wdl b/wdl/CombineBatches.wdl index 624e3ed54..825a59137 100644 --- a/wdl/CombineBatches.wdl +++ b/wdl/CombineBatches.wdl @@ -1,106 +1,69 @@ version 1.0 -import "CombineSRBothsidePass.wdl" as CombineSRBothsidePass -import "VcfClusterSingleChromsome.wdl" as VcfClusterContig +import "CombineSRBothsidePass.wdl" as CombineSRBothsidePassWorkflow +import "FormatVcfForGatk.wdl" as GatkFormatting +import "TasksClusterBatch.wdl" as ClusterTasks +import "TasksGenotypeBatch.wdl" as GenotypeTasks import "TasksMakeCohortVcf.wdl" as MiniTasks -import "HailMerge.wdl" as HailMerge -import "HarmonizeHeaders.wdl" as HarmonizeHeaders -import "MergePesrDepth.wdl" as MergePesrDepth -import "Utils.wdl" as Utils workflow CombineBatches { input { String cohort_name Array[String] batches + File ped_file Boolean merge_vcfs = false Array[File] pesr_vcfs Array[File] depth_vcfs + # Set to true if using vcfs generated with a prior version, i.e. not ending in "_reformatted.vcf.gz" + Boolean legacy_vcfs = false Array[File] raw_sr_bothside_pass_files Array[File] raw_sr_background_fail_files File contig_list - Int localize_shard_size = 100000 - File pe_exclude_list - File depth_exclude_list Float min_sr_background_fail_batches - File empty_file - - Boolean use_hail = false - String? 
gcs_project - + # Reclustering parameters + File clustering_config_part1 + File stratification_config_part1 + File clustering_config_part2 + File stratification_config_part2 + # These arrays give the names and intervals for reference contexts for stratification (same lengths) + # Names must correspond to those in the stratification config files + Array[String] track_names + Array[File] track_bed_files + + File reference_fasta + File reference_fasta_fai + File reference_dict + String? chr_x + String? chr_y + + Float? java_mem_fraction + + String gatk_docker String sv_base_mini_docker String sv_pipeline_docker - # overrides for local tasks - RuntimeAttr? runtime_override_update_sr_list - RuntimeAttr? runtime_override_merge_pesr_depth - RuntimeAttr? runtime_override_reheader - RuntimeAttr? runtime_override_pull_header - - # overrides for mini tasks + RuntimeAttr? runtime_attr_create_ploidy + RuntimeAttr? runtime_attr_reformat_1 + RuntimeAttr? runtime_attr_reformat_2 + RuntimeAttr? runtime_attr_join_vcfs + RuntimeAttr? runtime_attr_cluster_sites + RuntimeAttr? runtime_attr_recluster_part1 + RuntimeAttr? runtime_attr_recluster_part2 RuntimeAttr? runtime_attr_get_non_ref_vids RuntimeAttr? runtime_attr_calculate_support_frac RuntimeAttr? runtime_override_clean_background_fail + RuntimeAttr? runtime_attr_gatk_to_svtk_vcf + RuntimeAttr? runtime_attr_extract_vids RuntimeAttr? runtime_override_concat - RuntimeAttr? runtime_override_concat_pesr_depth - RuntimeAttr? runtime_override_update_fix_pesr_header - RuntimeAttr? runtime_override_count_samples - RuntimeAttr? runtime_override_preconcat_pesr_depth - RuntimeAttr? runtime_override_hail_merge_pesr_depth - RuntimeAttr? runtime_override_fix_header_pesr_depth - RuntimeAttr? runtime_override_concat_large_pesr_depth - - # overrides for VcfClusterContig - RuntimeAttr? runtime_override_localize_vcfs - RuntimeAttr? runtime_override_join_vcfs - RuntimeAttr? runtime_override_fix_multiallelic - RuntimeAttr? runtime_override_fix_ev_tags - RuntimeAttr? runtime_override_subset_bothside_pass - RuntimeAttr? runtime_override_subset_background_fail - RuntimeAttr? runtime_override_subset_sv_type - RuntimeAttr? runtime_override_shard_clusters - RuntimeAttr? runtime_override_shard_vids - RuntimeAttr? runtime_override_pull_vcf_shard - RuntimeAttr? runtime_override_svtk_vcf_cluster - RuntimeAttr? runtime_override_get_vcf_header_with_members_info_line - RuntimeAttr? runtime_override_concat_vcf_cluster - RuntimeAttr? runtime_override_concat_svtypes - RuntimeAttr? runtime_override_concat_sharded_cluster - RuntimeAttr? runtime_override_make_sites_only - RuntimeAttr? runtime_override_sort_merged_vcf_cluster - RuntimeAttr? runtime_override_preconcat_sharded_cluster - RuntimeAttr? runtime_override_hail_merge_sharded_cluster - RuntimeAttr? runtime_override_fix_header_sharded_cluster - - # overerides for merge pesr depth - RuntimeAttr? runtime_override_shard_clusters_mpd - RuntimeAttr? runtime_override_shard_vids_mpd - RuntimeAttr? runtime_override_pull_vcf_shard_mpd - RuntimeAttr? runtime_override_merge_pesr_depth_mpd - - RuntimeAttr? runtime_override_sort_merged_vcf_mpd - RuntimeAttr? runtime_override_subset_small_mpd - RuntimeAttr? runtime_override_subset_large_mpd - RuntimeAttr? runtime_override_make_sites_only_mpd - RuntimeAttr? runtime_override_concat_large_pesr_depth_mpd - RuntimeAttr? runtime_override_concat_shards_mpd - - RuntimeAttr? runtime_override_preconcat_large_pesr_depth_mpd - RuntimeAttr? runtime_override_hail_merge_large_pesr_depth_mpd - RuntimeAttr? 
runtime_override_fix_header_large_pesr_depth_mpd - - RuntimeAttr? runtime_override_preconcat_pesr_depth_shards_mpd - RuntimeAttr? runtime_override_hail_merge_pesr_depth_shards_mpd - RuntimeAttr? runtime_override_fix_header_pesr_depth_shards_mpd - } # Preprocess some inputs - call CombineSRBothsidePass.CombineSRBothsidePass { + call CombineSRBothsidePassWorkflow.CombineSRBothsidePass { input: pesr_vcfs=pesr_vcfs, raw_sr_bothside_pass_files=raw_sr_bothside_pass_files, @@ -112,7 +75,7 @@ workflow CombineBatches { } Float min_background_fail_first_col = min_sr_background_fail_batches * length(raw_sr_background_fail_files) - call MiniTasks.CatUncompressedFiles as CleanBackgroundFail { + call MiniTasks.CatUncompressedFiles as CombineBackgroundFail { input: shards=raw_sr_background_fail_files, filter_command="sort | uniq -c | awk -v OFS='\\t' '{if($1 >= ~{min_background_fail_first_col}) print $2}'", @@ -121,270 +84,169 @@ workflow CombineBatches { runtime_attr_override=runtime_override_clean_background_fail } - call Utils.CountSamples { + call ClusterTasks.CreatePloidyTableFromPed { input: - vcf=depth_vcfs[0], - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_count_samples + ped_file=ped_file, + contig_list=contig_list, + retain_female_chr_y=true, + chr_x=chr_x, + chr_y=chr_y, + output_prefix="~{cohort_name}.ploidy", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_create_ploidy } - #Scatter per chromosome - Array[String] contigs = transpose(read_tsv(contig_list))[0] - scatter ( contig in contigs ) { - - #Subset PESR VCFs to single chromosome & cluster - #Note: also subsets bothside_pass and background_fail files to variants - #present on chromosome of interest - call VcfClusterContig.VcfClusterSingleChrom as ClusterPesr { + Array[File] all_vcfs = flatten([pesr_vcfs, depth_vcfs]) + scatter (vcf in all_vcfs) { + if (legacy_vcfs) { + call GenotypeTasks.ReformatGenotypedVcf { + input: + vcf = vcf, + output_prefix = basename(vcf, ".vcf.gz") + ".reformat_legacy", + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_reformat_1 + } + } + File reformatted_vcf = select_first([ReformatGenotypedVcf.out, vcf]) + call GatkFormatting.FormatVcf { input: - vcfs=pesr_vcfs, - num_samples=CountSamples.num_samples, - batches=batches, - prefix="~{cohort_name}.~{contig}.pesr", - dist=300, - frac=0.1, - sample_overlap=0.5, - exclude_list=pe_exclude_list, - sv_size=50, - sv_types=["DEL","DUP","INV","BND","INS"], - contig=contig, - evidence_type="pesr", - cohort_name=cohort_name, - localize_shard_size=localize_shard_size, - subset_sr_lists=true, - bothside_pass=CombineSRBothsidePass.out, - background_fail=CleanBackgroundFail.outfile, - empty_file=empty_file, - use_hail=use_hail, - gcs_project=gcs_project, + vcf=reformatted_vcf, + ploidy_table=CreatePloidyTableFromPed.out, + args="--fix-end --add-sr-pos", + output_prefix=basename(vcf, ".vcf.gz") + ".reformat_gatk", + bothside_pass_list=CombineSRBothsidePass.out, + background_fail_list=CombineBackgroundFail.outfile, sv_pipeline_docker=sv_pipeline_docker, - sv_base_mini_docker=sv_base_mini_docker, - runtime_override_localize_vcfs = runtime_override_localize_vcfs, - runtime_override_join_vcfs = runtime_override_join_vcfs, - runtime_override_fix_multiallelic = runtime_override_fix_multiallelic, - runtime_override_fix_ev_tags = runtime_override_fix_ev_tags, - runtime_override_subset_bothside_pass=runtime_override_subset_bothside_pass, - 
runtime_override_subset_background_fail=runtime_override_subset_background_fail, - runtime_override_subset_sv_type=runtime_override_subset_sv_type, - runtime_override_shard_clusters=runtime_override_shard_clusters, - runtime_override_shard_vids=runtime_override_shard_vids, - runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, - runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, - runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, - runtime_override_concat_vcf_cluster=runtime_override_concat_vcf_cluster, - runtime_override_concat_svtypes=runtime_override_concat_svtypes, - runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, - runtime_override_make_sites_only=runtime_override_make_sites_only, - runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf_cluster, - runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, - runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, - runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster + runtime_attr_override=runtime_attr_reformat_2 } + } + + #Scatter per chromosome + Array[String] contigs = transpose(read_tsv(contig_list))[0] + scatter ( contig in contigs ) { - #Subset RD VCFs to single chromosome & cluster - call VcfClusterContig.VcfClusterSingleChrom as ClusterDepth { + # Naively join across batches + call ClusterTasks.SVCluster as JoinVcfs { input: - vcfs=depth_vcfs, - num_samples=CountSamples.num_samples, - batches=batches, - prefix="~{cohort_name}.~{contig}.depth", - dist=500000, - frac=0.5, - sample_overlap=0.5, - exclude_list=depth_exclude_list, - sv_size=5000, - sv_types=["DEL","DUP"], + vcfs=FormatVcf.out, + ploidy_table=CreatePloidyTableFromPed.out, + output_prefix="~{cohort_name}.combine_batches.~{contig}.join_vcfs", contig=contig, - evidence_type="depth", - cohort_name=cohort_name, - localize_shard_size=localize_shard_size, - subset_sr_lists=false, - bothside_pass=CombineSRBothsidePass.out, - background_fail=CleanBackgroundFail.outfile, - empty_file=empty_file, - use_hail=use_hail, - gcs_project=gcs_project, - sv_pipeline_docker=sv_pipeline_docker, - sv_base_mini_docker=sv_base_mini_docker, - runtime_override_localize_vcfs = runtime_override_localize_vcfs, - runtime_override_join_vcfs = runtime_override_join_vcfs, - runtime_override_fix_multiallelic = runtime_override_fix_multiallelic, - runtime_override_fix_ev_tags = runtime_override_fix_ev_tags, - runtime_override_subset_bothside_pass=runtime_override_subset_bothside_pass, - runtime_override_subset_background_fail=runtime_override_subset_background_fail, - runtime_override_subset_sv_type=runtime_override_subset_sv_type, - runtime_override_shard_clusters=runtime_override_shard_clusters, - runtime_override_shard_vids=runtime_override_shard_vids, - runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, - runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, - runtime_override_concat_vcf_cluster=runtime_override_concat_vcf_cluster, - runtime_override_concat_svtypes=runtime_override_concat_svtypes, - runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, - runtime_override_make_sites_only=runtime_override_make_sites_only, - runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf_cluster, - 
runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, - runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, - runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster + fast_mode=false, + pesr_sample_overlap=0, + pesr_interval_overlap=1, + pesr_breakend_window=0, + depth_sample_overlap=0, + depth_interval_overlap=1, + depth_breakend_window=0, + mixed_sample_overlap=0, + mixed_interval_overlap=1, + mixed_breakend_window=0, + reference_fasta=reference_fasta, + reference_fasta_fai=reference_fasta_fai, + reference_dict=reference_dict, + java_mem_fraction=java_mem_fraction, + gatk_docker=gatk_docker, + runtime_attr_override=runtime_attr_join_vcfs } - call MiniTasks.ConcatVcfs as ConcatPesrSitesOnly { + # First round of clustering + call ClusterTasks.SVCluster as ClusterSites { input: - vcfs=ClusterPesr.clustered_vcfs, - vcfs_idx=ClusterPesr.clustered_vcf_indexes, - naive=true, - generate_index=false, - sites_only=true, - outfile_prefix="~{cohort_name}.clustered_pesr.sites_only", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_concat + vcfs=[JoinVcfs.out], + ploidy_table=CreatePloidyTableFromPed.out, + output_prefix="~{cohort_name}.combine_batches.~{contig}.cluster_sites", + fast_mode=false, + breakpoint_summary_strategy="REPRESENTATIVE", + pesr_sample_overlap=0.5, + pesr_interval_overlap=0.1, + pesr_breakend_window=300, + depth_sample_overlap=0.5, + depth_interval_overlap=0.5, + depth_breakend_window=500000, + mixed_sample_overlap=0.5, + mixed_interval_overlap=0.5, + mixed_breakend_window=1000000, + reference_fasta=reference_fasta, + reference_fasta_fai=reference_fasta_fai, + reference_dict=reference_dict, + java_mem_fraction=java_mem_fraction, + variant_prefix="~{cohort_name}_~{contig}_", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_attr_cluster_sites } - #Update SR background fail & bothside pass files (1) - call MiniTasks.UpdateSrList as UpdateBackgroundFailFirst { - input: - vcf=ConcatPesrSitesOnly.concat_vcf, - original_list=ClusterPesr.filtered_background_fail, - outfile="~{cohort_name}.~{contig}.sr_background_fail.updated.txt", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_update_sr_list - } - call MiniTasks.UpdateSrList as UpdateBothsidePassFirst { + # Second round of clustering + call GroupedSVClusterTask as GroupedSVClusterPart1 { input: - vcf=ConcatPesrSitesOnly.concat_vcf, - original_list=ClusterPesr.filtered_bothside_pass, - outfile="~{cohort_name}.~{contig}.sr_bothside_pass.updated.txt", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_update_sr_list + vcf=ClusterSites.out, + ploidy_table=CreatePloidyTableFromPed.out, + output_prefix="~{cohort_name}.combine_batches.~{contig}.recluster_part_1", + reference_fasta=reference_fasta, + reference_fasta_fai=reference_fasta_fai, + reference_dict=reference_dict, + clustering_config=clustering_config_part1, + stratification_config=stratification_config_part1, + track_bed_files=track_bed_files, + track_names=track_names, + breakpoint_summary_strategy="REPRESENTATIVE", + java_mem_fraction=java_mem_fraction, + gatk_docker=gatk_docker, + runtime_attr_override=runtime_attr_recluster_part1 } - call HarmonizeHeaders.HarmonizeHeaders { + # Third round of clustering + call GroupedSVClusterTask as GroupedSVClusterPart2 { input: - header_vcf=ClusterDepth.clustered_vcfs[0], - vcfs=ClusterPesr.clustered_vcfs, - 
prefix="~{cohort_name}.~{contig}.harmonize_headers", - sv_base_mini_docker=sv_base_mini_docker, - runtime_override_reheader=runtime_override_reheader, - runtime_override_pull_header=runtime_override_pull_header + vcf=GroupedSVClusterPart1.out, + ploidy_table=CreatePloidyTableFromPed.out, + output_prefix="~{cohort_name}.combine_batches.~{contig}.recluster_part_2", + reference_fasta=reference_fasta, + reference_fasta_fai=reference_fasta_fai, + reference_dict=reference_dict, + clustering_config=clustering_config_part2, + stratification_config=stratification_config_part2, + track_bed_files=track_bed_files, + track_names=track_names, + breakpoint_summary_strategy="REPRESENTATIVE", + java_mem_fraction=java_mem_fraction, + gatk_docker=gatk_docker, + runtime_attr_override=runtime_attr_recluster_part2 } - call MergePesrDepth.MergePesrDepth as MergeDeletions { + # Use "depth" as source to match legacy headers + # AC/AF cause errors due to being lists instead of single values + call ClusterTasks.GatkToSvtkVcf { input: - subtyped_pesr_vcf=HarmonizeHeaders.out[0], - subtyped_depth_vcf=ClusterDepth.clustered_vcfs[0], - svtype="DEL", - num_samples=CountSamples.num_samples, - prefix="~{cohort_name}.~{contig}.merge_del", - cohort_name=cohort_name, - contig=contig, - use_hail=use_hail, - gcs_project=gcs_project, - sv_base_mini_docker=sv_base_mini_docker, + vcf=GroupedSVClusterPart2.out, + output_prefix="~{cohort_name}.combine_batches.~{contig}.svtk_formatted", + source="depth", + contig_list=contig_list, + remove_formats="CN", + remove_infos="END2,AC,AF,AN,HIGH_SR_BACKGROUND,BOTHSIDES_SUPPORT,SR1POS,SR2POS", + set_pass=true, sv_pipeline_docker=sv_pipeline_docker, - runtime_override_shard_clusters=runtime_override_shard_clusters_mpd, - runtime_override_shard_vids=runtime_override_shard_vids_mpd, - runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard_mpd, - runtime_override_merge_pesr_depth=runtime_override_merge_pesr_depth_mpd, - runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf_mpd, - runtime_override_subset_small=runtime_override_subset_small_mpd, - runtime_override_subset_large=runtime_override_subset_large_mpd, - runtime_override_make_sites_only=runtime_override_make_sites_only_mpd, - runtime_override_concat_large_pesr_depth=runtime_override_concat_large_pesr_depth_mpd, - runtime_override_concat_shards=runtime_override_concat_shards_mpd, - runtime_override_preconcat_large_pesr_depth=runtime_override_preconcat_large_pesr_depth_mpd, - runtime_override_hail_merge_large_pesr_depth=runtime_override_hail_merge_large_pesr_depth_mpd, - runtime_override_fix_header_large_pesr_depth=runtime_override_fix_header_large_pesr_depth_mpd, - runtime_override_preconcat_pesr_depth_shards=runtime_override_preconcat_pesr_depth_shards_mpd, - runtime_override_hail_merge_pesr_depth_shards=runtime_override_hail_merge_pesr_depth_shards_mpd, - runtime_override_fix_header_pesr_depth_shards=runtime_override_fix_header_pesr_depth_shards_mpd + runtime_attr_override=runtime_attr_gatk_to_svtk_vcf } - call MergePesrDepth.MergePesrDepth as MergeDuplications { + call ExtractSRVariantLists { input: - subtyped_pesr_vcf=HarmonizeHeaders.out[1], - subtyped_depth_vcf=ClusterDepth.clustered_vcfs[1], - svtype="DUP", - num_samples=CountSamples.num_samples, - prefix="~{cohort_name}.~{contig}.merge_dup", - cohort_name=cohort_name, - contig=contig, - use_hail=use_hail, - gcs_project=gcs_project, + vcf=GroupedSVClusterPart2.out, + vcf_index=GroupedSVClusterPart2.out_index, + 
output_prefix="~{cohort_name}.combine_batches.~{contig}", sv_base_mini_docker=sv_base_mini_docker, - sv_pipeline_docker=sv_pipeline_docker, - runtime_override_shard_clusters=runtime_override_shard_clusters_mpd, - runtime_override_shard_vids=runtime_override_shard_vids_mpd, - runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard_mpd, - runtime_override_merge_pesr_depth=runtime_override_merge_pesr_depth_mpd, - runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf_mpd, - runtime_override_subset_small=runtime_override_subset_small_mpd, - runtime_override_subset_large=runtime_override_subset_large_mpd, - runtime_override_make_sites_only=runtime_override_make_sites_only_mpd, - runtime_override_concat_large_pesr_depth=runtime_override_concat_large_pesr_depth_mpd, - runtime_override_concat_shards=runtime_override_concat_shards_mpd, - runtime_override_preconcat_large_pesr_depth=runtime_override_preconcat_large_pesr_depth_mpd, - runtime_override_hail_merge_large_pesr_depth=runtime_override_hail_merge_large_pesr_depth_mpd, - runtime_override_fix_header_large_pesr_depth=runtime_override_fix_header_large_pesr_depth_mpd, - runtime_override_preconcat_pesr_depth_shards=runtime_override_preconcat_pesr_depth_shards_mpd, - runtime_override_hail_merge_pesr_depth_shards=runtime_override_hail_merge_pesr_depth_shards_mpd, - runtime_override_fix_header_pesr_depth_shards=runtime_override_fix_header_pesr_depth_shards_mpd - } - - #Merge PESR & RD VCFs - if (use_hail) { - call HailMerge.HailMerge as ConcatPesrDepthHail { - input: - vcfs=[MergeDeletions.out, MergeDuplications.out, HarmonizeHeaders.out[2], HarmonizeHeaders.out[3], HarmonizeHeaders.out[4]], - prefix="~{cohort_name}.~{contig}.concat_pesr_depth", - gcs_project=gcs_project, - sv_base_mini_docker=sv_base_mini_docker, - sv_pipeline_docker=sv_pipeline_docker, - runtime_override_preconcat=runtime_override_preconcat_pesr_depth, - runtime_override_hail_merge=runtime_override_hail_merge_pesr_depth, - runtime_override_fix_header=runtime_override_fix_header_pesr_depth - } - } - if (!use_hail) { - call MiniTasks.ConcatVcfs as ConcatPesrDepth { - input: - vcfs=[MergeDeletions.out, MergeDuplications.out, HarmonizeHeaders.out[2], HarmonizeHeaders.out[3], HarmonizeHeaders.out[4]], - vcfs_idx=[MergeDeletions.out+".tbi", MergeDuplications.out+".tbi", HarmonizeHeaders.out[2]+".tbi", HarmonizeHeaders.out[3]+".tbi", HarmonizeHeaders.out[4]+".tbi"], - allow_overlaps=true, - outfile_prefix="~{cohort_name}.~{contig}.concat_pesr_depth", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_concat_large_pesr_depth - } - } - - #Update SR background fail & bothside pass files (2) - call MiniTasks.UpdateSrList as UpdateBackgroundFailSecond { - input: - vcf=select_first([ConcatPesrDepth.concat_vcf, ConcatPesrDepthHail.merged_vcf]), - original_list=UpdateBackgroundFailFirst.updated_list, - outfile="~{cohort_name}.~{contig}.sr_background_fail.updated2.txt", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_update_sr_list + runtime_attr_override=runtime_attr_extract_vids } - call MiniTasks.UpdateSrList as UpdateBothsidePassSecond { - input: - vcf=select_first([ConcatPesrDepth.concat_vcf, ConcatPesrDepthHail.merged_vcf]), - original_list=UpdateBothsidePassFirst.updated_list, - outfile="~{cohort_name}.~{contig}.sr_bothside_pass.updated2.txt", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_update_sr_list - } - - File vcfs_out_ = select_first([ConcatPesrDepth.concat_vcf, 
ConcatPesrDepthHail.merged_vcf])
-    File vcf_indexes_out_ = select_first([ConcatPesrDepth.concat_vcf_idx, ConcatPesrDepthHail.merged_vcf_index])
   }

-  #Merge resolved vcfs for QC
+  # Merge resolved vcfs for QC
   if (merge_vcfs) {
     call MiniTasks.ConcatVcfs {
       input:
-        vcfs=vcfs_out_,
-        vcfs_idx=vcf_indexes_out_,
+        vcfs=GatkToSvtkVcf.out,
+        vcfs_idx=GatkToSvtkVcf.out_index,
         naive=true,
-        outfile_prefix="~{cohort_name}.combine_batches",
+        outfile_prefix="~{cohort_name}.combine_batches.concat_all_contigs",
         sv_base_mini_docker=sv_base_mini_docker,
         runtime_attr_override=runtime_override_concat
     }
@@ -392,11 +254,147 @@

   #Final outputs
   output {
-    Array[File] combined_vcfs = vcfs_out_
-    Array[File] combined_vcf_indexes = vcf_indexes_out_
-    Array[File] cluster_bothside_pass_lists = UpdateBothsidePassSecond.updated_list
-    Array[File] cluster_background_fail_lists = UpdateBackgroundFailSecond.updated_list
+    Array[File] combined_vcfs = GatkToSvtkVcf.out
+    Array[File] combined_vcf_indexes = GatkToSvtkVcf.out_index
+    Array[File] cluster_background_fail_lists = ExtractSRVariantLists.high_sr_background_list
+    Array[File] cluster_bothside_pass_lists = ExtractSRVariantLists.bothsides_sr_support
     File? combine_batches_merged_vcf = ConcatVcfs.concat_vcf
     File? combine_batches_merged_vcf_index = ConcatVcfs.concat_vcf_idx
   }
 }
+
+
+# Extract lists of the variant IDs in the vcf that carry the HIGH_SR_BACKGROUND and BOTHSIDES_SUPPORT INFO flags
+task ExtractSRVariantLists {
+  input {
+    File vcf
+    File vcf_index
+    String output_prefix
+    String sv_base_mini_docker
+    RuntimeAttr? runtime_attr_override
+  }
+
+  # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to
+  # be held in memory or disk while working, potentially in a form that takes up more space)
+  RuntimeAttr runtime_default = object {
+    mem_gb: 3.75,
+    disk_gb: ceil(10.0 + size(vcf, "GB")),
+    cpu_cores: 1,
+    preemptible_tries: 3,
+    max_retries: 1,
+    boot_disk_gb: 10
+  }
+  RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
+  runtime {
+    memory: select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GB"
+    disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " HDD"
+    cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
+    preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
+    maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
+    docker: sv_base_mini_docker
+    bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
+  }
+
+  command <<<
+    set -euo pipefail
+    bcftools query -f '%ID\t%HIGH_SR_BACKGROUND\t%BOTHSIDES_SUPPORT\n' ~{vcf} > flags.txt
+    awk -F'\t' '($2 != "."){print $1}' flags.txt > ~{output_prefix}.high_sr_background.txt
+    awk -F'\t' '($3 != "."){print $1}' flags.txt > ~{output_prefix}.bothsides_sr_support.txt
+  >>>
+
+  output {
+    File high_sr_background_list = "~{output_prefix}.high_sr_background.txt"
+    File bothsides_sr_support = "~{output_prefix}.bothsides_sr_support.txt"
+  }
+}
+
+task GroupedSVClusterTask {
+  input {
+    File vcf
+    File ploidy_table
+    String output_prefix
+
+    File reference_fasta
+    File reference_fasta_fai
+    File reference_dict
+
+    File clustering_config
+    File stratification_config
+    Array[File] track_bed_files
+    Array[String] track_names
+
+    Float context_overlap_frac = 0
+    Int context_num_breakpoint_overlaps = 1
+    Int context_interchrom_num_breakpoint_overlaps = 1
+
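+    # Note: the three context_* inputs above are forwarded to GroupedSVCluster's
+    # --stratify-overlap-fraction and --stratify-num-breakpoint-overlaps*
+    # arguments in the command block below; loosely, they set how much overlap
+    # with a track's intervals is required to assign a record to that stratum.
+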
String? breakpoint_summary_strategy + String? contig + String? additional_args + + Float? java_mem_fraction + String gatk_docker + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + vcf: { + localization_optional: true + } + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: ceil(10 + size(vcf, "GB") * 2), + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File out = "~{output_prefix}.vcf.gz" + File out_index = "~{output_prefix}.vcf.gz.tbi" + } + command <<< + set -euo pipefail + + function getJavaMem() { + # get JVM memory in MiB by getting total memory from /proc/meminfo + # and multiplying by java_mem_fraction + cat /proc/meminfo \ + | awk -v MEM_FIELD="$1" '{ + f[substr($1, 1, length($1)-1)] = $2 + } END { + printf "%dM", f[MEM_FIELD] * ~{default="0.85" java_mem_fraction} / 1024 + }' + } + JVM_MAX_MEM=$(getJavaMem MemTotal) + echo "JVM memory: $JVM_MAX_MEM" + + gatk --java-options "-Xmx${JVM_MAX_MEM}" GroupedSVCluster \ + ~{"-L " + contig} \ + --reference ~{reference_fasta} \ + --ploidy-table ~{ploidy_table} \ + -V ~{vcf} \ + -O ~{output_prefix}.vcf.gz \ + --clustering-config ~{clustering_config} \ + --stratify-config ~{stratification_config} \ + --track-intervals ~{sep=" --track-intervals " track_bed_files} \ + --track-name ~{sep=" --track-name " track_names} \ + --stratify-overlap-fraction ~{context_overlap_frac} \ + --stratify-num-breakpoint-overlaps ~{context_num_breakpoint_overlaps} \ + --stratify-num-breakpoint-overlaps-interchromosomal ~{context_interchrom_num_breakpoint_overlaps} \ + ~{"--breakpoint-summary-strategy " + breakpoint_summary_strategy} \ + ~{additional_args} + >>> + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: gatk_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} \ No newline at end of file diff --git a/wdl/DepthClustering.wdl b/wdl/DepthClustering.wdl index 6176c417b..20d94c5b7 100644 --- a/wdl/DepthClustering.wdl +++ b/wdl/DepthClustering.wdl @@ -105,7 +105,7 @@ workflow ClusterDepth { algorithm=clustering_algorithm, depth_sample_overlap=0, depth_interval_overlap=depth_interval_overlap, - depth_breakend_window=0, + depth_breakend_window=10000000, reference_fasta=reference_fasta, reference_fasta_fai=reference_fasta_fai, reference_dict=reference_dict, diff --git a/wdl/FormatVcfForGatk.wdl b/wdl/FormatVcfForGatk.wdl index e6461b841..703395a57 100644 --- a/wdl/FormatVcfForGatk.wdl +++ b/wdl/FormatVcfForGatk.wdl @@ -14,6 +14,9 @@ workflow FormatVcfForGatk { File contig_list File? contigs_header # Replaces vcf contig dictionary if provided String? formatter_args + # Variant ID lists for SR evidence flags. Adds as INFO field flags if provided + File? bothside_pass_list + File? background_fail_list String? chr_x String? 
chr_y @@ -61,6 +64,8 @@ workflow FormatVcfForGatk { args=formatter_args, output_prefix="~{prefix}.format.shard_~{i}", contigs_header=contigs_header, + bothside_pass_list=bothside_pass_list, + background_fail_list=background_fail_list, script=svtk_to_gatk_script, sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_attr_format @@ -89,6 +94,8 @@ task FormatVcf { input { File vcf File ploidy_table + File? bothside_pass_list + File? background_fail_list File? script String? args File? contigs_header # Overwrites contig dictionary, in case they are out of order @@ -119,6 +126,8 @@ task FormatVcf { --vcf ~{vcf} \ --out tmp.vcf.gz \ --ploidy-table ~{ploidy_table} \ + ~{"--bothside-pass-list " + bothside_pass_list} \ + ~{"--background-fail-list " + background_fail_list} \ ~{args} if ~{defined(contigs_header)}; then diff --git a/wdl/GATKSVPipelineBatch.wdl b/wdl/GATKSVPipelineBatch.wdl index 37ee021c1..c8d38b83c 100644 --- a/wdl/GATKSVPipelineBatch.wdl +++ b/wdl/GATKSVPipelineBatch.wdl @@ -284,7 +284,9 @@ workflow GATKSVPipelineBatch { depth_vcfs=RegenotypeCNVs.regenotyped_depth_vcfs, contig_list=primary_contigs_fai, allosome_fai=allosome_file, - ref_dict=reference_dict, + reference_fasta=reference_fasta, + reference_fasta_fai=reference_index, + reference_dict=reference_dict, chr_x=chr_x, chr_y=chr_y, disc_files=[GATKSVPipelinePhase1.merged_PE], @@ -297,6 +299,7 @@ workflow GATKSVPipelineBatch { run_module_metrics = run_makecohortvcf_metrics, primary_contigs_list = primary_contigs_list, linux_docker=linux_docker, + gatk_docker=gatk_docker, sv_pipeline_docker=sv_pipeline_docker, sv_pipeline_qc_docker=sv_pipeline_qc_docker, sv_base_mini_docker=sv_base_mini_docker diff --git a/wdl/GATKSVPipelineSingleSample.wdl b/wdl/GATKSVPipelineSingleSample.wdl index 8ead3db0d..e7cdd90e7 100644 --- a/wdl/GATKSVPipelineSingleSample.wdl +++ b/wdl/GATKSVPipelineSingleSample.wdl @@ -284,6 +284,7 @@ workflow GATKSVPipelineSingleSample { File ref_panel_dup_bed Int? depth_records_per_bed_shard_cluster_batch + File depth_exclude_list Float depth_exclude_overlap_fraction Float depth_interval_overlap String? depth_clustering_algorithm @@ -399,11 +400,16 @@ workflow GATKSVPipelineSingleSample { Float clean_vcf_min_sr_background_fail_batches - File cytobands + File clustering_config_part1 + File stratification_config_part1 + File clustering_config_part2 + File stratification_config_part2 + Array[String] clustering_track_names + Array[File] clustering_track_bed_files + Float? combine_batches_java_mem_fraction + File cytobands File mei_bed - File depth_exclude_list - File empty_file Int max_shard_size_resolve Int clean_vcf_max_shards_per_chrom_clean_vcf_step1 @@ -421,40 +427,27 @@ workflow GATKSVPipelineSingleSample { Boolean? run_makecohortvcf_metrics = false # overrides for local tasks - RuntimeAttr? runtime_overide_get_discfile_size - RuntimeAttr? runtime_override_update_sr_list_cluster - RuntimeAttr? runtime_override_merge_pesr_depth RuntimeAttr? runtime_override_integrate_resolved_vcfs RuntimeAttr? runtime_override_rename_variants - RuntimeAttr? runtime_override_rename_cleaned_samples # overrides for mini tasks - RuntimeAttr? runtime_override_clean_background_fail - RuntimeAttr? runtime_override_make_cpx_cnv_input_file RuntimeAttr? runtime_override_subset_inversions - RuntimeAttr? runtime_override_concat_merged_vcfs - RuntimeAttr? runtime_override_concat_cpx_vcfs RuntimeAttr? runtime_override_concat_cleaned_vcfs - # overrides for VcfClusterContig - RuntimeAttr? runtime_override_join_vcfs - RuntimeAttr? 
runtime_override_subset_bothside_pass - RuntimeAttr? runtime_override_subset_background_fail - RuntimeAttr? runtime_override_subset_sv_type - RuntimeAttr? runtime_override_shard_clusters - RuntimeAttr? runtime_override_shard_vids - RuntimeAttr? runtime_override_pull_vcf_shard - RuntimeAttr? runtime_override_svtk_vcf_cluster - RuntimeAttr? runtime_override_get_vcf_header_with_members_info_line - RuntimeAttr? runtime_override_cluster_merge - RuntimeAttr? runtime_override_concat_vcf_cluster - RuntimeAttr? runtime_override_concat_svtypes - RuntimeAttr? runtime_override_concat_sharded_cluster - RuntimeAttr? runtime_override_make_sites_only - RuntimeAttr? runtime_override_preconcat_sharded_cluster - RuntimeAttr? runtime_override_hail_merge_sharded_cluster - RuntimeAttr? runtime_override_fix_header_sharded_cluster - RuntimeAttr? runtime_override_concat_large_pesr_depth + # overrides for CombineBatches + RuntimeAttr? runtime_attr_create_ploidy + RuntimeAttr? runtime_attr_reformat_1 + RuntimeAttr? runtime_attr_reformat_2 + RuntimeAttr? runtime_attr_join_vcfs + RuntimeAttr? runtime_attr_cluster_sites + RuntimeAttr? runtime_attr_recluster_part1 + RuntimeAttr? runtime_attr_recluster_part2 + RuntimeAttr? runtime_attr_get_non_ref_vids + RuntimeAttr? runtime_attr_calculate_support_frac + RuntimeAttr? runtime_override_clean_background_fail + RuntimeAttr? runtime_attr_gatk_to_svtk_vcf + RuntimeAttr? runtime_attr_extract_vids + RuntimeAttr? runtime_override_concat_combine_batches # overrides for ResolveComplexVariants RuntimeAttr? runtime_override_update_sr_list_pass @@ -1153,7 +1146,6 @@ workflow GATKSVPipelineSingleSample { depth_vcfs=[GenotypeBatch.genotyped_depth_vcf], contig_list=primary_contigs_fai, allosome_fai=allosome_file, - ref_dict=reference_dict, merge_complex_genotype_vcfs = true, cytobands=cytobands, @@ -1165,8 +1157,17 @@ workflow GATKSVPipelineSingleSample { mei_bed=mei_bed, pe_exclude_list=pesr_exclude_intervals, - depth_exclude_list=depth_exclude_list, - empty_file=empty_file, + clustering_config_part1=clustering_config_part1, + stratification_config_part1=stratification_config_part1, + clustering_config_part2=clustering_config_part2, + stratification_config_part2=stratification_config_part2, + track_names=clustering_track_names, + track_bed_files=clustering_track_bed_files, + reference_fasta=reference_fasta, + reference_fasta_fai=reference_index, + reference_dict=reference_dict, + java_mem_fraction=combine_batches_java_mem_fraction, + gatk_docker=gatk_docker, cohort_name=batch, @@ -1196,36 +1197,23 @@ workflow GATKSVPipelineSingleSample { sv_pipeline_qc_docker=sv_pipeline_qc_docker, sv_base_mini_docker=sv_base_mini_docker, - runtime_overide_get_discfile_size=runtime_overide_get_discfile_size, - runtime_override_update_sr_list_cluster=runtime_override_update_sr_list_cluster, - runtime_override_merge_pesr_depth=runtime_override_merge_pesr_depth, runtime_override_integrate_resolved_vcfs=runtime_override_integrate_resolved_vcfs, runtime_override_rename_variants=runtime_override_rename_variants, - runtime_override_rename_cleaned_samples=runtime_override_rename_cleaned_samples, runtime_override_breakpoint_overlap_filter=runtime_override_breakpoint_overlap_filter, runtime_override_clean_background_fail=runtime_override_clean_background_fail, - runtime_override_make_cpx_cnv_input_file=runtime_override_make_cpx_cnv_input_file, - runtime_override_concat_merged_vcfs=runtime_override_concat_merged_vcfs, - runtime_override_concat_cpx_vcfs=runtime_override_concat_cpx_vcfs, 
runtime_override_concat_cleaned_vcfs=runtime_override_concat_cleaned_vcfs, - runtime_override_join_vcfs=runtime_override_join_vcfs, - runtime_override_subset_bothside_pass=runtime_override_subset_bothside_pass, - runtime_override_subset_background_fail=runtime_override_subset_background_fail, - runtime_override_subset_sv_type=runtime_override_subset_sv_type, - runtime_override_shard_clusters=runtime_override_shard_clusters, - runtime_override_shard_vids=runtime_override_shard_vids, - runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, - runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, - runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, - runtime_override_cluster_merge=runtime_override_cluster_merge, - runtime_override_concat_vcf_cluster=runtime_override_concat_vcf_cluster, - runtime_override_concat_svtypes=runtime_override_concat_svtypes, - runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, - runtime_override_make_sites_only=runtime_override_make_sites_only, - runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, - runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, - runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster, - runtime_override_concat_large_pesr_depth=runtime_override_concat_large_pesr_depth, + runtime_attr_create_ploidy=runtime_attr_create_ploidy, + runtime_attr_reformat_1=runtime_attr_reformat_1, + runtime_attr_reformat_2=runtime_attr_reformat_2, + runtime_attr_join_vcfs=runtime_attr_join_vcfs, + runtime_attr_cluster_sites=runtime_attr_cluster_sites, + runtime_attr_recluster_part1=runtime_attr_recluster_part1, + runtime_attr_recluster_part2=runtime_attr_recluster_part2, + runtime_attr_get_non_ref_vids=runtime_attr_get_non_ref_vids, + runtime_attr_calculate_support_frac=runtime_attr_calculate_support_frac, + runtime_attr_gatk_to_svtk_vcf=runtime_attr_gatk_to_svtk_vcf, + runtime_attr_extract_vids=runtime_attr_extract_vids, + runtime_override_concat_combine_batches=runtime_override_concat_combine_batches, runtime_override_update_sr_list_pass=runtime_override_update_sr_list_pass, runtime_override_update_sr_list_fail=runtime_override_update_sr_list_fail, runtime_override_subset_inversions=runtime_override_subset_inversions, diff --git a/wdl/GenotypeBatchMetrics.wdl b/wdl/GenotypeBatchMetrics.wdl index 49f85cf87..7e216f417 100644 --- a/wdl/GenotypeBatchMetrics.wdl +++ b/wdl/GenotypeBatchMetrics.wdl @@ -48,7 +48,7 @@ workflow GenotypeBatchMetrics { baseline_vcf = baseline_genotyped_depth_vcf, samples = samples, prefix = "genotyped_depth", - types = "DEL,DUP", + types = "DEL,DUP,CNV", contig_list = contig_list, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_depth_metrics diff --git a/wdl/GenotypeDepthPart2.wdl b/wdl/GenotypeDepthPart2.wdl index 983eb0f8c..579149fa2 100644 --- a/wdl/GenotypeDepthPart2.wdl +++ b/wdl/GenotypeDepthPart2.wdl @@ -156,6 +156,7 @@ workflow GenotypeDepthPart2 { sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_attr_concat_vcfs } + output { File genotyped_vcf = ConcatVcfs.concat_vcf File genotyped_vcf_index = ConcatVcfs.concat_vcf_idx diff --git a/wdl/GenotypePESRPart2.wdl b/wdl/GenotypePESRPart2.wdl index 071312205..80b45d061 100644 --- a/wdl/GenotypePESRPart2.wdl +++ b/wdl/GenotypePESRPart2.wdl @@ -661,4 +661,4 @@ task GenotypeSRPart2 { preemptible: 
select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) } -} +} \ No newline at end of file diff --git a/wdl/HarmonizeHeaders.wdl b/wdl/HarmonizeHeaders.wdl deleted file mode 100644 index fe3746d96..000000000 --- a/wdl/HarmonizeHeaders.wdl +++ /dev/null @@ -1,79 +0,0 @@ -version 1.0 - -import "Structs.wdl" -import "TasksMakeCohortVcf.wdl" as MiniTasks - -# Reheader a list of vcfs with the header from another vcf - -workflow HarmonizeHeaders { - input { - File header_vcf # Vcf containing desired header - Array[File] vcfs # Vcfs to replace headers of - String prefix - String sv_base_mini_docker - RuntimeAttr? runtime_override_reheader - RuntimeAttr? runtime_override_pull_header - } - - call PullHeader { - input: - vcf=header_vcf, - prefix=prefix, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_pull_header - } - - scatter (i in range(length(vcfs))) { - call MiniTasks.ReheaderVcf { - input: - vcf=vcfs[i], - vcf_index=vcfs[i] + ".tbi", - header=PullHeader.out, - prefix="~{prefix}.reheadered", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_reheader - } - } - - output { - Array[File] out = ReheaderVcf.out - Array[File] out_index = ReheaderVcf.out_index - } -} - -task PullHeader { - input { - File vcf - String prefix - String sv_base_mini_docker - RuntimeAttr? runtime_attr_override - } - - RuntimeAttr runtime_default = object { - mem_gb: 2.0, - disk_gb: ceil(10.0 + size(vcf, "GiB") ), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_base_mini_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - bcftools view --header-only ~{vcf} > ~{prefix}.header - >>> - - output { - File out = "~{prefix}.header" - } -} \ No newline at end of file diff --git a/wdl/MakeCohortVcf.wdl b/wdl/MakeCohortVcf.wdl index f3891fc8d..f447b7ceb 100644 --- a/wdl/MakeCohortVcf.wdl +++ b/wdl/MakeCohortVcf.wdl @@ -35,15 +35,25 @@ workflow MakeCohortVcf { Boolean use_hail = false String? gcs_project + # CombineBatches parameters + Boolean? legacy_vcfs + File clustering_config_part1 + File stratification_config_part1 + File clustering_config_part2 + File stratification_config_part2 + Array[String] track_names + Array[File] track_bed_files + File reference_fasta + File reference_fasta_fai + File reference_dict + Float? java_mem_fraction + File bin_exclude File contig_list File allosome_fai - Int? localize_shard_size File cytobands File mei_bed File pe_exclude_list - File depth_exclude_list - File ref_dict File HERVK_reference File LINE1_reference Int max_shard_size_resolve @@ -59,7 +69,6 @@ workflow MakeCohortVcf { String chr_x String chr_y - File empty_file File? outlier_samples_list Int? random_seed Int? max_gq # Max GQ for plotting. Default = 99, ie. GQ is on a scale of [0,99]. 
Prior to CleanVcf, use 999 @@ -78,47 +87,34 @@ workflow MakeCohortVcf { File? baseline_cleaned_vcf String linux_docker + String gatk_docker String sv_base_mini_docker String sv_pipeline_docker String sv_pipeline_qc_docker # overrides for local tasks - RuntimeAttr? runtime_overide_get_discfile_size - RuntimeAttr? runtime_override_update_sr_list_cluster - RuntimeAttr? runtime_override_merge_pesr_depth RuntimeAttr? runtime_override_integrate_resolved_vcfs RuntimeAttr? runtime_override_rename_variants - RuntimeAttr? runtime_override_rename_cleaned_samples - RuntimeAttr? runtime_override_breakpoint_overlap_filter # overrides for mini tasks - RuntimeAttr? runtime_override_clean_background_fail - RuntimeAttr? runtime_override_make_cpx_cnv_input_file RuntimeAttr? runtime_override_subset_inversions - RuntimeAttr? runtime_override_concat_merged_vcfs - RuntimeAttr? runtime_override_concat_cpx_vcfs RuntimeAttr? runtime_override_concat_cleaned_vcfs - # overrides for VcfClusterContig - RuntimeAttr? runtime_override_join_vcfs - RuntimeAttr? runtime_override_subset_bothside_pass - RuntimeAttr? runtime_override_subset_background_fail - RuntimeAttr? runtime_override_subset_sv_type - RuntimeAttr? runtime_override_shard_clusters - RuntimeAttr? runtime_override_shard_vids - RuntimeAttr? runtime_override_pull_vcf_shard - RuntimeAttr? runtime_override_svtk_vcf_cluster - RuntimeAttr? runtime_override_get_vcf_header_with_members_info_line - RuntimeAttr? runtime_override_cluster_merge - RuntimeAttr? runtime_override_concat_vcf_cluster - RuntimeAttr? runtime_override_concat_svtypes - RuntimeAttr? runtime_override_concat_sharded_cluster - RuntimeAttr? runtime_override_make_sites_only - RuntimeAttr? runtime_override_preconcat_sharded_cluster - RuntimeAttr? runtime_override_hail_merge_sharded_cluster - RuntimeAttr? runtime_override_fix_header_sharded_cluster - RuntimeAttr? runtime_override_concat_large_pesr_depth + # overrides for CombineBatches + RuntimeAttr? runtime_attr_create_ploidy + RuntimeAttr? runtime_attr_reformat_1 + RuntimeAttr? runtime_attr_reformat_2 + RuntimeAttr? runtime_attr_join_vcfs + RuntimeAttr? runtime_attr_cluster_sites + RuntimeAttr? runtime_attr_recluster_part1 + RuntimeAttr? runtime_attr_recluster_part2 + RuntimeAttr? runtime_attr_get_non_ref_vids + RuntimeAttr? runtime_attr_calculate_support_frac + RuntimeAttr? runtime_override_clean_background_fail + RuntimeAttr? runtime_attr_gatk_to_svtk_vcf + RuntimeAttr? runtime_attr_extract_vids + RuntimeAttr? runtime_override_concat_combine_batches # overrides for ResolveComplexVariants RuntimeAttr? runtime_override_update_sr_list_pass @@ -134,6 +130,7 @@ workflow MakeCohortVcf { RuntimeAttr? runtime_override_resolve_cpx_per_shard RuntimeAttr? runtime_override_restore_unresolved_cnv_per_shard RuntimeAttr? runtime_override_concat_resolved_per_shard + RuntimeAttr? runtime_override_pull_vcf_shard RuntimeAttr? runtime_override_preconcat_resolve RuntimeAttr? runtime_override_hail_merge_resolve RuntimeAttr? 
runtime_override_fix_header_resolve @@ -243,42 +240,42 @@ workflow MakeCohortVcf { input: cohort_name=cohort_name, batches=batches, - merge_vcfs=merge_cluster_vcfs, + ped_file=ped_file, pesr_vcfs=pesr_vcfs, depth_vcfs=depth_vcfs, + legacy_vcfs=legacy_vcfs, raw_sr_bothside_pass_files=raw_sr_bothside_pass_files, raw_sr_background_fail_files=raw_sr_background_fail_files, contig_list=contig_list, - localize_shard_size=localize_shard_size, - pe_exclude_list=pe_exclude_list, - depth_exclude_list=depth_exclude_list, min_sr_background_fail_batches=min_sr_background_fail_batches, - empty_file=empty_file, - use_hail=use_hail, - gcs_project=gcs_project, + clustering_config_part1=clustering_config_part1, + stratification_config_part1=stratification_config_part1, + clustering_config_part2=clustering_config_part2, + stratification_config_part2=stratification_config_part2, + track_names=track_names, + track_bed_files=track_bed_files, + reference_fasta=reference_fasta, + reference_fasta_fai=reference_fasta_fai, + reference_dict=reference_dict, + chr_x=chr_x, + chr_y=chr_y, + java_mem_fraction=java_mem_fraction, + gatk_docker=gatk_docker, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, - runtime_override_update_sr_list=runtime_override_update_sr_list_cluster, - runtime_override_merge_pesr_depth=runtime_override_merge_pesr_depth, + runtime_attr_create_ploidy=runtime_attr_create_ploidy, + runtime_attr_reformat_1=runtime_attr_reformat_1, + runtime_attr_reformat_2=runtime_attr_reformat_2, + runtime_attr_join_vcfs=runtime_attr_join_vcfs, + runtime_attr_cluster_sites=runtime_attr_cluster_sites, + runtime_attr_recluster_part1=runtime_attr_recluster_part1, + runtime_attr_recluster_part2=runtime_attr_recluster_part2, + runtime_attr_get_non_ref_vids=runtime_attr_get_non_ref_vids, + runtime_attr_calculate_support_frac=runtime_attr_calculate_support_frac, runtime_override_clean_background_fail=runtime_override_clean_background_fail, - runtime_override_concat=runtime_override_cluster_merge, - runtime_override_join_vcfs=runtime_override_join_vcfs, - runtime_override_subset_bothside_pass=runtime_override_subset_bothside_pass, - runtime_override_subset_background_fail=runtime_override_subset_background_fail, - runtime_override_subset_sv_type=runtime_override_subset_sv_type, - runtime_override_shard_clusters=runtime_override_shard_clusters, - runtime_override_shard_vids=runtime_override_shard_vids, - runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, - runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, - runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, - runtime_override_concat_vcf_cluster=runtime_override_concat_vcf_cluster, - runtime_override_concat_svtypes=runtime_override_concat_svtypes, - runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, - runtime_override_make_sites_only=runtime_override_make_sites_only, - runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, - runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, - runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster, - runtime_override_concat_large_pesr_depth=runtime_override_concat_large_pesr_depth + runtime_attr_gatk_to_svtk_vcf=runtime_attr_gatk_to_svtk_vcf, + runtime_attr_extract_vids=runtime_attr_extract_vids, + runtime_override_concat=runtime_override_concat_combine_batches } call 
ComplexResolve.ResolveComplexVariants { @@ -294,7 +291,7 @@ workflow MakeCohortVcf { cytobands=cytobands, mei_bed=mei_bed, pe_exclude_list=pe_exclude_list, - ref_dict=ref_dict, + ref_dict=reference_dict, use_hail=use_hail, gcs_project=gcs_project, max_shard_size=max_shard_size_resolve, @@ -349,7 +346,7 @@ workflow MakeCohortVcf { median_coverage_files=median_coverage_files, bin_exclude=bin_exclude, contig_list=contig_list, - ref_dict=ref_dict, + ref_dict=reference_dict, linux_docker=linux_docker, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, diff --git a/wdl/MergePesrDepth.wdl b/wdl/MergePesrDepth.wdl deleted file mode 100644 index 65a2b139e..000000000 --- a/wdl/MergePesrDepth.wdl +++ /dev/null @@ -1,234 +0,0 @@ -version 1.0 - -import "Structs.wdl" -import "TasksMakeCohortVcf.wdl" as MiniTasks -import "HailMerge.wdl" as HailMerge -import "ShardedCluster.wdl" as ShardedCluster -import "Utils.wdl" as utils - -workflow MergePesrDepth { - input { - File subtyped_pesr_vcf - File subtyped_depth_vcf - Int num_samples - - String prefix - String cohort_name - String svtype - String contig - Float merging_shard_scale_factor = 30000000 - - Boolean use_hail = false - String? gcs_project - - String sv_pipeline_docker - String sv_base_mini_docker - - # overrides for local tasks - RuntimeAttr? runtime_override_shard_clusters - RuntimeAttr? runtime_override_shard_vids - RuntimeAttr? runtime_override_pull_vcf_shard - RuntimeAttr? runtime_override_merge_pesr_depth - - # overrides for MiniTasks - RuntimeAttr? runtime_override_sort_merged_vcf - RuntimeAttr? runtime_override_subset_small - RuntimeAttr? runtime_override_subset_large - RuntimeAttr? runtime_override_make_sites_only - RuntimeAttr? runtime_override_concat_large_pesr_depth - RuntimeAttr? runtime_override_concat_shards - - RuntimeAttr? runtime_override_preconcat_large_pesr_depth - RuntimeAttr? runtime_override_hail_merge_large_pesr_depth - RuntimeAttr? runtime_override_fix_header_large_pesr_depth - - RuntimeAttr? runtime_override_preconcat_pesr_depth_shards - RuntimeAttr? runtime_override_hail_merge_pesr_depth_shards - RuntimeAttr? 
runtime_override_fix_header_pesr_depth_shards - } - - # Pull out CNVs too small to cluster (less than reciprocal_overlap_fraction * min_depth_only_length) - call MiniTasks.FilterVcf as SubsetSmall { - input: - vcf=subtyped_pesr_vcf, - vcf_index=subtyped_pesr_vcf + ".tbi", - outfile_prefix="~{prefix}.subset_small", - records_filter='INFO/SVLEN<2500', - use_ssd=true, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_subset_small - } - - call MiniTasks.FilterVcf as SubsetLarge { - input: - vcf=subtyped_pesr_vcf, - vcf_index=subtyped_pesr_vcf + ".tbi", - outfile_prefix="~{prefix}.subset_large", - records_filter='INFO/SVLEN>=2500', - use_ssd=true, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_subset_large - } - - if (use_hail) { - call HailMerge.HailMerge as ConcatLargePesrDepthHail { - input: - vcfs=[SubsetLarge.filtered_vcf, subtyped_depth_vcf], - prefix="~{prefix}.large_pesr_depth", - gcs_project=gcs_project, - sv_base_mini_docker=sv_base_mini_docker, - sv_pipeline_docker=sv_pipeline_docker, - runtime_override_preconcat=runtime_override_preconcat_large_pesr_depth, - runtime_override_hail_merge=runtime_override_hail_merge_large_pesr_depth, - runtime_override_fix_header=runtime_override_fix_header_large_pesr_depth - } - } - if (!use_hail) { - call MiniTasks.ConcatVcfs as ConcatLargePesrDepth { - input: - vcfs=[SubsetLarge.filtered_vcf, subtyped_depth_vcf], - vcfs_idx=[SubsetLarge.filtered_vcf + ".tbi", subtyped_depth_vcf + ".tbi"], - allow_overlaps=true, - outfile_prefix="~{prefix}.large_pesr_depth", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_concat_large_pesr_depth - } - } - - call MiniTasks.MakeSitesOnlyVcf { - input: - vcf=select_first([ConcatLargePesrDepth.concat_vcf, ConcatLargePesrDepthHail.merged_vcf]), - vcf_index=select_first([ConcatLargePesrDepth.concat_vcf_idx, ConcatLargePesrDepthHail.merged_vcf_index]), - prefix="~{prefix}.large_pesr_depth.sites_only", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_make_sites_only - } - - # Fast cluster without sample overlap linkage for sharding - Int merge_shard_size = ceil(merging_shard_scale_factor / num_samples) - call ShardedCluster.ShardClusters { - input: - vcf=MakeSitesOnlyVcf.out, - prefix="~{prefix}.shard_clusters", - dist=1000000000, - frac=0.5, - svsize=0, - sv_types=[svtype], - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_shard_clusters - } - - call MiniTasks.ShardVidsForClustering { - input: - clustered_vcf=ShardClusters.out, - prefix=prefix, - records_per_shard=merge_shard_size, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_shard_vids - } - - scatter (i in range(length(ShardVidsForClustering.out))) { - call MiniTasks.PullVcfShard { - input: - vcf=select_first([ConcatLargePesrDepth.concat_vcf, ConcatLargePesrDepthHail.merged_vcf]), - vids=ShardVidsForClustering.out[i], - prefix="~{prefix}.unclustered.shard_${i}", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_pull_vcf_shard - } - call MergePesrDepthShard { - input: - vcf=PullVcfShard.out, - vcf_index=PullVcfShard.out_index, - prefix="~{prefix}.merge_pesr_depth.shard_~{i}", - vid_prefix="~{cohort_name}_~{contig}_mpd~{i}", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_merge_pesr_depth - } - call MiniTasks.SortVcf { - input: - vcf = MergePesrDepthShard.out, - outfile_prefix = 
"~{prefix}.sorted.shard_${i}", - sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_override_sort_merged_vcf - } - } - - if (use_hail) { - call HailMerge.HailMerge as ConcatShardsHail { - input: - vcfs=flatten([[SubsetSmall.filtered_vcf], SortVcf.out]), - prefix="~{prefix}.concat_shards", - gcs_project=gcs_project, - sv_base_mini_docker=sv_base_mini_docker, - sv_pipeline_docker=sv_pipeline_docker, - runtime_override_preconcat=runtime_override_preconcat_pesr_depth_shards, - runtime_override_hail_merge=runtime_override_hail_merge_pesr_depth_shards, - runtime_override_fix_header=runtime_override_fix_header_pesr_depth_shards - } - } - if (!use_hail) { - call MiniTasks.ConcatVcfs as ConcatShards { - input: - vcfs=flatten([[SubsetSmall.filtered_vcf], SortVcf.out]), - vcfs_idx=flatten([[SubsetSmall.filtered_vcf_idx], SortVcf.out_index]), - allow_overlaps=true, - outfile_prefix="~{prefix}.concat_shards", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_concat_shards - } - } - - output { - File out = select_first([ConcatShards.concat_vcf, ConcatShardsHail.merged_vcf]) - File out_index = select_first([ConcatShards.concat_vcf_idx, ConcatShardsHail.merged_vcf_index]) - } -} - - -task MergePesrDepthShard { - input { - File vcf - File vcf_index - String prefix - String vid_prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - String output_file = prefix + ".vcf.gz" - - # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to - # be held in memory or disk while working, potentially in a form that takes up more space) - Float input_size = size(vcf, "GiB") - RuntimeAttr runtime_default = object { - mem_gb: 2.0 + 0.6 * input_size, - disk_gb: ceil(10.0 + 6 * input_size), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} SSD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - /opt/sv-pipeline/04_variant_resolution/scripts/merge_pesr_depth.py \ - --prefix ~{vid_prefix} \ - ~{vcf} \ - ~{output_file} - >>> - - output { - File out = output_file - } -} diff --git a/wdl/ResolveComplexVariants.wdl b/wdl/ResolveComplexVariants.wdl index f712537b6..92ed26476 100644 --- a/wdl/ResolveComplexVariants.wdl +++ b/wdl/ResolveComplexVariants.wdl @@ -318,6 +318,7 @@ task BreakpointOverlap { File bothside_pass_list File background_fail_list String prefix + File? script String sv_pipeline_docker RuntimeAttr? 
runtime_attr_override } @@ -346,7 +347,7 @@ task BreakpointOverlap { command <<< set -euo pipefail - python /opt/sv-pipeline/04_variant_resolution/scripts/overlap_breakpoint_filter.py \ + python ~{default="/opt/sv-pipeline/04_variant_resolution/scripts/overlap_breakpoint_filter.py" script} \ ~{vcf} \ ~{bothside_pass_list} \ ~{background_fail_list} \ diff --git a/wdl/TasksClusterBatch.wdl b/wdl/TasksClusterBatch.wdl index cab695ca9..d0f934e39 100644 --- a/wdl/TasksClusterBatch.wdl +++ b/wdl/TasksClusterBatch.wdl @@ -44,6 +44,7 @@ task SVCluster { File reference_dict Float? java_mem_fraction + String? additional_args String? variant_prefix String gatk_docker @@ -71,7 +72,7 @@ task SVCluster { File out_index = "~{output_prefix}.vcf.gz.tbi" } command <<< - set -euo pipefail + set -euxo pipefail function getJavaMem() { # get JVM memory in MiB by getting total memory from /proc/meminfo @@ -125,7 +126,8 @@ task SVCluster { ~{"--pesr-breakend-window " + pesr_breakend_window} \ ~{"--insertion-length-summary-strategy " + insertion_length_summary_strategy} \ ~{"--breakpoint-summary-strategy " + breakpoint_summary_strategy} \ - ~{"--alt-allele-summary-strategy " + alt_allele_summary_strategy} + ~{"--alt-allele-summary-strategy " + alt_allele_summary_strategy} \ + ~{additional_args} >>> runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) @@ -242,6 +244,7 @@ task GatkToSvtkVcf { File contig_list String? remove_infos String? remove_formats + Boolean set_pass = false String output_prefix String sv_pipeline_docker RuntimeAttr? runtime_attr_override @@ -269,7 +272,8 @@ task GatkToSvtkVcf { --source ~{source} \ --contigs ~{contig_list} \ ~{"--remove-infos " + remove_infos} \ - ~{"--remove-formats " + remove_formats} + ~{"--remove-formats " + remove_formats} \ + ~{if set_pass then "--set-pass" else ""} tabix ~{output_prefix}.vcf.gz >>> runtime { diff --git a/wdl/TasksGenotypeBatch.wdl b/wdl/TasksGenotypeBatch.wdl index aa1221b3e..151391489 100644 --- a/wdl/TasksGenotypeBatch.wdl +++ b/wdl/TasksGenotypeBatch.wdl @@ -608,3 +608,45 @@ task IntegrateDepthGq { maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) } } + +task ReformatGenotypedVcf { + input { + File vcf + File? script + String output_prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: ceil(10 + size(vcf, "GB") * 2), + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File out = "~{output_prefix}.vcf.gz" + File out_index = "~{output_prefix}.vcf.gz.tbi" + } + command <<< + set -euo pipefail + python ~{default="/opt/sv-pipeline/04_variant_resolution/scripts/reformat_genotyped_vcf.py" script} --vcf ~{vcf} \ + | bgzip \ + > ~{output_prefix}.vcf.gz + tabix ~{output_prefix}.vcf.gz + + >>> + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} \ No newline at end of file diff --git a/wdl/VcfClusterSingleChromsome.wdl b/wdl/VcfClusterSingleChromsome.wdl deleted file mode 100644 index 4476ce40a..000000000 --- a/wdl/VcfClusterSingleChromsome.wdl +++ /dev/null @@ -1,467 +0,0 @@ -version 1.0 - -# Author: Ryan Collins - -import "Structs.wdl" -import "TasksMakeCohortVcf.wdl" as MiniTasks -import "ClusterSingleChromosome.wdl" as VcfClusterTasks - -# Workflow to run parallelized vcf clustering for a single chromosome -workflow VcfClusterSingleChrom { - input { - Array[File] vcfs - Int num_samples - String prefix - String evidence_type - String cohort_name - Int dist - Float frac - Float sample_overlap - File? exclude_list - Array[String] batches - Int sv_size - Array[String] sv_types - String contig - Int localize_shard_size - Boolean subset_sr_lists - File bothside_pass - File background_fail - File empty_file - - Boolean use_hail - String? gcs_project - - String sv_pipeline_docker - String sv_base_mini_docker - - # overrides for local tasks - RuntimeAttr? runtime_override_localize_vcfs - RuntimeAttr? runtime_override_join_vcfs - RuntimeAttr? runtime_override_fix_multiallelic - RuntimeAttr? runtime_override_fix_ev_tags - - # overrides for MiniTasks - RuntimeAttr? runtime_override_subset_bothside_pass - RuntimeAttr? runtime_override_subset_background_fail - - # overrides for VcfClusterTasks - RuntimeAttr? runtime_override_shard_clusters - RuntimeAttr? runtime_override_shard_vids - RuntimeAttr? runtime_override_subset_sv_type - RuntimeAttr? runtime_override_shard_vcf_precluster - RuntimeAttr? runtime_override_pull_vcf_shard - RuntimeAttr? runtime_override_svtk_vcf_cluster - RuntimeAttr? runtime_override_get_vcf_header_with_members_info_line - RuntimeAttr? runtime_override_concat_vcf_cluster - RuntimeAttr? runtime_override_concat_svtypes - RuntimeAttr? runtime_override_concat_sharded_cluster - RuntimeAttr? runtime_override_make_sites_only - RuntimeAttr? runtime_override_sort_merged_vcf - - RuntimeAttr? runtime_override_preconcat_sharded_cluster - RuntimeAttr? runtime_override_hail_merge_sharded_cluster - RuntimeAttr? runtime_override_fix_header_sharded_cluster - } - - scatter (i in range(length(vcfs))) { - call LocalizeContigVcfs { - input: - vcf=vcfs[i], - vcf_index = vcfs[i] + ".tbi", - shard_size = localize_shard_size, - contig=contig, - prefix=prefix + "." + contig + "." 
+ batches[i], - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_localize_vcfs - } - } - Array[Array[File]] sharded_vcfs_ = transpose(LocalizeContigVcfs.out) - - scatter (i in range(length(sharded_vcfs_))) { - call JoinVcfs { - input: - vcfs=sharded_vcfs_[i], - contig=contig, - prefix=prefix + "." + i, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_join_vcfs - } - call FixMultiallelicRecords { - input: - joined_vcf=JoinVcfs.out, - batch_contig_vcfs=sharded_vcfs_[i], - contig=contig, - prefix=prefix + "." + i, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_fix_multiallelic - } - call FixEvidenceTags { - input: - vcf=FixMultiallelicRecords.out, - contig=contig, - prefix=prefix + "." + i, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_fix_ev_tags - } - } - - call MiniTasks.ConcatVcfs { - input: - vcfs=FixEvidenceTags.out, - vcfs_idx=FixEvidenceTags.out_index, - naive=true, - outfile_prefix="~{prefix}.precluster_concat", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_concat_vcf_cluster - } - - #Run vcfcluster per chromosome - call VcfClusterTasks.ClusterSingleChrom { - input: - vcf=ConcatVcfs.concat_vcf, - vcf_index=ConcatVcfs.concat_vcf_idx, - num_samples=num_samples, - contig=contig, - cohort_name=cohort_name, - evidence_type=evidence_type, - prefix=prefix, - dist=dist, - frac=frac, - sample_overlap=sample_overlap, - exclude_list=exclude_list, - sv_size=sv_size, - sv_types=sv_types, - empty_file=empty_file, - use_hail=use_hail, - gcs_project=gcs_project, - sv_pipeline_docker=sv_pipeline_docker, - sv_base_mini_docker=sv_base_mini_docker, - runtime_override_subset_sv_type=runtime_override_subset_sv_type, - runtime_override_shard_clusters=runtime_override_shard_clusters, - runtime_override_shard_vids=runtime_override_shard_vids, - runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, - runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, - runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, - runtime_override_concat_svtypes=runtime_override_concat_svtypes, - runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, - runtime_override_make_sites_only=runtime_override_make_sites_only, - runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf, - runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, - runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, - runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster - } - - if(subset_sr_lists) { - #Subset bothside_pass & background_fail to chromosome of interest - call SubsetVariantList as SubsetBothsidePass { - input: - vid_list=bothside_pass, - vid_col=2, - vcf=ConcatVcfs.concat_vcf, - outfile_name="~{prefix}.pass.VIDs.list", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_subset_bothside_pass - } - call SubsetVariantList as SubsetBackgroundFail { - input: - vid_list=background_fail, - vid_col=1, - vcf=ConcatVcfs.concat_vcf, - outfile_name="~{prefix}.fail.VIDs.list", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_subset_background_fail - } - } - - output { - Array[File] clustered_vcfs = ClusterSingleChrom.clustered_vcfs - Array[File] clustered_vcf_indexes = 
ClusterSingleChrom.clustered_vcf_indexes
-    File filtered_bothside_pass = select_first([SubsetBothsidePass.filtered_vid_list, empty_file])
-    File filtered_background_fail = select_first([SubsetBackgroundFail.filtered_vid_list, empty_file])
-  }
-}
-
-task LocalizeContigVcfs {
-  input {
-    File vcf
-    File vcf_index
-    Int shard_size
-    String contig
-    String prefix
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-
-  RuntimeAttr runtime_default = object {
-    mem_gb: 3.75,
-    disk_gb: ceil(10 + size(vcf, "GiB") * 1.5),
-    cpu_cores: 1,
-    preemptible_tries: 1,
-    max_retries: 1,
-    boot_disk_gb: 10
-  }
-  RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
-
-  runtime {
-    memory: select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " HDD"
-    cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-    preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-    maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
-    docker: sv_pipeline_docker
-    bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
-  }
-
-  command <<<
-    set -euxo pipefail
-
-    # See Issue #52 "Use GATK to retrieve VCF records in JoinContigFromRemoteVcfs"
-    # https://github.com/broadinstitute/gatk-sv/issues/52
-
-    tabix -h "~{vcf}" "~{contig}" \
-      | sed "s/AN=[0-9]*;//g" \
-      | sed "s/AC=[0-9]*;//g" \
-      | bgzip \
-      > contig.vcf.gz
-    tabix contig.vcf.gz
-
-    python3 <<CODE
-import gzip
-# Split contig.vcf.gz into shards of at most ~{shard_size} records each,
-# copying the full header into every shard so each is a valid VCF.
-header = []
-out = None
-shard = 0
-records = 0
-with gzip.open('contig.vcf.gz', 'rt') as f:
-    for line in f:
-        if line.startswith('#'):
-            header.append(line)
-            continue
-        if out is None or records == ~{shard_size}:
-            if out is not None:
-                out.close()
-            out = gzip.open('~{prefix}.%06d.vcf.gz' % shard, 'wt')
-            out.writelines(header)
-            shard += 1
-            records = 0
-        out.write(line)
-        records += 1
-if out is not None:
-    out.close()
-CODE
-  >>>
-
-  output {
-    Array[File] out = glob("~{prefix}.*.vcf.gz")
-  }
-}
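The shard lists produced per batch above are regrouped positionally before joining: the workflow's transpose(LocalizeContigVcfs.out) turns one shard list per batch into one batch list per shard, so JoinVcfs below receives shard i from every batch. A minimal illustrative sketch of that regrouping (batch and file names are hypothetical):

# WDL's transpose() on Array[Array[File]], modeled with zip(*rows):
# rows are per-batch shard lists; columns are per-shard batch groups.
batches = ["batch_A", "batch_B"]                        # hypothetical names
per_batch = [["%s.shard%d.vcf.gz" % (b, i) for i in range(3)] for b in batches]
per_shard = [list(group) for group in zip(*per_batch)]  # == transpose(per_batch)
assert per_shard[0] == ["batch_A.shard0.vcf.gz", "batch_B.shard0.vcf.gz"]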
-
-# Merge contig vcfs across batches
-task JoinVcfs {
-  input {
-    Array[File] vcfs
-    String contig
-    String prefix
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-
-  Float input_size = size(vcfs, "GiB")
-  Float input_size_ratio = 3.0
-  Float base_disk_gb = 10.0
-  RuntimeAttr runtime_default = object {
-    mem_gb: 1.0,
-    disk_gb: ceil(base_disk_gb + input_size * input_size_ratio),
-    cpu_cores: 1,
-    preemptible_tries: 1,
-    max_retries: 1,
-    boot_disk_gb: 10
-  }
-  RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
-
-  runtime {
-    memory: select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " HDD"
-    cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-    preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-    maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
-    docker: sv_pipeline_docker
-    bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
-  }
-
-  command <<<
-    set -euo pipefail
-
-    python3 <<CODE | bgzip > ~{prefix}.~{contig}.joined.vcf.gz
-import sys
-import gzip
-
-fl = open("~{write_lines(vcfs)}")
-files = [gzip.open(f.strip(), 'rb') for f in fl.readlines()]
-lines_zip = zip(*files)
-
-for linesb in lines_zip:
-    lines = [l.decode('utf-8') for l in linesb]
-    ex = lines[0]
-    if ex.startswith('##'):
-        sys.stdout.write(ex)
-    else:
-        sys.stdout.write(ex.strip())
-        if len(lines) > 1:
-            sys.stdout.write('\t')
-            out_lines = [l.strip().split('\t', 9)[-1] for l in lines[1:]]
-            sys.stdout.write("\t".join(out_lines))
-        sys.stdout.write('\n')
-CODE
-    tabix ~{prefix}.~{contig}.joined.vcf.gz
-  >>>
-
-  output {
-    File out = "~{prefix}.~{contig}.joined.vcf.gz"
-    File out_index = "~{prefix}.~{contig}.joined.vcf.gz.tbi"
-  }
-}
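A note on the split('\t', 9)[-1] idiom in the JoinVcfs heredoc above: a VCF body line carries nine fixed columns (CHROM through FORMAT) before the per-sample columns, so splitting at most nine times isolates everything from the first sample column onward as one piece. A small self-contained check (the record values are made up):

# Nine fixed VCF columns precede the sample columns, so split('\t', 9)
# yields ten pieces and the last piece is the sample block, still tab-joined.
line = "chr1\t100\tvid1\tN\t<DEL>\t.\t.\tSVTYPE=DEL\tGT:EV\t0/1:5\t0/0:1"
fields = line.split("\t", 9)
assert len(fields) == 10
assert fields[-1] == "0/1:5\t0/0:1"  # both sample columns intact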
-
-# Add in max CN state to multiallelics
-task FixMultiallelicRecords {
-  input {
-    File joined_vcf
-    Array[File] batch_contig_vcfs
-    String contig
-    String prefix
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-
-  Float input_size = size(joined_vcf, "GiB") * 2 + size(batch_contig_vcfs, "GiB")
-  Float input_size_fraction = 2.0
-  Float base_disk_gb = 10.0
-  RuntimeAttr runtime_default = object {
-    mem_gb: 3.75,
-    disk_gb: ceil(base_disk_gb + input_size * input_size_fraction),
-    cpu_cores: 1,
-    preemptible_tries: 1,
-    max_retries: 1,
-    boot_disk_gb: 10
-  }
-  RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
-
-  runtime {
-    memory: select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " HDD"
-    cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-    preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-    maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
-    docker: sv_pipeline_docker
-    bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
-  }
-
-  command <<<
-    set -euxo pipefail
-    /opt/sv-pipeline/04_variant_resolution/scripts/make_concordant_multiallelic_alts.py \
-      ~{joined_vcf} \
-      ~{write_lines(batch_contig_vcfs)} \
-      ~{prefix}.~{contig}.fixed_multiallelics.vcf.gz
-    tabix ~{prefix}.~{contig}.fixed_multiallelics.vcf.gz
-  >>>
-
-  output {
-    File out = "~{prefix}.~{contig}.fixed_multiallelics.vcf.gz"
-    File out_index = "~{prefix}.~{contig}.fixed_multiallelics.vcf.gz.tbi"
-  }
-}
-
-# Convert EV field from String to Integer
-task FixEvidenceTags {
-  input {
-    File vcf
-    String contig
-    String prefix
-    String sv_base_mini_docker
-    RuntimeAttr? runtime_attr_override
-  }
-
-  Float input_size = size(vcf, "GiB")
-  Float input_size_ratio = 2.0
-  Float base_disk_gb = 10.0
-  RuntimeAttr runtime_default = object {
-    mem_gb: 3.75,
-    disk_gb: ceil(base_disk_gb + input_size * input_size_ratio),
-    cpu_cores: 1,
-    preemptible_tries: 1,
-    max_retries: 1,
-    boot_disk_gb: 10
-  }
-  RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
-
-  runtime {
-    memory: select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GiB"
-    disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " HDD"
-    cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-    preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-    maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
-    docker: sv_base_mini_docker
-    bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
-  }
-
-  command <<<
-    set -euxo pipefail
-    zcat ~{vcf} \
-      | sed -e 's/:RD,PE,SR/:7/g' \
-      | sed -e 's/:PE,SR/:6/g' \
-      | sed -e 's/:RD,SR/:5/g' \
-      | sed -e 's/:RD,PE/:3/g' \
-      | sed -e 's/:PE\t/:2\t/g' -e 's/:SR\t/:4\t/g' -e 's/:RD\t/:1\t/g' \
-      | sed -e 's/ID=EV,Number=.,Type=String/ID=EV,Number=1,Type=Integer/g' \
-      | bgzip \
-      > ~{prefix}.~{contig}.unclustered.vcf.gz
-    tabix ~{prefix}.~{contig}.unclustered.vcf.gz
-  >>>
-
-  output {
-    File out = "~{prefix}.~{contig}.unclustered.vcf.gz"
-    File out_index = "~{prefix}.~{contig}.unclustered.vcf.gz.tbi"
-  }
-}
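The sed chain in FixEvidenceTags works because the integer EV codes are bit flags (RD=1, PE=2, SR=4) and comma-joined combinations simply sum. A hedged Python restatement of the mapping (the helper name is illustrative, not part of the pipeline):

# EV evidence types as bit flags; combinations sum to the integers the sed
# substitutions hard-code (RD,PE -> 3, RD,SR -> 5, PE,SR -> 6, RD,PE,SR -> 7).
EV_BITS = {"RD": 1, "PE": 2, "SR": 4}

def ev_to_int(ev):
    return sum(EV_BITS[token] for token in ev.split(","))

assert [ev_to_int(s) for s in ("RD", "PE", "SR", "RD,PE", "RD,SR", "PE,SR", "RD,PE,SR")] == [1, 2, 4, 3, 5, 6, 7]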
-
-# Find intersection of Variant IDs from vid_list with those present in vcf, return as filtered_vid_list
-task SubsetVariantList {
-  input {
-    File vid_list
-    Int vid_col
-    File vcf
-    String outfile_name
-    String sv_base_mini_docker
-    RuntimeAttr? runtime_attr_override
-  }
-
-  # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to
-  # be held in memory or disk while working, potentially in a form that takes up more space)
-  RuntimeAttr runtime_default = object {
-    mem_gb: 3.75,
-    disk_gb: ceil(10.0 + size(vid_list, "GB") * 2.0 + size(vcf, "GB")),
-    cpu_cores: 1,
-    preemptible_tries: 3,
-    max_retries: 1,
-    boot_disk_gb: 10
-  }
-  RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
-  runtime {
-    memory: select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GB"
-    disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " HDD"
-    cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-    preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-    maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
-    docker: sv_base_mini_docker
-    bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
-  }
-
-  command <<<
-    set -euo pipefail
-    zgrep -v "^#" ~{vcf} | cut -f3 > valid_vids.list
-    awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]; next} {if ($~{vid_col} in inFileA) print }' valid_vids.list ~{vid_list} \
-      > ~{outfile_name}
-  >>>
-
-  output {
-    File filtered_vid_list = outfile_name
-  }
-}
\ No newline at end of file
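For readers following the deleted SubsetVariantList task: the zgrep/awk pair keeps only the rows of vid_list whose vid_col-th column names a variant ID present in the VCF. A rough Python equivalent, assuming a bgzipped VCF (function and variable names are illustrative):

import gzip

def subset_variant_list(vcf_path, vid_list_path, vid_col, out_path):
    # Collect the ID column (third field) of every record in the VCF.
    with gzip.open(vcf_path, "rt") as vcf:
        valid = {l.split("\t")[2] for l in vcf if not l.startswith("#")}
    # Keep vid_list rows whose vid_col-th column (1-based, as in the awk) matches.
    with open(vid_list_path) as fin, open(out_path, "w") as fout:
        for line in fin:
            if line.split("\t")[vid_col - 1].strip() in valid:
                fout.write(line)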