From 775caaae46150e23e0b24906b835688fbae1eafd Mon Sep 17 00:00:00 2001 From: Rapsssito Date: Wed, 6 Sep 2023 13:59:41 +0200 Subject: [PATCH] feat: Complete pipeline designer --- modules/oncoliner_assesment/README.md | 11 +- modules/oncoliner_harmonization/README.md | 8 + modules/oncoliner_improvement/README.md | 7 + tools/pipeline_designer/.gitignore | 1 - tools/pipeline_designer/README.md | 158 +++++++++++++++- tools/pipeline_designer/TODO | 0 tools/pipeline_designer/example.sh | 9 - tools/pipeline_designer/example/.gitignore | 1 + tools/pipeline_designer/example/example.sh | 9 + tools/pipeline_designer/example/fake_ref.fa | 1 + .../pipeline_designer/example/fake_ref.fa.fai | 1 + .../sample_1/variant_caller_1_sample_1.vcf | 169 ++++++++++++++++++ .../sample_2/variant_caller_1_sample_2.vcf | 168 +++++++++++++++++ .../sample_1/variant_caller_2_sample_1.vcf | 168 +++++++++++++++++ .../sample_2/variant_caller_2_sample_2.vcf | 167 +++++++++++++++++ .../input/truth/sample_1/truth_sample_1.vcf | 169 ++++++++++++++++++ .../input/truth/sample_2/truth_sample_2.vcf | 168 +++++++++++++++++ tools/pipeline_designer/src/main.py | 77 ++++---- 18 files changed, 1238 insertions(+), 54 deletions(-) delete mode 100644 tools/pipeline_designer/TODO delete mode 100644 tools/pipeline_designer/example.sh create mode 100644 tools/pipeline_designer/example/.gitignore create mode 100644 tools/pipeline_designer/example/example.sh create mode 120000 tools/pipeline_designer/example/fake_ref.fa create mode 120000 tools/pipeline_designer/example/fake_ref.fa.fai create mode 100644 tools/pipeline_designer/example/input/test/variant_caller_1/sample_1/variant_caller_1_sample_1.vcf create mode 100644 tools/pipeline_designer/example/input/test/variant_caller_1/sample_2/variant_caller_1_sample_2.vcf create mode 100644 tools/pipeline_designer/example/input/test/variant_caller_2/sample_1/variant_caller_2_sample_1.vcf create mode 100644 tools/pipeline_designer/example/input/test/variant_caller_2/sample_2/variant_caller_2_sample_2.vcf create mode 100644 tools/pipeline_designer/example/input/truth/sample_1/truth_sample_1.vcf create mode 100644 tools/pipeline_designer/example/input/truth/sample_2/truth_sample_2.vcf diff --git a/modules/oncoliner_assesment/README.md b/modules/oncoliner_assesment/README.md index 9039e03..c24d36b 100644 --- a/modules/oncoliner_assesment/README.md +++ b/modules/oncoliner_assesment/README.md @@ -20,12 +20,15 @@ It is written in Python 3 (**requires Python version 3.6 or higher**). Oncoliner's assesment module makes use of the following Python modules: * [`pandas`](https://pandas.pydata.org/) * [`pysam`](https://github.com/pysam-developers/pysam) +* [`variant-extractor`](https://github.com/EUCANCan/variant-extractor) You may install them using pip: ``` -pip3 install pandas pysam +pip3 install pandas pysam variant-extractor ``` +However, we recommend using the provided [Dockerfile](../../Dockerfile)/[Singularity recipe](../../singularity.def) for building the whole Oncoliner suite to avoid dependency issues. + ## Functional analysis The module will try to obtain the genes affected by the variants from the `INFO` field in the truth files. **WARNING: Oncoliner does not compute genes linked to false positives.** Oncoliner's assesment module is compatible with the following functional analysis tools annotations: @@ -88,8 +91,8 @@ options: * `{OUTPUT_PREFIX}fp.[snv|indel|sv].vcf.gz`: VCF files with the false positives (FP) variants. One file per variant type (SNV, indel and SV). * `{OUTPUT_PREFIX}fn.[snv|indel|sv].vcf.gz`: VCF files with the false negatives (FN) variants. One file per variant type (SNV, indel and SV). * `{OUTPUT_PREFIX}metrics.csv`: CSV file containing the metrics for the comparison of the test and truth VCF files. It contains the following columns: - * `variant_type`: variant type (SNV, indel or SV), as outputted by [VariantExtractor](https://github.com/EUCANCan/variant-extractor). - * `variant_size`: range of variant sizes for that particular row. + * `variant_type`: variant type, as outputted by [VariantExtractor](https://github.com/EUCANCan/variant-extractor). + * `variant_size`: range of variant sizes analyzed for that particular row. * `window_radius`: window radius used for the assessment. * `recall`: Recall. TP / (TP + FN). * `precision`: Precision. TP / (TP + FP). @@ -104,7 +107,7 @@ options: ### `assesment_bulk.py` -Wrapper for `assesment_main.py`. It allows to compare a series of (VCF/BCF/VCF.GZ) files generated by any variant callers against a series of (VCF/BCF/VCF.GZ) truth files for **multiple samples**. It takes advantage of multiple processors. It is provided as a standalone command line tool. Example of usage: +Wrapper for `assesment_main.py`. It allows to compare a series of (VCF/BCF/VCF.GZ) files generated by any variant callers against a series of (VCF/BCF/VCF.GZ) truth files for **multiple samples**. It takes advantage of multiple processors and is also able to recover from a previous execution (if the execution was interrupted). It is provided as a standalone command line tool. Example of usage: ``` python3 -O src/assesment_main.py -c config.tsv -o output_ diff --git a/modules/oncoliner_harmonization/README.md b/modules/oncoliner_harmonization/README.md index 493df95..88979c2 100644 --- a/modules/oncoliner_harmonization/README.md +++ b/modules/oncoliner_harmonization/README.md @@ -8,3 +8,11 @@ WIP Oncoliner's harmonization module makes use of the following Python modules: * [`pandas`](https://pandas.pydata.org/) * [`pysam`](https://github.com/pysam-developers/pysam) +* [`variant-extractor`](https://github.com/EUCANCan/variant-extractor) + +You may install them using pip: +``` +pip3 install pandas pysam variant-extractor +``` + +However, we recommend using the provided [Dockerfile](../../Dockerfile)/[Singularity recipe](../../singularity.def) for building the whole Oncoliner suite to avoid dependency issues. diff --git a/modules/oncoliner_improvement/README.md b/modules/oncoliner_improvement/README.md index e04b8cd..6a0bd02 100644 --- a/modules/oncoliner_improvement/README.md +++ b/modules/oncoliner_improvement/README.md @@ -14,7 +14,14 @@ WIP Oncoliner's improvement module makes use of the following Python modules: * [`pandas`](https://pandas.pydata.org/) * [`pysam`](https://github.com/pysam-developers/pysam) +* [`variant-extractor`](https://github.com/EUCANCan/variant-extractor) +You may install them using pip: +``` +pip3 install pandas pysam variant-extractor +``` + +However, we recommend using the provided [Dockerfile](../../Dockerfile)/[Singularity recipe](../../singularity.def) for building the whole Oncoliner suite to avoid dependency issues. ## Usage diff --git a/tools/pipeline_designer/.gitignore b/tools/pipeline_designer/.gitignore index a4ff7ff..68bc17f 100644 --- a/tools/pipeline_designer/.gitignore +++ b/tools/pipeline_designer/.gitignore @@ -158,4 +158,3 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ -example \ No newline at end of file diff --git a/tools/pipeline_designer/README.md b/tools/pipeline_designer/README.md index 3bcf116..3340973 100644 --- a/tools/pipeline_designer/README.md +++ b/tools/pipeline_designer/README.md @@ -1 +1,157 @@ -# pipeline-designer \ No newline at end of file +# Pipeline Designer + +## Table of contents +- [Dependencies](#dependencies) +- [Usage](#usage) + - [Interface](#interface) + - [Output](#output) +- [Use case example](#use-case-example) + + +## Dependencies +Oncoliner's pipeline designer makes use of the following Python modules: +* [`pandas`](https://pandas.pydata.org/) +* [`pysam`](https://github.com/pysam-developers/pysam) +* [`pysam`](https://github.com/pysam-developers/pysam) + +You may install them using pip: +``` +pip3 install pandas pysam variant-extractor +``` + +However, we recommend using the provided [Dockerfile](../../Dockerfile)/[Singularity recipe](../../singularity.def) for building the whole Oncoliner suite to avoid dependency issues. + +## Usage + +**WARNING**: It is recommended to normalize indels and SNVs for each variant caller before executing the pipeline designer. For this purpose, we recommend using pre.py from [Illumina's Haplotype Comparison Tools (hap.py)](https://github.com/Illumina/hap.py). We provide an standalone and containerized **[EUCANCan's pre.py wrapper](https://github.com/EUCANCan/prepy-wrapper)** for this purpose. + +The main executable code is in the [`src/`](/src/) folder. There is one executable file: [`main.py`](/src/main.py). It is provided as a standalone command line tool. Example of usage: + +```bash +python3 src/main.py -t ./input/truth -v ./input/test -o ./output + -f ./fake_ref.fa \ + -rs sample_1 \ + -ps sample_2 \ + -p 32 \ + --max-combinations 5 +``` + +Check the example of usage in the [example](./example/) folder for more information. + +### Interface +``` +usage: main.py [-h] -t TRUTH -v TEST -o OUTPUT -f FASTA_REF -rs RECALL_SAMPLES [RECALL_SAMPLES ...] -ps PRECISION_SAMPLES [PRECISION_SAMPLES ...] [-it INDEL_THRESHOLD] [-wr WINDOW_RADIUS] + [--sv-size-bins SV_SIZE_BINS [SV_SIZE_BINS ...]] [--contigs CONTIGS [CONTIGS ...]] [-p PROCESSES] [--max-combinations MAX_COMBINATIONS] + +Pipeline designer + +options: + -h, --help show this help message and exit + -t TRUTH, --truth TRUTH + Path to the VCF truth folder + -v TEST, --test TEST Path to the VCF test folder + -o OUTPUT, --output OUTPUT + Path to the output folder + -f FASTA_REF, --fasta-ref FASTA_REF + Path to reference FASTA file + -rs RECALL_SAMPLES [RECALL_SAMPLES ...], --recall-samples RECALL_SAMPLES [RECALL_SAMPLES ...] + Recall samples names + -ps PRECISION_SAMPLES [PRECISION_SAMPLES ...], --precision-samples PRECISION_SAMPLES [PRECISION_SAMPLES ...] + Precision samples names + -it INDEL_THRESHOLD, --indel-threshold INDEL_THRESHOLD + Indel threshold, inclusive (default=100) + -wr WINDOW_RADIUS, --window-radius WINDOW_RADIUS + Window ratio (default=100) + --sv-size-bins SV_SIZE_BINS [SV_SIZE_BINS ...] + SV size bins for the output_prefix metrics (default=[500]) + --contigs CONTIGS [CONTIGS ...] + Contigs to process (default=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y']) + -p PROCESSES, --processes PROCESSES + Number of processes to use + --max-combinations MAX_COMBINATIONS + Maximum number of combinations to perform (-1) for all +``` + +### Output + +The pipeline designer will generate a series of files in the output folder. Most of them are intermediate files that are used by the pipeline designer to generate the final output files and recover in case of failure. The most important ones are the `.csv` files in `$OUTPUT_FOLDER/improvement_list`. + +Each output `.csv` file is named after the variant type and the variant size (e.g `SNV_1.csv` contains the callers combinations results for SNVs of size 1). Each file contains the following columns: + +* `operation`: The combination performed (e.g. `variant_caller_1$or$variant_caller_2`, which means that the combination is the union of the results of `variant_caller_1` and `variant_caller_2`). +* `variant_type`: variant type, as outputted by [VariantExtractor](https://github.com/EUCANCan/variant-extractor). +* `variant_size`: range of variant sizes analyzed for that particular file. +* `window_radius`: window radius used for the assessment. +* `recall`: Recall. TP / (TP + FN). Calculated **only** from the recall samples. +* `precision`: Precision. TP / (TP + FP). Calculated **only** from the precision samples. +* `f1_score`: F1 score. 2 * (precision * recall) / (precision + recall). +* `tp`: Number of true positives. Calculated **only** from the recall samples. +* `fp`: Number of false positives. Calculated **only** from the precision samples. +* `fn`: Number of false negatives. Calculated **only** from the recall samples. +* `protein_affected_genes_count`: Number of genes affected by the variants. +* `protein_affected_driver_genes_count`: Number of cancer driver genes affected by the variants. +* `protein_affected_genes`: List of genes affected by the variants (separated by `;`). +* `protein_affected_driver_genes`: List of cancer driver genes affected by the variants (separated by `;`). + +## Use case example + +Assume that we have the following input variant callers: `variant_caller_1` and `variant_caller_2`. Both are SV callers. We want to combine them to improve the results of our pipeline. We have the following samples: `sample_1` and `sample_2`. We will use `sample_1` as a recall sample and `sample_2` as a precision sample. + +First of all, we need to run the variant callers and obtain the VCF files for each sample. Assume we obtain the following VCF files: `variant_caller_1.vcf` and `variant_caller_2.vcf` for each sample. **Make sure the names of the VCF files are the same across all the samples**. + +**Optional**. We recommend normalizing the VCF files before running the pipeline designer (see [Usage](#usage) for more information). However, in this case it is not necessary because we are only working with SVs. + +Now, we can run the pipeline designer. We will use the following command: + +```bash +python3 src/main.py -t ./input/truth -v ./input/test -o ./output + -f ./genome.fa \ + -rs sample_1 \ + -ps sample_2 \ + -p 32 \ + --max-combinations 5 +``` + +The `input` folder must have the following structure: +``` +input +├── truth +│   ├── sample_1 +│   │   └── truth_sample_1.vcf +│   └── sample_2 +│   └── truth_sample_2.vcf +└── test + ├── variant_caller_1 + │ ├── sample_1 + │   │  └── variant_caller_1_sample_1.vcf + │   └── sample_2 + │     └── variant_caller_1_sample_2.vcf + └── variant_caller_2 + ├── sample_1 + │ └── variant_caller_2_sample_1.vcf + └── sample_2 + └── variant_caller_2_sample_2.vcf +``` + +After running the pipeline designer, we will obtain the following output folder structure: +``` +output +├── ... +└── improvement_list + ├── SNV_1.csv + ├── INDEL_1__100.csv + ├── ... + └── SV_ALL.csv +``` + +We are looking for the `SV_ALL.csv` file. This file contains the results of the combinations of all the SVs. The file may look like this: + +| operation | variant_type | variant_size | recall | precision | f1_score | ... | num_callers | +| --------------------------------------- | ------------ | ------------ | ------ | --------- | -------- | --- | ----------- | +| `variant_caller_1$or$variant_caller_2` | SV | ALL | 1.00 | 0.50 | 0.67 | ... | 2 | +| `variant_caller_2` | SV | ALL | 0.67 | 0.50 | 0.57 | ... | 1 | +| `variant_caller_1$and$variant_caller_2` | SV | ALL | 0.33 | 1.00 | 0.5 | ... | 2 | +| `variant_caller_1` | SV | ALL | 0.67 | 1.00 | 0.8 | ... | 1 | + + +In our case, the best option for maximizing the F1 score is to use `variant_caller_1` alone. However, we can see that the union of `variant_caller_1` and `variant_caller_2` has a higher recall and that `variant_caller_1` alone has a higher precision. Selecting one option or another will depend on the use case. diff --git a/tools/pipeline_designer/TODO b/tools/pipeline_designer/TODO deleted file mode 100644 index e69de29..0000000 diff --git a/tools/pipeline_designer/example.sh b/tools/pipeline_designer/example.sh deleted file mode 100644 index bf3fe9d..0000000 --- a/tools/pipeline_designer/example.sh +++ /dev/null @@ -1,9 +0,0 @@ - -export EVALUATOR_COMMAND='python3 ./example/variant-evaluator/src/variant_evaluator/main.py' -python3 src/main.py -t ./example/input/truth -v ./example/input/test -o ./example/output \ - -f ./example/input/genome.fa \ - -rs PCAWG_pilot_1 PCAWG_pilot_2 \ - -ps PCAWG_pilot_2 \ - -p 32 \ - --max-combinations 5 - diff --git a/tools/pipeline_designer/example/.gitignore b/tools/pipeline_designer/example/.gitignore new file mode 100644 index 0000000..6caf68a --- /dev/null +++ b/tools/pipeline_designer/example/.gitignore @@ -0,0 +1 @@ +output \ No newline at end of file diff --git a/tools/pipeline_designer/example/example.sh b/tools/pipeline_designer/example/example.sh new file mode 100644 index 0000000..8bbd4a8 --- /dev/null +++ b/tools/pipeline_designer/example/example.sh @@ -0,0 +1,9 @@ + +export ASSESMENT_COMMAND='python3 ../../../modules/oncoliner_assesment/src/assesment_main.py' +python3 ../src/main.py -t ./input/truth -v ./input/test -o ./output \ + -f ./fake_ref.fa \ + -rs sample_1 \ + -ps sample_2 \ + -p 32 \ + --max-combinations 5 + diff --git a/tools/pipeline_designer/example/fake_ref.fa b/tools/pipeline_designer/example/fake_ref.fa new file mode 120000 index 0000000..a4394f0 --- /dev/null +++ b/tools/pipeline_designer/example/fake_ref.fa @@ -0,0 +1 @@ +../../../modules/oncoliner_assesment/examples/fake_ref.fa \ No newline at end of file diff --git a/tools/pipeline_designer/example/fake_ref.fa.fai b/tools/pipeline_designer/example/fake_ref.fa.fai new file mode 120000 index 0000000..12fcfb7 --- /dev/null +++ b/tools/pipeline_designer/example/fake_ref.fa.fai @@ -0,0 +1 @@ +../../../modules/oncoliner_assesment/examples/fake_ref.fa.fai \ No newline at end of file diff --git a/tools/pipeline_designer/example/input/test/variant_caller_1/sample_1/variant_caller_1_sample_1.vcf b/tools/pipeline_designer/example/input/test/variant_caller_1/sample_1/variant_caller_1_sample_1.vcf new file mode 100644 index 0000000..3a39e2f --- /dev/null +++ b/tools/pipeline_designer/example/input/test/variant_caller_1/sample_1/variant_caller_1_sample_1.vcf @@ -0,0 +1,169 @@ +##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine= +##GATKCommandLine= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##MutectVersion=2.2 +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##filtering_status=These calls have been filtered by FilterMutectCalls to label false positives with a list of failed filters and true positives with PASS. +##normal_sample=COLO829N +##source=FilterMutectCalls +##source=Mutect2 +##tumor_sample=COLO829T +##bcftools_normVersion=1.10.2+htslib-1.10.2 +##bcftools_normCommand=norm -Oz -m -both -f hg19.fa --threads 8 -o COLO829T_vs_COLO829N_Mutect2_filtered_pass_norm.vcf.gz COLO829T_vs_COLO829N_Mutect2_filtered_pass.vcf.gz; Date=Thu Jul 8 16:14:34 2021 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT COLO829N COLO829T +1 900 . T T]1:3000] . PASS FP;TEST_VARIANT;AS_SB_TABLE=16,23|7,7;DP=55;ECNT=1;GERMQ=6;MBQ=38,34;MFRL=531,532;MMQ=60,38;MPOS=14;NALOD=1.23;NLOD=4.77;POPAF=1.54;TLOD=33.58 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:16,0:0.056:16:4,0:11,0:7,9,0,0 0/1:23,14:0.38:37:13,5:10,7:9,14,7,7 +1 1100 . T T]1:2100] . PASS TP;AS_FilterStatus=SITE;AS_SB_TABLE=16,23|7,7;DP=55;ECNT=1;GERMQ=6;MBQ=38,34;MFRL=531,532;MMQ=60,38;MPOS=14;NALOD=1.23;NLOD=4.77;POPAF=1.54;TLOD=33.58 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:16,0:0.056:16:4,0:11,0:7,9,0,0 0/1:23,14:0.38:37:13,5:10,7:9,14,7,7 +2 2000 . T T]2:3000] . PASS FP;TEST_VARIANT;AS_SB_TABLE=16,23|7,7;DP=55;ECNT=1;GERMQ=6;MBQ=38,34;MFRL=531,532;MMQ=60,38;MPOS=14;NALOD=1.23;NLOD=4.77;POPAF=1.54;TLOD=33.58 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:16,0:0.056:16:4,0:11,0:7,9,0,0 0/1:23,14:0.38:37:13,5:10,7:9,14,7,7 +1 2000 . T T]2:3000] . PASS AS_FilterStatus=SITE;AS_SB_TABLE=16,23|7,7;DP=55;ECNT=1;GERMQ=6;MBQ=38,34;MFRL=531,532;MMQ=60,38;MPOS=14;NALOD=1.23;NLOD=4.77;POPAF=1.54;TLOD=33.58 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:16,0:0.056:16:4,0:11,0:7,9,0,0 0/1:23,14:0.38:37:13,5:10,7:9,14,7,7 diff --git a/tools/pipeline_designer/example/input/test/variant_caller_1/sample_2/variant_caller_1_sample_2.vcf b/tools/pipeline_designer/example/input/test/variant_caller_1/sample_2/variant_caller_1_sample_2.vcf new file mode 100644 index 0000000..41e7a29 --- /dev/null +++ b/tools/pipeline_designer/example/input/test/variant_caller_1/sample_2/variant_caller_1_sample_2.vcf @@ -0,0 +1,168 @@ +##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine= +##GATKCommandLine= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##MutectVersion=2.2 +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##filtering_status=These calls have been filtered by FilterMutectCalls to label false positives with a list of failed filters and true positives with PASS. +##normal_sample=COLO829N +##source=FilterMutectCalls +##source=Mutect2 +##tumor_sample=COLO829T +##bcftools_normVersion=1.10.2+htslib-1.10.2 +##bcftools_normCommand=norm -Oz -m -both -f hg19.fa --threads 8 -o COLO829T_vs_COLO829N_Mutect2_filtered_pass_norm.vcf.gz COLO829T_vs_COLO829N_Mutect2_filtered_pass.vcf.gz; Date=Thu Jul 8 16:14:34 2021 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT COLO829N COLO829T +1 63 . A C . PASS FP;TEST_VARIANT;AS_SB_TABLE=88,80|4,3;DP=176;ECNT=1;GERMQ=93;MBQ=36,35;MFRL=542,675;MMQ=40,34;MPOS=15;NALOD=1.63;NLOD=12.59;POPAF=0.903;TLOD=10.11 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:42,0:0.023:42:17,0:24,0:19,23,0,0 0/1:126,7:0.06:133:69,3:56,4:69,57,4,3 +2 1000 . T T]2:3000] . PASS FP;AS_FilterStatus=SITE;AS_SB_TABLE=16,23|7,7;DP=55;ECNT=1;GERMQ=6;MBQ=38,34;MFRL=531,532;MMQ=60,38;MPOS=14;NALOD=1.23;NLOD=4.77;POPAF=1.54;TLOD=33.58 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:16,0:0.056:16:4,0:11,0:7,9,0,0 0/1:23,14:0.38:37:13,5:10,7:9,14,7,7 +2 900 . T T]2:3000] . PASS TP;AS_FilterStatus=SITE;AS_SB_TABLE=16,23|7,7;DP=55;ECNT=1;GERMQ=6;MBQ=38,34;MFRL=531,532;MMQ=60,38;MPOS=14;NALOD=1.23;NLOD=4.77;POPAF=1.54;TLOD=33.58 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:16,0:0.056:16:4,0:11,0:7,9,0,0 0/1:23,14:0.38:37:13,5:10,7:9,14,7,7 diff --git a/tools/pipeline_designer/example/input/test/variant_caller_2/sample_1/variant_caller_2_sample_1.vcf b/tools/pipeline_designer/example/input/test/variant_caller_2/sample_1/variant_caller_2_sample_1.vcf new file mode 100644 index 0000000..99fbcd9 --- /dev/null +++ b/tools/pipeline_designer/example/input/test/variant_caller_2/sample_1/variant_caller_2_sample_1.vcf @@ -0,0 +1,168 @@ +##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine= +##GATKCommandLine= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##MutectVersion=2.2 +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##filtering_status=These calls have been filtered by FilterMutectCalls to label false positives with a list of failed filters and true positives with PASS. +##normal_sample=COLO829N +##source=FilterMutectCalls +##source=Mutect2 +##tumor_sample=COLO829T +##bcftools_normVersion=1.10.2+htslib-1.10.2 +##bcftools_normCommand=norm -Oz -m -both -f hg19.fa --threads 8 -o COLO829T_vs_COLO829N_Mutect2_filtered_pass_norm.vcf.gz COLO829T_vs_COLO829N_Mutect2_filtered_pass.vcf.gz; Date=Thu Jul 8 16:14:34 2021 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT COLO829N COLO829T +1 3000 . T T]1:6000] . PASS FP;AS_FilterStatus=SITE;AS_SB_TABLE=16,23|7,7;DP=55;ECNT=1;GERMQ=6;MBQ=38,34;MFRL=531,532;MMQ=60,38;MPOS=14;NALOD=1.23;NLOD=4.77;POPAF=1.54;TLOD=33.58 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:16,0:0.056:16:4,0:11,0:7,9,0,0 0/1:23,14:0.38:37:13,5:10,7:9,14,7,7 +1 1100 . T T]1:2100] . PASS TP;AS_FilterStatus=SITE;AS_SB_TABLE=16,23|7,7;DP=55;ECNT=1;GERMQ=6;MBQ=38,34;MFRL=531,532;MMQ=60,38;MPOS=14;NALOD=1.23;NLOD=4.77;POPAF=1.54;TLOD=33.58 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:16,0:0.056:16:4,0:11,0:7,9,0,0 0/1:23,14:0.38:37:13,5:10,7:9,14,7,7 +2 1000 . T T]2:3000] . PASS TP;AS_FilterStatus=SITE;AS_SB_TABLE=16,23|7,7;DP=55;ECNT=1;GERMQ=6;MBQ=38,34;MFRL=531,532;MMQ=60,38;MPOS=14;NALOD=1.23;NLOD=4.77;POPAF=1.54;TLOD=33.58 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:16,0:0.056:16:4,0:11,0:7,9,0,0 0/1:23,14:0.38:37:13,5:10,7:9,14,7,7 diff --git a/tools/pipeline_designer/example/input/test/variant_caller_2/sample_2/variant_caller_2_sample_2.vcf b/tools/pipeline_designer/example/input/test/variant_caller_2/sample_2/variant_caller_2_sample_2.vcf new file mode 100644 index 0000000..3e25a52 --- /dev/null +++ b/tools/pipeline_designer/example/input/test/variant_caller_2/sample_2/variant_caller_2_sample_2.vcf @@ -0,0 +1,167 @@ +##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine= +##GATKCommandLine= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##MutectVersion=2.2 +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##filtering_status=These calls have been filtered by FilterMutectCalls to label false positives with a list of failed filters and true positives with PASS. +##normal_sample=COLO829N +##source=FilterMutectCalls +##source=Mutect2 +##tumor_sample=COLO829T +##bcftools_normVersion=1.10.2+htslib-1.10.2 +##bcftools_normCommand=norm -Oz -m -both -f hg19.fa --threads 8 -o COLO829T_vs_COLO829N_Mutect2_filtered_pass_norm.vcf.gz COLO829T_vs_COLO829N_Mutect2_filtered_pass.vcf.gz; Date=Thu Jul 8 16:14:34 2021 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT COLO829N COLO829T +2 900 . T T]2:3000] . PASS TP;AS_FilterStatus=SITE;AS_SB_TABLE=16,23|7,7;DP=55;ECNT=1;GERMQ=6;MBQ=38,34;MFRL=531,532;MMQ=60,38;MPOS=14;NALOD=1.23;NLOD=4.77;POPAF=1.54;TLOD=33.58 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:16,0:0.056:16:4,0:11,0:7,9,0,0 0/1:23,14:0.38:37:13,5:10,7:9,14,7,7 +2 2102 . T T]2:2200] . PASS FP;AS_FilterStatus=SITE;AS_SB_TABLE=16,23|7,7;DP=55;ECNT=1;GERMQ=6;MBQ=38,34;MFRL=531,532;MMQ=60,38;MPOS=14;NALOD=1.23;NLOD=4.77;POPAF=1.54;TLOD=33.58 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:16,0:0.056:16:4,0:11,0:7,9,0,0 0/1:23,14:0.38:37:13,5:10,7:9,14,7,7 diff --git a/tools/pipeline_designer/example/input/truth/sample_1/truth_sample_1.vcf b/tools/pipeline_designer/example/input/truth/sample_1/truth_sample_1.vcf new file mode 100644 index 0000000..4f60627 --- /dev/null +++ b/tools/pipeline_designer/example/input/truth/sample_1/truth_sample_1.vcf @@ -0,0 +1,169 @@ +##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine= +##GATKCommandLine= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##MutectVersion=2.2 +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##filtering_status=These calls have been filtered by FilterMutectCalls to label false positives with a list of failed filters and true positives with PASS. +##normal_sample=COLO829N +##source=FilterMutectCalls +##source=Mutect2 +##tumor_sample=COLO829T +##bcftools_normVersion=1.10.2+htslib-1.10.2 +##bcftools_normCommand=norm -Oz -m -both -f hg19.fa --threads 8 -o COLO829T_vs_COLO829N_Mutect2_filtered_pass_norm.vcf.gz COLO829T_vs_COLO829N_Mutect2_filtered_pass.vcf.gz; Date=Thu Jul 8 16:14:34 2021 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT COLO829N COLO829T +1 63 . A C . PASS AS_FilterStatus=SITE;CSQ=ENSG00000142611|ZSWIM7|protein_altering_variant,ENSG00000142611|PRDM16|protein_altering_variant;AS_SB_TABLE=88,80|4,3;DP=176;ECNT=1;GERMQ=93;MBQ=36,35;MFRL=542,675;MMQ=40,34;MPOS=15;NALOD=1.63;NLOD=12.59;POPAF=0.903;TLOD=10.11 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:42,0:0.023:42:17,0:24,0:19,23,0,0 0/1:126,7:0.06:133:69,3:56,4:69,57,4,3 +1 1100 . T T]1:2100] . PASS AS_FilterStatus=SITE;AS_SB_TABLE=16,23|7,7;DP=55;ECNT=1;GERMQ=6;MBQ=38,34;MFRL=531,532;MMQ=60,38;MPOS=14;NALOD=1.23;NLOD=4.77;POPAF=1.54;TLOD=33.58 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:16,0:0.056:16:4,0:11,0:7,9,0,0 0/1:23,14:0.38:37:13,5:10,7:9,14,7,7 +2 1000 . T T]2:3000] . PASS AS_FilterStatus=SITE;AS_SB_TABLE=16,23|7,7;DP=55;ECNT=1;GERMQ=6;MBQ=38,34;MFRL=531,532;MMQ=60,38;MPOS=14;NALOD=1.23;NLOD=4.77;POPAF=1.54;TLOD=33.58 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:16,0:0.056:16:4,0:11,0:7,9,0,0 0/1:23,14:0.38:37:13,5:10,7:9,14,7,7 +1 2000 . T T]2:3000] . PASS AS_FilterStatus=SITE;AS_SB_TABLE=16,23|7,7;DP=55;ECNT=1;GERMQ=6;MBQ=38,34;MFRL=531,532;MMQ=60,38;MPOS=14;NALOD=1.23;NLOD=4.77;POPAF=1.54;TLOD=33.58 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:16,0:0.056:16:4,0:11,0:7,9,0,0 0/1:23,14:0.38:37:13,5:10,7:9,14,7,7 diff --git a/tools/pipeline_designer/example/input/truth/sample_2/truth_sample_2.vcf b/tools/pipeline_designer/example/input/truth/sample_2/truth_sample_2.vcf new file mode 100644 index 0000000..93eedd6 --- /dev/null +++ b/tools/pipeline_designer/example/input/truth/sample_2/truth_sample_2.vcf @@ -0,0 +1,168 @@ +##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine= +##GATKCommandLine= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##MutectVersion=2.2 +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##filtering_status=These calls have been filtered by FilterMutectCalls to label false positives with a list of failed filters and true positives with PASS. +##normal_sample=COLO829N +##source=FilterMutectCalls +##source=Mutect2 +##tumor_sample=COLO829T +##bcftools_normVersion=1.10.2+htslib-1.10.2 +##bcftools_normCommand=norm -Oz -m -both -f hg19.fa --threads 8 -o COLO829T_vs_COLO829N_Mutect2_filtered_pass_norm.vcf.gz COLO829T_vs_COLO829N_Mutect2_filtered_pass.vcf.gz; Date=Thu Jul 8 16:14:34 2021 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT COLO829N COLO829T +1 74 . T G . PASS AS_FilterStatus=SITE;AS_SB_TABLE=16,23|7,7;DP=55;ECNT=1;GERMQ=6;MBQ=38,34;MFRL=531,532;MMQ=60,38;MPOS=14;NALOD=1.23;NLOD=4.77;POPAF=1.54;TLOD=33.58 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:16,0:0.056:16:4,0:11,0:7,9,0,0 0/1:23,14:0.38:37:13,5:10,7:9,14,7,7 +1 900 . T T]1:3000] . PASS AS_FilterStatus=SITE;AS_SB_TABLE=16,23|7,7;DP=55;ECNT=1;GERMQ=6;MBQ=38,34;MFRL=531,532;MMQ=60,38;MPOS=14;NALOD=1.23;NLOD=4.77;POPAF=1.54;TLOD=33.58 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:16,0:0.056:16:4,0:11,0:7,9,0,0 0/1:23,14:0.38:37:13,5:10,7:9,14,7,7 +2 900 . T T]2:3000] . PASS AS_FilterStatus=SITE;AS_SB_TABLE=16,23|7,7;DP=55;ECNT=1;GERMQ=6;MBQ=38,34;MFRL=531,532;MMQ=60,38;MPOS=14;NALOD=1.23;NLOD=4.77;POPAF=1.54;TLOD=33.58 GT:AD:AF:DP:F1R2:F2R1:SB 0/0:16,0:0.056:16:4,0:11,0:7,9,0,0 0/1:23,14:0.38:37:13,5:10,7:9,14,7,7 diff --git a/tools/pipeline_designer/src/main.py b/tools/pipeline_designer/src/main.py index 76d5a93..635a417 100644 --- a/tools/pipeline_designer/src/main.py +++ b/tools/pipeline_designer/src/main.py @@ -10,10 +10,10 @@ from concurrent.futures import ProcessPoolExecutor, as_completed, ThreadPoolExecutor # Add vcf-ops to the path -sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), '..', '..', 'shared', 'vcf-ops','src')) +sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), '..', '..', '..', 'shared', 'vcf_ops', 'src')) from vcf_ops.constants import DEFAULT_INDEL_THRESHOLD, DEFAULT_WINDOW_RADIUS, DEFAULT_SV_BINS, DEFAULT_CONTIGS # noqa -from vcf_ops.metrics import aggregate_metrics # noqa +from vcf_ops.metrics import aggregate_metrics, combine_precision_recall_metrics # noqa from vcf_ops.intersect import intersect # noqa from vcf_ops.union import union # noqa from vcf_ops.i_o import read_vcfs, write_masked_vcfs # noqa @@ -38,27 +38,26 @@ def extract_vcfs(path_prefix: str) -> pd.DataFrame: return read_vcfs(files) -def evaluate_sample(truth_sample_folder, test_sample_folder, output_prefix, fasta_ref, indel_threshold, window_ratio, window_limit, sv_size_bins, contigs): +def evaluate_sample(truth_sample_folder, test_sample_folder, output_prefix, fasta_ref, indel_threshold, window_radius, sv_size_bins, contigs): # If output_prefix*metrics.csv exists and is not empty, skip output_metrics = glob.glob(output_prefix + '*metrics.csv') if len(output_metrics) > 0 and os.path.getsize(output_metrics[0]) > 0: return # Get evaluator path from the environment - evaluator_command = os.environ.get('EVALUATOR_COMMAND') - if evaluator_command is None: - raise Exception('EVALUATOR_COMMAND environment variable not defined') + assesment_command = os.environ.get('ASSESMENT_COMMAND') + if assesment_command is None: + raise Exception('ASSESMENT_COMMAND environment variable not defined') # Get files truth_sample_vcfs = get_vcf_files(truth_sample_folder + '/') test_sample_vcfs = get_vcf_files(test_sample_folder + '/') subprocess.check_call( - evaluator_command.split() + [ + assesment_command.split() + [ '-t'] + truth_sample_vcfs + [ '-v'] + test_sample_vcfs + [ '-o', output_prefix, '-f', fasta_ref, '-it', str(indel_threshold), - '-wr', str(window_ratio), - '-wl', str(window_limit), + '-wr', str(window_radius), '--sv-size-bins'] + [str(bin) for bin in sv_size_bins] + [ '--contigs'] + contigs ) @@ -127,7 +126,7 @@ def union_callers(caller_1_prefix: str, caller_2_prefix: str, output_prefix: str # Union df_union, _ = union(df_1, df_2, indel_threshold, window_radius) # Write VCF files - if len(df_union) != len(df_1): + if len(df_union) > len(df_1): write_masked_vcfs(df_union, output_prefix, indel_threshold, fasta_ref) return True return False @@ -200,14 +199,7 @@ def calculate_aggregated_metrics_caller(caller_folder: str, recall_samples: List recall_aggregated_metrics = aggregate_metrics([pd.read_csv(metrics_file) for metrics_file in recall_metrics_files]) precision_metrics_files = [glob.glob(os.path.join(caller_folder, sample, '*metrics.csv'))[0] for sample in precision_samples] precision_aggregated_metrics = aggregate_metrics([pd.read_csv(metrics_file) for metrics_file in precision_metrics_files]) - final_metrics = recall_aggregated_metrics - final_metrics['precision'] = precision_aggregated_metrics['precision'] - final_metrics['f1_score'] = 2 * final_metrics['recall'] * \ - final_metrics['precision'] / (final_metrics['recall'] + final_metrics['precision']) - final_metrics['f1_score'].fillna(0, inplace=True) - final_metrics['fp'] = precision_aggregated_metrics['fp'] - # Set columns - final_metrics = final_metrics[['variant_type', 'variant_size', 'window_size', 'tp', 'fp', 'fn', 'recall', 'precision', 'f1_score']] + final_metrics = combine_precision_recall_metrics(recall_aggregated_metrics, precision_aggregated_metrics) final_metrics.to_csv(output_file, index=False) @@ -257,19 +249,19 @@ def main(args): args.truth = os.path.abspath(args.truth) args.test = os.path.abspath(args.test) - # All subfolders (samples) in truth must be in test - if set(os.listdir(args.truth)) != set(os.listdir(args.test)): - raise ValueError('The samples in the truth and test folders are not the same') - - # All test subfolders (samples) must contain the same subfolders (callers) - callers_names = set(os.listdir(os.path.join(args.test, os.listdir(args.test)[0]))) - for folder in os.listdir(args.test): - if set(os.listdir(os.path.join(args.test, folder))) != callers_names: - raise ValueError(f'The samples in the test folder do not contain the same files: {folder}') - # Remove the extension from the callers names - callers_names = [caller.replace('.vcf.gz', '').replace('.bcf', '').replace('.vcf', '') for caller in callers_names] + # Get callers names + callers_names = os.listdir(args.test) callers_names.sort() + # All test folders (callers) must contain the same subfolders (samples) + for caller_folder in callers_names: + missing_truth_samples = set(os.listdir(args.truth)) - set(os.listdir(os.path.join(args.test, caller_folder))) + missing_test_samples = set(os.listdir(os.path.join(args.test, caller_folder))) - set(os.listdir(args.truth)) + if len(missing_truth_samples) > 0: + raise Exception(f'Caller {caller_folder} is missing samples from the truth folder: {missing_truth_samples}') + if len(missing_test_samples) > 0: + raise Exception(f'Caller {caller_folder} has more samples than the truth folder: {missing_test_samples}') + # Create output folder os.makedirs(args.output, exist_ok=True) # Create output folder for evaluations @@ -278,21 +270,26 @@ def main(args): # Create output folder for combinations output_combinations = os.path.join(args.output, 'combinations') os.makedirs(output_combinations, exist_ok=True) - # Copy each variant caller files to the output_combinations folder + # Evaluate the callers + original_caller_folders = [os.path.join(args.test, caller_name) for caller_name in callers_names] + evaluate_callers(original_caller_folders, args.truth, output_evaluations, args.processes, fasta_ref=args.fasta_ref, + indel_threshold=args.indel_threshold, window_radius=args.window_radius, + sv_size_bins=args.sv_size_bins, contigs=args.contigs) + + # Copy each variant caller TP+FP files to its corresponding output_combinations folder for caller_name in callers_names: caller_combination_folder = os.path.join(output_combinations, caller_name) os.makedirs(caller_combination_folder, exist_ok=True) - for sample in os.listdir(args.test): + for sample in os.listdir(os.path.join(args.test, caller_name)): caller_combination_sample_folder = os.path.join(caller_combination_folder, sample) os.makedirs(caller_combination_sample_folder, exist_ok=True) - for file in get_vcf_files(os.path.join(args.test, sample, caller_name)): + for file in get_vcf_files(os.path.join(output_evaluations, caller_name, sample, '')): + # Filter TP+FP files + if 'tp.' not in file and 'fp.' not in file: + continue shutil.copy(file, caller_combination_sample_folder) # Get callers prefixes callers_folders = [os.path.join(output_combinations, caller_name) for caller_name in os.listdir(output_combinations)] - # Evaluate the callers - evaluate_callers(callers_folders, args.truth, output_evaluations, args.processes, fasta_ref=args.fasta_ref, - indel_threshold=args.indel_threshold, window_radius=args.window_radius, - sv_size_bins=args.sv_size_bins, contigs=args.contigs) # Get the variant types for each caller # Create a dict variant_type -> callers callers_variant_types = { @@ -309,13 +306,15 @@ def main(args): callers_operations_by_variant_type[variant_type] = generate_combinations(callers, args.max_combinations) # Perform the operations for operations in callers_operations_by_variant_type.values(): + if len(operations) == 0: + continue execute_operations(operations, output_combinations, args.processes, fasta_ref=args.fasta_ref, - indel_threshold=args.indel_threshold, window_ratio=args.window_ratio, window_limit=args.window_limit) + indel_threshold=args.indel_threshold, window_radius=args.window_radius) # Evaluate the combinations combinations_prefixes = list(set(os.listdir(output_combinations)) - set(callers_folders)) combinations_folders = [os.path.join(output_combinations, combination_prefix) for combination_prefix in combinations_prefixes] evaluate_callers(combinations_folders, args.truth, output_evaluations, args.processes, fasta_ref=args.fasta_ref, - indel_threshold=args.indel_threshold, window_ratio=args.window_ratio, window_limit=args.window_limit, + indel_threshold=args.indel_threshold, window_radius=args.window_radius, sv_size_bins=args.sv_size_bins, contigs=args.contigs) # Calculate aggregated metrics @@ -328,7 +327,7 @@ def main(args): if __name__ == '__main__': - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(description='Pipeline designer') parser.add_argument('-t', '--truth', help='Path to the VCF truth folder', required=True, type=str) parser.add_argument('-v', '--test', help='Path to the VCF test folder', required=True, type=str) parser.add_argument('-o', '--output', help='Path to the output folder', required=True, type=str)