feat: annotation of structural variants (#46) (#52)

varfish-org · Apr 20, 2023 · 8da1d38 · 8da1d38
1 parent 68b87da
commit 8da1d38
Show file tree

Hide file tree

Showing 27 changed files with 3,674 additions and 70 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -29,7 +29,7 @@ clap-verbosity-flag = "2.0.0"
 csv = "1.2.0"
 flatbuffers = "23.1.21"
 flate2 = "1.0.25"
-hgvs = "0.6.1"
+hgvs = "0.6.2"
 lazy_static = "1.4.0"
 log = "0.4.17"
 noodles = { version = "0.34.0", features = ["core", "vcf", "bcf", "csi", "fasta", "bgzf", "tabix"] }
@@ -53,6 +53,13 @@ bincode = "1.3.3"
 bgzip = "0.3.1"
 rustc-hash = "1.1.0"
 quick_cache = "0.2.4"
+uuid = { version = "1.3.1", features = ["fast-rng", "serde"] }
+strum = { version = "0.24.1", features = ["derive"] }
+tempdir = "0.3.7"
+jsonl = "4.0.1"
+chrono = "0.4.24"
+rand_core = "0.6.4"
+rand = "0.8.5"
 
 [build-dependencies]
 flatc-rust = "0.2.0"

diff --git a/docs/anno_strucvars.md b/docs/anno_strucvars.md
@@ -1,3 +1,67 @@
 Annotating structural variants.
 
 TODO: implement functionality and documentation.
+
+# Information Extracted from Caller VCF
+
+This section describes the information that is extracted from the VCF files of the SV callers.
+
+Independent of the used caller, the following information is extracted:
+
+- Sample names are extracted from the VCF header.
+- Chromosome and start position are extracted from the VCF `CHROM` and `POS` columns.
+- End position for inversion and deletions are extracted from the `INFO/END` field.
+- Only one alternate allele is supported in the `ALT` column.
+- The SV type is extracted from the `ALT` allele and the `INFO/SVTYPE` field.
+- In the case of breakends, the alternate allele is parsed, e.g., `[17:198982[A` to derive the target chromosome and position as well as the orientation (e.g., 3' to 3' in this case).
+- Unless specified otherwise, confidence intervals around start and end positions are extracted from `INFO/CIPOS` and `INFO/CIEND`.
+- The genotype for each sample is extracted from the standard field `FORMAT/GT`.
+- The genotype quality is extracted from the standard field `FORMAT/GQ`.
+- Any genotype-specific filter values are extracted from the standard field `FORMAT/FT`.
+
+The following section describes the SV caller specific information that is extracted.
+
+## Delly
+
+The Delly caller and version is identified by the `INFO/SVMETHOD` field of the first record.
+
+- Paired-end variant read count is extracted from `FORMAT/DV`.
+- Paired-end reference read count is extracted from `FORMAT/DR`.
+- Split-read variant read count is extracted from `FORMAT/RV`.
+- Split-read reference read count is extracted from `FORMAT/RR`.
+
+## Manta
+
+The Manta caller and version is identified by the `##source=` VCF header.
+The string `GenerateSVCandidates <VERSION>` is used to identify Manta and the used version.
+
+- Paired-end reference and variant read count is extracted from `FORMAT/PR[0]` and `FORMAT/PR[1]`.
+- Split-end reference and variant read count is extracted from `FORMAT/SR[0]` and `FORMAT/SR[1]`.
+
+## GATK gCNV
+
+GATK gCNV is identified by looking at the `##source=` header line.
+The tool is identified by `PostprocessGermlineCNVCalls` but the version cannot be identified automatically.
+
+- Copy number is extracted from `FORMAT/CN`
+- Number of points is extracted from `FORMAT/NP`
+
+## PopDel
+
+The PopDel caller and version is identified by the `INFO/SVMETHOD` of the first record.
+
+- Paired-end reference support is extracted from `FORMAT/DAD[0]`.
+- Paired-end variant support is extracted from `FORMAT/DAD[3]`.
+
+## Dragen SV
+
+The Dragen SV caller is identical to Manta.
+However, here the caller and version are identified by looking at the `##source=DRAGEN <VERSION>` line.
+
+## Dragen CNV
+
+The Dragen CNV caller is identified by the header line `##DRAGENVersion=<ID=dragen,Version="SW: <VERSION>, HW: <IGNORED>">`.
+
+- Copy number is extracted from `FORMAT/CN`.
+- Number of points is extracted from `FORMAT/BC`.
+- Paired-end variant support is extracted from `FORMAT/PE`.
diff --git a/src/annotate/mod.rs b/src/annotate/mod.rs
@@ -1,3 +1,4 @@
 //! Annotation of VCF files.
 
 pub mod seqvars;
+pub mod strucvars;
diff --git a/src/annotate/seqvars/binning.rs b/src/annotate/seqvars/binning.rs
@@ -0,0 +1,30 @@
+//! UCSC binning scheme.
+
+// Offsets for UCSC binning.
+//
+// cf. http://genomewiki.ucsc.edu/index.php/Bin_indexing_system
+static BIN_OFFSETS: &[i32] = &[512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0];
+
+const BIN_FIRST_SHIFT: i32 = 17;
+
+const BIN_NEXT_SHIFT: i32 = 3;
+
+// Compute UCSC bin from 0-based half-open interval.
+pub fn bin_from_range(begin: i32, end: i32) -> Result<i32, anyhow::Error> {
+    let mut begin_bin = begin >> BIN_FIRST_SHIFT;
+    let mut end_bin = std::cmp::max(begin, end - 1) >> BIN_FIRST_SHIFT;
+
+    for offset in BIN_OFFSETS {
+        if begin_bin == end_bin {
+            return Ok(offset + begin_bin);
+        }
+        begin_bin >>= BIN_NEXT_SHIFT;
+        end_bin >>= BIN_NEXT_SHIFT;
+    }
+
+    anyhow::bail!(
+        "begin {}, end {} out of range in bin_from_range (max is 512M",
+        begin,
+        end
+    );
+}