updated skalo with indel output

bacpop · Feb 9, 2025 · f591fb4 · f591fb4
1 parent 3ce160e
commit f591fb4
Show file tree

Hide file tree

Showing 6 changed files with 283 additions and 72 deletions.
diff --git a/src/cli.rs b/src/cli.rs
@@ -28,7 +28,7 @@ pub const DEFAULT_MINQUAL: u8 = 20;
 /// Default quality filtering criteria
 pub const DEFAULT_QUALFILTER: QualFilter = QualFilter::Strict;
 /// Default -m for ska lo
-pub const DEFAULT_MISSING_SKALO: f32 = 0.2;
+pub const DEFAULT_MISSING_SKALO: f32 = 0.1;
 /// Default -d for ska lo
 pub const DEFAULT_MAX_PATHDEPTH: usize = 4;
 /// Deafult -n for ska lo

diff --git a/src/lib.rs b/src/lib.rs
@@ -193,22 +193,25 @@
 //!
 //! ## ska lo
 //!
-//! Converts split k-mers from a `.skf` file into a colored De Bruijn graph and infers SNPs from variant groups in
-//! reference-free mode (equivalent to SKA align). SNPs are only composed of ATGC variants (no ambigous nucleotides).
-//! Multithreading ('-t' argument) is not yet optimised - it usually takes 4 threads to halve runtimes.
+//! Converts split k-mers from a `.skf` file into a colored De Bruijn graph and infers indels from graph bubbles and SNPs from variant groups in
+//! reference-free mode (as with `ska align`). SNPs are only composed of ATGC variants (no ambiguous nucleotides). The same
+//! filtering applies to indels.
+//! Multithreading ('-t' argument) is not fully optimised - it usually takes 4 threads to halve runtimes.
 //!
-//! To generate a SNP alignment (here in a file named 'test_snps.fas'):
+//! To generate a SNP alignment and an indel VCF file (here named 'test_snps.fas' and 'test_indels.vcf'):
 //! ```bash
-//! ska lo -i seqs.skf -o test
+//! ska lo seqs.skf test
 //! ```
 //!
-//! ska lo can also position SNPs on a reference genome if provided using the '-r' argument. The reference genome should
-//! be in FASTA format and composed of a unique sequence. skalo lo will then generate, in addition to the SNP alignemnt, a
-//! VCF file and a pseudo-genome alignment (equivalent to SKA map) that can be used for recombination analyses. In such use case,
-//! we recommmend to increase the maximum proportion of missing data allowed per SNP ('-m' argument), but not above 0.5.
+//! It can also position SNPs on a reference genome if provided using the '-r' argument. The reference genome should
+//! be in FASTA format and composed of a unique sequence. `ska lo` will then generate, in addition to the SNP alignment and the indel
+//! VCF file, a SNP VCF file and a pseudo-genome alignment (as with `ska map`) that can be used for recombination analyses.
+//! In this use case, we recommend increasing the maximum proportion of missing data allowed per variant ('-m' argument), but not above 0.5.
 //! ```bash
-//! ska lo -i seqs.skf -o test -r reference.fas -m 0.4
+//! ska lo seqs.skf test -r reference.fas -m 0.4
 //! ```
+//! Please note that at the moment indels cannot be positioned on a reference genome.
+//!
 //!
 //! ### Efficiency
 //!
@@ -842,11 +845,11 @@ pub fn main() {
             };
 
             if let Ok(ska_array) = load_array::<u64>(&[input_skf.clone()], *threads) {
-                log::info!(" # read file {}", input_skf);
+                log::info!("Reading file {}", input_skf);
                 log::info!("Using 64-bit representation");
                 skalo(ska_array, config);
             } else if let Ok(ska_array) = load_array::<u128>(&[input_skf.clone()], *threads) {
-                log::info!(" # read file {}", input_skf);
+                log::info!("Reading file {}", input_skf);
                 log::info!("Using 128-bit representation");
                 skalo(ska_array, config);
             } else {

diff --git a/src/skalo/mod.rs b/src/skalo/mod.rs
@@ -2,8 +2,9 @@
 mod compaction;
 pub mod extremities;
 pub mod input;
-mod output;
+mod output_snps;
 mod positioning;
 mod process_variants;
 pub mod read_graph;
 pub mod utils;
+pub mod process_indels;
diff --git a/src/skalo/output.rs → src/skalo/output_snps.rs b/src/skalo/output.rs → src/skalo/output_snps.rs
@@ -14,7 +14,6 @@ pub fn create_fasta_and_vcf(
     variant_map: HashMap<u32, Vec<char>>,
     config: &Config,
 ) {
-    log::info!("Writting output files");
 
     // replace non-ATGCN characters with 'N' in genome_seq
     for base in genome_seq.iter_mut() {

diff --git a/src/skalo/process_indels.rs b/src/skalo/process_indels.rs
@@ -0,0 +1,234 @@
+//! indel processing
+use bit_set::BitSet;
+use hashbrown::{HashMap, HashSet};
+use std::fs::File;
+use std::io::{Write, BufWriter};
+
+use crate::ska_dict::bit_encoding::UInt;
+use crate::skalo::utils::{Config, DataInfo, VariantInfo};
+
+type VariantGroups<IntT> = HashMap<(IntT, IntT), Vec<VariantInfo>>;
+
+/// Filters indel groups, writes the retained ones to `<output_name>_indels.vcf` and
+/// returns the entry k-mers of the dereplicated indels (used later for SNP processing).
+///
+/// Groups are first dereplicated with `dereplicate_indels`. For every remaining group
+/// the sample set of each variant is looked up in `kmer_2_samples`, keyed by the
+/// encoded first `k_graph + 1` bases of the variant. A group is written only if:
+/// * each of its first two variants is found alone in at least one sample, and
+/// * the proportion of samples carrying neither or both of these variants does not
+///   exceed `config.max_missing`.
+///
+/// The variant present in the most samples is reported as REF, the next one as ALT.
+/// Groups for which a sample set cannot be found for every variant, or that hold fewer
+/// than two variants, are skipped and counted in a warning instead of causing a panic.
+///
+/// # Panics
+/// Panics if the VCF file cannot be created, written or flushed.
+pub fn process_indels<IntT: for<'a> UInt<'a>>(
+    indel_groups: VariantGroups<IntT>,
+    kmer_2_samples: &HashMap<IntT, BitSet>,
+    data_info: &DataInfo,
+    config: &Config,
+) -> HashSet<IntT> {
+    log::info!("Processing indels");
+
+    // dereplicate indels
+    let (final_indels, entries_indels) = dereplicate_indels(indel_groups, data_info.k_graph);
+
+    // create VCF output file
+    let vcf_filename = format!("{}_indels.vcf", config.output_name);
+    let file = File::create(&vcf_filename).expect("Unable to create VCF file");
+    let mut writer = BufWriter::new(file);
+
+    // write VCF header
+    // NOTE(review): the second line starts with a single '#', which strict VCF parsers
+    // may reject as a header line - confirm with the downstream tools before changing it.
+    writeln!(writer, "##fileformat=VCFv4.2").expect("Unable to write to VCF file");
+    writeln!(writer, "# REF corresponds to the most frequent variant among samples")
+        .expect("Unable to write to VCF file");
+    writeln!(
+        writer,
+        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}",
+        data_info.sample_names.join("\t")
+    )
+    .expect("Unable to write to VCF file");
+
+    let n_samples = data_info.sample_names.len();
+    let mut nb_indels = 0;
+    let mut nb_skipped = 0;
+
+    // consider indels one by one
+    for vec_variants in final_indels.values() {
+        // get the sample set of each variant; collecting into an `Option` keeps the
+        // sets aligned with the variants (a plain `filter_map` would shift them)
+        let lookup: Option<Vec<BitSet>> = vec_variants
+            .iter()
+            .map(|variant| {
+                let encoded_kmer =
+                    IntT::encode_kmer(&variant.sequence.get_range(0, data_info.k_graph + 1));
+                kmer_2_samples.get(&encoded_kmer).cloned()
+            })
+            .collect();
+        let bitset_vec = match lookup {
+            Some(sets) if sets.len() >= 2 => sets,
+            _ => {
+                nb_skipped += 1;
+                continue;
+            }
+        };
+
+        // count missing samples; samples carrying both variants (`0/1`) count as missing
+        let mut missing_samples: usize = 0;
+        let mut ref_present = false;
+        let mut alt_present = false;
+        for i in 0..n_samples {
+            match (bitset_vec[0].contains(i), bitset_vec[1].contains(i)) {
+                (true, false) => ref_present = true,
+                (false, true) => alt_present = true,
+                _ => missing_samples += 1,
+            }
+        }
+        let proportion_missing = missing_samples as f32 / n_samples as f32;
+
+        // filter on the proportion of missing data and only keep true variants
+        let keep = proportion_missing <= config.max_missing && ref_present && alt_present;
+        if !keep {
+            continue;
+        }
+        nb_indels += 1;
+
+        // get inserts and first/last k-mers
+        let (vec_inserts, last_kmer) = extract_middle_bases(vec_variants, data_info.k_graph);
+        let first_kmer = vec_variants[0].sequence.decode()[..data_info.k_graph].to_string();
+
+        // pair each insert with its sample set and order by decreasing sample count;
+        // the sort is stable, so ties keep the order of the group
+        let mut variants: Vec<(&str, usize, &BitSet)> = vec_inserts
+            .iter()
+            .zip(&bitset_vec)
+            .map(|(seq, bitset)| (seq.as_str(), bitset.len(), bitset))
+            .collect();
+        variants.sort_by(|a, b| b.1.cmp(&a.1));
+
+        let (ref_allele, _, ref_bitset) = variants[0]; // most frequent (REF)
+        let (alt_allele, _, alt_bitset) = variants[1]; // less frequent (ALT)
+
+        // generate sample genotype calls
+        let sample_calls: Vec<&str> = (0..n_samples)
+            .map(|i| match (ref_bitset.contains(i), alt_bitset.contains(i)) {
+                (true, true) => "0/1",  // both variants present (strain mixture)
+                (true, false) => "0",   // only REF
+                (false, true) => "1",   // only ALT
+                (false, false) => ".",  // missing data
+            })
+            .collect();
+
+        // write the VCF line; the flanking k-mers go in the INFO column (8th field),
+        // QUAL and FILTER are left as missing
+        writeln!(
+            writer,
+            ".\t.\t.\t{}\t{}\t.\t.\tbefore={};after={}\tGT\t{}",
+            ref_allele,
+            alt_allele,
+            first_kmer,
+            last_kmer,
+            sample_calls.join("\t")
+        )
+        .expect("Unable to write to VCF file");
+    }
+
+    // flush explicitly: errors raised while flushing on drop would be silently ignored
+    writer.flush().expect("Unable to write to VCF file");
+
+    if nb_skipped > 0 {
+        log::warn!(
+            "{} indel groups skipped (incomplete variant or sample information)",
+            nb_skipped
+        );
+    }
+    log::info!("{} indels", nb_indels);
+
+    // return entry k-mers of indels for SNP processing
+    entries_indels
+}
+
+
+/// Dereplicates indel groups that describe the same event from both strands.
+///
+/// Groups are visited by increasing total sequence length, so the shortest
+/// representation between 'forward' and 'reverse-complement' is the one kept; this is
+/// equivalent to indel realigning in read-alignment (useful in repeats). A group is
+/// kept only if its entry k-mer has not been registered yet; keeping a group registers
+/// both of its extremity k-mers and their reverse-complements.
+///
+/// Returns the retained groups and the set of registered k-mers.
+fn dereplicate_indels<IntT: for<'a> UInt<'a>>(
+    mut indel_groups: VariantGroups<IntT>,
+    k_graph: usize,
+) -> (VariantGroups<IntT>, HashSet<IntT>) {
+    let mut entries_indels: HashSet<IntT> = HashSet::new();
+    let mut final_indels: VariantGroups<IntT> = HashMap::new();
+
+    // pair each key with the summed length of its variant sequences
+    let mut sorted_extremities: Vec<((IntT, IntT), usize)> = indel_groups
+        .iter()
+        .map(|(key, variants)| {
+            let total_length: usize = variants
+                .iter()
+                .map(|variant| variant.sequence.decode().len())
+                .sum();
+            (*key, total_length)
+        })
+        .collect();
+
+    // sort by total length, then by both k-mers of the key: keys are unique, so this is
+    // a total order and the result no longer depends on the hash map iteration order
+    sorted_extremities.sort_unstable_by(|a, b| {
+        a.1.cmp(&b.1)
+            .then_with(|| a.0 .0.cmp(&b.0 .0))
+            .then_with(|| a.0 .1.cmp(&b.0 .1))
+    });
+
+    for (combined_ext, _) in sorted_extremities {
+        if entries_indels.contains(&combined_ext.0) {
+            continue;
+        }
+        // save indel k-mers (both orientations)
+        entries_indels.insert(combined_ext.0);
+        entries_indels.insert(IntT::rev_comp(combined_ext.0, k_graph));
+        entries_indels.insert(combined_ext.1);
+        entries_indels.insert(IntT::rev_comp(combined_ext.1, k_graph));
+        // move the group out of the owned map instead of cloning it
+        if let Some(vec_variants) = indel_groups.remove(&combined_ext) {
+            final_indels.insert(combined_ext, vec_variants);
+        }
+    }
+
+    (final_indels, entries_indels)
+}
+
+
+/// Extracts the insert ('middle bases') of every variant of an indel group, together
+/// with the k-mer that follows the inserts.
+///
+/// The first `k_graph` bases of each decoded sequence are dropped, then the longest
+/// suffix shared by all remaining sequences is removed: what is left of each sequence
+/// is its insert (`"-"` when nothing is left). The shared suffix, taken from the first
+/// sequence and truncated to at most `k_graph` bases, is returned as the last k-mer.
+///
+/// Returns `(inserts, last_kmer)`, with one insert per variant in input order. An empty
+/// input yields an empty vector and an empty string.
+///
+/// # Panics
+/// Panics if a decoded sequence is shorter than `k_graph`.
+fn extract_middle_bases(
+    vec_variants: &[VariantInfo],
+    k_graph: usize,
+) -> (Vec<String>, String) {
+    // nothing to compare: return early instead of searching a suffix forever
+    if vec_variants.is_empty() {
+        return (Vec::new(), String::new());
+    }
+
+    // collect all sequences without the first k-mer
+    let reduced_seq: Vec<String> = vec_variants
+        .iter()
+        .map(|variant| {
+            let seq = variant.sequence.decode();
+            seq[k_graph..].to_string()
+        })
+        .collect();
+
+    // length of the longest suffix shared by all sequences, compared byte-wise
+    // (assumes decoded sequences are plain ASCII nucleotides - TODO confirm)
+    let first = reduced_seq[0].as_bytes();
+    let n_nucl = reduced_seq[1..].iter().fold(first.len(), |shared, seq| {
+        first
+            .iter()
+            .rev()
+            .zip(seq.as_bytes().iter().rev())
+            .take(shared)
+            .take_while(|(a, b)| a == b)
+            .count()
+    });
+
+    // extract last k-mer (using first sequence); the shared suffix might in some very
+    // rare cases be longer than expected (only observed in variants with lot of
+    // missing samples) -> truncate it
+    let shared_tail = &reduced_seq[0][reduced_seq[0].len() - n_nucl..];
+    let last_kmer = shared_tail[..shared_tail.len().min(k_graph)].to_string();
+
+    // extract 'middle bases': remove the shared suffix from each reduced sequence
+    let vec_middles: Vec<String> = reduced_seq
+        .iter()
+        .map(|seq| {
+            let middle_bases = &seq[..seq.len() - n_nucl];
+            if middle_bases.is_empty() {
+                "-".to_string()
+            } else {
+                middle_bases.to_string()
+            }
+        })
+        .collect();
+
+    (vec_middles, last_kmer)
+}
+