-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Romain Derelle
authored and
Romain Derelle
committed
Feb 9, 2025
1 parent
3ce160e
commit f591fb4
Showing
6 changed files
with
283 additions
and
72 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,234 @@ | ||
//! indel processing | ||
use bit_set::BitSet; | ||
use hashbrown::{HashMap, HashSet}; | ||
use std::fs::File; | ||
use std::io::{Write, BufWriter}; | ||
|
||
use crate::ska_dict::bit_encoding::UInt; | ||
use crate::skalo::utils::{Config, DataInfo, VariantInfo}; | ||
|
||
type VariantGroups<IntT> = HashMap<(IntT, IntT), Vec<VariantInfo>>; | ||
|
||
/// This function processes indels. These are dereplicated and inserts are extracted. | ||
/// As for SNPs, indels are filtered to only retain true variants and based on the | ||
/// proportion of missing samples. | ||
pub fn process_indels<IntT: for<'a> UInt<'a>>( | ||
indel_groups: VariantGroups<IntT>, | ||
kmer_2_samples: &HashMap<IntT, BitSet>, | ||
data_info: &DataInfo, | ||
config: &Config, | ||
) -> HashSet<IntT> { | ||
|
||
log::info!("Processing indels"); | ||
|
||
// dereplicate indels | ||
let (final_indels, entries_indels) = dereplicate_indels(indel_groups, data_info.k_graph); | ||
|
||
// create VCF output file | ||
let vcf_filename = format!("{}_indels.vcf", config.output_name); | ||
let file = File::create(&vcf_filename).expect("Unable to create VCF file"); | ||
let mut writer = BufWriter::new(file); | ||
|
||
// Wwrite VCF header | ||
writeln!(writer, "##fileformat=VCFv4.2").unwrap(); | ||
writeln!(writer, "# REF corresponds to the most frequent variant among samples").unwrap(); | ||
writeln!(writer, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}", | ||
data_info.sample_names.join("\t")).unwrap(); | ||
|
||
let mut nb_indels = 0; | ||
|
||
// consider indels 1 by one | ||
for vec_variants in final_indels.values() { | ||
// get taxonomic sampling for each variant | ||
let bitset_vec: Vec<BitSet> = vec_variants | ||
.iter() | ||
.filter_map(|variant| { | ||
let encoded_kmer = IntT::encode_kmer(&variant.sequence.get_range(0, data_info.k_graph + 1)); | ||
kmer_2_samples.get(&encoded_kmer).cloned() | ||
}) | ||
.collect(); | ||
|
||
// compute missing samples, including `0/1` as missing | ||
let mut missing_samples = 0; | ||
let mut ref_present = false; | ||
let mut alt_present = false; | ||
|
||
for i in 0..data_info.sample_names.len() { | ||
let in_ref = bitset_vec[0].contains(i); | ||
let in_alt = bitset_vec[1].contains(i); | ||
|
||
if !in_ref && !in_alt { | ||
missing_samples += 1; | ||
} | ||
else if in_ref && in_alt { | ||
missing_samples += 1; // consider heterozygous calls as missing | ||
} | ||
else if in_ref { | ||
ref_present = true; | ||
} | ||
else { | ||
alt_present = true; | ||
} | ||
} | ||
|
||
let proportion_missing = missing_samples as f32 / data_info.sample_names.len() as f32; | ||
|
||
// filter indels based on proportion of missing data and only keep true variants | ||
if proportion_missing <= config.max_missing && ref_present && alt_present { | ||
nb_indels += 1; | ||
|
||
// get inserts and 1st/last k-mers | ||
let (vec_inserts, last_kmer) = extract_middle_bases(vec_variants, data_info.k_graph); | ||
let first_kmer = vec_variants[0].sequence.decode()[..data_info.k_graph].to_string(); | ||
|
||
// determine the most frequent variant (REF) and the other (ALT) | ||
let mut variants: Vec<(String, usize, &BitSet)> = | ||
vec_inserts.iter().zip(&bitset_vec) | ||
.map(|(seq, bitset)| (seq.clone(), bitset.len(), bitset)) | ||
.collect(); | ||
|
||
// sort by frequency (descending) to find the most frequent variant | ||
variants.sort_by(|a, b| b.1.cmp(&a.1)); | ||
|
||
let (ref_allele, _ref_count, ref_bitset) = &variants[0]; // most frequent (REF) | ||
let (alt_allele, _alt_count, alt_bitset) = &variants[1]; // less frequent (ALT) | ||
|
||
// Generate sample genotype calls | ||
let sample_calls: Vec<String> = data_info.sample_names.iter() | ||
.enumerate() | ||
.map(|(i, _sample)| { | ||
let in_ref = ref_bitset.contains(i); | ||
let in_alt = alt_bitset.contains(i); | ||
|
||
match (in_ref, in_alt) { | ||
(true, true) => "0/1".to_string(), // both variants present (strain mixture) | ||
(true, false) => "0".to_string(), // only REF | ||
(false, true) => "1".to_string(), // only ALT | ||
(false, false) => ".".to_string(), // missing data | ||
} | ||
}) | ||
.collect(); | ||
|
||
// Write the VCF line | ||
writeln!( | ||
writer, | ||
".\t.\t.\t{}\t{}\t.\tbefore={};after={}\t.\tGT\t{}", | ||
ref_allele, alt_allele, first_kmer, last_kmer, sample_calls.join("\t") | ||
).unwrap(); | ||
|
||
} | ||
} | ||
|
||
log::info!("{} indels", nb_indels); | ||
|
||
// return entry k-mers of indels for SNP processing | ||
entries_indels | ||
} | ||
|
||
|
||
// dereplicate indel groups: choose shortest between 'forward' and 'reverse-complement'; | ||
// this is equivalent to indel realigning in read-alignment (useful in repeats) | ||
fn dereplicate_indels<IntT: for<'a> UInt<'a>>( | ||
indel_groups: VariantGroups<IntT>, | ||
k_graph: usize, | ||
) -> (VariantGroups<IntT>, HashSet<IntT>) { | ||
|
||
let mut entries_indels: HashSet<IntT> = HashSet::new(); | ||
let mut final_indels: VariantGroups<IntT> = HashMap::new(); | ||
|
||
// create a vector of keys and their corresponding total sequence length, and sort it in increasing order | ||
// we use the IntT value of the entry k-mer as tie breaker to get a stable list | ||
let mut sorted_extremities: Vec<((IntT, IntT), usize)> = indel_groups | ||
.iter() | ||
.map(|(key, variants)| { | ||
// calculate total sequence length | ||
let total_length: usize = variants.iter() | ||
.map(|variant| variant.sequence.decode().len()) | ||
.sum(); | ||
(*key, total_length) | ||
}) | ||
.collect(); | ||
|
||
sorted_extremities.sort_by(|a, b| { | ||
a.1.cmp(&b.1) // sort by sum of sequence lengths | ||
.then_with(|| a.0 .0.cmp(&b.0 .0)) // sort by the first IntT value of the key when there's a tie | ||
}); | ||
|
||
for (combined_ext, _) in sorted_extremities { | ||
let vec_variants = indel_groups.get(&combined_ext).unwrap(); | ||
if !entries_indels.contains(&combined_ext.0) { | ||
// save indel k-mers | ||
let rc_1 = IntT::rev_comp(combined_ext.0, k_graph); | ||
let rc_2 = IntT::rev_comp(combined_ext.1, k_graph); | ||
entries_indels.insert(combined_ext.0); | ||
entries_indels.insert(rc_1); | ||
entries_indels.insert(combined_ext.1); | ||
entries_indels.insert(rc_2); | ||
// save indel group | ||
final_indels.insert(combined_ext, vec_variants.clone()); | ||
} | ||
} | ||
|
||
(final_indels, entries_indels) | ||
} | ||
|
||
|
||
// extract inserts of an indel group | ||
fn extract_middle_bases( | ||
vec_variants: &[VariantInfo], | ||
k_graph: usize, | ||
) -> (Vec<String>, String) { | ||
|
||
// collect all sequences without the first kmer | ||
let reduced_seq: Vec<String> = vec_variants | ||
.iter() | ||
.map(|variant| { | ||
let seq = variant.sequence.decode(); | ||
seq[k_graph..].to_string() | ||
}) | ||
.collect(); | ||
|
||
// get start position of last k-mer (i.e. find last position for which sequences differ (from the end)) | ||
let mut identical = true; | ||
let mut n_nucl = 0; | ||
|
||
while identical { | ||
n_nucl += 1; | ||
let mut all_ends: HashSet<String> = HashSet::new(); | ||
// extract last n nucleotide from each seq | ||
for seq in &reduced_seq { | ||
if n_nucl > seq.len() { | ||
identical = false; | ||
} else { | ||
let last_n_chars: Vec<String> = seq.chars().rev().take(n_nucl).map(|c| c.to_string()).collect(); | ||
let concatenated_last_chars: String = last_n_chars.into_iter().rev().collect(); | ||
all_ends.insert(concatenated_last_chars.clone()); | ||
} | ||
} | ||
if all_ends.len() > 1 { | ||
identical = false; | ||
} | ||
} | ||
n_nucl -= 1; | ||
|
||
// extract last kmer (using first sequence) | ||
let pos_end = reduced_seq[0].len() - n_nucl; | ||
let mut last_kmer = reduced_seq[0][pos_end..].to_string(); | ||
|
||
// the length of last k-mer might be in some very rare cases longer than expected (only observed in variants with lot of missing samples) -> truncate it | ||
if last_kmer.len() > k_graph { | ||
last_kmer = last_kmer[..k_graph].to_string(); | ||
} | ||
|
||
// extract 'middle-bases' (remove last kmer from reduced sequences -> only middle base left) | ||
let mut vec_middles: Vec<String> = Vec::new(); | ||
for seq in &reduced_seq { | ||
let pos2_end = seq.len() - n_nucl; | ||
let mut middle_bases = &seq[..pos2_end]; | ||
if middle_bases.is_empty() {middle_bases = "-";} | ||
vec_middles.push(middle_bases.to_string()); | ||
} | ||
|
||
(vec_middles, last_kmer) | ||
} | ||
|
Oops, something went wrong.