From f7f2486ec807dccfd4cab89662cd6a6f08af1d3a Mon Sep 17 00:00:00 2001 From: John Lees Date: Thu, 25 May 2023 10:16:38 +0100 Subject: [PATCH 1/2] Deal with palindromes i.e. own self-rc k-mers --- Cargo.toml | 2 +- src/ska_dict.rs | 43 ++++++++++- src/ska_dict/split_kmer.rs | 8 ++- tests/fasta_input.rs | 71 +++++++++++++++++++ tests/test_files_in/palindrome_1.fa | 2 + tests/test_files_in/palindrome_2.fa | 2 + tests/test_files_in/palindrome_reps_1.fa | 4 ++ tests/test_files_in/palindrome_reps_2.fa | 4 ++ tests/test_results_correct/palindrome.stdout | 4 ++ .../palindrome_norc.stdout | 4 ++ .../palindrome_reps.stdout | 4 ++ 11 files changed, 144 insertions(+), 4 deletions(-) create mode 100644 tests/test_files_in/palindrome_1.fa create mode 100644 tests/test_files_in/palindrome_2.fa create mode 100644 tests/test_files_in/palindrome_reps_1.fa create mode 100644 tests/test_files_in/palindrome_reps_2.fa create mode 100644 tests/test_results_correct/palindrome.stdout create mode 100644 tests/test_results_correct/palindrome_norc.stdout create mode 100644 tests/test_results_correct/palindrome_reps.stdout diff --git a/Cargo.toml b/Cargo.toml index 2833258..73209de 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ska" -version = "0.2.4" +version = "0.2.5" authors = [ "John Lees ", "Simon Harris ", diff --git a/src/ska_dict.rs b/src/ska_dict.rs index 1369dec..cee0b31 100644 --- a/src/ska_dict.rs +++ b/src/ska_dict.rs @@ -63,6 +63,37 @@ where .or_insert(decode_base(base)); } + /// Adds a split k-mer which is a self-rc to the dict + /// This requires amibguity of middle_base + rc(middle_base) to be added + fn add_palindrome_to_dict(&mut self, kmer: IntT, base: u8) { + self.split_kmers + .entry(kmer) + .and_modify(|b| { + *b = match b { + b'W' => { + if base == 0 || base == 2 { + b'W' + } else { + b'N' + } + } + b'S' => { + if base == 0 || base == 2 { + b'N' + } else { + b'S' + } + } + _ => panic!("Palindrome middle base not W/S: {}", *b), + } + }) + .or_insert(match base { + 0 | 2 => b'W', // A or T + 1 | 3 => b'S', // C or G + _ => panic!("Base encoding error: {base}"), + }); + } + /// Iterates through all the k-mers from an input fastx file and adds them /// to the dictionary fn add_file_kmers(&mut self, filename: &str, is_reads: bool, qual: &QualOpts) { @@ -86,14 +117,22 @@ where && Ordering::is_eq(self.kmer_filter.filter(&kmer_it))) { let (kmer, base, _rc) = kmer_it.get_curr_kmer(); - self.add_to_dict(kmer, base); + if kmer_it.self_palindrome() { + self.add_palindrome_to_dict(kmer, base); + } else { + self.add_to_dict(kmer, base); + } } while let Some((kmer, base, _rc)) = kmer_it.get_next_kmer() { if !is_reads || (kmer_it.middle_base_qual() && Ordering::is_eq(self.kmer_filter.filter(&kmer_it))) { - self.add_to_dict(kmer, base); + if kmer_it.self_palindrome() { + self.add_palindrome_to_dict(kmer, base); + } else { + self.add_to_dict(kmer, base); + } } } } diff --git a/src/ska_dict/split_kmer.rs b/src/ska_dict/split_kmer.rs index c67714a..432f82d 100644 --- a/src/ska_dict/split_kmer.rs +++ b/src/ska_dict/split_kmer.rs @@ -128,7 +128,7 @@ impl<'a, IntT: for<'b> UInt<'b>> SplitKmer<'a, IntT> { } } - // For reads, start an rolling hash + // For reads, start a rolling hash let hash_gen = if is_reads { Some(NtHashIterator::new(&seq[*idx..(*idx + k)], k, rc)) } else { @@ -139,6 +139,12 @@ impl<'a, IntT: for<'b> UInt<'b>> SplitKmer<'a, IntT> { Some((upper, lower, middle_base, hash_gen)) } + /// Checks if the split k-mer arms are palindromes, i.e. k-mer is its own reverse complement + /// In this case the middle base needs ambiguity with its rc added. + pub fn self_palindrome(&mut self) -> bool { + self.rc && self.upper == self.rc_upper && self.lower == self.rc_lower + } + /// Update the stored reverse complement using the stored split-k and middle base fn update_rc(&mut self) { self.rc_upper = self.lower.rev_comp(self.k - 1) & self.upper_mask; diff --git a/tests/fasta_input.rs b/tests/fasta_input.rs index 2f20664..bfbf7a2 100644 --- a/tests/fasta_input.rs +++ b/tests/fasta_input.rs @@ -218,3 +218,74 @@ fn repeats() { .assert() .stdout_eq_path(sandbox.file_string("dup_rc.stdout", TestDir::Correct)); } + +#[test] +fn palindromes() { + let sandbox = TestSetup::setup(); + + // Check that palindrome k-mers (self-rc) are correctly ambiguous + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("build") + .arg("-k") + .arg("15") + .arg(sandbox.file_string("palindrome_1.fa", TestDir::Input)) + .arg(sandbox.file_string("palindrome_2.fa", TestDir::Input)) + .arg("-o") + .arg("otto") + .arg("-v") + .assert() + .success(); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("align") + .arg("otto.skf") + .args(&["--filter", "no-filter"]) + .assert() + .stdout_eq_path(sandbox.file_string("palindrome.stdout", TestDir::Correct)); + + // Single strand forces orientation so no ambiguity + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("build") + .arg("-k") + .arg("15") + .arg(sandbox.file_string("palindrome_1.fa", TestDir::Input)) + .arg(sandbox.file_string("palindrome_2.fa", TestDir::Input)) + .arg("-o") + .arg("otan") + .arg("--single-strand") + .assert() + .success(); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("align") + .arg("otan.skf") + .assert() + .stdout_eq_path(sandbox.file_string("palindrome_norc.stdout", TestDir::Correct)); + + // Check that palindrome k-mers (self-rc) are correctly ambiguous, and + // work correctly when multiple copies are present + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("build") + .arg("-k") + .arg("15") + .arg(sandbox.file_string("palindrome_reps_1.fa", TestDir::Input)) + .arg(sandbox.file_string("palindrome_reps_2.fa", TestDir::Input)) + .arg("-o") + .arg("ottootto") + .arg("-v") + .assert() + .success(); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("align") + .arg("ottootto.skf") + .args(&["--filter", "no-filter"]) + .assert() + .stdout_eq_path(sandbox.file_string("palindrome_reps.stdout", TestDir::Correct)); +} diff --git a/tests/test_files_in/palindrome_1.fa b/tests/test_files_in/palindrome_1.fa new file mode 100644 index 0000000..a34e1d1 --- /dev/null +++ b/tests/test_files_in/palindrome_1.fa @@ -0,0 +1,2 @@ +>ottoA +GTGGAAGACTTCCACC diff --git a/tests/test_files_in/palindrome_2.fa b/tests/test_files_in/palindrome_2.fa new file mode 100644 index 0000000..d13f0a6 --- /dev/null +++ b/tests/test_files_in/palindrome_2.fa @@ -0,0 +1,2 @@ +>ottoB +GTGGAAGTCTTCCACC diff --git a/tests/test_files_in/palindrome_reps_1.fa b/tests/test_files_in/palindrome_reps_1.fa new file mode 100644 index 0000000..54a65ce --- /dev/null +++ b/tests/test_files_in/palindrome_reps_1.fa @@ -0,0 +1,4 @@ +>ottoB +GTGGAAGTCTTCCACA +>ottoB2 +GTGGAAGTCTTCCACT diff --git a/tests/test_files_in/palindrome_reps_2.fa b/tests/test_files_in/palindrome_reps_2.fa new file mode 100644 index 0000000..5b44b79 --- /dev/null +++ b/tests/test_files_in/palindrome_reps_2.fa @@ -0,0 +1,4 @@ +>ottoB +GTGGAAGTCTTCCACC +>ottoB2 +GTGGAAGCCTTCCACG diff --git a/tests/test_results_correct/palindrome.stdout b/tests/test_results_correct/palindrome.stdout new file mode 100644 index 0000000..093e54b --- /dev/null +++ b/tests/test_results_correct/palindrome.stdout @@ -0,0 +1,4 @@ +>palindrome_1 +W +>palindrome_2 +W diff --git a/tests/test_results_correct/palindrome_norc.stdout b/tests/test_results_correct/palindrome_norc.stdout new file mode 100644 index 0000000..9e19ff4 --- /dev/null +++ b/tests/test_results_correct/palindrome_norc.stdout @@ -0,0 +1,4 @@ +>palindrome_1 +A +>palindrome_2 +T diff --git a/tests/test_results_correct/palindrome_reps.stdout b/tests/test_results_correct/palindrome_reps.stdout new file mode 100644 index 0000000..d60959f --- /dev/null +++ b/tests/test_results_correct/palindrome_reps.stdout @@ -0,0 +1,4 @@ +>palindrome_reps_1 +W +>palindrome_reps_2 +N From 20eac855b70e38d3810220a99e077f375638d7b9 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 26 May 2023 13:14:34 +0100 Subject: [PATCH 2/2] Add a unit test --- src/ska_dict.rs | 48 +++++++++++++++++++++++++++++++- src/ska_dict/count_min_filter.rs | 2 +- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/src/ska_dict.rs b/src/ska_dict.rs index cee0b31..2ecf737 100644 --- a/src/ska_dict.rs +++ b/src/ska_dict.rs @@ -35,7 +35,7 @@ use crate::ska_dict::count_min_filter::KmerFilter; pub mod nthash; /// Holds the split-kmer dictionary, and basic information such as k-mer size. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct SkaDict { /// K-mer size k: usize, @@ -256,3 +256,49 @@ where &self.name } } + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_add_palindrome_to_dict() { + // Initialize the test subject + let mut test_obj = SkaDict::::default(); // Replace YourStruct with the actual struct name + + // Test case 1: Updating existing entry + test_obj.split_kmers.insert(123, b'W'); + test_obj.add_palindrome_to_dict(123, 1); + assert_eq!(test_obj.split_kmers[&123], b'N'); + + // Test case 2: Adding new entry with base 0 + test_obj.split_kmers.clear(); + test_obj.add_palindrome_to_dict(456, 0); + assert_eq!(test_obj.split_kmers[&456], b'W'); + + // Test case 3: Adding new entry with base 3 + test_obj.split_kmers.clear(); + test_obj.add_palindrome_to_dict(789, 3); + assert_eq!(test_obj.split_kmers[&789], b'S'); + } + + #[test] + #[should_panic] + fn test_panic_add_palindrome_to_dict() { + // Test case 4: Panicking with invalid base + let mut test_obj_panic = SkaDict::::default(); + test_obj_panic.add_palindrome_to_dict(987, 5); + } + + #[test] + #[should_panic] + fn test_panic2_add_palindrome_to_dict() { + // Test case 5: Panicking with invalid middle base + let mut test_obj_panic = SkaDict::::default(); + test_obj_panic.split_kmers.clear(); + test_obj_panic.split_kmers.insert(555, b'A'); + test_obj_panic.add_palindrome_to_dict(555, 1); + } +} + diff --git a/src/ska_dict/count_min_filter.rs b/src/ska_dict/count_min_filter.rs index 64109ad..e231251 100644 --- a/src/ska_dict/count_min_filter.rs +++ b/src/ska_dict/count_min_filter.rs @@ -36,7 +36,7 @@ const CM_WIDTH: usize = 1 << 24; /// This has the advantage of using less memory than a larger countmin filter, /// being a bit faster (bloom is ~3x faster than countmin, but having count also /// allows entry to dictionary to be only checked once for each passing k-mer) -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct KmerFilter { /// Size of the bloom filter buf_size: u64,