From f7f2486ec807dccfd4cab89662cd6a6f08af1d3a Mon Sep 17 00:00:00 2001
From: John Lees <lees.john6@gmail.com>
Date: Thu, 25 May 2023 10:16:38 +0100
Subject: [PATCH 1/2] Deal with palindromes

i.e. own self-rc k-mers
---
 Cargo.toml                                    |  2 +-
 src/ska_dict.rs                               | 43 ++++++++++-
 src/ska_dict/split_kmer.rs                    |  8 ++-
 tests/fasta_input.rs                          | 71 +++++++++++++++++++
 tests/test_files_in/palindrome_1.fa           |  2 +
 tests/test_files_in/palindrome_2.fa           |  2 +
 tests/test_files_in/palindrome_reps_1.fa      |  4 ++
 tests/test_files_in/palindrome_reps_2.fa      |  4 ++
 tests/test_results_correct/palindrome.stdout  |  4 ++
 .../palindrome_norc.stdout                    |  4 ++
 .../palindrome_reps.stdout                    |  4 ++
 11 files changed, 144 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_files_in/palindrome_1.fa
 create mode 100644 tests/test_files_in/palindrome_2.fa
 create mode 100644 tests/test_files_in/palindrome_reps_1.fa
 create mode 100644 tests/test_files_in/palindrome_reps_2.fa
 create mode 100644 tests/test_results_correct/palindrome.stdout
 create mode 100644 tests/test_results_correct/palindrome_norc.stdout
 create mode 100644 tests/test_results_correct/palindrome_reps.stdout

diff --git a/Cargo.toml b/Cargo.toml
index 2833258..73209de 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "ska"
-version = "0.2.4"
+version = "0.2.5"
 authors = [
     "John Lees <jlees@ebi.ac.uk>",
     "Simon Harris <simon.harris@gatesfoundation.org>",
diff --git a/src/ska_dict.rs b/src/ska_dict.rs
index 1369dec..cee0b31 100644
--- a/src/ska_dict.rs
+++ b/src/ska_dict.rs
@@ -63,6 +63,37 @@ where
             .or_insert(decode_base(base));
     }
 
+    /// Adds a split k-mer which is a self-rc to the dict
+    /// This requires amibguity of middle_base + rc(middle_base) to be added
+    fn add_palindrome_to_dict(&mut self, kmer: IntT, base: u8) {
+        self.split_kmers
+            .entry(kmer)
+            .and_modify(|b| {
+                *b = match b {
+                    b'W' => {
+                        if base == 0 || base == 2 {
+                            b'W'
+                        } else {
+                            b'N'
+                        }
+                    }
+                    b'S' => {
+                        if base == 0 || base == 2 {
+                            b'N'
+                        } else {
+                            b'S'
+                        }
+                    }
+                    _ => panic!("Palindrome middle base not W/S: {}", *b),
+                }
+            })
+            .or_insert(match base {
+                0 | 2 => b'W', // A or T
+                1 | 3 => b'S', // C or G
+                _ => panic!("Base encoding error: {base}"),
+            });
+    }
+
     /// Iterates through all the k-mers from an input fastx file and adds them
     /// to the dictionary
     fn add_file_kmers(&mut self, filename: &str, is_reads: bool, qual: &QualOpts) {
@@ -86,14 +117,22 @@ where
                         && Ordering::is_eq(self.kmer_filter.filter(&kmer_it)))
                 {
                     let (kmer, base, _rc) = kmer_it.get_curr_kmer();
-                    self.add_to_dict(kmer, base);
+                    if kmer_it.self_palindrome() {
+                        self.add_palindrome_to_dict(kmer, base);
+                    } else {
+                        self.add_to_dict(kmer, base);
+                    }
                 }
                 while let Some((kmer, base, _rc)) = kmer_it.get_next_kmer() {
                     if !is_reads
                         || (kmer_it.middle_base_qual()
                             && Ordering::is_eq(self.kmer_filter.filter(&kmer_it)))
                     {
-                        self.add_to_dict(kmer, base);
+                        if kmer_it.self_palindrome() {
+                            self.add_palindrome_to_dict(kmer, base);
+                        } else {
+                            self.add_to_dict(kmer, base);
+                        }
                     }
                 }
             }
diff --git a/src/ska_dict/split_kmer.rs b/src/ska_dict/split_kmer.rs
index c67714a..432f82d 100644
--- a/src/ska_dict/split_kmer.rs
+++ b/src/ska_dict/split_kmer.rs
@@ -128,7 +128,7 @@ impl<'a, IntT: for<'b> UInt<'b>> SplitKmer<'a, IntT> {
             }
         }
 
-        // For reads, start an rolling hash
+        // For reads, start a rolling hash
         let hash_gen = if is_reads {
             Some(NtHashIterator::new(&seq[*idx..(*idx + k)], k, rc))
         } else {
@@ -139,6 +139,12 @@ impl<'a, IntT: for<'b> UInt<'b>> SplitKmer<'a, IntT> {
         Some((upper, lower, middle_base, hash_gen))
     }
 
+    /// Checks if the split k-mer arms are palindromes, i.e. k-mer is its own reverse complement
+    /// In this case the middle base needs ambiguity with its rc added.
+    pub fn self_palindrome(&mut self) -> bool {
+        self.rc && self.upper == self.rc_upper && self.lower == self.rc_lower
+    }
+
     /// Update the stored reverse complement using the stored split-k and middle base
     fn update_rc(&mut self) {
         self.rc_upper = self.lower.rev_comp(self.k - 1) & self.upper_mask;
diff --git a/tests/fasta_input.rs b/tests/fasta_input.rs
index 2f20664..bfbf7a2 100644
--- a/tests/fasta_input.rs
+++ b/tests/fasta_input.rs
@@ -218,3 +218,74 @@ fn repeats() {
         .assert()
         .stdout_eq_path(sandbox.file_string("dup_rc.stdout", TestDir::Correct));
 }
+
+#[test]
+fn palindromes() {
+    let sandbox = TestSetup::setup();
+
+    // Check that palindrome k-mers (self-rc) are correctly ambiguous
+    Command::new(cargo_bin("ska"))
+        .current_dir(sandbox.get_wd())
+        .arg("build")
+        .arg("-k")
+        .arg("15")
+        .arg(sandbox.file_string("palindrome_1.fa", TestDir::Input))
+        .arg(sandbox.file_string("palindrome_2.fa", TestDir::Input))
+        .arg("-o")
+        .arg("otto")
+        .arg("-v")
+        .assert()
+        .success();
+
+    Command::new(cargo_bin("ska"))
+        .current_dir(sandbox.get_wd())
+        .arg("align")
+        .arg("otto.skf")
+        .args(&["--filter", "no-filter"])
+        .assert()
+        .stdout_eq_path(sandbox.file_string("palindrome.stdout", TestDir::Correct));
+
+    // Single strand forces orientation so no ambiguity
+    Command::new(cargo_bin("ska"))
+        .current_dir(sandbox.get_wd())
+        .arg("build")
+        .arg("-k")
+        .arg("15")
+        .arg(sandbox.file_string("palindrome_1.fa", TestDir::Input))
+        .arg(sandbox.file_string("palindrome_2.fa", TestDir::Input))
+        .arg("-o")
+        .arg("otan")
+        .arg("--single-strand")
+        .assert()
+        .success();
+
+    Command::new(cargo_bin("ska"))
+        .current_dir(sandbox.get_wd())
+        .arg("align")
+        .arg("otan.skf")
+        .assert()
+        .stdout_eq_path(sandbox.file_string("palindrome_norc.stdout", TestDir::Correct));
+
+    // Check that palindrome k-mers (self-rc) are correctly ambiguous, and
+    // work correctly when multiple copies are present
+    Command::new(cargo_bin("ska"))
+        .current_dir(sandbox.get_wd())
+        .arg("build")
+        .arg("-k")
+        .arg("15")
+        .arg(sandbox.file_string("palindrome_reps_1.fa", TestDir::Input))
+        .arg(sandbox.file_string("palindrome_reps_2.fa", TestDir::Input))
+        .arg("-o")
+        .arg("ottootto")
+        .arg("-v")
+        .assert()
+        .success();
+
+    Command::new(cargo_bin("ska"))
+        .current_dir(sandbox.get_wd())
+        .arg("align")
+        .arg("ottootto.skf")
+        .args(&["--filter", "no-filter"])
+        .assert()
+        .stdout_eq_path(sandbox.file_string("palindrome_reps.stdout", TestDir::Correct));
+}
diff --git a/tests/test_files_in/palindrome_1.fa b/tests/test_files_in/palindrome_1.fa
new file mode 100644
index 0000000..a34e1d1
--- /dev/null
+++ b/tests/test_files_in/palindrome_1.fa
@@ -0,0 +1,2 @@
+>ottoA
+GTGGAAGACTTCCACC
diff --git a/tests/test_files_in/palindrome_2.fa b/tests/test_files_in/palindrome_2.fa
new file mode 100644
index 0000000..d13f0a6
--- /dev/null
+++ b/tests/test_files_in/palindrome_2.fa
@@ -0,0 +1,2 @@
+>ottoB
+GTGGAAGTCTTCCACC
diff --git a/tests/test_files_in/palindrome_reps_1.fa b/tests/test_files_in/palindrome_reps_1.fa
new file mode 100644
index 0000000..54a65ce
--- /dev/null
+++ b/tests/test_files_in/palindrome_reps_1.fa
@@ -0,0 +1,4 @@
+>ottoB
+GTGGAAGTCTTCCACA
+>ottoB2
+GTGGAAGTCTTCCACT
diff --git a/tests/test_files_in/palindrome_reps_2.fa b/tests/test_files_in/palindrome_reps_2.fa
new file mode 100644
index 0000000..5b44b79
--- /dev/null
+++ b/tests/test_files_in/palindrome_reps_2.fa
@@ -0,0 +1,4 @@
+>ottoB
+GTGGAAGTCTTCCACC
+>ottoB2
+GTGGAAGCCTTCCACG
diff --git a/tests/test_results_correct/palindrome.stdout b/tests/test_results_correct/palindrome.stdout
new file mode 100644
index 0000000..093e54b
--- /dev/null
+++ b/tests/test_results_correct/palindrome.stdout
@@ -0,0 +1,4 @@
+>palindrome_1
+W
+>palindrome_2
+W
diff --git a/tests/test_results_correct/palindrome_norc.stdout b/tests/test_results_correct/palindrome_norc.stdout
new file mode 100644
index 0000000..9e19ff4
--- /dev/null
+++ b/tests/test_results_correct/palindrome_norc.stdout
@@ -0,0 +1,4 @@
+>palindrome_1
+A
+>palindrome_2
+T
diff --git a/tests/test_results_correct/palindrome_reps.stdout b/tests/test_results_correct/palindrome_reps.stdout
new file mode 100644
index 0000000..d60959f
--- /dev/null
+++ b/tests/test_results_correct/palindrome_reps.stdout
@@ -0,0 +1,4 @@
+>palindrome_reps_1
+W
+>palindrome_reps_2
+N

From 20eac855b70e38d3810220a99e077f375638d7b9 Mon Sep 17 00:00:00 2001
From: John Lees <lees.john6@gmail.com>
Date: Fri, 26 May 2023 13:14:34 +0100
Subject: [PATCH 2/2] Add a unit test

---
 src/ska_dict.rs                  | 48 +++++++++++++++++++++++++++++++-
 src/ska_dict/count_min_filter.rs |  2 +-
 2 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/src/ska_dict.rs b/src/ska_dict.rs
index cee0b31..2ecf737 100644
--- a/src/ska_dict.rs
+++ b/src/ska_dict.rs
@@ -35,7 +35,7 @@ use crate::ska_dict::count_min_filter::KmerFilter;
 pub mod nthash;
 
 /// Holds the split-kmer dictionary, and basic information such as k-mer size.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Default)]
 pub struct SkaDict<IntT> {
     /// K-mer size
     k: usize,
@@ -256,3 +256,49 @@ where
         &self.name
     }
 }
+
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_add_palindrome_to_dict() {
+        // Initialize the test subject
+        let mut test_obj = SkaDict::<u64>::default(); // Replace YourStruct with the actual struct name
+
+        // Test case 1: Updating existing entry
+        test_obj.split_kmers.insert(123, b'W');
+        test_obj.add_palindrome_to_dict(123, 1);
+        assert_eq!(test_obj.split_kmers[&123], b'N');
+
+        // Test case 2: Adding new entry with base 0
+        test_obj.split_kmers.clear();
+        test_obj.add_palindrome_to_dict(456, 0);
+        assert_eq!(test_obj.split_kmers[&456], b'W');
+
+        // Test case 3: Adding new entry with base 3
+        test_obj.split_kmers.clear();
+        test_obj.add_palindrome_to_dict(789, 3);
+        assert_eq!(test_obj.split_kmers[&789], b'S');
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_panic_add_palindrome_to_dict() {
+        // Test case 4: Panicking with invalid base
+        let mut test_obj_panic = SkaDict::<u64>::default();
+        test_obj_panic.add_palindrome_to_dict(987, 5);
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_panic2_add_palindrome_to_dict() {
+        // Test case 5: Panicking with invalid middle base
+        let mut test_obj_panic = SkaDict::<u64>::default();
+        test_obj_panic.split_kmers.clear();
+        test_obj_panic.split_kmers.insert(555, b'A');
+        test_obj_panic.add_palindrome_to_dict(555, 1);
+    }
+}
+
diff --git a/src/ska_dict/count_min_filter.rs b/src/ska_dict/count_min_filter.rs
index 64109ad..e231251 100644
--- a/src/ska_dict/count_min_filter.rs
+++ b/src/ska_dict/count_min_filter.rs
@@ -36,7 +36,7 @@ const CM_WIDTH: usize = 1 << 24;
 /// This has the advantage of using less memory than a larger countmin filter,
 /// being a bit faster (bloom is ~3x faster than countmin, but having count also
 /// allows entry to dictionary to be only checked once for each passing k-mer)
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Default)]
 pub struct KmerFilter {
     /// Size of the bloom filter
     buf_size: u64,