Skip to content

Commit

Permalink
Merge pull request #30 from bacpop/palindromes
Browse files Browse the repository at this point in the history
Deal with palindromes
  • Loading branch information
johnlees authored May 26, 2023
2 parents d9beb84 + 20eac85 commit 6fa8953
Show file tree
Hide file tree
Showing 12 changed files with 192 additions and 6 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "ska"
version = "0.2.4"
version = "0.2.5"
authors = [
"John Lees <[email protected]>",
"Simon Harris <[email protected]>",
Expand Down
91 changes: 88 additions & 3 deletions src/ska_dict.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ use crate::ska_dict::count_min_filter::KmerFilter;
pub mod nthash;

/// Holds the split-kmer dictionary, and basic information such as k-mer size.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Default)]
pub struct SkaDict<IntT> {
/// K-mer size
k: usize,
Expand Down Expand Up @@ -63,6 +63,37 @@ where
.or_insert(decode_base(base));
}

/// Adds a split k-mer which is a self-rc to the dict
/// This requires amibguity of middle_base + rc(middle_base) to be added
fn add_palindrome_to_dict(&mut self, kmer: IntT, base: u8) {
self.split_kmers
.entry(kmer)
.and_modify(|b| {
*b = match b {
b'W' => {
if base == 0 || base == 2 {
b'W'
} else {
b'N'
}
}
b'S' => {
if base == 0 || base == 2 {
b'N'
} else {
b'S'
}
}
_ => panic!("Palindrome middle base not W/S: {}", *b),
}
})
.or_insert(match base {
0 | 2 => b'W', // A or T
1 | 3 => b'S', // C or G
_ => panic!("Base encoding error: {base}"),
});
}

/// Iterates through all the k-mers from an input fastx file and adds them
/// to the dictionary
fn add_file_kmers(&mut self, filename: &str, is_reads: bool, qual: &QualOpts) {
Expand All @@ -86,14 +117,22 @@ where
&& Ordering::is_eq(self.kmer_filter.filter(&kmer_it)))
{
let (kmer, base, _rc) = kmer_it.get_curr_kmer();
self.add_to_dict(kmer, base);
if kmer_it.self_palindrome() {
self.add_palindrome_to_dict(kmer, base);
} else {
self.add_to_dict(kmer, base);
}
}
while let Some((kmer, base, _rc)) = kmer_it.get_next_kmer() {
if !is_reads
|| (kmer_it.middle_base_qual()
&& Ordering::is_eq(self.kmer_filter.filter(&kmer_it)))
{
self.add_to_dict(kmer, base);
if kmer_it.self_palindrome() {
self.add_palindrome_to_dict(kmer, base);
} else {
self.add_to_dict(kmer, base);
}
}
}
}
Expand Down Expand Up @@ -217,3 +256,49 @@ where
&self.name
}
}


#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_add_palindrome_to_dict() {
// Initialize the test subject
let mut test_obj = SkaDict::<u64>::default(); // Replace YourStruct with the actual struct name

// Test case 1: Updating existing entry
test_obj.split_kmers.insert(123, b'W');
test_obj.add_palindrome_to_dict(123, 1);
assert_eq!(test_obj.split_kmers[&123], b'N');

// Test case 2: Adding new entry with base 0
test_obj.split_kmers.clear();
test_obj.add_palindrome_to_dict(456, 0);
assert_eq!(test_obj.split_kmers[&456], b'W');

// Test case 3: Adding new entry with base 3
test_obj.split_kmers.clear();
test_obj.add_palindrome_to_dict(789, 3);
assert_eq!(test_obj.split_kmers[&789], b'S');
}

#[test]
#[should_panic]
fn test_panic_add_palindrome_to_dict() {
// Test case 4: Panicking with invalid base
let mut test_obj_panic = SkaDict::<u64>::default();
test_obj_panic.add_palindrome_to_dict(987, 5);
}

#[test]
#[should_panic]
fn test_panic2_add_palindrome_to_dict() {
// Test case 5: Panicking with invalid middle base
let mut test_obj_panic = SkaDict::<u64>::default();
test_obj_panic.split_kmers.clear();
test_obj_panic.split_kmers.insert(555, b'A');
test_obj_panic.add_palindrome_to_dict(555, 1);
}
}

2 changes: 1 addition & 1 deletion src/ska_dict/count_min_filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ const CM_WIDTH: usize = 1 << 24;
/// This has the advantage of using less memory than a larger countmin filter,
/// being a bit faster (bloom is ~3x faster than countmin, but having count also
/// allows entry to dictionary to be only checked once for each passing k-mer)
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Default)]
pub struct KmerFilter {
/// Size of the bloom filter
buf_size: u64,
Expand Down
8 changes: 7 additions & 1 deletion src/ska_dict/split_kmer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ impl<'a, IntT: for<'b> UInt<'b>> SplitKmer<'a, IntT> {
}
}

// For reads, start an rolling hash
// For reads, start a rolling hash
let hash_gen = if is_reads {
Some(NtHashIterator::new(&seq[*idx..(*idx + k)], k, rc))
} else {
Expand All @@ -139,6 +139,12 @@ impl<'a, IntT: for<'b> UInt<'b>> SplitKmer<'a, IntT> {
Some((upper, lower, middle_base, hash_gen))
}

/// Checks if the split k-mer arms are palindromes, i.e. k-mer is its own reverse complement
/// In this case the middle base needs ambiguity with its rc added.
pub fn self_palindrome(&mut self) -> bool {
self.rc && self.upper == self.rc_upper && self.lower == self.rc_lower
}

/// Update the stored reverse complement using the stored split-k and middle base
fn update_rc(&mut self) {
self.rc_upper = self.lower.rev_comp(self.k - 1) & self.upper_mask;
Expand Down
71 changes: 71 additions & 0 deletions tests/fasta_input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -218,3 +218,74 @@ fn repeats() {
.assert()
.stdout_eq_path(sandbox.file_string("dup_rc.stdout", TestDir::Correct));
}

#[test]
fn palindromes() {
let sandbox = TestSetup::setup();

// Check that palindrome k-mers (self-rc) are correctly ambiguous
Command::new(cargo_bin("ska"))
.current_dir(sandbox.get_wd())
.arg("build")
.arg("-k")
.arg("15")
.arg(sandbox.file_string("palindrome_1.fa", TestDir::Input))
.arg(sandbox.file_string("palindrome_2.fa", TestDir::Input))
.arg("-o")
.arg("otto")
.arg("-v")
.assert()
.success();

Command::new(cargo_bin("ska"))
.current_dir(sandbox.get_wd())
.arg("align")
.arg("otto.skf")
.args(&["--filter", "no-filter"])
.assert()
.stdout_eq_path(sandbox.file_string("palindrome.stdout", TestDir::Correct));

// Single strand forces orientation so no ambiguity
Command::new(cargo_bin("ska"))
.current_dir(sandbox.get_wd())
.arg("build")
.arg("-k")
.arg("15")
.arg(sandbox.file_string("palindrome_1.fa", TestDir::Input))
.arg(sandbox.file_string("palindrome_2.fa", TestDir::Input))
.arg("-o")
.arg("otan")
.arg("--single-strand")
.assert()
.success();

Command::new(cargo_bin("ska"))
.current_dir(sandbox.get_wd())
.arg("align")
.arg("otan.skf")
.assert()
.stdout_eq_path(sandbox.file_string("palindrome_norc.stdout", TestDir::Correct));

// Check that palindrome k-mers (self-rc) are correctly ambiguous, and
// work correctly when multiple copies are present
Command::new(cargo_bin("ska"))
.current_dir(sandbox.get_wd())
.arg("build")
.arg("-k")
.arg("15")
.arg(sandbox.file_string("palindrome_reps_1.fa", TestDir::Input))
.arg(sandbox.file_string("palindrome_reps_2.fa", TestDir::Input))
.arg("-o")
.arg("ottootto")
.arg("-v")
.assert()
.success();

Command::new(cargo_bin("ska"))
.current_dir(sandbox.get_wd())
.arg("align")
.arg("ottootto.skf")
.args(&["--filter", "no-filter"])
.assert()
.stdout_eq_path(sandbox.file_string("palindrome_reps.stdout", TestDir::Correct));
}
2 changes: 2 additions & 0 deletions tests/test_files_in/palindrome_1.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>ottoA
GTGGAAGACTTCCACC
2 changes: 2 additions & 0 deletions tests/test_files_in/palindrome_2.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>ottoB
GTGGAAGTCTTCCACC
4 changes: 4 additions & 0 deletions tests/test_files_in/palindrome_reps_1.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>ottoB
GTGGAAGTCTTCCACA
>ottoB2
GTGGAAGTCTTCCACT
4 changes: 4 additions & 0 deletions tests/test_files_in/palindrome_reps_2.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>ottoB
GTGGAAGTCTTCCACC
>ottoB2
GTGGAAGCCTTCCACG
4 changes: 4 additions & 0 deletions tests/test_results_correct/palindrome.stdout
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>palindrome_1
W
>palindrome_2
W
4 changes: 4 additions & 0 deletions tests/test_results_correct/palindrome_norc.stdout
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>palindrome_1
A
>palindrome_2
T
4 changes: 4 additions & 0 deletions tests/test_results_correct/palindrome_reps.stdout
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>palindrome_reps_1
W
>palindrome_reps_2
N

0 comments on commit 6fa8953

Please sign in to comment.