From 8e28a4324b240337c48855e5b02a6f4ae23901e9 Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 21 Nov 2023 17:18:38 +0000 Subject: [PATCH 1/8] Change default min-count with strict default Can be more lenient as more filtering happens elsewhere --- src/cli.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 8ed36a0..4baf753 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -16,11 +16,11 @@ pub const DEFAULT_AMBIGMASK: bool = false; /// Default gap ignoring behaviour (at constant sites) pub const DEFAULT_CONSTGAPS: bool = false; /// Default minimum k-mer count for FASTQ files -pub const DEFAULT_MINCOUNT: u16 = 10; +pub const DEFAULT_MINCOUNT: u16 = 5; /// Default minimum base quality (PHRED score) for FASTQ files pub const DEFAULT_MINQUAL: u8 = 20; /// Default quality filtering criteria -pub const DEFAULT_QUALFILTER: QualFilter = QualFilter::Middle; +pub const DEFAULT_QUALFILTER: QualFilter = QualFilter::Strict; #[doc(hidden)] fn valid_kmer(s: &str) -> Result { @@ -153,7 +153,7 @@ pub enum Commands { min_qual: u8, /// Quality filtering criteria (with reads) - #[arg(long, value_enum, default_value_t = QualFilter::Strict)] + #[arg(long, value_enum, default_value_t = DEFAULT_QUALFILTER)] qual_filter: QualFilter, /// Number of CPU threads From d29cccfc97052f54a1eba117b671399f9b842246 Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 21 Nov 2023 17:27:18 +0000 Subject: [PATCH 2/8] Flow through new option No impl yet --- src/cli.rs | 10 ++++++++++ src/generic_modes.rs | 11 ++++++++++- src/lib.rs | 6 ++++++ src/merge_ska_array.rs | 1 + 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/cli.rs b/src/cli.rs index 4baf753..b540190 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -9,6 +9,8 @@ use super::QualFilter; pub const DEFAULT_KMER: usize = 17; /// Default single strand (which is equivalent to !rc) pub const DEFAULT_STRAND: bool = false; +/// Default behaviour when min-freq counting ambig sites +pub const DEFAULT_AMBIGMISSING: bool = false; /// Default repeat masking behaviour pub const DEFAULT_REPEATMASK: bool = false; /// Default ambiguous masking behaviour @@ -174,6 +176,10 @@ pub enum Commands { #[arg(short, long, value_parser = zero_to_one, default_value_t = 0.9)] min_freq: f64, + /// With min_freq, only count non-ambiguous sites + #[arg(long, default_value_t = DEFAULT_AMBIGMISSING)] + filter_ambig_as_missing: bool, + /// Filter for constant middle base sites #[arg(long, value_enum, default_value_t = FilterType::NoConst)] filter: FilterType, @@ -286,6 +292,10 @@ pub enum Commands { #[arg(short, long, value_parser = zero_to_one, default_value_t = 0.0)] min_freq: f64, + /// With min_freq, only count non-ambiguous sites + #[arg(long, default_value_t = DEFAULT_AMBIGMISSING)] + filter_ambig_as_missing: bool, + /// Filter for constant middle base sites #[arg(long, value_enum, default_value_t = FilterType::NoFilter)] filter: FilterType, diff --git a/src/generic_modes.rs b/src/generic_modes.rs index 7893e59..47d86ff 100644 --- a/src/generic_modes.rs +++ b/src/generic_modes.rs @@ -22,12 +22,13 @@ pub fn align UInt<'a>>( mask_ambig: bool, ignore_const_gaps: bool, min_freq: f64, + filter_ambig_as_missing: bool, ) { // In debug mode (cannot be set from CLI, give details) log::debug!("{ska_array}"); // Apply filters - apply_filters(ska_array, min_freq, filter, mask_ambig, ignore_const_gaps); + apply_filters(ska_array, min_freq, filter_ambig_as_missing, filter, mask_ambig, ignore_const_gaps); // Write out to file/stdout log::info!("Writing alignment"); @@ -103,6 +104,7 @@ pub fn merge UInt<'a>>( pub fn apply_filters UInt<'a>>( ska_array: &mut MergeSkaArray, min_freq: f64, + filter_ambig_as_missing: bool, filter: &FilterType, ambig_mask: bool, ignore_const_gaps: bool, @@ -112,6 +114,7 @@ pub fn apply_filters UInt<'a>>( log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask} no_gap_only_sites={ignore_const_gaps}"); ska_array.filter( filter_threshold, + filter_ambig_as_missing, filter, ambig_mask, ignore_const_gaps, @@ -134,9 +137,11 @@ pub fn distance UInt<'a>>( let mask_ambig = false; let ignore_const_gaps = false; + let filter_ambig_as_missing = false; let constant = apply_filters( ska_array, min_freq, + filter_ambig_as_missing, &FilterType::NoConst, mask_ambig, ignore_const_gaps, @@ -146,6 +151,7 @@ pub fn distance UInt<'a>>( apply_filters( ska_array, min_freq, + filter_ambig_as_missing, &FilterType::NoAmbigOrConst, mask_ambig, ignore_const_gaps, @@ -154,6 +160,7 @@ pub fn distance UInt<'a>>( apply_filters( ska_array, min_freq, + filter_ambig_as_missing, &FilterType::NoFilter, mask_ambig, ignore_const_gaps, @@ -211,6 +218,7 @@ pub fn weed UInt<'a>>( weed_file: &Option, reverse: bool, min_freq: f64, + filter_ambig_as_missing: bool, filter: &FilterType, ambig_mask: bool, ignore_const_gaps: bool, @@ -246,6 +254,7 @@ pub fn weed UInt<'a>>( let update_kmers = true; ska_array.filter( filter_threshold, + filter_ambig_as_missing, filter, ambig_mask, ignore_const_gaps, diff --git a/src/lib.rs b/src/lib.rs index a8756ae..e452fd3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -514,6 +514,7 @@ pub fn main() { input, output, min_freq, + filter_ambig_as_missing, filter, ambig_mask, no_gap_only_sites, @@ -530,6 +531,7 @@ pub fn main() { *ambig_mask, *no_gap_only_sites, *min_freq, + *filter_ambig_as_missing, ); } else if let Ok(mut ska_array) = load_array::(input, *threads) { // In debug mode (cannot be set from CLI, give details) @@ -541,6 +543,7 @@ pub fn main() { *ambig_mask, *no_gap_only_sites, *min_freq, + *filter_ambig_as_missing, ); } else { panic!("Could not read input file(s): {input:?}"); @@ -658,6 +661,7 @@ pub fn main() { output, reverse, min_freq, + filter_ambig_as_missing, ambig_mask, no_gap_only_sites, filter, @@ -669,6 +673,7 @@ pub fn main() { weed_file, *reverse, *min_freq, + *filter_ambig_as_missing, filter, *ambig_mask, *no_gap_only_sites, @@ -684,6 +689,7 @@ pub fn main() { weed_file, *reverse, *min_freq, + *filter_ambig_as_missing, filter, *ambig_mask, *no_gap_only_sites, diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs index 9bca332..1a4b539 100644 --- a/src/merge_ska_array.rs +++ b/src/merge_ska_array.rs @@ -245,6 +245,7 @@ where pub fn filter( &mut self, min_count: usize, + filter_ambig_as_missing: bool, filter: &FilterType, mask_ambig: bool, ignore_const_gaps: bool, From 552ab4cacaf71bdd6a7ea64a77736a1efe7ef873 Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 21 Nov 2023 17:27:32 +0000 Subject: [PATCH 3/8] bump version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 02e3e65..f010c13 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ska" -version = "0.3.4" +version = "0.3.5" authors = [ "John Lees ", "Simon Harris ", From 24a1d8366107bccb779ad667be0cc90e231658dd Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 21 Nov 2023 17:40:04 +0000 Subject: [PATCH 4/8] Add code for ambiguous as missing --- src/generic_modes.rs | 9 ++++++++- src/merge_ska_array.rs | 20 +++++++++++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/generic_modes.rs b/src/generic_modes.rs index 47d86ff..833e28c 100644 --- a/src/generic_modes.rs +++ b/src/generic_modes.rs @@ -28,7 +28,14 @@ pub fn align UInt<'a>>( log::debug!("{ska_array}"); // Apply filters - apply_filters(ska_array, min_freq, filter_ambig_as_missing, filter, mask_ambig, ignore_const_gaps); + apply_filters( + ska_array, + min_freq, + filter_ambig_as_missing, + filter, + mask_ambig, + ignore_const_gaps, + ); // Write out to file/stdout log::info!("Writing alignment"); diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs index 1a4b539..5945995 100644 --- a/src/merge_ska_array.rs +++ b/src/merge_ska_array.rs @@ -101,13 +101,20 @@ where /// Update `variant_count` after changing `variants`. /// /// Recalculates counts, and removes any totally empty rows. - fn update_counts(&mut self) { + /// + /// # Arguments + /// + /// - `filter_ambig_as_missing` -- any non-ACGTU base counts as missing. + fn update_counts(&mut self, filter_ambig_as_missing: bool) { let mut new_counts = Vec::with_capacity(self.variant_count.len()); let mut new_sk = Vec::with_capacity(self.split_kmers.len()); let mut new_variants = Array2::zeros((0, self.names.len())); for (var_row, sk) in self.variants.outer_iter().zip(self.split_kmers.iter()) { - let count = var_row.iter().filter(|b| **b != b'-').count(); + let count = var_row + .iter() + .filter(|b| **b != b'-' && (!filter_ambig_as_missing || !is_ambiguous(**b))) + .count(); if count > 0 { new_counts.push(count); new_sk.push(*sk); @@ -224,7 +231,7 @@ where } self.variants = filtered_variants; self.names = new_names; - self.update_counts(); + self.update_counts(false); } /// Filters variants (middle bases) by frequency. @@ -331,6 +338,13 @@ where if update_kmers { self.split_kmers = filtered_kmers; } + + if filter_ambig_as_missing { + let before_count = self.variant_count.len(); + self.update_counts(true); + removed += (before_count - self.variant_count.len()) as i32; + } + log::info!("Filtering removed {removed} split k-mers"); // Replace any ambiguous variants with Ns, if requested From 0418efd0b1f12a4c99c8a4f837f15b71e15dfb4a Mon Sep 17 00:00:00 2001 From: John Lees Date: Wed, 22 Nov 2023 13:21:22 +0000 Subject: [PATCH 5/8] Change recount to before other filters --- src/generic_modes.rs | 4 ++-- src/io_utils.rs | 2 +- src/merge_ska_array.rs | 17 ++++++++++------- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/generic_modes.rs b/src/generic_modes.rs index 833e28c..b3192bb 100644 --- a/src/generic_modes.rs +++ b/src/generic_modes.rs @@ -118,7 +118,7 @@ pub fn apply_filters UInt<'a>>( ) -> i32 { let update_kmers = false; let filter_threshold = f64::ceil(ska_array.nsamples() as f64 * min_freq) as usize; - log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask} no_gap_only_sites={ignore_const_gaps}"); + log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} filter_ambig_as_missing={filter_ambig_as_missing} ambig_mask={ambig_mask} no_gap_only_sites={ignore_const_gaps}"); ska_array.filter( filter_threshold, filter_ambig_as_missing, @@ -257,7 +257,7 @@ pub fn weed UInt<'a>>( let filter_threshold = f64::floor(ska_array.nsamples() as f64 * min_freq) as usize; if filter_threshold > 0 || *filter != FilterType::NoFilter || ambig_mask || ignore_const_gaps { - log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask} no_gap_only_sites={ignore_const_gaps}"); + log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} filter_ambig_as_missing={filter_ambig_as_missing} ambig_mask={ambig_mask} no_gap_only_sites={ignore_const_gaps}"); let update_kmers = true; ska_array.filter( filter_threshold, diff --git a/src/io_utils.rs b/src/io_utils.rs index 46b284d..11ed47f 100644 --- a/src/io_utils.rs +++ b/src/io_utils.rs @@ -59,7 +59,7 @@ pub fn load_array UInt<'a>>( ) -> Result, Box> { // Obtain a merged ska array if input.len() == 1 { - log::info!("Single file as input, trying to load as skf"); + log::info!("Single file as input, trying to load as skf {}-bits", IntT::n_bits()); MergeSkaArray::load(input[0].as_str()) } else { log::info!("Multiple files as input, running ska build with default settings"); diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs index 5945995..808b0dc 100644 --- a/src/merge_ska_array.rs +++ b/src/merge_ska_array.rs @@ -106,9 +106,11 @@ where /// /// - `filter_ambig_as_missing` -- any non-ACGTU base counts as missing. fn update_counts(&mut self, filter_ambig_as_missing: bool) { + log::info!("Updating variant counts"); let mut new_counts = Vec::with_capacity(self.variant_count.len()); let mut new_sk = Vec::with_capacity(self.split_kmers.len()); + let mut empty: usize = 0; let mut new_variants = Array2::zeros((0, self.names.len())); for (var_row, sk) in self.variants.outer_iter().zip(self.split_kmers.iter()) { let count = var_row @@ -119,8 +121,11 @@ where new_counts.push(count); new_sk.push(*sk); new_variants.push_row(var_row).unwrap(); + } else { + empty += 1; } } + log::info!("Removed {empty} empty variants"); self.split_kmers = new_sk; self.variants = new_variants; self.variant_count = new_counts; @@ -267,6 +272,11 @@ where let mut filtered_counts = Vec::new(); let mut filtered_kmers = Vec::new(); let mut removed = 0; + + if filter_ambig_as_missing { + self.update_counts(true); + } + for count_it in self .variant_count .iter() @@ -338,13 +348,6 @@ where if update_kmers { self.split_kmers = filtered_kmers; } - - if filter_ambig_as_missing { - let before_count = self.variant_count.len(); - self.update_counts(true); - removed += (before_count - self.variant_count.len()) as i32; - } - log::info!("Filtering removed {removed} split k-mers"); // Replace any ambiguous variants with Ns, if requested From 217c399a7335def5f6a0a8c884f96b84db13160d Mon Sep 17 00:00:00 2001 From: John Lees Date: Wed, 22 Nov 2023 13:37:42 +0000 Subject: [PATCH 6/8] Fix the doc tests --- src/lib.rs | 3 ++- src/merge_ska_array.rs | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index e452fd3..838a5a5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -368,11 +368,12 @@ //! //! // Apply filters //! let min_count = 2; +//! let filter_ambig_as_missing = false; //! let update_kmers = false; //! let filter = FilterType::NoConst; //! let ignore_const_gaps = false; //! let ambig_mask = false; -//! ska_array.filter(min_count, &filter, ambig_mask, ignore_const_gaps, update_kmers); +//! ska_array.filter(min_count, filter_ambig_as_missing, &filter, ambig_mask, ignore_const_gaps, update_kmers); //! //! // Write out to stdout //! let mut out_stream = set_ostream(&None); diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs index 808b0dc..47893b8 100644 --- a/src/merge_ska_array.rs +++ b/src/merge_ska_array.rs @@ -54,11 +54,12 @@ use crate::cli::FilterType; /// /// // Remove constant sites and save /// let min_count = 1; // no filtering by minor allele frequency +/// let filter_ambig_as_missing = false; // allow ambiguous bases when counting allele frequency /// let filter = FilterType::NoAmbigOrConst; // remove sites with no minor allele /// let mask_ambiguous = false; // leave ambiguous sites as IUPAC codes /// let ignore_const_gaps = false; // keep sites with only '-' as variants /// let update_counts = true; // keep counts updated, as saving -/// ska_array.filter(min_count, &filter, mask_ambiguous, ignore_const_gaps, update_counts); +/// ska_array.filter(min_count, filter_ambig_as_missing, &filter, mask_ambiguous, ignore_const_gaps, update_counts); /// ska_array.save(&"no_const_sites.skf"); /// /// // Create an iterators From e0bc56b32d2aedd917c09b06edbf0deba072d37c Mon Sep 17 00:00:00 2001 From: John Lees Date: Wed, 22 Nov 2023 14:07:59 +0000 Subject: [PATCH 7/8] Add unit test for count function --- src/merge_ska_array.rs | 49 +++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs index 47893b8..b9f37dd 100644 --- a/src/merge_ska_array.rs +++ b/src/merge_ska_array.rs @@ -651,21 +651,39 @@ mod tests { use ndarray::array; fn setup_struct() -> MergeSkaArray { - let example_array = MergeSkaArray:: { + // Populate with some initial data. + MergeSkaArray:: { k: 31, rc: true, - split_kmers: vec![0, 1], - variants: array![[1, 2, 3], [4, 5, 6]], - variant_count: vec![3, 3], - ska_version: "NA".to_string(), - k_bits: 64, names: vec![ "Sample1".to_string(), "Sample2".to_string(), "Sample3".to_string(), ], - }; - example_array + split_kmers: vec![1, 2, 3], + variants: array![[b'A', b'G', b'Y'], [b'T', b'-', b'Y'], [b'N', b'Y', b'Y']], + variant_count: vec![3, 2, 3], + ska_version: "NA".to_string(), + k_bits: 64, + } + } + + #[test] + fn test_update_counts() { + let mut merge_ska_array = setup_struct(); + + // Test with filter_ambig_as_missing = true + merge_ska_array.update_counts(true); + + // Check if variant counts are updated correctly + assert_eq!(merge_ska_array.variant_count, vec![2, 1]); // Expected counts + + // Check if variants are updated correctly + // Assuming `variants` should now contain only the non-empty rows + assert_eq!(merge_ska_array.variants, array![[b'A', b'G', b'Y'], [b'T', b'-', b'Y']]); + + // Check if split_kmers are updated correctly + assert_eq!(merge_ska_array.split_kmers, vec![1, 2]); // Expected kmers } #[test] @@ -675,13 +693,18 @@ mod tests { // First iteration let (kmer, vars) = iter.next().unwrap(); - assert_eq!(kmer, 0); - assert_eq!(vars, vec![1, 2, 3]); + assert_eq!(kmer, 1); + assert_eq!(vars, vec![b'A', b'G', b'Y']); // Second iteration let (kmer, vars) = iter.next().unwrap(); - assert_eq!(kmer, 1); - assert_eq!(vars, vec![4, 5, 6]); + assert_eq!(kmer, 2); + assert_eq!(vars, vec![b'T', b'-', b'Y']); + + // Third iteration + let (kmer, vars) = iter.next().unwrap(); + assert_eq!(kmer, 3); + assert_eq!(vars, vec![b'N', b'Y', b'Y']); // No more items assert!(iter.next().is_none()); @@ -695,7 +718,7 @@ mod tests { // Check that the samples were deleted assert_eq!(ska_array.names, vec!["Sample3"]); - assert_eq!(ska_array.variants, array![[3], [6]]); + assert_eq!(ska_array.variants, array![[b'Y'], [b'Y'], [b'Y']]); } #[test] From 9e550028cefdaea39adb49d72e9e9801d11c4044 Mon Sep 17 00:00:00 2001 From: John Lees Date: Wed, 22 Nov 2023 14:15:35 +0000 Subject: [PATCH 8/8] Hit new line in integration tests --- tests/align.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/align.rs b/tests/align.rs index a6a93be..f8ede5b 100644 --- a/tests/align.rs +++ b/tests/align.rs @@ -183,6 +183,7 @@ fn filters() { .arg(sandbox.file_string("merge_k9.skf", TestDir::Input)) .arg("--filter") .arg("no-ambig") + .arg("--filter-ambig-as-missing") .arg("-v") .output() .unwrap()