From 3d1b0bd274feb52e3d2dd98340e873b36c0518b6 Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 23 Jan 2023 13:20:11 +0000 Subject: [PATCH 01/14] Start work on map test and fixing (not compiling yet, see TODOs) --- src/ska_dict/count_min_filter.rs | 1 + src/ska_ref.rs | 95 +++++++++++++++++++------------- src/ska_ref/aln_writer.rs | 92 +++++++++++++++++++++++++++++++ tests/align.rs | 4 +- tests/common/mod.rs | 24 ++++++-- tests/fastq_input.rs | 12 ++-- tests/map.rs | 52 ++++++++++++++++- 7 files changed, 229 insertions(+), 51 deletions(-) create mode 100644 src/ska_ref/aln_writer.rs diff --git a/src/ska_dict/count_min_filter.rs b/src/ska_dict/count_min_filter.rs index 9bebfbf..6242d25 100644 --- a/src/ska_dict/count_min_filter.rs +++ b/src/ska_dict/count_min_filter.rs @@ -60,6 +60,7 @@ impl CountMin { // Consistent with consts used, but ensures a power of two let width_bits: usize = f64::floor(f64::log2(width as f64)) as usize; let width = 1 << (width_bits + 1); + // TODO change this to use MSB rather than LSB let mask = width as u64 - 1; // Reserve for these gets call by the vec! macro used in init diff --git a/src/ska_ref.rs b/src/ska_ref.rs index f0cbaa6..210cb73 100644 --- a/src/ska_ref.rs +++ b/src/ska_ref.rs @@ -27,6 +27,7 @@ //! ``` use std::str; +use std::io::Write; use noodles_vcf::{ self as vcf, @@ -43,7 +44,6 @@ use noodles_vcf::{ }, Record, }; -use std::io::Write; extern crate needletail; use ndarray::{s, Array2, ArrayView}; @@ -52,6 +52,9 @@ use needletail::{ parser::{write_fasta, Format}, }; +pub mod aln_writer; +use crate::ska_ref::aln_writer::AlnWriter; + use crate::merge_ska_dict::MergeSkaDict; use crate::ska_dict::bit_encoding::RC_IUPAC; use crate::ska_dict::split_kmer::SplitKmer; @@ -298,16 +301,16 @@ impl RefSka { /// Rows are variants, columns are samples, so this is broadly equivalent /// to writing out the mapped variant array and its metadata. /// - /// Extra work is used to check for unmapped missing bases between split k-mer + /// - Extra work is used to check for unmapped missing bases between split k-mer /// matches. - /// - /// A basic VCF header is written out first. - /// - /// Variants are then written to the output buffer one at a time. + /// - A basic VCF header is written out first. + /// - Variants are then written to the output buffer one at a time. + /// - Any ambiguous base is currently output as N, for simplicity. /// /// # Panics /// - /// If [`RefSka::map()`] has not been run yet, or no split-kmers mapped + /// If [`RefSka::map()`] has not been run yet, or no split-kmers mapped to + /// the reference. pub fn write_vcf(&self, f: &mut W) -> Result<(), std::io::Error> { if !self.is_mapped() { panic!("No split k-mers mapped to reference"); @@ -434,7 +437,10 @@ impl RefSka { /// /// # Panics /// - /// If [`RefSka::map()`] has not been run yet, or no split-kmers mapped + /// If [`RefSka::map()`] has not been run yet, or no split-kmers mapped. + /// + /// Will panic if output length not the same as reference, to safeguard against + /// terror in the implementation pub fn write_aln(&self, f: &mut W) -> Result<(), needletail::errors::ParseError> { if !self.is_mapped() { panic!("TNo split k-mers mapped to reference"); @@ -448,17 +454,19 @@ impl RefSka { let mut seq: Vec = Vec::new(); seq.reserve(self.total_size); - // if this proves tricky, it might be better to iterate through non-missing - // matches and paste each flanking end and middle base into the right place - // in the vec (and skip to next match where right end is beyond last paste) - // the alternative would be to: - // - allocate all missing - // - iterate through ref - // - write the whole k-mer in when found (ref, base, ref) - // - then skip over k in ref - // (even just modifying map to skip over k would work, but not a VCF) - let (mut next_pos, mut curr_chrom) = (0, 0); + // TODO: there needs to be a small amount past the last k-mer added sometimes, + // as the last matching k-mer would extend, but we skipped over it + // TODO: same for chromosome skip + let (mut next_pos, mut curr_chrom, mut last_mapped) = (0, 0, 0); for ((map_chrom, map_pos), base) in self.mapped_pos.iter().zip(sample_vars.iter()) { + println!("{map_pos} {next_pos} {}", String::from_utf8(seq.clone()).unwrap()); + if *base == b'-' { + continue; + } + if *map_pos < next_pos { + last_mapped = *map_pos + half_split_len; + continue; + } // Move forward to next chromosome/contig if *map_chrom > curr_chrom { seq.extend_from_slice( @@ -471,35 +479,45 @@ impl RefSka { curr_chrom += 1; next_pos = 0; } - if *map_pos > next_pos { - if *map_pos > next_pos + half_split_len { - // Missing bases, then first flanking half of split k-mer - // if no k-mers mapped over a large region - seq.extend(vec![b'-'; *map_pos - next_pos - half_split_len]); + if *map_pos > next_pos + half_split_len { + if *map_pos - next_pos - half_split_len > last_mapped { seq.extend_from_slice( - &self.seq[curr_chrom][(*map_pos - half_split_len)..*map_pos], + &self.seq[curr_chrom][seq.len()..(last_mapped + half_split_len)], ); - } else { - // Short missing region, which can happen with fastq input - // Reasonable to take bases from match - seq.extend_from_slice(&self.seq[curr_chrom][next_pos..*map_pos]); } + // Missing bases + seq.extend(vec![b'-'; *map_pos - next_pos - half_split_len]); } - next_pos = *map_pos + 1; - if *base == b'-' { - // This is around a split k-mer match, so we can fill in the - // flanking region with the reference - seq.push(self.seq[curr_chrom][*map_pos]); - } else { - seq.push(*base); - } + // First half of split k-mer + seq.extend_from_slice( + &self.seq[curr_chrom][(*map_pos - half_split_len)..*map_pos], + ); + // Middle base + seq.push(*base); + // Second half of split k-mer + seq.extend_from_slice( + &self.seq[curr_chrom][(*map_pos + 1)..=(*map_pos + half_split_len)], + ); + next_pos = *map_pos + self.k; } // Fill up to end of contig - seq.extend_from_slice(&self.seq[curr_chrom][next_pos..(next_pos + half_split_len)]); + if *map_pos - next_pos - half_split_len > last_mapped { + seq.extend_from_slice( + &self.seq[curr_chrom][last_mapped..(*map_pos - next_pos - half_split_len)], + ); + } seq.extend(vec![ b'-'; - self.seq[curr_chrom].len() - (next_pos + half_split_len) + self.seq[curr_chrom].len() - next_pos ]); + println!("{next_pos} {}", String::from_utf8(seq.clone()).unwrap()); + if seq.len() != self.total_size { + panic!( + "Internal error: output length {} not identical to chromosome length {}", + seq.len(), + self.total_size + ); + } write_fasta( sample_name.as_bytes(), seq.as_slice(), @@ -510,3 +528,4 @@ impl RefSka { Ok(()) } } + diff --git a/src/ska_ref/aln_writer.rs b/src/ska_ref/aln_writer.rs new file mode 100644 index 0000000..0a5c598 --- /dev/null +++ b/src/ska_ref/aln_writer.rs @@ -0,0 +1,92 @@ + +use std::{error::Error, fmt}; + +#[derive(Debug)] +pub struct AlnWriter<'a> { + next_pos: usize, + curr_chrom: usize, + last_mapped: usize, + last_written: usize, + ref_seq: &'a Vec>, + total_size: usize, + seq_out: Vec, + k: usize, + half_split_len: usize +} + +impl<'a> AlnWriter<'a> { + pub fn new(ref_seq: &'a Vec>, total_size: usize, k: usize) -> Self { + let (next_pos, curr_chrom, last_mapped, last_written) = (0, 0, 0, 0); + let seq_out = Vec::new(); + seq_out.reserve(total_size); + let half_split_len = (k - 1) / 2; + Self {next_pos, curr_chrom, last_mapped, last_written, ref_seq, total_size, seq_out, k, half_split_len} + } + + // TODO: fill to end of chromosome. Also called at the end + fn fill_contig(&mut self) { + seq.extend_from_slice( + &self.seq[curr_chrom][next_pos..(next_pos + half_split_len)], + ); + seq.extend(vec![ + b'-'; + self.seq[curr_chrom].len() - (next_pos + half_split_len) // This shouldn't overflow + ]); + curr_chrom += 1; + next_pos = 0; + } + + pub fn write_split_kmer(&mut self, mapped_pos: usize, mapped_chrom: usize, base: u8) { + if mapped_chrom > self.curr_chrom { + self.fill_contig(); + } + if base != b'-' { + if mapped_pos < self.next_pos { + self.last_mapped = mapped_pos + self.half_split_len; + } else { + // Write bases + // TODO easiest to write this out on a piece of paper + // probably want to calculate lengths first, then write to seq + // to make things clearer + // TODO bases at the end of last valid match not yet written + last_match_overhang = + seq.extend_from_slice( + &self.seq[curr_chrom][seq.len()..(last_mapped + half_split_len)], + ); + // TODO missing bases + seq.extend(vec![b'-'; *map_pos - next_pos - half_split_len]); + // TODO first half of split k-mer + seq.extend_from_slice( + &self.seq[curr_chrom][(*map_pos - half_split_len)..*map_pos], + ); + // TODO base + seq.push(*base); + // TODO second half of split k-mer + seq.extend_from_slice( + &self.seq[curr_chrom][(*map_pos + 1)..=(*map_pos + half_split_len)], + ); + // TODO update indices + next_pos = *map_pos + self.k; + } + } + } + + pub fn get_seq(&mut self) -> Result<&'a[u8], Box> { + self.fill_contig(); + if self.ref_seq.len() != self.total_size { + Result::Err(Box::new(self)) + } else { + Result::Ok(self.seq_out.as_slice()) + } + } +} + +impl Error for &mut AlnWriter<'_> {} + +impl fmt::Display for AlnWriter<'_> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Internal error: output length {} not identical to chromosome length {}", + self.ref_seq.len(), + self.total_size) + } +} diff --git a/tests/align.rs b/tests/align.rs index ec95c71..d215581 100644 --- a/tests/align.rs +++ b/tests/align.rs @@ -1,7 +1,7 @@ use snapbox::cmd::{cargo_bin, Command}; pub mod common; -use crate::common::{var_hash, TestDir, TestSetup}; +use crate::common::*; use hashbrown::HashSet; @@ -11,7 +11,7 @@ use hashbrown::HashSet; fn build_cli() { let sandbox = TestSetup::setup(); // Create an rfile in the tmp dir - let rfile_name = sandbox.create_rfile(&"test", false); + let rfile_name = sandbox.create_rfile(&"test", FxType::Fasta); Command::new(cargo_bin("ska")) .current_dir(sandbox.get_wd()) diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 3052167..627c3a0 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -22,6 +22,12 @@ pub enum TestDir { Correct, } +#[derive(Debug, PartialEq, Copy, Clone)] +pub enum FxType { + Fasta, + Fastq, +} + pub struct TestSetup { wd: TempDir, } @@ -81,14 +87,14 @@ impl TestSetup { predicate_fn.eval(self.wd.child(name_out).path()) } - pub fn create_rfile(&self, prefix: &str, fastq: bool) -> &str { + pub fn create_rfile(&self, prefix: &str, fx_type: FxType) -> &str { // Create an rfile in the tmp dir let mut rfile = LineWriter::new( File::create(format!("{}/{}", self.get_wd(), RFILE_NAME)) .expect("Could not write rfile"), ); - match fastq { - true => { + match fx_type { + FxType::Fastq => { writeln!( rfile, "{}", @@ -112,7 +118,7 @@ impl TestSetup { ) .unwrap(); } - false => { + FxType::Fasta => { writeln!( rfile, "{}", @@ -178,3 +184,13 @@ pub fn var_hash(aln_string: &[u8]) -> HashSet> { return variant_pairs; } + +// Helper for comparing mapped alignments with different sample names +pub fn cmp_map_aln(aln1: &[u8], aln2: &[u8]) { + let aln1_str = String::from_utf8(aln1.to_vec()).unwrap(); + let aln2_str = String::from_utf8(aln2.to_vec()).unwrap(); + + for (line1, line2) in aln1_str.lines().zip(aln2_str.lines()).skip(1).step_by(2) { + assert_eq!(line1, line2); + } +} diff --git a/tests/fastq_input.rs b/tests/fastq_input.rs index 5804fba..f4fed97 100644 --- a/tests/fastq_input.rs +++ b/tests/fastq_input.rs @@ -1,13 +1,13 @@ use snapbox::cmd::{cargo_bin, Command}; pub mod common; -use crate::common::{var_hash, TestDir, TestSetup}; +use crate::common::*; // Uses perfect reads with the same fasta input as merge.skf #[test] fn align_fastq() { let sandbox = TestSetup::setup(); - let rfile_name = sandbox.create_rfile(&"test", true); + let rfile_name = sandbox.create_rfile(&"test", FxType::Fastq); Command::new(cargo_bin("ska")) .current_dir(sandbox.get_wd()) @@ -54,7 +54,7 @@ fn align_fastq() { #[test] fn map_fastq() { let sandbox = TestSetup::setup(); - let rfile_name = sandbox.create_rfile(&"test", true); + let rfile_name = sandbox.create_rfile(&"test", FxType::Fastq); Command::new(cargo_bin("ska")) .current_dir(sandbox.get_wd()) @@ -136,7 +136,7 @@ fn error_fastq() { let sandbox = TestSetup::setup(); // Without errors - let rfile_name = sandbox.create_rfile(&"test", true); + let rfile_name = sandbox.create_rfile(&"test", FxType::Fastq); Command::new(cargo_bin("ska")) .current_dir(sandbox.get_wd()) .arg("build") @@ -159,7 +159,7 @@ fn error_fastq() { all_hash.remove(&vec!['C', 'T']); // With errors - let rfile_name = sandbox.create_rfile(&"test_error", true); + let rfile_name = sandbox.create_rfile(&"test_error", FxType::Fastq); Command::new(cargo_bin("ska")) .current_dir(sandbox.get_wd()) .arg("build") @@ -182,7 +182,7 @@ fn error_fastq() { assert_eq!(var_hash(&fastq_align_out_error), all_hash); // With low quality score - let rfile_name = sandbox.create_rfile(&"test_quality", true); + let rfile_name = sandbox.create_rfile(&"test_quality", FxType::Fastq); Command::new(cargo_bin("ska")) .current_dir(sandbox.get_wd()) .arg("build") diff --git a/tests/map.rs b/tests/map.rs index c5437c3..a8f246d 100644 --- a/tests/map.rs +++ b/tests/map.rs @@ -1,7 +1,7 @@ use snapbox::cmd::{cargo_bin, Command}; pub mod common; -use crate::common::{TestDir, TestSetup}; +use crate::common::*; // NB: to view output, uncomment the current_dir lines @@ -61,3 +61,53 @@ fn map_vcf() { .assert() .stdout_matches_path(sandbox.file_string("map_vcf.stdout", TestDir::Correct)); } + +#[test] +fn map_rev_comp() { + let sandbox = TestSetup::setup(); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("build") + .arg("-o") + .arg("rc_build") + .arg("-k") + .arg("15") + .arg(sandbox.file_string("test_1.fa", TestDir::Input)) + .arg(sandbox.file_string("test_2_rc.fa", TestDir::Input)) + .assert() + .success(); + + let rc_map = Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg("fwd_build.skf") + .output() + .unwrap() + .stdout; + + let fwd_map = Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg(sandbox.file_string("merge.skf", TestDir::Input)) + .output() + .unwrap() + .stdout; + + cmp_map_aln(&rc_map, &fwd_map); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("build") + .arg("-o") + .arg("ss_map") + .arg("-k") + .arg("15") + .arg("--single-strand") + .arg(sandbox.file_string("test_1.fa", TestDir::Input)) + .arg(sandbox.file_string("test_2_rc.fa", TestDir::Input)) + .assert() + .success(); +} From 19184f6a531882c27b34500099830a178de9be7f Mon Sep 17 00:00:00 2001 From: John Lees Date: Wed, 1 Feb 2023 15:14:01 +0000 Subject: [PATCH 02/14] Compiling aln writer --- src/ska_ref.rs | 75 +++------------------------ src/ska_ref/aln_writer.rs | 106 ++++++++++++++++++++++---------------- 2 files changed, 69 insertions(+), 112 deletions(-) diff --git a/src/ska_ref.rs b/src/ska_ref.rs index 210cb73..4e38357 100644 --- a/src/ska_ref.rs +++ b/src/ska_ref.rs @@ -443,84 +443,21 @@ impl RefSka { /// terror in the implementation pub fn write_aln(&self, f: &mut W) -> Result<(), needletail::errors::ParseError> { if !self.is_mapped() { - panic!("TNo split k-mers mapped to reference"); + panic!("No split k-mers mapped to reference"); } if self.chrom_names.len() > 1 { eprintln!("WARNING: Reference contained multiple contigs, in the output they will be concatenated"); } - let half_split_len = (self.k - 1) / 2; for (sample_idx, sample_name) in self.mapped_names.iter().enumerate() { let sample_vars = self.mapped_variants.slice(s![.., sample_idx]); - let mut seq: Vec = Vec::new(); - seq.reserve(self.total_size); - - // TODO: there needs to be a small amount past the last k-mer added sometimes, - // as the last matching k-mer would extend, but we skipped over it - // TODO: same for chromosome skip - let (mut next_pos, mut curr_chrom, mut last_mapped) = (0, 0, 0); - for ((map_chrom, map_pos), base) in self.mapped_pos.iter().zip(sample_vars.iter()) { - println!("{map_pos} {next_pos} {}", String::from_utf8(seq.clone()).unwrap()); - if *base == b'-' { - continue; - } - if *map_pos < next_pos { - last_mapped = *map_pos + half_split_len; - continue; - } - // Move forward to next chromosome/contig - if *map_chrom > curr_chrom { - seq.extend_from_slice( - &self.seq[curr_chrom][next_pos..(next_pos + half_split_len)], - ); - seq.extend(vec![ - b'-'; - self.seq[curr_chrom].len() - (next_pos + half_split_len) // This shouldn't overflow - ]); - curr_chrom += 1; - next_pos = 0; - } - if *map_pos > next_pos + half_split_len { - if *map_pos - next_pos - half_split_len > last_mapped { - seq.extend_from_slice( - &self.seq[curr_chrom][seq.len()..(last_mapped + half_split_len)], - ); - } - // Missing bases - seq.extend(vec![b'-'; *map_pos - next_pos - half_split_len]); - } - // First half of split k-mer - seq.extend_from_slice( - &self.seq[curr_chrom][(*map_pos - half_split_len)..*map_pos], - ); - // Middle base - seq.push(*base); - // Second half of split k-mer - seq.extend_from_slice( - &self.seq[curr_chrom][(*map_pos + 1)..=(*map_pos + half_split_len)], - ); - next_pos = *map_pos + self.k; - } - // Fill up to end of contig - if *map_pos - next_pos - half_split_len > last_mapped { - seq.extend_from_slice( - &self.seq[curr_chrom][last_mapped..(*map_pos - next_pos - half_split_len)], - ); - } - seq.extend(vec![ - b'-'; - self.seq[curr_chrom].len() - next_pos - ]); - println!("{next_pos} {}", String::from_utf8(seq.clone()).unwrap()); - if seq.len() != self.total_size { - panic!( - "Internal error: output length {} not identical to chromosome length {}", - seq.len(), - self.total_size - ); + let mut seq = AlnWriter::new(&self.seq, self.total_size, self.k); + for ((mapped_chrom, mapped_pos), base) in self.mapped_pos.iter().zip(sample_vars.iter()) { + seq.write_split_kmer(*mapped_pos, *mapped_chrom, *base); } + let aligned_seq = seq.get_seq().expect("Internal map error"); write_fasta( sample_name.as_bytes(), - seq.as_slice(), + aligned_seq, f, needletail::parser::LineEnding::Unix, )?; diff --git a/src/ska_ref/aln_writer.rs b/src/ska_ref/aln_writer.rs index 0a5c598..cf27637 100644 --- a/src/ska_ref/aln_writer.rs +++ b/src/ska_ref/aln_writer.rs @@ -10,30 +10,51 @@ pub struct AlnWriter<'a> { ref_seq: &'a Vec>, total_size: usize, seq_out: Vec, - k: usize, half_split_len: usize } +// TODO two things to try (and time) +// 1) Copy the whole seq then only edit the middle base and missing bases +// 2) Parallelise this, and write out at the end impl<'a> AlnWriter<'a> { pub fn new(ref_seq: &'a Vec>, total_size: usize, k: usize) -> Self { - let (next_pos, curr_chrom, last_mapped, last_written) = (0, 0, 0, 0); - let seq_out = Vec::new(); + let (curr_chrom, last_mapped, last_written) = (0, 0, 0); + let mut seq_out = Vec::new(); seq_out.reserve(total_size); let half_split_len = (k - 1) / 2; - Self {next_pos, curr_chrom, last_mapped, last_written, ref_seq, total_size, seq_out, k, half_split_len} + let next_pos = half_split_len; + Self {next_pos, curr_chrom, last_mapped, last_written, ref_seq, total_size, seq_out, half_split_len} } - // TODO: fill to end of chromosome. Also called at the end + // Fill to end of chromosome. Also called at the end + // AC[GGC]----[AT | C`] + // 1 2 3 + // 1: Last written -> last map-pos + half-k + // 2: Last mapped + half-k + // 3: Mapped position + fn fill_to(&mut self, end: usize) { + // bases at the end of last valid match not yet written + let last_written_base = self.last_mapped + self.half_split_len; + let last_match_overhang = (self.last_mapped + self.half_split_len).saturating_sub(last_written_base); + if last_match_overhang > 0 { + self.seq_out.extend_from_slice( + &self.ref_seq[self.curr_chrom][last_written_base..(last_written_base + last_match_overhang)] + ); + } + + // 2-start of split k-mer: missing bases + let missing_bases = end.saturating_sub(self.last_mapped).saturating_sub(2 * self.half_split_len); + if missing_bases > 0 { + self.seq_out.extend(vec![b'-'; missing_bases]); + } + + } + + // Fill up to the end of the contig and update indices fn fill_contig(&mut self) { - seq.extend_from_slice( - &self.seq[curr_chrom][next_pos..(next_pos + half_split_len)], - ); - seq.extend(vec![ - b'-'; - self.seq[curr_chrom].len() - (next_pos + half_split_len) // This shouldn't overflow - ]); - curr_chrom += 1; - next_pos = 0; + self.fill_to(self.ref_seq[self.curr_chrom].len()); + self.curr_chrom += 1; + self.next_pos = self.half_split_len; } pub fn write_split_kmer(&mut self, mapped_pos: usize, mapped_chrom: usize, base: u8) { @@ -41,37 +62,36 @@ impl<'a> AlnWriter<'a> { self.fill_contig(); } if base != b'-' { - if mapped_pos < self.next_pos { - self.last_mapped = mapped_pos + self.half_split_len; - } else { - // Write bases - // TODO easiest to write this out on a piece of paper - // probably want to calculate lengths first, then write to seq - // to make things clearer - // TODO bases at the end of last valid match not yet written - last_match_overhang = - seq.extend_from_slice( - &self.seq[curr_chrom][seq.len()..(last_mapped + half_split_len)], - ); - // TODO missing bases - seq.extend(vec![b'-'; *map_pos - next_pos - half_split_len]); - // TODO first half of split k-mer - seq.extend_from_slice( - &self.seq[curr_chrom][(*map_pos - half_split_len)..*map_pos], - ); - // TODO base - seq.push(*base); - // TODO second half of split k-mer - seq.extend_from_slice( - &self.seq[curr_chrom][(*map_pos + 1)..=(*map_pos + half_split_len)], - ); - // TODO update indices - next_pos = *map_pos + self.k; - } + return; } + if mapped_pos < self.next_pos { + self.last_mapped = mapped_pos; + return; + } + + // Write bases between last match and this one + self.fill_to(mapped_pos); + + // First half of split k-mer + self.seq_out.extend_from_slice( + &self.ref_seq[self.curr_chrom][(mapped_pos - self.half_split_len)..mapped_pos], + ); + // Middle base + self.seq_out.push(base); + // Second half of split k-mer + self.seq_out.extend_from_slice( + &self.ref_seq[self.curr_chrom][(mapped_pos + 1)..=(mapped_pos + self.half_split_len)], + ); + + // update indices + self.next_pos = mapped_pos + 2 * self.half_split_len; + self.last_mapped = mapped_pos; + self.last_written = mapped_pos + self.half_split_len; + + return; } - pub fn get_seq(&mut self) -> Result<&'a[u8], Box> { + pub fn get_seq(&'a mut self) -> Result<&'a[u8], Box> { self.fill_contig(); if self.ref_seq.len() != self.total_size { Result::Err(Box::new(self)) @@ -85,7 +105,7 @@ impl Error for &mut AlnWriter<'_> {} impl fmt::Display for AlnWriter<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Internal error: output length {} not identical to chromosome length {}", + write!(f, "Output length {} not identical to chromosome length {}", self.ref_seq.len(), self.total_size) } From d7c89acf155c572bd31d0f3a6745cd84571e18dd Mon Sep 17 00:00:00 2001 From: John Lees Date: Wed, 1 Feb 2023 20:06:18 +0000 Subject: [PATCH 03/14] Fix some more problems, but start another approach I think better to start with full length seq and write into it --- src/ska_ref/aln_writer.rs | 56 +++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/src/ska_ref/aln_writer.rs b/src/ska_ref/aln_writer.rs index cf27637..c6e7fde 100644 --- a/src/ska_ref/aln_writer.rs +++ b/src/ska_ref/aln_writer.rs @@ -1,7 +1,6 @@ use std::{error::Error, fmt}; -#[derive(Debug)] pub struct AlnWriter<'a> { next_pos: usize, curr_chrom: usize, @@ -19,8 +18,7 @@ pub struct AlnWriter<'a> { impl<'a> AlnWriter<'a> { pub fn new(ref_seq: &'a Vec>, total_size: usize, k: usize) -> Self { let (curr_chrom, last_mapped, last_written) = (0, 0, 0); - let mut seq_out = Vec::new(); - seq_out.reserve(total_size); + let seq_out = vec![b'-'; total_size]; let half_split_len = (k - 1) / 2; let next_pos = half_split_len; Self {next_pos, curr_chrom, last_mapped, last_written, ref_seq, total_size, seq_out, half_split_len} @@ -32,36 +30,43 @@ impl<'a> AlnWriter<'a> { // 1: Last written -> last map-pos + half-k // 2: Last mapped + half-k // 3: Mapped position - fn fill_to(&mut self, end: usize) { + fn fill_to(&mut self, end_position: usize, contig_end: bool) { // bases at the end of last valid match not yet written - let last_written_base = self.last_mapped + self.half_split_len; - let last_match_overhang = (self.last_mapped + self.half_split_len).saturating_sub(last_written_base); - if last_match_overhang > 0 { - self.seq_out.extend_from_slice( - &self.ref_seq[self.curr_chrom][last_written_base..(last_written_base + last_match_overhang)] - ); + if self.last_written > 0 { + let last_match_overhang = (self.last_mapped + self.half_split_len).saturating_sub(self.last_written); + if last_match_overhang > 0 { + self.seq_out.extend_from_slice( + &self.ref_seq[self.curr_chrom][(self.last_written + 1)..(self.last_written + last_match_overhang)] + ); + } } - + println!("{}", String::from_utf8(self.seq_out.clone()).unwrap()); // 2-start of split k-mer: missing bases - let missing_bases = end.saturating_sub(self.last_mapped).saturating_sub(2 * self.half_split_len); + let mut missing_bases = end_position.saturating_sub(self.last_mapped + self.half_split_len); + // This accounts for the fact that at the start of the sequence no bases have + // been written yet + if !contig_end && self.last_written > 0 { + missing_bases = missing_bases.saturating_sub(self.half_split_len); + } if missing_bases > 0 { self.seq_out.extend(vec![b'-'; missing_bases]); } - + println!("{}", String::from_utf8(self.seq_out.clone()).unwrap()); } // Fill up to the end of the contig and update indices fn fill_contig(&mut self) { - self.fill_to(self.ref_seq[self.curr_chrom].len()); + self.fill_to(self.ref_seq[self.curr_chrom].len(), true); self.curr_chrom += 1; self.next_pos = self.half_split_len; } pub fn write_split_kmer(&mut self, mapped_pos: usize, mapped_chrom: usize, base: u8) { + println!("{} {}", mapped_pos, base); if mapped_chrom > self.curr_chrom { self.fill_contig(); } - if base != b'-' { + if base == b'-' { return; } if mapped_pos < self.next_pos { @@ -70,7 +75,9 @@ impl<'a> AlnWriter<'a> { } // Write bases between last match and this one - self.fill_to(mapped_pos); + if mapped_pos > self.next_pos { + self.fill_to(mapped_pos, false); + } // First half of split k-mer self.seq_out.extend_from_slice( @@ -82,9 +89,10 @@ impl<'a> AlnWriter<'a> { self.seq_out.extend_from_slice( &self.ref_seq[self.curr_chrom][(mapped_pos + 1)..=(mapped_pos + self.half_split_len)], ); + println!("{}", String::from_utf8(self.seq_out.clone()).unwrap()); // update indices - self.next_pos = mapped_pos + 2 * self.half_split_len; + self.next_pos = mapped_pos + 2 * self.half_split_len + 1; self.last_mapped = mapped_pos; self.last_written = mapped_pos + self.half_split_len; @@ -92,8 +100,10 @@ impl<'a> AlnWriter<'a> { } pub fn get_seq(&'a mut self) -> Result<&'a[u8], Box> { + println!("{}", String::from_utf8(self.seq_out.clone()).unwrap()); self.fill_contig(); - if self.ref_seq.len() != self.total_size { + println!("{}", String::from_utf8(self.seq_out.clone()).unwrap()); + if self.seq_out.len() != self.total_size { Result::Err(Box::new(self)) } else { Result::Ok(self.seq_out.as_slice()) @@ -104,9 +114,15 @@ impl<'a> AlnWriter<'a> { impl Error for &mut AlnWriter<'_> {} impl fmt::Display for AlnWriter<'_> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", String::from_utf8(self.seq_out.clone()).unwrap()) + } +} + +impl fmt::Debug for AlnWriter<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "Output length {} not identical to chromosome length {}", - self.ref_seq.len(), + self.seq_out.len(), self.total_size) } -} +} \ No newline at end of file From 70a127bb12ffa629b3123db5fd8deccc43222682 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 3 Feb 2023 11:32:41 +0000 Subject: [PATCH 04/14] Parallelise align writer --- src/lib.rs | 4 +- src/ska_ref.rs | 40 +++++++----- src/ska_ref/aln_writer.rs | 134 +++++++++++++++----------------------- 3 files changed, 79 insertions(+), 99 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 3c025d3..c5f50d6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -348,9 +348,9 @@ pub fn main() { let mut out_stream = set_ostream(output); match format { FileType::Aln => { - log::info!("Writing alignment"); + log::info!("Writing alignment with {} threads", *threads); ska_ref - .write_aln(&mut out_stream) + .write_aln(&mut out_stream, *threads) .expect("Failed to write output alignment"); } FileType::Vcf => { diff --git a/src/ska_ref.rs b/src/ska_ref.rs index 4e38357..898df94 100644 --- a/src/ska_ref.rs +++ b/src/ska_ref.rs @@ -26,8 +26,10 @@ //! ref_kmers.write_aln(&mut out_stream); //! ``` -use std::str; use std::io::Write; +use std::str; + +use rayon::prelude::*; use noodles_vcf::{ self as vcf, @@ -93,8 +95,6 @@ pub struct RefSka { chrom_names: Vec, /// Sequence, indexed by chromosome, then position seq: Vec>, - /// Total length of all input sequence - total_size: usize, /// Mapping information @@ -192,7 +192,6 @@ impl RefSka { let mut split_kmer_pos = Vec::new(); let mut seq = Vec::new(); let mut chrom_names = Vec::new(); - let mut total_size = 0; let mut reader = parse_fastx_file(filename).unwrap_or_else(|_| panic!("Invalid path/file: {filename}",)); @@ -204,7 +203,6 @@ impl RefSka { } chrom_names.push(str::from_utf8(seqrec.id()).unwrap().to_owned()); split_kmer_pos.reserve(seqrec.num_bases()); - total_size += seqrec.num_bases(); let kmer_opt = SplitKmer::new(seqrec.seq(), seqrec.num_bases(), None, k, rc, 0); if let Some(mut kmer_it) = kmer_opt { @@ -241,7 +239,6 @@ impl RefSka { Self { k, seq, - total_size, chrom_names, split_kmer_pos, mapped_pos, @@ -441,23 +438,37 @@ impl RefSka { /// /// Will panic if output length not the same as reference, to safeguard against /// terror in the implementation - pub fn write_aln(&self, f: &mut W) -> Result<(), needletail::errors::ParseError> { + pub fn write_aln( + &self, + f: &mut W, + threads: usize, + ) -> Result<(), needletail::errors::ParseError> { if !self.is_mapped() { panic!("No split k-mers mapped to reference"); } if self.chrom_names.len() > 1 { eprintln!("WARNING: Reference contained multiple contigs, in the output they will be concatenated"); } - for (sample_idx, sample_name) in self.mapped_names.iter().enumerate() { - let sample_vars = self.mapped_variants.slice(s![.., sample_idx]); - let mut seq = AlnWriter::new(&self.seq, self.total_size, self.k); - for ((mapped_chrom, mapped_pos), base) in self.mapped_pos.iter().zip(sample_vars.iter()) { - seq.write_split_kmer(*mapped_pos, *mapped_chrom, *base); + + let mut seqs_out = vec![AlnWriter::new(&self.seq, self.k); self.mapped_names.len()]; + rayon::ThreadPoolBuilder::new() + .num_threads(threads) + .build_global() + .unwrap(); + seqs_out.par_iter_mut().enumerate().for_each(|(idx, seq)| { + let sample_vars = self.mapped_variants.slice(s![.., idx]); + for ((mapped_chrom, mapped_pos), base) in self.mapped_pos.iter().zip(sample_vars.iter()) + { + if *base != b'-' { + seq.write_split_kmer(*mapped_pos, *mapped_chrom, *base); + } } - let aligned_seq = seq.get_seq().expect("Internal map error"); + }); + + for (sample_name, mut aligned_seq) in self.mapped_names.iter().zip(seqs_out) { write_fasta( sample_name.as_bytes(), - aligned_seq, + aligned_seq.get_seq(), f, needletail::parser::LineEnding::Unix, )?; @@ -465,4 +476,3 @@ impl RefSka { Ok(()) } } - diff --git a/src/ska_ref/aln_writer.rs b/src/ska_ref/aln_writer.rs index c6e7fde..c700f69 100644 --- a/src/ska_ref/aln_writer.rs +++ b/src/ska_ref/aln_writer.rs @@ -1,128 +1,98 @@ - -use std::{error::Error, fmt}; - +#[derive(Clone)] pub struct AlnWriter<'a> { next_pos: usize, curr_chrom: usize, last_mapped: usize, last_written: usize, + chrom_offset: usize, ref_seq: &'a Vec>, - total_size: usize, seq_out: Vec, - half_split_len: usize + half_split_len: usize, } -// TODO two things to try (and time) -// 1) Copy the whole seq then only edit the middle base and missing bases -// 2) Parallelise this, and write out at the end impl<'a> AlnWriter<'a> { - pub fn new(ref_seq: &'a Vec>, total_size: usize, k: usize) -> Self { - let (curr_chrom, last_mapped, last_written) = (0, 0, 0); + pub fn new(ref_seq: &'a Vec>, k: usize) -> Self { + let (curr_chrom, last_mapped, last_written, chrom_offset) = (0, 0, 0, 0); + let total_size = ref_seq.iter().map(|x| x.len()).sum(); let seq_out = vec![b'-'; total_size]; let half_split_len = (k - 1) / 2; let next_pos = half_split_len; - Self {next_pos, curr_chrom, last_mapped, last_written, ref_seq, total_size, seq_out, half_split_len} + Self { + next_pos, + curr_chrom, + last_mapped, + last_written, + chrom_offset, + ref_seq, + seq_out, + half_split_len, + } } // Fill to end of chromosome. Also called at the end - // AC[GGC]----[AT | C`] + // AC[GGC]----[AT | CC] // 1 2 3 // 1: Last written -> last map-pos + half-k // 2: Last mapped + half-k // 3: Mapped position - fn fill_to(&mut self, end_position: usize, contig_end: bool) { + fn fill_bases(&mut self) { // bases at the end of last valid match not yet written if self.last_written > 0 { - let last_match_overhang = (self.last_mapped + self.half_split_len).saturating_sub(self.last_written); + let last_match_overhang = + (self.last_mapped + self.half_split_len).saturating_sub(self.last_written); if last_match_overhang > 0 { - self.seq_out.extend_from_slice( - &self.ref_seq[self.curr_chrom][(self.last_written + 1)..(self.last_written + last_match_overhang)] - ); + let start = self.last_written; + let end = self.last_written + last_match_overhang; + self.seq_out[(start + self.chrom_offset)..=(end + self.chrom_offset)] + .copy_from_slice(&self.ref_seq[self.curr_chrom][start..=end]); } } - println!("{}", String::from_utf8(self.seq_out.clone()).unwrap()); - // 2-start of split k-mer: missing bases - let mut missing_bases = end_position.saturating_sub(self.last_mapped + self.half_split_len); - // This accounts for the fact that at the start of the sequence no bases have - // been written yet - if !contig_end && self.last_written > 0 { - missing_bases = missing_bases.saturating_sub(self.half_split_len); - } - if missing_bases > 0 { - self.seq_out.extend(vec![b'-'; missing_bases]); - } - println!("{}", String::from_utf8(self.seq_out.clone()).unwrap()); } // Fill up to the end of the contig and update indices fn fill_contig(&mut self) { - self.fill_to(self.ref_seq[self.curr_chrom].len(), true); + self.fill_bases(); + let chrom_length = self.ref_seq[self.curr_chrom].len(); + self.chrom_offset += chrom_length; self.curr_chrom += 1; self.next_pos = self.half_split_len; } pub fn write_split_kmer(&mut self, mapped_pos: usize, mapped_chrom: usize, base: u8) { - println!("{} {}", mapped_pos, base); if mapped_chrom > self.curr_chrom { self.fill_contig(); } - if base == b'-' { - return; - } + if mapped_pos < self.next_pos { self.last_mapped = mapped_pos; - return; - } - - // Write bases between last match and this one - if mapped_pos > self.next_pos { - self.fill_to(mapped_pos, false); - } - - // First half of split k-mer - self.seq_out.extend_from_slice( - &self.ref_seq[self.curr_chrom][(mapped_pos - self.half_split_len)..mapped_pos], - ); - // Middle base - self.seq_out.push(base); - // Second half of split k-mer - self.seq_out.extend_from_slice( - &self.ref_seq[self.curr_chrom][(mapped_pos + 1)..=(mapped_pos + self.half_split_len)], - ); - println!("{}", String::from_utf8(self.seq_out.clone()).unwrap()); - - // update indices - self.next_pos = mapped_pos + 2 * self.half_split_len + 1; - self.last_mapped = mapped_pos; - self.last_written = mapped_pos + self.half_split_len; + } else { + // Write bases between last match and this one + if mapped_pos > self.next_pos { + self.fill_bases(); + } - return; - } + // First half of split k-mer + let start = mapped_pos - self.half_split_len; + let end = mapped_pos; + self.seq_out[(start + self.chrom_offset)..(end + self.chrom_offset)] + .copy_from_slice(&self.ref_seq[self.curr_chrom][start..end]); + // Middle base + self.seq_out[mapped_pos + self.chrom_offset] = base; + // Second half of split k-mer + let start = mapped_pos + 1; + let end = mapped_pos + self.half_split_len; + self.seq_out[(start + self.chrom_offset)..(end + self.chrom_offset)] + .copy_from_slice(&self.ref_seq[self.curr_chrom][start..end]); - pub fn get_seq(&'a mut self) -> Result<&'a[u8], Box> { - println!("{}", String::from_utf8(self.seq_out.clone()).unwrap()); - self.fill_contig(); - println!("{}", String::from_utf8(self.seq_out.clone()).unwrap()); - if self.seq_out.len() != self.total_size { - Result::Err(Box::new(self)) - } else { - Result::Ok(self.seq_out.as_slice()) + // update indices + self.next_pos = mapped_pos + 2 * self.half_split_len; + self.last_mapped = mapped_pos; + self.last_written = mapped_pos + self.half_split_len; } } -} - -impl Error for &mut AlnWriter<'_> {} -impl fmt::Display for AlnWriter<'_> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", String::from_utf8(self.seq_out.clone()).unwrap()) + pub fn get_seq(&'a mut self) -> &'a [u8] { + self.fill_contig(); + self.seq_out.as_slice() } } - -impl fmt::Debug for AlnWriter<'_> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Output length {} not identical to chromosome length {}", - self.seq_out.len(), - self.total_size) - } -} \ No newline at end of file From 8fef2000c47af760cf452522ae138f491993a7de Mon Sep 17 00:00:00 2001 From: John Lees Date: Sun, 5 Feb 2023 23:00:32 +0000 Subject: [PATCH 05/14] Finish new map approach --- src/ska_ref.rs | 9 +-- src/ska_ref/aln_writer.rs | 73 +++++++++++++------ tests/fasta_input.rs | 2 + tests/fastq_input.rs | 4 +- tests/map.rs | 31 +++++--- tests/test_files_in/merge_k9.skf | Bin 0 -> 545 bytes tests/test_files_in/test_1_fwd.fastq.gz | Bin 879 -> 886 bytes tests/test_results_correct/map_N.stdout | 2 +- tests/test_results_correct/map_aln.stdout | 2 +- tests/test_results_correct/map_aln_k9.stdout | 4 + 10 files changed, 84 insertions(+), 43 deletions(-) create mode 100644 tests/test_files_in/merge_k9.skf create mode 100644 tests/test_results_correct/map_aln_k9.stdout diff --git a/src/ska_ref.rs b/src/ska_ref.rs index 898df94..35f0cad 100644 --- a/src/ska_ref.rs +++ b/src/ska_ref.rs @@ -23,7 +23,7 @@ //! // Run mapping, output an alignment to stdout //! ref_kmers.map(&ska_dict); //! let mut out_stream = set_ostream(&None); -//! ref_kmers.write_aln(&mut out_stream); +//! ref_kmers.write_aln(&mut out_stream, threads); //! ``` use std::io::Write; @@ -147,8 +147,7 @@ impl RefSka { /// /// The result can then be used with a writer. So far, only over missing records. /// - /// (I wrote it as an iterator to avoid passing the writer outside of the write function - /// some pretty new rust stuff here! ...lifetimes, move, iterator trait) + /// (I wrote it as an iterator to avoid passing the writer outside of the write function) fn iter_missing_vcf_rows<'a>( &'a self, chrom: usize, @@ -435,9 +434,6 @@ impl RefSka { /// # Panics /// /// If [`RefSka::map()`] has not been run yet, or no split-kmers mapped. - /// - /// Will panic if output length not the same as reference, to safeguard against - /// terror in the implementation pub fn write_aln( &self, f: &mut W, @@ -450,6 +446,7 @@ impl RefSka { eprintln!("WARNING: Reference contained multiple contigs, in the output they will be concatenated"); } + // Do most of the writing in parallel let mut seqs_out = vec![AlnWriter::new(&self.seq, self.k); self.mapped_names.len()]; rayon::ThreadPoolBuilder::new() .num_threads(threads) diff --git a/src/ska_ref/aln_writer.rs b/src/ska_ref/aln_writer.rs index c700f69..54ac225 100644 --- a/src/ska_ref/aln_writer.rs +++ b/src/ska_ref/aln_writer.rs @@ -1,3 +1,19 @@ +//! Generate a linear pseudoalignment. +//! +//! An empty sequence with the same length is set as the sequence +//! Matches are written as the first half, plus the middle base, e.g. for 7-mer +//! ACTGCTA then ACTG would be written. The first half is copied over from the +//! reference. Matches are skipped over until the next `k / 2 + 1` position +//! which is where more sequence can then be written. If bases are missing +//! or at the end of the contig these are caught up by adding the skipped +//! matches and the front half of the k-mer. +//! +//! All sequence is stored in memory. Calling [`AlnWriter::get_seq()`] finalises +//! and writes to the end of the last contig. + +/// Stores indexes which keep track of the writing, the output sequence and a +/// reference to the reference sequences which are written in to flanks of +/// each match. #[derive(Clone)] pub struct AlnWriter<'a> { next_pos: usize, @@ -11,6 +27,8 @@ pub struct AlnWriter<'a> { } impl<'a> AlnWriter<'a> { + /// Create a new [`AlnWriter`] taking the reference sequence mapped against + /// and the k-mer size used for the mapping pub fn new(ref_seq: &'a Vec>, k: usize) -> Self { let (curr_chrom, last_mapped, last_written, chrom_offset) = (0, 0, 0, 0); let total_size = ref_seq.iter().map(|x| x.len()).sum(); @@ -29,46 +47,60 @@ impl<'a> AlnWriter<'a> { } } - // Fill to end of chromosome. Also called at the end - // AC[GGC]----[AT | CC] - // 1 2 3 - // 1: Last written -> last map-pos + half-k - // 2: Last mapped + half-k - // 3: Mapped position - fn fill_bases(&mut self) { + // Fill fwd bases, and any skipped over. + // e.g. with split 7-mers perfectly matching + // CCGA AAGT + // 1 23 + // 1: Last written + // 2: Last mapped + // 3: Current mapped position + // Don't write anything + // CCGA[TT]-----AAGT + // 1 A2 B 3 + // Want to write between + // A = last written + 1 + // B = A + half-k + (last mapped - last written) + // maximum is provided as the base about to be written next, this will not + // write over this position + fn fill_fwd_bases(&mut self, maximum: usize) { // bases at the end of last valid match not yet written if self.last_written > 0 { let last_match_overhang = (self.last_mapped + self.half_split_len).saturating_sub(self.last_written); - if last_match_overhang > 0 { - let start = self.last_written; - let end = self.last_written + last_match_overhang; - self.seq_out[(start + self.chrom_offset)..=(end + self.chrom_offset)] - .copy_from_slice(&self.ref_seq[self.curr_chrom][start..=end]); + let start = self.last_written + 1; + let end = usize::min(start + last_match_overhang, maximum); + if end > start { + self.seq_out[(start + self.chrom_offset)..(end + self.chrom_offset)] + .copy_from_slice(&self.ref_seq[self.curr_chrom][start..end]); + self.last_written = end; } } } // Fill up to the end of the contig and update indices fn fill_contig(&mut self) { - self.fill_bases(); let chrom_length = self.ref_seq[self.curr_chrom].len(); + self.fill_fwd_bases(chrom_length); self.chrom_offset += chrom_length; self.curr_chrom += 1; self.next_pos = self.half_split_len; } + /// Write the first half of a split k-mer (by copying from the reference) + /// and its middle base, also filling any skipped bases as appropiate. + /// + /// Call in a loop over mapped positions, usually excluding '-' base which + /// haven't actually been mapped pub fn write_split_kmer(&mut self, mapped_pos: usize, mapped_chrom: usize, base: u8) { if mapped_chrom > self.curr_chrom { self.fill_contig(); } - if mapped_pos < self.next_pos { self.last_mapped = mapped_pos; } else { // Write bases between last match and this one if mapped_pos > self.next_pos { - self.fill_bases(); + self.fill_fwd_bases(mapped_pos - self.half_split_len); } // First half of split k-mer @@ -78,19 +110,16 @@ impl<'a> AlnWriter<'a> { .copy_from_slice(&self.ref_seq[self.curr_chrom][start..end]); // Middle base self.seq_out[mapped_pos + self.chrom_offset] = base; - // Second half of split k-mer - let start = mapped_pos + 1; - let end = mapped_pos + self.half_split_len; - self.seq_out[(start + self.chrom_offset)..(end + self.chrom_offset)] - .copy_from_slice(&self.ref_seq[self.curr_chrom][start..end]); // update indices - self.next_pos = mapped_pos + 2 * self.half_split_len; + self.next_pos = mapped_pos + self.half_split_len + 1; self.last_mapped = mapped_pos; - self.last_written = mapped_pos + self.half_split_len; + self.last_written = mapped_pos; } } + /// Retrieve the written sequence. Fills to the end of the final contig + /// so should only be called after the last call to [`AlnWriter::write_split_kmer()`]. pub fn get_seq(&'a mut self) -> &'a [u8] { self.fill_contig(); self.seq_out.as_slice() diff --git a/tests/fasta_input.rs b/tests/fasta_input.rs index b274c1d..f2a7bc8 100644 --- a/tests/fasta_input.rs +++ b/tests/fasta_input.rs @@ -36,6 +36,8 @@ fn map_n() { .arg("build") .arg(sandbox.file_string("N_test_1.fa", TestDir::Input)) .arg(sandbox.file_string("N_test_2.fa", TestDir::Input)) + .arg("-k") + .arg("11") .arg("-o") .arg("N_test") .assert() diff --git a/tests/fastq_input.rs b/tests/fastq_input.rs index f4fed97..152e69f 100644 --- a/tests/fastq_input.rs +++ b/tests/fastq_input.rs @@ -97,8 +97,8 @@ fn map_fastq() { .stdout; assert_eq!( - String::from_utf8(fastq_map_out), - String::from_utf8(fasta_map_out) + String::from_utf8(fastq_map_out).unwrap(), + String::from_utf8(fasta_map_out).unwrap() ); let fastq_map_out_vcf = Command::new(cargo_bin("ska")) diff --git a/tests/map.rs b/tests/map.rs index a8f246d..693037b 100644 --- a/tests/map.rs +++ b/tests/map.rs @@ -13,8 +13,8 @@ fn map_aln() { .current_dir(sandbox.get_wd()) .arg("map") .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) - .arg(sandbox.file_string("merge.skf", TestDir::Input)) - .args(&["-v", "--threads", "2"]) + .arg(sandbox.file_string("test_1.fa", TestDir::Input)) + .arg(sandbox.file_string("test_2.fa", TestDir::Input)) .arg("-o") .arg("map.aln") .assert() @@ -26,10 +26,19 @@ fn map_aln() { .current_dir(sandbox.get_wd()) .arg("map") .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) - .arg(sandbox.file_string("test_1.fa", TestDir::Input)) - .arg(sandbox.file_string("test_2.fa", TestDir::Input)) + .arg(sandbox.file_string("merge.skf", TestDir::Input)) + .args(&["-v", "--threads", "2"]) .assert() .stdout_matches_path(sandbox.file_string("map_aln.stdout", TestDir::Correct)); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg(sandbox.file_string("merge_k9.skf", TestDir::Input)) + .args(&["-v", "--threads", "2"]) + .assert() + .stdout_matches_path(sandbox.file_string("map_aln_k9.stdout", TestDir::Correct)); } #[test] @@ -40,11 +49,12 @@ fn map_vcf() { .current_dir(sandbox.get_wd()) .arg("map") .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) - .arg(sandbox.file_string("merge.skf", TestDir::Input)) - .arg("-f") - .arg("vcf") + .arg(sandbox.file_string("test_1.fa", TestDir::Input)) + .arg(sandbox.file_string("test_2.fa", TestDir::Input)) .arg("-o") .arg("map.vcf") + .arg("-f") + .arg("vcf") .assert() .success(); @@ -54,8 +64,7 @@ fn map_vcf() { .current_dir(sandbox.get_wd()) .arg("map") .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) - .arg(sandbox.file_string("test_1.fa", TestDir::Input)) - .arg(sandbox.file_string("test_2.fa", TestDir::Input)) + .arg(sandbox.file_string("merge.skf", TestDir::Input)) .arg("-f") .arg("vcf") .assert() @@ -72,7 +81,7 @@ fn map_rev_comp() { .arg("-o") .arg("rc_build") .arg("-k") - .arg("15") + .arg("9") .arg(sandbox.file_string("test_1.fa", TestDir::Input)) .arg(sandbox.file_string("test_2_rc.fa", TestDir::Input)) .assert() @@ -104,7 +113,7 @@ fn map_rev_comp() { .arg("-o") .arg("ss_map") .arg("-k") - .arg("15") + .arg("9") .arg("--single-strand") .arg(sandbox.file_string("test_1.fa", TestDir::Input)) .arg(sandbox.file_string("test_2_rc.fa", TestDir::Input)) diff --git a/tests/test_files_in/merge_k9.skf b/tests/test_files_in/merge_k9.skf new file mode 100644 index 0000000000000000000000000000000000000000..bf0e22c14fbfcc1075ca4af887344023894efaec GIT binary patch literal 545 zcmXX>O=}ZT6n%G`N#Dps6BA)hNg+&UQHl{FC0M0kNH8O)h|WUYq?3+s2A0hYl|NA{NcVA)O-E*$e&g5nqhm6-3cZkIvS{pObCP^ z6D~t00=M*4tawnSh)g^qe0zbNIiwSu9w9x~f%76ZZ{uPI>%U-}z{sQY1*;+CGZ@D> zzS4gK9}!tFF0u9ljR`^%o0sTn$ZdmM>}B9yp~PXWp%fuMM)nElJ#M&odJdD&WjG5l zouR9P~5}g3#2>nR2=QXlsgB`$a4mP|J^yDX3Ogz>Q$-Lasp=-=@kK6psW`} z#V9PMis-her5^8M-;(3q(i!KLsOhyuHfDMxztJhh%X%%zxkxGvE;34%OuCbbGq%!T zLL--zl?_%%#%v`|8iiyKCrkK-tfJha=c-IO`kdaNlqw5vmeo}Lw*UXQ mdh^405X`EQL=q7t6&?>VB?MDarkm>e-Q1Q?Nm@K1O8)?+!*qrK literal 0 HcmV?d00001 diff --git a/tests/test_files_in/test_1_fwd.fastq.gz b/tests/test_files_in/test_1_fwd.fastq.gz index b10b0edf4f34fb9ad7413437a7172f6ac01e0578..ce92d56e89d5e1960bfabda29933a21f1bb45346 100644 GIT binary patch literal 886 zcmV-+1Bv_}iwFq*^4wzp19W9`bYC%FW_M&RW?^%5aR7~1yN=v24D9z;0^C`m-qjf{ zjO+IY4oJELI3URXhZH55ThP*i<(SLYkfV>kUw^)T|1!lC7t!9$V?Uqg6MyIa`0@B0 zzsE<1+TsBXp#*xKgjS$Xz@s7bM7IaI)x0_aSD)x~KhNV-fW8)Oa#d@@w|IyG2G!?*5fs<-!DbQSuMI$JsUc*(OW-HF3)e`L0@u|5rXy`gq zv^9Tl!qKQgL~&@fN1ZGhHWcU3D27&Vbxv$VzR*mcV^B3Db`v*- z;m$Y58a5Y?OQT`i!qV2&#*=#Wd^WF8TWKp}T7`A*mR6jYz{#S+ErX?NYnCadAstuw z>i4YHINCU_mU-3#*RVm%AFRwDjuNdXs>)b7Y9xvF$W@iuHi10wu!6@GVT*13VX%h8 zG1<`Cw&qdMu*gX)OK5w6@fsEw`?5*yzi{=Vn%`)phV@huAl`J|s?5bO>Mgh;nOy{I z{IvyP>5eCADb%Ao0@ARV*(3JUghSY%dZ)!MoLp;9(!vSWO}OOkz{3(A_zFp0-`^td zXd+*7zMV_&sZcJnjA?_ZVFORj70qmZd_nitf4S#H03(HsryDDlNaO-0M~k?QWgN)g1x zgq4dK-MBk$S@@;L>?XE_TiF)5ih3KXHtEMg!&-5 zjo;(FLw${ahEM{%PC^?{s1VT*dZODWxwX6o0<%wadYtF;G=Q->GJ+oT|A+nesrrIO z(DQXXF^0qPnyZg!od2_-4QQ!=p@JmU(lVfXbsmkNlsboNK;2e@N8=J4?D(|YIy7{> zQ?#5v#cW|cxpo`H@a$1oSj-EX=E(1}Cud;;c7nUm+GQ32+Z75LSD}rTiA@zYH12K> z!NTIeV$n7knpQIlTco;gKmLBat<`v!*&Sa@C)jWFH;(fhZLzReW=u5+-^8uy*9$jC zBy)y>%*XY`Y)U-RFQajfS~wO}h$sP#d(_EdVN(qOjbdo^Hs{1={$OSPaFl38QB}suF(WCoN3N>Owh8P54=Z?F5x)A?9|2oP z9Fq;LZEG173yYk@vZS^b1aDzMu%9-`{TG2h;`~M{Eo`Kc0P&{t)?_Y*!MEUwWOfm- z@z)lFr8{1zrNBpb1Y}`%VUOBV6Aod6_)e=`IJwr5q(xAyn{X-mfrlkL$Q9DOzQ08= z&_sU9`F1X2q(X&VWK0`O3!7weu4rcS6En&rnd3$z6Inz~DXvaLRoqcmW!Xn<)GVd8 z(I5xWU!5Kn*2(=*IZm!n@^nOf>rO*kSj|>gVQUIhLB;M#9aEI+r6=b8hNVPu%gHV1 zqSwM^ItLJXC7*YdEQ)`y!oRf3-B7ikT#vkoT-Mv~UNmPXI!b*pQB&DVeYCo}uTlne zF=6Fm<}mI~Sh+hns=SvT3pwG_qHy2(Ubc{#q`GU*rIxCNEu!hKeKS+`nOUo~#65S* zCfZS{rip?qPU%xv(Wk|;eDbU)dLb<=44t*~pe$Esm)N_test_1 -------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAA----------------------------- +------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTT---AGTGTTTTGATGGT-------------- >N_test_2 ------------GATTTGAAGGCGTTTATGATATATAGCTGTACGAGAGTCTTTTAA----------------------------- diff --git a/tests/test_results_correct/map_aln.stdout b/tests/test_results_correct/map_aln.stdout index a5683ab..c1148e9 100644 --- a/tests/test_results_correct/map_aln.stdout +++ b/tests/test_results_correct/map_aln.stdout @@ -1,4 +1,4 @@ >test_1 ------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTGTTTTGATGGTTT------------ >test_2 -------------GATTTGAAGGCGTTTATGATATATAGCTGTACGAGAGTCTTTTAAAAGTGTTTTGATGGTTT------------ +------------GATTTGAAGGCGTTTATGATATATAGCTGTACGAGAGTCTTTTAAAAGTGTTTT-------------------- diff --git a/tests/test_results_correct/map_aln_k9.stdout b/tests/test_results_correct/map_aln_k9.stdout new file mode 100644 index 0000000..f90ac74 --- /dev/null +++ b/tests/test_results_correct/map_aln_k9.stdout @@ -0,0 +1,4 @@ +>test_1 +------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTSTTTTGATGGTTT------------ +>test_2 +------------GATTTGAAGGCGTTTATGATATATAGCTGTACGAGAGTCTTTTAAAAGTGTTTTGATGGTTT------------ From fd68754f00fb52fab1917ae37ff0ebadd93c9e3a Mon Sep 17 00:00:00 2001 From: John Lees Date: Sun, 5 Feb 2023 23:21:31 +0000 Subject: [PATCH 06/14] Finish the test for map aln --- src/merge_ska_array.rs | 8 +++++--- tests/map.rs | 8 ++++++++ tests/test_results_correct/map_ss.stdout | 4 ++++ 3 files changed, 17 insertions(+), 3 deletions(-) create mode 100644 tests/test_results_correct/map_ss.stdout diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs index 2e371ad..53a7e5a 100644 --- a/src/merge_ska_array.rs +++ b/src/merge_ska_array.rs @@ -36,6 +36,7 @@ use crate::ska_ref::RefSka; /// ``` /// use ska::merge_ska_array::MergeSkaArray; /// use ska::io_utils::set_ostream; +/// use ska::ska_ref::RefSka; /// /// // Load an array from file /// let mut ska_array = MergeSkaArray::load(&"tests/test_files_in/merge.skf").expect("Could not open array"); @@ -53,6 +54,10 @@ use crate::ska_ref::RefSka; /// /// // Delete a sample /// ska_array.delete_samples(&[&"test_1"]); +/// +/// // Delete k-mers +/// let ska_weed = RefSka::new(ska_array.kmer_len(), &"tests/test_files_in/weed.fa", ska_array.rc()); +/// ska_array.weed(&ska_weed); /// ``` #[derive(Serialize, Deserialize)] pub struct MergeSkaArray { @@ -267,9 +272,6 @@ impl MergeSkaArray { /// may be a multi-FASTA) generated with [`RefSka::new()`] /// /// Used with `ska weed`. - /// - /// # Examples - /// #TODO pub fn weed(&mut self, weed_ref: &RefSka) { let weed_kmers: HashSet = HashSet::from_iter(weed_ref.kmer_iter()); diff --git a/tests/map.rs b/tests/map.rs index 693037b..16fa8e6 100644 --- a/tests/map.rs +++ b/tests/map.rs @@ -119,4 +119,12 @@ fn map_rev_comp() { .arg(sandbox.file_string("test_2_rc.fa", TestDir::Input)) .assert() .success(); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg("ss_map.skf") + .assert() + .stdout_matches_path(sandbox.file_string("map_ss.stdout", TestDir::Correct)); } diff --git a/tests/test_results_correct/map_ss.stdout b/tests/test_results_correct/map_ss.stdout new file mode 100644 index 0000000..f512e77 --- /dev/null +++ b/tests/test_results_correct/map_ss.stdout @@ -0,0 +1,4 @@ +>test_1 +------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTSTTTTGATGGTTT------------ +>test_2_rc +-------------------------------------------------------------------------------------- From f76af3c4e0745ab657fdf12ca5153ca8c3e972c3 Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 6 Feb 2023 11:31:21 +0000 Subject: [PATCH 07/14] Add tests for multiple chromosome references --- src/ska_dict/bit_encoding.rs | 4 +++ tests/map.rs | 19 ++++++++++- tests/test_files_in/test_ref_two_chrom.fa | 4 +++ .../map_aln_two_chrom.stdout | 4 +++ .../map_vcf_two_chrom.stdout | 32 +++++++++++++++++++ 5 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 tests/test_files_in/test_ref_two_chrom.fa create mode 100644 tests/test_results_correct/map_aln_two_chrom.stdout create mode 100644 tests/test_results_correct/map_vcf_two_chrom.stdout diff --git a/src/ska_dict/bit_encoding.rs b/src/ska_dict/bit_encoding.rs index d1aae98..e6811a6 100644 --- a/src/ska_dict/bit_encoding.rs +++ b/src/ska_dict/bit_encoding.rs @@ -89,13 +89,17 @@ pub fn decode_kmer(k: usize, kmer: u64, upper_mask: u64, lower_mask: u64) -> (St /// e.g. on construction or after skipping an N. #[inline(always)] pub fn revcomp64_v2(mut res: u64, k_size: usize) -> u64 { + // This part reverses the bases by shuffling them using an on/off pattern + // of bits res = (res >> 2 & 0x3333333333333333) | (res & 0x3333333333333333) << 2; res = (res >> 4 & 0x0F0F0F0F0F0F0F0F) | (res & 0x0F0F0F0F0F0F0F0F) << 4; res = (res >> 8 & 0x00FF00FF00FF00FF) | (res & 0x00FF00FF00FF00FF) << 8; res = (res >> 16 & 0x0000FFFF0000FFFF) | (res & 0x0000FFFF0000FFFF) << 16; res = (res >> 32 & 0x00000000FFFFFFFF) | (res & 0x00000000FFFFFFFF) << 32; + // This reverse complements res ^= 0xAAAAAAAAAAAAAAAA; + // Shifts so LSB is at the bottom res >> (2 * (32 - k_size)) } diff --git a/tests/map.rs b/tests/map.rs index 16fa8e6..1a69463 100644 --- a/tests/map.rs +++ b/tests/map.rs @@ -36,9 +36,16 @@ fn map_aln() { .arg("map") .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) .arg(sandbox.file_string("merge_k9.skf", TestDir::Input)) - .args(&["-v", "--threads", "2"]) .assert() .stdout_matches_path(sandbox.file_string("map_aln_k9.stdout", TestDir::Correct)); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref_two_chrom.fa", TestDir::Input)) + .arg(sandbox.file_string("merge.skf", TestDir::Input)) + .assert() + .stdout_matches_path(sandbox.file_string("map_aln_two_chrom.stdout", TestDir::Correct)); } #[test] @@ -69,6 +76,16 @@ fn map_vcf() { .arg("vcf") .assert() .stdout_matches_path(sandbox.file_string("map_vcf.stdout", TestDir::Correct)); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref_two_chrom.fa", TestDir::Input)) + .arg(sandbox.file_string("merge.skf", TestDir::Input)) + .arg("-f") + .arg("vcf") + .assert() + .stdout_matches_path(sandbox.file_string("map_vcf_two_chrom.stdout", TestDir::Correct)); } #[test] diff --git a/tests/test_files_in/test_ref_two_chrom.fa b/tests/test_files_in/test_ref_two_chrom.fa new file mode 100644 index 0000000..c04dcde --- /dev/null +++ b/tests/test_files_in/test_ref_two_chrom.fa @@ -0,0 +1,4 @@ +>fake_ref_chrom1 +ACGTACGTACGTGATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTGTTTTGATGGTTTACGTACGTACGT +>fake_ref_chrom2 +GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTGTTTTGATGGTTT diff --git a/tests/test_results_correct/map_aln_two_chrom.stdout b/tests/test_results_correct/map_aln_two_chrom.stdout new file mode 100644 index 0000000..a1161c2 --- /dev/null +++ b/tests/test_results_correct/map_aln_two_chrom.stdout @@ -0,0 +1,4 @@ +>test_1 +------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTGTTTTGATGGTTT------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTGTTTTGATGGTTT +>test_2 +------------GATTTGAAGGCGTTTATGATATATAGCTGTACGAGAGTCTTTTAAAAGTGTTTT--------------------GATTTGAAGGCGTTTATGATATATAGCTGTACGAGAGTCTTTTAAAAGTGTTTT-------- diff --git a/tests/test_results_correct/map_vcf_two_chrom.stdout b/tests/test_results_correct/map_vcf_two_chrom.stdout new file mode 100644 index 0000000..fd20743 --- /dev/null +++ b/tests/test_results_correct/map_vcf_two_chrom.stdout @@ -0,0 +1,32 @@ +##fileformat=VCFv4.3 +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test_1 test_2 +fake_ref_chrom1 1 . A . . . . GT . . +fake_ref_chrom1 2 . C . . . . GT . . +fake_ref_chrom1 3 . G . . . . GT . . +fake_ref_chrom1 4 . T . . . . GT . . +fake_ref_chrom1 5 . A . . . . GT . . +fake_ref_chrom1 6 . C . . . . GT . . +fake_ref_chrom1 7 . G . . . . GT . . +fake_ref_chrom1 8 . T . . . . GT . . +fake_ref_chrom1 9 . A . . . . GT . . +fake_ref_chrom1 10 . C . . . . GT . . +fake_ref_chrom1 11 . G . . . . GT . . +fake_ref_chrom1 12 . T . . . . GT . . +fake_ref_chrom1 35 . T A . . . GT 0 1 +fake_ref_chrom1 58 . G A . . . GT 0 1 +fake_ref_chrom1 75 . A . . . . GT . . +fake_ref_chrom1 76 . C . . . . GT . . +fake_ref_chrom1 77 . G . . . . GT . . +fake_ref_chrom1 78 . T . . . . GT . . +fake_ref_chrom1 79 . A . . . . GT . . +fake_ref_chrom1 80 . C . . . . GT . . +fake_ref_chrom1 81 . G . . . . GT . . +fake_ref_chrom1 82 . T . . . . GT . . +fake_ref_chrom1 83 . A . . . . GT . . +fake_ref_chrom1 84 . C . . . . GT . . +fake_ref_chrom1 85 . G . . . . GT . . +fake_ref_chrom1 86 . T . . . . GT . . +fake_ref_chrom2 23 . T A . . . GT 0 1 +fake_ref_chrom2 46 . G A . . . GT 0 1 From d525d3dd0ae41a72fd9faa73b335d0ba123562bb Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 6 Feb 2023 13:17:12 +0000 Subject: [PATCH 08/14] Refactor the alnwriter in preparation for new vcf writer --- src/ska_ref.rs | 53 +++++++++++++++++++++------------------ src/ska_ref/aln_writer.rs | 18 ++++++++++--- 2 files changed, 44 insertions(+), 27 deletions(-) diff --git a/src/ska_ref.rs b/src/ska_ref.rs index 35f0cad..62b744c 100644 --- a/src/ska_ref.rs +++ b/src/ska_ref.rs @@ -292,6 +292,30 @@ impl RefSka { self.split_kmer_pos.iter().map(|k| k.kmer) } + fn gen_seqs(&self, threads: usize) -> Vec { + if !self.is_mapped() { + panic!("No split k-mers mapped to reference"); + } + + // Do most of the writing in parallel + let mut seq_writers = vec![AlnWriter::new(&self.seq, self.k); self.mapped_names.len()]; + rayon::ThreadPoolBuilder::new() + .num_threads(threads) + .build_global() + .unwrap(); + seq_writers.par_iter_mut().enumerate().for_each(|(idx, seq)| { + let sample_vars = self.mapped_variants.slice(s![.., idx]); + for ((mapped_chrom, mapped_pos), base) in self.mapped_pos.iter().zip(sample_vars.iter()) + { + if *base != b'-' { + seq.write_split_kmer(*mapped_pos, *mapped_chrom, *base); + } + } + seq.finalise(); + }); + seq_writers + } + /// Write mapped variants as a VCF, using [`noodles_vcf`]. /// /// Rows are variants, columns are samples, so this is broadly equivalent @@ -373,17 +397,16 @@ impl RefSka { for mapped_base in bases { let mut gt = String::from("0"); if *mapped_base != ref_base { - if *mapped_base == b'-' { - gt = ".".to_string(); - } else { + let alt_base = u8_to_base(*mapped_base); if !alt_bases.contains(&alt_base) { alt_bases.push(alt_base); } gt = (alt_bases.iter().position(|&r| r == alt_base).unwrap() + 1) .to_string(); - } + } + println!("{} {} {} {}", *map_pos, *mapped_base as char, gt, alt_bases.len()); let field = Field::new(Key::Genotype, Some(Value::String(gt))); genotype_vec .push(Genotype::try_from(vec![field]).expect("Could not construct genotypes")); @@ -439,30 +462,12 @@ impl RefSka { f: &mut W, threads: usize, ) -> Result<(), needletail::errors::ParseError> { - if !self.is_mapped() { - panic!("No split k-mers mapped to reference"); - } if self.chrom_names.len() > 1 { eprintln!("WARNING: Reference contained multiple contigs, in the output they will be concatenated"); } - // Do most of the writing in parallel - let mut seqs_out = vec![AlnWriter::new(&self.seq, self.k); self.mapped_names.len()]; - rayon::ThreadPoolBuilder::new() - .num_threads(threads) - .build_global() - .unwrap(); - seqs_out.par_iter_mut().enumerate().for_each(|(idx, seq)| { - let sample_vars = self.mapped_variants.slice(s![.., idx]); - for ((mapped_chrom, mapped_pos), base) in self.mapped_pos.iter().zip(sample_vars.iter()) - { - if *base != b'-' { - seq.write_split_kmer(*mapped_pos, *mapped_chrom, *base); - } - } - }); - - for (sample_name, mut aligned_seq) in self.mapped_names.iter().zip(seqs_out) { + let alignments = self.gen_seqs(threads); + for (sample_name, mut aligned_seq) in self.mapped_names.iter().zip(alignments) { write_fasta( sample_name.as_bytes(), aligned_seq.get_seq(), diff --git a/src/ska_ref/aln_writer.rs b/src/ska_ref/aln_writer.rs index 54ac225..171c39f 100644 --- a/src/ska_ref/aln_writer.rs +++ b/src/ska_ref/aln_writer.rs @@ -24,6 +24,7 @@ pub struct AlnWriter<'a> { ref_seq: &'a Vec>, seq_out: Vec, half_split_len: usize, + finalised: bool } impl<'a> AlnWriter<'a> { @@ -31,6 +32,7 @@ impl<'a> AlnWriter<'a> { /// and the k-mer size used for the mapping pub fn new(ref_seq: &'a Vec>, k: usize) -> Self { let (curr_chrom, last_mapped, last_written, chrom_offset) = (0, 0, 0, 0); + let finalised = false; let total_size = ref_seq.iter().map(|x| x.len()).sum(); let seq_out = vec![b'-'; total_size]; let half_split_len = (k - 1) / 2; @@ -44,6 +46,7 @@ impl<'a> AlnWriter<'a> { ref_seq, seq_out, half_split_len, + finalised } } @@ -118,10 +121,19 @@ impl<'a> AlnWriter<'a> { } } - /// Retrieve the written sequence. Fills to the end of the final contig - /// so should only be called after the last call to [`AlnWriter::write_split_kmer()`]. - pub fn get_seq(&'a mut self) -> &'a [u8] { + /// Fills to the end of the final contig + /// Should only be called after the last call to [`AlnWriter::write_split_kmer()`]. + pub fn finalise(&mut self) { self.fill_contig(); + self.finalised = true; + } + + /// Retrieve the written sequence. Calls [`AlnWriter::finalise()`] if not already called. + pub fn get_seq(&'a mut self) -> &'a [u8] { + if !self.finalised { + self.fill_contig(); + self.finalised = true; + } self.seq_out.as_slice() } } From d0727f1ba84da87a2fb1ffb235638c9fca25c10f Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 7 Feb 2023 08:58:20 +0000 Subject: [PATCH 09/14] Use MSB in countmin --- src/ska_dict/count_min_filter.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/ska_dict/count_min_filter.rs b/src/ska_dict/count_min_filter.rs index 6242d25..59e9fb1 100644 --- a/src/ska_dict/count_min_filter.rs +++ b/src/ska_dict/count_min_filter.rs @@ -37,6 +37,8 @@ use ahash::RandomState; pub struct CountMin { /// Table width (estimated number of unique k-mers) width: usize, + /// Number of bits to shift masked value to get table entry + width_shift: u32, /// Table height (number of hashes) height: usize, /// Hash generators @@ -60,8 +62,9 @@ impl CountMin { // Consistent with consts used, but ensures a power of two let width_bits: usize = f64::floor(f64::log2(width as f64)) as usize; let width = 1 << (width_bits + 1); - // TODO change this to use MSB rather than LSB - let mask = width as u64 - 1; + // Use MSB rather than LSB + let width_shift = u64::BITS - width_bits as u32; + let mask = (width as u64 - 1) << width_shift; // Reserve for these gets call by the vec! macro used in init let hash_factory = Vec::new(); @@ -69,6 +72,7 @@ impl CountMin { Self { width, + width_shift, height, hash_factory, mask, @@ -103,7 +107,8 @@ impl CountMin { let mut count = 0; for hash_it in self.hash_factory.iter().enumerate() { let (row_idx, hash) = hash_it; - let table_idx = row_idx * self.width + ((hash.hash_one(kmer) & self.mask) as usize); + let table_idx = row_idx * self.width + + (((hash.hash_one(kmer) & self.mask) >> self.width_shift) as usize); self.counts[table_idx] = self.counts[table_idx].saturating_add(1); if row_idx == 0 { count = self.counts[table_idx]; From 572add72fcb103fea5853668207f27dd79f24f20 Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 7 Feb 2023 08:58:45 +0000 Subject: [PATCH 10/14] Rework VCF writer --- src/lib.rs | 6 +- src/ska_ref.rs | 254 +++++++----------- src/ska_ref/aln_writer.rs | 8 +- src/ska_ref/idx_check.rs | 35 +++ tests/test_files_in/indel_test.fa | 2 + tests/test_results_correct/map_vcf.stdout | 8 + .../map_vcf_two_chrom.stdout | 16 ++ 7 files changed, 165 insertions(+), 164 deletions(-) create mode 100644 src/ska_ref/idx_check.rs create mode 100644 tests/test_files_in/indel_test.fa diff --git a/src/lib.rs b/src/lib.rs index c5f50d6..990494f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -348,15 +348,15 @@ pub fn main() { let mut out_stream = set_ostream(output); match format { FileType::Aln => { - log::info!("Writing alignment with {} threads", *threads); + log::info!("Generating alignment with {} threads", *threads); ska_ref .write_aln(&mut out_stream, *threads) .expect("Failed to write output alignment"); } FileType::Vcf => { - log::info!("Writing VCF"); + log::info!("Generating VCF with {} threads", *threads); ska_ref - .write_vcf(&mut out_stream) + .write_vcf(&mut out_stream, *threads) .expect("Failed to write output VCF"); } } diff --git a/src/ska_ref.rs b/src/ska_ref.rs index 62b744c..8d9d8b1 100644 --- a/src/ska_ref.rs +++ b/src/ska_ref.rs @@ -44,7 +44,6 @@ use noodles_vcf::{ reference_bases::Base, AlternateBases, Genotypes, Position, }, - Record, }; extern crate needletail; @@ -56,6 +55,8 @@ use needletail::{ pub mod aln_writer; use crate::ska_ref::aln_writer::AlnWriter; +pub mod idx_check; +use crate::ska_ref::idx_check::IdxCheck; use crate::merge_ska_dict::MergeSkaDict; use crate::ska_dict::bit_encoding::RC_IUPAC; @@ -125,52 +126,12 @@ fn gt_keys() -> Keys { "GT".parse().expect("Genotype format error") } -/// Genotypes "." when no reference mapped to (of sample length) -/// These can be used as genotypes in the record builder -fn generate_missing_genotypes(n_samples: usize) -> Genotypes { - let missing_field = Field::new(Key::Genotype, Some(Value::String(".".to_string()))); - let missing_genotype_vec = vec![ - Genotype::try_from(vec![missing_field]) - .expect("Could not construct genotypes"); - n_samples - ]; - Genotypes::new(gt_keys(), missing_genotype_vec) -} - impl RefSka { /// Whether [`map`] has been run fn is_mapped(&self) -> bool { self.mapped_variants.nrows() > 0 } - /// An [`Iterator`] between start and end positions which generates records with the passed genotypes - /// - /// The result can then be used with a writer. So far, only over missing records. - /// - /// (I wrote it as an iterator to avoid passing the writer outside of the write function) - fn iter_missing_vcf_rows<'a>( - &'a self, - chrom: usize, - start: usize, - end: usize, - geno: &'a Genotypes, - ) -> impl Iterator + 'a { - (start..end).map(move |missing_pos| { - let ref_allele = u8_to_base(self.seq[chrom][missing_pos]); - vcf::Record::builder() - .set_chromosome( - self.chrom_names[chrom] - .parse() - .expect("Invalid chromosome name"), - ) - .set_position(Position::from(missing_pos + 1)) - .add_reference_base(ref_allele) - .set_genotypes(geno.clone()) - .build() - .expect("Could not construct record") - }) - } - /// Create a list of split k-mers from an input FASTA file. /// /// Input may have multiple sequences (which are treated as chromosomes and @@ -292,50 +253,101 @@ impl RefSka { self.split_kmer_pos.iter().map(|k| k.kmer) } - fn gen_seqs(&self, threads: usize) -> Vec { + // Calls the necessary parts of AlnWriter (in parallel) to produce all the + // pseudoalignments. The calling function either modifies these (VCF) or + // simply writes them out (ALN) + fn pseudoalignment(&self, threads: usize) -> Vec { if !self.is_mapped() { panic!("No split k-mers mapped to reference"); } - // Do most of the writing in parallel let mut seq_writers = vec![AlnWriter::new(&self.seq, self.k); self.mapped_names.len()]; rayon::ThreadPoolBuilder::new() .num_threads(threads) .build_global() .unwrap(); - seq_writers.par_iter_mut().enumerate().for_each(|(idx, seq)| { - let sample_vars = self.mapped_variants.slice(s![.., idx]); - for ((mapped_chrom, mapped_pos), base) in self.mapped_pos.iter().zip(sample_vars.iter()) - { - if *base != b'-' { - seq.write_split_kmer(*mapped_pos, *mapped_chrom, *base); + seq_writers + .par_iter_mut() + .enumerate() + .for_each(|(idx, seq)| { + let sample_vars = self.mapped_variants.slice(s![.., idx]); + for ((mapped_chrom, mapped_pos), base) in + self.mapped_pos.iter().zip(sample_vars.iter()) + { + if *base != b'-' { + seq.write_split_kmer(*mapped_pos, *mapped_chrom, *base); + } } - } - seq.finalise(); - }); + seq.finalise(); + }); seq_writers } - /// Write mapped variants as a VCF, using [`noodles_vcf`]. + /// Write mapped variants as an aln/multi-FASTA file, using [`needletail`]. + /// + /// Rows are samples, columns are variants, so this is broadly equivalent + /// to writing out the transpose of the mapped variant array. + /// + /// Extra work is used to check to write out flanking bases of the split + /// k-mer, and padding with any missing positions. /// - /// Rows are variants, columns are samples, so this is broadly equivalent - /// to writing out the mapped variant array and its metadata. + /// This is done in memory, then samples are written to the output buffer + /// one at a time. /// - /// - Extra work is used to check for unmapped missing bases between split k-mer - /// matches. + /// # Panics + /// + /// If [`RefSka::map()`] has not been run yet, or no split-kmers mapped. + pub fn write_aln( + &self, + f: &mut W, + threads: usize, + ) -> Result<(), needletail::errors::ParseError> { + if self.chrom_names.len() > 1 { + eprintln!("WARNING: Reference contained multiple contigs, in the output they will be concatenated"); + } + + let alignments = self.pseudoalignment(threads); + log::info!("Writing alignment to file"); + for (sample_name, mut aligned_seq) in self.mapped_names.iter().zip(alignments) { + write_fasta( + sample_name.as_bytes(), + aligned_seq.get_seq(), + f, + needletail::parser::LineEnding::Unix, + )?; + } + Ok(()) + } + + /// Write mapped variants as a VCF, using [`noodles_vcf`]. + /// + /// This uses [`RefSka::write_aln()`] and converts the output to a VCF: /// - A basic VCF header is written out first. - /// - Variants are then written to the output buffer one at a time. - /// - Any ambiguous base is currently output as N, for simplicity. + /// - An alignment is created (in memory), which is then converted to a VCF + /// - Any ambiguous base is currently output as N, for simplicity of GT field. /// /// # Panics /// /// If [`RefSka::map()`] has not been run yet, or no split-kmers mapped to /// the reference. - pub fn write_vcf(&self, f: &mut W) -> Result<(), std::io::Error> { + pub fn write_vcf(&self, f: &mut W, threads: usize) -> Result<(), std::io::Error> { if !self.is_mapped() { panic!("No split k-mers mapped to reference"); } + // Get the pseudoalignment and store in array form + let alignments = self.pseudoalignment(threads); + + log::info!("Converting to VCF"); + let mut variants = Array2::zeros((0, alignments[0].total_size())); + for mut seq in alignments { + variants.push_row(ArrayView::from(seq.get_seq())).unwrap(); + } + // Transpose the array + let var_t = variants.t(); + let mut var_t_owned = Array2::zeros(var_t.raw_dim()); + var_t_owned.assign(&var_t); + // Write the VCF header let mut writer = vcf::Writer::new(f); let mut header_builder = vcf::Header::builder(); @@ -352,76 +364,45 @@ impl RefSka { // Write each record (column) let keys = gt_keys(); - let missing_genotypes = generate_missing_genotypes(self.mapped_names.len()); - - // Note that a lot of the logic here is similar to write_aln below, which - // is a bit simpler to follow - let half_split_len = (self.k - 1) / 2; - let (mut next_pos, mut curr_chrom) = (0, 0); - for ((map_chrom, map_pos), bases) in self - .mapped_pos - .iter() - .zip(self.mapped_variants.outer_iter()) - { - // Fill missing bases to the end of the last chromosome - if *map_chrom > curr_chrom { - for record in self.iter_missing_vcf_rows( - curr_chrom, - next_pos + half_split_len, - self.seq[curr_chrom].len(), - &missing_genotypes, - ) { - writer.write_record(&record)?; - } - curr_chrom += 1; - next_pos = 0; - } - // Fill missing bases between k-mer matches - if *map_pos > next_pos { - for record in self.iter_missing_vcf_rows( - *map_chrom, - next_pos, - *map_pos - half_split_len, - &missing_genotypes, - ) { - writer.write_record(&record)?; - } - } - - let ref_base = self.seq[*map_chrom][*map_pos]; + let mut idx_map = IdxCheck::new(&self.seq); + for (idx, sample_variants) in var_t_owned.outer_iter().enumerate() { + let (map_chrom, map_pos) = idx_map.idx_to_coor(idx); + let ref_base = self.seq[map_chrom][map_pos]; let ref_allele = u8_to_base(ref_base); let mut genotype_vec = Vec::new(); - genotype_vec.reserve(bases.len()); + genotype_vec.reserve(var_t_owned.ncols()); let mut alt_bases: Vec = Vec::new(); - for mapped_base in bases { - let mut gt = String::from("0"); - if *mapped_base != ref_base { - - let alt_base = u8_to_base(*mapped_base); - if !alt_bases.contains(&alt_base) { - alt_bases.push(alt_base); - } - gt = (alt_bases.iter().position(|&r| r == alt_base).unwrap() + 1) - .to_string(); - - } - println!("{} {} {} {}", *map_pos, *mapped_base as char, gt, alt_bases.len()); + let mut variant = false; + for mapped_base in sample_variants { + let gt = if *mapped_base == ref_base { + String::from("0") + } else if *mapped_base == b'-' { + variant = true; + String::from(".") + } else { + variant = true; + let alt_base = u8_to_base(*mapped_base); + if !alt_bases.contains(&alt_base) { + alt_bases.push(alt_base); + } + (alt_bases.iter().position(|&r| r == alt_base).unwrap() + 1).to_string() + }; let field = Field::new(Key::Genotype, Some(Value::String(gt))); genotype_vec .push(Genotype::try_from(vec![field]).expect("Could not construct genotypes")); } - if !alt_bases.is_empty() { + if variant { let genotypes = Genotypes::new(keys.clone(), genotype_vec); let alt_alleles: Vec = alt_bases.iter().map(|a| Allele::Bases(vec![*a])).collect(); let record = vcf::Record::builder() .set_chromosome( - self.chrom_names[*map_chrom] + self.chrom_names[map_chrom] .parse() .expect("Invalid chromosome name"), ) - .set_position(Position::from(*map_pos + 1)) + .set_position(Position::from(map_pos + 1)) .add_reference_base(ref_allele) .set_alternate_bases(AlternateBases::from(alt_alleles)) .set_genotypes(genotypes) @@ -429,51 +410,6 @@ impl RefSka { .expect("Could not construct record"); writer.write_record(&record)?; } - next_pos = *map_pos + 1; - } - // Fill any missing bases at the end of final contig - let final_chrom_id = self.chrom_names.len() - 1; - for record in self.iter_missing_vcf_rows( - final_chrom_id, - next_pos + half_split_len, - self.seq[final_chrom_id].len(), - &missing_genotypes, - ) { - writer.write_record(&record)?; - } - Ok(()) - } - - /// Write mapped variants as an aln/multi-FASTA file, using [`needletail`]. - /// - /// Rows are samples, columns are variants, so this is broadly equivalent - /// to writing out the transpose of the mapped variant array. - /// - /// Extra work is used to check to write out flanking bases of the split - /// k-mer, and padding with any missing positions. - /// - /// Samples are written to the output buffer one at a time. - /// - /// # Panics - /// - /// If [`RefSka::map()`] has not been run yet, or no split-kmers mapped. - pub fn write_aln( - &self, - f: &mut W, - threads: usize, - ) -> Result<(), needletail::errors::ParseError> { - if self.chrom_names.len() > 1 { - eprintln!("WARNING: Reference contained multiple contigs, in the output they will be concatenated"); - } - - let alignments = self.gen_seqs(threads); - for (sample_name, mut aligned_seq) in self.mapped_names.iter().zip(alignments) { - write_fasta( - sample_name.as_bytes(), - aligned_seq.get_seq(), - f, - needletail::parser::LineEnding::Unix, - )?; } Ok(()) } diff --git a/src/ska_ref/aln_writer.rs b/src/ska_ref/aln_writer.rs index 171c39f..04ad915 100644 --- a/src/ska_ref/aln_writer.rs +++ b/src/ska_ref/aln_writer.rs @@ -24,7 +24,7 @@ pub struct AlnWriter<'a> { ref_seq: &'a Vec>, seq_out: Vec, half_split_len: usize, - finalised: bool + finalised: bool, } impl<'a> AlnWriter<'a> { @@ -46,10 +46,14 @@ impl<'a> AlnWriter<'a> { ref_seq, seq_out, half_split_len, - finalised + finalised, } } + pub fn total_size(&self) -> usize { + self.seq_out.len() + } + // Fill fwd bases, and any skipped over. // e.g. with split 7-mers perfectly matching // CCGA AAGT diff --git a/src/ska_ref/idx_check.rs b/src/ska_ref/idx_check.rs new file mode 100644 index 0000000..583e6fb --- /dev/null +++ b/src/ska_ref/idx_check.rs @@ -0,0 +1,35 @@ +#[derive(Debug)] +pub struct IdxCheck { + end_coor: Vec, + current_chr: usize, +} + +impl IdxCheck { + pub fn new(ref_seq: &[Vec]) -> Self { + let mut end_coor = Vec::new(); + let current_chr = 0; + + let mut cum_pos = 0; + for chrom in ref_seq { + cum_pos += chrom.len(); + end_coor.push(cum_pos); + } + + Self { + end_coor, + current_chr, + } + } + + /// Assumes idx is incremented, no skips or interesting intervals + pub fn idx_to_coor(&mut self, idx: usize) -> (usize, usize) { + if idx >= self.end_coor[self.current_chr] { + self.current_chr += 1; + } + let mut pos = idx; + if self.current_chr > 0 { + pos -= self.end_coor[self.current_chr - 1]; + } + (self.current_chr, pos) + } +} diff --git a/tests/test_files_in/indel_test.fa b/tests/test_files_in/indel_test.fa new file mode 100644 index 0000000..674407c --- /dev/null +++ b/tests/test_files_in/indel_test.fa @@ -0,0 +1,2 @@ +>sample1 +GATTTGAAGGGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGGGAGTGTTTTGATGGTTT diff --git a/tests/test_results_correct/map_vcf.stdout b/tests/test_results_correct/map_vcf.stdout index ed6736e..5f473e6 100644 --- a/tests/test_results_correct/map_vcf.stdout +++ b/tests/test_results_correct/map_vcf.stdout @@ -15,6 +15,14 @@ fake_ref 11 . G . . . . GT . . fake_ref 12 . T . . . . GT . . fake_ref 35 . T A . . . GT 0 1 fake_ref 58 . G A . . . GT 0 1 +fake_ref 67 . G . . . . GT 0 . +fake_ref 68 . A . . . . GT 0 . +fake_ref 69 . T . . . . GT 0 . +fake_ref 70 . G . . . . GT 0 . +fake_ref 71 . G . . . . GT 0 . +fake_ref 72 . T . . . . GT 0 . +fake_ref 73 . T . . . . GT 0 . +fake_ref 74 . T . . . . GT 0 . fake_ref 75 . A . . . . GT . . fake_ref 76 . C . . . . GT . . fake_ref 77 . G . . . . GT . . diff --git a/tests/test_results_correct/map_vcf_two_chrom.stdout b/tests/test_results_correct/map_vcf_two_chrom.stdout index fd20743..c8e9925 100644 --- a/tests/test_results_correct/map_vcf_two_chrom.stdout +++ b/tests/test_results_correct/map_vcf_two_chrom.stdout @@ -16,6 +16,14 @@ fake_ref_chrom1 11 . G . . . . GT . . fake_ref_chrom1 12 . T . . . . GT . . fake_ref_chrom1 35 . T A . . . GT 0 1 fake_ref_chrom1 58 . G A . . . GT 0 1 +fake_ref_chrom1 67 . G . . . . GT 0 . +fake_ref_chrom1 68 . A . . . . GT 0 . +fake_ref_chrom1 69 . T . . . . GT 0 . +fake_ref_chrom1 70 . G . . . . GT 0 . +fake_ref_chrom1 71 . G . . . . GT 0 . +fake_ref_chrom1 72 . T . . . . GT 0 . +fake_ref_chrom1 73 . T . . . . GT 0 . +fake_ref_chrom1 74 . T . . . . GT 0 . fake_ref_chrom1 75 . A . . . . GT . . fake_ref_chrom1 76 . C . . . . GT . . fake_ref_chrom1 77 . G . . . . GT . . @@ -30,3 +38,11 @@ fake_ref_chrom1 85 . G . . . . GT . . fake_ref_chrom1 86 . T . . . . GT . . fake_ref_chrom2 23 . T A . . . GT 0 1 fake_ref_chrom2 46 . G A . . . GT 0 1 +fake_ref_chrom2 55 . G . . . . GT 0 . +fake_ref_chrom2 56 . A . . . . GT 0 . +fake_ref_chrom2 57 . T . . . . GT 0 . +fake_ref_chrom2 58 . G . . . . GT 0 . +fake_ref_chrom2 59 . G . . . . GT 0 . +fake_ref_chrom2 60 . T . . . . GT 0 . +fake_ref_chrom2 61 . T . . . . GT 0 . +fake_ref_chrom2 62 . T . . . . GT 0 . From be5328a33e58553222f45113cbae43dec247d161 Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 7 Feb 2023 09:26:14 +0000 Subject: [PATCH 11/14] Redo idx lookup as an iterator --- src/ska_ref.rs | 6 ++--- src/ska_ref/idx_check.rs | 51 ++++++++++++++++++++++++++++++---------- 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/src/ska_ref.rs b/src/ska_ref.rs index 8d9d8b1..dd7fd5e 100644 --- a/src/ska_ref.rs +++ b/src/ska_ref.rs @@ -364,9 +364,9 @@ impl RefSka { // Write each record (column) let keys = gt_keys(); - let mut idx_map = IdxCheck::new(&self.seq); - for (idx, sample_variants) in var_t_owned.outer_iter().enumerate() { - let (map_chrom, map_pos) = idx_map.idx_to_coor(idx); + let idx_map = IdxCheck::new(&self.seq); + for (sample_variants, (map_chrom, map_pos)) in var_t_owned.outer_iter().zip(idx_map.iter()) + { let ref_base = self.seq[map_chrom][map_pos]; let ref_allele = u8_to_base(ref_base); diff --git a/src/ska_ref/idx_check.rs b/src/ska_ref/idx_check.rs index 583e6fb..622226b 100644 --- a/src/ska_ref/idx_check.rs +++ b/src/ska_ref/idx_check.rs @@ -1,13 +1,21 @@ +//! Keep track of current chromosome during iteration over a concatenated +//! alignment. +//! +//! Simple counter with state to avoid doing range lookups (e.g. with an +//! interval tree) at every position. Used when converting alignment to VCF. + +/// Builds the contig change indices and keeps track of the current chromosome. +/// +/// NB: not robust to leaps in the passed index, as chromosome is only incremented +/// by one (so works for linear iteration over pos only) #[derive(Debug)] pub struct IdxCheck { end_coor: Vec, - current_chr: usize, } impl IdxCheck { pub fn new(ref_seq: &[Vec]) -> Self { let mut end_coor = Vec::new(); - let current_chr = 0; let mut cum_pos = 0; for chrom in ref_seq { @@ -15,21 +23,40 @@ impl IdxCheck { end_coor.push(cum_pos); } - Self { - end_coor, - current_chr, + Self { end_coor } + } + + pub fn iter(&self) -> IdxCheckIter<'_> { + IdxCheckIter { + end_coor: &self.end_coor, + current_chr: 0, + idx: 0, } } +} + +pub struct IdxCheckIter<'a> { + end_coor: &'a Vec, + current_chr: usize, + idx: usize, +} + +impl<'a> Iterator for IdxCheckIter<'a> { + type Item = (usize, usize); - /// Assumes idx is incremented, no skips or interesting intervals - pub fn idx_to_coor(&mut self, idx: usize) -> (usize, usize) { - if idx >= self.end_coor[self.current_chr] { + fn next(&mut self) -> Option<(usize, usize)> { + if self.idx >= self.end_coor[self.current_chr] { self.current_chr += 1; } - let mut pos = idx; - if self.current_chr > 0 { - pos -= self.end_coor[self.current_chr - 1]; + if self.current_chr < self.end_coor.len() { + let mut pos = self.idx; + if self.current_chr > 0 { + pos -= self.end_coor[self.current_chr - 1]; + } + self.idx += 1; + Some((self.current_chr, pos)) + } else { + None } - (self.current_chr, pos) } } From 46fd1dbcc062ba048a898988c2664bf0b0bacbfb Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 7 Feb 2023 09:29:20 +0000 Subject: [PATCH 12/14] Add docs --- src/ska_ref/aln_writer.rs | 1 + src/ska_ref/idx_check.rs | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ska_ref/aln_writer.rs b/src/ska_ref/aln_writer.rs index 04ad915..8fd1f0c 100644 --- a/src/ska_ref/aln_writer.rs +++ b/src/ska_ref/aln_writer.rs @@ -50,6 +50,7 @@ impl<'a> AlnWriter<'a> { } } + /// Get the total length of the concatenated sequence output pub fn total_size(&self) -> usize { self.seq_out.len() } diff --git a/src/ska_ref/idx_check.rs b/src/ska_ref/idx_check.rs index 622226b..5738ed9 100644 --- a/src/ska_ref/idx_check.rs +++ b/src/ska_ref/idx_check.rs @@ -5,15 +5,13 @@ //! interval tree) at every position. Used when converting alignment to VCF. /// Builds the contig change indices and keeps track of the current chromosome. -/// -/// NB: not robust to leaps in the passed index, as chromosome is only incremented -/// by one (so works for linear iteration over pos only) #[derive(Debug)] pub struct IdxCheck { end_coor: Vec, } impl IdxCheck { + /// Create indicies from Vec of Vec repr of a reference sequence pub fn new(ref_seq: &[Vec]) -> Self { let mut end_coor = Vec::new(); @@ -26,6 +24,7 @@ impl IdxCheck { Self { end_coor } } + /// Iterate over index to return (chromosome, position) tuples pub fn iter(&self) -> IdxCheckIter<'_> { IdxCheckIter { end_coor: &self.end_coor, @@ -35,6 +34,9 @@ impl IdxCheck { } } +/// Carries state which keeps track of chromosome during iteration +/// +/// Iterator as separate class so [`IdxCheck`] not modified during iteration pub struct IdxCheckIter<'a> { end_coor: &'a Vec, current_chr: usize, From fe63e2299c7f4308dedc3a43692964241a8a28ab Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 7 Feb 2023 11:21:19 +0000 Subject: [PATCH 13/14] Add final map tests --- tests/map.rs | 30 +++++++ .../map_aln_indels.stdout | 4 + .../map_vcf_indels.stdout | 38 ++++++++ tests/test_results_correct/map_vcf_ss.stdout | 89 +++++++++++++++++++ 4 files changed, 161 insertions(+) create mode 100644 tests/test_results_correct/map_aln_indels.stdout create mode 100644 tests/test_results_correct/map_vcf_indels.stdout create mode 100644 tests/test_results_correct/map_vcf_ss.stdout diff --git a/tests/map.rs b/tests/map.rs index 1a69463..02b5b63 100644 --- a/tests/map.rs +++ b/tests/map.rs @@ -46,6 +46,15 @@ fn map_aln() { .arg(sandbox.file_string("merge.skf", TestDir::Input)) .assert() .stdout_matches_path(sandbox.file_string("map_aln_two_chrom.stdout", TestDir::Correct)); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg(sandbox.file_string("test_1.fa", TestDir::Input)) + .arg(sandbox.file_string("indel_test.fa", TestDir::Input)) + .assert() + .stdout_matches_path(sandbox.file_string("map_aln_indels.stdout", TestDir::Correct)); } #[test] @@ -86,6 +95,17 @@ fn map_vcf() { .arg("vcf") .assert() .stdout_matches_path(sandbox.file_string("map_vcf_two_chrom.stdout", TestDir::Correct)); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg(sandbox.file_string("test_1.fa", TestDir::Input)) + .arg(sandbox.file_string("indel_test.fa", TestDir::Input)) + .arg("-f") + .arg("vcf") + .assert() + .stdout_matches_path(sandbox.file_string("map_vcf_indels.stdout", TestDir::Correct)); } #[test] @@ -144,4 +164,14 @@ fn map_rev_comp() { .arg("ss_map.skf") .assert() .stdout_matches_path(sandbox.file_string("map_ss.stdout", TestDir::Correct)); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg("ss_map.skf") + .arg("-f") + .arg("vcf") + .assert() + .stdout_matches_path(sandbox.file_string("map_vcf_ss.stdout", TestDir::Correct)); } diff --git a/tests/test_results_correct/map_aln_indels.stdout b/tests/test_results_correct/map_aln_indels.stdout new file mode 100644 index 0000000..fe906b6 --- /dev/null +++ b/tests/test_results_correct/map_aln_indels.stdout @@ -0,0 +1,4 @@ +>test_1 +------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTGTTTTGATGGTTT------------ +>indel_test +-----------------------GTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTGTTTTGATGGTTT------------ diff --git a/tests/test_results_correct/map_vcf_indels.stdout b/tests/test_results_correct/map_vcf_indels.stdout new file mode 100644 index 0000000..3bbb5f9 --- /dev/null +++ b/tests/test_results_correct/map_vcf_indels.stdout @@ -0,0 +1,38 @@ +##fileformat=VCFv4.3 +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test_1 indel_test +fake_ref 1 . A . . . . GT . . +fake_ref 2 . C . . . . GT . . +fake_ref 3 . G . . . . GT . . +fake_ref 4 . T . . . . GT . . +fake_ref 5 . A . . . . GT . . +fake_ref 6 . C . . . . GT . . +fake_ref 7 . G . . . . GT . . +fake_ref 8 . T . . . . GT . . +fake_ref 9 . A . . . . GT . . +fake_ref 10 . C . . . . GT . . +fake_ref 11 . G . . . . GT . . +fake_ref 12 . T . . . . GT . . +fake_ref 13 . G . . . . GT 0 . +fake_ref 14 . A . . . . GT 0 . +fake_ref 15 . T . . . . GT 0 . +fake_ref 16 . T . . . . GT 0 . +fake_ref 17 . T . . . . GT 0 . +fake_ref 18 . G . . . . GT 0 . +fake_ref 19 . A . . . . GT 0 . +fake_ref 20 . A . . . . GT 0 . +fake_ref 21 . G . . . . GT 0 . +fake_ref 22 . G . . . . GT 0 . +fake_ref 23 . C . . . . GT 0 . +fake_ref 75 . A . . . . GT . . +fake_ref 76 . C . . . . GT . . +fake_ref 77 . G . . . . GT . . +fake_ref 78 . T . . . . GT . . +fake_ref 79 . A . . . . GT . . +fake_ref 80 . C . . . . GT . . +fake_ref 81 . G . . . . GT . . +fake_ref 82 . T . . . . GT . . +fake_ref 83 . A . . . . GT . . +fake_ref 84 . C . . . . GT . . +fake_ref 85 . G . . . . GT . . +fake_ref 86 . T . . . . GT . . diff --git a/tests/test_results_correct/map_vcf_ss.stdout b/tests/test_results_correct/map_vcf_ss.stdout new file mode 100644 index 0000000..777ceb8 --- /dev/null +++ b/tests/test_results_correct/map_vcf_ss.stdout @@ -0,0 +1,89 @@ +##fileformat=VCFv4.3 +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test_1 test_2_rc +fake_ref 1 . A . . . . GT . . +fake_ref 2 . C . . . . GT . . +fake_ref 3 . G . . . . GT . . +fake_ref 4 . T . . . . GT . . +fake_ref 5 . A . . . . GT . . +fake_ref 6 . C . . . . GT . . +fake_ref 7 . G . . . . GT . . +fake_ref 8 . T . . . . GT . . +fake_ref 9 . A . . . . GT . . +fake_ref 10 . C . . . . GT . . +fake_ref 11 . G . . . . GT . . +fake_ref 12 . T . . . . GT . . +fake_ref 13 . G . . . . GT 0 . +fake_ref 14 . A . . . . GT 0 . +fake_ref 15 . T . . . . GT 0 . +fake_ref 16 . T . . . . GT 0 . +fake_ref 17 . T . . . . GT 0 . +fake_ref 18 . G . . . . GT 0 . +fake_ref 19 . A . . . . GT 0 . +fake_ref 20 . A . . . . GT 0 . +fake_ref 21 . G . . . . GT 0 . +fake_ref 22 . G . . . . GT 0 . +fake_ref 23 . C . . . . GT 0 . +fake_ref 24 . G . . . . GT 0 . +fake_ref 25 . T . . . . GT 0 . +fake_ref 26 . T . . . . GT 0 . +fake_ref 27 . T . . . . GT 0 . +fake_ref 28 . A . . . . GT 0 . +fake_ref 29 . T . . . . GT 0 . +fake_ref 30 . G . . . . GT 0 . +fake_ref 31 . A . . . . GT 0 . +fake_ref 32 . T . . . . GT 0 . +fake_ref 33 . A . . . . GT 0 . +fake_ref 34 . T . . . . GT 0 . +fake_ref 35 . T . . . . GT 0 . +fake_ref 36 . T . . . . GT 0 . +fake_ref 37 . A . . . . GT 0 . +fake_ref 38 . G . . . . GT 0 . +fake_ref 39 . C . . . . GT 0 . +fake_ref 40 . T . . . . GT 0 . +fake_ref 41 . G . . . . GT 0 . +fake_ref 42 . T . . . . GT 0 . +fake_ref 43 . A . . . . GT 0 . +fake_ref 44 . C . . . . GT 0 . +fake_ref 45 . G . . . . GT 0 . +fake_ref 46 . A . . . . GT 0 . +fake_ref 47 . G . . . . GT 0 . +fake_ref 48 . A . . . . GT 0 . +fake_ref 49 . G . . . . GT 0 . +fake_ref 50 . T . . . . GT 0 . +fake_ref 51 . C . . . . GT 0 . +fake_ref 52 . T . . . . GT 0 . +fake_ref 53 . T . . . . GT 0 . +fake_ref 54 . T . . . . GT 0 . +fake_ref 55 . T . . . . GT 0 . +fake_ref 56 . A . . . . GT 0 . +fake_ref 57 . A . . . . GT 0 . +fake_ref 58 . G . . . . GT 0 . +fake_ref 59 . A . . . . GT 0 . +fake_ref 60 . G . . . . GT 0 . +fake_ref 61 . T . . . . GT 0 . +fake_ref 62 . G N . . . GT 1 . +fake_ref 63 . T . . . . GT 0 . +fake_ref 64 . T . . . . GT 0 . +fake_ref 65 . T . . . . GT 0 . +fake_ref 66 . T . . . . GT 0 . +fake_ref 67 . G . . . . GT 0 . +fake_ref 68 . A . . . . GT 0 . +fake_ref 69 . T . . . . GT 0 . +fake_ref 70 . G . . . . GT 0 . +fake_ref 71 . G . . . . GT 0 . +fake_ref 72 . T . . . . GT 0 . +fake_ref 73 . T . . . . GT 0 . +fake_ref 74 . T . . . . GT 0 . +fake_ref 75 . A . . . . GT . . +fake_ref 76 . C . . . . GT . . +fake_ref 77 . G . . . . GT . . +fake_ref 78 . T . . . . GT . . +fake_ref 79 . A . . . . GT . . +fake_ref 80 . C . . . . GT . . +fake_ref 81 . G . . . . GT . . +fake_ref 82 . T . . . . GT . . +fake_ref 83 . A . . . . GT . . +fake_ref 84 . C . . . . GT . . +fake_ref 85 . G . . . . GT . . +fake_ref 86 . T . . . . GT . . From 1c200e89f7fc2c023b3d61d47b4e3aa4b8772892 Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 7 Feb 2023 11:43:40 +0000 Subject: [PATCH 14/14] Simplify struct initialisation --- src/merge_ska_array.rs | 9 +++---- src/merge_ska_dict.rs | 6 ++--- src/ska_dict.rs | 11 ++++----- src/ska_dict/count_min_filter.rs | 8 ++----- src/ska_dict/split_kmer.rs | 8 +++---- src/ska_ref.rs | 9 +++---- src/ska_ref/aln_writer.rs | 41 +++++++++++++++++++------------- src/ska_ref/idx_check.rs | 4 ++++ 8 files changed, 46 insertions(+), 50 deletions(-) diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs index 53a7e5a..56ea7bf 100644 --- a/src/merge_ska_array.rs +++ b/src/merge_ska_array.rs @@ -102,9 +102,6 @@ impl MergeSkaArray { /// Convert a dynamic [`MergeSkaDict`] to static array representation. pub fn new(dynamic: &MergeSkaDict) -> Self { - let k = dynamic.kmer_len(); - let rc = dynamic.rc(); - let names = dynamic.names().clone(); let mut variants = Array2::zeros((0, dynamic.nsamples())); let mut split_kmers: Vec = Vec::new(); split_kmers.reserve(dynamic.ksize()); @@ -116,9 +113,9 @@ impl MergeSkaArray { } variants.mapv_inplace(|b| u8::max(b, b'-')); // turns zeros to missing Self { - k, - rc, - names, + k: dynamic.kmer_len(), + rc: dynamic.rc(), + names: dynamic.names().clone(), split_kmers, variants, variant_count, diff --git a/src/merge_ska_dict.rs b/src/merge_ska_dict.rs index a8bf6a7..b57055f 100644 --- a/src/merge_ska_dict.rs +++ b/src/merge_ska_dict.rs @@ -37,14 +37,12 @@ impl MergeSkaDict { /// Create an empty merged dictionary, to be used with [`MergeSkaDict::merge()`] /// or [`MergeSkaDict::append()`]. pub fn new(k: usize, n_samples: usize, rc: bool) -> Self { - let names = vec!["".to_string(); n_samples]; - let split_kmers = HashMap::default(); Self { k, rc, n_samples, - names, - split_kmers, + names: vec!["".to_string(); n_samples], + split_kmers: HashMap::default(), } } diff --git a/src/ska_dict.rs b/src/ska_dict.rs index dd35ed9..9054590 100644 --- a/src/ska_dict.rs +++ b/src/ska_dict.rs @@ -140,17 +140,14 @@ impl SkaDict { if !(5..=31).contains(&k) || k % 2 == 0 { panic!("Invalid k-mer length"); } - // Default/empty structs - let name = name.to_string(); - let split_kmers: HashMap = HashMap::default(); - let cm_filter = CountMin::empty(CM_WIDTH, CM_HEIGHT, min_count); + let mut sk_dict = Self { k, rc, sample_idx, - name, - split_kmers, - cm_filter, + name: name.to_string(), + split_kmers: HashMap::default(), + cm_filter: CountMin::empty(CM_WIDTH, CM_HEIGHT, min_count), }; // Check if we're working with reads, and initalise the CM filter if so diff --git a/src/ska_dict/count_min_filter.rs b/src/ska_dict/count_min_filter.rs index 59e9fb1..9334702 100644 --- a/src/ska_dict/count_min_filter.rs +++ b/src/ska_dict/count_min_filter.rs @@ -66,17 +66,13 @@ impl CountMin { let width_shift = u64::BITS - width_bits as u32; let mask = (width as u64 - 1) << width_shift; - // Reserve for these gets call by the vec! macro used in init - let hash_factory = Vec::new(); - let counts = Vec::new(); - Self { width, width_shift, height, - hash_factory, + hash_factory: Vec::new(), mask, - counts, + counts: Vec::new(), min_count, } } diff --git a/src/ska_dict/split_kmer.rs b/src/ska_dict/split_kmer.rs index 9d30172..faf2c22 100644 --- a/src/ska_dict/split_kmer.rs +++ b/src/ska_dict/split_kmer.rs @@ -185,7 +185,7 @@ impl<'a> SplitKmer<'a> { rc: bool, min_qual: u8, ) -> Option { - let (mut index, rc_upper, rc_lower, rc_middle_base) = (0, 0, 0, 0); + let mut index = 0; let first_kmer = Self::build(&seq, seq_len, qual, k, &mut index, min_qual); if let Some((upper, lower, middle_base)) = first_kmer { let (lower_mask, upper_mask) = generate_masks(k); @@ -201,9 +201,9 @@ impl<'a> SplitKmer<'a> { lower, middle_base, rc, - rc_upper, - rc_lower, - rc_middle_base, + rc_upper: 0, + rc_lower: 0, + rc_middle_base: 0, index, }; if rc { diff --git a/src/ska_ref.rs b/src/ska_ref.rs index dd7fd5e..87d0e4b 100644 --- a/src/ska_ref.rs +++ b/src/ska_ref.rs @@ -193,17 +193,14 @@ impl RefSka { panic!("{filename} has no valid sequence"); } - let mapped_variants = Array2::zeros((0, 0)); - let mapped_pos = Vec::new(); - let mapped_names = Vec::new(); Self { k, seq, chrom_names, split_kmer_pos, - mapped_pos, - mapped_variants, - mapped_names, + mapped_pos: Vec::new(), + mapped_variants: Array2::zeros((0, 0)), + mapped_names: Vec::new(), } } diff --git a/src/ska_ref/aln_writer.rs b/src/ska_ref/aln_writer.rs index 8fd1f0c..8e32fa0 100644 --- a/src/ska_ref/aln_writer.rs +++ b/src/ska_ref/aln_writer.rs @@ -16,14 +16,24 @@ /// each match. #[derive(Clone)] pub struct AlnWriter<'a> { + /// Next position where it is valid to write a match. next_pos: usize, + /// The current chromsome. curr_chrom: usize, + /// The latest position where a base has been mapped (may not have been written). last_mapped: usize, + /// The latest position in `seq_out` that has been written to. last_written: usize, + /// An offset which is added to convert chromosome positions to concatenated position. chrom_offset: usize, + /// The reference sequences. ref_seq: &'a Vec>, + /// The output alignment. seq_out: Vec, + /// The size of the flanking region for each split k-mer match. half_split_len: usize, + /// Whether the finalise function has been run, filling to the end of the + /// final contig. finalised: bool, } @@ -31,22 +41,18 @@ impl<'a> AlnWriter<'a> { /// Create a new [`AlnWriter`] taking the reference sequence mapped against /// and the k-mer size used for the mapping pub fn new(ref_seq: &'a Vec>, k: usize) -> Self { - let (curr_chrom, last_mapped, last_written, chrom_offset) = (0, 0, 0, 0); - let finalised = false; let total_size = ref_seq.iter().map(|x| x.len()).sum(); - let seq_out = vec![b'-'; total_size]; let half_split_len = (k - 1) / 2; - let next_pos = half_split_len; Self { - next_pos, - curr_chrom, - last_mapped, - last_written, - chrom_offset, + next_pos: half_split_len, + curr_chrom: 0, + last_mapped: 0, + last_written: 0, + chrom_offset: 0, ref_seq, - seq_out, + seq_out: vec![b'-'; total_size], half_split_len, - finalised, + finalised: false, } } @@ -129,16 +135,17 @@ impl<'a> AlnWriter<'a> { /// Fills to the end of the final contig /// Should only be called after the last call to [`AlnWriter::write_split_kmer()`]. pub fn finalise(&mut self) { - self.fill_contig(); - self.finalised = true; + if !self.finalised { + while self.curr_chrom < self.ref_seq.len() { + self.fill_contig(); + } + self.finalised = true; + } } /// Retrieve the written sequence. Calls [`AlnWriter::finalise()`] if not already called. pub fn get_seq(&'a mut self) -> &'a [u8] { - if !self.finalised { - self.fill_contig(); - self.finalised = true; - } + self.finalise(); self.seq_out.as_slice() } } diff --git a/src/ska_ref/idx_check.rs b/src/ska_ref/idx_check.rs index 5738ed9..b069129 100644 --- a/src/ska_ref/idx_check.rs +++ b/src/ska_ref/idx_check.rs @@ -7,6 +7,7 @@ /// Builds the contig change indices and keeps track of the current chromosome. #[derive(Debug)] pub struct IdxCheck { + /// The end coordinates (last base) of each chromosome end_coor: Vec, } @@ -38,8 +39,11 @@ impl IdxCheck { /// /// Iterator as separate class so [`IdxCheck`] not modified during iteration pub struct IdxCheckIter<'a> { + /// Ref to end coordinates end_coor: &'a Vec, + /// Current chromosome current_chr: usize, + /// Current absolute position idx: usize, }