diff --git a/src/lib.rs b/src/lib.rs index 3c025d3..990494f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -348,15 +348,15 @@ pub fn main() { let mut out_stream = set_ostream(output); match format { FileType::Aln => { - log::info!("Writing alignment"); + log::info!("Generating alignment with {} threads", *threads); ska_ref - .write_aln(&mut out_stream) + .write_aln(&mut out_stream, *threads) .expect("Failed to write output alignment"); } FileType::Vcf => { - log::info!("Writing VCF"); + log::info!("Generating VCF with {} threads", *threads); ska_ref - .write_vcf(&mut out_stream) + .write_vcf(&mut out_stream, *threads) .expect("Failed to write output VCF"); } } diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs index 2e371ad..56ea7bf 100644 --- a/src/merge_ska_array.rs +++ b/src/merge_ska_array.rs @@ -36,6 +36,7 @@ use crate::ska_ref::RefSka; /// ``` /// use ska::merge_ska_array::MergeSkaArray; /// use ska::io_utils::set_ostream; +/// use ska::ska_ref::RefSka; /// /// // Load an array from file /// let mut ska_array = MergeSkaArray::load(&"tests/test_files_in/merge.skf").expect("Could not open array"); @@ -53,6 +54,10 @@ use crate::ska_ref::RefSka; /// /// // Delete a sample /// ska_array.delete_samples(&[&"test_1"]); +/// +/// // Delete k-mers +/// let ska_weed = RefSka::new(ska_array.kmer_len(), &"tests/test_files_in/weed.fa", ska_array.rc()); +/// ska_array.weed(&ska_weed); /// ``` #[derive(Serialize, Deserialize)] pub struct MergeSkaArray { @@ -97,9 +102,6 @@ impl MergeSkaArray { /// Convert a dynamic [`MergeSkaDict`] to static array representation. pub fn new(dynamic: &MergeSkaDict) -> Self { - let k = dynamic.kmer_len(); - let rc = dynamic.rc(); - let names = dynamic.names().clone(); let mut variants = Array2::zeros((0, dynamic.nsamples())); let mut split_kmers: Vec = Vec::new(); split_kmers.reserve(dynamic.ksize()); @@ -111,9 +113,9 @@ impl MergeSkaArray { } variants.mapv_inplace(|b| u8::max(b, b'-')); // turns zeros to missing Self { - k, - rc, - names, + k: dynamic.kmer_len(), + rc: dynamic.rc(), + names: dynamic.names().clone(), split_kmers, variants, variant_count, @@ -267,9 +269,6 @@ impl MergeSkaArray { /// may be a multi-FASTA) generated with [`RefSka::new()`] /// /// Used with `ska weed`. - /// - /// # Examples - /// #TODO pub fn weed(&mut self, weed_ref: &RefSka) { let weed_kmers: HashSet = HashSet::from_iter(weed_ref.kmer_iter()); diff --git a/src/merge_ska_dict.rs b/src/merge_ska_dict.rs index a8bf6a7..b57055f 100644 --- a/src/merge_ska_dict.rs +++ b/src/merge_ska_dict.rs @@ -37,14 +37,12 @@ impl MergeSkaDict { /// Create an empty merged dictionary, to be used with [`MergeSkaDict::merge()`] /// or [`MergeSkaDict::append()`]. pub fn new(k: usize, n_samples: usize, rc: bool) -> Self { - let names = vec!["".to_string(); n_samples]; - let split_kmers = HashMap::default(); Self { k, rc, n_samples, - names, - split_kmers, + names: vec!["".to_string(); n_samples], + split_kmers: HashMap::default(), } } diff --git a/src/ska_dict.rs b/src/ska_dict.rs index dd35ed9..9054590 100644 --- a/src/ska_dict.rs +++ b/src/ska_dict.rs @@ -140,17 +140,14 @@ impl SkaDict { if !(5..=31).contains(&k) || k % 2 == 0 { panic!("Invalid k-mer length"); } - // Default/empty structs - let name = name.to_string(); - let split_kmers: HashMap = HashMap::default(); - let cm_filter = CountMin::empty(CM_WIDTH, CM_HEIGHT, min_count); + let mut sk_dict = Self { k, rc, sample_idx, - name, - split_kmers, - cm_filter, + name: name.to_string(), + split_kmers: HashMap::default(), + cm_filter: CountMin::empty(CM_WIDTH, CM_HEIGHT, min_count), }; // Check if we're working with reads, and initalise the CM filter if so diff --git a/src/ska_dict/bit_encoding.rs b/src/ska_dict/bit_encoding.rs index d1aae98..e6811a6 100644 --- a/src/ska_dict/bit_encoding.rs +++ b/src/ska_dict/bit_encoding.rs @@ -89,13 +89,17 @@ pub fn decode_kmer(k: usize, kmer: u64, upper_mask: u64, lower_mask: u64) -> (St /// e.g. on construction or after skipping an N. #[inline(always)] pub fn revcomp64_v2(mut res: u64, k_size: usize) -> u64 { + // This part reverses the bases by shuffling them using an on/off pattern + // of bits res = (res >> 2 & 0x3333333333333333) | (res & 0x3333333333333333) << 2; res = (res >> 4 & 0x0F0F0F0F0F0F0F0F) | (res & 0x0F0F0F0F0F0F0F0F) << 4; res = (res >> 8 & 0x00FF00FF00FF00FF) | (res & 0x00FF00FF00FF00FF) << 8; res = (res >> 16 & 0x0000FFFF0000FFFF) | (res & 0x0000FFFF0000FFFF) << 16; res = (res >> 32 & 0x00000000FFFFFFFF) | (res & 0x00000000FFFFFFFF) << 32; + // This reverse complements res ^= 0xAAAAAAAAAAAAAAAA; + // Shifts so LSB is at the bottom res >> (2 * (32 - k_size)) } diff --git a/src/ska_dict/count_min_filter.rs b/src/ska_dict/count_min_filter.rs index 9bebfbf..9334702 100644 --- a/src/ska_dict/count_min_filter.rs +++ b/src/ska_dict/count_min_filter.rs @@ -37,6 +37,8 @@ use ahash::RandomState; pub struct CountMin { /// Table width (estimated number of unique k-mers) width: usize, + /// Number of bits to shift masked value to get table entry + width_shift: u32, /// Table height (number of hashes) height: usize, /// Hash generators @@ -60,18 +62,17 @@ impl CountMin { // Consistent with consts used, but ensures a power of two let width_bits: usize = f64::floor(f64::log2(width as f64)) as usize; let width = 1 << (width_bits + 1); - let mask = width as u64 - 1; - - // Reserve for these gets call by the vec! macro used in init - let hash_factory = Vec::new(); - let counts = Vec::new(); + // Use MSB rather than LSB + let width_shift = u64::BITS - width_bits as u32; + let mask = (width as u64 - 1) << width_shift; Self { width, + width_shift, height, - hash_factory, + hash_factory: Vec::new(), mask, - counts, + counts: Vec::new(), min_count, } } @@ -102,7 +103,8 @@ impl CountMin { let mut count = 0; for hash_it in self.hash_factory.iter().enumerate() { let (row_idx, hash) = hash_it; - let table_idx = row_idx * self.width + ((hash.hash_one(kmer) & self.mask) as usize); + let table_idx = row_idx * self.width + + (((hash.hash_one(kmer) & self.mask) >> self.width_shift) as usize); self.counts[table_idx] = self.counts[table_idx].saturating_add(1); if row_idx == 0 { count = self.counts[table_idx]; diff --git a/src/ska_dict/split_kmer.rs b/src/ska_dict/split_kmer.rs index 9d30172..faf2c22 100644 --- a/src/ska_dict/split_kmer.rs +++ b/src/ska_dict/split_kmer.rs @@ -185,7 +185,7 @@ impl<'a> SplitKmer<'a> { rc: bool, min_qual: u8, ) -> Option { - let (mut index, rc_upper, rc_lower, rc_middle_base) = (0, 0, 0, 0); + let mut index = 0; let first_kmer = Self::build(&seq, seq_len, qual, k, &mut index, min_qual); if let Some((upper, lower, middle_base)) = first_kmer { let (lower_mask, upper_mask) = generate_masks(k); @@ -201,9 +201,9 @@ impl<'a> SplitKmer<'a> { lower, middle_base, rc, - rc_upper, - rc_lower, - rc_middle_base, + rc_upper: 0, + rc_lower: 0, + rc_middle_base: 0, index, }; if rc { diff --git a/src/ska_ref.rs b/src/ska_ref.rs index f0cbaa6..87d0e4b 100644 --- a/src/ska_ref.rs +++ b/src/ska_ref.rs @@ -23,11 +23,14 @@ //! // Run mapping, output an alignment to stdout //! ref_kmers.map(&ska_dict); //! let mut out_stream = set_ostream(&None); -//! ref_kmers.write_aln(&mut out_stream); +//! ref_kmers.write_aln(&mut out_stream, threads); //! ``` +use std::io::Write; use std::str; +use rayon::prelude::*; + use noodles_vcf::{ self as vcf, header::format::Key, @@ -41,9 +44,7 @@ use noodles_vcf::{ reference_bases::Base, AlternateBases, Genotypes, Position, }, - Record, }; -use std::io::Write; extern crate needletail; use ndarray::{s, Array2, ArrayView}; @@ -52,6 +53,11 @@ use needletail::{ parser::{write_fasta, Format}, }; +pub mod aln_writer; +use crate::ska_ref::aln_writer::AlnWriter; +pub mod idx_check; +use crate::ska_ref::idx_check::IdxCheck; + use crate::merge_ska_dict::MergeSkaDict; use crate::ska_dict::bit_encoding::RC_IUPAC; use crate::ska_dict::split_kmer::SplitKmer; @@ -90,8 +96,6 @@ pub struct RefSka { chrom_names: Vec, /// Sequence, indexed by chromosome, then position seq: Vec>, - /// Total length of all input sequence - total_size: usize, /// Mapping information @@ -122,53 +126,12 @@ fn gt_keys() -> Keys { "GT".parse().expect("Genotype format error") } -/// Genotypes "." when no reference mapped to (of sample length) -/// These can be used as genotypes in the record builder -fn generate_missing_genotypes(n_samples: usize) -> Genotypes { - let missing_field = Field::new(Key::Genotype, Some(Value::String(".".to_string()))); - let missing_genotype_vec = vec![ - Genotype::try_from(vec![missing_field]) - .expect("Could not construct genotypes"); - n_samples - ]; - Genotypes::new(gt_keys(), missing_genotype_vec) -} - impl RefSka { /// Whether [`map`] has been run fn is_mapped(&self) -> bool { self.mapped_variants.nrows() > 0 } - /// An [`Iterator`] between start and end positions which generates records with the passed genotypes - /// - /// The result can then be used with a writer. So far, only over missing records. - /// - /// (I wrote it as an iterator to avoid passing the writer outside of the write function - /// some pretty new rust stuff here! ...lifetimes, move, iterator trait) - fn iter_missing_vcf_rows<'a>( - &'a self, - chrom: usize, - start: usize, - end: usize, - geno: &'a Genotypes, - ) -> impl Iterator + 'a { - (start..end).map(move |missing_pos| { - let ref_allele = u8_to_base(self.seq[chrom][missing_pos]); - vcf::Record::builder() - .set_chromosome( - self.chrom_names[chrom] - .parse() - .expect("Invalid chromosome name"), - ) - .set_position(Position::from(missing_pos + 1)) - .add_reference_base(ref_allele) - .set_genotypes(geno.clone()) - .build() - .expect("Could not construct record") - }) - } - /// Create a list of split k-mers from an input FASTA file. /// /// Input may have multiple sequences (which are treated as chromosomes and @@ -189,7 +152,6 @@ impl RefSka { let mut split_kmer_pos = Vec::new(); let mut seq = Vec::new(); let mut chrom_names = Vec::new(); - let mut total_size = 0; let mut reader = parse_fastx_file(filename).unwrap_or_else(|_| panic!("Invalid path/file: {filename}",)); @@ -201,7 +163,6 @@ impl RefSka { } chrom_names.push(str::from_utf8(seqrec.id()).unwrap().to_owned()); split_kmer_pos.reserve(seqrec.num_bases()); - total_size += seqrec.num_bases(); let kmer_opt = SplitKmer::new(seqrec.seq(), seqrec.num_bases(), None, k, rc, 0); if let Some(mut kmer_it) = kmer_opt { @@ -232,18 +193,14 @@ impl RefSka { panic!("{filename} has no valid sequence"); } - let mapped_variants = Array2::zeros((0, 0)); - let mapped_pos = Vec::new(); - let mapped_names = Vec::new(); Self { k, seq, - total_size, chrom_names, split_kmer_pos, - mapped_pos, - mapped_variants, - mapped_names, + mapped_pos: Vec::new(), + mapped_variants: Array2::zeros((0, 0)), + mapped_names: Vec::new(), } } @@ -293,26 +250,101 @@ impl RefSka { self.split_kmer_pos.iter().map(|k| k.kmer) } - /// Write mapped variants as a VCF, using [`noodles_vcf`]. + // Calls the necessary parts of AlnWriter (in parallel) to produce all the + // pseudoalignments. The calling function either modifies these (VCF) or + // simply writes them out (ALN) + fn pseudoalignment(&self, threads: usize) -> Vec { + if !self.is_mapped() { + panic!("No split k-mers mapped to reference"); + } + + let mut seq_writers = vec![AlnWriter::new(&self.seq, self.k); self.mapped_names.len()]; + rayon::ThreadPoolBuilder::new() + .num_threads(threads) + .build_global() + .unwrap(); + seq_writers + .par_iter_mut() + .enumerate() + .for_each(|(idx, seq)| { + let sample_vars = self.mapped_variants.slice(s![.., idx]); + for ((mapped_chrom, mapped_pos), base) in + self.mapped_pos.iter().zip(sample_vars.iter()) + { + if *base != b'-' { + seq.write_split_kmer(*mapped_pos, *mapped_chrom, *base); + } + } + seq.finalise(); + }); + seq_writers + } + + /// Write mapped variants as an aln/multi-FASTA file, using [`needletail`]. + /// + /// Rows are samples, columns are variants, so this is broadly equivalent + /// to writing out the transpose of the mapped variant array. /// - /// Rows are variants, columns are samples, so this is broadly equivalent - /// to writing out the mapped variant array and its metadata. + /// Extra work is used to check to write out flanking bases of the split + /// k-mer, and padding with any missing positions. /// - /// Extra work is used to check for unmapped missing bases between split k-mer - /// matches. + /// This is done in memory, then samples are written to the output buffer + /// one at a time. /// - /// A basic VCF header is written out first. + /// # Panics /// - /// Variants are then written to the output buffer one at a time. + /// If [`RefSka::map()`] has not been run yet, or no split-kmers mapped. + pub fn write_aln( + &self, + f: &mut W, + threads: usize, + ) -> Result<(), needletail::errors::ParseError> { + if self.chrom_names.len() > 1 { + eprintln!("WARNING: Reference contained multiple contigs, in the output they will be concatenated"); + } + + let alignments = self.pseudoalignment(threads); + log::info!("Writing alignment to file"); + for (sample_name, mut aligned_seq) in self.mapped_names.iter().zip(alignments) { + write_fasta( + sample_name.as_bytes(), + aligned_seq.get_seq(), + f, + needletail::parser::LineEnding::Unix, + )?; + } + Ok(()) + } + + /// Write mapped variants as a VCF, using [`noodles_vcf`]. + /// + /// This uses [`RefSka::write_aln()`] and converts the output to a VCF: + /// - A basic VCF header is written out first. + /// - An alignment is created (in memory), which is then converted to a VCF + /// - Any ambiguous base is currently output as N, for simplicity of GT field. /// /// # Panics /// - /// If [`RefSka::map()`] has not been run yet, or no split-kmers mapped - pub fn write_vcf(&self, f: &mut W) -> Result<(), std::io::Error> { + /// If [`RefSka::map()`] has not been run yet, or no split-kmers mapped to + /// the reference. + pub fn write_vcf(&self, f: &mut W, threads: usize) -> Result<(), std::io::Error> { if !self.is_mapped() { panic!("No split k-mers mapped to reference"); } + // Get the pseudoalignment and store in array form + let alignments = self.pseudoalignment(threads); + + log::info!("Converting to VCF"); + let mut variants = Array2::zeros((0, alignments[0].total_size())); + for mut seq in alignments { + variants.push_row(ArrayView::from(seq.get_seq())).unwrap(); + } + // Transpose the array + let var_t = variants.t(); + let mut var_t_owned = Array2::zeros(var_t.raw_dim()); + var_t_owned.assign(&var_t); + // Write the VCF header let mut writer = vcf::Writer::new(f); let mut header_builder = vcf::Header::builder(); @@ -329,77 +361,45 @@ impl RefSka { // Write each record (column) let keys = gt_keys(); - let missing_genotypes = generate_missing_genotypes(self.mapped_names.len()); - - // Note that a lot of the logic here is similar to write_aln below, which - // is a bit simpler to follow - let half_split_len = (self.k - 1) / 2; - let (mut next_pos, mut curr_chrom) = (0, 0); - for ((map_chrom, map_pos), bases) in self - .mapped_pos - .iter() - .zip(self.mapped_variants.outer_iter()) + let idx_map = IdxCheck::new(&self.seq); + for (sample_variants, (map_chrom, map_pos)) in var_t_owned.outer_iter().zip(idx_map.iter()) { - // Fill missing bases to the end of the last chromosome - if *map_chrom > curr_chrom { - for record in self.iter_missing_vcf_rows( - curr_chrom, - next_pos + half_split_len, - self.seq[curr_chrom].len(), - &missing_genotypes, - ) { - writer.write_record(&record)?; - } - curr_chrom += 1; - next_pos = 0; - } - // Fill missing bases between k-mer matches - if *map_pos > next_pos { - for record in self.iter_missing_vcf_rows( - *map_chrom, - next_pos, - *map_pos - half_split_len, - &missing_genotypes, - ) { - writer.write_record(&record)?; - } - } - - let ref_base = self.seq[*map_chrom][*map_pos]; + let ref_base = self.seq[map_chrom][map_pos]; let ref_allele = u8_to_base(ref_base); let mut genotype_vec = Vec::new(); - genotype_vec.reserve(bases.len()); + genotype_vec.reserve(var_t_owned.ncols()); let mut alt_bases: Vec = Vec::new(); - for mapped_base in bases { - let mut gt = String::from("0"); - if *mapped_base != ref_base { - if *mapped_base == b'-' { - gt = ".".to_string(); - } else { - let alt_base = u8_to_base(*mapped_base); - if !alt_bases.contains(&alt_base) { - alt_bases.push(alt_base); - } - gt = (alt_bases.iter().position(|&r| r == alt_base).unwrap() + 1) - .to_string(); + let mut variant = false; + for mapped_base in sample_variants { + let gt = if *mapped_base == ref_base { + String::from("0") + } else if *mapped_base == b'-' { + variant = true; + String::from(".") + } else { + variant = true; + let alt_base = u8_to_base(*mapped_base); + if !alt_bases.contains(&alt_base) { + alt_bases.push(alt_base); } - } + (alt_bases.iter().position(|&r| r == alt_base).unwrap() + 1).to_string() + }; let field = Field::new(Key::Genotype, Some(Value::String(gt))); genotype_vec .push(Genotype::try_from(vec![field]).expect("Could not construct genotypes")); } - if !alt_bases.is_empty() { + if variant { let genotypes = Genotypes::new(keys.clone(), genotype_vec); let alt_alleles: Vec = alt_bases.iter().map(|a| Allele::Bases(vec![*a])).collect(); let record = vcf::Record::builder() .set_chromosome( - self.chrom_names[*map_chrom] + self.chrom_names[map_chrom] .parse() .expect("Invalid chromosome name"), ) - .set_position(Position::from(*map_pos + 1)) + .set_position(Position::from(map_pos + 1)) .add_reference_base(ref_allele) .set_alternate_bases(AlternateBases::from(alt_alleles)) .set_genotypes(genotypes) @@ -407,105 +407,6 @@ impl RefSka { .expect("Could not construct record"); writer.write_record(&record)?; } - next_pos = *map_pos + 1; - } - // Fill any missing bases at the end of final contig - let final_chrom_id = self.chrom_names.len() - 1; - for record in self.iter_missing_vcf_rows( - final_chrom_id, - next_pos + half_split_len, - self.seq[final_chrom_id].len(), - &missing_genotypes, - ) { - writer.write_record(&record)?; - } - Ok(()) - } - - /// Write mapped variants as an aln/multi-FASTA file, using [`needletail`]. - /// - /// Rows are samples, columns are variants, so this is broadly equivalent - /// to writing out the transpose of the mapped variant array. - /// - /// Extra work is used to check to write out flanking bases of the split - /// k-mer, and padding with any missing positions. - /// - /// Samples are written to the output buffer one at a time. - /// - /// # Panics - /// - /// If [`RefSka::map()`] has not been run yet, or no split-kmers mapped - pub fn write_aln(&self, f: &mut W) -> Result<(), needletail::errors::ParseError> { - if !self.is_mapped() { - panic!("TNo split k-mers mapped to reference"); - } - if self.chrom_names.len() > 1 { - eprintln!("WARNING: Reference contained multiple contigs, in the output they will be concatenated"); - } - let half_split_len = (self.k - 1) / 2; - for (sample_idx, sample_name) in self.mapped_names.iter().enumerate() { - let sample_vars = self.mapped_variants.slice(s![.., sample_idx]); - let mut seq: Vec = Vec::new(); - seq.reserve(self.total_size); - - // if this proves tricky, it might be better to iterate through non-missing - // matches and paste each flanking end and middle base into the right place - // in the vec (and skip to next match where right end is beyond last paste) - // the alternative would be to: - // - allocate all missing - // - iterate through ref - // - write the whole k-mer in when found (ref, base, ref) - // - then skip over k in ref - // (even just modifying map to skip over k would work, but not a VCF) - let (mut next_pos, mut curr_chrom) = (0, 0); - for ((map_chrom, map_pos), base) in self.mapped_pos.iter().zip(sample_vars.iter()) { - // Move forward to next chromosome/contig - if *map_chrom > curr_chrom { - seq.extend_from_slice( - &self.seq[curr_chrom][next_pos..(next_pos + half_split_len)], - ); - seq.extend(vec![ - b'-'; - self.seq[curr_chrom].len() - (next_pos + half_split_len) // This shouldn't overflow - ]); - curr_chrom += 1; - next_pos = 0; - } - if *map_pos > next_pos { - if *map_pos > next_pos + half_split_len { - // Missing bases, then first flanking half of split k-mer - // if no k-mers mapped over a large region - seq.extend(vec![b'-'; *map_pos - next_pos - half_split_len]); - seq.extend_from_slice( - &self.seq[curr_chrom][(*map_pos - half_split_len)..*map_pos], - ); - } else { - // Short missing region, which can happen with fastq input - // Reasonable to take bases from match - seq.extend_from_slice(&self.seq[curr_chrom][next_pos..*map_pos]); - } - } - next_pos = *map_pos + 1; - if *base == b'-' { - // This is around a split k-mer match, so we can fill in the - // flanking region with the reference - seq.push(self.seq[curr_chrom][*map_pos]); - } else { - seq.push(*base); - } - } - // Fill up to end of contig - seq.extend_from_slice(&self.seq[curr_chrom][next_pos..(next_pos + half_split_len)]); - seq.extend(vec![ - b'-'; - self.seq[curr_chrom].len() - (next_pos + half_split_len) - ]); - write_fasta( - sample_name.as_bytes(), - seq.as_slice(), - f, - needletail::parser::LineEnding::Unix, - )?; } Ok(()) } diff --git a/src/ska_ref/aln_writer.rs b/src/ska_ref/aln_writer.rs new file mode 100644 index 0000000..8e32fa0 --- /dev/null +++ b/src/ska_ref/aln_writer.rs @@ -0,0 +1,151 @@ +//! Generate a linear pseudoalignment. +//! +//! An empty sequence with the same length is set as the sequence +//! Matches are written as the first half, plus the middle base, e.g. for 7-mer +//! ACTGCTA then ACTG would be written. The first half is copied over from the +//! reference. Matches are skipped over until the next `k / 2 + 1` position +//! which is where more sequence can then be written. If bases are missing +//! or at the end of the contig these are caught up by adding the skipped +//! matches and the front half of the k-mer. +//! +//! All sequence is stored in memory. Calling [`AlnWriter::get_seq()`] finalises +//! and writes to the end of the last contig. + +/// Stores indexes which keep track of the writing, the output sequence and a +/// reference to the reference sequences which are written in to flanks of +/// each match. +#[derive(Clone)] +pub struct AlnWriter<'a> { + /// Next position where it is valid to write a match. + next_pos: usize, + /// The current chromsome. + curr_chrom: usize, + /// The latest position where a base has been mapped (may not have been written). + last_mapped: usize, + /// The latest position in `seq_out` that has been written to. + last_written: usize, + /// An offset which is added to convert chromosome positions to concatenated position. + chrom_offset: usize, + /// The reference sequences. + ref_seq: &'a Vec>, + /// The output alignment. + seq_out: Vec, + /// The size of the flanking region for each split k-mer match. + half_split_len: usize, + /// Whether the finalise function has been run, filling to the end of the + /// final contig. + finalised: bool, +} + +impl<'a> AlnWriter<'a> { + /// Create a new [`AlnWriter`] taking the reference sequence mapped against + /// and the k-mer size used for the mapping + pub fn new(ref_seq: &'a Vec>, k: usize) -> Self { + let total_size = ref_seq.iter().map(|x| x.len()).sum(); + let half_split_len = (k - 1) / 2; + Self { + next_pos: half_split_len, + curr_chrom: 0, + last_mapped: 0, + last_written: 0, + chrom_offset: 0, + ref_seq, + seq_out: vec![b'-'; total_size], + half_split_len, + finalised: false, + } + } + + /// Get the total length of the concatenated sequence output + pub fn total_size(&self) -> usize { + self.seq_out.len() + } + + // Fill fwd bases, and any skipped over. + // e.g. with split 7-mers perfectly matching + // CCGA AAGT + // 1 23 + // 1: Last written + // 2: Last mapped + // 3: Current mapped position + // Don't write anything + // CCGA[TT]-----AAGT + // 1 A2 B 3 + // Want to write between + // A = last written + 1 + // B = A + half-k + (last mapped - last written) + // maximum is provided as the base about to be written next, this will not + // write over this position + fn fill_fwd_bases(&mut self, maximum: usize) { + // bases at the end of last valid match not yet written + if self.last_written > 0 { + let last_match_overhang = + (self.last_mapped + self.half_split_len).saturating_sub(self.last_written); + let start = self.last_written + 1; + let end = usize::min(start + last_match_overhang, maximum); + if end > start { + self.seq_out[(start + self.chrom_offset)..(end + self.chrom_offset)] + .copy_from_slice(&self.ref_seq[self.curr_chrom][start..end]); + self.last_written = end; + } + } + } + + // Fill up to the end of the contig and update indices + fn fill_contig(&mut self) { + let chrom_length = self.ref_seq[self.curr_chrom].len(); + self.fill_fwd_bases(chrom_length); + self.chrom_offset += chrom_length; + self.curr_chrom += 1; + self.next_pos = self.half_split_len; + } + + /// Write the first half of a split k-mer (by copying from the reference) + /// and its middle base, also filling any skipped bases as appropiate. + /// + /// Call in a loop over mapped positions, usually excluding '-' base which + /// haven't actually been mapped + pub fn write_split_kmer(&mut self, mapped_pos: usize, mapped_chrom: usize, base: u8) { + if mapped_chrom > self.curr_chrom { + self.fill_contig(); + } + if mapped_pos < self.next_pos { + self.last_mapped = mapped_pos; + } else { + // Write bases between last match and this one + if mapped_pos > self.next_pos { + self.fill_fwd_bases(mapped_pos - self.half_split_len); + } + + // First half of split k-mer + let start = mapped_pos - self.half_split_len; + let end = mapped_pos; + self.seq_out[(start + self.chrom_offset)..(end + self.chrom_offset)] + .copy_from_slice(&self.ref_seq[self.curr_chrom][start..end]); + // Middle base + self.seq_out[mapped_pos + self.chrom_offset] = base; + + // update indices + self.next_pos = mapped_pos + self.half_split_len + 1; + self.last_mapped = mapped_pos; + self.last_written = mapped_pos; + } + } + + /// Fills to the end of the final contig + /// Should only be called after the last call to [`AlnWriter::write_split_kmer()`]. + pub fn finalise(&mut self) { + if !self.finalised { + while self.curr_chrom < self.ref_seq.len() { + self.fill_contig(); + } + self.finalised = true; + } + } + + /// Retrieve the written sequence. Calls [`AlnWriter::finalise()`] if not already called. + pub fn get_seq(&'a mut self) -> &'a [u8] { + self.finalise(); + self.seq_out.as_slice() + } +} diff --git a/src/ska_ref/idx_check.rs b/src/ska_ref/idx_check.rs new file mode 100644 index 0000000..b069129 --- /dev/null +++ b/src/ska_ref/idx_check.rs @@ -0,0 +1,68 @@ +//! Keep track of current chromosome during iteration over a concatenated +//! alignment. +//! +//! Simple counter with state to avoid doing range lookups (e.g. with an +//! interval tree) at every position. Used when converting alignment to VCF. + +/// Builds the contig change indices and keeps track of the current chromosome. +#[derive(Debug)] +pub struct IdxCheck { + /// The end coordinates (last base) of each chromosome + end_coor: Vec, +} + +impl IdxCheck { + /// Create indicies from Vec of Vec repr of a reference sequence + pub fn new(ref_seq: &[Vec]) -> Self { + let mut end_coor = Vec::new(); + + let mut cum_pos = 0; + for chrom in ref_seq { + cum_pos += chrom.len(); + end_coor.push(cum_pos); + } + + Self { end_coor } + } + + /// Iterate over index to return (chromosome, position) tuples + pub fn iter(&self) -> IdxCheckIter<'_> { + IdxCheckIter { + end_coor: &self.end_coor, + current_chr: 0, + idx: 0, + } + } +} + +/// Carries state which keeps track of chromosome during iteration +/// +/// Iterator as separate class so [`IdxCheck`] not modified during iteration +pub struct IdxCheckIter<'a> { + /// Ref to end coordinates + end_coor: &'a Vec, + /// Current chromosome + current_chr: usize, + /// Current absolute position + idx: usize, +} + +impl<'a> Iterator for IdxCheckIter<'a> { + type Item = (usize, usize); + + fn next(&mut self) -> Option<(usize, usize)> { + if self.idx >= self.end_coor[self.current_chr] { + self.current_chr += 1; + } + if self.current_chr < self.end_coor.len() { + let mut pos = self.idx; + if self.current_chr > 0 { + pos -= self.end_coor[self.current_chr - 1]; + } + self.idx += 1; + Some((self.current_chr, pos)) + } else { + None + } + } +} diff --git a/tests/align.rs b/tests/align.rs index ec95c71..d215581 100644 --- a/tests/align.rs +++ b/tests/align.rs @@ -1,7 +1,7 @@ use snapbox::cmd::{cargo_bin, Command}; pub mod common; -use crate::common::{var_hash, TestDir, TestSetup}; +use crate::common::*; use hashbrown::HashSet; @@ -11,7 +11,7 @@ use hashbrown::HashSet; fn build_cli() { let sandbox = TestSetup::setup(); // Create an rfile in the tmp dir - let rfile_name = sandbox.create_rfile(&"test", false); + let rfile_name = sandbox.create_rfile(&"test", FxType::Fasta); Command::new(cargo_bin("ska")) .current_dir(sandbox.get_wd()) diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 3052167..627c3a0 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -22,6 +22,12 @@ pub enum TestDir { Correct, } +#[derive(Debug, PartialEq, Copy, Clone)] +pub enum FxType { + Fasta, + Fastq, +} + pub struct TestSetup { wd: TempDir, } @@ -81,14 +87,14 @@ impl TestSetup { predicate_fn.eval(self.wd.child(name_out).path()) } - pub fn create_rfile(&self, prefix: &str, fastq: bool) -> &str { + pub fn create_rfile(&self, prefix: &str, fx_type: FxType) -> &str { // Create an rfile in the tmp dir let mut rfile = LineWriter::new( File::create(format!("{}/{}", self.get_wd(), RFILE_NAME)) .expect("Could not write rfile"), ); - match fastq { - true => { + match fx_type { + FxType::Fastq => { writeln!( rfile, "{}", @@ -112,7 +118,7 @@ impl TestSetup { ) .unwrap(); } - false => { + FxType::Fasta => { writeln!( rfile, "{}", @@ -178,3 +184,13 @@ pub fn var_hash(aln_string: &[u8]) -> HashSet> { return variant_pairs; } + +// Helper for comparing mapped alignments with different sample names +pub fn cmp_map_aln(aln1: &[u8], aln2: &[u8]) { + let aln1_str = String::from_utf8(aln1.to_vec()).unwrap(); + let aln2_str = String::from_utf8(aln2.to_vec()).unwrap(); + + for (line1, line2) in aln1_str.lines().zip(aln2_str.lines()).skip(1).step_by(2) { + assert_eq!(line1, line2); + } +} diff --git a/tests/fasta_input.rs b/tests/fasta_input.rs index b274c1d..f2a7bc8 100644 --- a/tests/fasta_input.rs +++ b/tests/fasta_input.rs @@ -36,6 +36,8 @@ fn map_n() { .arg("build") .arg(sandbox.file_string("N_test_1.fa", TestDir::Input)) .arg(sandbox.file_string("N_test_2.fa", TestDir::Input)) + .arg("-k") + .arg("11") .arg("-o") .arg("N_test") .assert() diff --git a/tests/fastq_input.rs b/tests/fastq_input.rs index 5804fba..152e69f 100644 --- a/tests/fastq_input.rs +++ b/tests/fastq_input.rs @@ -1,13 +1,13 @@ use snapbox::cmd::{cargo_bin, Command}; pub mod common; -use crate::common::{var_hash, TestDir, TestSetup}; +use crate::common::*; // Uses perfect reads with the same fasta input as merge.skf #[test] fn align_fastq() { let sandbox = TestSetup::setup(); - let rfile_name = sandbox.create_rfile(&"test", true); + let rfile_name = sandbox.create_rfile(&"test", FxType::Fastq); Command::new(cargo_bin("ska")) .current_dir(sandbox.get_wd()) @@ -54,7 +54,7 @@ fn align_fastq() { #[test] fn map_fastq() { let sandbox = TestSetup::setup(); - let rfile_name = sandbox.create_rfile(&"test", true); + let rfile_name = sandbox.create_rfile(&"test", FxType::Fastq); Command::new(cargo_bin("ska")) .current_dir(sandbox.get_wd()) @@ -97,8 +97,8 @@ fn map_fastq() { .stdout; assert_eq!( - String::from_utf8(fastq_map_out), - String::from_utf8(fasta_map_out) + String::from_utf8(fastq_map_out).unwrap(), + String::from_utf8(fasta_map_out).unwrap() ); let fastq_map_out_vcf = Command::new(cargo_bin("ska")) @@ -136,7 +136,7 @@ fn error_fastq() { let sandbox = TestSetup::setup(); // Without errors - let rfile_name = sandbox.create_rfile(&"test", true); + let rfile_name = sandbox.create_rfile(&"test", FxType::Fastq); Command::new(cargo_bin("ska")) .current_dir(sandbox.get_wd()) .arg("build") @@ -159,7 +159,7 @@ fn error_fastq() { all_hash.remove(&vec!['C', 'T']); // With errors - let rfile_name = sandbox.create_rfile(&"test_error", true); + let rfile_name = sandbox.create_rfile(&"test_error", FxType::Fastq); Command::new(cargo_bin("ska")) .current_dir(sandbox.get_wd()) .arg("build") @@ -182,7 +182,7 @@ fn error_fastq() { assert_eq!(var_hash(&fastq_align_out_error), all_hash); // With low quality score - let rfile_name = sandbox.create_rfile(&"test_quality", true); + let rfile_name = sandbox.create_rfile(&"test_quality", FxType::Fastq); Command::new(cargo_bin("ska")) .current_dir(sandbox.get_wd()) .arg("build") diff --git a/tests/map.rs b/tests/map.rs index c5437c3..02b5b63 100644 --- a/tests/map.rs +++ b/tests/map.rs @@ -1,7 +1,7 @@ use snapbox::cmd::{cargo_bin, Command}; pub mod common; -use crate::common::{TestDir, TestSetup}; +use crate::common::*; // NB: to view output, uncomment the current_dir lines @@ -13,8 +13,8 @@ fn map_aln() { .current_dir(sandbox.get_wd()) .arg("map") .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) - .arg(sandbox.file_string("merge.skf", TestDir::Input)) - .args(&["-v", "--threads", "2"]) + .arg(sandbox.file_string("test_1.fa", TestDir::Input)) + .arg(sandbox.file_string("test_2.fa", TestDir::Input)) .arg("-o") .arg("map.aln") .assert() @@ -26,10 +26,35 @@ fn map_aln() { .current_dir(sandbox.get_wd()) .arg("map") .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) - .arg(sandbox.file_string("test_1.fa", TestDir::Input)) - .arg(sandbox.file_string("test_2.fa", TestDir::Input)) + .arg(sandbox.file_string("merge.skf", TestDir::Input)) + .args(&["-v", "--threads", "2"]) .assert() .stdout_matches_path(sandbox.file_string("map_aln.stdout", TestDir::Correct)); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg(sandbox.file_string("merge_k9.skf", TestDir::Input)) + .assert() + .stdout_matches_path(sandbox.file_string("map_aln_k9.stdout", TestDir::Correct)); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref_two_chrom.fa", TestDir::Input)) + .arg(sandbox.file_string("merge.skf", TestDir::Input)) + .assert() + .stdout_matches_path(sandbox.file_string("map_aln_two_chrom.stdout", TestDir::Correct)); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg(sandbox.file_string("test_1.fa", TestDir::Input)) + .arg(sandbox.file_string("indel_test.fa", TestDir::Input)) + .assert() + .stdout_matches_path(sandbox.file_string("map_aln_indels.stdout", TestDir::Correct)); } #[test] @@ -40,11 +65,12 @@ fn map_vcf() { .current_dir(sandbox.get_wd()) .arg("map") .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) - .arg(sandbox.file_string("merge.skf", TestDir::Input)) - .arg("-f") - .arg("vcf") + .arg(sandbox.file_string("test_1.fa", TestDir::Input)) + .arg(sandbox.file_string("test_2.fa", TestDir::Input)) .arg("-o") .arg("map.vcf") + .arg("-f") + .arg("vcf") .assert() .success(); @@ -54,10 +80,98 @@ fn map_vcf() { .current_dir(sandbox.get_wd()) .arg("map") .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) - .arg(sandbox.file_string("test_1.fa", TestDir::Input)) - .arg(sandbox.file_string("test_2.fa", TestDir::Input)) + .arg(sandbox.file_string("merge.skf", TestDir::Input)) .arg("-f") .arg("vcf") .assert() .stdout_matches_path(sandbox.file_string("map_vcf.stdout", TestDir::Correct)); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref_two_chrom.fa", TestDir::Input)) + .arg(sandbox.file_string("merge.skf", TestDir::Input)) + .arg("-f") + .arg("vcf") + .assert() + .stdout_matches_path(sandbox.file_string("map_vcf_two_chrom.stdout", TestDir::Correct)); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg(sandbox.file_string("test_1.fa", TestDir::Input)) + .arg(sandbox.file_string("indel_test.fa", TestDir::Input)) + .arg("-f") + .arg("vcf") + .assert() + .stdout_matches_path(sandbox.file_string("map_vcf_indels.stdout", TestDir::Correct)); +} + +#[test] +fn map_rev_comp() { + let sandbox = TestSetup::setup(); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("build") + .arg("-o") + .arg("rc_build") + .arg("-k") + .arg("9") + .arg(sandbox.file_string("test_1.fa", TestDir::Input)) + .arg(sandbox.file_string("test_2_rc.fa", TestDir::Input)) + .assert() + .success(); + + let rc_map = Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg("fwd_build.skf") + .output() + .unwrap() + .stdout; + + let fwd_map = Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg(sandbox.file_string("merge.skf", TestDir::Input)) + .output() + .unwrap() + .stdout; + + cmp_map_aln(&rc_map, &fwd_map); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("build") + .arg("-o") + .arg("ss_map") + .arg("-k") + .arg("9") + .arg("--single-strand") + .arg(sandbox.file_string("test_1.fa", TestDir::Input)) + .arg(sandbox.file_string("test_2_rc.fa", TestDir::Input)) + .assert() + .success(); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg("ss_map.skf") + .assert() + .stdout_matches_path(sandbox.file_string("map_ss.stdout", TestDir::Correct)); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg("ss_map.skf") + .arg("-f") + .arg("vcf") + .assert() + .stdout_matches_path(sandbox.file_string("map_vcf_ss.stdout", TestDir::Correct)); } diff --git a/tests/test_files_in/indel_test.fa b/tests/test_files_in/indel_test.fa new file mode 100644 index 0000000..674407c --- /dev/null +++ b/tests/test_files_in/indel_test.fa @@ -0,0 +1,2 @@ +>sample1 +GATTTGAAGGGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGGGAGTGTTTTGATGGTTT diff --git a/tests/test_files_in/merge_k9.skf b/tests/test_files_in/merge_k9.skf new file mode 100644 index 0000000..bf0e22c Binary files /dev/null and b/tests/test_files_in/merge_k9.skf differ diff --git a/tests/test_files_in/test_1_fwd.fastq.gz b/tests/test_files_in/test_1_fwd.fastq.gz index b10b0ed..ce92d56 100644 Binary files a/tests/test_files_in/test_1_fwd.fastq.gz and b/tests/test_files_in/test_1_fwd.fastq.gz differ diff --git a/tests/test_files_in/test_ref_two_chrom.fa b/tests/test_files_in/test_ref_two_chrom.fa new file mode 100644 index 0000000..c04dcde --- /dev/null +++ b/tests/test_files_in/test_ref_two_chrom.fa @@ -0,0 +1,4 @@ +>fake_ref_chrom1 +ACGTACGTACGTGATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTGTTTTGATGGTTTACGTACGTACGT +>fake_ref_chrom2 +GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTGTTTTGATGGTTT diff --git a/tests/test_results_correct/map_N.stdout b/tests/test_results_correct/map_N.stdout index e9c14f1..5ac3cce 100644 --- a/tests/test_results_correct/map_N.stdout +++ b/tests/test_results_correct/map_N.stdout @@ -1,4 +1,4 @@ >N_test_1 -------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAA----------------------------- +------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTT---AGTGTTTTGATGGT-------------- >N_test_2 ------------GATTTGAAGGCGTTTATGATATATAGCTGTACGAGAGTCTTTTAA----------------------------- diff --git a/tests/test_results_correct/map_aln.stdout b/tests/test_results_correct/map_aln.stdout index a5683ab..c1148e9 100644 --- a/tests/test_results_correct/map_aln.stdout +++ b/tests/test_results_correct/map_aln.stdout @@ -1,4 +1,4 @@ >test_1 ------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTGTTTTGATGGTTT------------ >test_2 -------------GATTTGAAGGCGTTTATGATATATAGCTGTACGAGAGTCTTTTAAAAGTGTTTTGATGGTTT------------ +------------GATTTGAAGGCGTTTATGATATATAGCTGTACGAGAGTCTTTTAAAAGTGTTTT-------------------- diff --git a/tests/test_results_correct/map_aln_indels.stdout b/tests/test_results_correct/map_aln_indels.stdout new file mode 100644 index 0000000..fe906b6 --- /dev/null +++ b/tests/test_results_correct/map_aln_indels.stdout @@ -0,0 +1,4 @@ +>test_1 +------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTGTTTTGATGGTTT------------ +>indel_test +-----------------------GTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTGTTTTGATGGTTT------------ diff --git a/tests/test_results_correct/map_aln_k9.stdout b/tests/test_results_correct/map_aln_k9.stdout new file mode 100644 index 0000000..f90ac74 --- /dev/null +++ b/tests/test_results_correct/map_aln_k9.stdout @@ -0,0 +1,4 @@ +>test_1 +------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTSTTTTGATGGTTT------------ +>test_2 +------------GATTTGAAGGCGTTTATGATATATAGCTGTACGAGAGTCTTTTAAAAGTGTTTTGATGGTTT------------ diff --git a/tests/test_results_correct/map_aln_two_chrom.stdout b/tests/test_results_correct/map_aln_two_chrom.stdout new file mode 100644 index 0000000..a1161c2 --- /dev/null +++ b/tests/test_results_correct/map_aln_two_chrom.stdout @@ -0,0 +1,4 @@ +>test_1 +------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTGTTTTGATGGTTT------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTGTTTTGATGGTTT +>test_2 +------------GATTTGAAGGCGTTTATGATATATAGCTGTACGAGAGTCTTTTAAAAGTGTTTT--------------------GATTTGAAGGCGTTTATGATATATAGCTGTACGAGAGTCTTTTAAAAGTGTTTT-------- diff --git a/tests/test_results_correct/map_ss.stdout b/tests/test_results_correct/map_ss.stdout new file mode 100644 index 0000000..f512e77 --- /dev/null +++ b/tests/test_results_correct/map_ss.stdout @@ -0,0 +1,4 @@ +>test_1 +------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTCTTTTAAGAGTSTTTTGATGGTTT------------ +>test_2_rc +-------------------------------------------------------------------------------------- diff --git a/tests/test_results_correct/map_vcf.stdout b/tests/test_results_correct/map_vcf.stdout index ed6736e..5f473e6 100644 --- a/tests/test_results_correct/map_vcf.stdout +++ b/tests/test_results_correct/map_vcf.stdout @@ -15,6 +15,14 @@ fake_ref 11 . G . . . . GT . . fake_ref 12 . T . . . . GT . . fake_ref 35 . T A . . . GT 0 1 fake_ref 58 . G A . . . GT 0 1 +fake_ref 67 . G . . . . GT 0 . +fake_ref 68 . A . . . . GT 0 . +fake_ref 69 . T . . . . GT 0 . +fake_ref 70 . G . . . . GT 0 . +fake_ref 71 . G . . . . GT 0 . +fake_ref 72 . T . . . . GT 0 . +fake_ref 73 . T . . . . GT 0 . +fake_ref 74 . T . . . . GT 0 . fake_ref 75 . A . . . . GT . . fake_ref 76 . C . . . . GT . . fake_ref 77 . G . . . . GT . . diff --git a/tests/test_results_correct/map_vcf_indels.stdout b/tests/test_results_correct/map_vcf_indels.stdout new file mode 100644 index 0000000..3bbb5f9 --- /dev/null +++ b/tests/test_results_correct/map_vcf_indels.stdout @@ -0,0 +1,38 @@ +##fileformat=VCFv4.3 +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test_1 indel_test +fake_ref 1 . A . . . . GT . . +fake_ref 2 . C . . . . GT . . +fake_ref 3 . G . . . . GT . . +fake_ref 4 . T . . . . GT . . +fake_ref 5 . A . . . . GT . . +fake_ref 6 . C . . . . GT . . +fake_ref 7 . G . . . . GT . . +fake_ref 8 . T . . . . GT . . +fake_ref 9 . A . . . . GT . . +fake_ref 10 . C . . . . GT . . +fake_ref 11 . G . . . . GT . . +fake_ref 12 . T . . . . GT . . +fake_ref 13 . G . . . . GT 0 . +fake_ref 14 . A . . . . GT 0 . +fake_ref 15 . T . . . . GT 0 . +fake_ref 16 . T . . . . GT 0 . +fake_ref 17 . T . . . . GT 0 . +fake_ref 18 . G . . . . GT 0 . +fake_ref 19 . A . . . . GT 0 . +fake_ref 20 . A . . . . GT 0 . +fake_ref 21 . G . . . . GT 0 . +fake_ref 22 . G . . . . GT 0 . +fake_ref 23 . C . . . . GT 0 . +fake_ref 75 . A . . . . GT . . +fake_ref 76 . C . . . . GT . . +fake_ref 77 . G . . . . GT . . +fake_ref 78 . T . . . . GT . . +fake_ref 79 . A . . . . GT . . +fake_ref 80 . C . . . . GT . . +fake_ref 81 . G . . . . GT . . +fake_ref 82 . T . . . . GT . . +fake_ref 83 . A . . . . GT . . +fake_ref 84 . C . . . . GT . . +fake_ref 85 . G . . . . GT . . +fake_ref 86 . T . . . . GT . . diff --git a/tests/test_results_correct/map_vcf_ss.stdout b/tests/test_results_correct/map_vcf_ss.stdout new file mode 100644 index 0000000..777ceb8 --- /dev/null +++ b/tests/test_results_correct/map_vcf_ss.stdout @@ -0,0 +1,89 @@ +##fileformat=VCFv4.3 +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test_1 test_2_rc +fake_ref 1 . A . . . . GT . . +fake_ref 2 . C . . . . GT . . +fake_ref 3 . G . . . . GT . . +fake_ref 4 . T . . . . GT . . +fake_ref 5 . A . . . . GT . . +fake_ref 6 . C . . . . GT . . +fake_ref 7 . G . . . . GT . . +fake_ref 8 . T . . . . GT . . +fake_ref 9 . A . . . . GT . . +fake_ref 10 . C . . . . GT . . +fake_ref 11 . G . . . . GT . . +fake_ref 12 . T . . . . GT . . +fake_ref 13 . G . . . . GT 0 . +fake_ref 14 . A . . . . GT 0 . +fake_ref 15 . T . . . . GT 0 . +fake_ref 16 . T . . . . GT 0 . +fake_ref 17 . T . . . . GT 0 . +fake_ref 18 . G . . . . GT 0 . +fake_ref 19 . A . . . . GT 0 . +fake_ref 20 . A . . . . GT 0 . +fake_ref 21 . G . . . . GT 0 . +fake_ref 22 . G . . . . GT 0 . +fake_ref 23 . C . . . . GT 0 . +fake_ref 24 . G . . . . GT 0 . +fake_ref 25 . T . . . . GT 0 . +fake_ref 26 . T . . . . GT 0 . +fake_ref 27 . T . . . . GT 0 . +fake_ref 28 . A . . . . GT 0 . +fake_ref 29 . T . . . . GT 0 . +fake_ref 30 . G . . . . GT 0 . +fake_ref 31 . A . . . . GT 0 . +fake_ref 32 . T . . . . GT 0 . +fake_ref 33 . A . . . . GT 0 . +fake_ref 34 . T . . . . GT 0 . +fake_ref 35 . T . . . . GT 0 . +fake_ref 36 . T . . . . GT 0 . +fake_ref 37 . A . . . . GT 0 . +fake_ref 38 . G . . . . GT 0 . +fake_ref 39 . C . . . . GT 0 . +fake_ref 40 . T . . . . GT 0 . +fake_ref 41 . G . . . . GT 0 . +fake_ref 42 . T . . . . GT 0 . +fake_ref 43 . A . . . . GT 0 . +fake_ref 44 . C . . . . GT 0 . +fake_ref 45 . G . . . . GT 0 . +fake_ref 46 . A . . . . GT 0 . +fake_ref 47 . G . . . . GT 0 . +fake_ref 48 . A . . . . GT 0 . +fake_ref 49 . G . . . . GT 0 . +fake_ref 50 . T . . . . GT 0 . +fake_ref 51 . C . . . . GT 0 . +fake_ref 52 . T . . . . GT 0 . +fake_ref 53 . T . . . . GT 0 . +fake_ref 54 . T . . . . GT 0 . +fake_ref 55 . T . . . . GT 0 . +fake_ref 56 . A . . . . GT 0 . +fake_ref 57 . A . . . . GT 0 . +fake_ref 58 . G . . . . GT 0 . +fake_ref 59 . A . . . . GT 0 . +fake_ref 60 . G . . . . GT 0 . +fake_ref 61 . T . . . . GT 0 . +fake_ref 62 . G N . . . GT 1 . +fake_ref 63 . T . . . . GT 0 . +fake_ref 64 . T . . . . GT 0 . +fake_ref 65 . T . . . . GT 0 . +fake_ref 66 . T . . . . GT 0 . +fake_ref 67 . G . . . . GT 0 . +fake_ref 68 . A . . . . GT 0 . +fake_ref 69 . T . . . . GT 0 . +fake_ref 70 . G . . . . GT 0 . +fake_ref 71 . G . . . . GT 0 . +fake_ref 72 . T . . . . GT 0 . +fake_ref 73 . T . . . . GT 0 . +fake_ref 74 . T . . . . GT 0 . +fake_ref 75 . A . . . . GT . . +fake_ref 76 . C . . . . GT . . +fake_ref 77 . G . . . . GT . . +fake_ref 78 . T . . . . GT . . +fake_ref 79 . A . . . . GT . . +fake_ref 80 . C . . . . GT . . +fake_ref 81 . G . . . . GT . . +fake_ref 82 . T . . . . GT . . +fake_ref 83 . A . . . . GT . . +fake_ref 84 . C . . . . GT . . +fake_ref 85 . G . . . . GT . . +fake_ref 86 . T . . . . GT . . diff --git a/tests/test_results_correct/map_vcf_two_chrom.stdout b/tests/test_results_correct/map_vcf_two_chrom.stdout new file mode 100644 index 0000000..c8e9925 --- /dev/null +++ b/tests/test_results_correct/map_vcf_two_chrom.stdout @@ -0,0 +1,48 @@ +##fileformat=VCFv4.3 +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test_1 test_2 +fake_ref_chrom1 1 . A . . . . GT . . +fake_ref_chrom1 2 . C . . . . GT . . +fake_ref_chrom1 3 . G . . . . GT . . +fake_ref_chrom1 4 . T . . . . GT . . +fake_ref_chrom1 5 . A . . . . GT . . +fake_ref_chrom1 6 . C . . . . GT . . +fake_ref_chrom1 7 . G . . . . GT . . +fake_ref_chrom1 8 . T . . . . GT . . +fake_ref_chrom1 9 . A . . . . GT . . +fake_ref_chrom1 10 . C . . . . GT . . +fake_ref_chrom1 11 . G . . . . GT . . +fake_ref_chrom1 12 . T . . . . GT . . +fake_ref_chrom1 35 . T A . . . GT 0 1 +fake_ref_chrom1 58 . G A . . . GT 0 1 +fake_ref_chrom1 67 . G . . . . GT 0 . +fake_ref_chrom1 68 . A . . . . GT 0 . +fake_ref_chrom1 69 . T . . . . GT 0 . +fake_ref_chrom1 70 . G . . . . GT 0 . +fake_ref_chrom1 71 . G . . . . GT 0 . +fake_ref_chrom1 72 . T . . . . GT 0 . +fake_ref_chrom1 73 . T . . . . GT 0 . +fake_ref_chrom1 74 . T . . . . GT 0 . +fake_ref_chrom1 75 . A . . . . GT . . +fake_ref_chrom1 76 . C . . . . GT . . +fake_ref_chrom1 77 . G . . . . GT . . +fake_ref_chrom1 78 . T . . . . GT . . +fake_ref_chrom1 79 . A . . . . GT . . +fake_ref_chrom1 80 . C . . . . GT . . +fake_ref_chrom1 81 . G . . . . GT . . +fake_ref_chrom1 82 . T . . . . GT . . +fake_ref_chrom1 83 . A . . . . GT . . +fake_ref_chrom1 84 . C . . . . GT . . +fake_ref_chrom1 85 . G . . . . GT . . +fake_ref_chrom1 86 . T . . . . GT . . +fake_ref_chrom2 23 . T A . . . GT 0 1 +fake_ref_chrom2 46 . G A . . . GT 0 1 +fake_ref_chrom2 55 . G . . . . GT 0 . +fake_ref_chrom2 56 . A . . . . GT 0 . +fake_ref_chrom2 57 . T . . . . GT 0 . +fake_ref_chrom2 58 . G . . . . GT 0 . +fake_ref_chrom2 59 . G . . . . GT 0 . +fake_ref_chrom2 60 . T . . . . GT 0 . +fake_ref_chrom2 61 . T . . . . GT 0 . +fake_ref_chrom2 62 . T . . . . GT 0 .