diff --git a/src/ska_ref.rs b/src/ska_ref.rs index 2d101f7..f0cbaa6 100644 --- a/src/ska_ref.rs +++ b/src/ska_ref.rs @@ -466,17 +466,24 @@ impl RefSka { ); seq.extend(vec![ b'-'; - self.seq[curr_chrom].len() - (next_pos + half_split_len) + self.seq[curr_chrom].len() - (next_pos + half_split_len) // This shouldn't overflow ]); curr_chrom += 1; next_pos = 0; } if *map_pos > next_pos { - // Missing bases, if no k-mers mapped over a region - seq.extend(vec![b'-'; *map_pos - next_pos - half_split_len]); - seq.extend_from_slice( - &self.seq[curr_chrom][(*map_pos - half_split_len)..*map_pos], - ); + if *map_pos > next_pos + half_split_len { + // Missing bases, then first flanking half of split k-mer + // if no k-mers mapped over a large region + seq.extend(vec![b'-'; *map_pos - next_pos - half_split_len]); + seq.extend_from_slice( + &self.seq[curr_chrom][(*map_pos - half_split_len)..*map_pos], + ); + } else { + // Short missing region, which can happen with fastq input + // Reasonable to take bases from match + seq.extend_from_slice(&self.seq[curr_chrom][next_pos..*map_pos]); + } } next_pos = *map_pos + 1; if *base == b'-' { diff --git a/tests/fastq_input.rs b/tests/fastq_input.rs index 6a4b857..5804fba 100644 --- a/tests/fastq_input.rs +++ b/tests/fastq_input.rs @@ -50,6 +50,83 @@ fn align_fastq() { assert_eq!(var_hash(&fastq_align_out), var_hash(&fasta_align_out)); } +// Uses perfect reads with the same fasta input as merge.skf +#[test] +fn map_fastq() { + let sandbox = TestSetup::setup(); + let rfile_name = sandbox.create_rfile(&"test", true); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("build") + .arg("-f") + .arg(rfile_name) + .arg("-o") + .arg("reads") + .args(&["--min-count", "1", "-v", "-k", "9", "--min-qual", "2"]) + .assert() + .success(); + + let fastq_map_out = Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg("reads.skf") + .output() + .unwrap() + .stdout; + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("build") + .arg(sandbox.file_string("test_1.fa", TestDir::Input)) + .arg(sandbox.file_string("test_2.fa", TestDir::Input)) + .arg("-o") + .arg("assemblies") + .args(&["-v", "-k", "9"]) + .assert() + .success(); + + let fasta_map_out = Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg("assemblies.skf") + .output() + .unwrap() + .stdout; + + assert_eq!( + String::from_utf8(fastq_map_out), + String::from_utf8(fasta_map_out) + ); + + let fastq_map_out_vcf = Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg("reads.skf") + .args(&["-f", "vcf"]) + .output() + .unwrap() + .stdout; + + let fasta_map_out_vcf = Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg("assemblies.skf") + .args(&["-f", "vcf"]) + .output() + .unwrap() + .stdout; + + assert_eq!( + String::from_utf8(fastq_map_out_vcf), + String::from_utf8(fasta_map_out_vcf) + ); +} + // Add errors and low quality scores to split k-mers around // CACT TTAA C,T // zgrep 'TTAA.AGTG' test_1_fwd.fastq.gz