From 02262bc1f8c5dec7651a7afb261257b4ac99ba43 Mon Sep 17 00:00:00 2001 From: Ben Woodcroft Date: Wed, 18 Sep 2024 06:59:42 +1000 Subject: [PATCH 1/4] contig: strobealign-aemb: Initial efforts. --- src/lib.rs | 1 + src/strobealign_aemb.rs | 67 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 src/strobealign_aemb.rs diff --git a/src/lib.rs b/src/lib.rs index ac0ee89..bb659a6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,6 +13,7 @@ pub mod mapping_index_maintenance; pub mod mapping_parameters; pub mod mosdepth_genome_coverage_estimators; pub mod shard_bam_reader; +pub mod strobealign_aemb; use rust_htslib::bam::record::Record; use std::sync::Arc; diff --git a/src/strobealign_aemb.rs b/src/strobealign_aemb.rs new file mode 100644 index 0000000..10e18f8 --- /dev/null +++ b/src/strobealign_aemb.rs @@ -0,0 +1,67 @@ +use crate::bam_generator::complete_processes; +use crate::coverage_takers::*; +use crate::mapping_parameters::MappingParameters; +use crate::mosdepth_genome_coverage_estimators::*; +use crate::ReadsMapped; + +pub fn strobealign_aemb_coverage( + mapping_parameters: MappingParameters, + _coverage_taker: &mut T, + _coverage_estimators: &mut [CoverageEstimator], + // print_zero_coverage_contigs: bool, + threads: u16, +) -> Vec { + let mut commands = vec![]; + let mut log_files = vec![]; + + for p1 in mapping_parameters { + for p2 in p1 { + let mapping_result = tempfile::Builder::new() + .prefix("coverm-strobealign-aemb-mapping-result") + .tempfile() + .unwrap_or_else(|_| panic!("Failed to create strobealign-aemb result tempfile")); + let mapping_log = tempfile::Builder::new() + .prefix("coverm-strobealign-aemb-mapping-log") + .tempfile() + .unwrap_or_else(|_| panic!("Failed to create strobealign-aemb log tempfile")); + let cmd = format!( + "strobealign --aemb -t {} {} {} {} > {} 2> {}", + threads, + p2.reference, + p2.read1, + p2.read2.unwrap_or(""), + mapping_result + .path() + .to_str() + .expect("Failed to get strobealign-aemb result path"), + mapping_log + .path() + .to_str() + .expect("Failed to get strobealign-aemb log path"), + ); + log_files.push(mapping_log); + commands.push(cmd); + } + } + + // Run each command in serial, and complete each process + // before starting the next one. + for cmd_string in commands { + debug!("Queuing cmd_string: {}", cmd_string); + let mut cmd = std::process::Command::new("bash"); + cmd.arg("-c") + .arg(&cmd_string) + .stderr(std::process::Stdio::piped()); + let process = cmd.spawn().expect("Unable to execute bash"); + complete_processes( + vec![process], + vec![cmd_string], + vec!["strobealign-aemb log file".to_string()], + vec![log_files.remove(0)], + None, + ); + } + + // Read the mapping results and pass them to the coverage taker + panic!(); +} From 9b32a6abecfd1a11717bfca5744bc65d797de2c4 Mon Sep 17 00:00:00 2001 From: Ben Woodcroft Date: Sat, 28 Sep 2024 20:58:48 +1000 Subject: [PATCH 2/4] contig: Add strobealign-aemb method. --- Cargo.toml | 1 + src/bam_generator.rs | 56 ++++++++++---------- src/bin/coverm.rs | 27 +++++++++- src/cli.rs | 5 +- src/contig.rs | 1 - src/coverage_printer.rs | 1 - src/genome.rs | 1 - src/lib.rs | 1 + src/mapping_parameters.rs | 4 +- src/mosdepth_genome_coverage_estimators.rs | 20 ++++++- src/strobealign_aemb.rs | 61 ++++++++++++++++++---- tests/test_cmdline.rs | 34 ++++++++++-- 12 files changed, 162 insertions(+), 50 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 652f007..94e9d8d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,6 +36,7 @@ galah = "0.4.0" bird_tool_utils-man = "0.4.0" roff = "0.2.*" needletail = "0.5.*" +csv = "1.*" [dev-dependencies] assert_cli = "0.6.*" diff --git a/src/bam_generator.rs b/src/bam_generator.rs index 1d04bc4..c4bcdff 100644 --- a/src/bam_generator.rs +++ b/src/bam_generator.rs @@ -160,6 +160,28 @@ impl NamedBamReaderGenerator for StreamingNamedBamReade } } +pub fn name_stoit( + index_path: &str, + read1_path: &str, + include_reference_in_stoit_name: bool, +) -> String { + (match include_reference_in_stoit_name { + true => (std::path::Path::new(&index_path) + .file_name() + .expect("Unable to convert reference to file name") + .to_str() + .expect("Unable to covert file name into str") + .to_string() + + "/") + .to_string(), + false => "".to_string(), + }) + std::path::Path::new(read1_path) + .file_name() + .expect("Unable to convert read1 name to file name") + .to_str() + .expect("Unable to covert file name into str") +} + pub fn complete_processes( processes: Vec, command_strings: Vec, @@ -412,22 +434,11 @@ pub fn generate_named_bam_readers_from_reads( log_files.push(samtools_view_cache_log); } - let stoit_name = match include_reference_in_stoit_name { - true => { - std::path::Path::new(&index.index_path()) - .file_name() - .expect("Unable to convert reference to file name") - .to_str() - .expect("Unable to covert file name into str") - .to_string() - + "/" - } - false => "".to_string(), - } + std::path::Path::new(read1_path) - .file_name() - .expect("Unable to convert read1 name to file name") - .to_str() - .expect("Unable to covert file name into str"); + let stoit_name = name_stoit( + index.index_path(), + read1_path, + include_reference_in_stoit_name, + ); StreamingNamedBamReaderGenerator { stoit_name, @@ -797,18 +808,7 @@ pub fn generate_bam_maker_generator_from_reads( let log_files = vec![mapping_log, samtools2_log, samtools_view_cache_log]; return NamedBamMakerGenerator { - stoit_name: std::path::Path::new(index.index_path()) - .file_name() - .expect("Unable to convert reference to file name") - .to_str() - .expect("Unable to covert file name into str") - .to_string() - + "/" - + std::path::Path::new(read1_path) - .file_name() - .expect("Unable to convert read1 name to file name") - .to_str() - .expect("Unable to covert file name into str"), + stoit_name: name_stoit(index.index_path(), read1_path, true), pre_processes: vec![cmd], command_strings: vec![format!("bash -c \"{}\"", cmd_string)], log_file_descriptions: log_descriptions, diff --git a/src/bin/coverm.rs b/src/bin/coverm.rs index c84b1a3..e331b91 100644 --- a/src/bin/coverm.rs +++ b/src/bin/coverm.rs @@ -11,6 +11,7 @@ use coverm::mapping_index_maintenance::check_reference_existence; use coverm::mapping_parameters::*; use coverm::mosdepth_genome_coverage_estimators::*; use coverm::shard_bam_reader::*; +use coverm::strobealign_aemb::strobealign_aemb_coverage; use coverm::FlagFilter; use coverm::OutputWriter; use coverm::CONCATENATED_FASTA_FILE_SEPARATOR; @@ -505,7 +506,23 @@ fn main() { estimators_and_taker = estimators_and_taker.print_headers("Contig", print_stream.clone()); - if m.contains_id("bam-files") { + if let CoverageEstimator::StrobealignAembEstimator {} = + estimators_and_taker.estimators[0] + { + let mapping_params = + MappingParameters::generate_from_clap(m, MappingProgram::STROBEALIGN, &None); + debug!( + "Running strobealign-aemb coverage with mapping parameters: {:?}", + mapping_params + ); + strobealign_aemb_coverage( + mapping_params, + &mut estimators_and_taker.taker, + threads, + &mut estimators_and_taker.printer, + &mut print_stream, + ); + } else if m.contains_id("bam-files") { let bam_files: Vec<&str> = m .get_many::("bam-files") .unwrap() @@ -1029,6 +1046,13 @@ impl EstimatorsAndTaker { "reads_per_base" => { estimators.push(CoverageEstimator::new_estimator_reads_per_base()); } + "strobealign-aemb" => { + if methods.len() > 1 { + error!("Cannot (currently) specify the strobealign-aemb method with any other coverage methods"); + process::exit(1); + } + estimators.push(CoverageEstimator::new_estimator_strobealign_aemb()); + } _ => unreachable!(), }; } @@ -1086,6 +1110,7 @@ impl EstimatorsAndTaker { CoverageEstimator::ReadCountCalculator { .. } => die("counts"), CoverageEstimator::ReferenceLengthCalculator { .. } => die("length"), CoverageEstimator::ReadsPerBaseCalculator { .. } => die("reads_per_base"), + CoverageEstimator::StrobealignAembEstimator { .. } => die("strobealign-aemb"), _ => {} } } diff --git a/src/cli.rs b/src/cli.rs index 3f54881..dee9299 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -527,7 +527,9 @@ pub fn contig_full_help() -> Manual { &[&monospace_roff("trimmed_mean"), &format!("Average number of aligned reads overlapping each position after removing the most deeply and shallow-ly covered positions. See {}/{} to adjust.", &monospace_roff("--trim-min"), &monospace_roff("--trim-max"))], - + &[&monospace_roff("strobealign-aemb"), + &format!("Mean coverage as estimated by strobealign --aemb, which is faster than the {} method, giving similar but not identical values. See https://github.com/ksahlin/strobealign for details. Cannot currently be used with other methods.", + &monospace_roff("mean"))], &[&monospace_roff("coverage_histogram"), "Histogram of coverage depths"], &[&monospace_roff("covered_fraction"), "Proportion of bases covered by 1 or more reads"], &[&monospace_roff("covered_bases"), "Number of bases covered by 1 or more reads"], @@ -1747,6 +1749,7 @@ Ben J. Woodcroft "reads_per_base", "rpkm", "tpm", + "strobealign-aemb", ]) .default_value("mean") .action(clap::ArgAction::Append) diff --git a/src/contig.rs b/src/contig.rs index 2031366..470f7cb 100644 --- a/src/contig.rs +++ b/src/contig.rs @@ -273,7 +273,6 @@ mod tests { use mapping_index_maintenance::VanillaIndexStruct; use mapping_parameters::*; use shard_bam_reader::*; - use std::io::Read; use std::str; use OutputWriter; diff --git a/src/coverage_printer.rs b/src/coverage_printer.rs index 74f6611..9ad454f 100644 --- a/src/coverage_printer.rs +++ b/src/coverage_printer.rs @@ -545,7 +545,6 @@ pub fn print_dense_cached_coverage_taker( mod tests { use super::*; use std::io::Cursor; - use std::io::Read; use std::str; use OutputWriter; diff --git a/src/genome.rs b/src/genome.rs index b2d5291..dcd20e6 100644 --- a/src/genome.rs +++ b/src/genome.rs @@ -923,7 +923,6 @@ mod tests { use rust_htslib::bam::Read; use shard_bam_reader::*; use std::collections::HashSet; - use std::io::Read as _; use OutputWriter; fn test_streaming_with_stream>( diff --git a/src/lib.rs b/src/lib.rs index bb659a6..fec62a4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -36,6 +36,7 @@ extern crate clap_complete; extern crate lazy_static; extern crate bird_tool_utils; extern crate bird_tool_utils_man; +extern crate csv; extern crate galah; extern crate needletail; extern crate roff; diff --git a/src/mapping_parameters.rs b/src/mapping_parameters.rs index deb63c4..4c98ec4 100644 --- a/src/mapping_parameters.rs +++ b/src/mapping_parameters.rs @@ -5,13 +5,14 @@ use tempfile::NamedTempFile; use bam_generator::MappingProgram; use mapping_index_maintenance::check_reference_existence; -#[derive(Clone)] +#[derive(Clone, Debug)] pub enum ReadFormat { Coupled, Interleaved, Single, } +#[derive(Debug)] pub struct MappingParameters<'a> { references: Vec<&'a str>, threads: u16, @@ -266,6 +267,7 @@ impl<'a> Iterator for SingleReferenceMappingParameters<'a> { } } +#[derive(Debug)] pub struct OneSampleMappingParameters<'a> { pub reference: &'a str, pub read_format: ReadFormat, diff --git a/src/mosdepth_genome_coverage_estimators.rs b/src/mosdepth_genome_coverage_estimators.rs index ed47d5b..b52e864 100644 --- a/src/mosdepth_genome_coverage_estimators.rs +++ b/src/mosdepth_genome_coverage_estimators.rs @@ -73,6 +73,7 @@ pub enum CoverageEstimator { observed_contig_length: u64, num_mapped_reads: u64, }, + StrobealignAembEstimator {}, } impl CoverageEstimator { @@ -93,6 +94,7 @@ impl CoverageEstimator { CoverageEstimator::ReferenceLengthCalculator { .. } => vec!["Length"], CoverageEstimator::ReadCountCalculator { .. } => vec!["Read Count"], CoverageEstimator::ReadsPerBaseCalculator { .. } => vec!["Reads per base"], + CoverageEstimator::StrobealignAembEstimator { .. } => vec!["Strobealign aemb"], } } } @@ -206,6 +208,9 @@ impl CoverageEstimator { num_mapped_reads: 0, } } + pub fn new_estimator_strobealign_aemb() -> CoverageEstimator { + CoverageEstimator::StrobealignAembEstimator {} + } fn calculate_unobserved_bases( unobserved_contig_lengths: &[u64], @@ -330,6 +335,7 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator { } => { *num_mapped_reads = 0; } + CoverageEstimator::StrobealignAembEstimator { .. } => panic!("Programming error"), } } @@ -489,6 +495,7 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator { } => { *num_mapped_reads += num_mapped_reads_in_contig; } + CoverageEstimator::StrobealignAembEstimator { .. } => unreachable!(), } } @@ -798,6 +805,7 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator { / (*observed_contig_length + unobserved_contig_lengths.iter().sum::()) as f32 } + CoverageEstimator::StrobealignAembEstimator {} => unreachable!(), } } @@ -887,6 +895,9 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator { CoverageEstimator::ReadsPerBaseCalculator { .. } => { CoverageEstimator::new_estimator_reads_per_base() } + CoverageEstimator::StrobealignAembEstimator { .. } => { + CoverageEstimator::new_estimator_strobealign_aemb() + } } } @@ -901,7 +912,8 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator { | CoverageEstimator::VarianceGenomeCoverageEstimator { .. } | CoverageEstimator::ReferenceLengthCalculator { .. } | CoverageEstimator::ReadCountCalculator { .. } - | CoverageEstimator::ReadsPerBaseCalculator { .. } => { + | CoverageEstimator::ReadsPerBaseCalculator { .. } + | CoverageEstimator::StrobealignAembEstimator { .. } => { coverage_taker.add_single_coverage(coverage); } CoverageEstimator::PileupCountsGenomeCoverageEstimator { counts, .. } => { @@ -933,7 +945,8 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator { | CoverageEstimator::TPMCoverageEstimator { .. } | CoverageEstimator::VarianceGenomeCoverageEstimator { .. } | CoverageEstimator::ReadCountCalculator { .. } - | CoverageEstimator::ReadsPerBaseCalculator { .. } => { + | CoverageEstimator::ReadsPerBaseCalculator { .. } + | CoverageEstimator::StrobealignAembEstimator { .. } => { coverage_taker.add_single_coverage(0.0); } CoverageEstimator::PileupCountsGenomeCoverageEstimator { .. } => {} @@ -1006,6 +1019,9 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator { observed_contig_length: _, num_mapped_reads, } => *num_mapped_reads, + CoverageEstimator::StrobealignAembEstimator { .. } => { + panic!("Strobealign AEMB does not calculate number of mapped reads") + } } } } diff --git a/src/strobealign_aemb.rs b/src/strobealign_aemb.rs index 10e18f8..fa54f4c 100644 --- a/src/strobealign_aemb.rs +++ b/src/strobealign_aemb.rs @@ -1,21 +1,31 @@ -use crate::bam_generator::complete_processes; -use crate::coverage_takers::*; +use crate::bam_generator::{complete_processes, name_stoit}; use crate::mapping_parameters::MappingParameters; -use crate::mosdepth_genome_coverage_estimators::*; -use crate::ReadsMapped; +use crate::{coverage_printer, coverage_takers::*, OutputWriter}; +// use crate::mosdepth_genome_coverage_estimators::*; -pub fn strobealign_aemb_coverage( +use csv; + +struct MappingResultStruct { + tmpfile: tempfile::NamedTempFile, + stoit_name: String, +} + +pub fn strobealign_aemb_coverage( mapping_parameters: MappingParameters, - _coverage_taker: &mut T, - _coverage_estimators: &mut [CoverageEstimator], + coverage_taker: &mut CoverageTakerType, + // _coverage_estimators: &mut [CoverageEstimator], // print_zero_coverage_contigs: bool, threads: u16, -) -> Vec { + coverage_printer: &mut coverage_printer::CoveragePrinter, + print_stream: &mut OutputWriter, +) { let mut commands = vec![]; let mut log_files = vec![]; + let mut mapping_results: Vec = vec![]; for p1 in mapping_parameters { for p2 in p1 { + debug!("Processing mapping parameters: {:?}", p2); let mapping_result = tempfile::Builder::new() .prefix("coverm-strobealign-aemb-mapping-result") .tempfile() @@ -25,7 +35,7 @@ pub fn strobealign_aemb_coverage( .tempfile() .unwrap_or_else(|_| panic!("Failed to create strobealign-aemb log tempfile")); let cmd = format!( - "strobealign --aemb -t {} {} {} {} > {} 2> {}", + "strobealign --aemb -t {} '{}' '{}' '{}' > '{}' 2> '{}'", threads, p2.reference, p2.read1, @@ -41,6 +51,10 @@ pub fn strobealign_aemb_coverage( ); log_files.push(mapping_log); commands.push(cmd); + mapping_results.push(MappingResultStruct { + tmpfile: mapping_result, + stoit_name: name_stoit(p2.reference, p2.read1, true), + }); } } @@ -63,5 +77,32 @@ pub fn strobealign_aemb_coverage( } // Read the mapping results and pass them to the coverage taker - panic!(); + for mapping_result_tempfile in mapping_results { + debug!("Starting stoid {}", mapping_result_tempfile.stoit_name); + coverage_taker.start_stoit(&mapping_result_tempfile.stoit_name); + // Read the TSV format + let file = std::fs::File::open(mapping_result_tempfile.tmpfile.path()) + .expect("Failed to open strobealign-aemb mapping result"); + let mut rdr = csv::ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(false) + .from_reader(file); + for (i, result) in rdr.records().enumerate() { + let record = result.expect("Failed to read strobealign-aemb mapping result"); + if record.len() != 2 { + panic!( + "Unexpected number of columns in strobealign-aemb mapping result line {}: {:?}", + i, record + ); + } + coverage_taker.start_entry(i, &record[0]); + coverage_taker.add_single_coverage( + record[1] + .parse() + .expect("Failed to parse strobealign-aemb coverage"), + ); + coverage_taker.finish_entry(); + } + } + coverage_printer.finalise_printing(coverage_taker, print_stream, None, &vec![], None, None); } diff --git a/tests/test_cmdline.rs b/tests/test_cmdline.rs index 69a5432..27352cf 100644 --- a/tests/test_cmdline.rs +++ b/tests/test_cmdline.rs @@ -3111,14 +3111,40 @@ genome6~random_sequence_length_11003 0 0 0 .succeeds() .stdout() .is( - format!( - "Genome\t7seqs.fna/reads_for_seq1_and_seq2.1.fq.gz Mean\t7seqs.fna/reads_for_seq1_and_seq2.1.fq.gz Covered Fraction\n\ - genome1\t0.040328056\t0.026624106\n" - ) + "Genome\t7seqs.fna/reads_for_seq1_and_seq2.1.fq.gz Mean\t7seqs.fna/reads_for_seq1_and_seq2.1.fq.gz Covered Fraction\n\ + genome1\t0.040328056\t0.026624106\n".to_string() .as_str(), ) .unwrap(); } + + #[test] + fn test_strobelalign_aemb_contig() { + Assert::main_binary() + .with_args(&[ + "contig", + "--coupled", + "tests/data/reads_for_seq1_and_seq2.1.fq.gz", + "tests/data/reads_for_seq1_and_seq2.2.fq.gz", + "--reference", + "tests/data/7seqs.fna", + "-m", + "strobealign-aemb", + ]) + .succeeds() + .stdout() + .is( + "Contig\t7seqs.fna/reads_for_seq1_and_seq2.1.fq.gz Strobealign aemb\n\ + genome1~random_sequence_length_11000\t0\n\ + genome1~random_sequence_length_11010\t0\n\ + genome2~seq1\t1.2\n\ + genome3~random_sequence_length_11001\t0\n\ + genome4~random_sequence_length_11002\t0\n\ + genome5~seq2\t1.2\n\ + genome6~random_sequence_length_11003\t0\n", + ) + .unwrap(); + } } // TODO: Add mismatching bases test From 00009649a6e3e3707c3895c98789d947b846523e Mon Sep 17 00:00:00 2001 From: Ben Woodcroft Date: Wed, 2 Oct 2024 19:33:37 +1000 Subject: [PATCH 3/4] test: Fix test name so osx skips. --- tests/test_cmdline.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cmdline.rs b/tests/test_cmdline.rs index 27352cf..3f61288 100644 --- a/tests/test_cmdline.rs +++ b/tests/test_cmdline.rs @@ -3119,7 +3119,7 @@ genome6~random_sequence_length_11003 0 0 0 } #[test] - fn test_strobelalign_aemb_contig() { + fn test_strobealign_aemb_contig() { Assert::main_binary() .with_args(&[ "contig", From 31e28b4eac3d9dba7fc7f546fe00518da60ff36b Mon Sep 17 00:00:00 2001 From: Ben Woodcroft Date: Wed, 23 Oct 2024 09:23:06 +1000 Subject: [PATCH 4/4] test: Fix syntax. --- tests/test_cmdline.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_cmdline.rs b/tests/test_cmdline.rs index e37587b..d0ab8a4 100644 --- a/tests/test_cmdline.rs +++ b/tests/test_cmdline.rs @@ -3143,6 +3143,8 @@ genome6~random_sequence_length_11003 0 0 0 genome5~seq2\t1.2\n\ genome6~random_sequence_length_11003\t0\n", ) + .unwrap(); + } #[test] fn test_cmdline_mapq_filtering_all_out() {