diff --git a/Cargo.toml b/Cargo.toml index 3af5fc6..e4eaa7c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,7 @@ galah = "0.4.0" bird_tool_utils-man = "0.4.0" roff = "0.2.*" needletail = "0.5.*" +csv = "1.*" [dev-dependencies] assert_cli = "0.6.*" diff --git a/src/bam_generator.rs b/src/bam_generator.rs index 5ded5a0..4a9285e 100644 --- a/src/bam_generator.rs +++ b/src/bam_generator.rs @@ -160,6 +160,28 @@ impl NamedBamReaderGenerator for StreamingNamedBamReade } } +pub fn name_stoit( + index_path: &str, + read1_path: &str, + include_reference_in_stoit_name: bool, +) -> String { + (match include_reference_in_stoit_name { + true => (std::path::Path::new(&index_path) + .file_name() + .expect("Unable to convert reference to file name") + .to_str() + .expect("Unable to covert file name into str") + .to_string() + + "/") + .to_string(), + false => "".to_string(), + }) + std::path::Path::new(read1_path) + .file_name() + .expect("Unable to convert read1 name to file name") + .to_str() + .expect("Unable to covert file name into str") +} + pub fn complete_processes( processes: Vec, command_strings: Vec, @@ -412,22 +434,11 @@ pub fn generate_named_bam_readers_from_reads( log_files.push(samtools_view_cache_log); } - let stoit_name = match include_reference_in_stoit_name { - true => { - std::path::Path::new(&index.index_path()) - .file_name() - .expect("Unable to convert reference to file name") - .to_str() - .expect("Unable to covert file name into str") - .to_string() - + "/" - } - false => "".to_string(), - } + std::path::Path::new(read1_path) - .file_name() - .expect("Unable to convert read1 name to file name") - .to_str() - .expect("Unable to covert file name into str"); + let stoit_name = name_stoit( + index.index_path(), + read1_path, + include_reference_in_stoit_name, + ); StreamingNamedBamReaderGenerator { stoit_name, @@ -803,18 +814,7 @@ pub fn generate_bam_maker_generator_from_reads( let log_files = vec![mapping_log, samtools2_log, samtools_view_cache_log]; return NamedBamMakerGenerator { - stoit_name: std::path::Path::new(index.index_path()) - .file_name() - .expect("Unable to convert reference to file name") - .to_str() - .expect("Unable to covert file name into str") - .to_string() - + "/" - + std::path::Path::new(read1_path) - .file_name() - .expect("Unable to convert read1 name to file name") - .to_str() - .expect("Unable to covert file name into str"), + stoit_name: name_stoit(index.index_path(), read1_path, true), pre_processes: vec![cmd], command_strings: vec![format!("bash -c \"{}\"", cmd_string)], log_file_descriptions: log_descriptions, diff --git a/src/bin/coverm.rs b/src/bin/coverm.rs index 7761d49..e1ab008 100644 --- a/src/bin/coverm.rs +++ b/src/bin/coverm.rs @@ -11,6 +11,7 @@ use coverm::mapping_index_maintenance::check_reference_existence; use coverm::mapping_parameters::*; use coverm::mosdepth_genome_coverage_estimators::*; use coverm::shard_bam_reader::*; +use coverm::strobealign_aemb::strobealign_aemb_coverage; use coverm::FlagFilter; use coverm::OutputWriter; use coverm::CONCATENATED_FASTA_FILE_SEPARATOR; @@ -507,7 +508,23 @@ fn main() { estimators_and_taker = estimators_and_taker.print_headers("Contig", print_stream.clone()); - if m.contains_id("bam-files") { + if let CoverageEstimator::StrobealignAembEstimator {} = + estimators_and_taker.estimators[0] + { + let mapping_params = + MappingParameters::generate_from_clap(m, MappingProgram::STROBEALIGN, &None); + debug!( + "Running strobealign-aemb coverage with mapping parameters: {:?}", + mapping_params + ); + strobealign_aemb_coverage( + mapping_params, + &mut estimators_and_taker.taker, + threads, + &mut estimators_and_taker.printer, + &mut print_stream, + ); + } else if m.contains_id("bam-files") { let bam_files: Vec<&str> = m .get_many::("bam-files") .unwrap() @@ -1032,6 +1049,13 @@ impl EstimatorsAndTaker { "reads_per_base" => { estimators.push(CoverageEstimator::new_estimator_reads_per_base()); } + "strobealign-aemb" => { + if methods.len() > 1 { + error!("Cannot (currently) specify the strobealign-aemb method with any other coverage methods"); + process::exit(1); + } + estimators.push(CoverageEstimator::new_estimator_strobealign_aemb()); + } _ => unreachable!(), }; } @@ -1089,6 +1113,7 @@ impl EstimatorsAndTaker { CoverageEstimator::ReadCountCalculator { .. } => die("counts"), CoverageEstimator::ReferenceLengthCalculator { .. } => die("length"), CoverageEstimator::ReadsPerBaseCalculator { .. } => die("reads_per_base"), + CoverageEstimator::StrobealignAembEstimator { .. } => die("strobealign-aemb"), _ => {} } } diff --git a/src/cli.rs b/src/cli.rs index 7d10b14..a31aa21 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -537,7 +537,9 @@ pub fn contig_full_help() -> Manual { &[&monospace_roff("trimmed_mean"), &format!("Average number of aligned reads overlapping each position after removing the most deeply and shallow-ly covered positions. See {}/{} to adjust.", &monospace_roff("--trim-min"), &monospace_roff("--trim-max"))], - + &[&monospace_roff("strobealign-aemb"), + &format!("Mean coverage as estimated by strobealign --aemb, which is faster than the {} method, giving similar but not identical values. See https://github.com/ksahlin/strobealign for details. Cannot currently be used with other methods.", + &monospace_roff("mean"))], &[&monospace_roff("coverage_histogram"), "Histogram of coverage depths"], &[&monospace_roff("covered_fraction"), "Proportion of bases covered by 1 or more reads"], &[&monospace_roff("covered_bases"), "Number of bases covered by 1 or more reads"], @@ -1767,6 +1769,7 @@ Ben J. Woodcroft "reads_per_base", "rpkm", "tpm", + "strobealign-aemb", ]) .default_value("mean") .action(clap::ArgAction::Append) diff --git a/src/lib.rs b/src/lib.rs index 63ae513..d274474 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,6 +13,7 @@ pub mod mapping_index_maintenance; pub mod mapping_parameters; pub mod mosdepth_genome_coverage_estimators; pub mod shard_bam_reader; +pub mod strobealign_aemb; use rust_htslib::bam::record::Record; use std::sync::Arc; @@ -35,6 +36,7 @@ extern crate clap_complete; extern crate lazy_static; extern crate bird_tool_utils; extern crate bird_tool_utils_man; +extern crate csv; extern crate galah; extern crate needletail; extern crate roff; diff --git a/src/mapping_parameters.rs b/src/mapping_parameters.rs index deb63c4..4c98ec4 100644 --- a/src/mapping_parameters.rs +++ b/src/mapping_parameters.rs @@ -5,13 +5,14 @@ use tempfile::NamedTempFile; use bam_generator::MappingProgram; use mapping_index_maintenance::check_reference_existence; -#[derive(Clone)] +#[derive(Clone, Debug)] pub enum ReadFormat { Coupled, Interleaved, Single, } +#[derive(Debug)] pub struct MappingParameters<'a> { references: Vec<&'a str>, threads: u16, @@ -266,6 +267,7 @@ impl<'a> Iterator for SingleReferenceMappingParameters<'a> { } } +#[derive(Debug)] pub struct OneSampleMappingParameters<'a> { pub reference: &'a str, pub read_format: ReadFormat, diff --git a/src/mosdepth_genome_coverage_estimators.rs b/src/mosdepth_genome_coverage_estimators.rs index ed47d5b..b52e864 100644 --- a/src/mosdepth_genome_coverage_estimators.rs +++ b/src/mosdepth_genome_coverage_estimators.rs @@ -73,6 +73,7 @@ pub enum CoverageEstimator { observed_contig_length: u64, num_mapped_reads: u64, }, + StrobealignAembEstimator {}, } impl CoverageEstimator { @@ -93,6 +94,7 @@ impl CoverageEstimator { CoverageEstimator::ReferenceLengthCalculator { .. } => vec!["Length"], CoverageEstimator::ReadCountCalculator { .. } => vec!["Read Count"], CoverageEstimator::ReadsPerBaseCalculator { .. } => vec!["Reads per base"], + CoverageEstimator::StrobealignAembEstimator { .. } => vec!["Strobealign aemb"], } } } @@ -206,6 +208,9 @@ impl CoverageEstimator { num_mapped_reads: 0, } } + pub fn new_estimator_strobealign_aemb() -> CoverageEstimator { + CoverageEstimator::StrobealignAembEstimator {} + } fn calculate_unobserved_bases( unobserved_contig_lengths: &[u64], @@ -330,6 +335,7 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator { } => { *num_mapped_reads = 0; } + CoverageEstimator::StrobealignAembEstimator { .. } => panic!("Programming error"), } } @@ -489,6 +495,7 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator { } => { *num_mapped_reads += num_mapped_reads_in_contig; } + CoverageEstimator::StrobealignAembEstimator { .. } => unreachable!(), } } @@ -798,6 +805,7 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator { / (*observed_contig_length + unobserved_contig_lengths.iter().sum::()) as f32 } + CoverageEstimator::StrobealignAembEstimator {} => unreachable!(), } } @@ -887,6 +895,9 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator { CoverageEstimator::ReadsPerBaseCalculator { .. } => { CoverageEstimator::new_estimator_reads_per_base() } + CoverageEstimator::StrobealignAembEstimator { .. } => { + CoverageEstimator::new_estimator_strobealign_aemb() + } } } @@ -901,7 +912,8 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator { | CoverageEstimator::VarianceGenomeCoverageEstimator { .. } | CoverageEstimator::ReferenceLengthCalculator { .. } | CoverageEstimator::ReadCountCalculator { .. } - | CoverageEstimator::ReadsPerBaseCalculator { .. } => { + | CoverageEstimator::ReadsPerBaseCalculator { .. } + | CoverageEstimator::StrobealignAembEstimator { .. } => { coverage_taker.add_single_coverage(coverage); } CoverageEstimator::PileupCountsGenomeCoverageEstimator { counts, .. } => { @@ -933,7 +945,8 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator { | CoverageEstimator::TPMCoverageEstimator { .. } | CoverageEstimator::VarianceGenomeCoverageEstimator { .. } | CoverageEstimator::ReadCountCalculator { .. } - | CoverageEstimator::ReadsPerBaseCalculator { .. } => { + | CoverageEstimator::ReadsPerBaseCalculator { .. } + | CoverageEstimator::StrobealignAembEstimator { .. } => { coverage_taker.add_single_coverage(0.0); } CoverageEstimator::PileupCountsGenomeCoverageEstimator { .. } => {} @@ -1006,6 +1019,9 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator { observed_contig_length: _, num_mapped_reads, } => *num_mapped_reads, + CoverageEstimator::StrobealignAembEstimator { .. } => { + panic!("Strobealign AEMB does not calculate number of mapped reads") + } } } } diff --git a/src/strobealign_aemb.rs b/src/strobealign_aemb.rs new file mode 100644 index 0000000..fa54f4c --- /dev/null +++ b/src/strobealign_aemb.rs @@ -0,0 +1,108 @@ +use crate::bam_generator::{complete_processes, name_stoit}; +use crate::mapping_parameters::MappingParameters; +use crate::{coverage_printer, coverage_takers::*, OutputWriter}; +// use crate::mosdepth_genome_coverage_estimators::*; + +use csv; + +struct MappingResultStruct { + tmpfile: tempfile::NamedTempFile, + stoit_name: String, +} + +pub fn strobealign_aemb_coverage( + mapping_parameters: MappingParameters, + coverage_taker: &mut CoverageTakerType, + // _coverage_estimators: &mut [CoverageEstimator], + // print_zero_coverage_contigs: bool, + threads: u16, + coverage_printer: &mut coverage_printer::CoveragePrinter, + print_stream: &mut OutputWriter, +) { + let mut commands = vec![]; + let mut log_files = vec![]; + + let mut mapping_results: Vec = vec![]; + for p1 in mapping_parameters { + for p2 in p1 { + debug!("Processing mapping parameters: {:?}", p2); + let mapping_result = tempfile::Builder::new() + .prefix("coverm-strobealign-aemb-mapping-result") + .tempfile() + .unwrap_or_else(|_| panic!("Failed to create strobealign-aemb result tempfile")); + let mapping_log = tempfile::Builder::new() + .prefix("coverm-strobealign-aemb-mapping-log") + .tempfile() + .unwrap_or_else(|_| panic!("Failed to create strobealign-aemb log tempfile")); + let cmd = format!( + "strobealign --aemb -t {} '{}' '{}' '{}' > '{}' 2> '{}'", + threads, + p2.reference, + p2.read1, + p2.read2.unwrap_or(""), + mapping_result + .path() + .to_str() + .expect("Failed to get strobealign-aemb result path"), + mapping_log + .path() + .to_str() + .expect("Failed to get strobealign-aemb log path"), + ); + log_files.push(mapping_log); + commands.push(cmd); + mapping_results.push(MappingResultStruct { + tmpfile: mapping_result, + stoit_name: name_stoit(p2.reference, p2.read1, true), + }); + } + } + + // Run each command in serial, and complete each process + // before starting the next one. + for cmd_string in commands { + debug!("Queuing cmd_string: {}", cmd_string); + let mut cmd = std::process::Command::new("bash"); + cmd.arg("-c") + .arg(&cmd_string) + .stderr(std::process::Stdio::piped()); + let process = cmd.spawn().expect("Unable to execute bash"); + complete_processes( + vec![process], + vec![cmd_string], + vec!["strobealign-aemb log file".to_string()], + vec![log_files.remove(0)], + None, + ); + } + + // Read the mapping results and pass them to the coverage taker + for mapping_result_tempfile in mapping_results { + debug!("Starting stoid {}", mapping_result_tempfile.stoit_name); + coverage_taker.start_stoit(&mapping_result_tempfile.stoit_name); + // Read the TSV format + let file = std::fs::File::open(mapping_result_tempfile.tmpfile.path()) + .expect("Failed to open strobealign-aemb mapping result"); + let mut rdr = csv::ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(false) + .from_reader(file); + for (i, result) in rdr.records().enumerate() { + let record = result.expect("Failed to read strobealign-aemb mapping result"); + if record.len() != 2 { + panic!( + "Unexpected number of columns in strobealign-aemb mapping result line {}: {:?}", + i, record + ); + } + coverage_taker.start_entry(i, &record[0]); + coverage_taker.add_single_coverage( + record[1] + .parse() + .expect("Failed to parse strobealign-aemb coverage"), + ); + coverage_taker.finish_entry(); + } + } + coverage_printer.finalise_printing(coverage_taker, print_stream, None, &vec![], None, None); +} diff --git a/tests/test_cmdline.rs b/tests/test_cmdline.rs index c0134d5..d0ab8a4 100644 --- a/tests/test_cmdline.rs +++ b/tests/test_cmdline.rs @@ -3111,15 +3111,41 @@ genome6~random_sequence_length_11003 0 0 0 .succeeds() .stdout() .is( - format!( - "Genome\t7seqs.fna/reads_for_seq1_and_seq2.1.fq.gz Mean\t7seqs.fna/reads_for_seq1_and_seq2.1.fq.gz Covered Fraction\n\ - genome1\t0.040328056\t0.026624106\n" - ) + "Genome\t7seqs.fna/reads_for_seq1_and_seq2.1.fq.gz Mean\t7seqs.fna/reads_for_seq1_and_seq2.1.fq.gz Covered Fraction\n\ + genome1\t0.040328056\t0.026624106\n".to_string() .as_str(), ) .unwrap(); } + #[test] + fn test_strobealign_aemb_contig() { + Assert::main_binary() + .with_args(&[ + "contig", + "--coupled", + "tests/data/reads_for_seq1_and_seq2.1.fq.gz", + "tests/data/reads_for_seq1_and_seq2.2.fq.gz", + "--reference", + "tests/data/7seqs.fna", + "-m", + "strobealign-aemb", + ]) + .succeeds() + .stdout() + .is( + "Contig\t7seqs.fna/reads_for_seq1_and_seq2.1.fq.gz Strobealign aemb\n\ + genome1~random_sequence_length_11000\t0\n\ + genome1~random_sequence_length_11010\t0\n\ + genome2~seq1\t1.2\n\ + genome3~random_sequence_length_11001\t0\n\ + genome4~random_sequence_length_11002\t0\n\ + genome5~seq2\t1.2\n\ + genome6~random_sequence_length_11003\t0\n", + ) + .unwrap(); + } + #[test] fn test_cmdline_mapq_filtering_all_out() { Assert::main_binary()