Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Strobealign aemb #225

Merged
merged 5 commits into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ galah = "0.4.0"
bird_tool_utils-man = "0.4.0"
roff = "0.2.*"
needletail = "0.5.*"
csv = "1.*"

[dev-dependencies]
assert_cli = "0.6.*"
Expand Down
56 changes: 28 additions & 28 deletions src/bam_generator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,28 @@ impl NamedBamReaderGenerator<StreamingNamedBamReader> for StreamingNamedBamReade
}
}

pub fn name_stoit(
index_path: &str,
read1_path: &str,
include_reference_in_stoit_name: bool,
) -> String {
(match include_reference_in_stoit_name {
true => (std::path::Path::new(&index_path)
.file_name()
.expect("Unable to convert reference to file name")
.to_str()
.expect("Unable to covert file name into str")
.to_string()
+ "/")
.to_string(),
false => "".to_string(),
}) + std::path::Path::new(read1_path)
.file_name()
.expect("Unable to convert read1 name to file name")
.to_str()
.expect("Unable to covert file name into str")
}

pub fn complete_processes(
processes: Vec<std::process::Child>,
command_strings: Vec<String>,
Expand Down Expand Up @@ -412,22 +434,11 @@ pub fn generate_named_bam_readers_from_reads(
log_files.push(samtools_view_cache_log);
}

let stoit_name = match include_reference_in_stoit_name {
true => {
std::path::Path::new(&index.index_path())
.file_name()
.expect("Unable to convert reference to file name")
.to_str()
.expect("Unable to covert file name into str")
.to_string()
+ "/"
}
false => "".to_string(),
} + std::path::Path::new(read1_path)
.file_name()
.expect("Unable to convert read1 name to file name")
.to_str()
.expect("Unable to covert file name into str");
let stoit_name = name_stoit(
index.index_path(),
read1_path,
include_reference_in_stoit_name,
);

StreamingNamedBamReaderGenerator {
stoit_name,
Expand Down Expand Up @@ -803,18 +814,7 @@ pub fn generate_bam_maker_generator_from_reads(
let log_files = vec![mapping_log, samtools2_log, samtools_view_cache_log];

return NamedBamMakerGenerator {
stoit_name: std::path::Path::new(index.index_path())
.file_name()
.expect("Unable to convert reference to file name")
.to_str()
.expect("Unable to covert file name into str")
.to_string()
+ "/"
+ std::path::Path::new(read1_path)
.file_name()
.expect("Unable to convert read1 name to file name")
.to_str()
.expect("Unable to covert file name into str"),
stoit_name: name_stoit(index.index_path(), read1_path, true),
pre_processes: vec![cmd],
command_strings: vec![format!("bash -c \"{}\"", cmd_string)],
log_file_descriptions: log_descriptions,
Expand Down
27 changes: 26 additions & 1 deletion src/bin/coverm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use coverm::mapping_index_maintenance::check_reference_existence;
use coverm::mapping_parameters::*;
use coverm::mosdepth_genome_coverage_estimators::*;
use coverm::shard_bam_reader::*;
use coverm::strobealign_aemb::strobealign_aemb_coverage;
use coverm::FlagFilter;
use coverm::OutputWriter;
use coverm::CONCATENATED_FASTA_FILE_SEPARATOR;
Expand Down Expand Up @@ -507,7 +508,23 @@ fn main() {
estimators_and_taker =
estimators_and_taker.print_headers("Contig", print_stream.clone());

if m.contains_id("bam-files") {
if let CoverageEstimator::StrobealignAembEstimator {} =
estimators_and_taker.estimators[0]
{
let mapping_params =
MappingParameters::generate_from_clap(m, MappingProgram::STROBEALIGN, &None);
debug!(
"Running strobealign-aemb coverage with mapping parameters: {:?}",
mapping_params
);
strobealign_aemb_coverage(
mapping_params,
&mut estimators_and_taker.taker,
threads,
&mut estimators_and_taker.printer,
&mut print_stream,
);
} else if m.contains_id("bam-files") {
let bam_files: Vec<&str> = m
.get_many::<String>("bam-files")
.unwrap()
Expand Down Expand Up @@ -1032,6 +1049,13 @@ impl EstimatorsAndTaker {
"reads_per_base" => {
estimators.push(CoverageEstimator::new_estimator_reads_per_base());
}
"strobealign-aemb" => {
if methods.len() > 1 {
error!("Cannot (currently) specify the strobealign-aemb method with any other coverage methods");
process::exit(1);
}
estimators.push(CoverageEstimator::new_estimator_strobealign_aemb());
}
_ => unreachable!(),
};
}
Expand Down Expand Up @@ -1089,6 +1113,7 @@ impl EstimatorsAndTaker {
CoverageEstimator::ReadCountCalculator { .. } => die("counts"),
CoverageEstimator::ReferenceLengthCalculator { .. } => die("length"),
CoverageEstimator::ReadsPerBaseCalculator { .. } => die("reads_per_base"),
CoverageEstimator::StrobealignAembEstimator { .. } => die("strobealign-aemb"),
_ => {}
}
}
Expand Down
5 changes: 4 additions & 1 deletion src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,9 @@ pub fn contig_full_help() -> Manual {
&[&monospace_roff("trimmed_mean"), &format!("Average number of aligned reads overlapping each position after removing the most deeply and shallow-ly covered positions. See {}/{} to adjust.",
&monospace_roff("--trim-min"),
&monospace_roff("--trim-max"))],

&[&monospace_roff("strobealign-aemb"),
&format!("Mean coverage as estimated by strobealign --aemb, which is faster than the {} method, giving similar but not identical values. See https://github.com/ksahlin/strobealign for details. Cannot currently be used with other methods.",
&monospace_roff("mean"))],
&[&monospace_roff("coverage_histogram"), "Histogram of coverage depths"],
&[&monospace_roff("covered_fraction"), "Proportion of bases covered by 1 or more reads"],
&[&monospace_roff("covered_bases"), "Number of bases covered by 1 or more reads"],
Expand Down Expand Up @@ -1767,6 +1769,7 @@ Ben J. Woodcroft <benjwoodcroft near gmail.com>
"reads_per_base",
"rpkm",
"tpm",
"strobealign-aemb",
])
.default_value("mean")
.action(clap::ArgAction::Append)
Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ pub mod mapping_index_maintenance;
pub mod mapping_parameters;
pub mod mosdepth_genome_coverage_estimators;
pub mod shard_bam_reader;
pub mod strobealign_aemb;

use rust_htslib::bam::record::Record;
use std::sync::Arc;
Expand All @@ -35,6 +36,7 @@ extern crate clap_complete;
extern crate lazy_static;
extern crate bird_tool_utils;
extern crate bird_tool_utils_man;
extern crate csv;
extern crate galah;
extern crate needletail;
extern crate roff;
Expand Down
4 changes: 3 additions & 1 deletion src/mapping_parameters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@ use tempfile::NamedTempFile;
use bam_generator::MappingProgram;
use mapping_index_maintenance::check_reference_existence;

#[derive(Clone)]
#[derive(Clone, Debug)]
pub enum ReadFormat {
Coupled,
Interleaved,
Single,
}

#[derive(Debug)]
pub struct MappingParameters<'a> {
references: Vec<&'a str>,
threads: u16,
Expand Down Expand Up @@ -266,6 +267,7 @@ impl<'a> Iterator for SingleReferenceMappingParameters<'a> {
}
}

#[derive(Debug)]
pub struct OneSampleMappingParameters<'a> {
pub reference: &'a str,
pub read_format: ReadFormat,
Expand Down
20 changes: 18 additions & 2 deletions src/mosdepth_genome_coverage_estimators.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ pub enum CoverageEstimator {
observed_contig_length: u64,
num_mapped_reads: u64,
},
StrobealignAembEstimator {},
}

impl CoverageEstimator {
Expand All @@ -93,6 +94,7 @@ impl CoverageEstimator {
CoverageEstimator::ReferenceLengthCalculator { .. } => vec!["Length"],
CoverageEstimator::ReadCountCalculator { .. } => vec!["Read Count"],
CoverageEstimator::ReadsPerBaseCalculator { .. } => vec!["Reads per base"],
CoverageEstimator::StrobealignAembEstimator { .. } => vec!["Strobealign aemb"],
}
}
}
Expand Down Expand Up @@ -206,6 +208,9 @@ impl CoverageEstimator {
num_mapped_reads: 0,
}
}
pub fn new_estimator_strobealign_aemb() -> CoverageEstimator {
CoverageEstimator::StrobealignAembEstimator {}
}

fn calculate_unobserved_bases(
unobserved_contig_lengths: &[u64],
Expand Down Expand Up @@ -330,6 +335,7 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator {
} => {
*num_mapped_reads = 0;
}
CoverageEstimator::StrobealignAembEstimator { .. } => panic!("Programming error"),
}
}

Expand Down Expand Up @@ -489,6 +495,7 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator {
} => {
*num_mapped_reads += num_mapped_reads_in_contig;
}
CoverageEstimator::StrobealignAembEstimator { .. } => unreachable!(),
}
}

Expand Down Expand Up @@ -798,6 +805,7 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator {
/ (*observed_contig_length + unobserved_contig_lengths.iter().sum::<u64>())
as f32
}
CoverageEstimator::StrobealignAembEstimator {} => unreachable!(),
}
}

Expand Down Expand Up @@ -887,6 +895,9 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator {
CoverageEstimator::ReadsPerBaseCalculator { .. } => {
CoverageEstimator::new_estimator_reads_per_base()
}
CoverageEstimator::StrobealignAembEstimator { .. } => {
CoverageEstimator::new_estimator_strobealign_aemb()
}
}
}

Expand All @@ -901,7 +912,8 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator {
| CoverageEstimator::VarianceGenomeCoverageEstimator { .. }
| CoverageEstimator::ReferenceLengthCalculator { .. }
| CoverageEstimator::ReadCountCalculator { .. }
| CoverageEstimator::ReadsPerBaseCalculator { .. } => {
| CoverageEstimator::ReadsPerBaseCalculator { .. }
| CoverageEstimator::StrobealignAembEstimator { .. } => {
coverage_taker.add_single_coverage(coverage);
}
CoverageEstimator::PileupCountsGenomeCoverageEstimator { counts, .. } => {
Expand Down Expand Up @@ -933,7 +945,8 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator {
| CoverageEstimator::TPMCoverageEstimator { .. }
| CoverageEstimator::VarianceGenomeCoverageEstimator { .. }
| CoverageEstimator::ReadCountCalculator { .. }
| CoverageEstimator::ReadsPerBaseCalculator { .. } => {
| CoverageEstimator::ReadsPerBaseCalculator { .. }
| CoverageEstimator::StrobealignAembEstimator { .. } => {
coverage_taker.add_single_coverage(0.0);
}
CoverageEstimator::PileupCountsGenomeCoverageEstimator { .. } => {}
Expand Down Expand Up @@ -1006,6 +1019,9 @@ impl MosdepthGenomeCoverageEstimator for CoverageEstimator {
observed_contig_length: _,
num_mapped_reads,
} => *num_mapped_reads,
CoverageEstimator::StrobealignAembEstimator { .. } => {
panic!("Strobealign AEMB does not calculate number of mapped reads")
}
}
}
}
108 changes: 108 additions & 0 deletions src/strobealign_aemb.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
use crate::bam_generator::{complete_processes, name_stoit};
use crate::mapping_parameters::MappingParameters;
use crate::{coverage_printer, coverage_takers::*, OutputWriter};
// use crate::mosdepth_genome_coverage_estimators::*;

use csv;

struct MappingResultStruct {
tmpfile: tempfile::NamedTempFile,
stoit_name: String,
}

pub fn strobealign_aemb_coverage(
mapping_parameters: MappingParameters,
coverage_taker: &mut CoverageTakerType,
// _coverage_estimators: &mut [CoverageEstimator],
// print_zero_coverage_contigs: bool,
threads: u16,
coverage_printer: &mut coverage_printer::CoveragePrinter,
print_stream: &mut OutputWriter,
) {
let mut commands = vec![];
let mut log_files = vec![];

let mut mapping_results: Vec<MappingResultStruct> = vec![];
for p1 in mapping_parameters {
for p2 in p1 {
debug!("Processing mapping parameters: {:?}", p2);
let mapping_result = tempfile::Builder::new()
.prefix("coverm-strobealign-aemb-mapping-result")
.tempfile()
.unwrap_or_else(|_| panic!("Failed to create strobealign-aemb result tempfile"));
let mapping_log = tempfile::Builder::new()
.prefix("coverm-strobealign-aemb-mapping-log")
.tempfile()
.unwrap_or_else(|_| panic!("Failed to create strobealign-aemb log tempfile"));
let cmd = format!(
"strobealign --aemb -t {} '{}' '{}' '{}' > '{}' 2> '{}'",
threads,
p2.reference,
p2.read1,
p2.read2.unwrap_or(""),
mapping_result
.path()
.to_str()
.expect("Failed to get strobealign-aemb result path"),
mapping_log
.path()
.to_str()
.expect("Failed to get strobealign-aemb log path"),
);
log_files.push(mapping_log);
commands.push(cmd);
mapping_results.push(MappingResultStruct {
tmpfile: mapping_result,
stoit_name: name_stoit(p2.reference, p2.read1, true),
});
}
}

// Run each command in serial, and complete each process
// before starting the next one.
for cmd_string in commands {
debug!("Queuing cmd_string: {}", cmd_string);
let mut cmd = std::process::Command::new("bash");
cmd.arg("-c")
.arg(&cmd_string)
.stderr(std::process::Stdio::piped());
let process = cmd.spawn().expect("Unable to execute bash");
complete_processes(
vec![process],
vec![cmd_string],
vec!["strobealign-aemb log file".to_string()],
vec![log_files.remove(0)],
None,
);
}

// Read the mapping results and pass them to the coverage taker
for mapping_result_tempfile in mapping_results {
debug!("Starting stoid {}", mapping_result_tempfile.stoit_name);
coverage_taker.start_stoit(&mapping_result_tempfile.stoit_name);
// Read the TSV format
let file = std::fs::File::open(mapping_result_tempfile.tmpfile.path())
.expect("Failed to open strobealign-aemb mapping result");
let mut rdr = csv::ReaderBuilder::new()
.delimiter(b'\t')
.has_headers(false)
.from_reader(file);
for (i, result) in rdr.records().enumerate() {
let record = result.expect("Failed to read strobealign-aemb mapping result");
if record.len() != 2 {
panic!(
"Unexpected number of columns in strobealign-aemb mapping result line {}: {:?}",
i, record
);
}
coverage_taker.start_entry(i, &record[0]);
coverage_taker.add_single_coverage(
record[1]
.parse()
.expect("Failed to parse strobealign-aemb coverage"),
);
coverage_taker.finish_entry();
}
}
coverage_printer.finalise_printing(coverage_taker, print_stream, None, &vec![], None, None);
}
Loading