From ad60b01f23ef70986988a059ff3d576696e9784f Mon Sep 17 00:00:00 2001 From: Ryan Wick Date: Mon, 20 Jan 2025 12:12:28 +1100 Subject: [PATCH] Add --max_contigs to Autocycler compress --- src/compress.rs | 30 ++++++++++++++++++++++++------ src/main.rs | 8 ++++++-- src/tests.rs | 23 ++++++++++++++--------- 3 files changed, 44 insertions(+), 17 deletions(-) diff --git a/src/compress.rs b/src/compress.rs index e7f32a6..dee508c 100644 --- a/src/compress.rs +++ b/src/compress.rs @@ -30,14 +30,16 @@ use crate::unitig_graph::UnitigGraph; -pub fn compress(assemblies_dir: PathBuf, autocycler_dir: PathBuf, k_size: u32, threads: usize) { +pub fn compress(assemblies_dir: PathBuf, autocycler_dir: PathBuf, k_size: u32, max_contigs: u32, + threads: usize) { let start_time = Instant::now(); check_settings(&assemblies_dir, &autocycler_dir, k_size, threads); starting_message(); print_settings(&assemblies_dir, &autocycler_dir, k_size, threads); create_dir(&autocycler_dir); let mut metrics = InputAssemblyMetrics::default(); - let (sequences, assembly_count) = load_sequences(&assemblies_dir, k_size, &mut metrics); + let (sequences, assembly_count) = + load_sequences(&assemblies_dir, k_size, &mut metrics, max_contigs); let kmer_graph = build_kmer_graph(k_size, assembly_count, &sequences); let mut unitig_graph = build_unitig_graph(kmer_graph); simplify_unitig_graph(&mut unitig_graph, &sequences); @@ -79,8 +81,23 @@ fn print_settings(assemblies_dir: &Path, autocycler_dir: &Path, k_size: u32, thr } -pub fn load_sequences(assemblies_dir: &Path, k_size: u32, metrics: &mut InputAssemblyMetrics) - -> (Vec, usize) { +fn check_sequence_count(sequences: &[Sequence], assembly_count: usize, max_contigs: u32) { + let sequence_count = sequences.len() as f64; + if sequence_count == 0.0 { + quit_with_error("no sequences found in input assemblies") + } + let mean_seqs_per_assembly = sequence_count / (assembly_count as f64); + if mean_seqs_per_assembly > max_contigs as f64 { + let e = format!("the mean number of contigs per input assembly ({:.1}) exceeds the allowed \ + threshold ({}). Are your input assemblies fragmented or contaminated?", + mean_seqs_per_assembly, max_contigs); + quit_with_error(&e); + } +} + + +pub fn load_sequences(assemblies_dir: &Path, k_size: u32, metrics: &mut InputAssemblyMetrics, + max_contigs: u32) -> (Vec, usize) { section_header("Loading input assemblies"); explanation("Input assemblies are now loaded and each contig is given a unique ID."); let assemblies = find_all_assemblies(assemblies_dir); @@ -106,6 +123,7 @@ pub fn load_sequences(assemblies_dir: &Path, k_size: u32, metrics: &mut InputAss metrics.input_assembly_details.push(assembly_details); } eprintln!(); + check_sequence_count(&sequences, assemblies.len(), max_contigs); let pb = spinner("repairing sequence ends..."); sequence_end_repair(&mut sequences, k_size); pb.finish_and_clear(); @@ -330,7 +348,7 @@ mod tests { make_test_file(&assembly_dir.path().join("b.fasta"), ">b1\nACGT\n>b2\nACGT\n"); make_test_file(&assembly_dir.path().join("c.fasta"), ">c1\nACGT\n>c2\nACGT\n>c3\nACGT\n"); let mut metrics = InputAssemblyMetrics::default(); - let (sequences, count) = load_sequences(&assembly_dir.into_path(), 3, &mut metrics); + let (sequences, count) = load_sequences(&assembly_dir.into_path(), 3, &mut metrics, 25); assert_eq!(sequences.len(), 6); assert_eq!(count, 3); } @@ -344,7 +362,7 @@ mod tests { make_test_file(&assembly_dir.path().join("c.fasta"), ">c1\nACGT\n>c1\nACGT\n>c3\nACGT\n"); assert!(panic::catch_unwind(|| { let mut metrics = InputAssemblyMetrics::default(); - load_sequences(&assembly_dir.into_path(), 3, &mut metrics); + load_sequences(&assembly_dir.into_path(), 3, &mut metrics, 25); }).is_err()); } } diff --git a/src/main.rs b/src/main.rs index d8975dd..48569c9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -133,6 +133,10 @@ enum Commands { #[clap(long = "kmer", default_value = "51")] kmer: u32, + /// refuse to run if mean contigs per assembly exceeds this value + #[clap(long = "max_contigs", default_value = "25")] + max_contigs: u32, + /// Number of CPU threads #[clap(short = 't', long = "threads", default_value = "8")] threads: usize, @@ -274,8 +278,8 @@ fn main() { Some(Commands::Combine { autocycler_dir, in_gfas }) => { combine::combine(autocycler_dir, in_gfas); }, - Some(Commands::Compress { assemblies_dir, autocycler_dir, kmer, threads }) => { - compress::compress(assemblies_dir, autocycler_dir, kmer, threads); + Some(Commands::Compress { assemblies_dir, autocycler_dir, kmer, max_contigs, threads }) => { + compress::compress(assemblies_dir, autocycler_dir, kmer, max_contigs, threads); }, Some(Commands::Decompress { in_gfa, out_dir, out_file }) => { decompress::decompress(in_gfa, out_dir, out_file); diff --git a/src/tests.rs b/src/tests.rs index 21c70c7..1d8b8f9 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -95,7 +95,7 @@ fn test_high_level(seq_a: &str, seq_b: &str, seq_c: &str, seq_d: &str, seq_e: &s // Build a k-mer graph from the sequences. let mut metrics = InputAssemblyMetrics::default(); let (sequences, assembly_count) = load_sequences(&assembly_dir.path().to_path_buf(), - k_size, &mut metrics); + k_size, &mut metrics, 25); assert_eq!(assembly_count, 5); let mut kmer_graph = KmerGraph::new(k_size); kmer_graph.add_sequences(&sequences, assembly_count); @@ -131,11 +131,16 @@ fn test_high_level(seq_a: &str, seq_b: &str, seq_c: &str, seq_d: &str, seq_e: &s #[test] fn test_fixed_seqs() { - let seq_a = ">a\nCTTATGAGCAGTCCTTAACGTAGCGGTGTGTGGCTTTGAGAAGTTAGCGGTGGCGAGCTACATCCTGGCTCCAAT\n".to_string(); - let seq_b = ">b\nACCGTTACGTTAAGGACTGCTCATAAGATTGGAGCCAGGATGTAGCTCGCCACGGCTAACTTCTCAAAGCGGCAC\n".to_string(); - let seq_c = ">c\nCATCCTGGCTCCAATCTTATGAGCAGTCCTTAACGTAACGGTGTGTGGCTTTGAGAAGTTAGCCGTGGCGAGATA\n".to_string(); - let seq_d = ">d\nGGACTGCTCATAAGATTGGAGCCAGGATGTAGCTCGCCACGGCTAACTTCTCAAAGCCACACACCGTTACGTTAA\n".to_string(); - let seq_e = ">e\nTTGAGAAGTTAGCCGTGGCGAGCTACATCCTGGCTCCAATCTTATGAGCAGTCCTTAACGTAACGGTGTGTGGCC\n".to_string(); + let seq_a = ">a\nCTTATGAGCAGTCCTTAACGTAGCGGTGTGTGGCTTTGAGAA\ + GTTAGCGGTGGCGAGCTACATCCTGGCTCCAAT\n".to_string(); + let seq_b = ">b\nACCGTTACGTTAAGGACTGCTCATAAGATTGGAGCCAGGATG\ + TAGCTCGCCACGGCTAACTTCTCAAAGCGGCAC\n".to_string(); + let seq_c = ">c\nCATCCTGGCTCCAATCTTATGAGCAGTCCTTAACGTAACGGT\ + GTGTGGCTTTGAGAAGTTAGCCGTGGCGAGATA\n".to_string(); + let seq_d = ">d\nGGACTGCTCATAAGATTGGAGCCAGGATGTAGCTCGCCACGG\ + CTAACTTCTCAAAGCCACACACCGTTACGTTAA\n".to_string(); + let seq_e = ">e\nTTGAGAAGTTAGCCGTGGCGAGCTACATCCTGGCTCCAATCT\ + TATGAGCAGTCCTTAACGTAACGGTGTGTGGCC\n".to_string(); test_high_level(&seq_a, &seq_b, &seq_c, &seq_d, &seq_e, 1); test_high_level(&seq_a, &seq_b, &seq_c, &seq_d, &seq_e, 5); test_high_level(&seq_a, &seq_b, &seq_c, &seq_d, &seq_e, 9); @@ -174,12 +179,12 @@ fn test_whitespace() { let k_size = 11; let mut metrics = InputAssemblyMetrics::default(); let (sequences, assembly_count) = load_sequences(&temp_dir.path().to_path_buf(), k_size, - &mut metrics); + &mut metrics, 25); assert_eq!(assembly_count, 1); let sequence = sequences.first().unwrap(); assert_eq!(sequence.filename, "assembly.fasta"); assert_eq!(sequence.contig_name(), "name"); assert_eq!(sequence.contig_header, "name abc def ghi"); - assert_eq!(sequence.forward_seq, String::from(".....CTTATGAGCAGTCCTTAACGTAGCGGT.....").into_bytes()); + assert_eq!(sequence.forward_seq, + String::from(".....CTTATGAGCAGTCCTTAACGTAGCGGT.....").into_bytes()); } -