Skip to content

Commit

Permalink
Add --max_contigs to Autocycler compress
Browse files Browse the repository at this point in the history
  • Loading branch information
rrwick committed Jan 20, 2025
1 parent dd8dcb3 commit ad60b01
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 17 deletions.
30 changes: 24 additions & 6 deletions src/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,16 @@ use crate::unitig_graph::UnitigGraph;



pub fn compress(assemblies_dir: PathBuf, autocycler_dir: PathBuf, k_size: u32, threads: usize) {
pub fn compress(assemblies_dir: PathBuf, autocycler_dir: PathBuf, k_size: u32, max_contigs: u32,
threads: usize) {
let start_time = Instant::now();
check_settings(&assemblies_dir, &autocycler_dir, k_size, threads);
starting_message();
print_settings(&assemblies_dir, &autocycler_dir, k_size, threads);
create_dir(&autocycler_dir);
let mut metrics = InputAssemblyMetrics::default();
let (sequences, assembly_count) = load_sequences(&assemblies_dir, k_size, &mut metrics);
let (sequences, assembly_count) =
load_sequences(&assemblies_dir, k_size, &mut metrics, max_contigs);
let kmer_graph = build_kmer_graph(k_size, assembly_count, &sequences);
let mut unitig_graph = build_unitig_graph(kmer_graph);
simplify_unitig_graph(&mut unitig_graph, &sequences);
Expand Down Expand Up @@ -79,8 +81,23 @@ fn print_settings(assemblies_dir: &Path, autocycler_dir: &Path, k_size: u32, thr
}


pub fn load_sequences(assemblies_dir: &Path, k_size: u32, metrics: &mut InputAssemblyMetrics)
-> (Vec<Sequence>, usize) {
fn check_sequence_count(sequences: &[Sequence], assembly_count: usize, max_contigs: u32) {
let sequence_count = sequences.len() as f64;
if sequence_count == 0.0 {
quit_with_error("no sequences found in input assemblies")
}
let mean_seqs_per_assembly = sequence_count / (assembly_count as f64);
if mean_seqs_per_assembly > max_contigs as f64 {
let e = format!("the mean number of contigs per input assembly ({:.1}) exceeds the allowed \
threshold ({}). Are your input assemblies fragmented or contaminated?",
mean_seqs_per_assembly, max_contigs);
quit_with_error(&e);
}
}


pub fn load_sequences(assemblies_dir: &Path, k_size: u32, metrics: &mut InputAssemblyMetrics,
max_contigs: u32) -> (Vec<Sequence>, usize) {
section_header("Loading input assemblies");
explanation("Input assemblies are now loaded and each contig is given a unique ID.");
let assemblies = find_all_assemblies(assemblies_dir);
Expand All @@ -106,6 +123,7 @@ pub fn load_sequences(assemblies_dir: &Path, k_size: u32, metrics: &mut InputAss
metrics.input_assembly_details.push(assembly_details);
}
eprintln!();
check_sequence_count(&sequences, assemblies.len(), max_contigs);
let pb = spinner("repairing sequence ends...");
sequence_end_repair(&mut sequences, k_size);
pb.finish_and_clear();
Expand Down Expand Up @@ -330,7 +348,7 @@ mod tests {
make_test_file(&assembly_dir.path().join("b.fasta"), ">b1\nACGT\n>b2\nACGT\n");
make_test_file(&assembly_dir.path().join("c.fasta"), ">c1\nACGT\n>c2\nACGT\n>c3\nACGT\n");
let mut metrics = InputAssemblyMetrics::default();
let (sequences, count) = load_sequences(&assembly_dir.into_path(), 3, &mut metrics);
let (sequences, count) = load_sequences(&assembly_dir.into_path(), 3, &mut metrics, 25);
assert_eq!(sequences.len(), 6);
assert_eq!(count, 3);
}
Expand All @@ -344,7 +362,7 @@ mod tests {
make_test_file(&assembly_dir.path().join("c.fasta"), ">c1\nACGT\n>c1\nACGT\n>c3\nACGT\n");
assert!(panic::catch_unwind(|| {
let mut metrics = InputAssemblyMetrics::default();
load_sequences(&assembly_dir.into_path(), 3, &mut metrics);
load_sequences(&assembly_dir.into_path(), 3, &mut metrics, 25);
}).is_err());
}
}
8 changes: 6 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,10 @@ enum Commands {
#[clap(long = "kmer", default_value = "51")]
kmer: u32,

/// refuse to run if mean contigs per assembly exceeds this value
#[clap(long = "max_contigs", default_value = "25")]
max_contigs: u32,

/// Number of CPU threads
#[clap(short = 't', long = "threads", default_value = "8")]
threads: usize,
Expand Down Expand Up @@ -274,8 +278,8 @@ fn main() {
Some(Commands::Combine { autocycler_dir, in_gfas }) => {
combine::combine(autocycler_dir, in_gfas);
},
Some(Commands::Compress { assemblies_dir, autocycler_dir, kmer, threads }) => {
compress::compress(assemblies_dir, autocycler_dir, kmer, threads);
Some(Commands::Compress { assemblies_dir, autocycler_dir, kmer, max_contigs, threads }) => {
compress::compress(assemblies_dir, autocycler_dir, kmer, max_contigs, threads);
},
Some(Commands::Decompress { in_gfa, out_dir, out_file }) => {
decompress::decompress(in_gfa, out_dir, out_file);
Expand Down
23 changes: 14 additions & 9 deletions src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ fn test_high_level(seq_a: &str, seq_b: &str, seq_c: &str, seq_d: &str, seq_e: &s
// Build a k-mer graph from the sequences.
let mut metrics = InputAssemblyMetrics::default();
let (sequences, assembly_count) = load_sequences(&assembly_dir.path().to_path_buf(),
k_size, &mut metrics);
k_size, &mut metrics, 25);
assert_eq!(assembly_count, 5);
let mut kmer_graph = KmerGraph::new(k_size);
kmer_graph.add_sequences(&sequences, assembly_count);
Expand Down Expand Up @@ -131,11 +131,16 @@ fn test_high_level(seq_a: &str, seq_b: &str, seq_c: &str, seq_d: &str, seq_e: &s

#[test]
fn test_fixed_seqs() {
let seq_a = ">a\nCTTATGAGCAGTCCTTAACGTAGCGGTGTGTGGCTTTGAGAAGTTAGCGGTGGCGAGCTACATCCTGGCTCCAAT\n".to_string();
let seq_b = ">b\nACCGTTACGTTAAGGACTGCTCATAAGATTGGAGCCAGGATGTAGCTCGCCACGGCTAACTTCTCAAAGCGGCAC\n".to_string();
let seq_c = ">c\nCATCCTGGCTCCAATCTTATGAGCAGTCCTTAACGTAACGGTGTGTGGCTTTGAGAAGTTAGCCGTGGCGAGATA\n".to_string();
let seq_d = ">d\nGGACTGCTCATAAGATTGGAGCCAGGATGTAGCTCGCCACGGCTAACTTCTCAAAGCCACACACCGTTACGTTAA\n".to_string();
let seq_e = ">e\nTTGAGAAGTTAGCCGTGGCGAGCTACATCCTGGCTCCAATCTTATGAGCAGTCCTTAACGTAACGGTGTGTGGCC\n".to_string();
let seq_a = ">a\nCTTATGAGCAGTCCTTAACGTAGCGGTGTGTGGCTTTGAGAA\
GTTAGCGGTGGCGAGCTACATCCTGGCTCCAAT\n".to_string();
let seq_b = ">b\nACCGTTACGTTAAGGACTGCTCATAAGATTGGAGCCAGGATG\
TAGCTCGCCACGGCTAACTTCTCAAAGCGGCAC\n".to_string();
let seq_c = ">c\nCATCCTGGCTCCAATCTTATGAGCAGTCCTTAACGTAACGGT\
GTGTGGCTTTGAGAAGTTAGCCGTGGCGAGATA\n".to_string();
let seq_d = ">d\nGGACTGCTCATAAGATTGGAGCCAGGATGTAGCTCGCCACGG\
CTAACTTCTCAAAGCCACACACCGTTACGTTAA\n".to_string();
let seq_e = ">e\nTTGAGAAGTTAGCCGTGGCGAGCTACATCCTGGCTCCAATCT\
TATGAGCAGTCCTTAACGTAACGGTGTGTGGCC\n".to_string();
test_high_level(&seq_a, &seq_b, &seq_c, &seq_d, &seq_e, 1);
test_high_level(&seq_a, &seq_b, &seq_c, &seq_d, &seq_e, 5);
test_high_level(&seq_a, &seq_b, &seq_c, &seq_d, &seq_e, 9);
Expand Down Expand Up @@ -174,12 +179,12 @@ fn test_whitespace() {
let k_size = 11;
let mut metrics = InputAssemblyMetrics::default();
let (sequences, assembly_count) = load_sequences(&temp_dir.path().to_path_buf(), k_size,
&mut metrics);
&mut metrics, 25);
assert_eq!(assembly_count, 1);
let sequence = sequences.first().unwrap();
assert_eq!(sequence.filename, "assembly.fasta");
assert_eq!(sequence.contig_name(), "name");
assert_eq!(sequence.contig_header, "name abc def ghi");
assert_eq!(sequence.forward_seq, String::from(".....CTTATGAGCAGTCCTTAACGTAGCGGT.....").into_bytes());
assert_eq!(sequence.forward_seq,
String::from(".....CTTATGAGCAGTCCTTAACGTAGCGGT.....").into_bytes());
}

0 comments on commit ad60b01

Please sign in to comment.