From 853354fc1782223c830a1b13eb60d07bfc7109fb Mon Sep 17 00:00:00 2001 From: AroneyS Date: Tue, 17 Dec 2024 13:48:14 +1000 Subject: [PATCH 1/4] switch to using skani cli --- Cargo.toml | 2 +- galah.yml | 1 + src/cluster_argument_parsing.rs | 1 + src/clusterer.rs | 2 + src/skani.rs | 257 ++++++++++++++++++-------------- 5 files changed, 152 insertions(+), 111 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6b8894b..0003eb5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,7 +33,7 @@ needletail = "0.5.*" bird_tool_utils-man = "0.4.0" lazy_static = "1.4.0" ansi_term = "0.12" -skani = "0.1.1" +# skani = "0.1.1" # skani = { path = "../skani" } concurrent-queue = "2" diff --git a/galah.yml b/galah.yml index 759054f..4bc9e48 100644 --- a/galah.yml +++ b/galah.yml @@ -5,4 +5,5 @@ channels: dependencies: - dashing >= 0.4.0 - fastani >= 1.3 + - skani >= 0.1.1 - extern diff --git a/src/cluster_argument_parsing.rs b/src/cluster_argument_parsing.rs index beab606..11c7131 100644 --- a/src/cluster_argument_parsing.rs +++ b/src/cluster_argument_parsing.rs @@ -1048,6 +1048,7 @@ pub fn generate_galah_clusterer<'a>( ) ) }), + threads, }), _ => panic!("Programming error"), }, diff --git a/src/clusterer.rs b/src/clusterer.rs index 7a8e194..0ee1db1 100644 --- a/src/clusterer.rs +++ b/src/clusterer.rs @@ -624,6 +624,7 @@ mod tests { &crate::skani::SkaniPreclusterer { threshold: 90.0, min_aligned_threshold: 0.2, + threads: 1, }, &crate::skani::SkaniClusterer { threshold: 99.0, @@ -650,6 +651,7 @@ mod tests { &crate::skani::SkaniPreclusterer { threshold: 90.0, min_aligned_threshold: 0.2, + threads: 1, }, &crate::skani::SkaniClusterer { threshold: 99.0, diff --git a/src/skani.rs b/src/skani.rs index a734ed3..772ea57 100644 --- a/src/skani.rs +++ b/src/skani.rs @@ -1,19 +1,18 @@ +use std; +use std::io::BufReader; +use std::io::Write; + use crate::sorted_pair_genome_distance_cache::SortedPairGenomeDistanceCache; use crate::ClusterDistanceFinder; use crate::PreclusterDistanceFinder; -use rayon::prelude::*; - -use skani::chain; -use skani::file_io; -use skani::params::*; -use skani::screen; - -use concurrent_queue::ConcurrentQueue; +use bird_tool_utils::command::finish_command_safely; +use tempfile; pub struct SkaniPreclusterer { pub threshold: f32, pub min_aligned_threshold: f32, + pub threads: u16, } impl PreclusterDistanceFinder for SkaniPreclusterer { @@ -22,6 +21,7 @@ impl PreclusterDistanceFinder for SkaniPreclusterer { genome_fasta_paths, self.threshold, self.min_aligned_threshold, + self.threads, ) } @@ -34,75 +34,103 @@ fn precluster_skani( genome_fasta_paths: &[&str], threshold: f32, min_aligned_threshold: f32, + threads: u16, ) -> SortedPairGenomeDistanceCache { - let (command_params, sketch_params) = default_params(Mode::Dist, min_aligned_threshold); - - debug!("Sketching genomes with skani .."); - let fasta_strings = genome_fasta_paths - .iter() - .map(|x| x.to_string()) - .collect::>(); - // Note that sketches is now shuffled! - let sketches = &file_io::fastx_to_sketches(&fasta_strings, &sketch_params, true); - - // Right now implemented by parallel collection into a queue, and then - // reprocessed into a BTreeMap. Could potentially be made more efficient by - // direct collection into a concurrent BTreeMap, but eh for now. - let queue = ConcurrentQueue::unbounded(); - - // Screen genomes before ANI calculation - let kmer_to_sketch = screen::kmer_to_sketch_from_refs(sketches); - - info!("Calculating ANI from skani sketches .."); - sketches.par_iter().enumerate().for_each(|(i, ref_sketch)| { - let ref_sketch_i = &sketches[i]; - let screened_refs = screen::screen_refs( - 0.80, - &kmer_to_sketch, - ref_sketch_i, - &sketch_params, - sketches, - ); - debug!( - "{} has {} refs passing screening.", - ref_sketch_i.file_name, - screened_refs.len() + if threshold < 85.0 { + panic!( + "Error: skani produces inaccurate results with ANI less than 85%. Provided: {}", + threshold ); + } - screened_refs.into_par_iter().for_each(|j| { - if j > i { - let map_params = chain::map_params_from_sketch(ref_sketch, false, &command_params); - let ani_result = chain::chain_seeds(ref_sketch, &sketches[j], map_params); - let ani = ani_result.ani * 100.; - let ref_name = ref_sketch.file_name.clone(); - let query_name = sketches[j].file_name.clone(); + // Create a tempfile to list all the fasta file paths + let mut tf = tempfile::Builder::new() + .prefix("galah-input-genomes") + .suffix(".txt") + .tempfile() + .expect("Failed to open temporary file to run skani"); - debug!("Pushing ANI result for {} and {}", ref_name, query_name); - let ref_index = genome_fasta_paths + for fasta in genome_fasta_paths { + writeln!(tf, "{}", fasta) + .expect("Failed to write genome fasta paths to tempfile for skani"); + } + + // --sparse only outputs non-zero entries in an edge-list output + // Ref_file Query_file ANI Align_fraction_ref Align_fraction_query Ref_name Query_name + info!("Running skani to get distances .."); + let mut cmd = std::process::Command::new("skani"); + cmd.arg("triangle") + .arg("-t") + .arg(&format!("{}", threads)) + .arg("--sparse") + .arg("--min-af") + .arg(&format!("{}", min_aligned_threshold)) + .arg("-l") + .arg(tf.path().to_str().unwrap()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()); + debug!("Running skani command: {:?}", &cmd); + + // Parse the distances + let mut process = cmd + .spawn() + .unwrap_or_else(|_| panic!("Failed to spawn {}", "skani")); + let stdout = process.stdout.as_mut().unwrap(); + let stdout_reader = BufReader::new(stdout); + + let mut rdr = csv::ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(true) + .from_reader(stdout_reader); + + let mut distances = SortedPairGenomeDistanceCache::new(); + + // Order is likely not conserved, so need to keep track + for record_res in rdr.records() { + match record_res { + Ok(record) => { + debug!("Found skani record {:?}", record); + + // Get index of genomes within genome_fasta_paths + let genome_id1 = genome_fasta_paths .iter() - .position(|&r| r == ref_name) - .unwrap(); - let query_index = genome_fasta_paths + .position(|&x| x == &record[0]) + .unwrap_or_else(|| { + panic!( + "Failed to find genome fasta path in genome_fasta_paths: {}", + &record[0] + ) + }); + let genome_id2 = genome_fasta_paths .iter() - .position(|&r| r == query_name) - .unwrap(); + .position(|&x| x == &record[1]) + .unwrap_or_else(|| { + panic!( + "Failed to find genome fasta path in genome_fasta_paths: {}", + &record[1] + ) + }); + + let ani: f32 = record[2].parse().unwrap_or_else(|_| { + panic!("Failed to convert skani ANI to float value: {}", &record[2]) + }); + trace!("Found ANI {}", ani); if ani >= threshold { - queue - .push((ref_index, query_index, ani)) - .expect("Failed to push to queue during preclustering"); + trace!("Accepting ANI since it passed threshold"); + distances.insert((genome_id1, genome_id2), Some(ani)) } } - }); - }); - debug!("Converting ANI results into sparse cache .."); - - let mut to_return = SortedPairGenomeDistanceCache::new(); - while let Ok((i, j, ani)) = queue.pop() { - to_return.insert((i, j), Some(ani)); + Err(e) => { + error!("Error parsing skani output: {}", e); + std::process::exit(1); + } + } } - debug!("Finished skani."); + finish_command_safely(process, "skani"); + debug!("Found skani distances: {:#?}", distances); + info!("Finished skani triangle."); - to_return + distances } pub struct SkaniClusterer { @@ -128,50 +156,59 @@ impl ClusterDistanceFinder for SkaniClusterer { } } -fn default_params(mode: Mode, min_aligned_frac: f32) -> (CommandParams, SketchParams) { - let cmd_params = CommandParams { - screen: true, - screen_val: 0.00, - mode, - out_file_name: "".to_string(), - ref_files: vec![], - query_files: vec![], - refs_are_sketch: false, - queries_are_sketch: false, - robust: false, - median: false, - sparse: false, - full_matrix: false, - max_results: 10000000, // for Triange usize::MAX, - individual_contig_q: false, - individual_contig_r: false, - min_aligned_frac: min_aligned_frac as f64, - keep_refs: false, - est_ci: false, - learned_ani: true, - learned_ani_cmd: false, - detailed_out: false, - // distance: false, - // rescue_small: true, - }; - - let m = 1000; - let c = 125; - let k = 15; - let sketch_params = SketchParams::new(m, c, k, false, false); - (cmd_params, sketch_params) -} - -pub fn calculate_skani(fasta1: &str, fasta2: &str, min_aligned_frac: f32) -> f32 { - //Vector of Strings - let refs = vec![fasta1.to_string()]; - let queries = vec![fasta2.to_string()]; +pub fn calculate_skani(fasta1: &str, fasta2: &str, min_aligned_threshold: f32) -> f32 { + // --sparse only outputs non-zero entries in an edge-list output + // Ref_file Query_file ANI Align_fraction_ref Align_fraction_query Ref_name Query_name + info!("Running skani to get distances .."); + let mut cmd = std::process::Command::new("skani"); + cmd.arg("dist") + .arg("--min-af") + .arg(&format!("{}", min_aligned_threshold)) + .arg("-q") + .arg(fasta1) + .arg("-r") + .arg(fasta2) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()); + debug!("Running skani command: {:?}", &cmd); + + // Parse the distances + let mut process = cmd + .spawn() + .unwrap_or_else(|_| panic!("Failed to spawn {}", "skani")); + let stdout = process.stdout.as_mut().unwrap(); + let stdout_reader = BufReader::new(stdout); + + let mut rdr = csv::ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(true) + .from_reader(stdout_reader); + + let mut num_records = 0; + let mut to_return = 0.0; + + for record_res in rdr.records() { + match record_res { + Ok(record) => { + if num_records > 0 { + error!("Unexpectedly found >1 result from skani"); + std::process::exit(1); + } - let (command_params, sketch_params) = default_params(Mode::Dist, min_aligned_frac); - let ref_sketch = &file_io::fastx_to_sketches(&refs, &sketch_params, true)[0]; - let query_sketch = &file_io::fastx_to_sketches(&queries, &sketch_params, true)[0]; - let map_params = chain::map_params_from_sketch(ref_sketch, false, &command_params); - let ani_result = chain::chain_seeds(ref_sketch, query_sketch, map_params); + assert!(record.len() == 7); + to_return = record[2].parse().unwrap_or_else(|_| { + panic!("Failed to convert skani ANI to float value: {}", &record[2]) + }); + num_records += 1; + } + Err(e) => { + error!("Error parsing skani output: {}", e); + std::process::exit(1); + } + } + } - ani_result.ani * 100.0 + debug!("skani of {} against {} was {:?}", fasta1, fasta2, to_return); + finish_command_safely(process, "skani"); + to_return } From f6f670cf478ae09e7a17b5726a333ff973305487 Mon Sep 17 00:00:00 2001 From: AroneyS Date: Tue, 17 Dec 2024 13:53:23 +1000 Subject: [PATCH 2/4] add threads arg to tests --- src/skani.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/skani.rs b/src/skani.rs index 2d3d7f8..8b99df5 100644 --- a/src/skani.rs +++ b/src/skani.rs @@ -236,6 +236,7 @@ mod tests { ], 80.0, 0.2, + 1, ); } @@ -251,6 +252,7 @@ mod tests { ], 95.0, 0.2, + 1, ); } } From 3e9fb5d939fb3ef80982652ffc6650e988d2ad20 Mon Sep 17 00:00:00 2001 From: AroneyS Date: Wed, 18 Dec 2024 11:11:55 +1000 Subject: [PATCH 3/4] fix min-af format skani takes percent as input --- src/skani.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/skani.rs b/src/skani.rs index 8b99df5..a7a586c 100644 --- a/src/skani.rs +++ b/src/skani.rs @@ -64,7 +64,7 @@ fn precluster_skani( .arg(&format!("{}", threads)) .arg("--sparse") .arg("--min-af") - .arg(&format!("{}", min_aligned_threshold)) + .arg(&format!("{}", min_aligned_threshold * 100.0)) .arg("-l") .arg(tf.path().to_str().unwrap()) .stdout(std::process::Stdio::piped()) @@ -163,7 +163,7 @@ pub fn calculate_skani(fasta1: &str, fasta2: &str, min_aligned_threshold: f32) - let mut cmd = std::process::Command::new("skani"); cmd.arg("dist") .arg("--min-af") - .arg(&format!("{}", min_aligned_threshold)) + .arg(&format!("{}", min_aligned_threshold * 100.0)) .arg("-q") .arg(fasta1) .arg("-r") From fe9b1c904183cafb86074ca7495b5e3f56c04821 Mon Sep 17 00:00:00 2001 From: AroneyS Date: Thu, 19 Dec 2024 13:27:09 +1000 Subject: [PATCH 4/4] implement contig clustering restricted to skani preclusterer and skipping clusterer --- src/cluster_argument_parsing.rs | 110 +++++++++++- src/clusterer.rs | 71 +++++++- src/dashing.rs | 9 + src/finch.rs | 9 + src/lib.rs | 6 + src/skani.rs | 113 ++++++++++++ tests/data/contigs/contigs.fna | 300 ++++++++++++++++++++++++++++++++ tests/test_cmdline.rs | 21 +++ 8 files changed, 627 insertions(+), 12 deletions(-) create mode 100644 tests/data/contigs/contigs.fna diff --git a/src/cluster_argument_parsing.rs b/src/cluster_argument_parsing.rs index 11c7131..1730a1f 100644 --- a/src/cluster_argument_parsing.rs +++ b/src/cluster_argument_parsing.rs @@ -17,6 +17,7 @@ use bird_tool_utils::clap_utils::*; use bird_tool_utils::clap_utils::{default_roff, monospace_roff}; use bird_tool_utils_man::prelude::{Author, Flag, Manual, Opt, Section}; use clap::*; +use needletail::parse_fastx_file; use rayon::prelude::*; use crate::genome_info_file; @@ -37,6 +38,18 @@ impl PreclusterDistanceFinder for Preclusterer { } } + fn distances_contigs( + &self, + genome_fasta_paths: &[&str], + contig_names: &[&str], + ) -> SortedPairGenomeDistanceCache { + match self { + Preclusterer::Dashing(d) => d.distances_contigs(genome_fasta_paths, contig_names), + Preclusterer::Finch(f) => f.distances_contigs(genome_fasta_paths, contig_names), + Preclusterer::Skani(s) => s.distances_contigs(genome_fasta_paths, contig_names), + } + } + fn method_name(&self) -> &str { match self { Preclusterer::Dashing(d) => d.method_name(), @@ -85,6 +98,8 @@ pub struct GalahClusterer<'a> { pub genome_fasta_paths: Vec<&'a str>, pub preclusterer: Preclusterer, pub clusterer: Clusterer, + pub cluster_contigs: bool, + pub contig_names: &'a Option>, } pub struct GalahClustererCommandDefinition { @@ -95,6 +110,7 @@ pub struct GalahClustererCommandDefinition { pub dereplication_cluster_method_argument: String, pub dereplication_aligned_fraction_argument: String, pub dereplication_fraglen_argument: String, + pub dereplication_cluster_contigs_argument: String, // pub dereplication_ani_method_argument: String, pub dereplication_output_cluster_definition_file: String, pub dereplication_output_representative_fasta_directory: String, @@ -112,6 +128,7 @@ lazy_static! { dereplication_cluster_method_argument: "cluster-method".to_string(), dereplication_aligned_fraction_argument: "min-aligned-fraction".to_string(), dereplication_fraglen_argument: "fragment-length".to_string(), + dereplication_cluster_contigs_argument: "cluster-contigs".to_string(), // dereplication_ani_method_argument: "ani-method".to_string(), dereplication_output_cluster_definition_file: "output-cluster-definition".to_string(), dereplication_output_representative_fasta_directory: @@ -313,6 +330,14 @@ pub fn add_dereplication_clustering_parameters_to_section( default_roff(crate::DEFAULT_CLUSTER_METHOD) )), ) + .flag( + Flag::new() + .long(&format!( + "--{}", + definition.dereplication_cluster_contigs_argument + )) + .help("Cluster contigs within a fasta file instead of genomes."), + ) } pub fn add_dereplication_output_parameters_to_section( @@ -413,8 +438,51 @@ pub fn run_cluster_subcommand( let genome_fasta_files: Vec = parse_list_of_genome_fasta_files(m, true).unwrap(); - let galah = generate_galah_clusterer(&genome_fasta_files, m, &GALAH_COMMAND_DEFINITION) - .expect("Failed to parse galah clustering arguments correctly"); + let cluster_contigs = m.get_flag("cluster-contigs"); + + let contig_names_owned = if cluster_contigs { + if m.contains_id("output-representative-fasta-directory") + || m.contains_id("output-representative-fasta-directory-copy") + { + panic!("Cannot specify --cluster-contigs with --output-representative-fasta-directory or --output-representative-fasta-directory-copy"); + } + + // Get all contig names from the fasta files + let mut contig_names2 = vec![]; + for genome_fasta_path in &genome_fasta_files { + let mut reader = parse_fastx_file(genome_fasta_path) + .unwrap_or_else(|_| panic!("Failed to read contig names from file '{}' as there was a problem opening the file", + genome_fasta_path)); + while let Some(seq1) = reader.next() { + let seq = seq1.expect("invalid record"); + let seq_id = String::from_utf8_lossy(seq.id()).into_owned(); + if contig_names2.contains(&seq_id) { + panic!( + "Duplicate contig name found in file '{}': {}", + genome_fasta_path, seq_id + ); + } + contig_names2.push(seq_id); + } + } + + Some(contig_names2) + } else { + None + }; + + let contig_names = contig_names_owned + .as_ref() + .map(|c| c.iter().map(String::as_str).collect::>()); + + let galah = generate_galah_clusterer( + &genome_fasta_files, + &contig_names, + cluster_contigs, + m, + &GALAH_COMMAND_DEFINITION, + ) + .expect("Failed to parse galah clustering arguments correctly"); // Open file handles here so errors are caught before CPU-heavy clustering let output_definitions = setup_galah_outputs(m, &GALAH_COMMAND_DEFINITION); @@ -425,7 +493,12 @@ pub fn run_cluster_subcommand( info!("Found {} genome clusters", clusters.len()); - write_galah_outputs(output_definitions, &clusters, passed_genomes); + write_galah_outputs( + output_definitions, + &clusters, + passed_genomes, + contig_names.as_ref(), + ); info!("Finished printing genome clusters"); } @@ -433,7 +506,12 @@ pub fn write_galah_outputs( output_definitions: GalahOutput, clusters: &Vec>, passed_genomes: &[&str], + contig_names: Option<&Vec<&str>>, ) { + let references = match contig_names { + Some(c) => c, + None => passed_genomes, + }; if let Some(mut f) = output_definitions.output_clusters_file { for cluster in clusters { let rep_index = cluster[0]; @@ -441,7 +519,7 @@ pub fn write_galah_outputs( writeln!( f, "{}\t{}", - passed_genomes[rep_index], passed_genomes[*genome_index] + references[rep_index], references[*genome_index] ) .expect("Failed to write to output clusters file"); } @@ -450,7 +528,7 @@ pub fn write_galah_outputs( write_cluster_reps_to_directory( clusters, - passed_genomes, + references, &output_definitions.output_representative_fasta_directory, &|link, current_stab, rep| { symlink(link, std::path::Path::new(¤t_stab)).unwrap_or_else(|_| { @@ -463,7 +541,7 @@ pub fn write_galah_outputs( ); write_cluster_reps_to_directory( clusters, - passed_genomes, + references, &output_definitions.output_representative_fasta_directory_copy, &|link, current_stab, rep| { std::fs::copy(link, std::path::Path::new(¤t_stab)).unwrap_or_else(|_| { @@ -478,7 +556,7 @@ pub fn write_galah_outputs( if let Some(mut f) = output_definitions.output_representative_list { for cluster in clusters { let rep_index = cluster[0]; - writeln!(f, "{}", passed_genomes[rep_index]) + writeln!(f, "{}", references[rep_index]) .expect("Failed to write to output representative list file"); } } @@ -578,6 +656,10 @@ pub fn filter_genomes_through_checkm<'a>( clap_matches: &clap::ArgMatches, argument_definition: &GalahClustererCommandDefinition, ) -> std::result::Result, String> { + if clap_matches.get_flag("cluster-contigs") { + return Ok(genome_fasta_files.iter().map(|s| &**s).collect()); + } + match clap_matches.contains_id("checkm-tab-table") || clap_matches.contains_id("genome-info") || clap_matches.contains_id("checkm2-quality-report") @@ -896,6 +978,8 @@ fn filter_and_calculate_genome_stats< pub fn generate_galah_clusterer<'a>( genome_fasta_paths: &'a Vec, + contig_names: &'a Option>, + cluster_contigs: bool, clap_matches: &clap::ArgMatches, argument_definition: &GalahClustererCommandDefinition, ) -> std::result::Result, String> { @@ -1153,6 +1237,8 @@ pub fn generate_galah_clusterer<'a>( }), _ => panic!("Programming error"), }, + cluster_contigs, + contig_names, }) } } @@ -1188,6 +1274,8 @@ impl GalahClusterer<'_> { &self.genome_fasta_paths, &self.preclusterer, &self.clusterer, + self.cluster_contigs, + self.contig_names.as_deref(), ) } } @@ -1317,14 +1405,18 @@ pub fn add_cluster_subcommand(app: clap::Command) -> clap::Command { .default_value(crate::DEFAULT_PRETHRESHOLD_ANI)) .arg(Arg::new(&*GALAH_COMMAND_DEFINITION.dereplication_precluster_method_argument) .long("precluster-method") - .help("method of calculating rough ANI. 'dashing' for HyperLogLog, 'finch' for finch MinHash, 'skani' for Skani") + .help("method of calculating rough ANI. 'dashing' for HyperLogLog, 'finch' for finch MinHash, 'skani' for skani") .value_parser(crate::PRECLUSTER_METHODS) .default_value(crate::DEFAULT_PRECLUSTER_METHOD)) .arg(Arg::new(&*GALAH_COMMAND_DEFINITION.dereplication_cluster_method_argument) .long("cluster-method") - .help("method of calculating ANI. 'fastani' for FastANI, 'skani' for Skani") + .help("method of calculating ANI. 'fastani' for FastANI, 'skani' for skani") .value_parser(crate::CLUSTER_METHODS) .default_value(crate::DEFAULT_CLUSTER_METHOD)) + .arg(Arg::new(&*GALAH_COMMAND_DEFINITION.dereplication_cluster_contigs_argument) + .long("cluster-contigs") + .help("Cluster contigs instead of genomes") + .action(clap::ArgAction::SetTrue)) .arg(Arg::new("threads") .short('t') .long("threads") diff --git a/src/clusterer.rs b/src/clusterer.rs index 0ee1db1..aee9c02 100644 --- a/src/clusterer.rs +++ b/src/clusterer.rs @@ -15,6 +15,8 @@ pub fn cluster, ) -> Vec> { clusterer.initialise(); @@ -32,11 +34,27 @@ pub fn cluster>> = Mutex::new(vec![]); @@ -70,7 +88,11 @@ pub fn cluster SortedPairGenomeDistanceCache { + // Dashing doesn't offer high-quality ANI with self-self comparisons, so we can't use it for contig comparisons. + SortedPairGenomeDistanceCache::new() + } + fn method_name(&self) -> &str { "dashing" } diff --git a/src/finch.rs b/src/finch.rs index cd7554f..dee8bcd 100644 --- a/src/finch.rs +++ b/src/finch.rs @@ -18,6 +18,15 @@ impl PreclusterDistanceFinder for FinchPreclusterer { ) } + fn distances_contigs( + &self, + _genome_fasta_paths: &[&str], + _contig_names: &[&str], + ) -> SortedPairGenomeDistanceCache { + // Finch doesn't offer high-quality ANI with self-self comparisons, so we can't use it for contig comparisons. + SortedPairGenomeDistanceCache::new() + } + fn method_name(&self) -> &str { "finch" } diff --git a/src/lib.rs b/src/lib.rs index 0aada20..f084b28 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,6 +23,12 @@ use crate::sorted_pair_genome_distance_cache::SortedPairGenomeDistanceCache; pub trait PreclusterDistanceFinder { fn distances(&self, genome_fasta_paths: &[&str]) -> SortedPairGenomeDistanceCache; + fn distances_contigs( + &self, + genome_fasta_paths: &[&str], + contig_names: &[&str], + ) -> SortedPairGenomeDistanceCache; + fn method_name(&self) -> &str; } diff --git a/src/skani.rs b/src/skani.rs index a7a586c..a6c8952 100644 --- a/src/skani.rs +++ b/src/skani.rs @@ -25,6 +25,20 @@ impl PreclusterDistanceFinder for SkaniPreclusterer { ) } + fn distances_contigs( + &self, + genome_fasta_paths: &[&str], + contig_names: &[&str], + ) -> SortedPairGenomeDistanceCache { + precluster_skani_contigs( + genome_fasta_paths, + self.threshold, + self.min_aligned_threshold, + self.threads, + contig_names, + ) + } + fn method_name(&self) -> &str { "skani" } @@ -133,6 +147,105 @@ fn precluster_skani( distances } +fn precluster_skani_contigs( + genome_fasta_paths: &[&str], + threshold: f32, + min_aligned_threshold: f32, + threads: u16, + contig_names: &[&str], +) -> SortedPairGenomeDistanceCache { + if threshold < 85.0 { + panic!( + "Error: skani produces inaccurate results with ANI less than 85%. Provided: {}", + threshold + ); + } + + // Create a tempfile to list all the fasta file paths + let mut tf = tempfile::Builder::new() + .prefix("galah-input-genomes") + .suffix(".txt") + .tempfile() + .expect("Failed to open temporary file to run skani"); + + for fasta in genome_fasta_paths { + writeln!(tf, "{}", fasta) + .expect("Failed to write genome fasta paths to tempfile for skani"); + } + + // --sparse only outputs non-zero entries in an edge-list output + info!("Running skani to get distances .."); + let mut cmd = std::process::Command::new("skani"); + cmd.arg("triangle") + .arg("-i") + .arg("-t") + .arg(&format!("{}", threads)) + .arg("--sparse") + .arg("--min-af") + .arg(&format!("{}", min_aligned_threshold * 100.0)) + .arg("-l") + .arg(tf.path().to_str().unwrap()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()); + debug!("Running skani command: {:?}", &cmd); + + // Parse the distances + let mut process = cmd + .spawn() + .unwrap_or_else(|_| panic!("Failed to spawn {}", "skani")); + let stdout = process.stdout.as_mut().unwrap(); + let stdout_reader = BufReader::new(stdout); + + let mut rdr = csv::ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(true) + .from_reader(stdout_reader); + + let mut distances = SortedPairGenomeDistanceCache::new(); + + // Returning contig names from Ref_name and Query_name + // Ref_file Query_file ANI Align_fraction_ref Align_fraction_query Ref_name Query_name + for record_res in rdr.records() { + match record_res { + Ok(record) => { + debug!("Found skani record {:?}", record); + + // Get index of contigs within contig_names + let contig_id1 = contig_names + .iter() + .position(|&x| x == &record[5]) + .unwrap_or_else(|| { + panic!("Failed to find contig name in contig_names: {}", &record[5]) + }); + let contig_id2 = contig_names + .iter() + .position(|&x| x == &record[6]) + .unwrap_or_else(|| { + panic!("Failed to find contig name in contig_names: {}", &record[6]) + }); + + let ani: f32 = record[2].parse().unwrap_or_else(|_| { + panic!("Failed to convert skani ANI to float value: {}", &record[2]) + }); + trace!("Found ANI {}", ani); + if ani >= threshold { + trace!("Accepting ANI since it passed threshold"); + distances.insert((contig_id1, contig_id2), Some(ani)) + } + } + Err(e) => { + error!("Error parsing skani output: {}", e); + std::process::exit(1); + } + } + } + finish_command_safely(process, "skani"); + debug!("Found skani distances: {:#?}", distances); + info!("Finished skani triangle."); + + distances +} + pub struct SkaniClusterer { pub threshold: f32, pub min_aligned_threshold: f32, diff --git a/tests/data/contigs/contigs.fna b/tests/data/contigs/contigs.fna new file mode 100644 index 0000000..e4e9199 --- /dev/null +++ b/tests/data/contigs/contigs.fna @@ -0,0 +1,300 @@ +>73.20110600_S2D.10_contig_13024 +ACAAGGCAGCGTTATATGCTCGGCGTGTGGCGCACAGCGTCACTATGTCATACGAAGCGT +GATTTATGGGACGGACACGCCGCCGATGGCCCCTGTGGGGATGAAATATGATGCGTGGGA +CTTCGCGAAGACCAGCGAATGTCTGAACTGTAAAGAACACGCACGGCACAGCGTCCATCT +CGACGAGTACAAGATGAACGTGATCTGTGACGAGTGCGGTTTTTCGCGAGTGTTGAAGCT +CGATTATCTTGTCTTCCCGCATGTACTCTCGCACGAATAAGCCCCCTGAAGATGAGCACG +CGGTTGTACGTCTTTCACGCAGGGCAGTGTGATCCGAAGAAGTGCACCGCCAAAAAACTT +GCCAAATTTCATCTCGTAACTCTTTTCTCGACGCCGCGACCCTTGTGGGGCTCTCATACG +GTGCTCCTTGATCCATCCGCACCCGCTCCGTTATGCCGCGCGGATGTTGTGACGGCGCTC +GTTGCGGTTGACTGCTCCTGGAAACAACGTGAGCGCGTGTTCGAGTCGTTCCGGGCGCGG +ACGAGTCGACGACTTCCGTATCTTTTGGCAGCGAATCCTGTGAACTATGGGCGGCCGTTC +GAGCTCACAACCGTAGAAGCGCTGGCGGCTGCGCTCTACGTCCTTGGGGAATCACGACGC +GCAAGAGAACTCTTGAACAAGTTCAAATGGGGTTCGACATTTTTGAAGTTGAACGCGGCA +CCCCTCGATGAATACGCCGGCGCGAACACCGTCAGCGACGTGCTCGCACTTGAACGCGCC +TATATAGAACGCTAAATGACTTCTTATCCGCGTTCTTGCGGTTAACGCAAACAGAAAGAT +TAAATGCTGTTAGGCGTATGGTTCTACCGACGTGAGAGGTACATCTGATGGCACGTTTTC +CAGAAGCTGAAAAAAGGTTGCTTAATATCAAAATCTGCATGCGGTGTAACGCCCGCAACC +CGCCACGGGCGACTCGATGCAGGAAATGTGGTTATAAAAACCTGAGGGTCAAGTCGAAGG +AACGAAGGGGTTAGTCGATTGAGCATCGAGCAGCATCTCGAAGAGGCGATCAGCAGAGAA +GGAACGATACACCTAACGCTTATCGATCCCGACTCACAGCCGCCAGAGGAAGCCGCACGA +ATTGCACGGGGCGCCACGGCTGGAGGCACCACTGCGGTTCTTGTGGGAGGTTCTGTAGGC +GTCGGCGGACGTGGCATAGAGTGCACCGTATCGGAAATCAAAGGTAGCACGGGTGTCCCT +GCTATCATCTTCCCTTCAGACGTAGGGAGTGTGTGTACGACGGCCGACGCGATGTTTTTC +TTGTGCCTTATGAACTCTAGGAGCACGGCGTATCTCAGTACGAACCAAGCACTCGGCGCC +GTGTTTGTCAAGAGCTACGGAATAGAGCCCATTCCAGTAGCGTACATCATCGTCGAACCG +GGGGGAACTGTTGGCTGGATAGGTGATGCGCGTTTGATTCCGCGCAACAAACCCGAGATC +GCCGCCGCATACGCGCTCGCAGGAAAATATCTTGGGATGCGATGGGTCTACCTTGAGGCA +GGTTCTGGTGTCCCCGATACCGTCCCCCCCGAAATGGTGAAGGCGGTAAAAGCATCCATA +GACGATACAAGGCTCATCGTAGGCGGCGGGATACGCACCGACAAACAGGCCGAGTTGCTA +GCGAAGGCGGGAGCGGATGTCATCGTAACGGGCACGGCAATCGAAGAGGACGTTGACATC +GAAAACAAGATCGCAAGCTTTGTNNAGATCCCAAAATCGCTTGCCGCTTTGTACCTTGAT +AAGCCGCATGTCGTGTTCTTTGCACTTGCGGGCAGTTATGCGCAGCCTTCCTGATTTCGG +CAGGGGGAGTCCAAACGTGCACTTCGGATACCCGCTGCACCCTACAAACCGTCCGCCGCG +TTTTGAGCGACGCACGATGAGATCTGAGCTGCACTGAGGGCATGTCCCGACGGTCGCGTC +TCCGCGTAGCCCTTCTTGCAGCGATTTGCTAATCTCTTCGAGTTGCTCATCGAGCGTGCT +CAAGACCTTATCGAGAATATCCGTTGAGCTGACCACCACAAACTCCTTTGACAGCGTTCC +CGTTTTGATTTTGTCCATCTCTTCTTCGAGGAGTCGGGTCATCTCAGGTTTCGTAATTGT +GGGGGCGTAGCGCTCGAGCGATTGTGTGACGCTGCACGCGATACCGGTGGGACGAAGGGG +ATTCACGTCGACATAGCCCCGCTGAGCGAGCTTCTTGATGATCTCGTGGCGCGTGCTTTT +CGTCCCCAAGCCGAGCTCATCCATCTTCTTTATTAACTGGCCTTGTCCCCATCGCTGTGG +CGGCTGCGTCTCCTTCTCCACAAGCTGTTTGTTCGTGACAGTGACAACTTCGCCTTCGTG +AAGGTCAGGGAGGTATCGCTCTTGCGTCTGTGCATACGGGTAGTGATAGCGCCAGCCTCG +ATATGAAAGTTCTAGCCCAGTCGCTCTGAGCTGCTCCCCTCGGGAGTCCAGCGTGACCTT +TTTCGTGCACCATCGCGCCGGGTCGGAGAGCGTCGCAAAGAACCGTCGCACAACGAGCTC +GTACACCTTCCAGCTTTGTGGCTCGAGATCAGCTTGTGTGGCGCGCATCGTGGGATAGAT +CGGCGGATGGTCGGTCGTTTGCTTTTTACCGCGGGTGGCGACCAGCTGCTTCTTTTCTAG +CAGCATGCGAGCATTCTCAGCGAATGGACCGCTTCTGAACATGCTCACTGAGAGGCGGAG +ATCGAGCTCATTTGGATAGACTGTGTTGTCGGTGCGCGGATAGCTTATCCACCCTGCCGT +GTAGAGGCGCTCCGCCGTTTCCATAGCGCGGCTTGCTGAGATCCCAAGCGCGCTTGCTGC +AGCGAGAAACTCGGTCGTATTGAAGGGTTTCGGGGGGGCGTCCACTCGTTCGGTTTTGGC +AAACGTAACGACGGTGGCGGTGTCGCCGATGTTTTGTATCGCCGCCGTTGCGCTCGCTTT +GTTTCGAAATCGGTCTGCTTGGTGCTTTGCGTGAAACGTTTCGCTGCTCGTGCTGAGATC +AATGTATATTTCCCAGTATGGCTCTGACACGAATTGTGCAATTTTTCGCTCGCGATCAAC +GATCAGGGCGAGCGTTGGACTTTGAACGCGTCCCGCAGATAAGAAACGGTTGCCAAGTCG +CTTCGCAGACGTCGATATGAAGCGCGTGAGCGCAGCACCCCAGATGAGGTCCACTGCCTG +CCGTGCCTCTCCAGAACGAGCAAGGTTGAGGTCAATTCTGGTTGGCGATCGAAACGCGCG +CTTGACTTCAATGGGGGTCATTGCACTAAACCGTACGCGATCCACCCCGACGTCCTTGAC +CGACTTGACGATCGTGCACGCTTCTAAACCGATGAGTTCGCCTTCCGTGTCAAAGTCAGT +CGCAATGGTCACCCGGTCCGCGTCTTTGGCGGCCCGTTTGAGTCCTGACACGATGGTCTT +GTGCGTTGGAACGGTGACGACATCAGCCCGTATTAGGTCGTGAAGGTTTGTCTTCCAGTC +ACGGTATTCATCTGGGAAGTCGAGCTTGACGATGTGCCCCCTTAGGCCTAGCACGACCGT +ACTGTTGAAGCGATACGCGTTAGCGCCGGCGACTTTGGTTGCAGATGCTGCGCCGCCGGA +GAGAATCTCGGCGATCCGCTTCGCCGCGATATCCTTCTCAGTTACGATGAGGTGCATCAG +AGTTGCTCCCGAATAAGACGATTGGTAAGGGTGGGATCAGCCCGTCCGCGTGTCTTCTTC +ATGACCTGACCGACGAGGAAGTTAAGCGCGCCTTTCTTGCCGCTGTGATAGTCGTCAACG +GCCTGTGGGTTCTCCTCAACTGCTTGCTTAACGGCAGCGGTGATCTGTTCGACTGGGACA +GCTTGAAGCGCCTGTGCGTCGATGATCTCGCAAGGGGTCGCTCCGGTGTCGAGGACAGTT +CGTATGACCTCGACGGCTGCGTTTTCGGTAATGGTTCCTTGGTTCACGTTGGTGATGACG +TCGATTAAGAAGTTGGGGCTGAACGCATCAAGGGTAAGCCCGCGGTAGTTTAGCTCGCCC +TGGAGCACGTCAACGACCCAGCTTGCCGCCGCCGACGGGTCGACTCGTGCGGCGACCGTT +TCGAAGTAGTCGGCCAATGGCTTCGACGCGGTGAGCACTTTGGCAAGGTAATCTGAAATT +CTGTACTGCTCGATGAAGCGGTGACGTTTTGCATCCGGGAGTTCGACTTGCACTTGCTGC +GGGAACTGTTCGGCCACCGGTTCGGGATCGACGCGCGGAAGATCAGGTTCGGGGAAATAC +CGATAATCGTGCTCTTGCTCTTTTGAACGGAGTGAAATTGTGATGCCGCGCGCTTCGTCG +AAGTGCCGCGTTTCTTGCGTGATATGGATGCCTCTTCGAAGCAAGTTTCGCTGGCGGGTG +ATTTCAAAAAGGAGCGCTTTTTCCACGCCTTTGTATGAGGAGATGTTTTTGACTTCGGCC +CGTTGCCCGCCGGCGATCGAAATGTTAGCGTCGACGCGAAGCGATCCTTCGAGGTTGCCG +TCGAAGACGTCGAGATATTCCAGGATGTTGCGCAACTTGTTCAGGAAGCGACGTGCTTCC +TTAGGGGCGCGAAGGTCGGGTTCAGTAACCACTTCGAGCAGTGGCACGCCGGATCGATTG +TAATCTATCAGGGAATACTGCGCCTTTTCGATGGTTCCGCGGTACGTGAGCTTCCCCGGA +TCTTCTTCTAAGTGAACGCGTCTGATCCGAATCCTCCGTTCCTGCCCTTCGCTGTCGACC +ATCATGTAGCCGTTTGTGCCGAGGGGAAAGTCATACTGGCTGATTTGGAATCCTTTCGGC +AGGTCGGGGTAGTAGTAGTTCTTCCGGTAGAACTGCGTCTGCTGCACCGAGCAGCTGAGC +GCTTTCGCGACCCGTTCGGCGTAAACGATTGCCATTTCGTTGACGACCGGCAGACTCCCC +GGAAGCCCAAGACAGATGGGACACGTGTGGGTGTTTGGCTCTGAGTCTTGATACCGCGTT +GAACATCCGCAAAACAGCTTCGATTGCGTAGTTAGCTGTACGTGTGCCTCAAGCCCTATG +ATAACATCGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCGCTGATGG +GTTGCAGCACCTCGCACTGACTGGGAAGTCTGGAGGGTAGATCTATCAACCACGAGATCG +GACGGCACTGCAGCGAACCAAGTGAAAAGGTTTATCTGATAATCAGCAGCTTGTATATGG +GAGGAGTCAATATGAGCGCGTATAAATGGGCCCTTCCACTGCTGGTCACCGTCTTGTTGC +TCGCTTCGGCCTGCACCGTGGCGAGCGCGCAAAGCACACAAGGAGGCGTAGTACAGATCA +GCTCGCCTCCAAAGAACGGCAAGGCCGGACCTACGGAGCGCGTAAGCGGTAAAGCTCAGC +TTTCAAAGGGGTATTCGCTTTGGATCGTGCCGTACGATCCTACCGCGGGCAAGTACTATC +CTCAAGCGCCCTCGCTCACGGTGCGGAGTGATGCGACCTGGTCGTCTCGTCTCTCGGTAG +GCACCATATTTGGCGTCGGCAAAACGTTCCAGATCGATGCGGTCGTCGCGGATGAAAGCG +CCAGTACTGCGCTGAGCGGTTACGCCGCGCAGCGAATGGCAATAACGAAACTGCCGCCAG +GTGCGCAGGCTGTCGATGCGGTCACCGTAAAGCGCGTGGCAGCAGACGTCACGGGCACGC +CATCGGCAAGCCCCTCGCCCAGCGCCGTGCCACGCACGGCATCAACCGCTGGATCGAGCG +CGACCGCAGGGGCGAGTCCTTCTATAGCGAACGTCGCTCAAGCGAGTGCAGCTGCAAGTA +GCCAAAACCCGCTTCCCGGTTTTGAGGCGTTGTACGCACTTGGTGCCATTGCTGCCGTGT +TTCTGGCATTACGTGTCACCCGCCGTACCTGAGGCTCTGGTTGCGATGGATTGCCAATCT +TAAAACGCCTGCTTCAGGACAGATTAGCTGGAGGGGGTCAATATACAGTTGTCACACGAA +TGCTATCGATGACTTTTCCTCATTCCGTCGAAAACATGTGGGGCAGCTTGCGGCATTAAC +GAGT +>73.20110600_S2D.10_contig_13024_2 +ACAAGGCAGCGTTATATGCTCGGCGTGTGGCGCACAGCGTCACTATGTCATACGAAGCGT +GATTTATGGGACGGACACGCCGCCGATGGCCCCTGTGGGGATGAAATATGATGCGTGGGA +CTTCGCGAAGACCAGCGAATGTCTGAACTGTAAAGAACACGCACGGCACAGCGTCCATCT +CGACGAGTACAAGATGAACGTGATCTGTGACGAGTGCGGTTTTTCGCGAGTGTTGAAGCT +CGATTATCTTGTCTTCCCGCATGTACTCTCGCACGAATAAGCCCCCTGAAGATGAGCACG +CGGTTGTACGTCTTTCACGCAGGGCAGTGTGATCCGAAGAAGTGCACCGCCAAAAAACTT +GCCAAATTTCATCTCGTAACTCTTTTCTCGACGCCGCGACCCTTGTGGGGCTCTCATACG +GTGCTCCTTGATCCATCCGCACCCGCTCCGTTATGCCGCGCGGATGTTGTGACGGCGCTC +GTTGCGGTTGACTGCTCCTGGAAACAACGTGAGCGCGTGTTCGAGTCGTTCCGGGCGCGG +ACGAGTCGACGACTTCCGTATCTTTTGGCAGCGAATCCTGTGAACTATGGGCGGCCGTTC +GAGCTCACAACCGTAGAAGCGCTGGCGGCTGCGCTCTACGTCCTTGGGGAATCACGACGC +GCAAGAGAACTCTTGAACAAGTTCAAATGGGGTTCGACATTTTTGAAGTTGAACGCGGCA +CCCCTCGATGAATACGCCGGCGCGAACACCGTCAGCGACGTGCTCGCACTTGAACGCGCC +TATATAGAACGCTAAATGACTTCTTATCCGCGTTCTTGCGGTTAACGCAAACAGAAAGAT +TAAATGCTGTTAGGCGTATGGTTCTACCGACGTGAGAGGTACATCTGATGGCACGTTTTC +CAGAAGCTGAAAAAAGGTTGCTTAATATCAAAATCTGCATGCGGTGTAACGCCCGCAACC +CGCCACGGGCGACTCGATGCAGGAAATGTGGTTATAAAAACCTGAGGGTCAAGTCGAAGG +AACGAAGGGGTTAGTCGATTGAGCATCGAGCAGCATCTCGAAGAGGCGATCAGCAGAGAA +GGAACGATACACCTAACGCTTATCGATCCCGACTCACAGCCGCCAGAGGAAGCCGCACGA +ATTGCACGGGGCGCCACGGCTGGAGGCACCACTGCGGTTCTTGTGGGAGGTTCTGTAGGC +GTCGGCGGACGTGGCATAGAGTGCACCGTATCGGAAATCAAAGGTAGCACGGGTGTCCCT +GCTATCATCTTCCCTTCAGACGTAGGGAGTGTGTGTACGACGGCCGACGCGATGTTTTTC +TTGTGCCTTATGAACTCTAGGAGCACGGCGTATCTCAGTACGAACCAAGCACTCGGCGCC +GTGTTTGTCAAGAGCTACGGAATAGAGCCCATTCCAGTAGCGTACATCATCGTCGAACCG +GGGGGAACTGTTGGCTGGATAGGTGATGCGCGTTTGATTCCGCGCAACAAACCCGAGATC +GCCGCCGCATACGCGCTCGCAGGAAAATATCTTGGGATGCGATGGGTCTACCTTGAGGCA +GGTTCTGGTGTCCCCGATACCGTCCCCCCCGAAATGGTGAAGGCGGTAAAAGCATCCATA +GACGATACAAGGCTCATCGTAGGCGGCGGGATACGCACCGACAAACAGGCCGAGTTGCTA +GCGAAGGCGGGAGCGGATGTCATCGTAACGGGCACGGCAATCGAAGAGGACGTTGACATC +GAAAACAAGATCGCAAGCTTTGTNNAGATCCCAAAATCGCTTGCCGCTTTGTACCTTGAT +AAGCCGCATGTCGTGTTCTTTGCACTTGCGGGCAGTTATGCGCAGCCTTCCTGATTTCGG +CAGGGGGAGTCCAAACGTGCACTTCGGATACCCGCTGCACCCTACAAACCGTCCGCCGCG +TTTTGAGCGACGCACGATGAGATCTGAGCTGCACTGAGGGCATGTCCCGACGGTCGCGTC +TCCGCGTAGCCCTTCTTGCAGCGATTTGCTAATCTCTTCGAGTTGCTCATCGAGCGTGCT +CAAGACCTTATCGAGAATATCCGTTGAGCTGACCACCACAAACTCCTTTGACAGCGTTCC +CGTTTTGATTTTGTCCATCTCTTCTTCGAGGAGTCGGGTCATCTCAGGTTTCGTAATTGT +GGGGGCGTAGCGCTCGAGCGATTGTGTGACGCTGCACGCGATACCGGTGGGACGAAGGGG +ATTCACGTCGACATAGCCCCGCTGAGCGAGCTTCTTGATGATCTCGTGGCGCGTGCTTTT +CGTCCCCAAGCCGAGCTCATCCATCTTCTTTATTAACTGGCCTTGTCCCCATCGCTGTGG +CGGCTGCGTCTCCTTCTCCACAAGCTGTTTGTTCGTGACAGTGACAACTTCGCCTTCGTG +AAGGTCAGGGAGGTATCGCTCTTGCGTCTGTGCATACGGGTAGTGATAGCGCCAGCCTCG +ATATGAAAGTTCTAGCCCAGTCGCTCTGAGCTGCTCCCCTCGGGAGTCCAGCGTGACCTT +TTTCGTGCACCATCGCGCCGGGTCGGAGAGCGTCGCAAAGAACCGTCGCACAACGAGCTC +GTACACCTTCCAGCTTTGTGGCTCGAGATCAGCTTGTGTGGCGCGCATCGTGGGATAGAT +CGGCGGATGGTCGGTCGTTTGCTTTTTACCGCGGGTGGCGACCAGCTGCTTCTTTTCTAG +CAGCATGCGAGCATTCTCAGCGAATGGACCGCTTCTGAACATGCTCACTGAGAGGCGGAG +ATCGAGCTCATTTGGATAGACTGTGTTGTCGGTGCGCGGATAGCTTATCCACCCTGCCGT +GTAGAGGCGCTCCGCCGTTTCCATAGCGCGGCTTGCTGAGATCCCAAGCGCGCTTGCTGC +AGCGAGAAACTCGGTCGTATTGAAGGGTTTCGGGGGGGCGTCCACTCGTTCGGTTTTGGC +AAACGTAACGACGGTGGCGGTGTCGCCGATGTTTTGTATCGCCGCCGTTGCGCTCGCTTT +GTTTCGAAATCGGTCTGCTTGGTGCTTTGCGTGAAACGTTTCGCTGCTCGTGCTGAGATC +AATGTATATTTCCCAGTATGGCTCTGACACGAATTGTGCAATTTTTCGCTCGCGATCAAC +GATCAGGGCGAGCGTTGGACTTTGAACGCGTCCCGCAGATAAGAAACGGTTGCCAAGTCG +CTTCGCAGACGTCGATATGAAGCGCGTGAGCGCAGCACCCCAGATGAGGTCCACTGCCTG +CCGTGCCTCTCCAGAACGAGCAAGGTTGAGGTCAATTCTGGTTGGCGATCGAAACGCGCG +CTTGACTTCAATGGGGGTCATTGCACTAAACCGTACGCGATCCACCCCGACGTCCTTGAC +CGACTTGACGATCGTGCACGCTTCTAAACCGATGAGTTCGCCTTCCGTGTCAAAGTCAGT +CGCAATGGTCACCCGGTCCGCGTCTTTGGCGGCCCGTTTGAGTCCTGACACGATGGTCTT +GTGCGTTGGAACGGTGACGACATCAGCCCGTATTAGGTCGTGAAGGTTTGTCTTCCAGTC +ACGGTATTCATCTGGGAAGTCGAGCTTGACGATGTGCCCCCTTAGGCCTAGCACGACCGT +ACTGTTGAAGCGATACGCGTTAGCGCCGGCGACTTTGGTTGCAGATGCTGCGCCGCCGGA +GAGAATCTCGGCGATCCGCTTCGCCGCGATATCCTTCTCAGTTACGATGAGGTGCATCAG +AGTTGCTCCCGAATAAGACGATTGGTAAGGGTGGGATCAGCCCGTCCGCGTGTCTTCTTC +ATGACCTGACCGACGAGGAAGTTAAGCGCGCCTTTCTTGCCGCTGTGATAGTCGTCAACG +GCCTGTGGGTTCTCCTCAACTGCTTGCTTAACGGCAGCGGTGATCTGTTCGACTGGGACA +GCTTGAAGCGCCTGTGCGTCGATGATCTCGCAAGGGGTCGCTCCGGTGTCGAGGACAGTT +CGTATGACCTCGACGGCTGCGTTTTCGGTAATGGTTCCTTGGTTCACGTTGGTGATGACG +TCGATTAAGAAGTTGGGGCTGAACGCATCAAGGGTAAGCCCGCGGTAGTTTAGCTCGCCC +TGGAGCACGTCAACGACCCAGCTTGCCGCCGCCGACGGGTCGACTCGTGCGGCGACCGTT +TCGAAGTAGTCGGCCAATGGCTTCGACGCGGTGAGCACTTTGGCAAGGTAATCTGAAATT +CTGTACTGCTCGATGAAGCGGTGACGTTTTGCATCCGGGAGTTCGACTTGCACTTGCTGC +GGGAACTGTTCGGCCACCGGTTCGGGATCGACGCGCGGAAGATCAGGTTCGGGGAAATAC +CGATAATCGTGCTCTTGCTCTTTTGAACGGAGTGAAATTGTGATGCCGCGCGCTTCGTCG +AAGTGCCGCGTTTCTTGCGTGATATGGATGCCTCTTCGAAGCAAGTTTCGCTGGCGGGTG +ATTTCAAAAAGGAGCGCTTTTTCCACGCCTTTGTATGAGGAGATGTTTTTGACTTCGGCC +CGTTGCCCGCCGGCGATCGAAATGTTAGCGTCGACGCGAAGCGATCCTTCGAGGTTGCCG +TCGAAGACGTCGAGATATTCCAGGATGTTGCGCAACTTGTTCAGGAAGCGACGTGCTTCC +TTAGGGGCGCGAAGGTCGGGTTCAGTAACCACTTCGAGCAGTGGCACGCCGGATCGATTG +TAATCTATCAGGGAATACTGCGCCTTTTCGATGGTTCCGCGGTACGTGAGCTTCCCCGGA +TCTTCTTCTAAGTGAACGCGTCTGATCCGAATCCTCCGTTCCTGCCCTTCGCTGTCGACC +ATCATGTAGCCGTTTGTGCCGAGGGGAAAGTCATACTGGCTGATTTGGAATCCTTTCGGC +AGGTCGGGGTAGTAGTAGTTCTTCCGGTAGAACTGCGTCTGCTGCACCGAGCAGCTGAGC +GCTTTCGCGACCCGTTCGGCGTAAACGATTGCCATTTCGTTGACGACCGGCAGACTCCCC +GGAAGCCCAAGACAGATGGGACACGTGTGGGTGTTTGGCTCTGAGTCTTGATACCGCGTT +GAACATCCGCAAAACAGCTTCGATTGCGTAGTTAGCTGTACGTGTGCCTCAAGCCCTATG +ATAACATCGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCGCTGATGG +GTTGCAGCACCTCGCACTGACTGGGAAGTCTGGAGGGTAGATCTATCAACCACGAGATCG +GACGGCACTGCAGCGAACCAAGTGAAAAGGTTTATCTGATAATCAGCAGCTTGTATATGG +GAGGAGTCAATATGAGCGCGTATAAATGGGCCCTTCCACTGCTGGTCACCGTCTTGTTGC +TCGCTTCGGCCTGCACCGTGGCGAGCGCGCAAAGCACACAAGGAGGCGTAGTACAGATCA +GCTCGCCTCCAAAGAACGGCAAGGCCGGACCTACGGAGCGCGTAAGCGGTAAAGCTCAGC +TTTCAAAGGGGTATTCGCTTTGGATCGTGCCGTACGATCCTACCGCGGGCAAGTACTATC +CTCAAGCGCCCTCGCTCACGGTGCGGAGTGATGCGACCTGGTCGTCTCGTCTCTCGGTAG +GCACCATATTTGGCGTCGGCAAAACGTTCCAGATCGATGCGGTCGTCGCGGATGAAAGCG +CCAGTACTGCGCTGAGCGGTTACGCCGCGCAGCGAATGGCAATAACGAAACTGCCGCCAG +GTGCGCAGGCTGTCGATGCGGTCACCGTAAAGCGCGTGGCAGCAGACGTCACGGGCACGC +CATCGGCAAGCCCCTCGCCCAGCGCCGTGCCACGCACGGCATCAACCGCTGGATCGAGCG +CGACCGCAGGGGCGAGTCCTTCTATAGCGAACGTCGCTCAAGCGAGTGCAGCTGCAAGTA +GCCAAAACCCGCTTCCCGGTTTTGAGGCGTTGTACGCACTTGGTGCCATTGCTGCCGTGT +TTCTGGCATTACGTGTCACCCGCCGTACCTGAGGCTCTGGTTGCGATGGATTGCCAATCT +TAAAACGCCTGCTTCAGGACAGATTAGCTGGAGGGGGTCAATATACAGTTGTCACACGAA +TGCTATCGATGACTTTTCCTCATTCCGTCGAAAACATGTGGGGCAGCTTGCGGCATTAAC +GAGT +>73.20110600_S2D.10_contig_50844 +ACTCCCTCGTTTCATTGGTGGTCATTCGCGCGCTGCAGGCGGACGAGAACCCCCTATACC +CCGAACCAACTTCTAAGCCACGCCTGGTGTATTTCGAGGGGCAATTTTTCCATTGGCGAT +GTGCTGCTCCGCAATTTACGTTCCGTTTCTTCAAGCTCGGTTTTCGCCCACTCGTCACTA +TCAGTGACATCGATCCTGAAGAATGCGCGAGTGTCAAGGAAGTTCTCAAATAAGATTAGA +TCGTCGCNNTTGTTCTTCGATTGGCGTGACAACCAGTAAAGCTTGACAAGGGCAGGCTTG +TCTATTCGCCACGGACAAGGGACCGCTACTCGCCACGGATCGATCTGCGAAAAGTCGATC +TGCGATGGAAGGAGACGCCCCCATTCCGATTCCGCGGCCTGCGCTTTAATCCCATCGGAT +TCCCATGCTCGTACTTTCTCTATAGCAGCAACCCAGATAACCACAAGCTTGTTGAGGAAG +TCGGCTTTTTCTGTGTTGTCTGCGCCCGGAAAGAGGTCGAGCCTGTCATAATCTTCCCTG +ACGAATTTCATTTTGATATCCTCAGAAGCAATACGCTGTGTCGACAGCGGACCAATGATG +GAGGGGAGCGTATAGACCGCGAAGGGGCGTTCGTCGAGCGACGCGGCGGCACGCTTTATA +GCCTCGATCATTTCCGTTGTAAGAGCCCGCGGCTTTGTAAGGCCGGAATAACGTACCGCC +CATACCGCGGCGGCTTGCTCAACCGTTTCCGATAACCATAGTGAACAAGGGCGACCCGGA +CGCACTTTTTCGCCGTTCTCCTCGAGCTTCTGCCTTTTTTCCTGTCTTTTCTTCGCTGCG +TCGCTTCCTTTTTTCGGGCGGGCTCCGACGAATTTCTTTAGTCGAGGCTTGTTGCTAGGC +TGGTGCTGCTCGTAATTGGGGATAATATGTTCACTTGCCCATAGTCGAAGCGTTGCAATT +GGGACAGGAACTCCTAGCGTTTCGAGCTTCCTGACAAGCTGGCTCGAATTTATCTTACTT +GTCGCAGCAGCCATAATTTTCTAATAACTGGTTTCAAATTATTGACCAATACTTAATTAA +TTTCCACGTATAAGACAGTTTGTGGATGTTTTCAGAGGTGCAGCAATGAAAATGGAACCG +GAAAAAAACGTGCCTATGCTCTATCGCGGAGCTCGGGCGCTGACCGACGATCTAGATAAC +GCGGTCCTAGACTACGTTTTGCGTGCAGGAGATGCATCGGTGCGTCGCATCACCACAGAG +ATTGAAAGGCCCTACTCAACGGTTATCGTCCGTTGTCTAAAGCTGGAGGCGACAGGATTT +CTGCACAATATGTGGCCCGGCAACATCAGGCGGCGGCTCCACGCGGTGAACCGGCAAGGG +CGTACAAATGTGCGACGACATTCAATTGAAGGGGAAAGGCCGTTGTTGCCAAAATCTGAA +GAATGNNAAAATCATCAGCCACGAGATCACGATGCGGGCGGCCGCACGGTCGATGGGCAT +TGACATTTCCACGGTATCACGGCACATGAGCAATTGCGTTCCGAAACGCGTAGGCGAGCT +CGTTAAACCTGAACCGACAGAGGTACACGACCTAAACTGTGTCAATATCCTCGTATCATC +ACATCAGGACCTGCGAAACATCTACACCGAAGCAAGAGAGAAGGGCGACCTAAGGAACTC +ACTGAAGACGCTCGAGGTTGAGATTAAGCAAGTGCACGAGATTGCGACGCTCACAGGCCA +AACGCACGACGGCCCGCAGTTTAACTTATTGATGCTCCCGGAGTATGTCGAGTTCAAGCA +AACAGTTCTTGCCGCAGTCAGTGACTATCCTGAGGTTAGGGCCCGGATCAGCGCGGCGCT +CATGTGCACGGAGCCGGAACCAGGACCAGAAGAAGATGCTAGCGACGAACCCGTTTTTTA +GTGACCTCGCGGCTGCCCTAGACCCTGTTGACTTCGCGCAATCGCTCGGCATCGACCCCG +ATCCATGGCAGGAGGACATTCTCAGATCTGACTCCAACCGCATAATCCTCAACTGCGCGA +GACAAACTGGCAAAAGCAGCGTCGTCGCCATAATTGCCCTCCATCACGCATTGTATCATC +CGAAAGCGATGGTTATCATCATCAGCCATACGCTCCAGCAAGCGGCCGAGACGTTTCGCA +AGGTGCATGACTACTACCGGCAGATCGCAAAGCCGGTTCTCTCGATCATCGAGTCAGTTC +ACCGGCTCGAGCTGACAAACGGGTCTCGAATCGTCACGCTCACCGGCCAAGCGCCAGACA +GCATCCGGGGCTTTTCTAACGTCAGTCTCCTAATTATTGACGAAGCGAGTCAGGTTCCTG +ACGAGGCATATTATGCAGCTCGGCCAATGGTCGCGGTCAACGCCGGGAGAATTATATTAT +TATCAACTCCTCACGGCCAAAGAGGGTTTTATTGGCAGGCTTGGGCGAACGAGGACGATT +GGGAGAAGGTCGAAATCAACGCCGACCAGTGTCCTCGATTAACAAAAGCCTTTATCGTGG +AGGAAAAGGCGCTATATCCGTCATGGCTCTTTAGGCAGGAGTATTACAATGAATTCGCCG +AGGGAGTCACGTCAGTATTCCGGACCGAGGACATCGATGCGGCATT +>73.20110600_S2D.10_contig_37820 +ACCCAGCCAAAATGAGTCTAGGTAACACTAATCAGGTTCTACGACCTGTTAGCACCCTAC +CTTCACGGGAATCCGCACGTTTAAGCCGTCACGAAGATCTTGCGTATTTGTTTGTCGTTG +CGGGCTGTTCGTGGTTCTACTGAAAACTCTCGGGAGGGGCGTAAGCGCATGCGGAGATTA +TTCTGAACTTACCAATTTCTGTTTTGAAGAAGTTTGTATCTGGATCAATGAGGCAGAAAT +CTTGAAATTGCGATTTGCTATTTGCTATTTGGTGACTCTTAAAGCGTGTAGCGAAGGCGA +AAAGAGACGGTCCGCGAACATAAGGATAGCGAGTTTTGTTGTGTGCCTTTACCACATTCG +TCGCTGCGGCGAAATGTTCAACGTCGCATTGGGCGAGTGAAAAATGAGTGTATATCACAT +CAAGGGCGTTTTGTCTCGGCTTGTTATTTAAGCTGCTTTTTCCCGCAGCAAGACGCCGAG +TTTCGGAGCAATTCCGGCTGGCGAGAATTGATCTCTTTGCTTCAGTATGTTCTCCTGTTG +CGCGCGGTAACGATCCAGAGGTTTCTTTTCAAGGATCTCCATAACAGCACCCTTTGCGTC +ATTGTAACTCTGGCACAGGTACCCGATGTCCCCCATCTTGTTGAAGTAATATTCCGAGAG +TGGGGTCTTTAGTGCGATAATTGGCTTGAGATGTGAAAATGCGTCCAGCATCGCAGCAGT +TCCATTTAATTCGTAAGCACTCGCTACATGAAAAAAGAGGGCATAATCGATGCGTTTACT +ATATTTGTCAAACTCCTCTTGAGTGAGGGGCGCGTTAAGTGCGGGGATATTGACAGACGT +GTGCCGCATCTCTTTGAGCTGCTCATCTCTTACATCACCAATGAGAACAAAAATTGGCCG +ATATTTTGTTCTTGTAGAGCTCACTTCATCGGCTAACTTGAAAAAAATGTCCGCTCCCTT +GCGCAAACTCGCGTGACCATAAAAACCAAAATGAACAACATCATCTTTGAACGGCTGGCT +GCCGGTAGAATCTGCAAAGACATAAGGAACGTCTATCGGCGAAATACAGTTGTGTAGACG +TGGCAAGTAATTCTTAAGTCCTTGTTCCATATGGGGCGGGNNGTCCCGGGAGTAAGTAAT +GTATTCTCTCCGTGTTACCGGCGAGAAGCCAGTGTCGATATGAAAGGAACTGAGATGTCA +GAAGCTTTGGTTGAGGCGTGCTGATTTTGTTCTGCAGATCGTGTAGGACAACAAAACAGG +CGATCTGAGAAAATCGTCTCAGCATCATTTTGATAATGACTAGATCGTTCCTATGTGCTG +CTGTACAAAACACTATTTGCCTCGCTCGATTTTTGTCGGCCAATCTGAATAAGCGCCTGT +ACAGGGCTAGCTCCTCGAGGAGCGCTGCAAAGGGCAAGCGCCGACCTCGTTGCTCTTCGC +GCAGGGCCCTATCTGGAAGTTCGACTTCGACAAACCTGACTGTGGCCACGTGAGTATCTG +CAATCCACTGGACATGATTCAAATGTTCGCGTTCAGCACAAAATATCAGCTCCTCCTCAG +GGAAAGCCTCAGCGACTGAACAAACCAGAGCCGCATTAAATTGGGCGTGCCCAAAACCGC +GAAAATGCGGTTCGCATATAACTATCATGAGCTCTCTATATCTTTAATCAGTTGATCTTA +TCGATTTCGTTTTTTCGTTGACTGCGCTTCGTGATAATGGTGAGGTTCATTTCCGCATAC +AAGCGCTATTCCAGCCCTTGAGCTTTTCAGATGCGCAGACATACCTTGATTTGCCAATAT +ATATTTAACGTTGCAGGGAAGCTGCATCTGTCAGGAAGCTTGACTGTCAGATTATCTCAG +TTCACAAGCCTTCGCACCTGCCTGAGAATCCAGTCTTACACAATTGCCAGGATCAAACGT +CTAACTCGCTTCAAAAAAGCGTCAGAAAAAGCGATCAGCCCGGATCTCTTCAGTTCCGAT +GCGTAAATGGATGACAATGACCTTGTTTGGTTTCCAAACGAGGTGGTTAGCACGCTCGTA +GAGGTAGTCTCACTTTTGAACAAGCTCGTCGTACGGTTCATCGCTCAACAAACCATTGAA +CCTCATGACAGTTGCGTTTTTATTCGTAAAGAGCACAGCAAAAATCCTCCATCAGTGTAG +CGCAACACGCCTGACATTCAGTCCACGAGTATTCTCTGTTGATCAATTCTAAGCTTTCTC +GTTATTCCTTTACTACTTCACACCATATGTTTTCGTTGAAGGTTTTCGAAGAGAAGATCC +TAGAAAGAATAAAGACCGCGGTTCTCGGTATCCGATTGTTCCACAAGTTTATACCAACTG +TCGCATTTATCTTGGTCATCTCTTTGTAGCTTCGGATCTTTCCTCCACTTAAGCGCACGA +TAGTATTCACCTCTTCTTGTGAAAAACCACAATGTTGATGCGTTTTATCGAGTTGATGGT +ATTCAACAACTCCGCGTTCCCAGAGCGCCTCATTTCTTTTCGGAACACACAAAAGGACGT +TCTCTTGGCAACTCTTGATAGATTTGCTCAATAAAGTACATGGATCTTCTACGTGCTCTA +ATAACCAGTACAAAACCACGGTGTCGATTTTTGGGAAGGTAACTGCGGTTGCGTCGTTCG +TTATGATCTCAACGCCCGGCGGGTACGCATAATCGTCGTACCGTTGTGCGTCAATGCCAA +TGTAT diff --git a/tests/test_cmdline.rs b/tests/test_cmdline.rs index 87c3553..b2a16d5 100644 --- a/tests/test_cmdline.rs +++ b/tests/test_cmdline.rs @@ -368,6 +368,27 @@ mod tests { .unwrap(); } + #[test] + fn test_contig_cluster() { + Assert::main_binary() + .with_args(&[ + "cluster", + "--genome-fasta-files", + "tests/data/contigs/contigs.fna", + "--cluster-contigs", + "--output-cluster-definition", + "/dev/stdout", + ]) + .succeeds() + .stdout() + .is("\ + 73.20110600_S2D.10_contig_13024 73.20110600_S2D.10_contig_13024\n\ + 73.20110600_S2D.10_contig_13024 73.20110600_S2D.10_contig_13024_2\n\ + 73.20110600_S2D.10_contig_50844 73.20110600_S2D.10_contig_50844\n\ + 73.20110600_S2D.10_contig_37820 73.20110600_S2D.10_contig_37820\n") + .unwrap(); + } + // #[test] // fn test_fraglen() { // Assert::main_binary()