Skip to content

Commit

Permalink
cluster: Add --min-aligned-fraction.
Browse files Browse the repository at this point in the history
  • Loading branch information
wwood committed Apr 28, 2020
1 parent 64cb41a commit ce19df7
Show file tree
Hide file tree
Showing 7 changed files with 100 additions and 19 deletions.
18 changes: 16 additions & 2 deletions src/cluster_argument_parsing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ pub enum Preclusterer {
pub struct GalahClusterer<'a> {
pub genome_fasta_paths: Vec<&'a str>,
pub ani: f32,
pub min_aligned_fraction: f32,
pub preclusterer: Preclusterer,
}

Expand All @@ -31,6 +32,7 @@ pub struct GalahClustererCommandDefinition {
pub dereplication_prethreshold_ani_argument: String,
pub dereplication_quality_formula_argument: String,
pub dereplication_precluster_method_argument: String,
pub dereplication_aligned_fraction_argument: String,
}

pub fn run_cluster_subcommand(matches: &clap::ArgMatches) {
Expand All @@ -50,6 +52,7 @@ pub fn run_cluster_subcommand(matches: &clap::ArgMatches) {
dereplication_prethreshold_ani_argument: "prethreshold-ani".to_string(),
dereplication_quality_formula_argument: "quality-formula".to_string(),
dereplication_precluster_method_argument: "precluster-method".to_string(),
dereplication_aligned_fraction_argument: "min-aligned-fraction".to_string(),
})
.expect("Failed to parse galah clustering arguments correctly");

Expand Down Expand Up @@ -320,6 +323,9 @@ pub fn generate_galah_clusterer<'a>(
ani: parse_percentage(&clap_matches, &argument_definition.dereplication_ani_argument)
.expect(&format!("Failed to parse ani {:?}", clap_matches.value_of(&argument_definition.dereplication_ani_argument)))
.expect(&format!("Failed to parse ani {:?}", clap_matches.value_of(&argument_definition.dereplication_ani_argument))),
min_aligned_fraction: parse_percentage(&clap_matches, &argument_definition.dereplication_aligned_fraction_argument)
.expect(&format!("Failed to parse min-aligned-fraction {:?}", clap_matches.value_of(&argument_definition.dereplication_aligned_fraction_argument)))
.expect(&format!("Failed to parse min-aligned-fraction {:?}", clap_matches.value_of(&argument_definition.dereplication_aligned_fraction_argument))),
preclusterer: match clap_matches.value_of(&argument_definition.dereplication_precluster_method_argument).unwrap() {
"dashing" => {
crate::external_command_checker::check_for_dashing();
Expand Down Expand Up @@ -368,6 +374,7 @@ impl GalahClusterer<'_> {
pub fn cluster(&self) -> Vec<Vec<usize>> {
let genomes = &self.genome_fasta_paths;
let ani = self.ani*100.;
let fastani_min_aligned_threshold = self.min_aligned_fraction;
match self.preclusterer {
Preclusterer::Dashing {
min_ani,
Expand All @@ -379,7 +386,8 @@ impl GalahClusterer<'_> {
min_ani: min_ani,
threads: threads
},
ani
ani,
fastani_min_aligned_threshold,
)
}
Preclusterer::Finch {
Expand All @@ -394,7 +402,8 @@ impl GalahClusterer<'_> {
num_kmers: num_kmers,
kmer_length: kmer_length,
},
ani
ani,
fastani_min_aligned_threshold,
)
}
}
Expand All @@ -410,6 +419,11 @@ pub fn add_cluster_subcommand<'a>(app: clap::App<'a, 'a>) -> clap::App<'a, 'a> {
.help("Average nucleotide identity threshold for clustering")
.takes_value(true)
.default_value("99"))
.arg(Arg::with_name("min-aligned-fraction")
.long("min-aligned-fraction")
.help("Min aligned fraction of two genomes for clustering")
.takes_value(true)
.default_value("50"))
.arg(Arg::with_name("checkm-tab-table")
.long("checkm-tab-table")
.help("Output of CheckM lineage_wf/taxonomy_wf/qa with --tab_table specified")
Expand Down
6 changes: 3 additions & 3 deletions src/cluster_validation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::clusterer::calculate_fastani;

use rayon::prelude::*;

pub fn validate_clusters(clustering_file: &str, ani_threshold: f32) {
pub fn validate_clusters(clustering_file: &str, ani_threshold: f32, fastani_min_aligned_threshold: f32) {
let ani = ani_threshold*100.;

// Read cluster file
Expand All @@ -17,7 +17,7 @@ pub fn validate_clusters(clustering_file: &str, ani_threshold: f32) {
clusters.par_iter().for_each( |cluster| {
let rep = &cluster[0];
cluster.par_iter().for_each( |genome| {
let fastani_res = calculate_fastani(&rep, &genome);
let fastani_res = calculate_fastani(&rep, &genome, fastani_min_aligned_threshold);
match fastani_res {
Some(fastani) => {
if fastani >= ani {
Expand All @@ -39,7 +39,7 @@ pub fn validate_clusters(clustering_file: &str, ani_threshold: f32) {
reps.par_iter().enumerate().for_each(|(i,rep1)| {
reps.par_iter().enumerate().for_each(|(j,rep2)| {
if i<j {
let fastani_res = calculate_fastani(rep1, rep2);
let fastani_res = calculate_fastani(rep1, rep2,fastani_min_aligned_threshold);
match fastani_res {
Some(fastani) => {
if fastani < ani {
Expand Down
38 changes: 25 additions & 13 deletions src/clusterer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ pub fn cluster<P: PreclusterDistanceFinder>(
genomes: &[&str],
preclusterer: &P,
fastani_threshold: f32,
fastani_min_aligned_threshold: f32,
) -> Vec<Vec<usize>> {

assert!(fastani_threshold > 1.0);
Expand Down Expand Up @@ -55,14 +56,14 @@ pub fn cluster<P: PreclusterDistanceFinder>(
debug!("Calculating genome representatives by dashing+fastani in precluster {} ..",
precluster_id);
let (clusters, calculated_fastanis) = find_dashing_fastani_representatives(
&precluster_dashing_cache, precluster_genomes.as_slice(), fastani_threshold
&precluster_dashing_cache, precluster_genomes.as_slice(), fastani_threshold, fastani_min_aligned_threshold
);
debug!("In precluster {}, found {} genome representatives", precluster_id, clusters.len());

debug!("Assigning genomes to representatives by dashing+fastani in precluster {}..",
precluster_id);
let clusters = find_dashing_fastani_memberships(
&clusters, &precluster_dashing_cache, precluster_genomes.as_slice(), calculated_fastanis
&clusters, &precluster_dashing_cache, precluster_genomes.as_slice(), calculated_fastanis, fastani_min_aligned_threshold
);
// Indices here are within this single linkage cluster only, so
// here we map them back to their original indices, and then
Expand Down Expand Up @@ -131,7 +132,8 @@ fn partition_sketches(
fn find_dashing_fastani_representatives(
dashing_cache: &SortedPairGenomeDistanceCache,
genomes: &[&str],
fastani_threshold: f32)
fastani_threshold: f32,
fastani_min_aligned_threshold: f32)
-> (BTreeSet<usize>, SortedPairGenomeDistanceCache) {

let mut clusters_to_return: BTreeSet<usize> = BTreeSet::new();
Expand All @@ -156,7 +158,8 @@ fn find_dashing_fastani_representatives(
let fastanis = calculate_fastani_many_to_one_pairwise_stop_early(
potential_refs.iter().map(|ref_index| genomes[*ref_index]).collect::<Vec<_>>().as_slice(),
genomes[i],
fastani_threshold);
fastani_threshold,
fastani_min_aligned_threshold);
let mut is_rep = true;
for (potential_ref, fastani) in potential_refs.into_iter().zip(fastanis.iter()) {
debug!("Read in fastani {:?} from ref {} {} and genome {} {}",
Expand Down Expand Up @@ -185,11 +188,12 @@ fn find_dashing_fastani_representatives(
/// Calculate FastANI values, submitting each genome pair in parallel.
fn calculate_fastani_many_to_one_pairwise(
query_genome_paths: &[&str],
ref_genome_path: &str)
ref_genome_path: &str,
fastani_min_aligned_threshold: f32)
-> Vec<Option<f32>> {

query_genome_paths.par_iter().map(|query_genome| {
calculate_fastani(query_genome, ref_genome_path)
calculate_fastani(query_genome, ref_genome_path, fastani_min_aligned_threshold)
}).collect()
}

Expand All @@ -200,13 +204,14 @@ fn calculate_fastani_many_to_one_pairwise(
fn calculate_fastani_many_to_one_pairwise_stop_early(
query_genome_paths: &[&str],
ref_genome_path: &str,
ani_threshold: f32)
ani_threshold: f32,
fastani_min_aligned_threshold: f32)
-> Vec<Option<f32>> {

let to_return: Mutex<Vec<Option<f32>>> = Mutex::new(vec![None;query_genome_paths.len()]);

query_genome_paths.par_iter().enumerate().find_any(|(i,query_genome)| {
let ani = calculate_fastani(query_genome, ref_genome_path);
let ani = calculate_fastani(query_genome, ref_genome_path, fastani_min_aligned_threshold);
to_return.lock().unwrap()[*i] = ani;
match ani {
Some(real_ani) => real_ani >= ani_threshold,
Expand All @@ -217,12 +222,12 @@ fn calculate_fastani_many_to_one_pairwise_stop_early(
return to_return.into_inner().unwrap()
}

pub fn calculate_fastani(fasta1: &str, fasta2: &str) -> Option<f32> {
let one = calculate_fastani_one_way(fasta1,fasta2);
pub fn calculate_fastani(fasta1: &str, fasta2: &str, fastani_min_aligned_threshold: f32) -> Option<f32> {
let one = calculate_fastani_one_way(fasta1,fasta2,fastani_min_aligned_threshold);
match one {
None => return None,
Some(first) => {
let two = calculate_fastani_one_way(fasta2,fasta1);
let two = calculate_fastani_one_way(fasta2,fasta1,fastani_min_aligned_threshold);
match two {
None => return None,
Some(second) => return Some(
Expand All @@ -236,11 +241,13 @@ pub fn calculate_fastani(fasta1: &str, fasta2: &str) -> Option<f32> {
}
}

fn calculate_fastani_one_way(fasta1: &str, fasta2: &str) -> Option<f32> {
fn calculate_fastani_one_way(fasta1: &str, fasta2: &str, fastani_min_aligned_threshold: f32) -> Option<f32> {
let mut cmd = std::process::Command::new("fastANI");
cmd
.arg("-o")
.arg("/dev/stdout")
.arg("--minFraction")
.arg(&format!("{}",fastani_min_aligned_threshold))
.arg("--query")
.arg(&fasta1)
.arg("--ref")
Expand Down Expand Up @@ -322,7 +329,8 @@ fn find_dashing_fastani_memberships(
representatives: &BTreeSet<usize>,
dashing_cache: &SortedPairGenomeDistanceCache,
genomes: &[&str],
calculated_fastanis: SortedPairGenomeDistanceCache
calculated_fastanis: SortedPairGenomeDistanceCache,
fastani_min_aligned_threshold: f32,
) -> Vec<Vec<usize>> {

let mut rep_to_index = BTreeMap::new();
Expand Down Expand Up @@ -353,6 +361,7 @@ fn find_dashing_fastani_memberships(
let new_fastanis = calculate_fastani_many_to_one_pairwise(
&potential_refs.iter().map(|ref_i| genomes[**ref_i]).collect::<Vec<_>>(),
genomes[i],
fastani_min_aligned_threshold,
);
for (ref_i, ani_opt) in potential_refs.iter().zip(new_fastanis.iter()) {
calculated_fastanis_lock.lock().unwrap().insert((i,**ref_i), *ani_opt);
Expand Down Expand Up @@ -457,6 +466,7 @@ mod tests {
kmer_length: 21
},
95.0,
0.2,
);
for cluster in clusters.iter_mut() { cluster.sort_unstable(); }
assert_eq!(
Expand All @@ -480,6 +490,7 @@ mod tests {
kmer_length: 21
},
98.0,
0.2,
);
for cluster in clusters.iter_mut() { cluster.sort_unstable(); }
assert_eq!(
Expand All @@ -503,6 +514,7 @@ mod tests {
kmer_length: 21
},
98.0,
0.2,
);
for cluster in clusters.iter_mut() { cluster.sort_unstable(); }
assert_eq!(
Expand Down
11 changes: 10 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,12 @@ fn main(){
.expect("Programming error: rayon initialised multiple times");

let ani = galah::cluster_argument_parsing::parse_percentage(&m, "ani");
let min_aligned_fraction = galah::cluster_argument_parsing::parse_percentage(&m, "min-aligned-fraction");

galah::cluster_validation::validate_clusters(m.value_of("cluster-file").unwrap(), ani.unwrap().unwrap());
galah::cluster_validation::validate_clusters(
m.value_of("cluster-file").unwrap(),
ani.unwrap().unwrap(),
min_aligned_fraction.unwrap().unwrap());

},
_ => panic!("Programming error")
Expand Down Expand Up @@ -62,6 +66,11 @@ fn build_cli() -> App<'static, 'static> {
.default_value("99")
.help("ANI to validate against")
.takes_value(true))
.arg(Arg::with_name("min-aligned-fraction")
.long("min-aligned-fraction")
.help("Min aligned fraction of two genomes for clustering")
.takes_value(true)
.default_value("50"))
.arg(Arg::with_name("threads")
.short("t")
.long("threads")
Expand Down
2 changes: 2 additions & 0 deletions tests/data/set2/1mbp.fna

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions tests/data/set2/1mbp.half_aligned.fna

Large diffs are not rendered by default.

41 changes: 41 additions & 0 deletions tests/test_cmdline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -162,4 +162,45 @@ mod tests {
tests/data/set1_name_clash/500kb.fna\n")
.unwrap();
}


#[test]
fn test_min_aligned_fraction(){
Assert::main_binary()
.with_args(&[
"cluster",
"--genome-fasta-files",
"tests/data/set2/1mbp.fna",
"tests/data/set2/1mbp.half_aligned.fna",
"--min-aligned-fraction",
"0.2",
"--precluster-method", // only needed temporarily
"finch",
"--output-representative-list",
"/dev/stdout"])
.succeeds()
.stdout()
.is("
tests/data/set2/1mbp.fna\n")
.unwrap();

Assert::main_binary()
.with_args(&[
"cluster",
"--genome-fasta-files",
"tests/data/set2/1mbp.fna",
"tests/data/set2/1mbp.half_aligned.fna",
"--min-aligned-fraction",
"0.6",
"--precluster-method", // only needed temporarily
"finch",
"--output-representative-list",
"/dev/stdout"])
.succeeds()
.stdout()
.is("
tests/data/set2/1mbp.fna\n\
tests/data/set2/1mbp.half_aligned.fna\n")
.unwrap();
}
}

0 comments on commit ce19df7

Please sign in to comment.