From 39f4c2b28092df3b10d8b748d0efe7d6f2f660de Mon Sep 17 00:00:00 2001 From: Ryan Wick Date: Wed, 8 Jan 2025 14:13:39 +1100 Subject: [PATCH] Change merge logic and add clean command --- src/clean.rs | 117 ++++++++++++++++++++++++++++++++++++ src/cluster.rs | 2 +- src/combine.rs | 10 +-- src/graph_simplification.rs | 29 ++++++--- src/main.rs | 19 ++++++ src/resolve.rs | 8 +-- src/trim.rs | 2 +- src/unitig.rs | 4 ++ src/unitig_graph.rs | 2 - 9 files changed, 173 insertions(+), 20 deletions(-) create mode 100644 src/clean.rs diff --git a/src/clean.rs b/src/clean.rs new file mode 100644 index 0000000..136e81c --- /dev/null +++ b/src/clean.rs @@ -0,0 +1,117 @@ +// This file contains the code for the autocycler combine subcommand. + +// Copyright 2024 Ryan Wick (rrwick@gmail.com) +// https://github.com/rrwick/Autocycler + +// This file is part of Autocycler. Autocycler is free software: you can redistribute it and/or +// modify it under the terms of the GNU General Public License as published by the Free Software +// Foundation, either version 3 of the License, or (at your option) any later version. Autocycler +// is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +// Public License for more details. You should have received a copy of the GNU General Public +// License along with Autocycler. If not, see . + +use std::collections::HashSet; +use std::path::{Path, PathBuf}; + +use crate::graph_simplification::merge_linear_paths; +use crate::log::{section_header, explanation}; +use crate::misc::{check_if_file_exists, quit_with_error}; +use crate::unitig_graph::UnitigGraph; + + +pub fn clean(in_gfa: PathBuf, out_gfa: PathBuf, remove: Option) { + check_settings(&in_gfa); + starting_message(); + let remove_tigs = parse_remove_tigs(remove); + print_settings(&in_gfa, &out_gfa, &remove_tigs); + let mut graph = load_graph(&in_gfa); + clean_graph(&mut graph, &remove_tigs, &in_gfa); + merge_graph(&mut graph); + graph.save_gfa(&out_gfa, &vec![]).unwrap(); + finished_message(&out_gfa); +} + + +fn check_settings(in_gfa: &Path) { + check_if_file_exists(in_gfa); +} + + +fn starting_message() { + section_header("Starting autocycler clean"); + explanation("This command removes user-specified tigs from a combined Autocycler graph and \ + then merges all linear paths to produce a clean output graph."); +} + + +fn print_settings(in_gfa: &Path, out_gfa: &Path, remove_tigs: &[u32]) { + eprintln!("Settings:"); + eprintln!(" --in_gfa {}", in_gfa.display()); + eprintln!(" --out_gfa {}", out_gfa.display()); + if !remove_tigs.is_empty() { + eprintln!(" --remove {}", remove_tigs.iter().map(|c| c.to_string()) + .collect::>() .join(",")); + } + eprintln!(); +} + + +fn finished_message(out_gfa: &Path) { + section_header("Finished!"); + eprintln!("Cleaned graph: {}", out_gfa.display()); + eprintln!(); +} + + +fn clean_graph(graph: &mut UnitigGraph, remove_tigs: &[u32], in_gfa: &Path) { + section_header("Cleaning graph"); + explanation("The user-specified tigs are now removed from the graph."); + check_tig_nums_are_valid(in_gfa, &graph, remove_tigs); + let remove_set: HashSet = HashSet::from_iter(remove_tigs.iter().cloned()); + graph.remove_unitigs_by_number(remove_set); + graph.print_basic_graph_info(); +} + + +fn merge_graph(graph: &mut UnitigGraph) { + section_header("Merging linear paths"); + explanation("Linear paths in the graph are now merged."); + merge_linear_paths(graph, &vec![], false); + graph.print_basic_graph_info(); + graph.renumber_unitigs(); +} + + +fn load_graph(gfa: &Path) -> UnitigGraph { + section_header("Loading graph"); + explanation("The unitig graph is now loaded into memory."); + let (graph, _) = UnitigGraph::from_gfa_file(gfa); + graph.print_basic_graph_info(); + graph +} + + +fn check_tig_nums_are_valid(in_gfa: &Path, graph: &UnitigGraph, remove_tigs: &[u32]) { + let mut all_tig_numbers = HashSet::new(); + for unitig in &graph.unitigs { + all_tig_numbers.insert(unitig.borrow().number); + } + for tig in remove_tigs { + if !all_tig_numbers.contains(tig) { + quit_with_error(&format!("{} does not contain tig {}", in_gfa.display(), tig)); + } + } +} + + +fn parse_remove_tigs(remove: Option) -> Vec { + if remove.is_none() { + return Vec::new(); + } + let mut remove_tigs: Vec<_> = remove.unwrap().split(',') + .map(|s| s.parse::().unwrap_or_else(|_| quit_with_error( + &format!("failed to parse '{}' as a node number", s)))).collect(); + remove_tigs.sort(); + remove_tigs +} diff --git a/src/cluster.rs b/src/cluster.rs index c44c7bd..ebd0ca0 100644 --- a/src/cluster.rs +++ b/src/cluster.rs @@ -792,7 +792,7 @@ fn save_cluster_gfa(sequences: &[Sequence], cluster_num: u16, gfa_lines: &Vec) { starting_message(); print_settings(&autocycler_dir, &in_gfas); - // TODO: add an optional argument for reads, which will add depth values to the combined - // assembly. Find unique k-mers in the combined assembly and then count the occurrences - // of those k-mers in the reads. + // TODO: add an optional argument for reads. When used, the reads will be used to set unitig + // depths (instead of input assembly count). Find unique k-mers in the combined assembly + // and then count the occurrences of those k-mers in the reads. // TODO: sort the input GFA files in order of decreasing size. They may already be in this // order (from the clustering step), but not necessarily (e.g. due to small plasmid @@ -106,11 +106,13 @@ fn combine_clusters(in_gfas: &Vec, combined_gfa: &Path, combined_fasta: let unitig_seq = String::from_utf8_lossy(&unitig.forward_seq); let circ = if unitig.is_isolated_and_circular() { " circular=true".to_string() } else { "".to_string() }; + let depth_tag = format!("\tDP:f:{:.2}", unitig.depth); let mut colour_tag = unitig.colour_tag(); if colour_tag.is_empty() { colour_tag = "\tCL:z:orangered".to_string(); } - writeln!(gfa_file, "S\t{}\t{}{}", unitig_num, unitig_seq, colour_tag).unwrap(); + writeln!(gfa_file, "S\t{}\t{}{}{}", + unitig_num, unitig_seq, depth_tag, colour_tag).unwrap(); writeln!(fasta_file, ">{} length={}{}", unitig_num, unitig.length(), circ).unwrap(); writeln!(fasta_file, "{}", unitig_seq).unwrap(); } diff --git a/src/graph_simplification.rs b/src/graph_simplification.rs index a8432c3..b6f9e3e 100644 --- a/src/graph_simplification.rs +++ b/src/graph_simplification.rs @@ -313,7 +313,7 @@ fn get_common_end_seq(unitigs: &[UnitigStrand]) -> Vec { } -pub fn merge_linear_paths(graph: &mut UnitigGraph, seqs: &Vec, depth: Option) { +pub fn merge_linear_paths(graph: &mut UnitigGraph, seqs: &Vec, require_same_depth: bool) { // This function looks for linear paths in the graph (where one Unitig leads only to another // and vice versa) and merges them together when possible. // @@ -343,10 +343,12 @@ pub fn merge_linear_paths(graph: &mut UnitigGraph, seqs: &Vec, depth: // Extend the path as far as possible. loop { let unitig = current_path.last().unwrap(); + let current_depth = unitig.depth(); if cannot_merge_end(unitig.number(), unitig.strand, &fixed_starts, &fixed_ends) { break; } let mut outputs = if unitig.strand { get_exclusive_outputs(&unitig.unitig) } else { get_exclusive_inputs(&unitig.unitig) }; if outputs.len() != 1 { break; } let output = &mut outputs[0]; + if require_same_depth && output.depth() != current_depth { break; } if !unitig.strand { output.strand = !output.strand; } let output_number = output.number(); if already_used.contains(&output_number) { break; } @@ -364,7 +366,7 @@ pub fn merge_linear_paths(graph: &mut UnitigGraph, seqs: &Vec, depth: let mut new_unitig_number: u32 = graph.max_unitig_number(); for path in merge_paths { new_unitig_number += 1; - merge_path(graph, &path, new_unitig_number, depth); + merge_path(graph, &path, new_unitig_number); } graph.delete_dangling_links(); graph.build_unitig_index(); @@ -408,7 +410,7 @@ fn has_single_exclusive_input(unitig_rc: &Rc>, unitig_strand: bo } -fn merge_path(graph: &mut UnitigGraph, path: &Vec, new_unitig_number: u32, depth: Option) { +fn merge_path(graph: &mut UnitigGraph, path: &Vec, new_unitig_number: u32) { let merged_seq = merge_unitig_seqs(path); let first = &path[0]; let last = path.last().unwrap(); @@ -431,7 +433,8 @@ fn merge_path(graph: &mut UnitigGraph, path: &Vec, new_unitig_numb number: new_unitig_number, reverse_seq: reverse_complement(&merged_seq), forward_seq: merged_seq, - depth: if let Some(d) = depth { d } else { forward_positions.len() as f64 }, + depth: if !forward_positions.is_empty() { forward_positions.len() as f64 } + else { weighted_mean_depth(path) }, forward_positions, reverse_positions, forward_next, forward_prev, reverse_next, reverse_prev, ..Default::default() @@ -499,6 +502,16 @@ fn merge_unitig_seqs(path: &Vec) -> Vec { } +fn weighted_mean_depth(path: &Vec) -> f64 { + let total_length = path.iter().map(|u| u.length()).sum::() as f64; + let mut depth_sum = 0.0; + for u in path { + depth_sum += u.depth() * u.length() as f64; + } + depth_sum / total_length +} + + #[cfg(test)] mod tests { use crate::test_gfa::*; @@ -716,7 +729,7 @@ mod tests { fn test_merge_linear_paths_1() { let (mut graph, seqs) = UnitigGraph::from_gfa_lines(&get_test_gfa_3()); assert_eq!(graph.unitigs.len(), 7); - merge_linear_paths(&mut graph, &seqs, None); + merge_linear_paths(&mut graph, &seqs, false); assert_eq!(graph.unitigs.len(), 3); assert_eq!(std::str::from_utf8(&graph.unitig_index.get(&8).unwrap().borrow().forward_seq).unwrap(), "TTCGCTGCGCTCGCTTCGCTTTTGCACAGCGACGACGGCATGCCTGAATCGCCTA"); @@ -740,7 +753,7 @@ mod tests { fn test_merge_linear_paths_2() { let (mut graph, seqs) = UnitigGraph::from_gfa_lines(&get_test_gfa_4()); assert_eq!(graph.unitigs.len(), 5); - merge_linear_paths(&mut graph, &seqs, None); + merge_linear_paths(&mut graph, &seqs, false); assert_eq!(graph.unitigs.len(), 2); assert_eq!(std::str::from_utf8(&graph.unitig_index.get(&6).unwrap().borrow().forward_seq).unwrap(), "ACGACTACGAGCACGAGTCGTCGTCGTAACTGACT"); @@ -759,7 +772,7 @@ mod tests { fn test_merge_linear_paths_3() { let (mut graph, seqs) = UnitigGraph::from_gfa_lines(&get_test_gfa_5()); assert_eq!(graph.unitigs.len(), 6); - merge_linear_paths(&mut graph, &seqs, None); + merge_linear_paths(&mut graph, &seqs, false); assert_eq!(graph.unitigs.len(), 5); assert_eq!(std::str::from_utf8(&graph.unitig_index.get(&7).unwrap().borrow().forward_seq).unwrap(), "AAATGCGACTGTG"); @@ -769,7 +782,7 @@ mod tests { fn test_merge_linear_paths_4() { let (mut graph, seqs) = UnitigGraph::from_gfa_lines(&get_test_gfa_14()); assert_eq!(graph.unitigs.len(), 13); - merge_linear_paths(&mut graph, &seqs, None); + merge_linear_paths(&mut graph, &seqs, false); assert_eq!(graph.unitigs.len(), 11); } } diff --git a/src/main.rs b/src/main.rs index 4f7601b..cc874c9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,6 +15,7 @@ use std::path::PathBuf; use clap::{Parser, Subcommand, crate_version}; +mod clean; mod cluster; mod combine; mod compress; @@ -63,6 +64,21 @@ struct Cli { #[derive(Subcommand)] enum Commands { + /// Remove specified tigs and then merge linear paths + Clean { + /// Autocycler GFA file (required) + #[clap(short = 'i', long = "in_gfa", required = true)] + in_gfa: PathBuf, + + /// Output GFA file (required) + #[clap(short = 'o', long = "out_gfa", required = true)] + out_gfa: PathBuf, + + /// Tig numbers to remove from the input graph + #[clap(short = 'r', long = "remove")] + remove: Option, + }, + /// cluster contigs in the unitig graph based on similarity Cluster { /// Autocycler directory containing input_assemblies.gfa file (required) @@ -245,6 +261,9 @@ fn main() { let cli = Cli::parse(); match cli.command { + Some(Commands::Clean { in_gfa, out_gfa, remove }) => { + clean::clean(in_gfa, out_gfa, remove); + }, Some(Commands::Cluster { autocycler_dir, cutoff, min_assemblies, max_contigs, manual }) => { cluster::cluster(autocycler_dir, cutoff, min_assemblies, max_contigs, manual); }, diff --git a/src/resolve.rs b/src/resolve.rs index c26677c..fdcd54b 100644 --- a/src/resolve.rs +++ b/src/resolve.rs @@ -50,7 +50,7 @@ pub fn resolve(cluster_dir: PathBuf, verbose: bool) { apply_unique_message(); apply_bridges(&mut unitig_graph, &bridges, bridge_depth); unitig_graph.save_gfa(&bridged_gfa, &vec![]).unwrap(); - merge_after_bridging(&mut unitig_graph, bridge_depth); + merge_after_bridging(&mut unitig_graph); unitig_graph.save_gfa(&merged_gfa, &vec![]).unwrap(); let cull_count = cull_ambiguity(&mut bridges, verbose); @@ -58,7 +58,7 @@ pub fn resolve(cluster_dir: PathBuf, verbose: bool) { (unitig_graph, _) = load_graph(&gfa_lines, false, Some(&anchors)); apply_final_message(); apply_bridges(&mut unitig_graph, &bridges, bridge_depth); - merge_after_bridging(&mut unitig_graph, bridge_depth); + merge_after_bridging(&mut unitig_graph); } else { eprintln!("All bridges were unique, no culling necessary.\n"); } @@ -237,8 +237,8 @@ fn apply_bridges(graph: &mut UnitigGraph, bridges: &Vec, bridge_depth: f } -fn merge_after_bridging(graph: &mut UnitigGraph, bridge_depth: f64) { - merge_linear_paths(graph, &vec![], Some(bridge_depth)); +fn merge_after_bridging(graph: &mut UnitigGraph) { + merge_linear_paths(graph, &vec![], true); graph.print_basic_graph_info(); graph.renumber_unitigs(); } diff --git a/src/trim.rs b/src/trim.rs index 3156d1c..01f4ed1 100644 --- a/src/trim.rs +++ b/src/trim.rs @@ -267,7 +267,7 @@ fn clean_up_graph(graph: &mut UnitigGraph, sequences: &Vec) { has occurred above."); graph.recalculate_depths(); graph.remove_zero_depth_unitigs(); - merge_linear_paths(graph, sequences, None); + merge_linear_paths(graph, sequences, false); graph.print_basic_graph_info(); graph.renumber_unitigs(); } diff --git a/src/unitig.rs b/src/unitig.rs index aea2b22..6c493cb 100644 --- a/src/unitig.rs +++ b/src/unitig.rs @@ -338,6 +338,10 @@ impl UnitigStrand { self.unitig.borrow().length() } + pub fn depth(&self) -> f64 { + self.unitig.borrow().depth + } + pub fn get_seq(&self) -> Vec { self.unitig.borrow().get_seq(self.strand) } diff --git a/src/unitig_graph.rs b/src/unitig_graph.rs index 0f36adf..5eb8f18 100644 --- a/src/unitig_graph.rs +++ b/src/unitig_graph.rs @@ -86,8 +86,6 @@ impl UnitigGraph { } } } - quit_with_error("could not find a valid k-mer tag (e.g. KM:i:51) in the GFA header line.\n\ - Are you sure this is an Autocycler-generated GFA file?"); } fn build_links_from_gfa(&mut self, link_lines: &[&str]) {