Skip to content

Commit

Permalink
Change merge logic and add clean command
Browse files Browse the repository at this point in the history
  • Loading branch information
rrwick committed Jan 8, 2025
1 parent d5d1d5b commit 39f4c2b
Show file tree
Hide file tree
Showing 9 changed files with 173 additions and 20 deletions.
117 changes: 117 additions & 0 deletions src/clean.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
// This file contains the code for the autocycler combine subcommand.

// Copyright 2024 Ryan Wick ([email protected])
// https://github.com/rrwick/Autocycler

// This file is part of Autocycler. Autocycler is free software: you can redistribute it and/or
// modify it under the terms of the GNU General Public License as published by the Free Software
// Foundation, either version 3 of the License, or (at your option) any later version. Autocycler
// is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
// Public License for more details. You should have received a copy of the GNU General Public
// License along with Autocycler. If not, see <http://www.gnu.org/licenses/>.

use std::collections::HashSet;
use std::path::{Path, PathBuf};

use crate::graph_simplification::merge_linear_paths;
use crate::log::{section_header, explanation};
use crate::misc::{check_if_file_exists, quit_with_error};
use crate::unitig_graph::UnitigGraph;


pub fn clean(in_gfa: PathBuf, out_gfa: PathBuf, remove: Option<String>) {
check_settings(&in_gfa);
starting_message();
let remove_tigs = parse_remove_tigs(remove);
print_settings(&in_gfa, &out_gfa, &remove_tigs);
let mut graph = load_graph(&in_gfa);
clean_graph(&mut graph, &remove_tigs, &in_gfa);
merge_graph(&mut graph);
graph.save_gfa(&out_gfa, &vec![]).unwrap();
finished_message(&out_gfa);
}


fn check_settings(in_gfa: &Path) {
check_if_file_exists(in_gfa);
}


fn starting_message() {
section_header("Starting autocycler clean");
explanation("This command removes user-specified tigs from a combined Autocycler graph and \
then merges all linear paths to produce a clean output graph.");
}


fn print_settings(in_gfa: &Path, out_gfa: &Path, remove_tigs: &[u32]) {
eprintln!("Settings:");
eprintln!(" --in_gfa {}", in_gfa.display());
eprintln!(" --out_gfa {}", out_gfa.display());
if !remove_tigs.is_empty() {
eprintln!(" --remove {}", remove_tigs.iter().map(|c| c.to_string())
.collect::<Vec<String>>() .join(","));
}
eprintln!();
}


fn finished_message(out_gfa: &Path) {
section_header("Finished!");
eprintln!("Cleaned graph: {}", out_gfa.display());
eprintln!();
}


fn clean_graph(graph: &mut UnitigGraph, remove_tigs: &[u32], in_gfa: &Path) {
section_header("Cleaning graph");
explanation("The user-specified tigs are now removed from the graph.");
check_tig_nums_are_valid(in_gfa, &graph, remove_tigs);
let remove_set: HashSet<u32> = HashSet::from_iter(remove_tigs.iter().cloned());
graph.remove_unitigs_by_number(remove_set);
graph.print_basic_graph_info();
}


fn merge_graph(graph: &mut UnitigGraph) {
section_header("Merging linear paths");
explanation("Linear paths in the graph are now merged.");
merge_linear_paths(graph, &vec![], false);
graph.print_basic_graph_info();
graph.renumber_unitigs();
}


fn load_graph(gfa: &Path) -> UnitigGraph {
section_header("Loading graph");
explanation("The unitig graph is now loaded into memory.");
let (graph, _) = UnitigGraph::from_gfa_file(gfa);
graph.print_basic_graph_info();
graph
}


fn check_tig_nums_are_valid(in_gfa: &Path, graph: &UnitigGraph, remove_tigs: &[u32]) {
let mut all_tig_numbers = HashSet::new();
for unitig in &graph.unitigs {
all_tig_numbers.insert(unitig.borrow().number);
}
for tig in remove_tigs {
if !all_tig_numbers.contains(tig) {
quit_with_error(&format!("{} does not contain tig {}", in_gfa.display(), tig));
}
}
}


fn parse_remove_tigs(remove: Option<String>) -> Vec<u32> {
if remove.is_none() {
return Vec::new();
}
let mut remove_tigs: Vec<_> = remove.unwrap().split(',')
.map(|s| s.parse::<u32>().unwrap_or_else(|_| quit_with_error(
&format!("failed to parse '{}' as a node number", s)))).collect();
remove_tigs.sort();
remove_tigs
}
2 changes: 1 addition & 1 deletion src/cluster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -792,7 +792,7 @@ fn save_cluster_gfa(sequences: &[Sequence], cluster_num: u16, gfa_lines: &Vec<St
cluster_graph.remove_sequence_from_graph(id);
}
cluster_graph.remove_zero_depth_unitigs();
merge_linear_paths(&mut cluster_graph, &cluster_seqs, None);
merge_linear_paths(&mut cluster_graph, &cluster_seqs, false);
cluster_graph.save_gfa(&out_gfa, &cluster_seqs).unwrap();
}

Expand Down
10 changes: 6 additions & 4 deletions src/combine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ pub fn combine(autocycler_dir: PathBuf, in_gfas: Vec<PathBuf>) {
starting_message();
print_settings(&autocycler_dir, &in_gfas);

// TODO: add an optional argument for reads, which will add depth values to the combined
// assembly. Find unique k-mers in the combined assembly and then count the occurrences
// of those k-mers in the reads.
// TODO: add an optional argument for reads. When used, the reads will be used to set unitig
// depths (instead of input assembly count). Find unique k-mers in the combined assembly
// and then count the occurrences of those k-mers in the reads.

// TODO: sort the input GFA files in order of decreasing size. They may already be in this
// order (from the clustering step), but not necessarily (e.g. due to small plasmid
Expand Down Expand Up @@ -106,11 +106,13 @@ fn combine_clusters(in_gfas: &Vec<PathBuf>, combined_gfa: &Path, combined_fasta:
let unitig_seq = String::from_utf8_lossy(&unitig.forward_seq);
let circ = if unitig.is_isolated_and_circular() { " circular=true".to_string() }
else { "".to_string() };
let depth_tag = format!("\tDP:f:{:.2}", unitig.depth);
let mut colour_tag = unitig.colour_tag();
if colour_tag.is_empty() {
colour_tag = "\tCL:z:orangered".to_string();
}
writeln!(gfa_file, "S\t{}\t{}{}", unitig_num, unitig_seq, colour_tag).unwrap();
writeln!(gfa_file, "S\t{}\t{}{}{}",
unitig_num, unitig_seq, depth_tag, colour_tag).unwrap();
writeln!(fasta_file, ">{} length={}{}", unitig_num, unitig.length(), circ).unwrap();
writeln!(fasta_file, "{}", unitig_seq).unwrap();
}
Expand Down
29 changes: 21 additions & 8 deletions src/graph_simplification.rs
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ fn get_common_end_seq(unitigs: &[UnitigStrand]) -> Vec<u8> {
}


pub fn merge_linear_paths(graph: &mut UnitigGraph, seqs: &Vec<Sequence>, depth: Option<f64>) {
pub fn merge_linear_paths(graph: &mut UnitigGraph, seqs: &Vec<Sequence>, require_same_depth: bool) {
// This function looks for linear paths in the graph (where one Unitig leads only to another
// and vice versa) and merges them together when possible.
//
Expand Down Expand Up @@ -343,10 +343,12 @@ pub fn merge_linear_paths(graph: &mut UnitigGraph, seqs: &Vec<Sequence>, depth:
// Extend the path as far as possible.
loop {
let unitig = current_path.last().unwrap();
let current_depth = unitig.depth();
if cannot_merge_end(unitig.number(), unitig.strand, &fixed_starts, &fixed_ends) { break; }
let mut outputs = if unitig.strand { get_exclusive_outputs(&unitig.unitig) } else { get_exclusive_inputs(&unitig.unitig) };
if outputs.len() != 1 { break; }
let output = &mut outputs[0];
if require_same_depth && output.depth() != current_depth { break; }
if !unitig.strand { output.strand = !output.strand; }
let output_number = output.number();
if already_used.contains(&output_number) { break; }
Expand All @@ -364,7 +366,7 @@ pub fn merge_linear_paths(graph: &mut UnitigGraph, seqs: &Vec<Sequence>, depth:
let mut new_unitig_number: u32 = graph.max_unitig_number();
for path in merge_paths {
new_unitig_number += 1;
merge_path(graph, &path, new_unitig_number, depth);
merge_path(graph, &path, new_unitig_number);
}
graph.delete_dangling_links();
graph.build_unitig_index();
Expand Down Expand Up @@ -408,7 +410,7 @@ fn has_single_exclusive_input(unitig_rc: &Rc<RefCell<Unitig>>, unitig_strand: bo
}


fn merge_path(graph: &mut UnitigGraph, path: &Vec<UnitigStrand>, new_unitig_number: u32, depth: Option<f64>) {
fn merge_path(graph: &mut UnitigGraph, path: &Vec<UnitigStrand>, new_unitig_number: u32) {
let merged_seq = merge_unitig_seqs(path);
let first = &path[0];
let last = path.last().unwrap();
Expand All @@ -431,7 +433,8 @@ fn merge_path(graph: &mut UnitigGraph, path: &Vec<UnitigStrand>, new_unitig_numb
number: new_unitig_number,
reverse_seq: reverse_complement(&merged_seq),
forward_seq: merged_seq,
depth: if let Some(d) = depth { d } else { forward_positions.len() as f64 },
depth: if !forward_positions.is_empty() { forward_positions.len() as f64 }
else { weighted_mean_depth(path) },
forward_positions, reverse_positions,
forward_next, forward_prev, reverse_next, reverse_prev,
..Default::default()
Expand Down Expand Up @@ -499,6 +502,16 @@ fn merge_unitig_seqs(path: &Vec<UnitigStrand>) -> Vec<u8> {
}


fn weighted_mean_depth(path: &Vec<UnitigStrand>) -> f64 {
let total_length = path.iter().map(|u| u.length()).sum::<u32>() as f64;
let mut depth_sum = 0.0;
for u in path {
depth_sum += u.depth() * u.length() as f64;
}
depth_sum / total_length
}


#[cfg(test)]
mod tests {
use crate::test_gfa::*;
Expand Down Expand Up @@ -716,7 +729,7 @@ mod tests {
fn test_merge_linear_paths_1() {
let (mut graph, seqs) = UnitigGraph::from_gfa_lines(&get_test_gfa_3());
assert_eq!(graph.unitigs.len(), 7);
merge_linear_paths(&mut graph, &seqs, None);
merge_linear_paths(&mut graph, &seqs, false);
assert_eq!(graph.unitigs.len(), 3);
assert_eq!(std::str::from_utf8(&graph.unitig_index.get(&8).unwrap().borrow().forward_seq).unwrap(),
"TTCGCTGCGCTCGCTTCGCTTTTGCACAGCGACGACGGCATGCCTGAATCGCCTA");
Expand All @@ -740,7 +753,7 @@ mod tests {
fn test_merge_linear_paths_2() {
let (mut graph, seqs) = UnitigGraph::from_gfa_lines(&get_test_gfa_4());
assert_eq!(graph.unitigs.len(), 5);
merge_linear_paths(&mut graph, &seqs, None);
merge_linear_paths(&mut graph, &seqs, false);
assert_eq!(graph.unitigs.len(), 2);
assert_eq!(std::str::from_utf8(&graph.unitig_index.get(&6).unwrap().borrow().forward_seq).unwrap(),
"ACGACTACGAGCACGAGTCGTCGTCGTAACTGACT");
Expand All @@ -759,7 +772,7 @@ mod tests {
fn test_merge_linear_paths_3() {
let (mut graph, seqs) = UnitigGraph::from_gfa_lines(&get_test_gfa_5());
assert_eq!(graph.unitigs.len(), 6);
merge_linear_paths(&mut graph, &seqs, None);
merge_linear_paths(&mut graph, &seqs, false);
assert_eq!(graph.unitigs.len(), 5);
assert_eq!(std::str::from_utf8(&graph.unitig_index.get(&7).unwrap().borrow().forward_seq).unwrap(),
"AAATGCGACTGTG");
Expand All @@ -769,7 +782,7 @@ mod tests {
fn test_merge_linear_paths_4() {
let (mut graph, seqs) = UnitigGraph::from_gfa_lines(&get_test_gfa_14());
assert_eq!(graph.unitigs.len(), 13);
merge_linear_paths(&mut graph, &seqs, None);
merge_linear_paths(&mut graph, &seqs, false);
assert_eq!(graph.unitigs.len(), 11);
}
}
19 changes: 19 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
use std::path::PathBuf;
use clap::{Parser, Subcommand, crate_version};

mod clean;
mod cluster;
mod combine;
mod compress;
Expand Down Expand Up @@ -63,6 +64,21 @@ struct Cli {
#[derive(Subcommand)]
enum Commands {

/// Remove specified tigs and then merge linear paths
Clean {
/// Autocycler GFA file (required)
#[clap(short = 'i', long = "in_gfa", required = true)]
in_gfa: PathBuf,

/// Output GFA file (required)
#[clap(short = 'o', long = "out_gfa", required = true)]
out_gfa: PathBuf,

/// Tig numbers to remove from the input graph
#[clap(short = 'r', long = "remove")]
remove: Option<String>,
},

/// cluster contigs in the unitig graph based on similarity
Cluster {
/// Autocycler directory containing input_assemblies.gfa file (required)
Expand Down Expand Up @@ -245,6 +261,9 @@ fn main() {
let cli = Cli::parse();

match cli.command {
Some(Commands::Clean { in_gfa, out_gfa, remove }) => {
clean::clean(in_gfa, out_gfa, remove);
},
Some(Commands::Cluster { autocycler_dir, cutoff, min_assemblies, max_contigs, manual }) => {
cluster::cluster(autocycler_dir, cutoff, min_assemblies, max_contigs, manual);
},
Expand Down
8 changes: 4 additions & 4 deletions src/resolve.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,15 @@ pub fn resolve(cluster_dir: PathBuf, verbose: bool) {
apply_unique_message();
apply_bridges(&mut unitig_graph, &bridges, bridge_depth);
unitig_graph.save_gfa(&bridged_gfa, &vec![]).unwrap();
merge_after_bridging(&mut unitig_graph, bridge_depth);
merge_after_bridging(&mut unitig_graph);
unitig_graph.save_gfa(&merged_gfa, &vec![]).unwrap();

let cull_count = cull_ambiguity(&mut bridges, verbose);
if cull_count > 0 {
(unitig_graph, _) = load_graph(&gfa_lines, false, Some(&anchors));
apply_final_message();
apply_bridges(&mut unitig_graph, &bridges, bridge_depth);
merge_after_bridging(&mut unitig_graph, bridge_depth);
merge_after_bridging(&mut unitig_graph);
} else {
eprintln!("All bridges were unique, no culling necessary.\n");
}
Expand Down Expand Up @@ -237,8 +237,8 @@ fn apply_bridges(graph: &mut UnitigGraph, bridges: &Vec<Bridge>, bridge_depth: f
}


fn merge_after_bridging(graph: &mut UnitigGraph, bridge_depth: f64) {
merge_linear_paths(graph, &vec![], Some(bridge_depth));
fn merge_after_bridging(graph: &mut UnitigGraph) {
merge_linear_paths(graph, &vec![], true);
graph.print_basic_graph_info();
graph.renumber_unitigs();
}
Expand Down
2 changes: 1 addition & 1 deletion src/trim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ fn clean_up_graph(graph: &mut UnitigGraph, sequences: &Vec<Sequence>) {
has occurred above.");
graph.recalculate_depths();
graph.remove_zero_depth_unitigs();
merge_linear_paths(graph, sequences, None);
merge_linear_paths(graph, sequences, false);
graph.print_basic_graph_info();
graph.renumber_unitigs();
}
Expand Down
4 changes: 4 additions & 0 deletions src/unitig.rs
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,10 @@ impl UnitigStrand {
self.unitig.borrow().length()
}

pub fn depth(&self) -> f64 {
self.unitig.borrow().depth
}

pub fn get_seq(&self) -> Vec<u8> {
self.unitig.borrow().get_seq(self.strand)
}
Expand Down
2 changes: 0 additions & 2 deletions src/unitig_graph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,6 @@ impl UnitigGraph {
}
}
}
quit_with_error("could not find a valid k-mer tag (e.g. KM:i:51) in the GFA header line.\n\
Are you sure this is an Autocycler-generated GFA file?");
}

fn build_links_from_gfa(&mut self, link_lines: &[&str]) {
Expand Down

0 comments on commit 39f4c2b

Please sign in to comment.