diff --git a/src/clean.rs b/src/clean.rs
new file mode 100644
index 0000000..136e81c
--- /dev/null
+++ b/src/clean.rs
@@ -0,0 +1,117 @@
+// This file contains the code for the autocycler combine subcommand.
+
+// Copyright 2024 Ryan Wick (rrwick@gmail.com)
+// https://github.com/rrwick/Autocycler
+
+// This file is part of Autocycler. Autocycler is free software: you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as published by the Free Software
+// Foundation, either version 3 of the License, or (at your option) any later version. Autocycler
+// is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+// Public License for more details. You should have received a copy of the GNU General Public
+// License along with Autocycler. If not, see .
+
+use std::collections::HashSet;
+use std::path::{Path, PathBuf};
+
+use crate::graph_simplification::merge_linear_paths;
+use crate::log::{section_header, explanation};
+use crate::misc::{check_if_file_exists, quit_with_error};
+use crate::unitig_graph::UnitigGraph;
+
+
+pub fn clean(in_gfa: PathBuf, out_gfa: PathBuf, remove: Option) {
+ check_settings(&in_gfa);
+ starting_message();
+ let remove_tigs = parse_remove_tigs(remove);
+ print_settings(&in_gfa, &out_gfa, &remove_tigs);
+ let mut graph = load_graph(&in_gfa);
+ clean_graph(&mut graph, &remove_tigs, &in_gfa);
+ merge_graph(&mut graph);
+ graph.save_gfa(&out_gfa, &vec![]).unwrap();
+ finished_message(&out_gfa);
+}
+
+
+fn check_settings(in_gfa: &Path) {
+ check_if_file_exists(in_gfa);
+}
+
+
+fn starting_message() {
+ section_header("Starting autocycler clean");
+ explanation("This command removes user-specified tigs from a combined Autocycler graph and \
+ then merges all linear paths to produce a clean output graph.");
+}
+
+
+fn print_settings(in_gfa: &Path, out_gfa: &Path, remove_tigs: &[u32]) {
+ eprintln!("Settings:");
+ eprintln!(" --in_gfa {}", in_gfa.display());
+ eprintln!(" --out_gfa {}", out_gfa.display());
+ if !remove_tigs.is_empty() {
+ eprintln!(" --remove {}", remove_tigs.iter().map(|c| c.to_string())
+ .collect::>() .join(","));
+ }
+ eprintln!();
+}
+
+
+fn finished_message(out_gfa: &Path) {
+ section_header("Finished!");
+ eprintln!("Cleaned graph: {}", out_gfa.display());
+ eprintln!();
+}
+
+
+fn clean_graph(graph: &mut UnitigGraph, remove_tigs: &[u32], in_gfa: &Path) {
+ section_header("Cleaning graph");
+ explanation("The user-specified tigs are now removed from the graph.");
+ check_tig_nums_are_valid(in_gfa, &graph, remove_tigs);
+ let remove_set: HashSet = HashSet::from_iter(remove_tigs.iter().cloned());
+ graph.remove_unitigs_by_number(remove_set);
+ graph.print_basic_graph_info();
+}
+
+
+fn merge_graph(graph: &mut UnitigGraph) {
+ section_header("Merging linear paths");
+ explanation("Linear paths in the graph are now merged.");
+ merge_linear_paths(graph, &vec![], false);
+ graph.print_basic_graph_info();
+ graph.renumber_unitigs();
+}
+
+
+fn load_graph(gfa: &Path) -> UnitigGraph {
+ section_header("Loading graph");
+ explanation("The unitig graph is now loaded into memory.");
+ let (graph, _) = UnitigGraph::from_gfa_file(gfa);
+ graph.print_basic_graph_info();
+ graph
+}
+
+
+fn check_tig_nums_are_valid(in_gfa: &Path, graph: &UnitigGraph, remove_tigs: &[u32]) {
+ let mut all_tig_numbers = HashSet::new();
+ for unitig in &graph.unitigs {
+ all_tig_numbers.insert(unitig.borrow().number);
+ }
+ for tig in remove_tigs {
+ if !all_tig_numbers.contains(tig) {
+ quit_with_error(&format!("{} does not contain tig {}", in_gfa.display(), tig));
+ }
+ }
+}
+
+
+fn parse_remove_tigs(remove: Option) -> Vec {
+ if remove.is_none() {
+ return Vec::new();
+ }
+ let mut remove_tigs: Vec<_> = remove.unwrap().split(',')
+ .map(|s| s.parse::().unwrap_or_else(|_| quit_with_error(
+ &format!("failed to parse '{}' as a node number", s)))).collect();
+ remove_tigs.sort();
+ remove_tigs
+}
diff --git a/src/cluster.rs b/src/cluster.rs
index c44c7bd..ebd0ca0 100644
--- a/src/cluster.rs
+++ b/src/cluster.rs
@@ -792,7 +792,7 @@ fn save_cluster_gfa(sequences: &[Sequence], cluster_num: u16, gfa_lines: &Vec) {
starting_message();
print_settings(&autocycler_dir, &in_gfas);
- // TODO: add an optional argument for reads, which will add depth values to the combined
- // assembly. Find unique k-mers in the combined assembly and then count the occurrences
- // of those k-mers in the reads.
+ // TODO: add an optional argument for reads. When used, the reads will be used to set unitig
+ // depths (instead of input assembly count). Find unique k-mers in the combined assembly
+ // and then count the occurrences of those k-mers in the reads.
// TODO: sort the input GFA files in order of decreasing size. They may already be in this
// order (from the clustering step), but not necessarily (e.g. due to small plasmid
@@ -106,11 +106,13 @@ fn combine_clusters(in_gfas: &Vec, combined_gfa: &Path, combined_fasta:
let unitig_seq = String::from_utf8_lossy(&unitig.forward_seq);
let circ = if unitig.is_isolated_and_circular() { " circular=true".to_string() }
else { "".to_string() };
+ let depth_tag = format!("\tDP:f:{:.2}", unitig.depth);
let mut colour_tag = unitig.colour_tag();
if colour_tag.is_empty() {
colour_tag = "\tCL:z:orangered".to_string();
}
- writeln!(gfa_file, "S\t{}\t{}{}", unitig_num, unitig_seq, colour_tag).unwrap();
+ writeln!(gfa_file, "S\t{}\t{}{}{}",
+ unitig_num, unitig_seq, depth_tag, colour_tag).unwrap();
writeln!(fasta_file, ">{} length={}{}", unitig_num, unitig.length(), circ).unwrap();
writeln!(fasta_file, "{}", unitig_seq).unwrap();
}
diff --git a/src/graph_simplification.rs b/src/graph_simplification.rs
index a8432c3..b6f9e3e 100644
--- a/src/graph_simplification.rs
+++ b/src/graph_simplification.rs
@@ -313,7 +313,7 @@ fn get_common_end_seq(unitigs: &[UnitigStrand]) -> Vec {
}
-pub fn merge_linear_paths(graph: &mut UnitigGraph, seqs: &Vec, depth: Option) {
+pub fn merge_linear_paths(graph: &mut UnitigGraph, seqs: &Vec, require_same_depth: bool) {
// This function looks for linear paths in the graph (where one Unitig leads only to another
// and vice versa) and merges them together when possible.
//
@@ -343,10 +343,12 @@ pub fn merge_linear_paths(graph: &mut UnitigGraph, seqs: &Vec, depth:
// Extend the path as far as possible.
loop {
let unitig = current_path.last().unwrap();
+ let current_depth = unitig.depth();
if cannot_merge_end(unitig.number(), unitig.strand, &fixed_starts, &fixed_ends) { break; }
let mut outputs = if unitig.strand { get_exclusive_outputs(&unitig.unitig) } else { get_exclusive_inputs(&unitig.unitig) };
if outputs.len() != 1 { break; }
let output = &mut outputs[0];
+ if require_same_depth && output.depth() != current_depth { break; }
if !unitig.strand { output.strand = !output.strand; }
let output_number = output.number();
if already_used.contains(&output_number) { break; }
@@ -364,7 +366,7 @@ pub fn merge_linear_paths(graph: &mut UnitigGraph, seqs: &Vec, depth:
let mut new_unitig_number: u32 = graph.max_unitig_number();
for path in merge_paths {
new_unitig_number += 1;
- merge_path(graph, &path, new_unitig_number, depth);
+ merge_path(graph, &path, new_unitig_number);
}
graph.delete_dangling_links();
graph.build_unitig_index();
@@ -408,7 +410,7 @@ fn has_single_exclusive_input(unitig_rc: &Rc>, unitig_strand: bo
}
-fn merge_path(graph: &mut UnitigGraph, path: &Vec, new_unitig_number: u32, depth: Option) {
+fn merge_path(graph: &mut UnitigGraph, path: &Vec, new_unitig_number: u32) {
let merged_seq = merge_unitig_seqs(path);
let first = &path[0];
let last = path.last().unwrap();
@@ -431,7 +433,8 @@ fn merge_path(graph: &mut UnitigGraph, path: &Vec, new_unitig_numb
number: new_unitig_number,
reverse_seq: reverse_complement(&merged_seq),
forward_seq: merged_seq,
- depth: if let Some(d) = depth { d } else { forward_positions.len() as f64 },
+ depth: if !forward_positions.is_empty() { forward_positions.len() as f64 }
+ else { weighted_mean_depth(path) },
forward_positions, reverse_positions,
forward_next, forward_prev, reverse_next, reverse_prev,
..Default::default()
@@ -499,6 +502,16 @@ fn merge_unitig_seqs(path: &Vec) -> Vec {
}
+fn weighted_mean_depth(path: &Vec) -> f64 {
+ let total_length = path.iter().map(|u| u.length()).sum::() as f64;
+ let mut depth_sum = 0.0;
+ for u in path {
+ depth_sum += u.depth() * u.length() as f64;
+ }
+ depth_sum / total_length
+}
+
+
#[cfg(test)]
mod tests {
use crate::test_gfa::*;
@@ -716,7 +729,7 @@ mod tests {
fn test_merge_linear_paths_1() {
let (mut graph, seqs) = UnitigGraph::from_gfa_lines(&get_test_gfa_3());
assert_eq!(graph.unitigs.len(), 7);
- merge_linear_paths(&mut graph, &seqs, None);
+ merge_linear_paths(&mut graph, &seqs, false);
assert_eq!(graph.unitigs.len(), 3);
assert_eq!(std::str::from_utf8(&graph.unitig_index.get(&8).unwrap().borrow().forward_seq).unwrap(),
"TTCGCTGCGCTCGCTTCGCTTTTGCACAGCGACGACGGCATGCCTGAATCGCCTA");
@@ -740,7 +753,7 @@ mod tests {
fn test_merge_linear_paths_2() {
let (mut graph, seqs) = UnitigGraph::from_gfa_lines(&get_test_gfa_4());
assert_eq!(graph.unitigs.len(), 5);
- merge_linear_paths(&mut graph, &seqs, None);
+ merge_linear_paths(&mut graph, &seqs, false);
assert_eq!(graph.unitigs.len(), 2);
assert_eq!(std::str::from_utf8(&graph.unitig_index.get(&6).unwrap().borrow().forward_seq).unwrap(),
"ACGACTACGAGCACGAGTCGTCGTCGTAACTGACT");
@@ -759,7 +772,7 @@ mod tests {
fn test_merge_linear_paths_3() {
let (mut graph, seqs) = UnitigGraph::from_gfa_lines(&get_test_gfa_5());
assert_eq!(graph.unitigs.len(), 6);
- merge_linear_paths(&mut graph, &seqs, None);
+ merge_linear_paths(&mut graph, &seqs, false);
assert_eq!(graph.unitigs.len(), 5);
assert_eq!(std::str::from_utf8(&graph.unitig_index.get(&7).unwrap().borrow().forward_seq).unwrap(),
"AAATGCGACTGTG");
@@ -769,7 +782,7 @@ mod tests {
fn test_merge_linear_paths_4() {
let (mut graph, seqs) = UnitigGraph::from_gfa_lines(&get_test_gfa_14());
assert_eq!(graph.unitigs.len(), 13);
- merge_linear_paths(&mut graph, &seqs, None);
+ merge_linear_paths(&mut graph, &seqs, false);
assert_eq!(graph.unitigs.len(), 11);
}
}
diff --git a/src/main.rs b/src/main.rs
index 4f7601b..cc874c9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -15,6 +15,7 @@
use std::path::PathBuf;
use clap::{Parser, Subcommand, crate_version};
+mod clean;
mod cluster;
mod combine;
mod compress;
@@ -63,6 +64,21 @@ struct Cli {
#[derive(Subcommand)]
enum Commands {
+ /// Remove specified tigs and then merge linear paths
+ Clean {
+ /// Autocycler GFA file (required)
+ #[clap(short = 'i', long = "in_gfa", required = true)]
+ in_gfa: PathBuf,
+
+ /// Output GFA file (required)
+ #[clap(short = 'o', long = "out_gfa", required = true)]
+ out_gfa: PathBuf,
+
+ /// Tig numbers to remove from the input graph
+ #[clap(short = 'r', long = "remove")]
+ remove: Option,
+ },
+
/// cluster contigs in the unitig graph based on similarity
Cluster {
/// Autocycler directory containing input_assemblies.gfa file (required)
@@ -245,6 +261,9 @@ fn main() {
let cli = Cli::parse();
match cli.command {
+ Some(Commands::Clean { in_gfa, out_gfa, remove }) => {
+ clean::clean(in_gfa, out_gfa, remove);
+ },
Some(Commands::Cluster { autocycler_dir, cutoff, min_assemblies, max_contigs, manual }) => {
cluster::cluster(autocycler_dir, cutoff, min_assemblies, max_contigs, manual);
},
diff --git a/src/resolve.rs b/src/resolve.rs
index c26677c..fdcd54b 100644
--- a/src/resolve.rs
+++ b/src/resolve.rs
@@ -50,7 +50,7 @@ pub fn resolve(cluster_dir: PathBuf, verbose: bool) {
apply_unique_message();
apply_bridges(&mut unitig_graph, &bridges, bridge_depth);
unitig_graph.save_gfa(&bridged_gfa, &vec![]).unwrap();
- merge_after_bridging(&mut unitig_graph, bridge_depth);
+ merge_after_bridging(&mut unitig_graph);
unitig_graph.save_gfa(&merged_gfa, &vec![]).unwrap();
let cull_count = cull_ambiguity(&mut bridges, verbose);
@@ -58,7 +58,7 @@ pub fn resolve(cluster_dir: PathBuf, verbose: bool) {
(unitig_graph, _) = load_graph(&gfa_lines, false, Some(&anchors));
apply_final_message();
apply_bridges(&mut unitig_graph, &bridges, bridge_depth);
- merge_after_bridging(&mut unitig_graph, bridge_depth);
+ merge_after_bridging(&mut unitig_graph);
} else {
eprintln!("All bridges were unique, no culling necessary.\n");
}
@@ -237,8 +237,8 @@ fn apply_bridges(graph: &mut UnitigGraph, bridges: &Vec, bridge_depth: f
}
-fn merge_after_bridging(graph: &mut UnitigGraph, bridge_depth: f64) {
- merge_linear_paths(graph, &vec![], Some(bridge_depth));
+fn merge_after_bridging(graph: &mut UnitigGraph) {
+ merge_linear_paths(graph, &vec![], true);
graph.print_basic_graph_info();
graph.renumber_unitigs();
}
diff --git a/src/trim.rs b/src/trim.rs
index 3156d1c..01f4ed1 100644
--- a/src/trim.rs
+++ b/src/trim.rs
@@ -267,7 +267,7 @@ fn clean_up_graph(graph: &mut UnitigGraph, sequences: &Vec) {
has occurred above.");
graph.recalculate_depths();
graph.remove_zero_depth_unitigs();
- merge_linear_paths(graph, sequences, None);
+ merge_linear_paths(graph, sequences, false);
graph.print_basic_graph_info();
graph.renumber_unitigs();
}
diff --git a/src/unitig.rs b/src/unitig.rs
index aea2b22..6c493cb 100644
--- a/src/unitig.rs
+++ b/src/unitig.rs
@@ -338,6 +338,10 @@ impl UnitigStrand {
self.unitig.borrow().length()
}
+ pub fn depth(&self) -> f64 {
+ self.unitig.borrow().depth
+ }
+
pub fn get_seq(&self) -> Vec {
self.unitig.borrow().get_seq(self.strand)
}
diff --git a/src/unitig_graph.rs b/src/unitig_graph.rs
index 0f36adf..5eb8f18 100644
--- a/src/unitig_graph.rs
+++ b/src/unitig_graph.rs
@@ -86,8 +86,6 @@ impl UnitigGraph {
}
}
}
- quit_with_error("could not find a valid k-mer tag (e.g. KM:i:51) in the GFA header line.\n\
- Are you sure this is an Autocycler-generated GFA file?");
}
fn build_links_from_gfa(&mut self, link_lines: &[&str]) {