diff --git a/Cargo.toml b/Cargo.toml index e371faa..9099709 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ name = "barkit" version = "0.1.0" # managed by release.sh edition = "2021" authors = ["Nikita Syzrantsev syzrantsev.n@yandex.ru"] -description = "Tool to process barcodes in FASTQ" +description = "BarKit — a cross-platform and ultrafast toolkit for barcodes manipulation in FASTQ files" license = "GPL-3.0" readme = "README.md" homepage = "https://github.com/nsyzrantsev/barkit" diff --git a/README.md b/README.md index 8b04636..01f7b98 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,37 @@ # BarKit -> [!WARNING] -> This tool is under development. Please use the first release version when it becomes available. - -BarKit (Barcodes Toolkit) is a toolkit designed for manipulating FASTQ barcodes. +BarKit (**Bar**codes Tool**Kit**) is a toolkit designed for manipulating FASTQ barcodes. ## Installation +### From crates.io + +Barkit can be installed from [`crates.io`](https://crates.io/crates/barkit) using `cargo`. This can be done with the following command: + ```bash cargo install barkit ``` -## Extract Command +### Build from source + +1. Clone the repository: + +```bash +git clone https://github.com/nsyzrantsev/barkit +cd barkit/ +``` + +2. Build: + +```bash +cargo build --release && sudo mv target/release/barkit /usr/local/bin/ +``` + +## Extract subcommand -The extract command is designed to parse barcode sequences from FASTQ reads using approximate regex matching based on a provided pattern. +The extract subcommand is designed to parse barcode sequences from FASTQ reads using approximate regex matching based on a provided pattern. -All parsed barcode sequences are moved to the read header with base quality separated by colons: +All parsed barcode sequences are moved to the read header with base quality, separated by colons: ``` @SEQ_ID UMI:ATGC:???? CB:ATGC:???? SB:ATGC:???? 
@@ -34,7 +50,7 @@ Parse the first twelve nucleotides as a UMI from each forward read: barkit extract -1 <IN_FASTQ1> -2 <IN_FASTQ2> -p "^(?P<UMI>[ATGCN]{12})" -o <OUT_FASTQ1> -O <OUT_FASTQ2> ``` -Parse the first sixteen nucleotides as a cell barcode from each reverse read before the `atgccat` sequence: +Parse the first sixteen nucleotides as a cell barcode from each reverse read before the `atgccat` adapter sequence: ```bash barkit extract -1 <IN_FASTQ1> -2 <IN_FASTQ2> -P "^(?P<CB>[ATGCN]{16})atgccat" -o <OUT_FASTQ1> -O <OUT_FASTQ2> diff --git a/barkit-extract/Cargo.toml b/barkit-extract/Cargo.toml index a76e73d..767ae38 100644 --- a/barkit-extract/Cargo.toml +++ b/barkit-extract/Cargo.toml @@ -3,7 +3,7 @@ name = "barkit-extract" version = "0.1.0" # managed by release.sh edition = "2021" authors = ["Nikita Syzrantsev syzrantsev.n@yandex.ru"] -description = "Tool to extract barcodes" +description = "Tool for extracting barcode nucleotide sequence according to a specified regex pattern" license = "GPL-3.0" readme = "../README.md" homepage = "https://github.com/nsyzrantsev/barkit" diff --git a/barkit-extract/src/error.rs b/barkit-extract/src/error.rs index 1f2da1c..3b8162d 100644 --- a/barkit-extract/src/error.rs +++ b/barkit-extract/src/error.rs @@ -12,10 +12,14 @@ pub enum Error { BarcodeCaptureGroupNotFound(String), #[error("Provided unexpected barcode capture group {0}")] UnexpectedCaptureGroupName(String), - #[error("Failed to read a file: {0}")] - FileRead(#[from] std::io::Error), + #[error("I/O error: {0}")] + IO(#[from] std::io::Error), #[error("No match")] PatternNotMatched, + #[error("Fancy regex error: {0}")] + FancyRegex(#[from] fancy_regex::Error), + #[error("Failed to choose permutation mask")] + PermutationMaskSize, } impl Clone for Error { @@ -30,8 +34,10 @@ impl Clone for Error { Error::UnexpectedCaptureGroupName(capture_group) => { Error::UnexpectedCaptureGroupName(capture_group.clone()) } - Error::FileRead(err) => Error::FileRead(err.kind().into()), + Error::IO(err) => Error::IO(err.kind().into()), Error::PatternNotMatched => Error::PatternNotMatched, + 
Error::FancyRegex(err) => Error::FancyRegex(err.clone()), + Error::PermutationMaskSize => Error::PermutationMaskSize, } } } diff --git a/barkit-extract/src/fastq.rs b/barkit-extract/src/fastq.rs index 3778398..f8a6cb8 100644 --- a/barkit-extract/src/fastq.rs +++ b/barkit-extract/src/fastq.rs @@ -1,8 +1,10 @@ +#![allow(clippy::result_large_err)] + use std::fs::{File, OpenOptions}; use std::io::{self, BufRead, BufReader, BufWriter, Read, Write}; use std::path::Path; use std::rc::Rc; -use std::sync::{Mutex, MutexGuard}; +use std::sync::Mutex; use flate2::{read::MultiGzDecoder, write::GzEncoder, Compression}; use gzp::{ @@ -10,9 +12,9 @@ use gzp::{ par::decompress::ParDecompressBuilder, }; use lz4::{Decoder, EncoderBuilder}; -use seq_io::fastq::{OwnedRecord, Reader}; +use seq_io::fastq::{self, OwnedRecord, Reader, RecordSet}; -use crate::error; +use crate::error::{self, Error}; const WRITE_BUFFER_SIZE: usize = 128 * 1024 * 1024; // 128 MiB buffer size, you can adjust this size as needed @@ -34,6 +36,13 @@ pub enum CompressionType { } impl CompressionType { + /// Returns magic bytes for specified compression type + /// + /// Example: + /// + /// use barkit_extract::fastq::CompressionType; + /// + /// assert_eq!(CompressionType::Gzip.magic_bytes(), &[0x1f, 0x8b]); fn magic_bytes(&self) -> &'static [u8] { match self { CompressionType::Bgzf => &[0x42, 0x43, 0x02, 0x00], @@ -44,7 +53,22 @@ impl CompressionType { } } - pub fn get_output_compression_type(gz: &bool, bgz: &bool, mgz: &bool, lz4: &bool) -> Self { + /// Selects `CompressionType` by provided values + /// + /// Example: + /// + /// use barkit_extract::fastq::CompressionType; + /// + /// assert_eq!( + /// CompressionType::select( + /// true, + /// false, + /// false, + /// false, + /// ), + /// &[0x1f, 0x8b] + /// ); + pub fn select(gz: &bool, bgz: &bool, mgz: &bool, lz4: &bool) -> Self { match (gz, bgz, mgz, lz4) { (true, false, false, false) => Self::Gzip, (false, true, false, false) => Self::Mgzip, @@ -54,7 +78,8 
@@ impl CompressionType { } } - fn get_input_compression_type(path: &Path) -> CompressionType { + /// Detects the compression type (`CompressionType`) of the provided file + fn detect(path: &Path) -> CompressionType { let mut buffer = [0u8; 16]; File::open(path) @@ -74,132 +99,208 @@ impl CompressionType { } } -pub fn get_reads_count( - file: &str, - threads_num: usize, - buffer_size_in_megabytes: Option, -) -> usize { - create_reader(file, threads_num, buffer_size_in_megabytes) - .unwrap_or_else(|_| panic!("couldn't open file {}", file)) - .into_records() - .count() +pub struct FastqReader { + /// FASTQ reader + reader: fastq::Reader>, } -pub fn create_reader( - fastq_path: &str, - threads_num: usize, - buffer_size_in_megabytes: Option, -) -> Result>, error::Error> { - let path = Path::new(fastq_path); - let file = File::open(path).unwrap_or_else(|_| panic!("couldn't open file {}", fastq_path)); - - let buffer_size_in_bytes = get_reader_buffer_size(&file, buffer_size_in_megabytes)?; - - let decoder: Box = match CompressionType::get_input_compression_type(path) { - CompressionType::Gzip | CompressionType::Mgzip => Box::new(MultiGzDecoder::new(file)), - CompressionType::Lz4 => Box::new(Decoder::new(file)?), - CompressionType::Bgzf => Box::new( - ParDecompressBuilder::::new() - .num_threads(threads_num) - .expect("Provided unexpected number of threads") - .from_reader(BufReader::with_capacity(buffer_size_in_bytes, file)), - ), - CompressionType::No => Box::new(file), - }; - - Ok(Reader::new(Box::new(BufReader::with_capacity( - buffer_size_in_bytes, - decoder, - )))) -} +impl FastqReader { + pub fn new(fq: &str, threads: usize, max_memory: Option) -> Result { + let path = Path::new(fq); + let file = File::open(path).unwrap_or_else(|_| panic!("couldn't open file {}", fq)); + + let buffer_size_in_bytes = Self::calculate_buffer_size(&file, max_memory)?; -fn get_reader_buffer_size( - fastq_file: &File, - max_memory: Option, -) -> Result { - let fastq_file_size_bytes = 
fastq_file.metadata()?.len() as usize; - match max_memory { - Some(buffer_size) => { - let buffer_size_bytes = buffer_size * 1024 * 1024; - if buffer_size_bytes > fastq_file_size_bytes { - Ok(fastq_file_size_bytes) - } else { - Ok(buffer_size_bytes) + let decoder: Box = match CompressionType::detect(path) { + CompressionType::Gzip | CompressionType::Mgzip => Box::new(MultiGzDecoder::new(file)), + CompressionType::Lz4 => Box::new(Decoder::new(file)?), + CompressionType::Bgzf => Box::new( + ParDecompressBuilder::::new() + .num_threads(threads) + .expect("Provided unexpected number of threads") + .from_reader(BufReader::with_capacity(buffer_size_in_bytes, file)), + ), + CompressionType::No => Box::new(file), + }; + + Ok(FastqReader { + reader: Reader::new(Box::new(BufReader::with_capacity( + buffer_size_in_bytes, + decoder, + ))), + }) + } + + /// Calculates optimal buffer size based on FASTQ file size and max memory consumption + fn calculate_buffer_size( + fastq_file: &File, + max_memory: Option, + ) -> Result { + let fastq_file_size_bytes = fastq_file.metadata()?.len() as usize; + match max_memory { + Some(buffer_size) => { + let buffer_size_bytes = buffer_size * 1024 * 1024; + if buffer_size_bytes > fastq_file_size_bytes { + Ok(fastq_file_size_bytes) + } else { + Ok(buffer_size_bytes) + } } + None => Ok(fastq_file_size_bytes), } - None => Ok(fastq_file_size_bytes), } + + /// Counts reads in the FASTQ + pub fn count_reads( + file: &str, + threads_num: usize, + buffer_size_in_megabytes: Option, + ) -> usize { + Self::new(file, threads_num, buffer_size_in_megabytes) + .unwrap_or_else(|_| panic!("couldn't open file {}", file)) + .reader + .into_records() + .count() + } + + pub fn read_record_set(&mut self) -> Option { + let mut record_set = RecordSet::default(); + + self.reader + .read_record_set(&mut record_set) + .map(|_| record_set) + } +} + +pub struct FastqsReader { + /// Forward FASTQ reader + reader1: FastqReader, + + /// Reverse FASTQ reader + reader2: 
FastqReader, } -type WriterType = Rc>>>; - -pub fn create_writer( - file: &str, - compression: &CompressionType, - threads_num: usize, - force: bool, -) -> Result { - let path = Path::new(file); - - // Check if file exists and handle force logic - if path.exists() && !force { - return Err(io::Error::new( - io::ErrorKind::AlreadyExists, - format!("File {} already exists and force is set to false", file), - ) - .into()); +impl FastqsReader { + pub fn new( + fq1: &str, + fq2: &str, + threads: usize, + max_memory: Option, + ) -> Result { + Ok(Self { + reader1: FastqReader::new(fq1, threads, max_memory)?, + reader2: FastqReader::new(fq2, threads, max_memory)?, + }) } - let file = if force { - File::create(path)? - } else { - OpenOptions::new().write(true).create_new(true).open(path)? - }; - - let writer: Box = match compression { - CompressionType::Gzip => Box::new(GzEncoder::new(file, Compression::default())), - CompressionType::Bgzf => Box::new( - ParCompressBuilder::::new() - .num_threads(threads_num) - .expect("Provided unexpected number of threads") - .from_writer(file), - ), - CompressionType::Mgzip => Box::new( - ParCompressBuilder::::new() - .num_threads(threads_num) - .expect("Provided unexpected number of threads") - .from_writer(file), - ), - CompressionType::Lz4 => Box::new(EncoderBuilder::new().build(file)?), - _ => Box::new(file), - }; - - Ok(Rc::new(Mutex::new(BufWriter::with_capacity( - WRITE_BUFFER_SIZE, - writer, - )))) + pub fn read_record_sets(&mut self) -> Result<(Option, Option), Error> { + Ok(( + self.reader1.read_record_set(), + self.reader2.read_record_set(), + )) + } } -fn write_read_to_file(read: &OwnedRecord, writer: &mut MutexGuard>>) { - let _ = seq_io::fastq::write_to(&mut **writer, &read.head, &read.seq, &read.qual); +pub struct FastqWriter { + /// FASTQ writer + writer: Rc>>>, } -pub fn save_pair_end_reads_to_file( - result_read_pairs: Vec<(OwnedRecord, OwnedRecord)>, - mut writer1: MutexGuard>>, - mut writer2: MutexGuard>>, -) { - for 
(read1_record, read2_record) in result_read_pairs { - write_read_to_file(&read1_record, &mut writer1); - write_read_to_file(&read2_record, &mut writer2); +impl FastqWriter { + pub fn new( + fq: &str, + compression: &CompressionType, + threads_num: usize, + force: bool, + ) -> Result { + let path = Path::new(fq); + + // Check if file exists and handle force logic + if path.exists() && !force { + return Err(io::Error::new( + io::ErrorKind::AlreadyExists, + format!("{} is already existed, use --force to override", fq), + ) + .into()); + } + + let file = if force { + File::create(path)? + } else { + OpenOptions::new().write(true).create_new(true).open(path)? + }; + + let writer: Box = match compression { + CompressionType::Gzip => Box::new(GzEncoder::new(file, Compression::default())), + CompressionType::Bgzf => Box::new( + ParCompressBuilder::::new() + .num_threads(threads_num) + .expect("Provided unexpected number of threads") + .from_writer(file), + ), + CompressionType::Mgzip => Box::new( + ParCompressBuilder::::new() + .num_threads(threads_num) + .expect("Provided unexpected number of threads") + .from_writer(file), + ), + CompressionType::Lz4 => Box::new(EncoderBuilder::new().build(file)?), + _ => Box::new(file), + }; + + Ok(Self { + writer: Rc::new(Mutex::new(BufWriter::with_capacity( + WRITE_BUFFER_SIZE, + writer, + ))), + }) + } + + fn write(&mut self, read: &OwnedRecord) -> Result<(), io::Error> { + let mut writer = self.writer.lock().expect("Failed to lock writer"); + seq_io::fastq::write_to(&mut *writer, &read.head, &read.seq, &read.qual) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e)) + } + + pub fn write_all(&mut self, result_reads: Vec) -> Result<(), io::Error> { + for read_record in result_reads { + self.write(&read_record)?; + } + + Ok(()) } } -pub fn save_single_end_reads_to_file( - result_reads: Vec, - mut writer: MutexGuard>>, -) { - for read_record in result_reads { - write_read_to_file(&read_record, &mut writer); +pub struct FastqsWriter { 
+ /// Forward FASTQ writer + writer1: FastqWriter, + + /// Reverse FASTQ writer + writer2: FastqWriter, +} + +impl FastqsWriter { + pub fn new( + fq1: &str, + fq2: &str, + compression: &CompressionType, + threads_num: usize, + force: bool, + ) -> Result { + Ok(Self { + writer1: FastqWriter::new(fq1, compression, threads_num, force)?, + writer2: FastqWriter::new(fq2, compression, threads_num, force)?, + }) + } + + pub fn write_all( + &mut self, + pe_reads: Vec<(OwnedRecord, OwnedRecord)>, + ) -> Result<(), io::Error> { + for (read1_record, read2_record) in pe_reads { + self.writer1.write(&read1_record)?; + self.writer2.write(&read2_record)?; + } + Ok(()) } } diff --git a/barkit-extract/src/lib.rs b/barkit-extract/src/lib.rs index 87c0240..0d3c025 100644 --- a/barkit-extract/src/lib.rs +++ b/barkit-extract/src/lib.rs @@ -1,6 +1,6 @@ -pub mod barcode; pub mod error; pub mod fastq; pub mod logger; +pub mod parse; pub mod pattern; pub mod run; diff --git a/barkit-extract/src/logger.rs b/barkit-extract/src/logger.rs index 0ec2725..9764e45 100644 --- a/barkit-extract/src/logger.rs +++ b/barkit-extract/src/logger.rs @@ -1,25 +1,114 @@ -use console::Emoji; -use indicatif::{ProgressBar, ProgressStyle}; +use std::time::Instant; -use crate::error::Error; -use crate::fastq; +use console::{style, Emoji}; +use indicatif::{HumanDuration, ProgressBar, ProgressStyle}; pub static SPARKLE: Emoji<'_, '_> = Emoji("✨ ", ":-)"); +static PROGRESS_BAR_TEMPLATE: &str = + "{spinner:.green} [{elapsed_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} {per_sec} ({eta})"; -pub fn create_progress_bar( - fastq_file: &str, - threads: usize, - max_memory: Option, -) -> Result { - let progress_bar_style = ProgressStyle::with_template( - "{spinner:.green} [{elapsed_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} {per_sec} ({eta})", - ).expect("Failed to parse a progress bar template") - .progress_chars("##-"); +pub struct Logger { + /// Index of the current step + current: usize, - let read_num = 
fastq::get_reads_count(fastq_file, threads, max_memory) as u64; + /// Total number of steps + total: usize, - let progress_bar = ProgressBar::new(read_num as u64); - progress_bar.set_style(progress_bar_style.clone()); + /// If true, logs will not be displayed + quiet: bool, - Ok(progress_bar) + /// Start time of execution + execution_start: Instant, + + /// Progress bar + progress_bar: Option, +} + +impl Logger { + /// Creates `Logger` instance with default values. + /// + /// # Example + /// + /// ``` + /// use barkit_extract::logger::Logger; + /// + /// let logger = Logger::new(3, false); + /// ``` + pub fn new(total: usize, quiet: bool) -> Self { + Self { + current: 0, + total, + quiet, + execution_start: Instant::now(), + progress_bar: None, + } + } + + /// Prints logging message for the current step. + /// + /// # Example + /// + /// ``` + /// use barkit_extract::logger::Logger; + /// + /// let mut logger = Logger::new(2, false); + /// + /// logger.message("first logging message"); + /// // Output: "[1/2] first logging message" + /// + /// logger.message("second logging message"); + /// // Output: "[2/2] second logging message" + /// + /// logger.message("third logging message"); + /// // Output: "Warning: Current step exceeds total steps." 
+ /// ``` + pub fn message(&mut self, text: &str) { + if self.current < self.total { + self.current += 1; + if !self.quiet { + println!( + "{} {}", + style(format!("[{}/{}]", self.current, self.total)) + .bold() + .dim(), + text + ); + } + } else { + // Optionally handle the case where `current` exceeds `total` + eprintln!("Warning: Current step exceeds total steps."); + } + } + + /// Increments progress in the progress bar + pub fn increment_progress(&self, done_lines: usize) { + if let Some(ref pb) = self.progress_bar { + pb.inc(done_lines as u64) + } + } + + /// Sets progress bar instance with specified length + pub fn set_progress_bar(&mut self, size: usize) { + if !self.quiet { + let progress_bar_style = ProgressStyle::with_template(PROGRESS_BAR_TEMPLATE) + .expect("Failed to parse a progress bar template") + .progress_chars("##-"); + + let progress_bar = ProgressBar::new(size as u64); + progress_bar.set_style(progress_bar_style); + + self.progress_bar = Some(progress_bar); + } + } + + /// Prints a final message when all steps are completed + pub fn final_message(&self) { + if self.progress_bar.is_some() { + println!( + "{} Done in {}", + SPARKLE, + HumanDuration(self.execution_start.elapsed()) + ) + } + } } diff --git a/barkit-extract/src/barcode.rs b/barkit-extract/src/parse.rs similarity index 64% rename from barkit-extract/src/barcode.rs rename to barkit-extract/src/parse.rs index c332523..ad4a0d3 100644 --- a/barkit-extract/src/barcode.rs +++ b/barkit-extract/src/parse.rs @@ -1,8 +1,10 @@ -use crate::pattern; -use regex::bytes::{Captures, Regex}; +#![allow(clippy::result_large_err)] + +use crate::pattern::BarcodeRegex; +use regex::bytes::Captures; use seq_io::fastq::{OwnedRecord, Record, RefRecord}; -use std::{fmt, str}; +use std::str; use crate::error::Error; @@ -29,96 +31,31 @@ const TRANSLATION_TABLE: [u8; 256] = { table }; -#[derive(Clone)] -pub enum BarcodeType { - Umi, - Sample, - Cell, -} - -impl fmt::Display for BarcodeType { - fn fmt(&self, f: 
&mut fmt::Formatter) -> fmt::Result { - let barcode_str = match self { - BarcodeType::Umi => "UMI", - BarcodeType::Sample => "SB", - BarcodeType::Cell => "CB", - }; - write!(f, "{}", barcode_str) - } -} - -impl BarcodeType { - fn get_barcode_type(name: &str) -> Result { - match name { - "UMI" => Ok(BarcodeType::Umi), - "SB" => Ok(BarcodeType::Sample), - "CB" => Ok(BarcodeType::Cell), - _ => Err(Error::UnexpectedCaptureGroupName(name.to_owned())), - } - } -} - -#[derive(Clone)] -pub struct BarcodeRegex { - regex: Regex, - barcode_types: Vec, -} - -impl BarcodeRegex { - pub fn new(pattern: &str, max_error: usize) -> Result { - let fuzzy_pattern = pattern::get_with_errors(pattern, &max_error); - let regex = Regex::new(&fuzzy_pattern)?; - let barcode_types = Self::parse_capture_groups(®ex)?; - Ok(Self { - regex, - barcode_types, - }) - } - - pub fn get_captures<'a>(&'a self, read_seq: &'a [u8]) -> Result { - match self.regex.captures(read_seq) { - Some(capture) => Ok(capture), - None => Err(Error::PatternNotMatched), - } - } - - fn parse_capture_groups(regex: &Regex) -> Result, Error> { - let mut capture_groups = Vec::::new(); - for capture_group in regex - .capture_names() - .collect::>() - .into_iter() - .flatten() - { - capture_groups.push(BarcodeType::get_barcode_type(capture_group)?) - } - if capture_groups.is_empty() { - return Err(Error::BarcodeCaptureGroupNotFound(regex.to_string())); - } - Ok(capture_groups) - } -} - pub struct BarcodeParser { + /// Prepared regex pattern to parse barcodes barcode_regex: BarcodeRegex, + + /// If `true`, all captured patterns will not be trimmed skip_trimming: bool, + + /// If `true`, the barcode pattern will also be matched in the reverse complement sequence. 
rc_barcodes: bool, } impl BarcodeParser { pub fn new( - barcode_regex: Option, + barcode_regex: Option<&BarcodeRegex>, skip_trimming: bool, rc_barcodes: bool, ) -> Option { - barcode_regex.map(|regex| BarcodeParser { - barcode_regex: regex, + Some(BarcodeParser { + barcode_regex: barcode_regex?.to_owned(), skip_trimming, rc_barcodes, }) } - pub fn extract_barcodes(&self, record: &RefRecord) -> Option { + pub fn parse_barcodes(&self, record: &RefRecord) -> Option { let read_captures = self.barcode_regex.get_captures(record.seq()); let read_seq_rc: Vec; let read_captures = if read_captures.is_err() && self.rc_barcodes { @@ -127,20 +64,20 @@ impl BarcodeParser { } else { read_captures }; - self.create_new_read(read_captures.map(Some), record) + self.create_read(read_captures.map(Some), record) } - fn create_new_read( + fn create_read( &self, read_captures: Result, Error>, record: &RefRecord, ) -> Option { match (read_captures, self.skip_trimming) { (Ok(Some(captures)), true) => { - Some(self.get_read_with_new_header(&captures, record).ok()?) + Some(self.create_read_with_new_header(&captures, record).ok()?) } (Ok(Some(captures)), false) => { - let new_read = self.get_read_with_new_header(&captures, record).ok()?; + let new_read = self.create_read_with_new_header(&captures, record).ok()?; Some(trim_adapters(captures, &new_read).ok()?) 
} (Ok(None), _) => Some(OwnedRecord { @@ -152,7 +89,7 @@ impl BarcodeParser { } } - fn get_read_with_new_header( + fn create_read_with_new_header( &self, captures: &Captures, record: &RefRecord, @@ -161,7 +98,7 @@ impl BarcodeParser { let seq = record.seq().to_vec(); let qual = record.qual().to_vec(); - for barcode in &self.barcode_regex.barcode_types { + for barcode in &self.barcode_regex.get_barcode_types() { let barcode_name = barcode.to_string(); let (barcode_start, barcode_end) = get_barcode_match_positions(&barcode_name, captures)?; @@ -245,7 +182,7 @@ pub fn get_reverse_complement(sequence: &[u8]) -> Vec { mod tests { use rstest::rstest; - use crate::barcode::get_reverse_complement; + use crate::parse::get_reverse_complement; #[rstest] #[case(b"", b"")] diff --git a/barkit-extract/src/pattern.rs b/barkit-extract/src/pattern.rs index bc4e010..8b8d764 100644 --- a/barkit-extract/src/pattern.rs +++ b/barkit-extract/src/pattern.rs @@ -1,70 +1,237 @@ -use std::mem::size_of; +#![allow(clippy::result_large_err)] -use fancy_regex::Regex; +use std::{fmt, mem::size_of}; + +use fancy_regex::Regex as FancyRegex; +use regex::bytes::{Captures, Regex}; + +use crate::error::Error; const FUZZY_CHARACTER: &str = "."; const ADAPTER_PATTERN_REGEX: &str = r"(? Vec { - if *errors_num == 0 { - return vec![string.to_string().to_ascii_uppercase()]; - } +pub struct BarcodePattern { + adapter_pattern: FancyRegex, + barcode_pattern: String, + max_error: usize, +} - if string.is_empty() { - return Vec::new(); +impl BarcodePattern { + pub fn new(pattern: &str, max_error: &usize) -> Result { + Ok(Self { + adapter_pattern: FancyRegex::new(ADAPTER_PATTERN_REGEX)?, + barcode_pattern: pattern.to_owned(), + max_error: *max_error, + }) } - if *errors_num >= string.len() { - return vec![FUZZY_CHARACTER.repeat(string.len())]; - } + /// Generates sequences with errors that may occur during amplification. 
+ /// + /// # Example + /// + /// ``` + /// use barkit_extract::pattern::BarcodePattern; + /// + /// let barcode_pattern = BarcodePattern::new("^atgc(?[ATGCN]{12})", &1).unwrap(); + /// + /// let sequences_with_errors = barcode_pattern.get_sequence_with_errors("ATGC").unwrap(); + /// assert_eq!(vec!["ATG.", "AT.C", "A.GC", ".TGC"], sequences_with_errors); + /// ``` + pub fn get_sequence_with_errors(&self, sequence: &str) -> Result, Error> { + if self.max_error == 0 { + return Ok(vec![sequence.to_string().to_ascii_uppercase()]); + } - let num_chars = string.chars().count(); - assert!(num_chars <= usize::BITS as usize * 8, "too many characters"); + if sequence.is_empty() { + return Ok(Vec::new()); + } - let max_permutation_mask = usize::MAX - .checked_shr(size_of::() as u32 * 8 - num_chars as u32) - .unwrap(); + if self.max_error >= sequence.len() { + return Ok(vec![FUZZY_CHARACTER.repeat(sequence.len())]); + } - let mut cases = Vec::new(); + let num_chars = sequence.chars().count(); + assert!(num_chars <= usize::BITS as usize * 8, "too many characters"); - let upper: Vec = string.chars().map(|c| c.to_ascii_uppercase()).collect(); + let max_permutation_mask = usize::MAX + .checked_shr(size_of::() as u32 * 8 - num_chars as u32) + .ok_or(Error::PermutationMaskSize)?; - for permutation_mask in 0..=max_permutation_mask { - if permutation_mask.count_ones() as usize != num_chars - errors_num { - continue; - } - let mut s = String::new(); - for (idx, _) in upper.iter().enumerate().take(num_chars) { - if (permutation_mask & (1 << idx)) == 0 { - s.push_str(FUZZY_CHARACTER) - } else { - s.push(upper[idx]) + let mut cases = Vec::new(); + + let upper: Vec = sequence.chars().map(|c| c.to_ascii_uppercase()).collect(); + + for permutation_mask in 0..=max_permutation_mask { + if permutation_mask.count_ones() as usize != num_chars - self.max_error { + continue; + } + let mut s = String::new(); + for (idx, _) in upper.iter().enumerate().take(num_chars) { + if (permutation_mask & (1 << 
idx)) == 0 { + s.push_str(FUZZY_CHARACTER) + } else { + s.push(upper[idx]) + } } + cases.push(s); } - cases.push(s); + Ok(cases) } - cases + /// Returns regex pattern with PCR errors. + /// + /// # Example + /// + /// ``` + /// use barkit_extract::pattern::BarcodePattern; + /// + /// let barcode_pattern = BarcodePattern::new("^atgc(?[ATGCN]{12})", &1).unwrap(); + /// + /// let pattern_with_pcr_errors = barcode_pattern.get_pattern_with_errors().unwrap(); + /// assert_eq!("^(ATG.|AT.C|A.GC|.TGC)(?[ATGCN]{12})", pattern_with_pcr_errors); + /// ``` + pub fn get_pattern_with_errors(&self) -> Result { + let mut result = String::new(); + let mut last_end = 0; + + for mat in self.adapter_pattern.find_iter(&self.barcode_pattern) { + let mat = mat?; + result.push_str(&self.barcode_pattern[last_end..mat.start()]); + + let fuzzy_patterns = self.get_sequence_with_errors(mat.as_str()); + result.push_str(&format!("({})", fuzzy_patterns?.join("|"))); + + last_end = mat.end(); + } + + result.push_str(&self.barcode_pattern[last_end..]); + Ok(result) + } } -pub fn get_with_errors(pattern: &str, max_error: &usize) -> String { - let regex_pattern = Regex::new(ADAPTER_PATTERN_REGEX).unwrap(); +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum BarcodeType { + /// Molecular barcode (UMI) + Umi, - let mut result = String::new(); - let mut last_end = 0; + /// Sample barcode + Sample, - for mat in regex_pattern.find_iter(pattern) { - let mat = mat.unwrap(); - result.push_str(&pattern[last_end..mat.start()]); + /// Cell barcode + Cell, +} - let fuzzy_patterns = generate_sequences_with_pcr_errors(mat.as_str(), max_error); - result.push_str(&format!("({})", fuzzy_patterns.join("|"))); +impl BarcodeType { + /// Parses type of barcode + /// + /// # Example + /// + /// ``` + /// use barkit_extract::pattern::BarcodeType; + /// use barkit_extract::error::Error::UnexpectedCaptureGroupName; + /// + /// assert_eq!(BarcodeType::Umi, BarcodeType::parse_type("UMI").unwrap()); + /// 
assert_eq!(BarcodeType::Sample, BarcodeType::parse_type("SB").unwrap()); + /// assert_eq!(BarcodeType::Cell, BarcodeType::parse_type("CB").unwrap()); + /// ``` + pub fn parse_type(name: &str) -> Result { + match name { + "UMI" => Ok(BarcodeType::Umi), + "SB" => Ok(BarcodeType::Sample), + "CB" => Ok(BarcodeType::Cell), + _ => Err(Error::UnexpectedCaptureGroupName(name.to_owned())), + } + } +} + +impl fmt::Display for BarcodeType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{}", + match self { + BarcodeType::Umi => "UMI", + BarcodeType::Sample => "SB", + BarcodeType::Cell => "CB", + } + ) + } +} + +#[derive(Clone)] +pub struct BarcodeRegex { + /// Regex pattern to parse barcode(s) from read sequence + regex: Regex, + + /// List of barcode types parsed from provided pattern + barcode_types: Vec, +} + +impl BarcodeRegex { + /// Creates `BarcodeRegex` instance + /// + /// Example + /// ``` + /// use barkit_extract::pattern::BarcodeRegex; + /// + /// let barcode_regex = BarcodeRegex::new("^atgc(?[ATGCN]{6})", 1); + /// ``` + pub fn new(pattern: &str, max_error: usize) -> Result { + let barcode_pattern = BarcodePattern::new(pattern, &max_error)?; + let fuzzy_pattern = barcode_pattern.get_pattern_with_errors()?; + let regex = Regex::new(&fuzzy_pattern)?; + let barcode_types = Self::parse_capture_groups(®ex)?; + Ok(Self { + regex, + barcode_types, + }) + } - last_end = mat.end(); + /// Parses capture groups from regex pattern + fn parse_capture_groups(regex: &Regex) -> Result, Error> { + let mut capture_groups = Vec::::new(); + for capture_group in regex + .capture_names() + .collect::>() + .into_iter() + .flatten() + { + capture_groups.push(BarcodeType::parse_type(capture_group)?) 
+ } + if capture_groups.is_empty() { + return Err(Error::BarcodeCaptureGroupNotFound(regex.to_string())); + } + Ok(capture_groups) } - result.push_str(&pattern[last_end..]); - result + /// Captures barcodes in read sequence + /// + /// Example + /// ``` + /// use barkit_extract::pattern::BarcodeRegex; + /// + /// let barcode_regex = BarcodeRegex::new("^atgc(?[ATGCN]{6})", 1).unwrap(); + /// + /// assert_eq!( + /// b"NNNNNN", + /// barcode_regex + /// .get_captures(b"ATGCNNNNNNCCC") + /// .unwrap() + /// .name("UMI") + /// .unwrap() + /// .as_bytes() + /// ); + /// ``` + pub fn get_captures<'a>(&self, read_seq: &'a [u8]) -> Result, Error> { + match self.regex.captures(read_seq) { + Some(capture) => Ok(capture), + None => Err(Error::PatternNotMatched), + } + } + + pub fn get_barcode_types(&self) -> Vec { + self.barcode_types.clone() + } } #[cfg(test)] @@ -83,11 +250,12 @@ mod tests { fn test_generate_sequences_with_pcr_errors( #[case] expected: Vec<&str>, #[case] text: &str, - #[case] errors_num: usize, + #[case] max_error: usize, ) { + let barcode_pattern = pattern::BarcodePattern::new("", &max_error).unwrap(); assert_eq!( expected, - pattern::generate_sequences_with_pcr_errors(text, &errors_num) + barcode_pattern.get_sequence_with_errors(text).unwrap() ); } @@ -103,6 +271,7 @@ mod tests { )] #[case("^(?P[ATGCN]{3})", "^(?P[ATGCN]{3})", 1)] fn test_create_fuzzy(#[case] expected: &str, #[case] pattern: &str, #[case] max_error: usize) { - assert_eq!(expected, pattern::get_with_errors(pattern, &max_error)) + let barcode_pattern = pattern::BarcodePattern::new(pattern, &max_error).unwrap(); + assert_eq!(expected, barcode_pattern.get_pattern_with_errors().unwrap()) } } diff --git a/barkit-extract/src/run.rs b/barkit-extract/src/run.rs index e26fe0c..55dfb9d 100644 --- a/barkit-extract/src/run.rs +++ b/barkit-extract/src/run.rs @@ -1,12 +1,10 @@ -use std::time::Instant; - -use console::style; -use indicatif::HumanDuration; use rayon::prelude::*; +use 
seq_io::fastq::{OwnedRecord, RefRecord}; -use crate::barcode::{self, BarcodeRegex}; -use crate::fastq::{self, CompressionType}; +use crate::fastq::{CompressionType, FastqReader, FastqWriter, FastqsReader, FastqsWriter}; use crate::logger; +use crate::parse::{self, BarcodeParser}; +use crate::pattern::BarcodeRegex; #[allow(clippy::too_many_arguments)] pub fn run( @@ -61,11 +59,31 @@ pub fn run( } } +/// Parses barcodes from single-end reads in parallel +fn parse_se_reads( + records: &Vec, + barcode: &BarcodeRegex, + skip_trimming: bool, + rc_barcodes: bool, +) -> Vec { + records + .par_iter() + .filter_map(|record| { + // Create a new BarcodeParser with the appropriate configuration + let barcodes_parser = BarcodeParser::new(Some(barcode), skip_trimming, rc_barcodes); + + // Parse the barcodes from the RefRecord + // `record` needs to be passed as a `&RefRecord` + barcodes_parser?.parse_barcodes(record) + }) + .collect() +} + #[allow(clippy::too_many_arguments)] fn process_single_end_fastq( - read: String, + fq: String, pattern: String, - out_read: String, + out_fq: String, max_memory: Option, threads: usize, rc_barcodes: bool, @@ -75,74 +93,107 @@ fn process_single_end_fastq( quiet: bool, force: bool, ) { - let mut reader = - fastq::create_reader(&read, threads, max_memory).expect("Failed to create reader"); - let writer = fastq::create_writer(&out_read, &output_compression, threads, force) - .expect("Failed to create writer"); - - if !quiet { - println!( - "{} Parsing barcode patterns...", - style("[1/3]").bold().dim() - ); - } + let mut logger = logger::Logger::new(3, quiet); + logger.message("Estimating reads count..."); - let barcode_re = BarcodeRegex::new(&pattern, max_error) - .expect("Failed to create barcode regex with the provided pattern and max error."); + let lines_number = FastqReader::count_reads(&fq, threads, max_memory); + logger.set_progress_bar(lines_number); - let progress_bar = match quiet { - false => { - println!("{} Estimating reads 
count...", style("[2/3]").bold().dim()); - Some( - logger::create_progress_bar(&read, threads, max_memory) - .expect("Failed to create progress bar"), - ) - } - true => None, - }; - - if !quiet { - println!( - "{} Extracting barcodes from reads...", - style("[3/3]").bold().dim() - ); - } + let mut reader = FastqReader::new(&fq, threads, max_memory).unwrap_or_else(|e| { + eprintln!("{}", e); + std::process::exit(1); + }); + + let mut writer = + FastqWriter::new(&out_fq, &output_compression, threads, force).unwrap_or_else(|e| { + eprintln!("{}", e); + std::process::exit(1); + }); + + logger.message("Parsing barcode patterns..."); + + let barcode = BarcodeRegex::new(&pattern, max_error).unwrap_or_else(|e| { + eprintln!("{}", e); + std::process::exit(1); + }); + + logger.message("Extracting barcodes from reads..."); loop { - let mut record_set = seq_io::fastq::RecordSet::default(); + let record_set = reader.read_record_set(); - let filled_set = reader.read_record_set(&mut record_set); + if let Some(records) = record_set { + // Flatten the record set into individual records + let records = records.into_iter().collect::>(); - if filled_set.is_none() { - break; + // Parallel processing of individual records to extract parsed reads + let result_reads = parse_se_reads(&records, &barcode, skip_trimming, rc_barcodes); + + // Write the processed reads to the output FASTQ + writer.write_all(result_reads).unwrap_or_else(|e| { + eprintln!("{}", e); + std::process::exit(1); + }); + + // Increment the progress tracker based on the number of records processed + logger.increment_progress(records.len()); } else { - let records = record_set.into_iter().collect::>(); - let result_reads: Vec<_> = records - .par_iter() - .filter_map(|record| { - let barcodes_parser = barcode::BarcodeParser::new( - Some(barcode_re.clone()), - skip_trimming, - rc_barcodes, - ); - barcodes_parser?.extract_barcodes(record) - }) - .collect(); - - let writer = writer.lock().unwrap(); - 
fastq::save_single_end_reads_to_file(result_reads, writer); - - if let Some(ref pb) = progress_bar { - pb.inc(records.len() as u64) - } + break; } } - if let Some(pb) = progress_bar { - pb.finish_with_message("all reads successfully processed") + logger.final_message(); +} + +/// Returns final reads, that will be saved to the output file +fn get_new_reads( + new_records: (Option, Option), + record1: &RefRecord, + record2: &RefRecord, +) -> Option<(OwnedRecord, OwnedRecord)> { + match new_records { + (Some(new_record1), Some(new_record2)) => Some((new_record1, new_record2)), + (None, Some(new_record2)) => Some((record1.to_owned_record(), new_record2)), + (Some(new_record1), None) => Some((new_record1, record2.to_owned_record())), + (None, None) => None, } } +/// Parses barcodes from paired-end reads in parallel +fn parse_pe_reads( + records1: &Vec, + records2: &Vec, + barcode1: &Option, + barcode2: &Option, + skip_trimming: bool, + rc_barcodes: bool, +) -> Vec<(OwnedRecord, OwnedRecord)> { + records1 + .par_iter() + .zip(records2.par_iter()) + .filter_map(|(record1, record2)| { + // Create a new `BarcodeParser` with the appropriate configuration for forward and reverse reads + let barcode1_parser = + parse::BarcodeParser::new(barcode1.as_ref(), skip_trimming, rc_barcodes); + let barcode2_parser = + parse::BarcodeParser::new(barcode2.as_ref(), skip_trimming, rc_barcodes); + + // Parse the barcodes from the RefRecord + // `record` needs to be passed as a `&RefRecord` + let new_reads = ( + barcode1_parser + .as_ref() + .and_then(|parser| parser.parse_barcodes(record1)), + barcode2_parser + .as_ref() + .and_then(|parser| parser.parse_barcodes(record2)), + ); + + get_new_reads(new_reads, record1, record2) + }) + .collect() +} + #[allow(clippy::too_many_arguments)] fn process_pair_end_fastq( fq1: String, @@ -160,108 +211,66 @@ fn process_pair_end_fastq( quiet: bool, force: bool, ) { - let mut reader1 = fastq::create_reader(&fq1, threads, max_memory) - .expect("Failed to 
read input forward reads"); - let mut reader2 = fastq::create_reader(&fq2, threads, max_memory) - .expect("Failed to read input reverse reads"); - - let writer1 = fastq::create_writer(&out_fq1, &output_compression, threads, force) - .expect("Failed to write output forward reads"); - let writer2 = fastq::create_writer(&out_fq2, &output_compression, threads, force) - .expect("Failed to write output reverse reads"); - - if !quiet { - println!( - "{} Parsing barcode patterns...", - style("[1/3]").bold().dim() - ); - } + let mut logger = logger::Logger::new(3, quiet); + logger.message("Estimating reads count..."); + + let lines_number = FastqReader::count_reads(&fq1, threads, max_memory); + logger.set_progress_bar(lines_number); + + let mut reader = FastqsReader::new(&fq1, &fq2, threads, max_memory).unwrap_or_else(|e| { + eprintln!("{}", e); + std::process::exit(1); + }); + + let mut writer = FastqsWriter::new(&out_fq1, &out_fq2, &output_compression, threads, force) + .unwrap_or_else(|e| { + eprintln!("{}", e); + std::process::exit(1); + }); + + logger.message("Parsing barcode patterns..."); let barcode1 = pattern1.as_ref().map(|pat| { - BarcodeRegex::new(pat, max_error).expect( - "Failed to create barcode regex for pattern1 with the provided pattern and max error", - ) + BarcodeRegex::new(pat, max_error).unwrap_or_else(|e| { + eprintln!("{}", e); + std::process::exit(1); + }) }); let barcode2 = pattern2.as_ref().map(|pat| { - BarcodeRegex::new(pat, max_error).expect( - "Failed to create barcode regex for pattern2 with the provided pattern and max error", - ) + BarcodeRegex::new(pat, max_error).unwrap_or_else(|e| { + eprintln!("{}", e); + std::process::exit(1); + }) }); - let started = Instant::now(); - let progress_bar = match quiet { - false => { - println!("{} Estimating reads count...", style("[2/3]").bold().dim()); - Some( - logger::create_progress_bar(&fq1, threads, max_memory) - .expect("Failed to create progress bar"), - ) - } - true => None, - }; - - if !quiet 
{ - println!( - "{} Extracting barcodes from reads...", - style("[3/3]").bold().dim() - ); - } + logger.message("Extracting barcodes from reads..."); loop { - let mut record_set1 = seq_io::fastq::RecordSet::default(); - let mut record_set2 = seq_io::fastq::RecordSet::default(); + let record_sets = reader.read_record_sets(); - let filled_set1 = reader1.read_record_set(&mut record_set1); - let filled_set2 = reader2.read_record_set(&mut record_set2); + if let Ok((Some(records1), Some(records2))) = record_sets { + let records1 = records1.into_iter().collect::>(); + let records2 = records2.into_iter().collect::>(); - if filled_set1.is_none() || filled_set2.is_none() { - break; + let new_reads = parse_pe_reads( + &records1, + &records2, + &barcode1, + &barcode2, + skip_trimming, + rc_barcodes, + ); + + writer.write_all(new_reads).unwrap_or_else(|e| { + eprintln!("{}", e); + std::process::exit(1); + }); + + logger.increment_progress(records1.len()); } else { - let records1 = record_set1.into_iter().collect::>(); - let records2 = record_set2.into_iter().collect::>(); - - let result_read_pairs: Vec<_> = records1 - .par_iter() - .zip(records2.par_iter()) - .filter_map(|(record1, record2)| { - let barcode1_parser = - barcode::BarcodeParser::new(barcode1.clone(), skip_trimming, rc_barcodes); - let barcode2_parser = - barcode::BarcodeParser::new(barcode2.clone(), skip_trimming, rc_barcodes); - - let new_records = ( - barcode1_parser - .as_ref() - .and_then(|parser| parser.extract_barcodes(record1)), - barcode2_parser - .as_ref() - .and_then(|parser| parser.extract_barcodes(record2)), - ); - - match new_records { - (Some(new_record1), Some(new_record2)) => Some((new_record1, new_record2)), - (None, Some(new_record2)) => Some((record1.to_owned_record(), new_record2)), - (Some(new_record1), None) => Some((new_record1, record2.to_owned_record())), - (None, None) => None, - } - }) - .collect(); - - let writer1 = writer1.lock().unwrap(); - let writer2 = writer2.lock().unwrap(); - 
fastq::save_pair_end_reads_to_file(result_read_pairs, writer1, writer2); - - if let Some(ref pb) = progress_bar { - pb.inc(records1.len() as u64) - } + break; } } - if progress_bar.is_some() { - println!( - "{} Done in {}", - logger::SPARKLE, - HumanDuration(started.elapsed()) - ) - } + logger.final_message(); } diff --git a/src/lib.rs b/src/lib.rs index 750d688..a763945 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,76 +5,66 @@ use clap::{command, ArgAction, Parser, Subcommand}; pub struct Args { #[command(subcommand)] pub command: Commands, + + /// Max RAM usage in megabytes + #[arg(short = 'm', long)] + pub max_memory: Option, + + /// The approximate number of threads to use. + #[arg(short = 't', long, default_value = "1", global = true)] + pub threads: usize, + + /// Be quiet and do not show extra information + #[arg(long, action = ArgAction::SetTrue, global = true)] + pub quiet: bool, + + /// Overwrite output files + #[arg(short = 'f', long, action = ArgAction::SetTrue, global = true)] + pub force: bool, } #[derive(Subcommand, Debug)] pub enum Commands { - /// Tool for parsing barcodes from single-end or paired-end FASTQ files + /// Extract barcode nucleotide sequence according to a specified regex pattern #[clap(arg_required_else_help = true)] Extract { - /// Input forward FASTQ file - #[arg(short = '1', long, value_name = "IN_FASTQ1", requires = "out_fq1")] - fq1: String, - - /// Input reverse FASTQ file - #[arg(short = '2', long, value_name = "IN_FASTQ2", requires_all = ["fq1", "out_fq2"])] - fq2: Option, - - /// Output forward FASTQ file - #[arg(short = 'o', long, value_name = "OUT_FASTQ1")] - out_fq1: String, + #[clap(flatten)] + input_fastqs: InputsGroup, - /// Output reverse FASTQ file - #[arg(short = 'O', long, value_name = "OUT_FASTQ2", requires = "out_fq1")] - out_fq2: Option, + #[clap(flatten)] + output_fastqs: OutputsGroup, #[clap(flatten)] patterns: PatternsGroup, - /// Max RAM usage in megabytes - #[arg(short = 'm', long)] - max_memory: Option, - 
- /// The approximate number of threads to use. - #[arg(short = 't', long, default_value = "1")] - threads: usize, - - /// Searches for both barcode pattern in reverse complement - #[arg(short = 'r', long, action=ArgAction::SetTrue)] - rc_barcodes: bool, - - /// Skip trimming the adapter sequence from the read - #[arg(short = 's', long, action=ArgAction::SetTrue)] - skip_trimming: bool, - - /// Max error (mismatch) between provided pattern and read sequence - #[arg(short = 'e', long, default_value = "1")] - max_error: usize, - - /// Compress outputs in gzip format - #[arg(long, action = ArgAction::SetTrue, conflicts_with_all = ["bgz", "mgz", "lz4"])] - gz: bool, - - /// Compress outputs in bgzf (bgzip) format - #[arg(long, action = ArgAction::SetTrue, conflicts_with_all = ["gz", "mgz", "lz4"])] - bgz: bool, - - /// Compress outputs in mgzip format - #[arg(long, action = ArgAction::SetTrue, conflicts_with_all = ["gz", "bgz", "lz4"])] - mgz: bool, + #[clap(flatten)] + compression: CompressionGroup, - /// Compress outputs in lz4 format - #[arg(long, action = ArgAction::SetTrue, conflicts_with_all = ["gz", "bgz", "mgz"])] - lz4: bool, + #[clap(flatten)] + additional_params: AdditionalParamsGroup, + }, +} - /// Be quiet and do not show extra information - #[arg(short = 'q', long, action = ArgAction::SetTrue)] - quiet: bool, +#[derive(Debug, clap::Args)] +pub struct InputsGroup { + /// Input forward FASTQ file + #[arg(short = '1', long, value_name = "IN_FASTQ1", requires = "out_fq1")] + pub fq1: String, + + /// Input reverse FASTQ file + #[arg(short = '2', long, value_name = "IN_FASTQ2", requires_all = ["fq1", "out_fq2"])] + pub fq2: Option, +} - /// Overwrite output files - #[arg(short = 'f', long, action = ArgAction::SetTrue)] - force: bool, - }, +#[derive(Debug, clap::Args)] +pub struct OutputsGroup { + /// Output forward FASTQ file + #[arg(short = 'o', long, value_name = "OUT_FASTQ1")] + pub out_fq1: String, + + /// Output reverse FASTQ file + #[arg(short = 'O', 
long, value_name = "OUT_FASTQ2", requires = "out_fq1")] + pub out_fq2: Option, } #[derive(Debug, clap::Args)] @@ -88,3 +78,37 @@ pub struct PatternsGroup { #[arg(short = 'P', long, requires = "fq2")] pub pattern2: Option, } + +#[derive(Debug, clap::Args)] +pub struct CompressionGroup { + /// Compress outputs in gzip format + #[arg(long, action = ArgAction::SetTrue, conflicts_with_all = ["bgz", "mgz", "lz4"])] + pub gz: bool, + + /// Compress outputs in bgzf (bgzip) format + #[arg(long, action = ArgAction::SetTrue, conflicts_with_all = ["gz", "mgz", "lz4"])] + pub bgz: bool, + + /// Compress outputs in mgzip format + #[arg(long, action = ArgAction::SetTrue, conflicts_with_all = ["gz", "bgz", "lz4"])] + pub mgz: bool, + + /// Compress outputs in lz4 format + #[arg(long, action = ArgAction::SetTrue, conflicts_with_all = ["gz", "bgz", "mgz"])] + pub lz4: bool, +} + +#[derive(Debug, clap::Args)] +pub struct AdditionalParamsGroup { + /// Searches for both barcode pattern in reverse complement + #[arg(short = 'r', long, action=ArgAction::SetTrue)] + pub rc_barcodes: bool, + + /// Skip trimming the adapter sequence from the read + #[arg(short = 's', long, action=ArgAction::SetTrue)] + pub skip_trimming: bool, + + /// Max error (mismatch) between provided pattern and read sequence + #[arg(short = 'e', long, default_value = "1")] + pub max_error: usize, +} diff --git a/src/main.rs b/src/main.rs index acaa0f7..bab7809 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,42 +5,33 @@ fn main() { match &args.command { barkit::Commands::Extract { - fq1, - fq2, - out_fq1, - out_fq2, - max_memory, - threads, - rc_barcodes, - skip_trimming, - max_error, + input_fastqs, + output_fastqs, + additional_params, patterns, - gz, - bgz, - mgz, - lz4, - quiet, - force, + compression, } => { - let output_compression = - barkit_extract::fastq::CompressionType::get_output_compression_type( - gz, bgz, mgz, lz4, - ); + let output_compression = barkit_extract::fastq::CompressionType::select( + 
&compression.gz, + &compression.bgz, + &compression.mgz, + &compression.lz4, + ); barkit_extract::run::run( - fq1.to_string(), - fq2.clone(), + input_fastqs.fq1.to_string(), + input_fastqs.fq2.clone(), patterns.pattern1.clone(), patterns.pattern2.clone(), - out_fq1.to_string(), - out_fq2.clone(), - *max_memory, - *threads, - *rc_barcodes, - *skip_trimming, - *max_error, + output_fastqs.out_fq1.to_string(), + output_fastqs.out_fq2.clone(), + args.max_memory, + args.threads, + additional_params.rc_barcodes, + additional_params.skip_trimming, + additional_params.max_error, output_compression, - *quiet, - *force, + args.quiet, + args.force, ); } }