diff --git a/Cargo.toml b/Cargo.toml index 9a7f2f2..46f173d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,8 @@ clap-verbosity-flag = "2.2.0" color-eyre = "0.6.3" glob = "0.3.1" rayon = "1.9.0" +regex = "1.10.4" +tracing = "0.1.40" [profile.dev.package.backtrace] opt-level = 3 diff --git a/src/cli.rs b/src/cli.rs index 8369cdf..3d059c9 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -57,10 +57,10 @@ pub enum Commands { /// The sequencing platform where FASTQs came from #[arg(short, long, required = true)] platform: SeqPlatform, - // /// Output file prefix (the part before the `_samplesheet.csv`) - // #[arg(short, long, required = false)] - // output_file: Option, + /// Output file prefix (the part before the `_samplesheet.csv`) + #[arg(short, long, required = false, default_value = None)] + output_prefix: Option, // /// Check a pre-existing samplesheet // #[arg(short, long, required = false, default_value = "samplesheet.csv")] // check: Option, @@ -81,5 +81,9 @@ pub enum Commands { /// the number of cells expected #[arg(short, long, required = true)] expected_cells: i64, + + /// Output file prefix (the part before the `_samplesheet.csv`) + #[arg(short, long, required = false, default_value = None)] + output_prefix: Option, }, } diff --git a/src/main.rs b/src/main.rs index cd1a59d..42a3869 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,15 +15,17 @@ fn main() -> Result<()> { input_dir, fastq_ext, platform, + output_prefix, }) => { - viralrecon::give_a_sheet(input_dir, fastq_ext, platform)?; + viralrecon::give_a_sheet(input_dir, fastq_ext, platform, output_prefix)?; } Some(Commands::Scrnaseq { input_dir, fastq_ext, expected_cells, + output_prefix, }) => { - scrnaseq::give_a_sheet(input_dir, fastq_ext, &expected_cells)?; + scrnaseq::give_a_sheet(input_dir, fastq_ext, &expected_cells, output_prefix)?; } None => { eprintln!("{}\n", cli::INFO); diff --git a/src/scrnaseq.rs b/src/scrnaseq.rs index 56c478d..6641af9 100644 --- a/src/scrnaseq.rs +++ b/src/scrnaseq.rs @@ -1,9 +1,14 @@ +use regex::Regex; use std::{collections::HashSet, ffi::OsStr, path::Path, rc::Rc}; +use tracing::warn; +use crate::utils::write_lines; pub use crate::viralrecon::find_files; use color_eyre::eyre::Result; fn retrieve_samples(file_paths: &[Rc]) -> HashSet> { + let illumina_pattern = Regex::new(r"_L\d{3}_R\d_\d{3}\.fastq\.gz$").unwrap(); + file_paths .into_iter() .map(|path| { @@ -11,14 +16,21 @@ fn retrieve_samples(file_paths: &[Rc]) -> HashSet> { path.file_name() .unwrap_or(OsStr::new("")) .to_string_lossy() - .replace("_L001_R1_001.fastq.gz", "") - .replace("_L001_R2_001.fastq.gz", "") .as_ref(), ) }) + .map(|x| Rc::from(illumina_pattern.replace_all(&x, "").to_string())) .collect() } +fn check_sample_ids(sample_ids: &HashSet>) { + for id in sample_ids { + if id.chars().count() >= 64 { + warn!("Sample id {} is 64 or more characters long, which is maximum enforced by some of the SCRNAseq aligners. SCRNAseq may crash if you don't manually shorten the id in your samplesheet.", id) + } + } +} + fn collect_per_sample( sample_id: &Rc, fastq_paths: &[Rc], @@ -52,7 +64,7 @@ fn collect_per_sample( Ok(vec![sample_id, fastq1, fastq2, &cell_str].join(",")) } -pub fn concat_lines( +fn concat_lines( sample_ids: &HashSet>, fastq_paths: &[Rc], expected_cells: &i64, @@ -63,14 +75,21 @@ pub fn concat_lines( .collect::>() } -pub fn give_a_sheet(input_dir: &Path, fastq_ext: &str, expected_cells: &i64) -> Result<()> { +pub fn give_a_sheet( + input_dir: &Path, + fastq_ext: &str, + expected_cells: &i64, + output_prefix: &Option, +) -> Result<()> { + // find the FASTQ files and separate out the unique sample IDs let fastq_paths = find_files(input_dir, fastq_ext)?; let sample_ids: HashSet> = retrieve_samples(&fastq_paths); - let lines = concat_lines(&sample_ids, &fastq_paths, expected_cells); - for line in lines { - println!("{}", line) - } + // check the sample IDs for any that are too long + check_sample_ids(&sample_ids); - Ok(()) + // concatenate and write the lines + let lines = concat_lines(&sample_ids, &fastq_paths, expected_cells); + let header = "sample,fastq_1,fastq_2,expected_cells"; + write_lines(&lines, header, output_prefix) } diff --git a/src/utils.rs b/src/utils.rs index df10ad7..9df980f 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,4 +1,5 @@ -use std::{collections::HashSet, fmt, path::Path, rc::Rc}; +use color_eyre::eyre::Result; +use std::{collections::HashSet, fmt, fs::File, io::BufWriter, io::Write, path::Path, rc::Rc}; use clap::ValueEnum; @@ -26,3 +27,20 @@ impl fmt::Display for SeqPlatform { pub trait RetrieveSampleIds { fn retrieve_samples(&self, file_paths: &[Rc]) -> HashSet>; } + +pub fn write_lines(lines: &[String], header: &str, output_prefix: &Option) -> Result<()> { + let out_name = match output_prefix { + Some(prefix) => format!("{}_samplesheet.csv", prefix), + None => String::from("samplesheet.csv"), + }; + + let file = File::create(out_name)?; + let mut buf_writer = BufWriter::new(file); + + writeln!(buf_writer, "{}", header)?; + lines + .into_iter() + .try_for_each(|line| writeln!(buf_writer, "{}", line))?; + + Ok(()) +} diff --git a/src/viralrecon.rs b/src/viralrecon.rs index 443f74c..c6f0c7b 100644 --- a/src/viralrecon.rs +++ b/src/viralrecon.rs @@ -1,6 +1,7 @@ -use crate::utils::SeqPlatform; +use crate::utils::{write_lines, SeqPlatform}; use color_eyre::eyre::Result; use glob::glob; +use regex::Regex; use std::{collections::HashSet, ffi::OsStr, path::Path, rc::Rc}; use crate::utils::RetrieveSampleIds; @@ -23,19 +24,21 @@ impl RetrieveSampleIds for SeqPlatform { fn retrieve_samples(&self, file_paths: &[Rc]) -> HashSet> { match self { // handle paired end FASTQ files for Illumina - SeqPlatform::Illumina => file_paths - .into_iter() - .map(|path| { - Rc::from( - path.file_name() - .unwrap_or(OsStr::new("")) - .to_string_lossy() - .replace("_L001_R1_001.fastq.gz", "") - .replace("_L001_R2_001.fastq.gz", "") - .as_ref(), - ) - }) - .collect(), + SeqPlatform::Illumina => { + let illumina_pattern = Regex::new(r"_L\d{3}_R\d_\d{3}\.fastq\.gz$").unwrap(); + file_paths + .into_iter() + .map(|path| { + Rc::from( + path.file_name() + .unwrap_or(OsStr::new("")) + .to_string_lossy() + .as_ref(), + ) + }) + .map(|x| Rc::from(illumina_pattern.replace_all(&x, "").to_string())) + .collect() + } // handle per-barcode single FASTQs for Nanopore SeqPlatform::Nanopore => file_paths .into_iter() @@ -121,14 +124,15 @@ pub fn concat_lines( .collect::>() } -pub fn give_a_sheet(input_dir: &Path, fastq_ext: &str, platform: &SeqPlatform) -> Result<()> { +pub fn give_a_sheet( + input_dir: &Path, + fastq_ext: &str, + platform: &SeqPlatform, + output_prefix: &Option, +) -> Result<()> { let fastq_paths = find_files(input_dir, fastq_ext)?; let sample_ids: &HashSet> = &platform.retrieve_samples(&fastq_paths); let lines = concat_lines(sample_ids, &fastq_paths, platform); - - for line in lines { - eprintln!("{}", line); - } - - Ok(()) + let header = "sample,fastq_1,fastq_2"; + write_lines(&lines, header, output_prefix) }