Skip to content

Commit

Permalink
give a sheet now handles writing
Browse files Browse the repository at this point in the history
  • Loading branch information
nrminor committed Mar 27, 2024
1 parent 7b5dbfc commit 458644f
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 36 deletions.
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ clap-verbosity-flag = "2.2.0"
color-eyre = "0.6.3"
glob = "0.3.1"
rayon = "1.9.0"
regex = "1.10.4"
tracing = "0.1.40"

[profile.dev.package.backtrace]
opt-level = 3
Expand Down
10 changes: 7 additions & 3 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,10 @@ pub enum Commands {
/// The sequencing platform where FASTQs came from
#[arg(short, long, required = true)]
platform: SeqPlatform,
// /// Output file prefix (the part before the `_samplesheet.csv`)
// #[arg(short, long, required = false)]
// output_file: Option<String>,

/// Output file prefix (the part before the `_samplesheet.csv`)
#[arg(short, long, required = false, default_value = None)]
output_prefix: Option<String>,
// /// Check a pre-existing samplesheet
// #[arg(short, long, required = false, default_value = "samplesheet.csv")]
// check: Option<String>,
Expand All @@ -81,5 +81,9 @@ pub enum Commands {
/// the number of cells expected
#[arg(short, long, required = true)]
expected_cells: i64,

/// Output file prefix (the part before the `_samplesheet.csv`)
#[arg(short, long, required = false, default_value = None)]
output_prefix: Option<String>,
},
}
6 changes: 4 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,17 @@ fn main() -> Result<()> {
input_dir,
fastq_ext,
platform,
output_prefix,
}) => {
viralrecon::give_a_sheet(input_dir, fastq_ext, platform)?;
viralrecon::give_a_sheet(input_dir, fastq_ext, platform, output_prefix)?;
}
Some(Commands::Scrnaseq {
input_dir,
fastq_ext,
expected_cells,
output_prefix,
}) => {
scrnaseq::give_a_sheet(input_dir, fastq_ext, &expected_cells)?;
scrnaseq::give_a_sheet(input_dir, fastq_ext, &expected_cells, output_prefix)?;
}
None => {
eprintln!("{}\n", cli::INFO);
Expand Down
37 changes: 28 additions & 9 deletions src/scrnaseq.rs
Original file line number Diff line number Diff line change
@@ -1,24 +1,36 @@
use regex::Regex;
use std::{collections::HashSet, ffi::OsStr, path::Path, rc::Rc};
use tracing::warn;

use crate::utils::write_lines;
pub use crate::viralrecon::find_files;
use color_eyre::eyre::Result;

/// Builds the set of sample IDs found among `file_paths`.
///
/// For each path, the final path component (the file name) is taken as text and
/// a trailing Illumina-style suffix of the form `_L###_R#_###.fastq.gz` is
/// removed; what remains is used as the sample ID. Results are collected into a
/// `HashSet`, so files that reduce to the same ID yield a single entry.
///
/// # Panics
///
/// Panics only if the hard-coded regex fails to compile (`unwrap` on
/// `Regex::new`).
fn retrieve_samples(file_paths: &[Rc<Path>]) -> HashSet<Rc<str>> {
// Anchored with `$`, so only a suffix at the very end of the name is stripped.
let illumina_pattern = Regex::new(r"_L\d{3}_R\d_\d{3}\.fastq\.gz$").unwrap();

file_paths
.into_iter()
.map(|path| {
Rc::from(
path.file_name()
// A path with no file name component falls back to an empty string.
.unwrap_or(OsStr::new(""))
// Lossy conversion: invalid UTF-8 is replaced rather than rejected.
.to_string_lossy()
// NOTE(review): these two literal replacements handle the L001 / R1|R2 / 001
// case of the suffix that the regex pass below also strips at the end of the
// name; presumably redundant now -- confirm before removing.
.replace("_L001_R1_001.fastq.gz", "")
.replace("_L001_R2_001.fastq.gz", "")
.as_ref(),
)
})
// Strip any lane/read/chunk suffix matched by the pattern, not just lane 001.
.map(|x| Rc::from(illumina_pattern.replace_all(&x, "").to_string()))
.collect()
}

/// Warns about every sample ID that is 64 or more characters long.
///
/// Length is counted in `char`s rather than bytes. The IDs themselves are left
/// untouched and nothing is returned; the only effect is one `warn!` event per
/// offending ID.
fn check_sample_ids(sample_ids: &HashSet<Rc<str>>) {
    sample_ids
        .iter()
        .filter(|sample| sample.chars().count() >= 64)
        .for_each(|sample| {
            warn!("Sample id {} is 64 or more characters long, which is maximum enforced by some of the SCRNAseq aligners. SCRNAseq may crash if you don't manually shorten the id in your samplesheet.", sample);
        });
}

fn collect_per_sample(
sample_id: &Rc<str>,
fastq_paths: &[Rc<Path>],
Expand Down Expand Up @@ -52,7 +64,7 @@ fn collect_per_sample(
Ok(vec![sample_id, fastq1, fastq2, &cell_str].join(","))
}

pub fn concat_lines(
fn concat_lines(
sample_ids: &HashSet<Rc<str>>,
fastq_paths: &[Rc<Path>],
expected_cells: &i64,
Expand All @@ -63,14 +75,21 @@ pub fn concat_lines(
.collect::<Vec<String>>()
}

pub fn give_a_sheet(input_dir: &Path, fastq_ext: &str, expected_cells: &i64) -> Result<()> {
pub fn give_a_sheet(
input_dir: &Path,
fastq_ext: &str,
expected_cells: &i64,
output_prefix: &Option<String>,
) -> Result<()> {
// find the FASTQ files and separate out the unique sample IDs
let fastq_paths = find_files(input_dir, fastq_ext)?;
let sample_ids: HashSet<Rc<str>> = retrieve_samples(&fastq_paths);
let lines = concat_lines(&sample_ids, &fastq_paths, expected_cells);

for line in lines {
println!("{}", line)
}
// check the sample IDs for any that are too long
check_sample_ids(&sample_ids);

Ok(())
// concatenate and write the lines
let lines = concat_lines(&sample_ids, &fastq_paths, expected_cells);
let header = "sample,fastq_1,fastq_2,expected_cells";
write_lines(&lines, header, output_prefix)
}
20 changes: 19 additions & 1 deletion src/utils.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use std::{collections::HashSet, fmt, path::Path, rc::Rc};
use color_eyre::eyre::Result;
use std::{collections::HashSet, fmt, fs::File, io::BufWriter, io::Write, path::Path, rc::Rc};

use clap::ValueEnum;

Expand Down Expand Up @@ -26,3 +27,20 @@ impl fmt::Display for SeqPlatform {
/// Strategy for deriving sample IDs from a collection of FASTQ file paths.
///
/// In this commit the trait is implemented for `SeqPlatform` in
/// `src/viralrecon.rs`, where the ID is derived differently per platform.
pub trait RetrieveSampleIds {
/// Returns the set of sample IDs derived from `file_paths`.
///
/// The result is a `HashSet`, so each ID appears at most once regardless of
/// how many input paths map to it.
fn retrieve_samples(&self, file_paths: &[Rc<Path>]) -> HashSet<Rc<str>>;
}

/// Writes a samplesheet CSV consisting of `header` followed by every entry of
/// `lines`, one per row.
///
/// The output path is `<prefix>_samplesheet.csv` when `output_prefix` is
/// `Some(prefix)` and `samplesheet.csv` otherwise; the path is used as given, so
/// a prefix may include a directory. An existing file at that path is
/// overwritten. Each row is terminated with `\n`.
///
/// # Errors
///
/// Returns an error if the file cannot be created, or if writing or flushing
/// any part of the output fails.
pub fn write_lines(lines: &[String], header: &str, output_prefix: &Option<String>) -> Result<()> {
    let out_name = match output_prefix {
        Some(prefix) => format!("{}_samplesheet.csv", prefix),
        None => String::from("samplesheet.csv"),
    };

    let mut buf_writer = BufWriter::new(File::create(out_name)?);

    writeln!(buf_writer, "{}", header)?;
    for line in lines {
        writeln!(buf_writer, "{}", line)?;
    }

    // Flush explicitly: dropping a `BufWriter` also flushes, but any error from
    // that implicit flush is discarded, which would let a truncated sheet be
    // reported as success.
    buf_writer.flush()?;

    Ok(())
}
46 changes: 25 additions & 21 deletions src/viralrecon.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use crate::utils::SeqPlatform;
use crate::utils::{write_lines, SeqPlatform};
use color_eyre::eyre::Result;
use glob::glob;
use regex::Regex;
use std::{collections::HashSet, ffi::OsStr, path::Path, rc::Rc};

use crate::utils::RetrieveSampleIds;
Expand All @@ -23,19 +24,21 @@ impl RetrieveSampleIds for SeqPlatform {
fn retrieve_samples(&self, file_paths: &[Rc<Path>]) -> HashSet<Rc<str>> {
match self {
// handle paired end FASTQ files for Illumina
SeqPlatform::Illumina => file_paths
.into_iter()
.map(|path| {
Rc::from(
path.file_name()
.unwrap_or(OsStr::new(""))
.to_string_lossy()
.replace("_L001_R1_001.fastq.gz", "")
.replace("_L001_R2_001.fastq.gz", "")
.as_ref(),
)
})
.collect(),
SeqPlatform::Illumina => {
let illumina_pattern = Regex::new(r"_L\d{3}_R\d_\d{3}\.fastq\.gz$").unwrap();
file_paths
.into_iter()
.map(|path| {
Rc::from(
path.file_name()
.unwrap_or(OsStr::new(""))
.to_string_lossy()
.as_ref(),
)
})
.map(|x| Rc::from(illumina_pattern.replace_all(&x, "").to_string()))
.collect()
}
// handle per-barcode single FASTQs for Nanopore
SeqPlatform::Nanopore => file_paths
.into_iter()
Expand Down Expand Up @@ -121,14 +124,15 @@ pub fn concat_lines(
.collect::<Vec<String>>()
}

pub fn give_a_sheet(input_dir: &Path, fastq_ext: &str, platform: &SeqPlatform) -> Result<()> {
pub fn give_a_sheet(
input_dir: &Path,
fastq_ext: &str,
platform: &SeqPlatform,
output_prefix: &Option<String>,
) -> Result<()> {
let fastq_paths = find_files(input_dir, fastq_ext)?;
let sample_ids: &HashSet<Rc<str>> = &platform.retrieve_samples(&fastq_paths);
let lines = concat_lines(sample_ids, &fastq_paths, platform);

for line in lines {
eprintln!("{}", line);
}

Ok(())
let header = "sample,fastq_1,fastq_2";
write_lines(&lines, header, output_prefix)
}

0 comments on commit 458644f

Please sign in to comment.