From e2ef96d9ea59dd75fa03c21c6e36032790e2169b Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Wed, 21 Aug 2024 16:56:22 -0500 Subject: [PATCH 01/14] feat: Add BAM to fastq conversion functionality --- crates/deepbiop-bam/Cargo.toml | 1 + crates/deepbiop-bam/src/io.rs | 35 +++++++++++++++++++++++++++ crates/deepbiop-bam/src/lib.rs | 1 + crates/deepbiop-cli/src/cli.rs | 2 ++ crates/deepbiop-cli/src/cli/bam2fq.rs | 32 ++++++++++++++++++++++++ crates/deepbiop-cli/src/cli/fq2fa.rs | 0 crates/deepbiop-cli/src/main.rs | 10 +++++++- 7 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 crates/deepbiop-bam/src/io.rs create mode 100644 crates/deepbiop-cli/src/cli/bam2fq.rs create mode 100644 crates/deepbiop-cli/src/cli/fq2fa.rs diff --git a/crates/deepbiop-bam/Cargo.toml b/crates/deepbiop-bam/Cargo.toml index 424002f..078bf1b 100644 --- a/crates/deepbiop-bam/Cargo.toml +++ b/crates/deepbiop-bam/Cargo.toml @@ -24,6 +24,7 @@ lexical = { workspace = true } derive_builder = { workspace = true } deepbiop-utils = { workspace = true } +deepbiop-fq = { workspace = true } [dev-dependencies] bio = { workspace = true } diff --git a/crates/deepbiop-bam/src/io.rs b/crates/deepbiop-bam/src/io.rs new file mode 100644 index 0000000..5926ee0 --- /dev/null +++ b/crates/deepbiop-bam/src/io.rs @@ -0,0 +1,35 @@ +use anyhow::Result; +use noodles::{bam, bgzf}; +use rayon::prelude::*; +use std::{fs::File, num::NonZeroUsize, path::Path, thread}; + +use noodles::fastq; + +pub fn bam2fq(bam: &Path, threads: Option) -> Result> { + let file = File::open(bam)?; + let worker_count = if let Some(threads) = threads { + NonZeroUsize::new(threads) + .unwrap() + .min(thread::available_parallelism().unwrap_or(NonZeroUsize::MIN)) + } else { + thread::available_parallelism().unwrap_or(NonZeroUsize::MIN) + }; + + let decoder = bgzf::MultithreadedReader::with_worker_count(worker_count, file); + let mut reader = bam::io::Reader::from(decoder); + let _header = reader.read_header()?; + + Ok(reader + 
.records() + .par_bridge() + .map(|result| { + let record = result.unwrap(); + let fq_record = fastq::Record::new( + fastq::record::Definition::new(record.name().unwrap().to_vec(), ""), + record.sequence().as_ref().to_vec(), + record.quality_scores().as_ref().to_vec(), + ); + fq_record + }) + .collect::>()) +} diff --git a/crates/deepbiop-bam/src/lib.rs b/crates/deepbiop-bam/src/lib.rs index 8450b02..534c609 100644 --- a/crates/deepbiop-bam/src/lib.rs +++ b/crates/deepbiop-bam/src/lib.rs @@ -2,6 +2,7 @@ pub mod chimeric; pub mod cigar; +pub mod io; #[cfg(feature = "python")] pub mod python; diff --git a/crates/deepbiop-cli/src/cli.rs b/crates/deepbiop-cli/src/cli.rs index 1bd50b3..372e6e6 100644 --- a/crates/deepbiop-cli/src/cli.rs +++ b/crates/deepbiop-cli/src/cli.rs @@ -1,3 +1,5 @@ +pub mod bam2fq; +pub use bam2fq::*; pub mod chimeric_count; pub use chimeric_count::*; diff --git a/crates/deepbiop-cli/src/cli/bam2fq.rs b/crates/deepbiop-cli/src/cli/bam2fq.rs new file mode 100644 index 0000000..4107f38 --- /dev/null +++ b/crates/deepbiop-cli/src/cli/bam2fq.rs @@ -0,0 +1,32 @@ +use anyhow::Result; +use clap::Parser; +use deepbiop_bam as bam; +use deepbiop_fq as fq; + +use std::path::PathBuf; + +use super::set_up_threads; + +#[derive(Debug, Parser)] +pub struct BamToFq { + /// path to the bam file + #[arg(value_name = "bam", action=clap::ArgAction::Append)] + bam: Vec, + + /// threads number + #[arg(short, long, default_value = "2")] + threads: Option, +} + +impl BamToFq { + pub fn run(&self) -> Result<()> { + set_up_threads(self.threads)?; + + for bam in &self.bam { + let fq_records = bam::io::bam2fq(bam, self.threads)?; + let file_path = bam.with_extension("fq.bgz"); + fq::io::write_fq_parallel_for_noodle_record(&fq_records, file_path, self.threads)?; + } + Ok(()) + } +} diff --git a/crates/deepbiop-cli/src/cli/fq2fa.rs b/crates/deepbiop-cli/src/cli/fq2fa.rs new file mode 100644 index 0000000..e69de29 diff --git a/crates/deepbiop-cli/src/main.rs 
b/crates/deepbiop-cli/src/main.rs index 9f67d84..1b3ce76 100644 --- a/crates/deepbiop-cli/src/main.rs +++ b/crates/deepbiop-cli/src/main.rs @@ -30,12 +30,16 @@ struct Cli { pub enum Commands { /// Count chimeric reads in a BAM file. CountChimeric(cli::CountChimeric), + + /// BAM to fastq conversion. + BamToFq(cli::BamToFq), } impl Display for Commands { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Commands::CountChimeric(_) => write!(f, "chimericcount"), + Commands::BamToFq(_) => write!(f, "bam2fq"), } } } @@ -73,7 +77,11 @@ fn main() -> Result<()> { match &cli.command { Some(Commands::CountChimeric(count_chimeric)) => { - println!("{:?}", count_chimeric.run()); + count_chimeric.run().unwrap(); + } + + Some(Commands::BamToFq(bam2fq)) => { + bam2fq.run().unwrap(); } None => { From 4b5b60247cfba7d492ad0f620e6cddc9d036f870 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Wed, 21 Aug 2024 16:56:48 -0500 Subject: [PATCH 02/14] feat: Add new file fq2fa.rs --- crates/deepbiop-cli/src/cli/fq2fa.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/deepbiop-cli/src/cli/fq2fa.rs b/crates/deepbiop-cli/src/cli/fq2fa.rs index e69de29..8b13789 100644 --- a/crates/deepbiop-cli/src/cli/fq2fa.rs +++ b/crates/deepbiop-cli/src/cli/fq2fa.rs @@ -0,0 +1 @@ + From ef166c24d8b3f186c2828c7aa5233e7ed901cf02 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Wed, 21 Aug 2024 18:02:05 -0500 Subject: [PATCH 03/14] feat: Add fastq to fasta conversion command --- crates/deepbiop-cli/Cargo.toml | 1 + crates/deepbiop-cli/src/cli.rs | 2 ++ crates/deepbiop-cli/src/cli/fq2fa.rs | 35 ++++++++++++++++++++++++++++ crates/deepbiop-cli/src/main.rs | 8 +++++++ 4 files changed, 46 insertions(+) diff --git a/crates/deepbiop-cli/Cargo.toml b/crates/deepbiop-cli/Cargo.toml index e64625d..244e48b 100644 --- a/crates/deepbiop-cli/Cargo.toml +++ b/crates/deepbiop-cli/Cargo.toml @@ -11,6 +11,7 @@ readme = "../../README.md" description = "CLI tool for Processing Biological 
Data." [dependencies] +noodles = { workspace = true } deepbiop-fq = { workspace = true } deepbiop-bam = { workspace = true } deepbiop-utils = { workspace = true } diff --git a/crates/deepbiop-cli/src/cli.rs b/crates/deepbiop-cli/src/cli.rs index 372e6e6..33c26e8 100644 --- a/crates/deepbiop-cli/src/cli.rs +++ b/crates/deepbiop-cli/src/cli.rs @@ -1,5 +1,7 @@ pub mod bam2fq; pub use bam2fq::*; +pub mod fq2fa; +pub use fq2fa::*; pub mod chimeric_count; pub use chimeric_count::*; diff --git a/crates/deepbiop-cli/src/cli/fq2fa.rs b/crates/deepbiop-cli/src/cli/fq2fa.rs index 8b13789..c5daee1 100644 --- a/crates/deepbiop-cli/src/cli/fq2fa.rs +++ b/crates/deepbiop-cli/src/cli/fq2fa.rs @@ -1 +1,36 @@ +use anyhow::Result; +use clap::Parser; +use deepbiop_fq as fq; +use noodles::fasta; +use std::path::PathBuf; + +use super::set_up_threads; + +#[derive(Debug, Parser)] +pub struct FqToFa { + /// path to the bam file + #[arg(value_name = "fq", action=clap::ArgAction::Append)] + fq: Vec, + + /// threads number + #[arg(short, long, default_value = "2")] + threads: Option, +} + +impl FqToFa { + pub fn run(&self) -> Result<()> { + set_up_threads(self.threads)?; + + for fq in &self.fq { + let fq_records = fq::io::fastq_to_fasta(fq)?; + let file_path = fq.with_extension("fa"); + let file = std::fs::File::create(&file_path)?; + let mut writer = fasta::io::Writer::new(file); + for record in fq_records { + writer.write_record(&record)?; + } + } + Ok(()) + } +} diff --git a/crates/deepbiop-cli/src/main.rs b/crates/deepbiop-cli/src/main.rs index 1b3ce76..f831322 100644 --- a/crates/deepbiop-cli/src/main.rs +++ b/crates/deepbiop-cli/src/main.rs @@ -33,6 +33,9 @@ pub enum Commands { /// BAM to fastq conversion. BamToFq(cli::BamToFq), + + /// Fastq to fasta conversion. 
+ FqToFa(cli::FqToFa), } impl Display for Commands { @@ -40,6 +43,7 @@ impl Display for Commands { match self { Commands::CountChimeric(_) => write!(f, "chimericcount"), Commands::BamToFq(_) => write!(f, "bam2fq"), + Commands::FqToFa(_) => write!(f, "fq2fa"), } } } @@ -84,6 +88,10 @@ fn main() -> Result<()> { bam2fq.run().unwrap(); } + Some(Commands::FqToFa(fq2fa)) => { + fq2fa.run().unwrap(); + } + None => { println!("No command provided!"); } From 761a4e48d0bfe9568144cf029ec3c4f5ea4fe9c0 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Wed, 21 Aug 2024 18:27:13 -0500 Subject: [PATCH 04/14] refactor: Improve code readability and performance --- crates/deepbiop-bam/src/io.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/crates/deepbiop-bam/src/io.rs b/crates/deepbiop-bam/src/io.rs index 5926ee0..13173b2 100644 --- a/crates/deepbiop-bam/src/io.rs +++ b/crates/deepbiop-bam/src/io.rs @@ -24,10 +24,16 @@ pub fn bam2fq(bam: &Path, threads: Option) -> Result> .par_bridge() .map(|result| { let record = result.unwrap(); + + let seq = record.sequence().as_ref().to_vec(); + let qual = record.quality_scores().as_ref().to_vec(); + + assert_eq!(seq.len(), qual.len()); + let fq_record = fastq::Record::new( fastq::record::Definition::new(record.name().unwrap().to_vec(), ""), - record.sequence().as_ref().to_vec(), - record.quality_scores().as_ref().to_vec(), + seq, + qual, ); fq_record }) From 5ef9333f942ee8a3201375dbf0254020ebf90e69 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Wed, 21 Aug 2024 18:32:34 -0500 Subject: [PATCH 05/14] feat: Add support for writing compressed fastq files --- crates/deepbiop-cli/src/cli/bam2fq.rs | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/crates/deepbiop-cli/src/cli/bam2fq.rs b/crates/deepbiop-cli/src/cli/bam2fq.rs index 4107f38..3827ee8 100644 --- a/crates/deepbiop-cli/src/cli/bam2fq.rs +++ b/crates/deepbiop-cli/src/cli/bam2fq.rs @@ -2,8 +2,9 @@ use anyhow::Result; use 
clap::Parser; use deepbiop_bam as bam; use deepbiop_fq as fq; +use noodles::fastq; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use super::set_up_threads; @@ -16,6 +17,19 @@ pub struct BamToFq { /// threads number #[arg(short, long, default_value = "2")] threads: Option, + + /// output compressed fastq file + #[arg(short, long, action=clap::ArgAction::SetTrue)] + compressed: bool, +} + +fn write_fq>(data: &[fastq::Record], path: P) -> Result<()> { + let file = std::fs::File::create(path.as_ref())?; + let mut writer = fastq::io::Writer::new(file); + for record in data { + writer.write_record(record)?; + } + Ok(()) } impl BamToFq { @@ -25,7 +39,12 @@ impl BamToFq { for bam in &self.bam { let fq_records = bam::io::bam2fq(bam, self.threads)?; let file_path = bam.with_extension("fq.bgz"); - fq::io::write_fq_parallel_for_noodle_record(&fq_records, file_path, self.threads)?; + + if self.compressed { + fq::io::write_fq_parallel_for_noodle_record(&fq_records, file_path, self.threads)?; + } else { + write_fq(&fq_records, file_path)?; + } } Ok(()) } From ea57e26583bc78ec4c0f7d94a7dbe65e5bd43759 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Wed, 21 Aug 2024 18:35:47 -0500 Subject: [PATCH 06/14] refactor: Refactor file_path assignment in bam2fq.rs --- crates/deepbiop-cli/src/cli/bam2fq.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/deepbiop-cli/src/cli/bam2fq.rs b/crates/deepbiop-cli/src/cli/bam2fq.rs index 3827ee8..9653d89 100644 --- a/crates/deepbiop-cli/src/cli/bam2fq.rs +++ b/crates/deepbiop-cli/src/cli/bam2fq.rs @@ -38,11 +38,12 @@ impl BamToFq { for bam in &self.bam { let fq_records = bam::io::bam2fq(bam, self.threads)?; - let file_path = bam.with_extension("fq.bgz"); if self.compressed { + let file_path = bam.with_extension("fq.bgz"); fq::io::write_fq_parallel_for_noodle_record(&fq_records, file_path, self.threads)?; } else { + let file_path = bam.with_extension("fq"); write_fq(&fq_records, file_path)?; } } From 
695a4782f6c24412c65659b479e3ce9082669826 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Wed, 21 Aug 2024 19:07:42 -0500 Subject: [PATCH 07/14] style: Improve function and field comments --- crates/deepbiop-bam/src/io.rs | 2 ++ crates/deepbiop-cli/src/cli/bam2fq.rs | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/deepbiop-bam/src/io.rs b/crates/deepbiop-bam/src/io.rs index 13173b2..260e104 100644 --- a/crates/deepbiop-bam/src/io.rs +++ b/crates/deepbiop-bam/src/io.rs @@ -5,6 +5,8 @@ use std::{fs::File, num::NonZeroUsize, path::Path, thread}; use noodles::fastq; +// FIXME: The function has a bug since seq != qual + pub fn bam2fq(bam: &Path, threads: Option) -> Result> { let file = File::open(bam)?; let worker_count = if let Some(threads) = threads { diff --git a/crates/deepbiop-cli/src/cli/bam2fq.rs b/crates/deepbiop-cli/src/cli/bam2fq.rs index 9653d89..99c11ac 100644 --- a/crates/deepbiop-cli/src/cli/bam2fq.rs +++ b/crates/deepbiop-cli/src/cli/bam2fq.rs @@ -18,7 +18,7 @@ pub struct BamToFq { #[arg(short, long, default_value = "2")] threads: Option, - /// output compressed fastq file + /// output bgzip compressed fastq file #[arg(short, long, action=clap::ArgAction::SetTrue)] compressed: bool, } From 859118c3e81d79d051c50c896f7b272bcd78ffa0 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Wed, 21 Aug 2024 22:56:12 -0500 Subject: [PATCH 08/14] feat(io): Refactor bam2fq function to handle equal seq and qual lengths --- crates/deepbiop-bam/src/io.rs | 14 +++++++++++--- crates/deepbiop-cli/src/cli/chimeric_count.rs | 13 +++++++------ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/crates/deepbiop-bam/src/io.rs b/crates/deepbiop-bam/src/io.rs index 260e104..e000ec1 100644 --- a/crates/deepbiop-bam/src/io.rs +++ b/crates/deepbiop-bam/src/io.rs @@ -21,7 +21,7 @@ pub fn bam2fq(bam: &Path, threads: Option) -> Result> let mut reader = bam::io::Reader::from(decoder); let _header = reader.read_header()?; - Ok(reader + reader .records() 
.par_bridge() .map(|result| { @@ -32,12 +32,20 @@ pub fn bam2fq(bam: &Path, threads: Option) -> Result> assert_eq!(seq.len(), qual.len()); + if seq.len() != qual.len() { + let name = String::from_utf8_lossy(record.name().unwrap().as_ref()).to_string(); + return Err(anyhow::anyhow!( + "{} seq and qual length are not equal", + name + )); + } + let fq_record = fastq::Record::new( fastq::record::Definition::new(record.name().unwrap().to_vec(), ""), seq, qual, ); - fq_record + Ok(fq_record) }) - .collect::>()) + .collect::>>() } diff --git a/crates/deepbiop-cli/src/cli/chimeric_count.rs b/crates/deepbiop-cli/src/cli/chimeric_count.rs index d7e1f5b..8037d5f 100644 --- a/crates/deepbiop-cli/src/cli/chimeric_count.rs +++ b/crates/deepbiop-cli/src/cli/chimeric_count.rs @@ -1,4 +1,3 @@ -use ahash::HashMap; use anyhow::Result; use clap::Parser; use deepbiop_bam as bam; @@ -18,11 +17,13 @@ pub struct CountChimeric { } impl CountChimeric { - pub fn run(&self) -> Result> { + pub fn run(&self) -> Result<()> { set_up_threads(self.threads)?; - Ok(bam::chimeric::count_chimeric_reads_for_paths( - &self.bam, - self.threads, - )) + let res = bam::chimeric::count_chimeric_reads_for_paths(&self.bam, self.threads); + for (path, count) in res { + log::info!("{}: {}", path.to_string_lossy(), count); + } + + Ok(()) } } From a8599082f2f1accec543bbdda4a79908dfa5881e Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Wed, 21 Aug 2024 23:07:09 -0500 Subject: [PATCH 09/14] style: Remove unnecessary empty line --- crates/deepbiop-bam/src/io.rs | 2 -- crates/deepbiop-cli/src/cli/chimeric_count.rs | 1 - 2 files changed, 3 deletions(-) diff --git a/crates/deepbiop-bam/src/io.rs b/crates/deepbiop-bam/src/io.rs index e000ec1..314726c 100644 --- a/crates/deepbiop-bam/src/io.rs +++ b/crates/deepbiop-bam/src/io.rs @@ -30,8 +30,6 @@ pub fn bam2fq(bam: &Path, threads: Option) -> Result> let seq = record.sequence().as_ref().to_vec(); let qual = record.quality_scores().as_ref().to_vec(); - assert_eq!(seq.len(), 
qual.len()); - if seq.len() != qual.len() { let name = String::from_utf8_lossy(record.name().unwrap().as_ref()).to_string(); return Err(anyhow::anyhow!( diff --git a/crates/deepbiop-cli/src/cli/chimeric_count.rs b/crates/deepbiop-cli/src/cli/chimeric_count.rs index 8037d5f..1c8ee9e 100644 --- a/crates/deepbiop-cli/src/cli/chimeric_count.rs +++ b/crates/deepbiop-cli/src/cli/chimeric_count.rs @@ -23,7 +23,6 @@ impl CountChimeric { for (path, count) in res { log::info!("{}: {}", path.to_string_lossy(), count); } - Ok(()) } } From 340fcf16c71426762895465c0953d11fdafdc45f Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Thu, 22 Aug 2024 12:30:21 -0500 Subject: [PATCH 10/14] fix: Update pyo3-stub-gen version to 0.6.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index d24c68a..7840c4a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ pyo3 = { version = "0.21.0", features = [ "extension-module", "anyhow", ] } -pyo3-stub-gen = "0.5.2" +pyo3-stub-gen = "0.6.0" thiserror = "1.0" anyhow = "1.0" walkdir = { version = "2.4" } From 23eb174ae32d9b3d2ad93caaabb11b21d2fb6726 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Thu, 22 Aug 2024 12:56:36 -0500 Subject: [PATCH 11/14] feat: Add Python bindings for TensorEncoder --- crates/deepbiop-fq/src/encode/tensor.rs | 4 +++- crates/deepbiop-fq/src/python.rs | 1 + py-deepbiop/deepbiop/fq.pyi | 12 ++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/crates/deepbiop-fq/src/encode/tensor.rs b/crates/deepbiop-fq/src/encode/tensor.rs index 2b420b2..7b1acee 100644 --- a/crates/deepbiop-fq/src/encode/tensor.rs +++ b/crates/deepbiop-fq/src/encode/tensor.rs @@ -22,9 +22,11 @@ use crate::{ use super::{triat::Encoder, FqEncoderOption}; use needletail::Sequence; +use pyo3_stub_gen::derive::*; use rayon::prelude::*; -#[pyclass] +#[gen_stub_pyclass] +#[pyclass(module = "deepbiop.fq")] #[derive(Debug, Builder, Default, Clone)] #[builder(build_fn(skip))] // 
Specify custom build function pub struct TensorEncoder { diff --git a/crates/deepbiop-fq/src/python.rs b/crates/deepbiop-fq/src/python.rs index bc4a753..6431eb1 100644 --- a/crates/deepbiop-fq/src/python.rs +++ b/crates/deepbiop-fq/src/python.rs @@ -19,6 +19,7 @@ use rayon::prelude::*; use pyo3_stub_gen::derive::*; +#[gen_stub_pymethods] #[pymethods] impl encode::TensorEncoder { #[new] diff --git a/py-deepbiop/deepbiop/fq.pyi b/py-deepbiop/deepbiop/fq.pyi index 5212d43..b345bfa 100644 --- a/py-deepbiop/deepbiop/fq.pyi +++ b/py-deepbiop/deepbiop/fq.pyi @@ -86,6 +86,18 @@ class RecordData: def set_seq(self, seq: str) -> None: ... def set_qual(self, qual: str) -> None: ... +class TensorEncoder: + tensor_max_width: int + tensor_max_seq_len: int + kmer2id_table: dict[list[int], int] + id2kmer_table: dict[int, list[int]] + def __new__( + cls, + option: FqEncoderOption, + tensor_max_width: int | None, + tensor_max_seq_len: int | None, + ): ... + def convert_multiple_fqs_to_one_fq( paths: typing.Sequence[str | os.PathLike | pathlib.Path], result_path: str | os.PathLike | pathlib.Path, From 41ef14c4b3af7a2b2704871062d729ad8e81e19d Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Fri, 23 Aug 2024 13:13:56 -0500 Subject: [PATCH 12/14] feat: Update keywords in Cargo.toml files --- Cargo.toml | 2 +- crates/deepbiop-bam/Cargo.toml | 2 +- crates/deepbiop-cli/Cargo.toml | 2 +- crates/deepbiop-fq/Cargo.toml | 2 +- crates/deepbiop-utils/Cargo.toml | 2 +- crates/deepbiop/Cargo.toml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7840c4a..c86f302 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -55,7 +55,7 @@ tempfile = "3.10" parquet = "52.0.0" arrow = "52.0" candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.6.0" } -colored = "2" +colored = "2.1" textwrap = "0.16" flate2 = { version = "1.0.30", features = [ "zlib-ng", diff --git a/crates/deepbiop-bam/Cargo.toml b/crates/deepbiop-bam/Cargo.toml index 
078bf1b..7297f4f 100644 --- a/crates/deepbiop-bam/Cargo.toml +++ b/crates/deepbiop-bam/Cargo.toml @@ -5,7 +5,7 @@ authors = { workspace = true } edition = { workspace = true } homepage = { workspace = true } repository = { workspace = true } -keywords = ["bam", "deep-learning"] +keywords = ["bam", "deep-learning", "science::bioinformatics", "science::ml"] license = { workspace = true } readme = "../../README.md" description = "Deep Learning Processing Library for Bam Format" diff --git a/crates/deepbiop-cli/Cargo.toml b/crates/deepbiop-cli/Cargo.toml index 244e48b..d2f2bb7 100644 --- a/crates/deepbiop-cli/Cargo.toml +++ b/crates/deepbiop-cli/Cargo.toml @@ -5,7 +5,7 @@ authors = { workspace = true } edition = { workspace = true } homepage = { workspace = true } repository = { workspace = true } -keywords = ["deep-learning", "bioinformatics", "biological-data", "cli"] +keywords = ["deep-learning", "science::bioinformatics", "biological-data", "command-line-utilities"] license = { workspace = true } readme = "../../README.md" description = "CLI tool for Processing Biological Data." 
diff --git a/crates/deepbiop-fq/Cargo.toml b/crates/deepbiop-fq/Cargo.toml index 1633124..15869ac 100644 --- a/crates/deepbiop-fq/Cargo.toml +++ b/crates/deepbiop-fq/Cargo.toml @@ -5,7 +5,7 @@ authors = { workspace = true } edition = { workspace = true } homepage = { workspace = true } repository = { workspace = true } -keywords = ["arrow", "parquet", "fastq", "deep-learning"] +keywords = ["parquet", "fastq", "deep-learning", "science::ml"] license = { workspace = true } readme = "../../README.md" description = "Deep Learning Preprocessing Library for Fastq Format" diff --git a/crates/deepbiop-utils/Cargo.toml b/crates/deepbiop-utils/Cargo.toml index de20bd5..bb7301c 100644 --- a/crates/deepbiop-utils/Cargo.toml +++ b/crates/deepbiop-utils/Cargo.toml @@ -5,7 +5,7 @@ authors = { workspace = true } edition = { workspace = true } homepage = { workspace = true } repository = { workspace = true } -keywords = ["deep-learning", "utils"] +keywords = ["deep-learning", "utils", "science::ml"] license = { workspace = true } readme = "../../README.md" description = "Deep Learning Preprocessing Library for Biological Data" diff --git a/crates/deepbiop/Cargo.toml b/crates/deepbiop/Cargo.toml index b7c1d26..41333c6 100644 --- a/crates/deepbiop/Cargo.toml +++ b/crates/deepbiop/Cargo.toml @@ -5,7 +5,7 @@ authors = { workspace = true } edition = { workspace = true } homepage = { workspace = true } repository = { workspace = true } -keywords = ["deep-learning", "bioinformatics", "biological-data"] +keywords = ["deep-learning", "science::bioinformatics", "biological-data", "science::ml"] license = { workspace = true } readme = "../../README.md" description = "Deep Learning Processing Library for Biological Data" From a76169d3edf29180515e4844c93f29c426304c21 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Sat, 24 Aug 2024 19:53:06 -0500 Subject: [PATCH 13/14] feat: Add FaToFq command for fasta to fastq conversion --- crates/deepbiop-cli/src/cli.rs | 2 ++ 
crates/deepbiop-cli/src/cli/fa2fq.rs | 48 ++++++++++++++++++++++++++++ crates/deepbiop-cli/src/cli/fq2fa.rs | 2 +- crates/deepbiop-cli/src/main.rs | 8 +++++ 4 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 crates/deepbiop-cli/src/cli/fa2fq.rs diff --git a/crates/deepbiop-cli/src/cli.rs b/crates/deepbiop-cli/src/cli.rs index 33c26e8..d11413d 100644 --- a/crates/deepbiop-cli/src/cli.rs +++ b/crates/deepbiop-cli/src/cli.rs @@ -2,6 +2,8 @@ pub mod bam2fq; pub use bam2fq::*; pub mod fq2fa; pub use fq2fa::*; +pub mod fa2fq; +pub use fa2fq::*; pub mod chimeric_count; pub use chimeric_count::*; diff --git a/crates/deepbiop-cli/src/cli/fa2fq.rs b/crates/deepbiop-cli/src/cli/fa2fq.rs new file mode 100644 index 0000000..5f8f0fe --- /dev/null +++ b/crates/deepbiop-cli/src/cli/fa2fq.rs @@ -0,0 +1,48 @@ +use anyhow::Result; +use clap::Parser; + +use noodles::fasta; +use noodles::fastq; +use std::path::PathBuf; + +use super::set_up_threads; + +#[derive(Debug, Parser)] +pub struct FaToFq { + /// path to the fa file + #[arg(value_name = "fa", action=clap::ArgAction::Append)] + fa: Vec, + + /// threads number + #[arg(short, long, default_value = "2")] + threads: Option, +} + +impl FaToFq { + pub fn run(&self) -> Result<()> { + set_up_threads(self.threads)?; + + for fa in &self.fa { + let fq_file_path = fa.with_extension("fa"); + + let mut reader = fasta::io::reader::Builder.build_from_path(fa)?; + + let fq_writer_handle = std::fs::File::create(fq_file_path)?; + let mut fq_writer = fastq::io::Writer::new(fq_writer_handle); + + for record in reader.records() { + let record = record?; + let name = record.name(); + let sequence = record.sequence().as_ref().to_vec(); + let quality = vec![b'@'; sequence.len()]; + let fq_record = fastq::Record::new( + fastq::record::Definition::new(name.to_vec(), ""), + sequence, + quality, + ); + fq_writer.write_record(&fq_record)?; + } + } + Ok(()) + } +} diff --git a/crates/deepbiop-cli/src/cli/fq2fa.rs 
b/crates/deepbiop-cli/src/cli/fq2fa.rs index c5daee1..896e8c2 100644 --- a/crates/deepbiop-cli/src/cli/fq2fa.rs +++ b/crates/deepbiop-cli/src/cli/fq2fa.rs @@ -9,7 +9,7 @@ use super::set_up_threads; #[derive(Debug, Parser)] pub struct FqToFa { - /// path to the bam file + /// path to the fq file #[arg(value_name = "fq", action=clap::ArgAction::Append)] fq: Vec, diff --git a/crates/deepbiop-cli/src/main.rs b/crates/deepbiop-cli/src/main.rs index f831322..bdab89a 100644 --- a/crates/deepbiop-cli/src/main.rs +++ b/crates/deepbiop-cli/src/main.rs @@ -36,6 +36,9 @@ pub enum Commands { /// Fastq to fasta conversion. FqToFa(cli::FqToFa), + + /// Fasta to fastq conversion. + FaToFq(cli::FaToFq), } impl Display for Commands { @@ -44,6 +47,7 @@ impl Display for Commands { Commands::CountChimeric(_) => write!(f, "chimericcount"), Commands::BamToFq(_) => write!(f, "bam2fq"), Commands::FqToFa(_) => write!(f, "fq2fa"), + Commands::FaToFq(_) => write!(f, "fa2fq"), } } } @@ -92,6 +96,10 @@ fn main() -> Result<()> { fq2fa.run().unwrap(); } + Some(Commands::FaToFq(fa2fq)) => { + fa2fq.run().unwrap(); + } + None => { println!("No command provided!"); } From 1cf0c031964a8066def276ca8850a5365c3c542f Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Sat, 24 Aug 2024 19:54:19 -0500 Subject: [PATCH 14/14] refactor: Improve file variable names in FqToFa struct --- crates/deepbiop-cli/src/cli/fq2fa.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/deepbiop-cli/src/cli/fq2fa.rs b/crates/deepbiop-cli/src/cli/fq2fa.rs index 896e8c2..8a14825 100644 --- a/crates/deepbiop-cli/src/cli/fq2fa.rs +++ b/crates/deepbiop-cli/src/cli/fq2fa.rs @@ -24,11 +24,11 @@ impl FqToFa { for fq in &self.fq { let fq_records = fq::io::fastq_to_fasta(fq)?; - let file_path = fq.with_extension("fa"); - let file = std::fs::File::create(&file_path)?; - let mut writer = fasta::io::Writer::new(file); + let fa_file_path = fq.with_extension("fa"); + let fa_file_handler = 
std::fs::File::create(&fa_file_path)?; + let mut fa_writer = fasta::io::Writer::new(fa_file_handler); for record in fq_records { - writer.write_record(&record)?; + fa_writer.write_record(&record)?; } } Ok(())