Skip to content

Commit

Permalink
Merge pull request #25 from cauliyang/dev
Browse files Browse the repository at this point in the history
feat: Add BAM to fastq conversion functionality
  • Loading branch information
cauliyang authored Aug 25, 2024
2 parents 4579506 + 1cf0c03 commit 4fc83f0
Show file tree
Hide file tree
Showing 17 changed files with 248 additions and 15 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ pyo3 = { version = "0.21.0", features = [
"extension-module",
"anyhow",
] }
pyo3-stub-gen = "0.5.2"
pyo3-stub-gen = "0.6.0"
thiserror = "1.0"
anyhow = "1.0"
walkdir = { version = "2.4" }
Expand Down Expand Up @@ -55,7 +55,7 @@ tempfile = "3.10"
parquet = "52.0.0"
arrow = "52.0"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.6.0" }
colored = "2"
colored = "2.1"
textwrap = "0.16"
flate2 = { version = "1.0.30", features = [
"zlib-ng",
Expand Down
3 changes: 2 additions & 1 deletion crates/deepbiop-bam/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ authors = { workspace = true }
edition = { workspace = true }
homepage = { workspace = true }
repository = { workspace = true }
keywords = ["bam", "deep-learning"]
keywords = ["bam", "deep-learning", "science::bioinformatics", "science::ml"]
license = { workspace = true }
readme = "../../README.md"
description = "Deep Learning Processing Library for Bam Format"
Expand All @@ -24,6 +24,7 @@ lexical = { workspace = true }

derive_builder = { workspace = true }
deepbiop-utils = { workspace = true }
deepbiop-fq = { workspace = true }

[dev-dependencies]
bio = { workspace = true }
Expand Down
49 changes: 49 additions & 0 deletions crates/deepbiop-bam/src/io.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
use anyhow::Result;
use noodles::{bam, bgzf};
use rayon::prelude::*;
use std::{fs::File, num::NonZeroUsize, path::Path, thread};

use noodles::fastq;

// FIXME: The function has a bug since seq != qual

pub fn bam2fq(bam: &Path, threads: Option<usize>) -> Result<Vec<fastq::Record>> {
let file = File::open(bam)?;
let worker_count = if let Some(threads) = threads {
NonZeroUsize::new(threads)
.unwrap()
.min(thread::available_parallelism().unwrap_or(NonZeroUsize::MIN))
} else {
thread::available_parallelism().unwrap_or(NonZeroUsize::MIN)
};

let decoder = bgzf::MultithreadedReader::with_worker_count(worker_count, file);
let mut reader = bam::io::Reader::from(decoder);
let _header = reader.read_header()?;

reader
.records()
.par_bridge()
.map(|result| {
let record = result.unwrap();

let seq = record.sequence().as_ref().to_vec();
let qual = record.quality_scores().as_ref().to_vec();

if seq.len() != qual.len() {
let name = String::from_utf8_lossy(record.name().unwrap().as_ref()).to_string();
return Err(anyhow::anyhow!(
"{} seq and qual length are not equal",
name
));
}

let fq_record = fastq::Record::new(
fastq::record::Definition::new(record.name().unwrap().to_vec(), ""),
seq,
qual,
);
Ok(fq_record)
})
.collect::<Result<Vec<fastq::Record>>>()
}
1 change: 1 addition & 0 deletions crates/deepbiop-bam/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
pub mod chimeric;
pub mod cigar;
pub mod io;

#[cfg(feature = "python")]
pub mod python;
3 changes: 2 additions & 1 deletion crates/deepbiop-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@ authors = { workspace = true }
edition = { workspace = true }
homepage = { workspace = true }
repository = { workspace = true }
keywords = ["deep-learning", "bioinformatics", "biological-data", "cli"]
keywords = ["deep-learning", "science::bioinformatics", "biological-data", "command-line-utilities"]
license = { workspace = true }
readme = "../../README.md"
description = "CLI tool for Processing Biological Data."

[dependencies]
noodles = { workspace = true }
deepbiop-fq = { workspace = true }
deepbiop-bam = { workspace = true }
deepbiop-utils = { workspace = true }
Expand Down
6 changes: 6 additions & 0 deletions crates/deepbiop-cli/src/cli.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
pub mod bam2fq;
pub use bam2fq::*;
pub mod fq2fa;
pub use fq2fa::*;
pub mod fa2fq;
pub use fa2fq::*;
pub mod chimeric_count;
pub use chimeric_count::*;

Expand Down
52 changes: 52 additions & 0 deletions crates/deepbiop-cli/src/cli/bam2fq.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
use anyhow::Result;
use clap::Parser;
use deepbiop_bam as bam;
use deepbiop_fq as fq;
use noodles::fastq;

use std::path::{Path, PathBuf};

use super::set_up_threads;

#[derive(Debug, Parser)]
pub struct BamToFq {
/// path to the bam file
#[arg(value_name = "bam", action=clap::ArgAction::Append)]
bam: Vec<PathBuf>,

/// threads number
#[arg(short, long, default_value = "2")]
threads: Option<usize>,

/// output bgzip compressed fastq file
#[arg(short, long, action=clap::ArgAction::SetTrue)]
compressed: bool,
}

fn write_fq<P: AsRef<Path>>(data: &[fastq::Record], path: P) -> Result<()> {
let file = std::fs::File::create(path.as_ref())?;
let mut writer = fastq::io::Writer::new(file);
for record in data {
writer.write_record(record)?;
}
Ok(())
}

impl BamToFq {
pub fn run(&self) -> Result<()> {
set_up_threads(self.threads)?;

for bam in &self.bam {
let fq_records = bam::io::bam2fq(bam, self.threads)?;

if self.compressed {
let file_path = bam.with_extension("fq.bgz");
fq::io::write_fq_parallel_for_noodle_record(&fq_records, file_path, self.threads)?;
} else {
let file_path = bam.with_extension("fq");
write_fq(&fq_records, file_path)?;
}
}
Ok(())
}
}
12 changes: 6 additions & 6 deletions crates/deepbiop-cli/src/cli/chimeric_count.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use ahash::HashMap;
use anyhow::Result;
use clap::Parser;
use deepbiop_bam as bam;
Expand All @@ -18,11 +17,12 @@ pub struct CountChimeric {
}

impl CountChimeric {
pub fn run(&self) -> Result<HashMap<PathBuf, usize>> {
pub fn run(&self) -> Result<()> {
set_up_threads(self.threads)?;
Ok(bam::chimeric::count_chimeric_reads_for_paths(
&self.bam,
self.threads,
))
let res = bam::chimeric::count_chimeric_reads_for_paths(&self.bam, self.threads);
for (path, count) in res {
log::info!("{}: {}", path.to_string_lossy(), count);
}
Ok(())
}
}
48 changes: 48 additions & 0 deletions crates/deepbiop-cli/src/cli/fa2fq.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
use anyhow::Result;
use clap::Parser;

use noodles::fasta;
use noodles::fastq;
use std::path::PathBuf;

use super::set_up_threads;

#[derive(Debug, Parser)]
pub struct FaToFq {
/// path to the fa file
#[arg(value_name = "fa", action=clap::ArgAction::Append)]
fa: Vec<PathBuf>,

/// threads number
#[arg(short, long, default_value = "2")]
threads: Option<usize>,
}

impl FaToFq {
pub fn run(&self) -> Result<()> {
set_up_threads(self.threads)?;

for fa in &self.fa {
let fq_file_path = fa.with_extension("fa");

let mut reader = fasta::io::reader::Builder.build_from_path(fa)?;

let fq_writer_handle = std::fs::File::create(fq_file_path)?;
let mut fq_writer = fastq::io::Writer::new(fq_writer_handle);

for record in reader.records() {
let record = record?;
let name = record.name();
let sequence = record.sequence().as_ref().to_vec();
let quality = vec![b'@'; sequence.len()];
let fq_record = fastq::Record::new(
fastq::record::Definition::new(name.to_vec(), ""),
sequence,
quality,
);
fq_writer.write_record(&fq_record)?;
}
}
Ok(())
}
}
36 changes: 36 additions & 0 deletions crates/deepbiop-cli/src/cli/fq2fa.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
use anyhow::Result;
use clap::Parser;
use deepbiop_fq as fq;

use noodles::fasta;
use std::path::PathBuf;

use super::set_up_threads;

#[derive(Debug, Parser)]
pub struct FqToFa {
/// path to the fq file
#[arg(value_name = "fq", action=clap::ArgAction::Append)]
fq: Vec<PathBuf>,

/// threads number
#[arg(short, long, default_value = "2")]
threads: Option<usize>,
}

impl FqToFa {
pub fn run(&self) -> Result<()> {
set_up_threads(self.threads)?;

for fq in &self.fq {
let fq_records = fq::io::fastq_to_fasta(fq)?;
let fa_file_path = fq.with_extension("fa");
let fa_file_handler = std::fs::File::create(&fa_file_path)?;
let mut fa_writer = fasta::io::Writer::new(fa_file_handler);
for record in fq_records {
fa_writer.write_record(&record)?;
}
}
Ok(())
}
}
26 changes: 25 additions & 1 deletion crates/deepbiop-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,24 @@ struct Cli {
pub enum Commands {
/// Count chimeric reads in a BAM file.
CountChimeric(cli::CountChimeric),

/// BAM to fastq conversion.
BamToFq(cli::BamToFq),

/// Fastq to fasta conversion.
FqToFa(cli::FqToFa),

/// Fastq to fasta conversion.
FaToFq(cli::FaToFq),
}

impl Display for Commands {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Commands::CountChimeric(_) => write!(f, "chimericcount"),
Commands::BamToFq(_) => write!(f, "bam2fq"),
Commands::FqToFa(_) => write!(f, "fq2fa"),
Commands::FaToFq(_) => write!(f, "fa2fq"),
}
}
}
Expand Down Expand Up @@ -73,7 +85,19 @@ fn main() -> Result<()> {

match &cli.command {
Some(Commands::CountChimeric(count_chimeric)) => {
println!("{:?}", count_chimeric.run());
count_chimeric.run().unwrap();
}

Some(Commands::BamToFq(bam2fq)) => {
bam2fq.run().unwrap();
}

Some(Commands::FqToFa(fq2fa)) => {
fq2fa.run().unwrap();
}

Some(Commands::FaToFq(fa2fq)) => {
fa2fq.run().unwrap();
}

None => {
Expand Down
2 changes: 1 addition & 1 deletion crates/deepbiop-fq/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ authors = { workspace = true }
edition = { workspace = true }
homepage = { workspace = true }
repository = { workspace = true }
keywords = ["arrow", "parquet", "fastq", "deep-learning"]
keywords = ["parquet", "fastq", "deep-learning", "science::ml"]
license = { workspace = true }
readme = "../../README.md"
description = "Deep Learning Preprocessing Library for Fastq Format"
Expand Down
4 changes: 3 additions & 1 deletion crates/deepbiop-fq/src/encode/tensor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@ use crate::{

use super::{triat::Encoder, FqEncoderOption};
use needletail::Sequence;
use pyo3_stub_gen::derive::*;
use rayon::prelude::*;

#[pyclass]
#[gen_stub_pyclass]
#[pyclass(module = "deepbiop.fq")]
#[derive(Debug, Builder, Default, Clone)]
#[builder(build_fn(skip))] // Specify custom build function
pub struct TensorEncoder {
Expand Down
1 change: 1 addition & 0 deletions crates/deepbiop-fq/src/python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use rayon::prelude::*;

use pyo3_stub_gen::derive::*;

#[gen_stub_pymethods]
#[pymethods]
impl encode::TensorEncoder {
#[new]
Expand Down
2 changes: 1 addition & 1 deletion crates/deepbiop-utils/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ authors = { workspace = true }
edition = { workspace = true }
homepage = { workspace = true }
repository = { workspace = true }
keywords = ["deep-learning", "utils"]
keywords = ["deep-learning", "utils", "science::ml"]
license = { workspace = true }
readme = "../../README.md"
description = "Deep Learning Preprocessing Library for Biological Data"
Expand Down
2 changes: 1 addition & 1 deletion crates/deepbiop/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ authors = { workspace = true }
edition = { workspace = true }
homepage = { workspace = true }
repository = { workspace = true }
keywords = ["deep-learning", "bioinformatics", "biological-data"]
keywords = ["deep-learning", "science::bioinformatics", "biological-data", "science::ml"]
license = { workspace = true }
readme = "../../README.md"
description = "Deep Learning Processing Library for Biological Data"
Expand Down
12 changes: 12 additions & 0 deletions py-deepbiop/deepbiop/fq.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,18 @@ class RecordData:
def set_seq(self, seq: str) -> None: ...
def set_qual(self, qual: str) -> None: ...

class TensorEncoder:
tensor_max_width: int
tensor_max_seq_len: int
kmer2id_table: dict[list[int], int]
id2kmer_table: dict[int, list[int]]
def __new__(
cls,
option: FqEncoderOption,
tensor_max_width: int | None,
tensor_max_seq_len: int | None,
): ...

def convert_multiple_fqs_to_one_fq(
paths: typing.Sequence[str | os.PathLike | pathlib.Path],
result_path: str | os.PathLike | pathlib.Path,
Expand Down

0 comments on commit 4fc83f0

Please sign in to comment.