feat: Add BAM to fastq conversion functionality #25

Merged Aug 25, 2024 · 14 commits
4 changes: 2 additions & 2 deletions Cargo.toml
@@ -17,7 +17,7 @@ pyo3 = { version = "0.21.0", features = [
"extension-module",
"anyhow",
] }
pyo3-stub-gen = "0.5.2"
pyo3-stub-gen = "0.6.0"
thiserror = "1.0"
anyhow = "1.0"
walkdir = { version = "2.4" }
@@ -55,7 +55,7 @@ tempfile = "3.10"
parquet = "52.0.0"
arrow = "52.0"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.6.0" }
colored = "2"
colored = "2.1"
textwrap = "0.16"
flate2 = { version = "1.0.30", features = [
"zlib-ng",
3 changes: 2 additions & 1 deletion crates/deepbiop-bam/Cargo.toml
@@ -5,7 +5,7 @@ authors = { workspace = true }
edition = { workspace = true }
homepage = { workspace = true }
repository = { workspace = true }
keywords = ["bam", "deep-learning"]
keywords = ["bam", "deep-learning", "science::bioinformatics", "science::ml"]
license = { workspace = true }
readme = "../../README.md"
description = "Deep Learning Processing Library for Bam Format"
@@ -24,6 +24,7 @@ lexical = { workspace = true }

derive_builder = { workspace = true }
deepbiop-utils = { workspace = true }
deepbiop-fq = { workspace = true }

[dev-dependencies]
bio = { workspace = true }
49 changes: 49 additions & 0 deletions crates/deepbiop-bam/src/io.rs
@@ -0,0 +1,49 @@
use anyhow::Result;
use noodles::{bam, bgzf};
use rayon::prelude::*;
use std::{fs::File, num::NonZeroUsize, path::Path, thread};

use noodles::fastq;

// FIXME: The function has a bug since seq != qual

pub fn bam2fq(bam: &Path, threads: Option<usize>) -> Result<Vec<fastq::Record>> {
let file = File::open(bam)?;
let worker_count = if let Some(threads) = threads {
NonZeroUsize::new(threads)
.unwrap()
.min(thread::available_parallelism().unwrap_or(NonZeroUsize::MIN))
} else {
thread::available_parallelism().unwrap_or(NonZeroUsize::MIN)
};

let decoder = bgzf::MultithreadedReader::with_worker_count(worker_count, file);
let mut reader = bam::io::Reader::from(decoder);
let _header = reader.read_header()?;

reader
.records()
.par_bridge()
.map(|result| {
let record = result.unwrap();

let seq = record.sequence().as_ref().to_vec();
let qual = record.quality_scores().as_ref().to_vec();

if seq.len() != qual.len() {
let name = String::from_utf8_lossy(record.name().unwrap().as_ref()).to_string();
return Err(anyhow::anyhow!(
"{} seq and qual length are not equal",
name
));
}

let fq_record = fastq::Record::new(
fastq::record::Definition::new(record.name().unwrap().to_vec(), ""),
seq,
qual,
);
Ok(fq_record)
})
.collect::<Result<Vec<fastq::Record>>>()
}
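
A minimal usage sketch of the new `bam2fq` helper (not part of the diff; `reads.bam` and `reads.fq` are placeholder paths):

```rust
use anyhow::Result;
use deepbiop_bam::io::bam2fq;
use noodles::fastq;
use std::path::Path;

fn main() -> Result<()> {
    // Decode the BAM with up to 4 bgzf worker threads and collect FASTQ records in memory.
    let records = bam2fq(Path::new("reads.bam"), Some(4))?;

    // Write them back out with the plain single-threaded noodles writer.
    let mut writer = fastq::io::Writer::new(std::fs::File::create("reads.fq")?);
    for record in &records {
        writer.write_record(record)?;
    }
    Ok(())
}
```

Note that `par_bridge()` yields records in a nondeterministic order, so the resulting FASTQ is not guaranteed to preserve the original BAM record order.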
1 change: 1 addition & 0 deletions crates/deepbiop-bam/src/lib.rs
@@ -2,6 +2,7 @@

pub mod chimeric;
pub mod cigar;
pub mod io;

#[cfg(feature = "python")]
pub mod python;
3 changes: 2 additions & 1 deletion crates/deepbiop-cli/Cargo.toml
@@ -5,12 +5,13 @@ authors = { workspace = true }
edition = { workspace = true }
homepage = { workspace = true }
repository = { workspace = true }
keywords = ["deep-learning", "bioinformatics", "biological-data", "cli"]
keywords = ["deep-learning", "science::bioinformatics", "biological-data", "command-line-utilities"]
license = { workspace = true }
readme = "../../README.md"
description = "CLI tool for Processing Biological Data."

[dependencies]
noodles = { workspace = true }
deepbiop-fq = { workspace = true }
deepbiop-bam = { workspace = true }
deepbiop-utils = { workspace = true }
6 changes: 6 additions & 0 deletions crates/deepbiop-cli/src/cli.rs
@@ -1,3 +1,9 @@
pub mod bam2fq;
pub use bam2fq::*;
pub mod fq2fa;
pub use fq2fa::*;
pub mod fa2fq;
pub use fa2fq::*;
pub mod chimeric_count;
pub use chimeric_count::*;

52 changes: 52 additions & 0 deletions crates/deepbiop-cli/src/cli/bam2fq.rs
@@ -0,0 +1,52 @@
use anyhow::Result;
use clap::Parser;
use deepbiop_bam as bam;
use deepbiop_fq as fq;
use noodles::fastq;

use std::path::{Path, PathBuf};

use super::set_up_threads;

#[derive(Debug, Parser)]
pub struct BamToFq {
/// path to the bam file
#[arg(value_name = "bam", action=clap::ArgAction::Append)]
bam: Vec<PathBuf>,

/// threads number
#[arg(short, long, default_value = "2")]
threads: Option<usize>,

/// output bgzip compressed fastq file
#[arg(short, long, action=clap::ArgAction::SetTrue)]
compressed: bool,
}

fn write_fq<P: AsRef<Path>>(data: &[fastq::Record], path: P) -> Result<()> {
let file = std::fs::File::create(path.as_ref())?;
let mut writer = fastq::io::Writer::new(file);
for record in data {
writer.write_record(record)?;
}
Ok(())
}

impl BamToFq {
pub fn run(&self) -> Result<()> {
set_up_threads(self.threads)?;

for bam in &self.bam {
let fq_records = bam::io::bam2fq(bam, self.threads)?;

if self.compressed {
let file_path = bam.with_extension("fq.bgz");
fq::io::write_fq_parallel_for_noodle_record(&fq_records, file_path, self.threads)?;
} else {
let file_path = bam.with_extension("fq");
write_fq(&fq_records, file_path)?;
}
}
Ok(())
}
}
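
For reference, a small standard-library-only sketch of the output-naming convention used by `run()` above: `with_extension` swaps the final `.bam` suffix, so the FASTQ lands next to the input file.

```rust
use std::path::PathBuf;

fn main() {
    let bam = PathBuf::from("sample.bam");
    // Uncompressed output: sample.bam -> sample.fq
    assert_eq!(bam.with_extension("fq"), PathBuf::from("sample.fq"));
    // Bgzip-compressed output: sample.bam -> sample.fq.bgz
    assert_eq!(bam.with_extension("fq.bgz"), PathBuf::from("sample.fq.bgz"));
}
```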
12 changes: 6 additions & 6 deletions crates/deepbiop-cli/src/cli/chimeric_count.rs
@@ -1,4 +1,3 @@
-use ahash::HashMap;
use anyhow::Result;
use clap::Parser;
use deepbiop_bam as bam;
@@ -18,11 +17,12 @@ pub struct CountChimeric {
}

impl CountChimeric {
-    pub fn run(&self) -> Result<HashMap<PathBuf, usize>> {
+    pub fn run(&self) -> Result<()> {
set_up_threads(self.threads)?;
-        Ok(bam::chimeric::count_chimeric_reads_for_paths(
-            &self.bam,
-            self.threads,
-        ))
+        let res = bam::chimeric::count_chimeric_reads_for_paths(&self.bam, self.threads);
+        for (path, count) in res {
+            log::info!("{}: {}", path.to_string_lossy(), count);
+        }
+        Ok(())
}
}
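
With this change the per-file counts are reported through the `log` facade instead of being returned to `main`. A minimal sketch, assuming an `env_logger`-style logger is initialised somewhere in the binary (not shown in this diff), of how that output surfaces:

```rust
fn main() {
    // Without an installed logger, the `log::info!` calls above are silently dropped.
    env_logger::Builder::from_default_env()
        .filter_level(log::LevelFilter::Info)
        .init();
    log::info!("sample.bam: 42"); // same "<path>: <count>" shape as CountChimeric::run
}
```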
48 changes: 48 additions & 0 deletions crates/deepbiop-cli/src/cli/fa2fq.rs
@@ -0,0 +1,48 @@
use anyhow::Result;
use clap::Parser;

use noodles::fasta;
use noodles::fastq;
use std::path::PathBuf;

use super::set_up_threads;

#[derive(Debug, Parser)]
pub struct FaToFq {
/// path to the fa file
#[arg(value_name = "fa", action=clap::ArgAction::Append)]
fa: Vec<PathBuf>,

/// threads number
#[arg(short, long, default_value = "2")]
threads: Option<usize>,
}

impl FaToFq {
pub fn run(&self) -> Result<()> {
set_up_threads(self.threads)?;

for fa in &self.fa {
            let fq_file_path = fa.with_extension("fq");

let mut reader = fasta::io::reader::Builder.build_from_path(fa)?;

let fq_writer_handle = std::fs::File::create(fq_file_path)?;
let mut fq_writer = fastq::io::Writer::new(fq_writer_handle);

for record in reader.records() {
let record = record?;
let name = record.name();
let sequence = record.sequence().as_ref().to_vec();
let quality = vec![b'@'; sequence.len()];
let fq_record = fastq::Record::new(
fastq::record::Definition::new(name.to_vec(), ""),
sequence,
quality,
);
fq_writer.write_record(&fq_record)?;
}
}
Ok(())
}
}
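
One note on the conversion above: FASTA records carry no per-base qualities, so a constant placeholder is written for every base. Under the standard Phred+33 FASTQ encoding, the `b'@'` filler corresponds to a quality score of 31; a trivial check:

```rust
fn main() {
    let placeholder = b'@'; // ASCII 64, used above for every base
    assert_eq!(placeholder - 33, 31); // Phred+33 offset -> quality score 31
}
```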
36 changes: 36 additions & 0 deletions crates/deepbiop-cli/src/cli/fq2fa.rs
@@ -0,0 +1,36 @@
use anyhow::Result;
use clap::Parser;
use deepbiop_fq as fq;

use noodles::fasta;
use std::path::PathBuf;

use super::set_up_threads;

#[derive(Debug, Parser)]
pub struct FqToFa {
/// path to the fq file
#[arg(value_name = "fq", action=clap::ArgAction::Append)]
fq: Vec<PathBuf>,

/// threads number
#[arg(short, long, default_value = "2")]
threads: Option<usize>,
}

impl FqToFa {
pub fn run(&self) -> Result<()> {
set_up_threads(self.threads)?;

for fq in &self.fq {
let fq_records = fq::io::fastq_to_fasta(fq)?;
let fa_file_path = fq.with_extension("fa");
let fa_file_handler = std::fs::File::create(&fa_file_path)?;
let mut fa_writer = fasta::io::Writer::new(fa_file_handler);
for record in fq_records {
fa_writer.write_record(&record)?;
}
}
Ok(())
}
}
26 changes: 25 additions & 1 deletion crates/deepbiop-cli/src/main.rs
@@ -30,12 +30,24 @@ struct Cli {
pub enum Commands {
/// Count chimeric reads in a BAM file.
CountChimeric(cli::CountChimeric),

/// BAM to fastq conversion.
BamToFq(cli::BamToFq),

/// Fastq to fasta conversion.
FqToFa(cli::FqToFa),

    /// Fasta to fastq conversion.
FaToFq(cli::FaToFq),
}

impl Display for Commands {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Commands::CountChimeric(_) => write!(f, "chimericcount"),
Commands::BamToFq(_) => write!(f, "bam2fq"),
Commands::FqToFa(_) => write!(f, "fq2fa"),
Commands::FaToFq(_) => write!(f, "fa2fq"),
}
}
}
@@ -73,7 +85,19 @@ fn main() -> Result<()> {

match &cli.command {
Some(Commands::CountChimeric(count_chimeric)) => {
-            println!("{:?}", count_chimeric.run());
+            count_chimeric.run().unwrap();
}

Some(Commands::BamToFq(bam2fq)) => {
bam2fq.run().unwrap();
}

Some(Commands::FqToFa(fq2fa)) => {
fq2fa.run().unwrap();
}

Some(Commands::FaToFq(fa2fq)) => {
fa2fq.run().unwrap();
}

None => {
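
Since `main` already returns `anyhow::Result<()>`, an alternative to the `unwrap()` calls above would be to propagate subcommand errors with `?`. A self-contained sketch with a hypothetical stand-in command (not the real CLI types):

```rust
use anyhow::Result;

// Hypothetical stand-in for a subcommand such as BamToFq.
struct FakeCommand;

impl FakeCommand {
    fn run(&self) -> Result<()> {
        Ok(())
    }
}

fn main() -> Result<()> {
    let cmd = FakeCommand;
    cmd.run()?; // the error propagates to the caller instead of panicking
    Ok(())
}
```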
2 changes: 1 addition & 1 deletion crates/deepbiop-fq/Cargo.toml
@@ -5,7 +5,7 @@ authors = { workspace = true }
edition = { workspace = true }
homepage = { workspace = true }
repository = { workspace = true }
keywords = ["arrow", "parquet", "fastq", "deep-learning"]
keywords = ["parquet", "fastq", "deep-learning", "science::ml"]
license = { workspace = true }
readme = "../../README.md"
description = "Deep Learning Preprocessing Library for Fastq Format"
4 changes: 3 additions & 1 deletion crates/deepbiop-fq/src/encode/tensor.rs
@@ -22,9 +22,11 @@ use crate::{

use super::{triat::Encoder, FqEncoderOption};
use needletail::Sequence;
use pyo3_stub_gen::derive::*;
use rayon::prelude::*;

-#[pyclass]
+#[gen_stub_pyclass]
+#[pyclass(module = "deepbiop.fq")]
#[derive(Debug, Builder, Default, Clone)]
#[builder(build_fn(skip))] // Specify custom build function
pub struct TensorEncoder {
1 change: 1 addition & 0 deletions crates/deepbiop-fq/src/python.rs
@@ -19,6 +19,7 @@ use rayon::prelude::*;

use pyo3_stub_gen::derive::*;

#[gen_stub_pymethods]
#[pymethods]
impl encode::TensorEncoder {
#[new]
2 changes: 1 addition & 1 deletion crates/deepbiop-utils/Cargo.toml
@@ -5,7 +5,7 @@ authors = { workspace = true }
edition = { workspace = true }
homepage = { workspace = true }
repository = { workspace = true }
keywords = ["deep-learning", "utils"]
keywords = ["deep-learning", "utils", "science::ml"]
license = { workspace = true }
readme = "../../README.md"
description = "Deep Learning Preprocessing Library for Biological Data"
2 changes: 1 addition & 1 deletion crates/deepbiop/Cargo.toml
@@ -5,7 +5,7 @@ authors = { workspace = true }
edition = { workspace = true }
homepage = { workspace = true }
repository = { workspace = true }
keywords = ["deep-learning", "bioinformatics", "biological-data"]
keywords = ["deep-learning", "science::bioinformatics", "biological-data", "science::ml"]
license = { workspace = true }
readme = "../../README.md"
description = "Deep Learning Processing Library for Biological Data"
12 changes: 12 additions & 0 deletions py-deepbiop/deepbiop/fq.pyi
@@ -86,6 +86,18 @@ class RecordData:
def set_seq(self, seq: str) -> None: ...
def set_qual(self, qual: str) -> None: ...

class TensorEncoder:
tensor_max_width: int
tensor_max_seq_len: int
kmer2id_table: dict[list[int], int]
id2kmer_table: dict[int, list[int]]
def __new__(
cls,
option: FqEncoderOption,
tensor_max_width: int | None,
tensor_max_seq_len: int | None,
): ...

def convert_multiple_fqs_to_one_fq(
paths: typing.Sequence[str | os.PathLike | pathlib.Path],
result_path: str | os.PathLike | pathlib.Path,