Skip to content

Commit

Permalink
add fastq module
Browse files Browse the repository at this point in the history
  • Loading branch information
cauliyang committed Aug 2, 2024
1 parent f8c5892 commit 3c4d3ba
Show file tree
Hide file tree
Showing 22 changed files with 1,753 additions and 7 deletions.
81 changes: 77 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,79 @@
[package]
name = "DeepBioP"
[workspace]
resolver = "2"
members = ["crates/*"]

[workspace.package]
version = "0.1.0"
edition = "2021"
edition = "2024"
authors = ["Yangyang Li <[email protected]>"]
homepage = "https://github.com/cauliyang/DeepBioP"
repository = "https://github.com/cauliyang/DeepBioP"
license = "Apache-2.0"

[workspace.dependencies]
thiserror = "1.0"
anyhow = "1.0"
walkdir = { version = "2.4" }
rayon = { version = "1.8" }
pyo3 = { version = "0.21.0", features = [
"abi3-py310",
"extension-module",
"anyhow",
] }

ctrlc = "3.4"
log = "0.4"
pyo3-log = "0.10"
noodles = { version = "0.78.0", features = [
"bgzf",
"core",
"csi",
"fasta",
"fastq",
"sam",
"bam",
] }

bio = "2.0"
needletail = "0.5"

ahash = "0.8.11"
numpy = "0.21"
ndarray = { version = "0.15", features = ["serde", "rayon"] }
num-traits = { version = "0.2" }
serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
rand = "0.8"
rand_distr = "0.4"
bitvec = "1.0"
itertools = "0.13.0"
derive_builder = "0.20"
lexical = "6.1"
bstr = "1.9.1"
lazy_static = "1.4.0"
tempfile = "3.10"
parquet = "52.0.0"
arrow = "52.0"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.5.1" }
colored = "2"
textwrap = "0.16"
flate2 = { version = "1.0.30", features = [
"zlib-ng",
], default-features = false }


[profile.opt-dev]
inherits = "dev"
opt-level = 1

[profile.debug-release]
inherits = "release"
debug = true
incremental = true
codegen-units = 16
lto = "thin"

[dependencies]
[profile.release]
codegen-units = 1
lto = "fat"
17 changes: 17 additions & 0 deletions crates/deepbiop-fq/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
[package]
name = "deepbiop-fq"
version = { workspace = true }
authors = { workspace = true }
edition = { workspace = true }
homepage = { workspace = true }
repository = { workspace = true }
keywords = ["arrow", "parquet", "fastq"]
license = { workspace = true }
readme = "../../README.md"
description = "Deep Learning Preprocessing Library for Fastq Format"

[dependencies]
noodles = { workspace = true }


[[example]]
2 changes: 2 additions & 0 deletions crates/deepbiop-fq/src/default.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pub const QUAL_OFFSET: u8 = 33;
pub const BASES: &[u8] = b"ATCGN";
18 changes: 18 additions & 0 deletions crates/deepbiop-fq/src/encode.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
mod json;
mod option;
mod parquet;
mod record;
mod tensor;
mod triat;

pub use json::*;
pub use option::*;
pub use parquet::*;
pub use record::*;
pub use tensor::*;
pub use triat::*;

// @462:528|738735b7-2105-460e-9e56-da980ef816c2+4f605fb4-4107-4827-9aed-9448d02834a8
// CGTTGGTGGTGTTCAGTTGTGGCGGTTGCTGGTCAGTAACAGCCAAGATGCTGCGGAATCTGCTGGCTTACCGTCAGATTGGGCAGAGGACGATAAGCACTGCTTCCCGCAGGCATTTTAAAAATAAAGTTCCGGAGAAGCAAAACTGTTCCAGGAGGATGATGAAATTCCACTGTATCTAAAAGGGTAGGGTAGCTGATGCCCTCCTGTATAGAGCCACCATGATCTTACAGTTGGTGGAACAGCATATGCCATATATGAGCTGGCTGTGGCTTCATTTCCCAAGAAGCAGGAGTGACTTTCAGCTTTATCTCCAGCAATTGCTTGGTCAGTTTTTCATTCAGCTCTCTATGGACCAGTAATCTGATAAATAACCGAGCTCTTCTTTGGGGATCAATATTTATTGATTGTAGTAACTGCCACCAATAAAGCAGTCTTTACCATGAAAAAAAAAAAAAAAAATCCCCCTACCCCTCTCTCCCAACTTATCCATACACAACCTGCCCCTCCAACCTCTTTCTAAACCCTTGGCGCCTCGGAGGCGTTCAGCTGCTTCAAGATGAAGCTGAACATCTTCCTTCCCAGCCACTGGCTGCCAGAAACTCATTGAAGTGGACGATGAACGCAAACTTCTGCACTTTCTATGAGAAGCGTATGGCCACAGAAGTTGCTGCTGACGCTTTGGGTGAAGAATGGAAGGGTTATGTGGTCCGAATCAGTGGTGGGAACGACAAACAAGGTTTCCCCATGAAGCAGGGTGTCTTGACCCATGGCCGTGTCCGCCTGCTACTGAGTAAGGGGCATTCCTGTTACAGACCAAGGAGAACTGGAGAAAGAAAGAGAAAATCAGTTCGTGGTTGCATTGTGGATGCAATCTGAGCGTTCTCAACTTGGTTATTGTAAAAAGGAGAGAAGGATATTCCTGGACTGACTGATACTACAGTGCCTCGCCGCCTGGGCCCCAAAAGAGCTAGCAGAATCCGCAAACTTTTCAATCTCTCTAAAAAGAAGATGATGTCCGCCAGTATCGTTGTAAGAAAGCCCTAAAATAAAGAAGGTAAGAAACCTAGGACCAAAGCACCCAAGATTCAGCGTCTGTTACTCCACGTGTCCTGCAGCACAAACGGCGGCGTATTGCTCTGAAGAAGCAGCGTACCAAGAAAAATAAAAGAAGAGGCTGCAGAATATGCTAAACTTTTGGCCTAGAGAATGAAGGAGGCTAAGGAGAAGCGCCAGGAACAAATTGCGAAGAGACGCAGACTTTCCTCTCTGCGGGACTCTACTTCTAAGTCTGAATCCAGTCAGAAATAAGATTTTTTGAGTAACAAATAATAAGATCGGGACTCTGA
// +
// 3A88;IMSJ872377DIJRSRRQRSSRSSSSSSSSGECCKLDIDDGLQRSRRRROSSPRRNOOSSEBCDEJSQKHHJSSSSSSSMMMPSSSSSRJ97677;<SSSSSRRSSSSSSSSSSSSJJKSSSSSSHFFBFGLBCBC<OPLMOP?KSIII6435@ESSSSKSSSSPSSSSD?22275@DB;(((478GIIJKMSIFEFKFA2-)&&''=ALPQQSSRSS,,;>SSSSSSSSSSSSOKGKOSSSSSSSQFLHGISSSSIGGHSSSSFFB.AGA0<AKLM9SSPLLMKMKLJJ..-02::<=0,+-)&&-:?BEFHNLIBA>>E89SSSSSASSQPSOPLMHG7788SSSCB==BCKLMPPPQQKIINRSSSSSSSSSSSSSSSSSPSRPOPPGGCH,,CEH1*%289<DACCRGGHISSSSSSSSQRQRSSSSSMQSRRRFFPPPPPPPPPPPPPPPPP--'.,,/42$))(('')'0314319HFF2104/)*+&#""33%%%(%%$##"""""""&..05%$((*(*36FSSSSSSS8794555HJI0///?SSSSSSSSSSSSSFD110AHKHKKJMJNOPS@;@@@HMQSSLMSOFC>546:<JNSIIIIKJJKSSSSSROO+(((--,1BLJKKSSSSSSSSSSSSSPKJLPSSSSSSSSSMFGPS22116559IIIISQQSSSSSSSSSSSSSQPMB651.13SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSCC>.1ABR+(96;>SSRSQPPPSSSSSSRQMQNQRSSSSSSSSSSMSSSSSMKICDSRQPIF>>>?CL0GHLNMC=<;DBCCBBBCDABBSSSSSRRSSQRMNPPROHGBCFBBBAGGGL@OEC>53038=JSSSROJGKIIHGDDFLOOOSSIBCENLNENFFSSSSNMLF@JSSSPPRQR;:989FLPDBA00//7>PSSSSSSMIHHIRSSJGGIKKPPRRSED;:;?JO=::EKIGBD'&),QQSSSN>S/0KJJIMNORLH@6679FCF5556OOORNNLRQSSPSIII>4HF6A?=AHHSSPOKSLSN-,-?EFMSRSQ;;;DIIGEEIJOSLFE@?*)+-<CD?CFDA999AK;HIFGMQSSSSSSOKCBDSSSOLJ43115AJJGKKLOOMMNLOQSSSSI1,1CCOSS11:IIMSSSSSSSSSSSSOQRRSNJHSSSSSL/../<DEKMLLGPNOA?>>MOSSLKSSSBBCJRRRQSSSSS>76654;<BSONKIKK.--+0,,51+)+450045-,.5OSSHED777SSEEEJSSSSKKIGOOSSSSIIHJSSSSSSLIJOQSSJMSS;EFAA5**)+-2556BBJOM
170 changes: 170 additions & 0 deletions crates/deepbiop-fq/src/encode/json.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
use std::{
fmt::Display,
ops::Range,
path::{Path, PathBuf},
};

use anyhow::{anyhow, Context, Result};
use derive_builder::Builder;
use log::info;
use pyo3::prelude::*;
use rayon::prelude::*;
use serde_json::json;

use crate::{kmer::to_kmer_target_region, types::Element};

use super::{triat::Encoder, FqEncoderOption};

#[pyclass]
#[derive(Debug, Builder, Default, Clone)]
pub struct JsonEncoder {
pub option: FqEncoderOption,
}

impl JsonEncoder {
pub fn new(option: FqEncoderOption) -> Self {
Self { option }
}
}

impl Display for JsonEncoder {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "FqEncoder {{ option: {} }}", self.option)
}
}

impl Encoder for JsonEncoder {
type TargetOutput = Result<Vec<Element>>;
type EncodeOutput = Result<Vec<serde_json::Value>>;
type RecordOutput = Result<serde_json::Value>;

fn encode_target(&self, id: &[u8], kmer_seq_len: Option<usize>) -> Self::TargetOutput {
let target = Self::parse_target_from_id(id).context("Failed to parse target from ID")?;
let kmer_target = target
.par_iter()
.map(|range| to_kmer_target_region(range, self.option.kmer_size as usize, None))
.collect::<Result<Vec<Range<usize>>>>()?;

let encoded_target = if self.option.vectorized_target {
if kmer_seq_len.is_none() {
return Err(anyhow!(
"kmer_seq_len is None when encodeing target in vector way"
));
}
let mut encoded_target = vec![0; kmer_seq_len.unwrap()];
kmer_target
.iter()
.for_each(|x| (x.start..x.end).for_each(|i| encoded_target[i] = 1));
encoded_target
} else {
kmer_target
.into_par_iter()
.map(|x| [x.start as Element, x.end as Element])
.flatten()
.collect()
};

Ok(encoded_target)
}

fn encode_record(&self, id: &[u8], seq: &[u8], qual: &[u8]) -> Self::RecordOutput {
// println!("encoding record: {}", String::from_utf8_lossy(id));
// 1.encode the sequence
// 2.encode the quality

// normalize to make sure all the bases are consistently capitalized and
// that we remove the newlines since this is FASTA
// change unknwon base to 'N'
// encode the sequence
let encoded_seq = self.encoder_seq(seq, self.option.kmer_size, true)?;

let encoded_seq_str: Vec<String> = encoded_seq
.into_par_iter()
.map(|x| String::from_utf8_lossy(x).to_string())
.collect();

// encode the quality
let (encoded_qual, encoded_kmer_qual) =
self.encode_qual(qual, self.option.kmer_size, self.option.qual_offset);

let encoded_target = self.encode_target(id, Some(encoded_seq_str.len()))?;

Ok(json!({
"id": String::from_utf8_lossy(id),
"kmer_seq": encoded_seq_str,
"kmer_qual": encoded_kmer_qual,
"kmer_target": encoded_target,
"qual": encoded_qual,
}))
}

fn encode<P: AsRef<Path>>(&mut self, path: P) -> Self::EncodeOutput {
let records = self.fetch_records(path, self.option.kmer_size)?;
let data: Vec<serde_json::Value> = records
.into_par_iter()
.filter_map(|data| {
let id = data.id.as_ref();
let seq = data.seq.as_ref();
let qual = data.qual.as_ref();

match self.encode_record(id, seq, qual).context(format!(
"encode fq read id {} error",
String::from_utf8_lossy(id)
)) {
Ok(result) => Some(result),
Err(_e) => None,
}
})
.collect();

info!("encoded records: {}", data.len());
Ok(data)
}

fn encode_multiple(&mut self, paths: &[PathBuf], parallel: bool) -> Self::EncodeOutput {
let result = if parallel {
paths
.into_par_iter()
.map(|path| {
let mut encoder = self.clone();
encoder.encode(path)
})
.collect::<Result<Vec<_>>>()?
} else {
paths
.iter()
.map(|path| {
let mut encoder = self.clone();
encoder.encode(path)
})
.collect::<Result<Vec<_>>>()?
};

Ok(result.into_iter().flatten().collect())
}
}

#[cfg(test)]
mod tests {
use crate::{fq_encode::FqEncoderOptionBuilder, output::write_json};

use super::*;

#[test]
fn test_encode_fq_for_json_for_large_size_fq() {
let option = FqEncoderOptionBuilder::default()
.kmer_size(3)
.vectorized_target(true)
.build()
.unwrap();

let mut encoder = JsonEncoderBuilder::default()
.option(option)
.build()
.unwrap();

let result = encoder.encode("tests/data/one_record.fq").unwrap();
let temp_file = tempfile::NamedTempFile::new().unwrap();
write_json(temp_file, result).unwrap();
}
}
61 changes: 61 additions & 0 deletions crates/deepbiop-fq/src/encode/option.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
use crate::default::{BASES, KMER_SIZE, QUAL_OFFSET, VECTORIZED_TARGET};
use derive_builder::Builder;
use serde::{Deserialize, Serialize};
use std::fmt::{self, Display, Formatter};

use pyo3::prelude::*;

#[pyclass]
#[derive(Debug, Builder, Default, Clone, Serialize, Deserialize)]
pub struct FqEncoderOption {
#[pyo3(get, set)]
#[builder(default = "KMER_SIZE")]
pub kmer_size: u8,

#[pyo3(get, set)]
#[builder(default = "QUAL_OFFSET")]
pub qual_offset: u8,

#[pyo3(get, set)]
#[builder(default = "BASES.to_vec()")]
pub bases: Vec<u8>,

#[pyo3(get, set)]
#[builder(default = "VECTORIZED_TARGET")]
pub vectorized_target: bool,

#[pyo3(get, set)]
#[builder(default = "2")]
pub threads: usize,
}

#[pymethods]
impl FqEncoderOption {
#[new]
fn py_new(
kmer_size: u8,
qual_offset: u8,
bases: String,
vectorized_target: bool,
threads: Option<usize>,
) -> Self {
FqEncoderOptionBuilder::default()
.kmer_size(kmer_size)
.qual_offset(qual_offset)
.bases(bases.as_bytes().to_vec())
.vectorized_target(vectorized_target)
.threads(threads.unwrap_or(2))
.build()
.expect("Failed to build FqEncoderOption from Python arguments.")
}
}

impl Display for FqEncoderOption {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
write!(
f,
"FqEncoderOption {{ kmer_size: {}, qual_offset: {}, bases: {:?}, vectorized_target: {}}}",
self.kmer_size, self.qual_offset, self.bases, self.vectorized_target
)
}
}
Loading

0 comments on commit 3c4d3ba

Please sign in to comment.