-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
22 changed files
with
1,753 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,79 @@ | ||
[package] | ||
name = "DeepBioP" | ||
[workspace] | ||
resolver = "2" | ||
members = ["crates/*"] | ||
|
||
[workspace.package] | ||
version = "0.1.0" | ||
edition = "2021" | ||
edition = "2024" | ||
authors = ["Yangyang Li <[email protected]>"] | ||
homepage = "https://github.com/cauliyang/DeepBioP" | ||
repository = "https://github.com/cauliyang/DeepBioP" | ||
license = "Apache-2.0" | ||
|
||
[workspace.dependencies] | ||
thiserror = "1.0" | ||
anyhow = "1.0" | ||
walkdir = { version = "2.4" } | ||
rayon = { version = "1.8" } | ||
pyo3 = { version = "0.21.0", features = [ | ||
"abi3-py310", | ||
"extension-module", | ||
"anyhow", | ||
] } | ||
|
||
ctrlc = "3.4" | ||
log = "0.4" | ||
pyo3-log = "0.10" | ||
noodles = { version = "0.78.0", features = [ | ||
"bgzf", | ||
"core", | ||
"csi", | ||
"fasta", | ||
"fastq", | ||
"sam", | ||
"bam", | ||
] } | ||
|
||
bio = "2.0" | ||
needletail = "0.5" | ||
|
||
ahash = "0.8.11" | ||
numpy = "0.21" | ||
ndarray = { version = "0.15", features = ["serde", "rayon"] } | ||
num-traits = { version = "0.2" } | ||
serde = "1.0" | ||
serde_derive = "1.0" | ||
serde_json = "1.0" | ||
rand = "0.8" | ||
rand_distr = "0.4" | ||
bitvec = "1.0" | ||
itertools = "0.13.0" | ||
derive_builder = "0.20" | ||
lexical = "6.1" | ||
bstr = "1.9.1" | ||
lazy_static = "1.4.0" | ||
tempfile = "3.10" | ||
parquet = "52.0.0" | ||
arrow = "52.0" | ||
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.5.1" } | ||
colored = "2" | ||
textwrap = "0.16" | ||
flate2 = { version = "1.0.30", features = [ | ||
"zlib-ng", | ||
], default-features = false } | ||
|
||
|
||
[profile.opt-dev] | ||
inherits = "dev" | ||
opt-level = 1 | ||
|
||
[profile.debug-release] | ||
inherits = "release" | ||
debug = true | ||
incremental = true | ||
codegen-units = 16 | ||
lto = "thin" | ||
|
||
[dependencies] | ||
[profile.release] | ||
codegen-units = 1 | ||
lto = "fat" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
[package] | ||
name = "deepbiop-fq" | ||
version = { workspace = true } | ||
authors = { workspace = true } | ||
edition = { workspace = true } | ||
homepage = { workspace = true } | ||
repository = { workspace = true } | ||
keywords = ["arrow", "parquet", "fastq"] | ||
license = { workspace = true } | ||
readme = "../../README.md" | ||
description = "Deep Learning Preprocessing Library for Fastq Format" | ||
|
||
[dependencies] | ||
noodles = { workspace = true } | ||
|
||
|
||
[[example]] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
pub const QUAL_OFFSET: u8 = 33; | ||
pub const BASES: &[u8] = b"ATCGN"; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
mod json; | ||
mod option; | ||
mod parquet; | ||
mod record; | ||
mod tensor; | ||
mod triat; | ||
|
||
pub use json::*; | ||
pub use option::*; | ||
pub use parquet::*; | ||
pub use record::*; | ||
pub use tensor::*; | ||
pub use triat::*; | ||
|
||
// @462:528|738735b7-2105-460e-9e56-da980ef816c2+4f605fb4-4107-4827-9aed-9448d02834a8 | ||
// CGTTGGTGGTGTTCAGTTGTGGCGGTTGCTGGTCAGTAACAGCCAAGATGCTGCGGAATCTGCTGGCTTACCGTCAGATTGGGCAGAGGACGATAAGCACTGCTTCCCGCAGGCATTTTAAAAATAAAGTTCCGGAGAAGCAAAACTGTTCCAGGAGGATGATGAAATTCCACTGTATCTAAAAGGGTAGGGTAGCTGATGCCCTCCTGTATAGAGCCACCATGATCTTACAGTTGGTGGAACAGCATATGCCATATATGAGCTGGCTGTGGCTTCATTTCCCAAGAAGCAGGAGTGACTTTCAGCTTTATCTCCAGCAATTGCTTGGTCAGTTTTTCATTCAGCTCTCTATGGACCAGTAATCTGATAAATAACCGAGCTCTTCTTTGGGGATCAATATTTATTGATTGTAGTAACTGCCACCAATAAAGCAGTCTTTACCATGAAAAAAAAAAAAAAAAATCCCCCTACCCCTCTCTCCCAACTTATCCATACACAACCTGCCCCTCCAACCTCTTTCTAAACCCTTGGCGCCTCGGAGGCGTTCAGCTGCTTCAAGATGAAGCTGAACATCTTCCTTCCCAGCCACTGGCTGCCAGAAACTCATTGAAGTGGACGATGAACGCAAACTTCTGCACTTTCTATGAGAAGCGTATGGCCACAGAAGTTGCTGCTGACGCTTTGGGTGAAGAATGGAAGGGTTATGTGGTCCGAATCAGTGGTGGGAACGACAAACAAGGTTTCCCCATGAAGCAGGGTGTCTTGACCCATGGCCGTGTCCGCCTGCTACTGAGTAAGGGGCATTCCTGTTACAGACCAAGGAGAACTGGAGAAAGAAAGAGAAAATCAGTTCGTGGTTGCATTGTGGATGCAATCTGAGCGTTCTCAACTTGGTTATTGTAAAAAGGAGAGAAGGATATTCCTGGACTGACTGATACTACAGTGCCTCGCCGCCTGGGCCCCAAAAGAGCTAGCAGAATCCGCAAACTTTTCAATCTCTCTAAAAAGAAGATGATGTCCGCCAGTATCGTTGTAAGAAAGCCCTAAAATAAAGAAGGTAAGAAACCTAGGACCAAAGCACCCAAGATTCAGCGTCTGTTACTCCACGTGTCCTGCAGCACAAACGGCGGCGTATTGCTCTGAAGAAGCAGCGTACCAAGAAAAATAAAAGAAGAGGCTGCAGAATATGCTAAACTTTTGGCCTAGAGAATGAAGGAGGCTAAGGAGAAGCGCCAGGAACAAATTGCGAAGAGACGCAGACTTTCCTCTCTGCGGGACTCTACTTCTAAGTCTGAATCCAGTCAGAAATAAGATTTTTTGAGTAACAAATAATAAGATCGGGACTCTGA | ||
// + | ||
// 3A88;IMSJ872377DIJRSRRQRSSRSSSSSSSSGECCKLDIDDGLQRSRRRROSSPRRNOOSSEBCDEJSQKHHJSSSSSSSMMMPSSSSSRJ97677;<SSSSSRRSSSSSSSSSSSSJJKSSSSSSHFFBFGLBCBC<OPLMOP?KSIII6435@ESSSSKSSSSPSSSSD?22275@DB;(((478GIIJKMSIFEFKFA2-)&&''=ALPQQSSRSS,,;>SSSSSSSSSSSSOKGKOSSSSSSSQFLHGISSSSIGGHSSSSFFB.AGA0<AKLM9SSPLLMKMKLJJ..-02::<=0,+-)&&-:?BEFHNLIBA>>E89SSSSSASSQPSOPLMHG7788SSSCB==BCKLMPPPQQKIINRSSSSSSSSSSSSSSSSSPSRPOPPGGCH,,CEH1*%289<DACCRGGHISSSSSSSSQRQRSSSSSMQSRRRFFPPPPPPPPPPPPPPPPP--'.,,/42$))(('')'0314319HFF2104/)*+&#""33%%%(%%$##"""""""&..05%$((*(*36FSSSSSSS8794555HJI0///?SSSSSSSSSSSSSFD110AHKHKKJMJNOPS@;@@@HMQSSLMSOFC>546:<JNSIIIIKJJKSSSSSROO+(((--,1BLJKKSSSSSSSSSSSSSPKJLPSSSSSSSSSMFGPS22116559IIIISQQSSSSSSSSSSSSSQPMB651.13SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSCC>.1ABR+(96;>SSRSQPPPSSSSSSRQMQNQRSSSSSSSSSSMSSSSSMKICDSRQPIF>>>?CL0GHLNMC=<;DBCCBBBCDABBSSSSSRRSSQRMNPPROHGBCFBBBAGGGL@OEC>53038=JSSSROJGKIIHGDDFLOOOSSIBCENLNENFFSSSSNMLF@JSSSPPRQR;:989FLPDBA00//7>PSSSSSSMIHHIRSSJGGIKKPPRRSED;:;?JO=::EKIGBD'&),QQSSSN>S/0KJJIMNORLH@6679FCF5556OOORNNLRQSSPSIII>4HF6A?=AHHSSPOKSLSN-,-?EFMSRSQ;;;DIIGEEIJOSLFE@?*)+-<CD?CFDA999AK;HIFGMQSSSSSSOKCBDSSSOLJ43115AJJGKKLOOMMNLOQSSSSI1,1CCOSS11:IIMSSSSSSSSSSSSOQRRSNJHSSSSSL/../<DEKMLLGPNOA?>>MOSSLKSSSBBCJRRRQSSSSS>76654;<BSONKIKK.--+0,,51+)+450045-,.5OSSHED777SSEEEJSSSSKKIGOOSSSSIIHJSSSSSSLIJOQSSJMSS;EFAA5**)+-2556BBJOM |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
use std::{ | ||
fmt::Display, | ||
ops::Range, | ||
path::{Path, PathBuf}, | ||
}; | ||
|
||
use anyhow::{anyhow, Context, Result}; | ||
use derive_builder::Builder; | ||
use log::info; | ||
use pyo3::prelude::*; | ||
use rayon::prelude::*; | ||
use serde_json::json; | ||
|
||
use crate::{kmer::to_kmer_target_region, types::Element}; | ||
|
||
use super::{triat::Encoder, FqEncoderOption}; | ||
|
||
#[pyclass] | ||
#[derive(Debug, Builder, Default, Clone)] | ||
pub struct JsonEncoder { | ||
pub option: FqEncoderOption, | ||
} | ||
|
||
impl JsonEncoder { | ||
pub fn new(option: FqEncoderOption) -> Self { | ||
Self { option } | ||
} | ||
} | ||
|
||
impl Display for JsonEncoder { | ||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||
write!(f, "FqEncoder {{ option: {} }}", self.option) | ||
} | ||
} | ||
|
||
impl Encoder for JsonEncoder { | ||
type TargetOutput = Result<Vec<Element>>; | ||
type EncodeOutput = Result<Vec<serde_json::Value>>; | ||
type RecordOutput = Result<serde_json::Value>; | ||
|
||
fn encode_target(&self, id: &[u8], kmer_seq_len: Option<usize>) -> Self::TargetOutput { | ||
let target = Self::parse_target_from_id(id).context("Failed to parse target from ID")?; | ||
let kmer_target = target | ||
.par_iter() | ||
.map(|range| to_kmer_target_region(range, self.option.kmer_size as usize, None)) | ||
.collect::<Result<Vec<Range<usize>>>>()?; | ||
|
||
let encoded_target = if self.option.vectorized_target { | ||
if kmer_seq_len.is_none() { | ||
return Err(anyhow!( | ||
"kmer_seq_len is None when encodeing target in vector way" | ||
)); | ||
} | ||
let mut encoded_target = vec![0; kmer_seq_len.unwrap()]; | ||
kmer_target | ||
.iter() | ||
.for_each(|x| (x.start..x.end).for_each(|i| encoded_target[i] = 1)); | ||
encoded_target | ||
} else { | ||
kmer_target | ||
.into_par_iter() | ||
.map(|x| [x.start as Element, x.end as Element]) | ||
.flatten() | ||
.collect() | ||
}; | ||
|
||
Ok(encoded_target) | ||
} | ||
|
||
fn encode_record(&self, id: &[u8], seq: &[u8], qual: &[u8]) -> Self::RecordOutput { | ||
// println!("encoding record: {}", String::from_utf8_lossy(id)); | ||
// 1.encode the sequence | ||
// 2.encode the quality | ||
|
||
// normalize to make sure all the bases are consistently capitalized and | ||
// that we remove the newlines since this is FASTA | ||
// change unknwon base to 'N' | ||
// encode the sequence | ||
let encoded_seq = self.encoder_seq(seq, self.option.kmer_size, true)?; | ||
|
||
let encoded_seq_str: Vec<String> = encoded_seq | ||
.into_par_iter() | ||
.map(|x| String::from_utf8_lossy(x).to_string()) | ||
.collect(); | ||
|
||
// encode the quality | ||
let (encoded_qual, encoded_kmer_qual) = | ||
self.encode_qual(qual, self.option.kmer_size, self.option.qual_offset); | ||
|
||
let encoded_target = self.encode_target(id, Some(encoded_seq_str.len()))?; | ||
|
||
Ok(json!({ | ||
"id": String::from_utf8_lossy(id), | ||
"kmer_seq": encoded_seq_str, | ||
"kmer_qual": encoded_kmer_qual, | ||
"kmer_target": encoded_target, | ||
"qual": encoded_qual, | ||
})) | ||
} | ||
|
||
fn encode<P: AsRef<Path>>(&mut self, path: P) -> Self::EncodeOutput { | ||
let records = self.fetch_records(path, self.option.kmer_size)?; | ||
let data: Vec<serde_json::Value> = records | ||
.into_par_iter() | ||
.filter_map(|data| { | ||
let id = data.id.as_ref(); | ||
let seq = data.seq.as_ref(); | ||
let qual = data.qual.as_ref(); | ||
|
||
match self.encode_record(id, seq, qual).context(format!( | ||
"encode fq read id {} error", | ||
String::from_utf8_lossy(id) | ||
)) { | ||
Ok(result) => Some(result), | ||
Err(_e) => None, | ||
} | ||
}) | ||
.collect(); | ||
|
||
info!("encoded records: {}", data.len()); | ||
Ok(data) | ||
} | ||
|
||
fn encode_multiple(&mut self, paths: &[PathBuf], parallel: bool) -> Self::EncodeOutput { | ||
let result = if parallel { | ||
paths | ||
.into_par_iter() | ||
.map(|path| { | ||
let mut encoder = self.clone(); | ||
encoder.encode(path) | ||
}) | ||
.collect::<Result<Vec<_>>>()? | ||
} else { | ||
paths | ||
.iter() | ||
.map(|path| { | ||
let mut encoder = self.clone(); | ||
encoder.encode(path) | ||
}) | ||
.collect::<Result<Vec<_>>>()? | ||
}; | ||
|
||
Ok(result.into_iter().flatten().collect()) | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use crate::{fq_encode::FqEncoderOptionBuilder, output::write_json}; | ||
|
||
use super::*; | ||
|
||
#[test] | ||
fn test_encode_fq_for_json_for_large_size_fq() { | ||
let option = FqEncoderOptionBuilder::default() | ||
.kmer_size(3) | ||
.vectorized_target(true) | ||
.build() | ||
.unwrap(); | ||
|
||
let mut encoder = JsonEncoderBuilder::default() | ||
.option(option) | ||
.build() | ||
.unwrap(); | ||
|
||
let result = encoder.encode("tests/data/one_record.fq").unwrap(); | ||
let temp_file = tempfile::NamedTempFile::new().unwrap(); | ||
write_json(temp_file, result).unwrap(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
use crate::default::{BASES, KMER_SIZE, QUAL_OFFSET, VECTORIZED_TARGET}; | ||
use derive_builder::Builder; | ||
use serde::{Deserialize, Serialize}; | ||
use std::fmt::{self, Display, Formatter}; | ||
|
||
use pyo3::prelude::*; | ||
|
||
#[pyclass] | ||
#[derive(Debug, Builder, Default, Clone, Serialize, Deserialize)] | ||
pub struct FqEncoderOption { | ||
#[pyo3(get, set)] | ||
#[builder(default = "KMER_SIZE")] | ||
pub kmer_size: u8, | ||
|
||
#[pyo3(get, set)] | ||
#[builder(default = "QUAL_OFFSET")] | ||
pub qual_offset: u8, | ||
|
||
#[pyo3(get, set)] | ||
#[builder(default = "BASES.to_vec()")] | ||
pub bases: Vec<u8>, | ||
|
||
#[pyo3(get, set)] | ||
#[builder(default = "VECTORIZED_TARGET")] | ||
pub vectorized_target: bool, | ||
|
||
#[pyo3(get, set)] | ||
#[builder(default = "2")] | ||
pub threads: usize, | ||
} | ||
|
||
#[pymethods] | ||
impl FqEncoderOption { | ||
#[new] | ||
fn py_new( | ||
kmer_size: u8, | ||
qual_offset: u8, | ||
bases: String, | ||
vectorized_target: bool, | ||
threads: Option<usize>, | ||
) -> Self { | ||
FqEncoderOptionBuilder::default() | ||
.kmer_size(kmer_size) | ||
.qual_offset(qual_offset) | ||
.bases(bases.as_bytes().to_vec()) | ||
.vectorized_target(vectorized_target) | ||
.threads(threads.unwrap_or(2)) | ||
.build() | ||
.expect("Failed to build FqEncoderOption from Python arguments.") | ||
} | ||
} | ||
|
||
impl Display for FqEncoderOption { | ||
fn fmt(&self, f: &mut Formatter) -> fmt::Result { | ||
write!( | ||
f, | ||
"FqEncoderOption {{ kmer_size: {}, qual_offset: {}, bases: {:?}, vectorized_target: {}}}", | ||
self.kmer_size, self.qual_offset, self.bases, self.vectorized_target | ||
) | ||
} | ||
} |
Oops, something went wrong.