This repository has been archived by the owner on Dec 23, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(extract): implement tool to parse barcodes (#2)
- Loading branch information
1 parent
e25ad9a
commit b7e7a2e
Showing
13 changed files
with
1,133 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
[package] | ||
name = "barkit" | ||
version = "0.1.0" | ||
edition = "2021" | ||
|
||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html | ||
|
||
[workspace] | ||
members = ["extract"] | ||
|
||
[dependencies] | ||
clap = { version = "4.5.4", features = ["env", "derive"] } | ||
extract = { path = "extract" } | ||
|
||
[profile.dev] | ||
opt-level = 0 | ||
debug = true | ||
panic = "abort" | ||
|
||
[profile.test] | ||
opt-level = 0 | ||
debug = true | ||
|
||
[profile.release] | ||
opt-level = 3 | ||
debug = false | ||
strip = "symbols" | ||
debug-assertions = false | ||
overflow-checks = false | ||
lto = "fat" | ||
panic = "unwind" | ||
incremental = false | ||
codegen-units = 1 | ||
|
||
[profile.bench] | ||
opt-level = 3 | ||
debug = false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,45 @@ | ||
# BarKit | ||
|
||
BarKit (barcodes toolkit) is an ultrafast toolkit for manipulating FASTQ barcodes | ||
> [!WARNING] | ||
> This tool is under development. Please use the first release version when it becomes available. | ||
BarKit (Barcodes Toolkit) is a toolkit designed for manipulating FASTQ barcodes. | ||
|
||
## Building from Source | ||
|
||
```bash | ||
cargo build --release | ||
sudo mv barkit /usr/local/bin/ | ||
``` | ||
|
||
## Extract Command | ||
|
||
The extract command is designed to parse barcode sequences from FASTQ reads using approximate regex matching based on a provided pattern. | ||
|
||
All parsed barcode sequences are moved to the read header with base quality separated by colons: | ||
|
||
``` | ||
@SEQ_ID UMI:ATGC:???? CB:ATGC:???? SB:ATGC:???? | ||
``` | ||
|
||
* **UMI**: Unique Molecular Identifier (Molecular Barcode) | ||
* **CB**: Cell Barcode | ||
* **SB**: Sample Barcode | ||
|
||
|
||
### Examples | ||
|
||
Parse the first twelve nucleotides as a UMI from each forward read: | ||
|
||
```bash | ||
barkit extract -1 <IN_FASTQ1> -2 <IN_FASTQ2> -p "^(?P<UMI>[ATGCN]{12})" -o <OUT_FASTQ1> -O <OUT_FASTQ2> | ||
``` | ||
|
||
Parse the first sixteen nucleotides as a cell barcode from each reverse read before the `atgccat` sequence: | ||
|
||
```bash | ||
barkit extract -1 <IN_FASTQ1> -2 <IN_FASTQ2> -P "^(?P<CB>[ATGCN]{16})atgccat" -o <OUT_FASTQ1> -O <OUT_FASTQ2> | ||
``` | ||
|
||
> [!NOTE] | ||
> Use lowercase letters for fuzzy match patterns. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
[package] | ||
name = "extract" | ||
version = "0.1.0" | ||
edition = "2021" | ||
|
||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html | ||
|
||
[dependencies] | ||
clap = { version = "4.5.4", features = ["derive", "env"] } | ||
flate2 = "1.0.30" | ||
regex = "1.10.4" | ||
seq_io = "0.3.2" | ||
thiserror = "1.0.61" | ||
rayon = "1.10.0" | ||
fancy-regex = "0.13.0" | ||
lz4 = "1.26.0" | ||
gzp = "0.11.3" | ||
indicatif = "0.17.8" | ||
console = "0.15.8" | ||
|
||
[dev-dependencies] | ||
rstest = "0.22.0" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,255 @@ | ||
use crate::pattern; | ||
use regex::bytes::{Captures, Regex}; | ||
|
||
use seq_io::fastq::{OwnedRecord, Record, RefRecord}; | ||
use std::{fmt, str}; | ||
|
||
use crate::error::Error; | ||
|
||
/// https://www.bioinformatics.org/sms/iupac.html | ||
const TRANSLATION_TABLE: [u8; 256] = { | ||
let mut table = [b'A'; 256]; | ||
|
||
table[b'A' as usize] = b'T'; | ||
table[b'T' as usize] = b'A'; | ||
table[b'G' as usize] = b'C'; | ||
table[b'C' as usize] = b'G'; | ||
table[b'R' as usize] = b'R'; | ||
table[b'Y' as usize] = b'Y'; | ||
table[b'S' as usize] = b'S'; | ||
table[b'W' as usize] = b'W'; | ||
table[b'K' as usize] = b'K'; | ||
table[b'M' as usize] = b'M'; | ||
table[b'B' as usize] = b'B'; | ||
table[b'D' as usize] = b'D'; | ||
table[b'H' as usize] = b'H'; | ||
table[b'V' as usize] = b'V'; | ||
table[b'N' as usize] = b'N'; | ||
|
||
table | ||
}; | ||
|
||
#[derive(Clone)] | ||
pub enum BarcodeType { | ||
Umi, | ||
Sample, | ||
Cell | ||
} | ||
|
||
impl fmt::Display for BarcodeType { | ||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||
let barcode_str = match self { | ||
BarcodeType::Umi => "UMI", | ||
BarcodeType::Sample => "SB", | ||
BarcodeType::Cell => "CB", | ||
}; | ||
write!(f, "{}", barcode_str) | ||
} | ||
} | ||
|
||
impl BarcodeType { | ||
fn get_barcode_type(name: &str) -> Result<Self, Error> { | ||
match name { | ||
"UMI" => Ok(BarcodeType::Umi), | ||
"SB" => Ok(BarcodeType::Sample), | ||
"CB" => Ok(BarcodeType::Cell), | ||
_ => Err(Error::UnexpectedCaptureGroupName(name.to_owned())), | ||
} | ||
} | ||
} | ||
|
||
#[derive(Clone)] | ||
pub struct BarcodeRegex { | ||
regex: Regex, | ||
barcode_types: Vec<BarcodeType>, | ||
} | ||
|
||
impl BarcodeRegex { | ||
pub fn new(pattern: &str, max_error: usize) -> Result<Self, Error> { | ||
let fuzzy_pattern = pattern::get_with_errors(pattern, &max_error); | ||
let regex = Regex::new(&fuzzy_pattern)?; | ||
let barcode_types = Self::parse_capture_groups(®ex)?; | ||
Ok(Self { | ||
regex, | ||
barcode_types, | ||
}) | ||
} | ||
|
||
pub fn get_captures<'a>(&'a self, read_seq: &'a [u8]) -> Result<Captures, Error> { | ||
match self.regex.captures(read_seq) { | ||
Some(capture) => Ok(capture), | ||
None => Err(Error::PatternNotMatched), | ||
} | ||
} | ||
|
||
fn parse_capture_groups(regex: &Regex) -> Result<Vec<BarcodeType>, Error> { | ||
let mut capture_groups = Vec::<BarcodeType>::new(); | ||
for capture_group in regex.capture_names().collect::<Vec<_>>().into_iter().flatten() { | ||
capture_groups.push(BarcodeType::get_barcode_type(capture_group)?) | ||
} | ||
if capture_groups.is_empty() { | ||
return Err(Error::BarcodeCaptureGroupNotFound(regex.to_string())); | ||
} | ||
Ok(capture_groups) | ||
} | ||
} | ||
|
||
pub struct BarcodeParser { | ||
barcode_regex: BarcodeRegex, | ||
skip_trimming: bool, | ||
rc_barcodes: bool, | ||
} | ||
|
||
impl BarcodeParser { | ||
pub fn new( | ||
barcode_regex: Option<BarcodeRegex>, | ||
skip_trimming: bool, | ||
rc_barcodes: bool, | ||
) -> Option<Self> { | ||
barcode_regex.map(|regex| BarcodeParser { | ||
barcode_regex: regex, | ||
skip_trimming, | ||
rc_barcodes, | ||
}) | ||
} | ||
|
||
pub fn extract_barcodes(&self, record: &RefRecord) -> Option<OwnedRecord> { | ||
let read_captures = self.barcode_regex.get_captures(record.seq()); | ||
let read_seq_rc: Vec<u8>; | ||
let read_captures = if read_captures.is_err() && self.rc_barcodes { | ||
read_seq_rc = get_reverse_complement(record.seq()); | ||
self.barcode_regex.get_captures(&read_seq_rc) | ||
} else { | ||
read_captures | ||
}; | ||
self.create_new_read(read_captures.map(Some), record) | ||
} | ||
|
||
fn create_new_read( | ||
&self, | ||
read_captures: Result<Option<Captures>, Error>, | ||
record: &RefRecord, | ||
) -> Option<seq_io::fastq::OwnedRecord> { | ||
match (read_captures, self.skip_trimming) { | ||
(Ok(Some(captures)), true) => { | ||
Some(self.get_read_with_new_header(&captures, record).ok()?) | ||
} | ||
(Ok(Some(captures)), false) => { | ||
let new_read = self.get_read_with_new_header(&captures, record).ok()?; | ||
Some(trim_adapters(captures, &new_read).ok()?) | ||
} | ||
(Ok(None), _) => Some(OwnedRecord { | ||
head: record.head().to_vec(), | ||
seq: record.seq().to_vec(), | ||
qual: record.qual().to_vec(), | ||
}), | ||
(Err(_), _) => None, | ||
} | ||
} | ||
|
||
fn get_read_with_new_header( | ||
&self, | ||
captures: &Captures, | ||
record: &RefRecord, | ||
) -> Result<OwnedRecord, Error> { | ||
let mut head = record.head().to_vec(); | ||
let seq = record.seq().to_vec(); | ||
let qual = record.qual().to_vec(); | ||
|
||
for barcode in &self.barcode_regex.barcode_types { | ||
let barcode_name = barcode.to_string(); | ||
let (barcode_start, barcode_end) = | ||
get_barcode_match_positions(&barcode_name, captures)?; | ||
head = add_to_the_header( | ||
&barcode_name, | ||
&head, | ||
&seq, | ||
&qual, | ||
barcode_start, | ||
barcode_end, | ||
)?; | ||
} | ||
|
||
Ok(OwnedRecord { head, seq, qual }) | ||
} | ||
} | ||
|
||
fn get_full_match_positions(captures: &Captures) -> Result<(usize, usize), Error> { | ||
let full_match = captures | ||
.get(0) | ||
.ok_or(Error::BarcodeCaptureGroupNotFound("0".to_owned()))?; | ||
|
||
Ok((full_match.start(), full_match.end())) | ||
} | ||
|
||
fn get_barcode_match_positions( | ||
barcode_name: &str, | ||
captures: &Captures, | ||
) -> Result<(usize, usize), Error> { | ||
let full_match = captures | ||
.name(barcode_name) | ||
.ok_or(Error::BarcodeCaptureGroupNotFound(barcode_name.to_string()))?; | ||
|
||
Ok((full_match.start(), full_match.end())) | ||
} | ||
|
||
fn trim_adapters(captures: Captures, record: &OwnedRecord) -> Result<OwnedRecord, Error> { | ||
let (start, end) = get_full_match_positions(&captures)?; | ||
let seq = [&record.seq()[..start], &record.seq()[end..]].concat(); | ||
let qual = [&record.qual()[..start], &record.qual()[end..]].concat(); | ||
|
||
Ok(OwnedRecord { | ||
head: record.head().to_vec(), | ||
seq, | ||
qual, | ||
}) | ||
} | ||
|
||
fn add_to_the_header( | ||
barcode_type: &str, | ||
head: &[u8], | ||
seq: &[u8], | ||
qual: &[u8], | ||
start: usize, | ||
end: usize, | ||
) -> Result<Vec<u8>, Error> { | ||
let barcode_seq = &seq[start..end]; | ||
let barcode_qual = &qual[start..end]; | ||
|
||
let mut result = Vec::with_capacity( | ||
head.len() + barcode_type.len() + barcode_seq.len() + barcode_qual.len() + 3, | ||
); | ||
result.extend_from_slice(head); | ||
result.extend_from_slice( | ||
format!(" {}:{}:", barcode_type, std::str::from_utf8(barcode_seq)?).as_bytes(), | ||
); | ||
result.extend_from_slice(barcode_qual); | ||
|
||
Ok(result) | ||
} | ||
|
||
pub fn get_reverse_complement(sequence: &[u8]) -> Vec<u8> { | ||
sequence | ||
.iter() | ||
.rev() | ||
.map(|&base| TRANSLATION_TABLE[base as usize]) | ||
.collect() | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use rstest::rstest; | ||
|
||
use crate::barcode::get_reverse_complement; | ||
|
||
#[rstest] | ||
#[case(b"", b"")] | ||
#[case(b"GGGCCCAAATTT", b"AAATTTGGGCCC")] | ||
#[case(b"ATGCN", b"NGCAT")] | ||
#[case(b"AAP", b"ATT")] | ||
#[case(b"CCX", b"AGG")] | ||
#[case(b"PPP", b"AAA")] | ||
fn test_get_reverse_complement(#[case] sequence: &[u8], #[case] rc_sequence: &[u8]) { | ||
assert_eq!(get_reverse_complement(sequence), rc_sequence); | ||
} | ||
} |
Oops, something went wrong.