Skip to content
This repository has been archived by the owner on Dec 23, 2024. It is now read-only.

Commit

Permalink
feat(extract): implement tool to parse barcodes (#2)
Browse files Browse the repository at this point in the history
  • Loading branch information
nsyzrantsev committed Aug 26, 2024
1 parent e25ad9a commit b7e7a2e
Show file tree
Hide file tree
Showing 13 changed files with 1,133 additions and 1 deletion.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,8 @@ Cargo.lock

# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb


# Added by cargo

/target
37 changes: 37 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
[package]
name = "barkit"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[workspace]
members = ["extract"]

[dependencies]
clap = { version = "4.5.4", features = ["env", "derive"] }
extract = { path = "extract" }

[profile.dev]
opt-level = 0
debug = true
panic = "abort"

[profile.test]
opt-level = 0
debug = true

[profile.release]
opt-level = 3
debug = false
strip = "symbols"
debug-assertions = false
overflow-checks = false
lto = "fat"
panic = "unwind"
incremental = false
codegen-units = 1

[profile.bench]
opt-level = 3
debug = false
44 changes: 43 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,45 @@
# BarKit

BarKit (barcodes toolkit) is an ultrafast toolkit for manipulating FASTQ barcodes
> [!WARNING]
> This tool is under development. Please use the first release version when it becomes available.
BarKit (Barcodes Toolkit) is a toolkit designed for manipulating FASTQ barcodes.

## Building from Source

```bash
cargo build --release
sudo mv barkit /usr/local/bin/
```

## Extract Command

The extract command is designed to parse barcode sequences from FASTQ reads using approximate regex matching based on a provided pattern.

All parsed barcode sequences are moved to the read header with base quality separated by colons:

```
@SEQ_ID UMI:ATGC:???? CB:ATGC:???? SB:ATGC:????
```

* **UMI**: Unique Molecular Identifier (Molecular Barcode)
* **CB**: Cell Barcode
* **SB**: Sample Barcode


### Examples

Parse the first twelve nucleotides as a UMI from each forward read:

```bash
barkit extract -1 <IN_FASTQ1> -2 <IN_FASTQ2> -p "^(?P<UMI>[ATGCN]{12})" -o <OUT_FASTQ1> -O <OUT_FASTQ2>
```

Parse the first sixteen nucleotides as a cell barcode from each reverse read before the `atgccat` sequence:

```bash
barkit extract -1 <IN_FASTQ1> -2 <IN_FASTQ2> -P "^(?P<CB>[ATGCN]{16})atgccat" -o <OUT_FASTQ1> -O <OUT_FASTQ2>
```

> [!NOTE]
> Use lowercase letters for fuzzy match patterns.
22 changes: 22 additions & 0 deletions extract/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[package]
name = "extract"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
clap = { version = "4.5.4", features = ["derive", "env"] }
flate2 = "1.0.30"
regex = "1.10.4"
seq_io = "0.3.2"
thiserror = "1.0.61"
rayon = "1.10.0"
fancy-regex = "0.13.0"
lz4 = "1.26.0"
gzp = "0.11.3"
indicatif = "0.17.8"
console = "0.15.8"

[dev-dependencies]
rstest = "0.22.0"
255 changes: 255 additions & 0 deletions extract/src/barcode.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
use crate::pattern;
use regex::bytes::{Captures, Regex};

use seq_io::fastq::{OwnedRecord, Record, RefRecord};
use std::{fmt, str};

use crate::error::Error;

/// https://www.bioinformatics.org/sms/iupac.html
const TRANSLATION_TABLE: [u8; 256] = {
let mut table = [b'A'; 256];

table[b'A' as usize] = b'T';
table[b'T' as usize] = b'A';
table[b'G' as usize] = b'C';
table[b'C' as usize] = b'G';
table[b'R' as usize] = b'R';
table[b'Y' as usize] = b'Y';
table[b'S' as usize] = b'S';
table[b'W' as usize] = b'W';
table[b'K' as usize] = b'K';
table[b'M' as usize] = b'M';
table[b'B' as usize] = b'B';
table[b'D' as usize] = b'D';
table[b'H' as usize] = b'H';
table[b'V' as usize] = b'V';
table[b'N' as usize] = b'N';

table
};

#[derive(Clone)]
pub enum BarcodeType {
Umi,
Sample,
Cell
}

impl fmt::Display for BarcodeType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let barcode_str = match self {
BarcodeType::Umi => "UMI",
BarcodeType::Sample => "SB",
BarcodeType::Cell => "CB",
};
write!(f, "{}", barcode_str)
}
}

impl BarcodeType {
fn get_barcode_type(name: &str) -> Result<Self, Error> {
match name {
"UMI" => Ok(BarcodeType::Umi),
"SB" => Ok(BarcodeType::Sample),
"CB" => Ok(BarcodeType::Cell),
_ => Err(Error::UnexpectedCaptureGroupName(name.to_owned())),
}
}
}

#[derive(Clone)]
pub struct BarcodeRegex {
regex: Regex,
barcode_types: Vec<BarcodeType>,
}

impl BarcodeRegex {
pub fn new(pattern: &str, max_error: usize) -> Result<Self, Error> {
let fuzzy_pattern = pattern::get_with_errors(pattern, &max_error);
let regex = Regex::new(&fuzzy_pattern)?;
let barcode_types = Self::parse_capture_groups(&regex)?;
Ok(Self {
regex,
barcode_types,
})
}

pub fn get_captures<'a>(&'a self, read_seq: &'a [u8]) -> Result<Captures, Error> {
match self.regex.captures(read_seq) {
Some(capture) => Ok(capture),
None => Err(Error::PatternNotMatched),
}
}

fn parse_capture_groups(regex: &Regex) -> Result<Vec<BarcodeType>, Error> {
let mut capture_groups = Vec::<BarcodeType>::new();
for capture_group in regex.capture_names().collect::<Vec<_>>().into_iter().flatten() {
capture_groups.push(BarcodeType::get_barcode_type(capture_group)?)
}
if capture_groups.is_empty() {
return Err(Error::BarcodeCaptureGroupNotFound(regex.to_string()));
}
Ok(capture_groups)
}
}

pub struct BarcodeParser {
barcode_regex: BarcodeRegex,
skip_trimming: bool,
rc_barcodes: bool,
}

impl BarcodeParser {
pub fn new(
barcode_regex: Option<BarcodeRegex>,
skip_trimming: bool,
rc_barcodes: bool,
) -> Option<Self> {
barcode_regex.map(|regex| BarcodeParser {
barcode_regex: regex,
skip_trimming,
rc_barcodes,
})
}

pub fn extract_barcodes(&self, record: &RefRecord) -> Option<OwnedRecord> {
let read_captures = self.barcode_regex.get_captures(record.seq());
let read_seq_rc: Vec<u8>;
let read_captures = if read_captures.is_err() && self.rc_barcodes {
read_seq_rc = get_reverse_complement(record.seq());
self.barcode_regex.get_captures(&read_seq_rc)
} else {
read_captures
};
self.create_new_read(read_captures.map(Some), record)
}

fn create_new_read(
&self,
read_captures: Result<Option<Captures>, Error>,
record: &RefRecord,
) -> Option<seq_io::fastq::OwnedRecord> {
match (read_captures, self.skip_trimming) {
(Ok(Some(captures)), true) => {
Some(self.get_read_with_new_header(&captures, record).ok()?)
}
(Ok(Some(captures)), false) => {
let new_read = self.get_read_with_new_header(&captures, record).ok()?;
Some(trim_adapters(captures, &new_read).ok()?)
}
(Ok(None), _) => Some(OwnedRecord {
head: record.head().to_vec(),
seq: record.seq().to_vec(),
qual: record.qual().to_vec(),
}),
(Err(_), _) => None,
}
}

fn get_read_with_new_header(
&self,
captures: &Captures,
record: &RefRecord,
) -> Result<OwnedRecord, Error> {
let mut head = record.head().to_vec();
let seq = record.seq().to_vec();
let qual = record.qual().to_vec();

for barcode in &self.barcode_regex.barcode_types {
let barcode_name = barcode.to_string();
let (barcode_start, barcode_end) =
get_barcode_match_positions(&barcode_name, captures)?;
head = add_to_the_header(
&barcode_name,
&head,
&seq,
&qual,
barcode_start,
barcode_end,
)?;
}

Ok(OwnedRecord { head, seq, qual })
}
}

fn get_full_match_positions(captures: &Captures) -> Result<(usize, usize), Error> {
let full_match = captures
.get(0)
.ok_or(Error::BarcodeCaptureGroupNotFound("0".to_owned()))?;

Ok((full_match.start(), full_match.end()))
}

fn get_barcode_match_positions(
barcode_name: &str,
captures: &Captures,
) -> Result<(usize, usize), Error> {
let full_match = captures
.name(barcode_name)
.ok_or(Error::BarcodeCaptureGroupNotFound(barcode_name.to_string()))?;

Ok((full_match.start(), full_match.end()))
}

fn trim_adapters(captures: Captures, record: &OwnedRecord) -> Result<OwnedRecord, Error> {
let (start, end) = get_full_match_positions(&captures)?;
let seq = [&record.seq()[..start], &record.seq()[end..]].concat();
let qual = [&record.qual()[..start], &record.qual()[end..]].concat();

Ok(OwnedRecord {
head: record.head().to_vec(),
seq,
qual,
})
}

fn add_to_the_header(
barcode_type: &str,
head: &[u8],
seq: &[u8],
qual: &[u8],
start: usize,
end: usize,
) -> Result<Vec<u8>, Error> {
let barcode_seq = &seq[start..end];
let barcode_qual = &qual[start..end];

let mut result = Vec::with_capacity(
head.len() + barcode_type.len() + barcode_seq.len() + barcode_qual.len() + 3,
);
result.extend_from_slice(head);
result.extend_from_slice(
format!(" {}:{}:", barcode_type, std::str::from_utf8(barcode_seq)?).as_bytes(),
);
result.extend_from_slice(barcode_qual);

Ok(result)
}

pub fn get_reverse_complement(sequence: &[u8]) -> Vec<u8> {
sequence
.iter()
.rev()
.map(|&base| TRANSLATION_TABLE[base as usize])
.collect()
}

#[cfg(test)]
mod tests {
use rstest::rstest;

use crate::barcode::get_reverse_complement;

#[rstest]
#[case(b"", b"")]
#[case(b"GGGCCCAAATTT", b"AAATTTGGGCCC")]
#[case(b"ATGCN", b"NGCAT")]
#[case(b"AAP", b"ATT")]
#[case(b"CCX", b"AGG")]
#[case(b"PPP", b"AAA")]
fn test_get_reverse_complement(#[case] sequence: &[u8], #[case] rc_sequence: &[u8]) {
assert_eq!(get_reverse_complement(sequence), rc_sequence);
}
}
Loading

0 comments on commit b7e7a2e

Please sign in to comment.