Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor #38

Merged
merged 13 commits into from
Jan 13, 2024
2 changes: 1 addition & 1 deletion py-ft/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
edition = "2021"
name = "pyft"
version = "0.1.11"
version = "0.1.12"

[lib]
crate-type = ["cdylib"]
Expand Down
4 changes: 4 additions & 0 deletions py-ft/DEV-NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,7 @@ source .env/bin/activate
#maturin develop
#maturin publish
```


# Remote pip install of dev branch
pip install -e 'git+https://github.com/fiberseq/fibertools-rs.git@refactor#egg=pyft&subdirectory=py-ft'
4 changes: 2 additions & 2 deletions py-ft/src/fiberdata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,12 +134,12 @@ impl Fiberdata {
let m6a = Basemods::new(
fiber.m6a.starts.clone(),
fiber.m6a.reference_starts.clone(),
fiber.m6a.ml.clone(),
fiber.m6a.qual.clone(),
);
let cpg = Basemods::new(
fiber.cpg.starts.clone(),
fiber.cpg.reference_starts.clone(),
fiber.cpg.ml.clone(),
fiber.cpg.qual.clone(),
);

let nuc = Ranges {
Expand Down
141 changes: 51 additions & 90 deletions src/bamlift/mod.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use itertools::multiunzip;
use rust_htslib::{bam, bam::ext::BamRecordExtensions};
use std::collections::HashMap;
use std::fmt::{Debug, Display};

/// Merge two lists into a sorted list
/// Normal sort is supposed to be very fast on two sorted lists
Expand Down Expand Up @@ -54,43 +53,6 @@ where
x
}

/// get positions on the complimented sequence in the cigar record
pub fn positions_on_complimented_sequence(
record: &bam::Record,
input_positions: &[i64],
) -> Vec<i64> {
// reverse positions if needed
let positions: Vec<i64> = if record.is_reverse() {
let seq_len = i64::try_from(record.seq_len()).unwrap();
input_positions
.iter()
.rev()
.map(|p| seq_len - p - 1)
.collect()
} else {
input_positions.to_vec()
};
positions
}

/// get positions on the complimented sequence in the cigar record
pub fn positions_on_complimented_sequence_in_place(
record: &bam::Record,
input_positions: &mut [i64],
part_of_range: bool,
) {
if !record.is_reverse() {
return;
}
let seq_len = i64::try_from(record.seq_len()).unwrap();
// need to correct for going from [) to (] if we are part of a range
let offset = if part_of_range { 0 } else { 1 };
for p in input_positions.iter_mut() {
*p = seq_len - *p - offset;
}
input_positions.reverse();
}

#[inline(always)]
pub fn is_sorted<T>(v: &[T]) -> bool
where
Expand All @@ -99,58 +61,6 @@ where
v.windows(2).all(|w| w[0] <= w[1])
}

/// search a sorted array for insertions positions of another sorted array
/// returned index i satisfies
/// left
/// a\[i-1\] < v <= a\[i\]
/// right
/// a\[i-1\] <= v < a\[i\]
/// <https://numpy.org/doc/stable/reference/generated/numpy.searchsorted.html>
/// ```
/// use fibertools_rs::bamlift::*;
/// let a = vec![1, 2, 3, 5, 6, 7, 8, 9, 10];
/// let v = vec![0, 1, 3, 4, 11, 11];
/// let indexes = search_sorted(&a, &v);
/// assert_eq!(indexes, vec![0, 0, 2, 3, 9, 9]);
/// ```
pub fn search_sorted<T>(a: &[T], v: &[T]) -> Vec<usize>
where
T: Ord,
T: Display,
[T]: Debug,
{
if !is_sorted(v) {
panic!("v is not sorted: {:?}", v);
}

let mut indexes = Vec::with_capacity(v.len());
let mut a_idx = 0;
for cur_v in v {
while a_idx < a.len() {
// check starting condition
if a_idx == 0 && *cur_v <= a[a_idx] {
indexes.push(0);
break;
} else if a_idx == 0 {
a_idx += 1;
}
// end condition
if a_idx == a.len() - 1 && *cur_v > a[a_idx] {
indexes.push(a_idx + 1);
break;
}
// middle of the array
else if (a[a_idx - 1] < *cur_v) && (*cur_v <= a[a_idx]) {
indexes.push(a_idx);
break;
}
a_idx += 1;
}
}
log::trace!("search_sorted: {:?}\n{:?}", v, indexes);
indexes
}

//
// CLOSEST LIFTOVER FUNCTIONS
//
Expand Down Expand Up @@ -378,3 +288,54 @@ pub fn lift_query_positions_exact(
liftover_exact(&aligned_block_pairs, reference_positions, true)
}
}

#[allow(clippy::type_complexity)]
fn lift_range_exact(
record: &bam::Record,
starts: &[i64],
ends: &[i64],
lift_reference_to_query: bool,
) -> (Vec<Option<i64>>, Vec<Option<i64>>, Vec<Option<i64>>) {
assert_eq!(starts.len(), ends.len());
let (ref_starts, ref_ends) = if !lift_reference_to_query {
(
lift_reference_positions_exact(record, starts),
lift_reference_positions_exact(record, ends),
)
} else {
(
lift_query_positions_exact(record, starts),
lift_query_positions_exact(record, ends),
)
};
assert_eq!(ref_starts.len(), ref_ends.len());
let rtn = ref_starts
.into_iter()
.zip(ref_ends)
.map(|(start, end)| match (start, end) {
(Some(start), Some(end)) => (Some(start), Some(end), Some(end - start)),
_ => (None, None, None),
})
.collect::<Vec<_>>();
multiunzip(rtn)
}

/// Find the exact range in the reference from a query range
#[allow(clippy::type_complexity)]
pub fn lift_query_range_exact(
record: &bam::Record,
starts: &[i64],
ends: &[i64],
) -> (Vec<Option<i64>>, Vec<Option<i64>>, Vec<Option<i64>>) {
lift_range_exact(record, starts, ends, false)
}

/// Find the exact range in the query from a reference range
#[allow(clippy::type_complexity)]
pub fn lift_reference_range_exact(
record: &bam::Record,
starts: &[i64],
ends: &[i64],
) -> (Vec<Option<i64>>, Vec<Option<i64>>, Vec<Option<i64>>) {
lift_range_exact(record, starts, ends, true)
}
Loading
Loading