Skip to content

Commit

Permalink
feat: add qual to ft cetner, and clean the ft center code more
Browse files Browse the repository at this point in the history
  • Loading branch information
mrvollger committed Jul 24, 2023
1 parent 1a1f260 commit b9f4950
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 96 deletions.
120 changes: 58 additions & 62 deletions src/center.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,46 +94,6 @@ impl CenteredFiberData {
String::from_utf8_lossy(&out_seq).to_string()
}

pub fn m6a_positions(&self) -> Vec<i64> {
self.fiber.m6a_positions(self.reference).to_vec()
}

pub fn cpg_positions(&self) -> Vec<i64> {
self.fiber.cpg_positions(self.reference).to_vec()
}

pub fn nuc_positions(&self) -> Vec<(i64, i64)> {
let (starts, ends) = if self.reference {
(
&self.fiber.nuc.reference_starts,
&self.fiber.nuc.reference_ends,
)
} else {
(&self.fiber.nuc.starts, &self.fiber.nuc.ends)
};
starts
.clone()
.into_iter()
.zip(ends.clone().into_iter())
.collect()
}

pub fn msp_positions(&self) -> Vec<(i64, i64)> {
let (starts, ends) = if self.reference {
(
&self.fiber.msp.reference_starts,
&self.fiber.msp.reference_ends,
)
} else {
(&self.fiber.msp.starts, &self.fiber.msp.ends)
};
starts
.clone()
.into_iter()
.zip(ends.clone().into_iter())
.collect()
}

pub fn get_sequence(&self) -> String {
let forward_bases = if self.center_position.strand == '+' {
self.fiber.record.seq().as_bytes()
Expand All @@ -160,16 +120,42 @@ impl CenteredFiberData {
)
}

#[allow(clippy::type_complexity)]
fn grab_data(&self) -> (&[i64], &[u8], &[i64], &[u8], &[i64], &[i64], &[i64], &[i64]) {
if self.reference {
(
&self.fiber.m6a.reference_starts,
&self.fiber.m6a.ml,
&self.fiber.cpg.reference_starts,
&self.fiber.cpg.ml,
&self.fiber.nuc.reference_starts,
&self.fiber.nuc.reference_ends,
&self.fiber.msp.reference_starts,
&self.fiber.msp.reference_ends,
)
} else {
(
&self.fiber.m6a.starts,
&self.fiber.m6a.ml,
&self.fiber.cpg.starts,
&self.fiber.cpg.ml,
&self.fiber.nuc.starts,
&self.fiber.nuc.ends,
&self.fiber.msp.starts,
&self.fiber.msp.ends,
)
}
}

pub fn write(&self) -> String {
let m6a = join_by_str(self.m6a_positions(), ",");
let cpg = join_by_str(self.cpg_positions(), ",");
let (nuc_st, nuc_en) = unzip_to_vectors(self.nuc_positions());
let (msp_st, msp_en) = unzip_to_vectors(self.msp_positions());
let (m6a, m6a_qual, cpg, cpg_qual, nuc_st, nuc_en, msp_st, msp_en) = self.grab_data();
format!(
"{}{}\t{}\t{}\t{}\t{}\t{}\t{}\n",
"{}{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n",
self.leading_columns(),
m6a,
cpg,
join_by_str(m6a, ""),
join_by_str(m6a_qual, ","),
join_by_str(cpg, ""),
join_by_str(cpg_qual, ","),
join_by_str(nuc_st, ","),
join_by_str(nuc_en, ","),
join_by_str(msp_st, ","),
Expand All @@ -196,10 +182,12 @@ impl CenteredFiberData {
}
pub fn header() -> String {
format!(
"{}{}\t{}\t{}\t{}\t{}\t{}\t{}\n",
"{}{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n",
CenteredFiberData::leading_header(),
"centered_m6a_positions",
"m6a_qual",
"centered_5mC_positions",
"5mC_qual",
"centered_nuc_starts",
"centered_nuc_ends",
"centered_msp_starts",
Expand All @@ -210,32 +198,35 @@ impl CenteredFiberData {

pub fn long_header() -> String {
format!(
"{}{}\t{}\t{}\n",
"{}{}\t{}\t{}\t{}\n",
CenteredFiberData::leading_header(),
"centered_position_type",
"centered_start",
"centered_end",
"centered_qual",
)
}

pub fn write_long(&self) -> String {
let m6a = self.m6a_positions();
let cpg = self.cpg_positions();
let (nuc_st, nuc_en) = unzip_to_vectors(self.nuc_positions());
let (msp_st, msp_en) = unzip_to_vectors(self.msp_positions());
let mut rtn = String::new();
let (m6a, m6a_qual, cpg, cpg_qual, nuc_st, nuc_en, msp_st, msp_en) = self.grab_data();
for (t, vals) in [
("m6a", (m6a, None)),
("5mC", (cpg, None)),
("nuc", (nuc_st, Some(nuc_en))),
("msp", (msp_st, Some(msp_en))),
("m6a", (m6a, None, Some(m6a_qual))),
("5mC", (cpg, None, Some(cpg_qual))),
("nuc", (nuc_st, Some(nuc_en), None)),
("msp", (msp_st, Some(msp_en), None)),
] {
let starts = vals.0;
let ends = match vals.1 {
Some(ends) => ends,
None => starts.iter().map(|&st| st + 1).collect(),
let ends: Vec<i64> = match vals.1 {
Some(ends) => ends.to_vec(),
None => starts.to_vec().iter().map(|&st| st + 1).collect(),
};
for (&st, &en) in starts.iter().zip(ends.iter()) {
let quals = match vals.2 {
Some(quals) => quals.to_vec(),
None => vec![0; starts.len()],
};

for ((&st, &en), &qual) in starts.iter().zip(ends.iter()).zip(quals.iter()) {
let mut write = true;
if let Some(dist) = self.dist {
// skip writing if we are outside the motif range
Expand All @@ -247,7 +238,12 @@ impl CenteredFiberData {
// add the leading data
rtn.push_str(&self.leading_columns());
// add the long form data
write!(&mut rtn, "{}", format_args!("{}\t{}\t{}\n", t, st, en)).unwrap();
write!(
&mut rtn,
"{}",
format_args!("{}\t{}\t{}\t{}\n", t, st, en, qual)
)
.unwrap();
}
}
}
Expand Down
68 changes: 34 additions & 34 deletions tests/data/center.bed
Original file line number Diff line number Diff line change
Expand Up @@ -4,37 +4,37 @@ chr15 74054174 74054209 CTCF_XL 3.870000e-11 + AAGCAGTTCCCCCATGTAACCTGCAGGTGGCAC
chr13 78121669 78121704 CTCF_XL 2.330000e-11 - CTGCACTTCTTGTTTGCTGCCAGCAGGGGGAGGTG
chr1 15323698 15323733 CTCF_XL 4.970000e-11 + CTGCAGGGTCCTGTAATGGCCAGCAGAGGGCAGCA
chr1 153568701 153568736 CTCF_XL 4.680000e-11 - CTGCAATGACACGCCTTAGCCACCAGAGGGCACGA
#chr7 131689283 131689318 CTCF_XL 8.620000e-11 + ATGCACTGCTGGGATCTGTCCACCAGAGGGCAGGA
#chr1 32876798 32876833 CTCF_XL 6.370000e-11 + CTGCACAGCCTGGGCACAGCCAGCAGAGGGCGCCA
#chr8 42692645 42692680 CTCF_XL 4.890000e-11 + CTGCAGTGTGAGGAGTGAGCCACCAGAGGGCACCC
#chr15 75802259 75802294 CTCF_XL 8.000000e-11 - AAGCAGTTCCCCCATATAACCTGCAGGTGGCACCA
#chr15 38440420 38440455 CTCF_XL 9.810000e-12 + ATGCAGTTCCATCTACAGGCCAGTAGATGGCGCTG
#chr22 36301453 36301488 CTCF_XL 1.100000e-11 + ATGCCGTGCCTACCGATGGCCAGCAGGTGGCAGCA
#chr11 19077152 19077187 CTCF_XL 4.970000e-11 + CTGCAGTGCTAGGCCTCCTCCACAAGAGGGCGCAC
#chr4 54234421 54234456 CTCF_XL 8.620000e-11 - CAGTAGTTCTTACGAGTGGCCAGCAGATGGCAGTG
#chr14 104896837 104896872 CTCF_XL 9.760000e-11 + CTGCAATTCAGAGATGCTGCCACCAGGAGGCGGCC
#chr4 37603163 37603198 CTCF_XL 1.180000e-12 + GTGCAGTTCTCCGATTCGGCCAGCAGAGGGAGCGA
#chrX 140766189 140766224 CTCF_XL 1.710000e-11 + TAGCTGTTCCACGCTGTAGCCAGCAGAGGGCACTG
#chr3 126522437 126522472 CTCF_XL 5.370000e-11 + ATGTTATTCCCAACTTTGGCCACCAGGTGGCAGCA
#chr1 46477574 46477609 CTCF_XL 2.000000e-12 + GTGCAGTTCTGAGAGCCAGACAGCAGGGGGCGGCC
#chr22 26360575 26360610 CTCF_XL 4.520000e-11 - CTGCAGTGCAGAAAAATGACCAGCAGGTGGCAGTG
#chr10 30392653 30392688 CTCF_XL 4.680000e-11 + AAGCAGTTCCCCCACATAACCTGCAGGTGGCACCA
#chr17 60638915 60638950 CTCF_XL 5.750000e-12 - ATGCAATTCCAGCAGGCAGCCACTAGAGGGCACCT
#chr1 30719373 30719408 CTCF_XL 4.490000e-14 + GTGCAGTTCTGCGCTTTGGCCACCAGATGGCAGCC
#chr12 64802374 64802409 CTCF_XL 4.150000e-11 + CTGCTGTTACCCGAGATGGCCAGCAGAGGGCAGGC
#chr13 75695980 75696015 CTCF_XL 6.100000e-12 + CTGCAGTACTCAGAATGTGCCAGCAGGTGGCAGCA
#chr17 21280152 21280187 CTCF_XL 6.370000e-11 + GAGCACTGCCGGGCCCTGGTCACCAGAGGGCGCTG
#chr1 5510035 5510070 CTCF_XL 2.740000e-11 + CTGCAGTGTGCAGAACTGGCCACCAGGTGGCAGCA
#chr15 77959535 77959570 CTCF_XL 8.000000e-11 + AAGCAGTTCCCCCATATAACCTGCAGGTGGCACCA
#chr7 19709134 19709169 CTCF_XL 9.760000e-11 + CTGCATTACGCAGCCCTGACCAGCAGGGGGAACCG
#chr21 43646828 43646863 CTCF_XL 6.000000e-11 - CAGCAATGCTCCCATCTGGCCACAAGGTGGCGGCC
#chr8 22137534 22137569 CTCF_XL 5.950000e-11 - TTGCACCTCTGAGGAGTGGCCACGAGAGGGCAGCA
#chr11 15650153 15650188 CTCF_XL 5.560000e-11 + CTGCAGTTCTCCGCTGCGGACGGTAGGTGGCAGCG
#chr10 101836140 101836175 CTCF_XL 6.760000e-11 + CTGCAGTTCTATACAGCCACCACCGGGGGGCACGC
#chr15 89358745 89358780 CTCF_XL 6.320000e-11 + CTGCATCTCCCCGGAACGCCCACCAGAGGGCGCTG
#chr1 27300950 27300985 CTCF_XL 1.370000e-11 + GTGCAGTTGCACTCTGTGCCCAGCAGAGGGCAGCC
#chr2 238970654 238970689 CTCF_XL 3.930000e-11 - CAGCAGTTTCAGGGGTTGGCCACTAGGTGGCAGAG
#chr2 26469278 26469313 CTCF_XL 2.760000e-11 - CTGCAATTTTCACAGACTGCCACCAGGTGGCGGTG
#chr19 18388551 18388586 CTCF_XL 5.240000e-11 + TTGCACTTGCGGCCGCAAGCCGCCAGGGGGCGCCG
#chr1 27097569 27097604 CTCF_XL 8.660000e-12 - GTGCATTTCCTGGATCCAACCACCAGAGGGCAGTG
#chr10 116788234 116788269 CTCF_XL 3.830000e-11 - GCGCAGTTCACCAGCTCCACCACCAGAGGGCGGCG
chr7 131689283 131689318 CTCF_XL 8.620000e-11 + ATGCACTGCTGGGATCTGTCCACCAGAGGGCAGGA
chr1 32876798 32876833 CTCF_XL 6.370000e-11 + CTGCACAGCCTGGGCACAGCCAGCAGAGGGCGCCA
chr8 42692645 42692680 CTCF_XL 4.890000e-11 + CTGCAGTGTGAGGAGTGAGCCACCAGAGGGCACCC
chr15 75802259 75802294 CTCF_XL 8.000000e-11 - AAGCAGTTCCCCCATATAACCTGCAGGTGGCACCA
chr15 38440420 38440455 CTCF_XL 9.810000e-12 + ATGCAGTTCCATCTACAGGCCAGTAGATGGCGCTG
chr22 36301453 36301488 CTCF_XL 1.100000e-11 + ATGCCGTGCCTACCGATGGCCAGCAGGTGGCAGCA
chr11 19077152 19077187 CTCF_XL 4.970000e-11 + CTGCAGTGCTAGGCCTCCTCCACAAGAGGGCGCAC
chr4 54234421 54234456 CTCF_XL 8.620000e-11 - CAGTAGTTCTTACGAGTGGCCAGCAGATGGCAGTG
chr14 104896837 104896872 CTCF_XL 9.760000e-11 + CTGCAATTCAGAGATGCTGCCACCAGGAGGCGGCC
chr4 37603163 37603198 CTCF_XL 1.180000e-12 + GTGCAGTTCTCCGATTCGGCCAGCAGAGGGAGCGA
chrX 140766189 140766224 CTCF_XL 1.710000e-11 + TAGCTGTTCCACGCTGTAGCCAGCAGAGGGCACTG
chr3 126522437 126522472 CTCF_XL 5.370000e-11 + ATGTTATTCCCAACTTTGGCCACCAGGTGGCAGCA
chr1 46477574 46477609 CTCF_XL 2.000000e-12 + GTGCAGTTCTGAGAGCCAGACAGCAGGGGGCGGCC
chr22 26360575 26360610 CTCF_XL 4.520000e-11 - CTGCAGTGCAGAAAAATGACCAGCAGGTGGCAGTG
chr10 30392653 30392688 CTCF_XL 4.680000e-11 + AAGCAGTTCCCCCACATAACCTGCAGGTGGCACCA
chr17 60638915 60638950 CTCF_XL 5.750000e-12 - ATGCAATTCCAGCAGGCAGCCACTAGAGGGCACCT
chr1 30719373 30719408 CTCF_XL 4.490000e-14 + GTGCAGTTCTGCGCTTTGGCCACCAGATGGCAGCC
chr12 64802374 64802409 CTCF_XL 4.150000e-11 + CTGCTGTTACCCGAGATGGCCAGCAGAGGGCAGGC
chr13 75695980 75696015 CTCF_XL 6.100000e-12 + CTGCAGTACTCAGAATGTGCCAGCAGGTGGCAGCA
chr17 21280152 21280187 CTCF_XL 6.370000e-11 + GAGCACTGCCGGGCCCTGGTCACCAGAGGGCGCTG
chr1 5510035 5510070 CTCF_XL 2.740000e-11 + CTGCAGTGTGCAGAACTGGCCACCAGGTGGCAGCA
chr15 77959535 77959570 CTCF_XL 8.000000e-11 + AAGCAGTTCCCCCATATAACCTGCAGGTGGCACCA
chr7 19709134 19709169 CTCF_XL 9.760000e-11 + CTGCATTACGCAGCCCTGACCAGCAGGGGGAACCG
chr21 43646828 43646863 CTCF_XL 6.000000e-11 - CAGCAATGCTCCCATCTGGCCACAAGGTGGCGGCC
chr8 22137534 22137569 CTCF_XL 5.950000e-11 - TTGCACCTCTGAGGAGTGGCCACGAGAGGGCAGCA
chr11 15650153 15650188 CTCF_XL 5.560000e-11 + CTGCAGTTCTCCGCTGCGGACGGTAGGTGGCAGCG
chr10 101836140 101836175 CTCF_XL 6.760000e-11 + CTGCAGTTCTATACAGCCACCACCGGGGGGCACGC
chr15 89358745 89358780 CTCF_XL 6.320000e-11 + CTGCATCTCCCCGGAACGCCCACCAGAGGGCGCTG
chr1 27300950 27300985 CTCF_XL 1.370000e-11 + GTGCAGTTGCACTCTGTGCCCAGCAGAGGGCAGCC
chr2 238970654 238970689 CTCF_XL 3.930000e-11 - CAGCAGTTTCAGGGGTTGGCCACTAGGTGGCAGAG
chr2 26469278 26469313 CTCF_XL 2.760000e-11 - CTGCAATTTTCACAGACTGCCACCAGGTGGCGGTG
chr19 18388551 18388586 CTCF_XL 5.240000e-11 + TTGCACTTGCGGCCGCAAGCCGCCAGGGGGCGCCG
chr1 27097569 27097604 CTCF_XL 8.660000e-12 - GTGCATTTCCTGGATCCAACCACCAGAGGGCAGTG
chr10 116788234 116788269 CTCF_XL 3.830000e-11 - GCGCAGTTCACCAGCTCCACCACCAGAGGGCGGCG

0 comments on commit b9f4950

Please sign in to comment.