From 7a1a8991d455d8eb8767f09b3ea80356ddf42ca3 Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Mon, 7 Aug 2023 16:38:40 +0200 Subject: [PATCH] feat: primer removal process now works with reference-specific primer coordinates --- AmpliGone/cut_reads.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/AmpliGone/cut_reads.py b/AmpliGone/cut_reads.py index 518a064..664f722 100644 --- a/AmpliGone/cut_reads.py +++ b/AmpliGone/cut_reads.py @@ -70,16 +70,21 @@ def CutReads( workers, ): Frame, _threadnumber = data - RVSet = set() - FWSet = set() - for _, start, end, strand in primer_df[["start", "end", "strand"]].itertuples(): + RVDict = {} + FWDict = {} + + reference_ids = set(primer_df["ref"].unique()) + for refid in reference_ids: + RVDict[refid] = set() + FWDict[refid] = set() + + for _, refid, start, end, strand in primer_df[["ref", "start", "end", "strand"]].itertuples(): + for coord in range(start + 1, end): # +1 because reference is 1-based if strand == "+": - FWSet.add(coord) + FWDict[refid].add(coord) elif strand == "-": - RVSet.add(coord) - FWList = tuple(FWSet) # Since tuples are hashable - RVList = tuple(RVSet) + RVDict[refid].add(coord) Aln = mp.Aligner( reference, @@ -127,6 +132,11 @@ def CutReads( previous_seq = seq + # Fetch the primer coordinates that correspond to the reference that the read maps to + # we're using tuples here because they are hashable + FWTuple = tuple(FWDict[hit.ctg]) + RVTuple = tuple(RVDict[hit.ctg]) + qstart = hit.q_st qend = hit.q_en @@ -139,7 +149,7 @@ def CutReads( seq, qual, PositionNeedsCutting=PositionInOrBeforePrimer, - primer_list=FWList, + primer_list=FWTuple, position_on_reference=hit.r_st, cut_direction=1, read_direction=hit.strand, @@ -159,7 +169,7 @@ def CutReads( seq, qual, PositionNeedsCutting=PositionInOrAfterPrimer, - primer_list=RVList, + primer_list=RVTuple, position_on_reference=hit.r_en, cut_direction=-1, read_direction=hit.strand,