From 94a77013440a22be6b3a14b91f9516425f4f86eb Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Sat, 9 Mar 2024 15:30:24 +0100 Subject: [PATCH 1/8] refactor parse with read_alignment_block() --- pairtools/lib/parse.py | 195 +++++++++++++++++++++++------------------ 1 file changed, 108 insertions(+), 87 deletions(-) diff --git a/pairtools/lib/parse.py b/pairtools/lib/parse.py index a144fa46..7c907df5 100644 --- a/pairtools/lib/parse.py +++ b/pairtools/lib/parse.py @@ -38,6 +38,35 @@ from .parse_pysam import get_mismatches_c +def group_alignments_by_side(sams): + return [sam for sam in sams if sam.is_read1], [sam for sam in sams if sam.is_read2] + + +def read_alignment_block(instream, sort=True, group_by_side=True, return_readID=True): + sams = [] + + prev_readID = None + while True: + sam_entry = next(instream, None) + readID = sam_entry.query_name if sam_entry else None + + # Read is fully populated, then parse and write: + if not (sam_entry) or ((readID != prev_readID) and prev_readID): + if sort: + sams = sorted(sams, key=lambda a: (a.is_read2, a.query_alignment_start)) + out = sams if not group_by_side else group_alignments_by_side(sams) + out = out if not return_readID else (prev_readID, out) + yield out + + sams.clear() + + if sam_entry is None: + break + else: + sams.append(sam_entry) + prev_readID = readID + + def streaming_classify( instream, outstream, chromosomes, out_alignments_stream, out_stat, **kwargs ): @@ -105,102 +134,94 @@ def streaming_classify( ### Iterate over input pysam: instream = iter(instream) - while sam_entry is not None: - sam_entry = next(instream, None) - - readID = sam_entry.query_name if sam_entry else None - if readID_transform is not None and readID is not None: + for (readID, (sams1, sams2)) in read_alignment_block(instream, sort=True, group_by_side=True, return_readID=True): + if readID_transform is not None: readID = eval(readID_transform) - # Read is fully populated, then parse and write: - if not (sam_entry) or ((readID != prev_readID) and prev_readID): + ### Parse + if not parse2: # regular parser: + pairstream, all_algns1, all_algns2 = parse_read( + sams1, + sams2, + min_mapq=kwargs["min_mapq"], + max_molecule_size=kwargs["max_molecule_size"], + max_inter_align_gap=kwargs["max_inter_align_gap"], + walks_policy=kwargs["walks_policy"], + sam_tags=sam_tags, + store_seq=store_seq, + report_mismatches=True if "mismatches" in add_columns else False, + ) + else: # parse2 parser: + pairstream, all_algns1, all_algns2 = parse2_read( + sams1, + sams2, + min_mapq=kwargs["min_mapq"], + max_inter_align_gap=kwargs["max_inter_align_gap"], + max_insert_size=kwargs.get("max_insert_size", 500), + single_end=kwargs["single_end"], + report_position=kwargs["report_position"], + report_orientation=kwargs["report_orientation"], + sam_tags=sam_tags, + dedup_max_mismatch=kwargs["dedup_max_mismatch"], + store_seq=store_seq, + expand=kwargs["expand"], + max_expansion_depth=kwargs["max_expansion_depth"], + report_mismatches=True if "mismatches" in add_columns else False, + ) - ### Parse - if not parse2: # regular parser: - pairstream, all_algns1, all_algns2 = parse_read( - sams1, - sams2, - min_mapq=kwargs["min_mapq"], - max_molecule_size=kwargs["max_molecule_size"], - max_inter_align_gap=kwargs["max_inter_align_gap"], - walks_policy=kwargs["walks_policy"], - sam_tags=sam_tags, - store_seq=store_seq, - report_mismatches=True if "mismatches" in add_columns else False, - ) - else: # parse2 parser: - pairstream, all_algns1, all_algns2 = parse2_read( - sams1, - sams2, - min_mapq=kwargs["min_mapq"], - max_inter_align_gap=kwargs["max_inter_align_gap"], - max_insert_size=kwargs.get("max_insert_size", 500), - single_end=kwargs["single_end"], - report_position=kwargs["report_position"], - report_orientation=kwargs["report_orientation"], - sam_tags=sam_tags, - dedup_max_mismatch=kwargs["dedup_max_mismatch"], - store_seq=store_seq, - expand=kwargs["expand"], - max_expansion_depth=kwargs["max_expansion_depth"], - report_mismatches=True if "mismatches" in add_columns else False, - ) + ### Write: + read_has_alignments = False + for (algn1, algn2, pair_index) in pairstream: + read_has_alignments = True - ### Write: - read_has_alignments = False - for (algn1, algn2, pair_index) in pairstream: - read_has_alignments = True + # Alignment end defaults to 5' if report_alignment_end is unspecified: + if kwargs.get("report_alignment_end", "5") == "5": + algn1["pos"] = algn1["pos5"] + algn2["pos"] = algn2["pos5"] + else: + algn1["pos"] = algn1["pos3"] + algn2["pos"] = algn2["pos3"] + + if kwargs["flip"]: + flip_pair = not check_pair_order(algn1, algn2, chrom_enum) + if flip_pair: + algn1, algn2 = algn2, algn1 + sams1, sams2 = sams2, sams1 + + write_pairsam( + algn1, + algn2, + readID=prev_readID, + pair_index=pair_index, + sams1=sams1, + sams2=sams2, + out_file=outstream, + drop_readid=kwargs["drop_readid"], + drop_seq=kwargs["drop_seq"], + drop_sam=kwargs["drop_sam"], + add_pair_index=kwargs["add_pair_index"], + add_columns=add_columns, + ) - # Alignment end defaults to 5' if report_alignment_end is unspecified: - if kwargs.get("report_alignment_end", "5") == "5": - algn1["pos"] = algn1["pos5"] - algn2["pos"] = algn2["pos5"] - else: - algn1["pos"] = algn1["pos3"] - algn2["pos"] = algn2["pos3"] - - if kwargs["flip"]: - flip_pair = not check_pair_order(algn1, algn2, chrom_enum) - if flip_pair: - algn1, algn2 = algn2, algn1 - sams1, sams2 = sams2, sams1 - - write_pairsam( - algn1, - algn2, - readID=prev_readID, - pair_index=pair_index, - sams1=sams1, - sams2=sams2, - out_file=outstream, - drop_readid=kwargs["drop_readid"], - drop_seq=kwargs["drop_seq"], - drop_sam=kwargs["drop_sam"], - add_pair_index=kwargs["add_pair_index"], - add_columns=add_columns, + # add a pair to PairCounter for stats output: + if out_stat: + out_stat.add_pair( + algn1["chrom"], + int(algn1["pos"]), + algn1["strand"], + algn2["chrom"], + int(algn2["pos"]), + algn2["strand"], + algn1["type"] + algn2["type"], ) - # add a pair to PairCounter for stats output: - if out_stat: - out_stat.add_pair( - algn1["chrom"], - int(algn1["pos"]), - algn1["strand"], - algn2["chrom"], - int(algn2["pos"]), - algn2["strand"], - algn1["type"] + algn2["type"], - ) + # write all alignments: + if out_alignments_stream and read_has_alignments: + write_all_algnments( + prev_readID, all_algns1, all_algns2, out_alignments_stream + ) - # write all alignments: - if out_alignments_stream and read_has_alignments: - write_all_algnments( - prev_readID, all_algns1, all_algns2, out_alignments_stream - ) - # Empty read after writing: - sams1.clear() - sams2.clear() if sam_entry is not None: push_pysam(sam_entry, sams1, sams2) From 2e444638f5de2aaf1feef17c48221fd660102ac8 Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Mon, 25 Mar 2024 10:29:59 +0100 Subject: [PATCH 2/8] parse: deprecate lowering pair_type in walks --- pairtools/lib/parse.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pairtools/lib/parse.py b/pairtools/lib/parse.py index 7c907df5..13640094 100644 --- a/pairtools/lib/parse.py +++ b/pairtools/lib/parse.py @@ -580,14 +580,14 @@ def parse_read( hic_algn2 = algn break - # lower-case reported walks on the chimeric side + # DEPRECATED: lower-case reported walks on the chimeric side if walks_policy != "mask": if is_chimeric_1: hic_algn1 = dict(hic_algn1) - hic_algn1["type"] = hic_algn1["type"].lower() + # hic_algn1["type"] = hic_algn1["type"].lower() if is_chimeric_2: hic_algn2 = dict(hic_algn2) - hic_algn2["type"] = hic_algn2["type"].lower() + # hic_algn2["type"] = hic_algn2["type"].lower() else: raise ValueError(f"Walks policy {walks_policy} is not supported.") From 414db7267d1cc12cbec05cd0540ea0f876a393d6 Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Mon, 25 Mar 2024 10:34:38 +0100 Subject: [PATCH 3/8] parse: finish switching to read_alignment_block() --- pairtools/lib/parse.py | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/pairtools/lib/parse.py b/pairtools/lib/parse.py index 13640094..99a90067 100644 --- a/pairtools/lib/parse.py +++ b/pairtools/lib/parse.py @@ -122,16 +122,6 @@ def streaming_classify( if readID_transform is not None: readID_transform = compile(readID_transform, "", "eval") - ### Prepare for iterative parsing of the input stream - # Each read is represented by readID, sams1 (left alignments) and sams2 (right alignments) - readID = "" # Read id of the current read - sams1 = [] # Placeholder for the left alignments - sams2 = [] # Placeholder for the right alignments - # Each read is comprised of multiple alignments, or sam entries: - sam_entry = "" # Placeholder for each aligned segment - # Keep the id of the previous sam entry to detect when the read is completely populated: - prev_readID = "" # Placeholder for the read id - ### Iterate over input pysam: instream = iter(instream) for (readID, (sams1, sams2)) in read_alignment_block(instream, sort=True, group_by_side=True, return_readID=True): @@ -191,7 +181,7 @@ def streaming_classify( write_pairsam( algn1, algn2, - readID=prev_readID, + readID=readID, pair_index=pair_index, sams1=sams1, sams2=sams2, @@ -218,16 +208,10 @@ def streaming_classify( # write all alignments: if out_alignments_stream and read_has_alignments: write_all_algnments( - prev_readID, all_algns1, all_algns2, out_alignments_stream + readID, all_algns1, all_algns2, out_alignments_stream ) - - if sam_entry is not None: - push_pysam(sam_entry, sams1, sams2) - prev_readID = readID - - ############################ ### Alignment utilities: ### ############################ From 5d08a415487a66617e5cb3232190547f029543d2 Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Mon, 25 Mar 2024 11:01:45 +0100 Subject: [PATCH 4/8] parse: refactor alignment list sorting/tagging / insertion of empty algns --- pairtools/lib/parse.py | 121 ++++++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 55 deletions(-) diff --git a/pairtools/lib/parse.py b/pairtools/lib/parse.py index 99a90067..f6382c20 100644 --- a/pairtools/lib/parse.py +++ b/pairtools/lib/parse.py @@ -392,6 +392,66 @@ def flip_alignment(hic_algn): return hic_algn + +def _convert_gaps_into_alignments(sorted_algns, max_inter_align_gap): + """ + Inplace conversion of gaps longer than max_inter_align_gap into alignments + """ + if (len(sorted_algns) == 1) and (not sorted_algns[0]["is_mapped"]): + return + + last_5_pos = 0 + for i in range(len(sorted_algns)): + algn = sorted_algns[i] + if algn["dist_to_5"] - last_5_pos > max_inter_align_gap: + new_algn = empty_alignment() + new_algn["dist_to_5"] = last_5_pos + new_algn["algn_read_span"] = algn["dist_to_5"] - last_5_pos + new_algn["read_len"] = algn["read_len"] + new_algn["dist_to_3"] = new_algn["read_len"] - algn["dist_to_5"] + + last_5_pos = algn["dist_to_5"] + algn["algn_read_span"] + + sorted_algns.insert(i, new_algn) + i += 2 + else: + last_5_pos = max(last_5_pos, algn["dist_to_5"] + algn["algn_read_span"]) + i += 1 + + +def normalize_alignment_list(algns, side, sort_by="dist_to_5", max_inter_align_gap=None): + """ + Normalize the alignment list: insert empty alignments in gaps between alignments, + sort by distance to the 5' end, add read side, alignment index. + + Args: + algns (list): The list of alignments. + side (str): The side of the alignment. + sort_by (str, optional): The key to sort the alignments by. Defaults to "dist_to_5". + max_inter_align_gap (int, optional): The maximum allowed gap between alignments. Defaults to None. + + Returns: + list: The normalized alignment list. + + """ + if len(algns) == 0: + algns = [empty_alignment()] + + if max_inter_align_gap is not None: + _convert_gaps_into_alignments(algns, max_inter_align_gap) + + if sort_by: + algns = sorted(algns, key=lambda algn: algn[sort_by]) + + + for i, algn in enumerate(algns): + algn["read_side"] = side + algn["align_idx"] = i + algn["tot_align_count"] = len(algns) + + return algns + + def flip_orientation(hic_algn): """ Flip orientation of a single alignment @@ -475,18 +535,9 @@ def parse_read( for sam in sams2 ] - if len(algns1) > 0: - algns1 = sorted(algns1, key=lambda algn: algn["dist_to_5"]) - else: - algns1 = [empty_alignment()] # Empty alignment dummy - if len(algns2) > 0: - algns2 = sorted(algns2, key=lambda algn: algn["dist_to_5"]) - else: - algns2 = [empty_alignment()] # Empty alignment dummy + algns1 = normalize_alignment_list(algns1, 1, sort_by="dist_to_5", max_inter_align_gap=max_inter_align_gap) + algns2 = normalize_alignment_list(algns2, 2, sort_by="dist_to_5", max_inter_align_gap=max_inter_align_gap) - if max_inter_align_gap is not None: - _convert_gaps_into_alignments(algns1, max_inter_align_gap) - _convert_gaps_into_alignments(algns2, max_inter_align_gap) # By default, assume each molecule is a single pair with single unconfirmed pair: hic_algn1 = algns1[0] @@ -619,10 +670,8 @@ def parse2_read( ) for sam in sams2 # note sams2, that's how these reads are typically parsed ] - algns1 = sorted(algns1, key=lambda algn: algn["dist_to_5"]) - if max_inter_align_gap is not None: - _convert_gaps_into_alignments(algns1, max_inter_align_gap) - + + algns1 = normalize_alignment_list(algns1, 1, sort_by="dist_to_5", max_inter_align_gap=max_inter_align_gap) algns2 = [empty_alignment()] # Empty alignment dummy if len(algns1) > 1: @@ -678,20 +727,8 @@ def parse2_read( for sam in sams2 ] - # Sort alignments by the distance to the 5'-end: - if len(algns1) > 0: - algns1 = sorted(algns1, key=lambda algn: algn["dist_to_5"]) - else: - algns1 = [empty_alignment()] # Empty alignment dummy - if len(algns2) > 0: - algns2 = sorted(algns2, key=lambda algn: algn["dist_to_5"]) - else: - algns2 = [empty_alignment()] # Empty alignment dummy - - # Convert alignment gaps to alignments: - if max_inter_align_gap is not None: - _convert_gaps_into_alignments(algns1, max_inter_align_gap) - _convert_gaps_into_alignments(algns2, max_inter_align_gap) + algns1 = normalize_alignment_list(algns1, 1, sort_by="dist_to_5", max_inter_align_gap=max_inter_align_gap) + algns2 = normalize_alignment_list(algns2, 2, sort_by="dist_to_5", max_inter_align_gap=max_inter_align_gap) is_chimeric_1 = len(algns1) > 1 is_chimeric_2 = len(algns2) > 1 @@ -829,32 +866,6 @@ def rescue_walk(algns1, algns2, max_molecule_size): return None -def _convert_gaps_into_alignments(sorted_algns, max_inter_align_gap): - """ - Inplace conversion of gaps longer than max_inter_align_gap into alignments - """ - if (len(sorted_algns) == 1) and (not sorted_algns[0]["is_mapped"]): - return - - last_5_pos = 0 - for i in range(len(sorted_algns)): - algn = sorted_algns[i] - if algn["dist_to_5"] - last_5_pos > max_inter_align_gap: - new_algn = empty_alignment() - new_algn["dist_to_5"] = last_5_pos - new_algn["algn_read_span"] = algn["dist_to_5"] - last_5_pos - new_algn["read_len"] = algn["read_len"] - new_algn["dist_to_3"] = new_algn["read_len"] - algn["dist_to_5"] - - last_5_pos = algn["dist_to_5"] + algn["algn_read_span"] - - sorted_algns.insert(i, new_algn) - i += 2 - else: - last_5_pos = max(last_5_pos, algn["dist_to_5"] + algn["algn_read_span"]) - i += 1 - - def parse_complex_walk( algns1, algns2, From f61451ba679abc13fa173a583cf9a5de6a55ab68 Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Mon, 25 Mar 2024 12:06:32 +0100 Subject: [PATCH 5/8] parse: update align index col names, update pairsam_format --- pairtools/lib/pairsam_format.py | 7 +++++++ pairtools/lib/parse.py | 5 ++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pairtools/lib/pairsam_format.py b/pairtools/lib/pairsam_format.py index a472e9f5..7383bab1 100644 --- a/pairtools/lib/pairsam_format.py +++ b/pairtools/lib/pairsam_format.py @@ -100,6 +100,10 @@ "dist_to_3", "seq", "mismatches", # Format: "{ref_letter}:{mut_letter}:{phred}:{ref_position}:{read_position}" + "read_side", + "algn_idx", + "same_side_algn_count" + ] DTYPES_EXTRA_COLUMNS = { @@ -115,4 +119,7 @@ "dist_to_3": int, "seq": str, "mismatches": str, + "read_side": int, + "algn_idx": int, + "same_side_algn_count": int, } diff --git a/pairtools/lib/parse.py b/pairtools/lib/parse.py index f6382c20..12da73bf 100644 --- a/pairtools/lib/parse.py +++ b/pairtools/lib/parse.py @@ -443,11 +443,10 @@ def normalize_alignment_list(algns, side, sort_by="dist_to_5", max_inter_align_g if sort_by: algns = sorted(algns, key=lambda algn: algn[sort_by]) - for i, algn in enumerate(algns): algn["read_side"] = side - algn["align_idx"] = i - algn["tot_align_count"] = len(algns) + algn["algn_idx"] = i + algn["same_side_algn_count"] = len(algns) return algns From e472c97724b4e7bc8f0a8625349b10ef523d60f3 Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Mon, 25 Mar 2024 12:28:09 +0100 Subject: [PATCH 6/8] api.parse: rearrange code --- pairtools/lib/parse.py | 43 +++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/pairtools/lib/parse.py b/pairtools/lib/parse.py index 12da73bf..f09caadd 100644 --- a/pairtools/lib/parse.py +++ b/pairtools/lib/parse.py @@ -392,6 +392,28 @@ def flip_alignment(hic_algn): return hic_algn +def flip_orientation(hic_algn): + """ + Flip orientation of a single alignment + :param hic_algn: Alignment to be modified + :return: + """ + hic_algn = dict(hic_algn) # overwrite the variable with the copy of dictionary + hic_algn["strand"] = "+" if (hic_algn["strand"] == "-") else "-" + return hic_algn + + +def flip_position(hic_algn): + """ + Flip ends of a single alignment + :param hic_algn: Alignment to be modified + :return: + """ + hic_algn = dict(hic_algn) # overwrite the variable with the copy of dictionary + hic_algn["pos5"], hic_algn["pos3"] = hic_algn["pos3"], hic_algn["pos5"] + return hic_algn + + def _convert_gaps_into_alignments(sorted_algns, max_inter_align_gap): """ @@ -451,27 +473,6 @@ def normalize_alignment_list(algns, side, sort_by="dist_to_5", max_inter_align_g return algns -def flip_orientation(hic_algn): - """ - Flip orientation of a single alignment - :param hic_algn: Alignment to be modified - :return: - """ - hic_algn = dict(hic_algn) # overwrite the variable with the copy of dictionary - hic_algn["strand"] = "+" if (hic_algn["strand"] == "-") else "-" - return hic_algn - - -def flip_position(hic_algn): - """ - Flip ends of a single alignment - :param hic_algn: Alignment to be modified - :return: - """ - hic_algn = dict(hic_algn) # overwrite the variable with the copy of dictionary - hic_algn["pos5"], hic_algn["pos3"] = hic_algn["pos3"], hic_algn["pos5"] - return hic_algn - #################### ### Parsing utilities: From 1e5459cd2ebf5406294797da79760c42bb02a8d2 Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Mon, 25 Mar 2024 12:46:58 +0100 Subject: [PATCH 7/8] api.parse: fix recently introduced bugs --- pairtools/lib/parse.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pairtools/lib/parse.py b/pairtools/lib/parse.py index f09caadd..33fb8a13 100644 --- a/pairtools/lib/parse.py +++ b/pairtools/lib/parse.py @@ -459,12 +459,12 @@ def normalize_alignment_list(algns, side, sort_by="dist_to_5", max_inter_align_g if len(algns) == 0: algns = [empty_alignment()] + if sort_by: + algns = sorted(algns, key=lambda algn: algn[sort_by]) + if max_inter_align_gap is not None: _convert_gaps_into_alignments(algns, max_inter_align_gap) - if sort_by: - algns = sorted(algns, key=lambda algn: algn[sort_by]) - for i, algn in enumerate(algns): algn["read_side"] = side algn["algn_idx"] = i From e9250f9ee9adeaede0f25d166a41bcce4e1bd84f Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Sun, 7 Apr 2024 16:07:47 +0200 Subject: [PATCH 8/8] parse: change the default walk policy to 5unique --- pairtools/cli/parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pairtools/cli/parse.py b/pairtools/cli/parse.py index e1eb6741..e9541956 100644 --- a/pairtools/cli/parse.py +++ b/pairtools/cli/parse.py @@ -123,7 +123,7 @@ @click.option( "--walks-policy", type=click.Choice(["mask", "5any", "5unique", "3any", "3unique", "all"]), - default="mask", + default="5unique", help="the policy for reporting unrescuable walks (reads containing more" " than one alignment on one or both sides, that can not be explained by a" " single ligation between two mappable DNA fragments)."