Cleanup and bugfixes pre-0.4.0:
* Removed OrderedDict remnants: #113

* pairtools dedup: fixed a Cython line forgotten in a recent PR: #107

* header: new append_columns function

* pairtools restrict: header improvement (restriction fragment columns are now declared in #columns)

* pairtools stats: fixed a dataframe slicing warning

* np.genfromtxt warning resolved: #71
agalitsyna committed Apr 7, 2022
1 parent 7e4712f commit a31f52d
Showing 6 changed files with 57 additions and 32 deletions.
6 changes: 2 additions & 4 deletions pairtools/_dedup.pyx
@@ -112,9 +112,6 @@ def mark_duplicates(
             high = low + 1  # restart high
             continue
-
-
-
         if methodid == 0:
             extraCondition = (max(abs(p1[low] - p1[high]),
                                   abs(p2[low] - p2[high])) <= max_mismatch)
@@ -202,11 +199,12 @@ cdef class OnlineDuplicateDetector(object):
         self.rm = self.rm[self.low:]
         self.high = self.high-self.low
         self.N = self.N - self.low
-        self.low = 0
         if self.returnData == 1:
+            self.low = 0
             return ret
         if self.keep_parent_id == 1:  # Return parent readIDs alongside with duplicates mask:
             pastidx = self.parent_idxs[:self.low]
+            self.low = 0
             return pastrm, pastidx
         return pastrm

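The change in finish() above is subtle: in the old code, self.low was reset to zero before parent_idxs[:self.low] was sliced, so the parent readIDs of duplicates always came back as an empty array. The fix moves the reset after the slice in each return branch. A plain-Python sketch of the failure mode, with illustrative values rather than the real Cython class:

import numpy as np

# Illustrative stand-ins for the detector's internal state; not pairtools API.
parent_idxs = np.array([10, 11, 12, 13, 14])

# Old order of operations: reset the cursor, then slice -> always empty.
low = 3          # entries resolved so far
low = 0
print(parent_idxs[:low])    # [] -- parent readIDs lost

# Fixed order: slice first, then reset the cursor for the next push().
low = 3
pastidx = parent_idxs[:low]
low = 0
print(pastidx)              # [10 11 12] -- parent readIDs preserved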
43 changes: 31 additions & 12 deletions pairtools/_headerops.py
@@ -1,4 +1,4 @@
-from collections import OrderedDict, defaultdict
+from collections import defaultdict
 import sys
 import copy
 import itertools
@@ -12,7 +12,8 @@


 PAIRS_FORMAT_VERSION = "1.0.0"
-
+SEP_COLS = " "
+SEP_CHROMS = " "

 def get_header(instream, comment_char="#"):
     """Returns a header from the stream and the remainder of the stream
@@ -92,7 +93,7 @@ def extract_column_names(header):
     columns = extract_fields(header, "columns")

     if len(columns) != 0:
-        return columns[0].split(" ")
+        return columns[0].split(SEP_COLS)
     else:
         return []

@@ -103,7 +104,7 @@ def extract_chromsizes(header):
     """

     chromsizes_str = extract_fields(header, "chromsize")
-    chromsizes_str = list(zip(*[s.split(" ") for s in chromsizes_str]))
+    chromsizes_str = list(zip(*[s.split(SEP_CHROMS) for s in chromsizes_str]))
     chromsizes = pd.Series(data=chromsizes_str[1], index=chromsizes_str[0]).astype(
         np.int64
     )
@@ -112,10 +113,10 @@


 def get_chromsizes_from_pysam_header(samheader):
-    """Convert pysam header to pairtools chromosomes (Ordered dict).
+    """Convert pysam header to pairtools chromosomes dict (ordered by Python default since 3.7).
     Example of pysam header converted to dict:
-    OrderedDict([
+    dict([
         ('SQ', [{'SN': 'chr1', 'LN': 248956422},
                 {'SN': 'chr10', 'LN': 133797422},
                 {'SN': 'chr11', 'LN': 135086622},
@@ -125,15 +126,15 @@ def get_chromsizes_from_pysam_header(samheader):
     """
     SQs = samheader.to_dict()["SQ"]
     chromsizes = [(sq["SN"], int(sq["LN"])) for sq in SQs]
-    return OrderedDict(chromsizes)
+    return dict(chromsizes)


 def get_chrom_order(chroms_file, sam_chroms=None):
     """
     Produce an "enumeration" of chromosomes based on the list
     of chromosomes
     """
-    chrom_enum = OrderedDict()
+    chrom_enum = dict()
     i = 1
     with open(chroms_file, "rt") as f:
         for line in f:
@@ -176,7 +177,7 @@ def make_standard_pairsheader(
     for chrom, length in chromsizes:
         header.append("#chromsize: {} {}".format(chrom, length))

-    header.append("#columns: " + " ".join(columns))
+    header.append("#columns: " + SEP_COLS.join(columns))

     return header

@@ -188,7 +189,7 @@ def subset_chroms_in_pairsheader(header, chrom_subset):
             if line.strip().split()[1] in chrom_subset:
                 new_header.append(line)
         elif line.startswith("#chromosomes:"):
-            line = " ".join(
+            line = SEP_CHROMS.join(
                 ["#chromosomes:"]
                 + [c for c in line.strip().split()[1:] if c in chrom_subset]
             )
@@ -225,8 +226,8 @@ def mark_header_as_sorted(header):
         header.insert(0, "#sorted: chr1-chr2-pos1-pos2")
     for i in range(len(header)):
         if header[i].startswith("#chromosomes"):
-            chroms = header[i][12:].strip().split(" ")
-            header[i] = "#chromosomes: {}".format(" ".join(sorted(chroms)))
+            chroms = header[i][12:].strip().split(SEP_CHROMS)
+            header[i] = "#chromosomes: {}".format(SEP_CHROMS.join(sorted(chroms)))
     return header


@@ -609,6 +610,24 @@ def merge_headers(headers, force=False):
     return new_header


+def append_columns(header, columns):
+    """
+    Appends columns to the header, separated by SEP_COLS
+
+    Parameters
+    ----------
+    header: Previous header
+    columns: List of column names to append
+
+    Returns
+    -------
+    Modified header (appended columns to the field "#columns")
+    """
+    for i in range(len(header)):
+        if header[i].startswith("#columns: "):
+            header[i] += SEP_COLS + SEP_COLS.join(columns)
+    return header
+
 # def _guess_genome_assembly(samheader):
 #     PG = [l for l in samheader if l.startswith('@PG') and '\tID:bwa' in l][0]
 #     CL = [field for field in PG.split('\t') if field.startswith('CL:')]
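A quick usage sketch for the new append_columns (the function body is copied from the diff above so the snippet runs standalone; the header line is an example pairs header):

SEP_COLS = " "

def append_columns(header, columns):
    # Extend the "#columns" field of a pairs header in place.
    for i in range(len(header)):
        if header[i].startswith("#columns: "):
            header[i] += SEP_COLS + SEP_COLS.join(columns)
    return header

header = ["#columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type"]
header = append_columns(header, ["parent_readID"])
print(header[0])
# #columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type parent_readID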
6 changes: 4 additions & 2 deletions pairtools/pairtools_dedup.py
@@ -401,8 +401,8 @@ def dedup_py(
     outstream.writelines((l + "\n" for l in header))
     if send_header_to_dup and outstream_dups and (outstream_dups != outstream):
         dups_header = header
-        if keep_parent_id:
-            dups_header[-1] += " parent_readID"
+        if keep_parent_id and len(dups_header) > 0:
+            dups_header = _headerops.append_columns(dups_header, ["parent_readID"])
         outstream_dups.writelines((l + "\n" for l in dups_header))
     if (
         outstream_unmapped
@@ -911,6 +911,7 @@ def streaming_dedup_cython(
                     ar(s1, 32),
                     ar(s2, 32),
                 )
+
             else:
                 res = dd.push(
                     ar(c1, 32),
@@ -925,6 +926,7 @@
         if keep_parent_id:
             res_tmp, parents_tmp = dd.finish()
             parents = np.concatenate([parents, parents_tmp])
+
         else:
             res_tmp = dd.finish()
         res = np.concatenate([res, res_tmp])
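For context, the control flow around these two hunks: chunks of pairs are push()ed into the online detector inside the loop, and dd.finish() flushes whatever is still buffered once the stream ends; with keep_parent_id, the duplicate mask comes back together with parent readIDs. A runnable sketch of the accumulation pattern, using a hypothetical stub in place of OnlineDuplicateDetector:

import numpy as np

class StubDetector:
    # Hypothetical stand-in for OnlineDuplicateDetector, only to show the
    # push/finish accumulation pattern; the real class resolves duplicates
    # online and returns the mask for entries it could already decide.
    def __init__(self):
        self._mask = np.zeros(0, dtype=np.int8)
        self._parents = np.zeros(0, dtype=np.int64)

    def push(self, mask_chunk, parent_chunk):
        self._mask = np.concatenate([self._mask, mask_chunk])
        self._parents = np.concatenate([self._parents, parent_chunk])
        return np.zeros(0, dtype=np.int8)   # pretend nothing resolves mid-stream

    def finish(self):
        # Flush everything still buffered, as dd.finish() does above.
        return self._mask, self._parents

dd = StubDetector()
res = np.zeros(0, dtype=np.int8)
parents = np.zeros(0, dtype=np.int64)

for mask_chunk, parent_chunk in [
    (np.array([0, 1], dtype=np.int8), np.array([100, 100])),
    (np.array([0], dtype=np.int8), np.array([101])),
]:
    res = np.concatenate([res, dd.push(mask_chunk, parent_chunk)])

res_tmp, parents_tmp = dd.finish()          # keep_parent_id branch
parents = np.concatenate([parents, parents_tmp])
res = np.concatenate([res, res_tmp])
print(res, parents)                         # [0 1 0] [100 100 101]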
1 change: 0 additions & 1 deletion pairtools/pairtools_parse.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-from collections import OrderedDict
 import subprocess
 import fileinput
 import itertools
12 changes: 10 additions & 2 deletions pairtools/pairtools_restrict.py
@@ -59,9 +59,17 @@ def restrict_py(pairs_path, frags, output, **kwargs):

     header, body_stream = _headerops.get_header(instream)
     header = _headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME)
+    header = _headerops.append_columns(header,
+        ['rfrag1', 'rfrag_start1', 'rfrag_end1',
+         'rfrag2', 'rfrag_start2', 'rfrag_end2'])
+
     outstream.writelines((l+'\n' for l in header))
     rfrags = np.genfromtxt(
-        frags, delimiter='\t', comments='#', dtype=None,
+        frags,
+        delimiter='\t',
+        comments='#',
+        dtype=None,
+        encoding='ascii',
         names=['chrom', 'start', 'end'])


@@ -92,7 +100,7 @@ def restrict_py(pairs_path, frags, output, **kwargs):


 def find_rfrag(rfrags, chrom, pos):
-    rsites_chrom = rfrags[chrom.encode('ascii')]
+    rsites_chrom = rfrags[chrom]
     idx = min(max(0,rsites_chrom.searchsorted(pos, 'right')-1), len(rsites_chrom)-2)
     return idx, rsites_chrom[idx], rsites_chrom[idx+1]

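Two things happen in this file at once: passing encoding='ascii' silences the np.genfromtxt deprecation warning (#71), and as a side effect genfromtxt now yields str chromosome names instead of bytes, which is why find_rfrag can drop the chrom.encode('ascii') workaround. A self-contained sketch with made-up fragment data:

import io
import numpy as np

frags = io.StringIO("chr1\t0\t100\nchr1\t100\t250\nchr1\t250\t400\n")
rfrags = np.genfromtxt(
    frags, delimiter='\t', comments='#', dtype=None,
    encoding='ascii', names=['chrom', 'start', 'end'])
print(rfrags['chrom'][0])                  # 'chr1' as str, not b'chr1'

# The same searchsorted arithmetic as find_rfrag, on one chromosome's
# sorted fragment boundaries:
rsites = np.array([0, 100, 250, 400])
pos = 120
idx = min(max(0, rsites.searchsorted(pos, 'right') - 1), len(rsites) - 2)
print(idx, rsites[idx], rsites[idx + 1])   # 1 100 250: pos falls in [100, 250)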
21 changes: 10 additions & 11 deletions pairtools/pairtools_stats.py
@@ -452,9 +452,7 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"):
         self._stat["total_unmapped"] += int(unmapped_count)

         # Count the mapped:
-        df_mapped = df[
-            (df["chrom1"] != unmapped_chrom) & (df["chrom2"] != unmapped_chrom)
-        ]
+        df_mapped = df.loc[(df["chrom1"] != unmapped_chrom) & (df["chrom2"] != unmapped_chrom), :]
         mapped_count = df_mapped.shape[0]

         self._stat["total_mapped"] += mapped_count
@@ -480,15 +478,16 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"):
         )

         # Count cis-trans by pairs:
-        df_nodups = df_mapped[~mask_dups]
-        cis = df_nodups[df_nodups["chrom1"] == df_nodups["chrom2"]]
-        self._stat["cis"] += cis.shape[0]
-        self._stat["trans"] += df_nodups.shape[0] - cis.shape[0]
-        dist = np.abs(cis["pos2"].values - cis["pos1"].values)
-
-        cis.loc[:, "bin_idx"] = np.searchsorted(self._dist_bins, dist, "right") - 1
+        df_nodups = df_mapped.loc[~mask_dups, :]
+        mask_cis = df_nodups["chrom1"] == df_nodups["chrom2"]
+        df_cis = df_nodups.loc[mask_cis, :].copy()
+        self._stat["cis"] += df_cis.shape[0]
+        self._stat["trans"] += df_nodups.shape[0] - df_cis.shape[0]
+        dist = np.abs(df_cis["pos2"].values - df_cis["pos1"].values)
+
+        df_cis.loc[:, "bin_idx"] = np.searchsorted(self._dist_bins, dist, "right") - 1
         for (strand1, strand2, bin_id), strand_bin_count in (
-            cis[["strand1", "strand2", "bin_idx"]].value_counts().items()
+            df_cis[["strand1", "strand2", "bin_idx"]].value_counts().items()
         ):
             self._stat["dist_freq"][strand1 + strand2][bin_id] += strand_bin_count
         self._stat["cis_1kb+"] += int(np.sum(dist >= 1000))
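The slicing fix in context: the old chained indexing (df_mapped[~mask_dups], then cis[...], then an assignment through cis.loc) is the pattern that makes pandas emit its SettingWithCopyWarning, because pandas cannot tell whether the write lands on a view of the original frame or on a copy. Selecting once with .loc and taking an explicit .copy() gives df_cis an owned frame that is safe to assign to. A minimal reproduction with made-up pairs (the bin edges stand in for self._dist_bins):

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "chrom1": ["chr1", "chr1", "chr2"],
    "chrom2": ["chr1", "chr2", "chr2"],
    "pos1": [100, 200, 300],
    "pos2": [1500, 400, 350],
})

# Old pattern: chained selection, then assignment through the result.
cis = df[df["chrom1"] == df["chrom2"]]
# cis.loc[:, "bin_idx"] = ...   # warns: pandas cannot tell view from copy

# New pattern: one .loc selection plus .copy() yields an owned frame.
df_cis = df.loc[df["chrom1"] == df["chrom2"], :].copy()
dist = np.abs(df_cis["pos2"].values - df_cis["pos1"].values)
df_cis.loc[:, "bin_idx"] = np.searchsorted([0, 10, 1000, 10000], dist, "right") - 1
print(df_cis)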
