Add summaries #105

Merged · 23 commits · Apr 27, 2022
Changes from 4 commits
pairtools/pairtools_stats.py (106 changes: 81 additions & 25 deletions)

@@ -27,14 +27,14 @@
)
@common_io_options
def stats(input_path, output, merge, **kwargs):
"""Calculate pairs statistics.
"""Calculate pairs statistics.

INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics.
If not provided, the input is read from stdin.
If --merge is specified, then INPUT_PATH is interpreted as an arbitrary number
of stats files to merge.

The files with paths ending with .gz/.lz4 are decompressed by bgzip/lz4c.
"""
stats_py(input_path, output, merge, **kwargs)

@@ -72,9 +72,22 @@ def stats_py(input_path, output, merge, **kwargs):
stats = PairCounter()

# Collecting statistics

- for chunk in pd.read_table(body_stream, names=cols, chunksize=100_000):
-     stats.add_pairs_from_dataframe(chunk)
+ for line in body_stream:
+     cols = line.rstrip().split(_pairsam_format.PAIRSAM_SEP)
+
+     # algn1:
+     chrom1 = cols[_pairsam_format.COL_C1]
+     pos1 = int(cols[_pairsam_format.COL_P1])
+     strand1 = cols[_pairsam_format.COL_S1]
+     # algn2:
+     chrom2 = cols[_pairsam_format.COL_C2]
+     pos2 = int(cols[_pairsam_format.COL_P2])
+     strand2 = cols[_pairsam_format.COL_S2]
+     # pair type:
+     pair_type = cols[_pairsam_format.COL_PTYPE]
+
+     stats.add_pair(chrom1, pos1, strand1, chrom2, pos2, strand2, pair_type)
+ stats.calculate_summaries()

# save statistics to file ...
stats.save(outstream)
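For context, a minimal standalone sketch of the per-line parsing added above. The column order follows the .pairs convention (readID, chrom1, pos1, chrom2, pos2, strand1, strand2, pair_type); the inlined constants and the sample line are invented stand-ins for what _pairsam_format actually provides:

    # Hypothetical stand-ins for the _pairsam_format constants; the sample line is invented.
    PAIRSAM_SEP = "\t"
    COL_C1, COL_P1, COL_C2, COL_P2, COL_S1, COL_S2, COL_PTYPE = 1, 2, 3, 4, 5, 6, 7

    line = "read001\tchr1\t100\tchr1\t5100\t+\t-\tUU\n"
    cols = line.rstrip().split(PAIRSAM_SEP)
    chrom1, pos1, strand1 = cols[COL_C1], int(cols[COL_P1]), cols[COL_S1]
    chrom2, pos2, strand2 = cols[COL_C2], int(cols[COL_P2]), cols[COL_S2]
    assert (chrom1, pos1, strand1, cols[COL_PTYPE]) == ("chr1", 100, "+", "UU")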
@@ -118,6 +131,13 @@ def do_merge(output, files_to_merge, **kwargs):
outstream.close()


def complexity_estimate(total_mapped, total_dups):
try:
return total_mapped ** 2 / total_dups / 2
except ZeroDivisionError:
return 0
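
As an aside on the formula above: if N mapped pairs are sampled from a library of C distinct molecules, the expected number of duplicate pairs for N much smaller than C is roughly N^2 / (2C), so inverting gives the estimate C ≈ N^2 / (2 · dups) that complexity_estimate returns. A quick check with invented counts:

    # Invented counts, for illustration only.
    total_mapped, total_dups = 1_000_000, 5_000
    assert total_mapped ** 2 / total_dups / 2 == 100_000_000.0  # ~1e8 unique molecules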


class PairCounter(Mapping):
"""
A Counter for Hi-C pairs that accumulates various statistics.
@@ -180,6 +200,19 @@ def __init__(self, min_log10_dist=0, max_log10_dist=9, log10_dist_bin_step=0.25):
"--": np.zeros(len(self._dist_bins), dtype=np.int),
"++": np.zeros(len(self._dist_bins), dtype=np.int),
}
# Summaries are derived from other stats and are recalculated on merge
self._stat["summary"] = dict(
[
("frac_cis", 0),
("frac_cis_1kb+", 0),
("frac_cis_2kb+", 0),
("frac_cis_4kb+", 0),
("frac_cis_10kb+", 0),
("frac_cis_20kb+", 0),
("frac_cis_40kb+", 0),
("complexity", 0),
]
)

def __getitem__(self, key):
if isinstance(key, str):
@@ -195,7 +228,7 @@ def __getitem__(self, key):
return self._stat[key]
else:
# clearly an error:
raise ValueError("{} is not a valid key: must be str".format(key))
raise ValueError(f"{key} is not a valid key: must be str")

# K_FIELDS:
# process multi-key case:
@@ -209,9 +242,7 @@ def __getitem__(self, key):
return self._stat[k][k_fields[0]]
else:
raise ValueError(
"{} is not a valid key: {} section implies 1 identifier".format(
key, k
)
f"{key} is not a valid key: {k} section implies 1 identifier"
)

elif k == "chrom_freq":
@@ -220,9 +251,7 @@ def __getitem__(self, key):
return self._stat[k][tuple(k_fields)]
else:
raise ValueError(
"{} is not a valid key: {} section implies 2 identifiers".format(
key, k
)
f"{key} is not a valid key: {k} section implies 2 identifiers"
)

elif k == "dist_freq":
@@ -248,19 +277,39 @@ def __getitem__(self, key):
return self._stat["dist_freq"][dirs][bin_idx]
else:
raise ValueError(
"{} is not a valid key: {} section implies 2 identifiers".format(
key, k
)
f"{key} is not a valid key: {k} section implies 2 identifiers"
)
else:
raise ValueError("{} is not a valid key".format(k))
raise ValueError(f"{k} is not a valid key")

def __iter__(self):
return iter(self._stat)

def __len__(self):
return len(self._stat)

def calculate_summaries(self):
"""calculate summary statistics (fraction of cis pairs at different cutoffs,
complexity estimate) based on accumulated counts. Results are saved into
self._stat['summary']

"""
for cis_count in (
"cis",
"cis_1kb+",
"cis_2kb+",
"cis_4kb+",
"cis_10kb+",
"cis_20kb+",
"cis_40kb+",
):
self._stat["summary"][f"frac_{cis_count}"] = (
self._stat[cis_count] / self._stat["total_nodups"]
)
self._stat["summary"]["complexity"] = complexity_estimate(
self._stat["total_mapped"], self._stat["total_dups"]
)
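
A hypothetical usage sketch of this method (all counts invented; _stat is written to directly only for illustration):

    pc = PairCounter()
    pc._stat["cis"], pc._stat["total_nodups"] = 60, 100
    for key in ("cis_1kb+", "cis_2kb+", "cis_4kb+", "cis_10kb+", "cis_20kb+", "cis_40kb+"):
        pc._stat[key] = 0
    pc._stat["total_mapped"], pc._stat["total_dups"] = 120, 20
    pc.calculate_summaries()
    assert pc._stat["summary"]["frac_cis"] == 0.6
    assert pc._stat["summary"]["complexity"] == 120 ** 2 / 20 / 2  # 360.0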

@classmethod
def from_file(cls, file_handle):
"""create instance of PairCounter from file
@@ -301,14 +350,17 @@ def from_file(cls, file_handle):
)
)
else:
- # in this case key must be in ['pair_types','chrom_freq','dist_freq','dedup']
+ # in this case key must be in ['pair_types','chrom_freq','dist_freq','dedup','summary']
# get the first 'key' and keep the remainders in 'key_fields'
key = key_fields.pop(0)
if key in ["pair_types", "dedup"]:
if key in ["pair_types", "dedup", "summary"]:
# assert there is only one element in key_fields left:
# 'pair_types' and 'dedup' treated the same
if len(key_fields) == 1:
- stat_from_file._stat[key][key_fields[0]] = int(fields[1])
+ try:
+     stat_from_file._stat[key][key_fields[0]] = int(fields[1])
+ except ValueError:
+     stat_from_file._stat[key][key_fields[0]] = float(fields[1])
else:
raise _fileio.ParseError(
"{} is not a valid stats file: {} section implies 1 identifier".format(
@@ -452,7 +504,9 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"):
self._stat["total_unmapped"] += int(unmapped_count)

# Count the mapped:
- df_mapped = df.loc[(df["chrom1"] != unmapped_chrom) & (df["chrom2"] != unmapped_chrom), :]
+ df_mapped = df.loc[
+     (df["chrom1"] != unmapped_chrom) & (df["chrom2"] != unmapped_chrom), :
+ ]
mapped_count = df_mapped.shape[0]

self._stat["total_mapped"] += mapped_count
@@ -539,6 +593,7 @@ def __add__(self, other):
if k == "dist_freq":
for dirs in sum_stat[k]:
sum_stat[k][dirs] = self._stat[k][dirs] + other._stat[k][dirs]
sum_stat.calculate_summaries()
return sum_stat
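
Recalculating the summaries here matters because fractions do not merge additively; only the underlying counts do. A made-up example:

    # Two invented stats files with cis/total_nodups of 5/10 and 30/30.
    naive_mean = (5 / 10 + 30 / 30) / 2   # 0.75, wrong for the merged data
    recomputed = (5 + 30) / (10 + 30)     # 0.875, what summing counts first gives
    assert naive_mean != recomputed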

# we need this to be able to sum(list_of_PairCounters)
@@ -549,6 +604,7 @@ def __radd__(self, other):
return self.__add__(other)

def flatten(self):

"""return a flattened dict (formatted same way as .stats file)

"""
@@ -578,8 +634,9 @@ def flatten(self):
).format(k, self._dist_bins[i], dirs)
# store key,value pair:
flat_stat[formatted_key] = freqs[i]
elif (k in ["pair_types", "dedup"]) and v:
# 'pair_types' and 'dedup' are simple dicts inside,
elif (k in ["pair_types", "dedup", "summary"]) and v:
# 'pair_types', 'dedup' and 'summary' are simple dicts inside,

# treat them the exact same way:
for k_item, freq in v.items():
formatted_key = self._KEY_SEP.join(["{}", "{}"]).format(
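For the new section, the flattened keys produced here come out as (values invented; the key format matches the test assertions below):

    summary/frac_cis	0.6
    summary/complexity	360.0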
@@ -624,4 +681,3 @@ def save(self, outstream):

if __name__ == "__main__":
stats()

tests/test_stats.py (12 changes: 11 additions & 1 deletion)
@@ -27,7 +27,10 @@ def test_mock_pairsam():
if not l.startswith('#') and l.strip())

for k in stats:
- stats[k] = int(stats[k])
+ try:
+     stats[k] = int(stats[k])
+ except ValueError:
+     stats[k] = float(stats[k])
print(stats)

assert stats['total'] == 8
@@ -53,4 +56,11 @@ def test_mock_pairsam():
assert stats['dist_freq/1-2/++'] == 1
assert stats['dist_freq/2-3/++'] == 1
assert stats['dist_freq/32-56/++'] == 1
assert stats['summary/frac_cis'] == 0.6
assert stats['summary/frac_cis_1kb+'] == 0
assert stats['summary/frac_cis_2kb+'] == 0
assert stats['summary/frac_cis_4kb+'] == 0
assert stats['summary/frac_cis_10kb+'] == 0
assert stats['summary/frac_cis_20kb+'] == 0
assert stats['summary/frac_cis_40kb+'] == 0
Member:
Wow, these are great tests! Is it possible also to test the deduplication stats?
It's not my idea, though, I found it in #5

Member Author:
Yeah, that's why I didn't notice the recent error in stats with the new dataframe method: the test file doesn't have any duplicated pairs... Should be easy to fix.